Discussion:
[PATCH] diffseq: restore TOO_EXPENSIVE heuristic
(too old to reply)
Paul Eggert
2016-10-25 22:24:07 UTC
Permalink
Raw Message
* lib/diffseq.h: Problem with diffutils reported by Andreas Schwab
(Bug#24715). The simplest solution is to restore the
TOO_EXPENSIVE heuristic that I added to GNU diff in 1993, while
using a higher threshold to avoid Bug#16848 on smaller files.
* lib/diffseq.h (struct context): Restore member too_expensive.
(struct partition): Restore members lo_minimal, hi_minimal.
(diag, compareseq): Restore arg find_minimal. All uses changed.
(diag): Restore the TOO_EXPENSIVE heuristic that I added back in
1993 to make 'diff' run faster (but not as well) on large inputs,
but use a threshold of 4096 instead of the old 256.
* lib/fstrcmp.c (strcmp_bounded):
* lib/git-merge-changelog.c (compute_differences):
Adjust to diffseq.h changes.
---
ChangeLog | 17 +++++++
lib/diffseq.h | 118 ++++++++++++++++++++++++++++++++++++++++++----
lib/fstrcmp.c | 10 +++-
lib/git-merge-changelog.c | 3 +-
4 files changed, 137 insertions(+), 11 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 1db20d7..6f4e2fc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2016-10-25 Paul Eggert <***@cs.ucla.edu>
+
+ diffseq: restore TOO_EXPENSIVE heuristic
+ * lib/diffseq.h: Problem with diffutils reported by Andreas Schwab
+ (Bug#24715). The simplest solution is to restore the
+ TOO_EXPENSIVE heuristic that I added to GNU diff in 1993, while
+ using a higher threshold to avoid Bug#16848 on smaller files.
+ * lib/diffseq.h (struct context): Restore member too_expensive.
+ (struct partition): Restore members lo_minimal, hi_minimal.
+ (diag, compareseq): Restore arg find_minimal. All uses changed.
+ (diag): Restore the TOO_EXPENSIVE heuristic that I added back in
+ 1993 to make 'diff' run faster (but not as well) on large inputs,
+ but use a threshold of 4096 instead of the old 256.
+ * lib/fstrcmp.c (strcmp_bounded):
+ * lib/git-merge-changelog.c (compute_differences):
+ Adjust to diffseq.h changes.
+
2016-10-22 Bruno Haible <***@clisp.org>

non-recursive-gnulib-prefix-hack: Don't make assumptions about
diff --git a/lib/diffseq.h b/lib/diffseq.h
index 6be7d83..e27b6c2 100644
--- a/lib/diffseq.h
+++ b/lib/diffseq.h
@@ -34,7 +34,12 @@
The basic algorithm was independently discovered as described in:
"Algorithms for Approximate String Matching", Esko Ukkonen,
Information and Control Vol. 64, 1985, pp. 100-118,
- <http://dx.doi.org/10.1016/S0019-9958(85)80046-2>. */
+ <http://dx.doi.org/10.1016/S0019-9958(85)80046-2>.
+
+ Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
+ heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
+ at the price of producing suboptimal output for large inputs with
+ many differences. */

/* Before including this file, you need to define:
ELEMENT The element type of the vectors being compared.
@@ -123,6 +128,9 @@ struct context
bool heuristic;
#endif

+ /* Edit scripts longer than this are too expensive to compute. */
+ OFFSET too_expensive;
+
/* Snakes bigger than this are considered "big". */
#define SNAKE_LIMIT 20
};
@@ -132,6 +140,12 @@ struct partition
/* Midpoints of this partition. */
OFFSET xmid;
OFFSET ymid;
+
+ /* True if low half will be analyzed minimally. */
+ bool lo_minimal;
+
+ /* Likewise for high half. */
+ bool hi_minimal;
};


@@ -143,10 +157,17 @@ struct partition
When the two searches meet, we have found the midpoint of the shortest
edit sequence.

- Set *PART to the midpoint (XMID,YMID). The diagonal number
+ If FIND_MINIMAL is true, find the minimal edit script regardless of
+ expense. Otherwise, if the search is too expensive, use heuristics to
+ stop the search and report a suboptimal answer.
+
+ Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
XMID - YMID equals the number of inserted elements minus the number
of deleted elements (counting only elements before the midpoint).

+ Set PART->lo_minimal to true iff the minimal edit script for the
+ left half of the partition is known; similarly for PART->hi_minimal.
+
This function assumes that the first elements of the specified portions
of the two vectors do not match, and likewise that the last elements do not
match. The caller must trim matching elements from the beginning and end
@@ -156,7 +177,7 @@ struct partition
suboptimal diff output. It cannot cause incorrect diff output. */

static void
-diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
+diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
struct partition *part, struct context *ctxt)
{
OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */
@@ -216,6 +237,7 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
{
part->xmid = x;
part->ymid = y;
+ part->lo_minimal = part->hi_minimal = true;
return;
}
}
@@ -248,10 +270,14 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
{
part->xmid = x;
part->ymid = y;
+ part->lo_minimal = part->hi_minimal = true;
return;
}
}

+ if (find_minimal)
+ continue;
+
#ifdef USE_HEURISTIC
/* Heuristic: check occasionally for a diagonal that has made lots
of progress compared with the edit distance. If we have any
@@ -295,7 +321,11 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
}
}
if (best > 0)
- return;
+ {
+ part->lo_minimal = true;
+ part->hi_minimal = false;
+ return;
+ }
}

{
@@ -330,10 +360,77 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
}
}
if (best > 0)
- return;
+ {
+ part->lo_minimal = false;
+ part->hi_minimal = true;
+ return;
+ }
}
}
#endif /* USE_HEURISTIC */
+
+ /* Heuristic: if we've gone well beyond the call of duty, give up
+ and report halfway between our best results so far. */
+ if (c >= ctxt->too_expensive)
+ {
+ OFFSET fxybest;
+ OFFSET fxbest IF_LINT (= 0);
+ OFFSET bxybest;
+ OFFSET bxbest IF_LINT (= 0);
+
+ /* Find forward diagonal that maximizes X + Y. */
+ fxybest = -1;
+ for (d = fmax; d >= fmin; d -= 2)
+ {
+ OFFSET x = MIN (fd[d], xlim);
+ OFFSET y = x - d;
+ if (ylim < y)
+ {
+ x = ylim + d;
+ y = ylim;
+ }
+ if (fxybest < x + y)
+ {
+ fxybest = x + y;
+ fxbest = x;
+ }
+ }
+
+ /* Find backward diagonal that minimizes X + Y. */
+ bxybest = OFFSET_MAX;
+ for (d = bmax; d >= bmin; d -= 2)
+ {
+ OFFSET x = MAX (xoff, bd[d]);
+ OFFSET y = x - d;
+ if (y < yoff)
+ {
+ x = yoff + d;
+ y = yoff;
+ }
+ if (x + y < bxybest)
+ {
+ bxybest = x + y;
+ bxbest = x;
+ }
+ }
+
+ /* Use the better of the two diagonals. */
+ if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
+ {
+ part->xmid = fxbest;
+ part->ymid = fxybest - fxbest;
+ part->lo_minimal = true;
+ part->hi_minimal = false;
+ }
+ else
+ {
+ part->xmid = bxbest;
+ part->ymid = bxybest - bxbest;
+ part->lo_minimal = false;
+ part->hi_minimal = true;
+ }
+ return;
+ }
}
#undef XREF_YREF_EQUAL
}
@@ -347,6 +444,9 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
Note that XLIM, YLIM are exclusive bounds. All indices into the vectors
are origin-0.

+ If FIND_MINIMAL, find a minimal difference no matter how
+ expensive it is.
+
The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.

Return false if terminated normally, or true if terminated through early
@@ -354,7 +454,7 @@ diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,

static bool
compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
- struct context *ctxt)
+ bool find_minimal, struct context *ctxt)
{
#ifdef ELEMENT
ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */
@@ -400,12 +500,12 @@ compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
struct partition part IF_LINT2 (= { .xmid = 0, .ymid = 0 });

/* Find a point of correspondence in the middle of the vectors. */
- diag (xoff, xlim, yoff, ylim, &part, ctxt);
+ diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);

/* Use the partitions to split this problem into subproblems. */
- if (compareseq (xoff, part.xmid, yoff, part.ymid, ctxt))
+ if (compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt))
return true;
- if (compareseq (part.xmid, xlim, part.ymid, ylim, ctxt))
+ if (compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt))
return true;
}

diff --git a/lib/fstrcmp.c b/lib/fstrcmp.c
index a12f2b5..744c73c 100644
--- a/lib/fstrcmp.c
+++ b/lib/fstrcmp.c
@@ -183,6 +183,14 @@ fstrcmp_bounded (const char *string1, const char *string2, double lower_bound)
ctxt.xvec = string1;
ctxt.yvec = string2;

+ /* Set TOO_EXPENSIVE to be approximate square root of input size,
+ bounded below by 4096. */
+ ctxt.too_expensive = 1;
+ for (i = xvec_length + yvec_length; i != 0; i >>= 2)
+ ctxt.too_expensive <<= 1;
+ if (ctxt.too_expensive < 4096)
+ ctxt.too_expensive = 4096;
+
/* Allocate memory for fdiag and bdiag from a thread-local pool. */
fdiag_len = length_sum + 3;
gl_once (keys_init_once, keys_init);
@@ -221,7 +229,7 @@ fstrcmp_bounded (const char *string1, const char *string2, double lower_bound)

/* Now do the main comparison algorithm */
ctxt.edit_count = - ctxt.edit_count_limit;
- if (compareseq (0, xvec_length, 0, yvec_length, &ctxt)) /* Prob: 98% */
+ if (compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt)) /* Prob: 98% */
/* The edit_count passed the limit. Hence the result would be
< lower_bound. We can return any value < lower_bound instead. */
return 0.0;
diff --git a/lib/git-merge-changelog.c b/lib/git-merge-changelog.c
index 1cbd92b..660e0e0 100644
--- a/lib/git-merge-changelog.c
+++ b/lib/git-merge-changelog.c
@@ -678,10 +678,11 @@ compute_differences (struct changelog_file *file1, struct changelog_file *file2,
ctxt.index_mapping_reverse[j] = 0;
ctxt.fdiag = XNMALLOC (2 * (n1 + n2 + 3), ssize_t) + n2 + 1;
ctxt.bdiag = ctxt.fdiag + n1 + n2 + 3;
+ ctxt.too_expensive = n1 + n2;

/* Store in ctxt.index_mapping and ctxt.index_mapping_reverse a -1 for
each removed or added entry. */
- compareseq (0, n1, 0, n2, &ctxt);
+ compareseq (0, n1, 0, n2, 0, &ctxt);

/* Complete the index_mapping and index_mapping_reverse arrays. */
i = 0;
--
2.7.4
Loading...