1 /* Analyze differences between two vectors.
2 
3    Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2020 Free Software
4    Foundation, Inc.
5 
6    This program is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18 
19 
20 /* The basic idea is to consider two vectors as similar if, when
21    transforming the first vector into the second vector through a
22    sequence of edits (inserts and deletes of one element each),
23    this sequence is short - or equivalently, if the ordered list
24    of elements that are untouched by these edits is long.  For a
25    good introduction to the subject, read about the "Levenshtein
26    distance" in Wikipedia.
27 
28    The basic algorithm is described in:
29    "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
30    Algorithmica Vol. 1, 1986, pp. 251-266,
31    <https://doi.org/10.1007/BF01840446>.
32    See especially section 4.2, which describes the variation used below.
33 
34    The basic algorithm was independently discovered as described in:
35    "Algorithms for Approximate String Matching", Esko Ukkonen,
36    Information and Control Vol. 64, 1985, pp. 100-118,
37    <https://doi.org/10.1016/S0019-9958(85)80046-2>.
38 
39    Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
40    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
41    at the price of producing suboptimal output for large inputs with
42    many differences.  */
43 
44 /* Before including this file, you need to define:
45      ELEMENT                 The element type of the vectors being compared.
46      EQUAL                   A two-argument macro that tests two elements for
47                              equality.
48      OFFSET                  A signed integer type sufficient to hold the
49                              difference between two indices.  Usually
50                              something like ptrdiff_t.
51      EXTRA_CONTEXT_FIELDS    Declarations of fields for 'struct context'.
52      NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
53      NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
54      EARLY_ABORT(ctxt)       (Optional) A boolean expression that triggers an
55                              early abort of the computation.
56      USE_HEURISTIC           (Optional) Define if you want to support the
57                              heuristic for large vectors.
58    It is also possible to use this file with abstract arrays.  In this case,
59    xvec and yvec are not represented in memory.  They only exist conceptually.
60    In this case, the list of defines above is amended as follows:
61      ELEMENT                 Undefined.
62      EQUAL                   Undefined.
63      XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)
64                              A three-argument macro: References xvec[xoff] and
65                              yvec[yoff] and tests these elements for equality.
66    Before including this file, you also need to include:
67      #include <limits.h>
68      #include <stdbool.h>
69      #include "minmax.h"
70  */
71 
72 /* Maximum value of type OFFSET.  */
73 #define OFFSET_MAX \
74   ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
75 
76 /* Default to no early abort.  */
77 #ifndef EARLY_ABORT
78 # define EARLY_ABORT(ctxt) false
79 #endif
80 
81 /* Use this to suppress gcc's "...may be used before initialized" warnings.
82    Beware: The Code argument must not contain commas.  */
83 #ifndef IF_LINT
84 # if defined GCC_LINT || defined lint
85 #  define IF_LINT(Code) Code
86 # else
87 #  define IF_LINT(Code) /* empty */
88 # endif
89 #endif
90 
91 /* As above, but when Code must contain one comma. */
92 #ifndef IF_LINT2
93 # if defined GCC_LINT || defined lint
94 #  define IF_LINT2(Code1, Code2) Code1, Code2
95 # else
96 #  define IF_LINT2(Code1, Code2) /* empty */
97 # endif
98 #endif
99 
100 /*
101  * Context of comparison operation.
102  */
103 struct context
104 {
105   #ifdef ELEMENT
106   /* Vectors being compared.  */
107   ELEMENT const *xvec;
108   ELEMENT const *yvec;
109   #endif
110 
111   /* Extra fields.  */
112   EXTRA_CONTEXT_FIELDS
113 
114   /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
115      furthest along the given diagonal in the forward search of the edit
116      matrix.  */
117   OFFSET *fdiag;
118 
119   /* Vector, indexed by diagonal, containing the X coordinate of the point
120      furthest along the given diagonal in the backward search of the edit
121      matrix.  */
122   OFFSET *bdiag;
123 
124   #ifdef USE_HEURISTIC
125   /* This corresponds to the diff --speed-large-files flag.  With this
126      heuristic, for vectors with a constant small density of changes,
127      the algorithm is linear in the vector size.  */
128   bool heuristic;
129   #endif
130 
131   /* Edit scripts longer than this are too expensive to compute.  */
132   OFFSET too_expensive;
133 
134   /* Snakes bigger than this are considered "big".  */
135   #define SNAKE_LIMIT 20
136 };
137 
138 struct partition
139 {
140   /* Midpoints of this partition.  */
141   OFFSET xmid;
142   OFFSET ymid;
143 
144   /* True if low half will be analyzed minimally.  */
145   bool lo_minimal;
146 
147   /* Likewise for high half.  */
148   bool hi_minimal;
149 };
150 
151 
152 /* Find the midpoint of the shortest edit script for a specified portion
153    of the two vectors.
154 
155    Scan from the beginnings of the vectors, and simultaneously from the ends,
156    doing a breadth-first search through the space of edit-sequence.
157    When the two searches meet, we have found the midpoint of the shortest
158    edit sequence.
159 
160    If FIND_MINIMAL is true, find the minimal edit script regardless of
161    expense.  Otherwise, if the search is too expensive, use heuristics to
162    stop the search and report a suboptimal answer.
163 
164    Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
165    XMID - YMID equals the number of inserted elements minus the number
166    of deleted elements (counting only elements before the midpoint).
167 
168    Set PART->lo_minimal to true iff the minimal edit script for the
169    left half of the partition is known; similarly for PART->hi_minimal.
170 
171    This function assumes that the first elements of the specified portions
172    of the two vectors do not match, and likewise that the last elements do not
173    match.  The caller must trim matching elements from the beginning and end
174    of the portions it is going to specify.
175 
176    If we return the "wrong" partitions, the worst this can do is cause
177    suboptimal diff output.  It cannot cause incorrect diff output.  */
178 
179 static void
diag(OFFSET xoff,OFFSET xlim,OFFSET yoff,OFFSET ylim,bool find_minimal,struct partition * part,struct context * ctxt)180 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
181       struct partition *part, struct context *ctxt)
182 {
183   OFFSET *const fd = ctxt->fdiag;       /* Give the compiler a chance. */
184   OFFSET *const bd = ctxt->bdiag;       /* Additional help for the compiler. */
185 #ifdef ELEMENT
186   ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
187   ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
188   #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
189 #else
190   #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
191 #endif
192   const OFFSET dmin = xoff - ylim;      /* Minimum valid diagonal. */
193   const OFFSET dmax = xlim - yoff;      /* Maximum valid diagonal. */
194   const OFFSET fmid = xoff - yoff;      /* Center diagonal of top-down search. */
195   const OFFSET bmid = xlim - ylim;      /* Center diagonal of bottom-up search. */
196   OFFSET fmin = fmid;
197   OFFSET fmax = fmid;           /* Limits of top-down search. */
198   OFFSET bmin = bmid;
199   OFFSET bmax = bmid;           /* Limits of bottom-up search. */
200   OFFSET c;                     /* Cost. */
201   bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
202                                    diagonal with respect to the northwest. */
203 
204   fd[fmid] = xoff;
205   bd[bmid] = xlim;
206 
207   for (c = 1;; ++c)
208     {
209       OFFSET d;                 /* Active diagonal. */
210       bool big_snake _GL_UNUSED = false;
211 
212       /* Extend the top-down search by an edit step in each diagonal. */
213       if (fmin > dmin)
214         fd[--fmin - 1] = -1;
215       else
216         ++fmin;
217       if (fmax < dmax)
218         fd[++fmax + 1] = -1;
219       else
220         --fmax;
221       for (d = fmax; d >= fmin; d -= 2)
222         {
223           OFFSET x;
224           OFFSET y;
225           OFFSET tlo = fd[d - 1];
226           OFFSET thi = fd[d + 1];
227           OFFSET x0 = tlo < thi ? thi : tlo + 1;
228 
229           for (x = x0, y = x0 - d;
230                x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
231                x++, y++)
232             continue;
233           if (x - x0 > SNAKE_LIMIT)
234             big_snake = true;
235           fd[d] = x;
236           if (odd && bmin <= d && d <= bmax && bd[d] <= x)
237             {
238               part->xmid = x;
239               part->ymid = y;
240               part->lo_minimal = part->hi_minimal = true;
241               return;
242             }
243         }
244 
245       /* Similarly extend the bottom-up search.  */
246       if (bmin > dmin)
247         bd[--bmin - 1] = OFFSET_MAX;
248       else
249         ++bmin;
250       if (bmax < dmax)
251         bd[++bmax + 1] = OFFSET_MAX;
252       else
253         --bmax;
254       for (d = bmax; d >= bmin; d -= 2)
255         {
256           OFFSET x;
257           OFFSET y;
258           OFFSET tlo = bd[d - 1];
259           OFFSET thi = bd[d + 1];
260           OFFSET x0 = tlo < thi ? tlo : thi - 1;
261 
262           for (x = x0, y = x0 - d;
263                xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
264                x--, y--)
265             continue;
266           if (x0 - x > SNAKE_LIMIT)
267             big_snake = true;
268           bd[d] = x;
269           if (!odd && fmin <= d && d <= fmax && x <= fd[d])
270             {
271               part->xmid = x;
272               part->ymid = y;
273               part->lo_minimal = part->hi_minimal = true;
274               return;
275             }
276         }
277 
278       if (find_minimal)
279         continue;
280 
281 #ifdef USE_HEURISTIC
282       bool heuristic = ctxt->heuristic;
283 #else
284       bool heuristic = false;
285 #endif
286 
287       /* Heuristic: check occasionally for a diagonal that has made lots
288          of progress compared with the edit distance.  If we have any
289          such, find the one that has made the most progress and return it
290          as if it had succeeded.
291 
292          With this heuristic, for vectors with a constant small density
293          of changes, the algorithm is linear in the vector size.  */
294 
295       if (200 < c && big_snake && heuristic)
296         {
297           {
298             OFFSET best = 0;
299 
300             for (d = fmax; d >= fmin; d -= 2)
301               {
302                 OFFSET dd = d - fmid;
303                 OFFSET x = fd[d];
304                 OFFSET y = x - d;
305                 OFFSET v = (x - xoff) * 2 - dd;
306 
307                 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
308                   {
309                     if (v > best
310                         && xoff + SNAKE_LIMIT <= x && x < xlim
311                         && yoff + SNAKE_LIMIT <= y && y < ylim)
312                       {
313                         /* We have a good enough best diagonal; now insist
314                            that it end with a significant snake.  */
315                         int k;
316 
317                         for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
318                           if (k == SNAKE_LIMIT)
319                             {
320                               best = v;
321                               part->xmid = x;
322                               part->ymid = y;
323                               break;
324                             }
325                       }
326                   }
327               }
328             if (best > 0)
329               {
330                 part->lo_minimal = true;
331                 part->hi_minimal = false;
332                 return;
333               }
334           }
335 
336           {
337             OFFSET best = 0;
338 
339             for (d = bmax; d >= bmin; d -= 2)
340               {
341                 OFFSET dd = d - bmid;
342                 OFFSET x = bd[d];
343                 OFFSET y = x - d;
344                 OFFSET v = (xlim - x) * 2 + dd;
345 
346                 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
347                   {
348                     if (v > best
349                         && xoff < x && x <= xlim - SNAKE_LIMIT
350                         && yoff < y && y <= ylim - SNAKE_LIMIT)
351                       {
352                         /* We have a good enough best diagonal; now insist
353                            that it end with a significant snake.  */
354                         int k;
355 
356                         for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
357                           if (k == SNAKE_LIMIT - 1)
358                             {
359                               best = v;
360                               part->xmid = x;
361                               part->ymid = y;
362                               break;
363                             }
364                       }
365                   }
366               }
367             if (best > 0)
368               {
369                 part->lo_minimal = false;
370                 part->hi_minimal = true;
371                 return;
372               }
373           }
374         }
375 
376       /* Heuristic: if we've gone well beyond the call of duty, give up
377          and report halfway between our best results so far.  */
378       if (c >= ctxt->too_expensive)
379         {
380           OFFSET fxybest;
381           OFFSET fxbest IF_LINT (= 0);
382           OFFSET bxybest;
383           OFFSET bxbest IF_LINT (= 0);
384 
385           /* Find forward diagonal that maximizes X + Y.  */
386           fxybest = -1;
387           for (d = fmax; d >= fmin; d -= 2)
388             {
389               OFFSET x = MIN (fd[d], xlim);
390               OFFSET y = x - d;
391               if (ylim < y)
392                 {
393                   x = ylim + d;
394                   y = ylim;
395                 }
396               if (fxybest < x + y)
397                 {
398                   fxybest = x + y;
399                   fxbest = x;
400                 }
401             }
402 
403           /* Find backward diagonal that minimizes X + Y.  */
404           bxybest = OFFSET_MAX;
405           for (d = bmax; d >= bmin; d -= 2)
406             {
407               OFFSET x = MAX (xoff, bd[d]);
408               OFFSET y = x - d;
409               if (y < yoff)
410                 {
411                   x = yoff + d;
412                   y = yoff;
413                 }
414               if (x + y < bxybest)
415                 {
416                   bxybest = x + y;
417                   bxbest = x;
418                 }
419             }
420 
421           /* Use the better of the two diagonals.  */
422           if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
423             {
424               part->xmid = fxbest;
425               part->ymid = fxybest - fxbest;
426               part->lo_minimal = true;
427               part->hi_minimal = false;
428             }
429           else
430             {
431               part->xmid = bxbest;
432               part->ymid = bxybest - bxbest;
433               part->lo_minimal = false;
434               part->hi_minimal = true;
435             }
436           return;
437         }
438     }
439   #undef XREF_YREF_EQUAL
440 }
441 
442 
443 /* Compare in detail contiguous subsequences of the two vectors
444    which are known, as a whole, to match each other.
445 
446    The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
447 
448    Note that XLIM, YLIM are exclusive bounds.  All indices into the vectors
449    are origin-0.
450 
451    If FIND_MINIMAL, find a minimal difference no matter how
452    expensive it is.
453 
454    The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
455 
456    Return false if terminated normally, or true if terminated through early
457    abort.  */
458 
459 static bool
compareseq(OFFSET xoff,OFFSET xlim,OFFSET yoff,OFFSET ylim,bool find_minimal,struct context * ctxt)460 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
461             bool find_minimal, struct context *ctxt)
462 {
463 #ifdef ELEMENT
464   ELEMENT const *xv = ctxt->xvec; /* Help the compiler.  */
465   ELEMENT const *yv = ctxt->yvec;
466   #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
467 #else
468   #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
469 #endif
470 
471   /* Slide down the bottom initial diagonal.  */
472   while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff))
473     {
474       xoff++;
475       yoff++;
476     }
477 
478   /* Slide up the top initial diagonal. */
479   while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1))
480     {
481       xlim--;
482       ylim--;
483     }
484 
485   /* Handle simple cases. */
486   if (xoff == xlim)
487     while (yoff < ylim)
488       {
489         NOTE_INSERT (ctxt, yoff);
490         if (EARLY_ABORT (ctxt))
491           return true;
492         yoff++;
493       }
494   else if (yoff == ylim)
495     while (xoff < xlim)
496       {
497         NOTE_DELETE (ctxt, xoff);
498         if (EARLY_ABORT (ctxt))
499           return true;
500         xoff++;
501       }
502   else
503     {
504       struct partition part IF_LINT2 (= { .xmid = 0, .ymid = 0 });
505 
506       /* Find a point of correspondence in the middle of the vectors.  */
507       diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
508 
509       /* Use the partitions to split this problem into subproblems.  */
510       if (compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt))
511         return true;
512       if (compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt))
513         return true;
514     }
515 
516   return false;
517   #undef XREF_YREF_EQUAL
518 }
519 
520 #undef ELEMENT
521 #undef EQUAL
522 #undef OFFSET
523 #undef EXTRA_CONTEXT_FIELDS
524 #undef NOTE_DELETE
525 #undef NOTE_INSERT
526 #undef EARLY_ABORT
527 #undef USE_HEURISTIC
528 #undef XVECREF_YVECREF_EQUAL
529 #undef OFFSET_MAX
530