1 /* Analyze file differences for GNU DIFF.
2 
3    Modified for KDiff3 by Joachim Eibl <joachim.eibl at gmx.de> 2003.
4    The original file was part of GNU DIFF.
5 
6 
7     Part of KDiff3 - Text Diff And Merge Tool
8 
9     SPDX-FileCopyrightText: 1988-2002 Free Software Foundation, Inc.
10     SPDX-FileCopyrightText: 2002-2011 Joachim Eibl, joachim.eibl at gmx.de
11     SPDX-FileCopyrightText: 2018-2020 Michael Reeves reeves.87@gmail.com
12     SPDX-License-Identifier: GPL-2.0-or-later
13 */
14 
15 /* The basic algorithm is described in:
16    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
17    Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
18    see especially section 4.2, which describes the variation used below.
19    Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
20    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
21    at the price of producing suboptimal output for large inputs with
22    many differences.
23 
24    The basic algorithm was independently discovered as described in:
25    "Algorithms for Approximate String Matching", E. Ukkonen,
26    Information and Control Vol. 64, 1985, pp. 100-118.  */
27 
28 #define GDIFF_MAIN
29 
30 #include "gnudiff_diff.h"
31 
32 #include <algorithm>       // for max, min
33 #include <stdlib.h>
34 
35 
36 static GNULineRef *xvec, *yvec;  /* Vectors being compared. */
37 static GNULineRef *fdiag;        /* Vector, indexed by diagonal, containing
38                    1 + the X coordinate of the point furthest
39                    along the given diagonal in the forward
40                    search of the edit matrix. */
41 static GNULineRef *bdiag;        /* Vector, indexed by diagonal, containing
42                    the X coordinate of the point furthest
43                    along the given diagonal in the backward
44                    search of the edit matrix. */
45 static GNULineRef too_expensive; /* Edit scripts longer than this are too
46                    expensive to compute.  */
47 
48 #define SNAKE_LIMIT 20 /* Snakes bigger than this are considered `big'.  */
49 
50 struct partition {
51     GNULineRef xmid, ymid; /* Midpoints of this partition.  */
52     bool lo_minimal;       /* Nonzero if low half will be analyzed minimally.  */
53     bool hi_minimal;       /* Likewise for high half.  */
54 };
55 
56 /* Find the midpoint of the shortest edit script for a specified
57    portion of the two files.
58 
59    Scan from the beginnings of the files, and simultaneously from the ends,
60    doing a breadth-first search through the space of edit-sequence.
61    When the two searches meet, we have found the midpoint of the shortest
62    edit sequence.
63 
64    If FIND_MINIMAL is nonzero, find the minimal edit script regardless
65    of expense.  Otherwise, if the search is too expensive, use
66    heuristics to stop the search and report a suboptimal answer.
67 
68    Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
69    XMID - YMID equals the number of inserted lines minus the number
70    of deleted lines (counting only lines before the midpoint).
71    Return the approximate edit cost; this is the total number of
72    lines inserted or deleted (counting only lines before the midpoint),
73    unless a heuristic is used to terminate the search prematurely.
74 
75    Set PART->lo_minimal to true iff the minimal edit script for the
76    left half of the partition is known; similarly for PART->hi_minimal.
77 
78    This function assumes that the first lines of the specified portions
79    of the two files do not match, and likewise that the last lines do not
80    match.  The caller must trim matching lines from the beginning and end
81    of the portions it is going to specify.
82 
83    If we return the "wrong" partitions,
84    the worst this can do is cause suboptimal diff output.
85    It cannot cause incorrect diff output.  */
86 
diag(GNULineRef xoff,GNULineRef xlim,GNULineRef yoff,GNULineRef ylim,bool find_minimal,partition * part) const87 GNULineRef GnuDiff::diag(GNULineRef xoff, GNULineRef xlim, GNULineRef yoff, GNULineRef ylim, bool find_minimal,
88                          partition *part) const
89 {
90     GNULineRef *const fd = fdiag;        /* Give the compiler a chance. */
91     GNULineRef *const bd = bdiag;        /* Additional help for the compiler. */
92     GNULineRef const *const xv = xvec;   /* Still more help for the compiler. */
93     GNULineRef const *const yv = yvec;   /* And more and more . . . */
94     GNULineRef const dmin = xoff - ylim; /* Minimum valid diagonal. */
95     GNULineRef const dmax = xlim - yoff; /* Maximum valid diagonal. */
96     GNULineRef const fmid = xoff - yoff; /* Center diagonal of top-down search. */
97     GNULineRef const bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
98     GNULineRef fmin = fmid, fmax = fmid; /* Limits of top-down search. */
99     GNULineRef bmin = bmid, bmax = bmid; /* Limits of bottom-up search. */
100     GNULineRef c;                        /* Cost. */
101     bool odd = (fmid - bmid) & 1;        /* True if southeast corner is on an odd
102                    diagonal with respect to the northwest. */
103 
104     fd[fmid] = xoff;
105     bd[bmid] = xlim;
106 
107     for(c = 1;; ++c)
108     {
109         GNULineRef d; /* Active diagonal. */
110         bool big_snake = false;
111 
112         /* Extend the top-down search by an edit step in each diagonal. */
113         fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
114         fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
115         for(d = fmax; d >= fmin; d -= 2)
116         {
117             GNULineRef x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
118 
119             if(tlo >= thi)
120                 x = tlo + 1;
121             else
122                 x = thi;
123             oldx = x;
124             y = x - d;
125             while(x < xlim && y < ylim && xv[x] == yv[y])
126                 ++x, ++y;
127             if(x - oldx > SNAKE_LIMIT)
128                 big_snake = true;
129             fd[d] = x;
130             if(odd && bmin <= d && d <= bmax && bd[d] <= x)
131             {
132                 part->xmid = x;
133                 part->ymid = y;
134                 part->lo_minimal = part->hi_minimal = true;
135                 return 2 * c - 1;
136             }
137         }
138 
139         /* Similarly extend the bottom-up search.  */
140         bmin > dmin ? bd[--bmin - 1] = GNULINEREF_MAX : ++bmin;
141         bmax < dmax ? bd[++bmax + 1] = GNULINEREF_MAX : --bmax;
142         for(d = bmax; d >= bmin; d -= 2)
143         {
144             GNULineRef x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
145 
146             if(tlo < thi)
147                 x = tlo;
148             else
149                 x = thi - 1;
150             oldx = x;
151             y = x - d;
152             while(x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
153                 --x, --y;
154             if(oldx - x > SNAKE_LIMIT)
155                 big_snake = true;
156             bd[d] = x;
157             if(!odd && fmin <= d && d <= fmax && x <= fd[d])
158             {
159                 part->xmid = x;
160                 part->ymid = y;
161                 part->lo_minimal = part->hi_minimal = true;
162                 return 2 * c;
163             }
164         }
165 
166         if(find_minimal)
167             continue;
168 
169         /* Heuristic: check occasionally for a diagonal that has made
170      lots of progress compared with the edit distance.
171      If we have any such, find the one that has made the most
172      progress and return it as if it had succeeded.
173 
174      With this heuristic, for files with a constant small density
175      of changes, the algorithm is linear in the file size.  */
176 
177         if(200 < c && big_snake && speed_large_files)
178         {
179             GNULineRef best;
180 
181             best = 0;
182             for(d = fmax; d >= fmin; d -= 2)
183             {
184                 GNULineRef dd = d - fmid;
185                 GNULineRef x = fd[d];
186                 GNULineRef y = x - d;
187                 GNULineRef v = (x - xoff) * 2 - dd;
188                 if(v > 12 * (c + (dd < 0 ? -dd : dd)))
189                 {
190                     if(v > best && xoff + SNAKE_LIMIT <= x && x < xlim && yoff + SNAKE_LIMIT <= y && y < ylim)
191                     {
192                         /* We have a good enough best diagonal;
193              now insist that it end with a significant snake.  */
194                         int k;
195 
196                         for(k = 1; xv[x - k] == yv[y - k]; k++)
197                             if(k == SNAKE_LIMIT)
198                             {
199                                 best = v;
200                                 part->xmid = x;
201                                 part->ymid = y;
202                                 break;
203                             }
204                     }
205                 }
206             }
207             if(best > 0)
208             {
209                 part->lo_minimal = true;
210                 part->hi_minimal = false;
211                 return 2 * c - 1;
212             }
213 
214             best = 0;
215             for(d = bmax; d >= bmin; d -= 2)
216             {
217                 GNULineRef dd = d - bmid;
218                 GNULineRef x = bd[d];
219                 GNULineRef y = x - d;
220                 GNULineRef v = (xlim - x) * 2 + dd;
221                 if(v > 12 * (c + (dd < 0 ? -dd : dd)))
222                 {
223                     if(v > best && xoff < x && x <= xlim - SNAKE_LIMIT && yoff < y && y <= ylim - SNAKE_LIMIT)
224                     {
225                         /* We have a good enough best diagonal;
226              now insist that it end with a significant snake.  */
227                         int k;
228 
229                         for(k = 0; xv[x + k] == yv[y + k]; k++)
230                             if(k == SNAKE_LIMIT - 1)
231                             {
232                                 best = v;
233                                 part->xmid = x;
234                                 part->ymid = y;
235                                 break;
236                             }
237                     }
238                 }
239             }
240             if(best > 0)
241             {
242                 part->lo_minimal = false;
243                 part->hi_minimal = true;
244                 return 2 * c - 1;
245             }
246         }
247 
248         /* Heuristic: if we've gone well beyond the call of duty,
249      give up and report halfway between our best results so far.  */
250         if(c >= too_expensive)
251         {
252             GNULineRef fxybest, fxbest;
253             GNULineRef bxybest, bxbest;
254 
255             fxbest = bxbest = 0; /* Pacify `gcc -Wall'.  */
256 
257             /* Find forward diagonal that maximizes X + Y.  */
258             fxybest = -1;
259             for(d = fmax; d >= fmin; d -= 2)
260             {
261                 GNULineRef x = std::min(fd[d], xlim);
262                 GNULineRef y = x - d;
263                 if(ylim < y)
264                     x = ylim + d, y = ylim;
265                 if(fxybest < x + y)
266                 {
267                     fxybest = x + y;
268                     fxbest = x;
269                 }
270             }
271 
272             /* Find backward diagonal that minimizes X + Y.  */
273             bxybest = GNULINEREF_MAX;
274             for(d = bmax; d >= bmin; d -= 2)
275             {
276                 GNULineRef x = std::max(xoff, bd[d]);
277                 GNULineRef y = x - d;
278                 if(y < yoff)
279                     x = yoff + d, y = yoff;
280                 if(x + y < bxybest)
281                 {
282                     bxybest = x + y;
283                     bxbest = x;
284                 }
285             }
286 
287             /* Use the better of the two diagonals.  */
288             if((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
289             {
290                 part->xmid = fxbest;
291                 part->ymid = fxybest - fxbest;
292                 part->lo_minimal = true;
293                 part->hi_minimal = false;
294             }
295             else
296             {
297                 part->xmid = bxbest;
298                 part->ymid = bxybest - bxbest;
299                 part->lo_minimal = false;
300                 part->hi_minimal = true;
301             }
302             return 2 * c - 1;
303         }
304     }
305 }
306 
307 /* Compare in detail contiguous subsequences of the two files
308    which are known, as a whole, to match each other.
309 
310    The results are recorded in the vectors files[N].changed, by
311    storing 1 in the element for each line that is an insertion or deletion.
312 
313    The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
314 
315    Note that XLIM, YLIM are exclusive bounds.
316    All line numbers are origin-0 and discarded lines are not counted.
317 
318    If FIND_MINIMAL, find a minimal difference no matter how
319    expensive it is.  */
320 
compareseq(GNULineRef xoff,GNULineRef xlim,GNULineRef yoff,GNULineRef ylim,bool find_minimal)321 void GnuDiff::compareseq(GNULineRef xoff, GNULineRef xlim, GNULineRef yoff, GNULineRef ylim, bool find_minimal)
322 {
323     GNULineRef *const xv = xvec; /* Help the compiler.  */
324     GNULineRef *const yv = yvec;
325 
326     /* Slide down the bottom initial diagonal. */
327     while(xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
328         ++xoff, ++yoff;
329     /* Slide up the top initial diagonal. */
330     while(xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
331         --xlim, --ylim;
332 
333     /* Handle simple cases. */
334     if(xoff == xlim)
335         while(yoff < ylim)
336             files[1].changed[files[1].realindexes[yoff++]] = true;
337     else if(yoff == ylim)
338         while(xoff < xlim)
339             files[0].changed[files[0].realindexes[xoff++]] = true;
340     else
341     {
342         GNULineRef c;
343         partition part;
344 
345         /* Find a point of correspondence in the middle of the files.  */
346 
347         c = diag(xoff, xlim, yoff, ylim, find_minimal, &part);
348 
349         /* This should be impossible, because it implies that
350          one of the two subsequences is empty,
351          and that case was handled above without calling `diag'. */
352         Q_ASSERT(c != 1);
353 
354         /* Use the partitions to split this problem into subproblems.  */
355         compareseq(xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
356         compareseq(part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
357     }
358 }
359 
360 /* Discard lines from one file that have no matches in the other file.
361 
362    A line which is discarded will not be considered by the actual
363    comparison algorithm; it will be as if that line were not in the file.
364    The file's `realindexes' table maps virtual line numbers
365    (which don't count the discarded lines) into real line numbers;
366    this is how the actual comparison algorithm produces results
367    that are comprehensible when the discarded lines are counted.
368 
369    When we discard a line, we also mark it as a deletion or insertion
370    so that it will be printed in the output.  */
371 
discard_confusing_lines(file_data filevec[])372 void GnuDiff::discard_confusing_lines(file_data filevec[])
373 {
374     int f;
375     GNULineRef i;
376     char *discarded[2];
377     GNULineRef *equiv_count[2];
378     GNULineRef *p;
379 
380     /* Allocate our results.  */
381     p = (GNULineRef *)xmalloc((filevec[0].buffered_lines + filevec[1].buffered_lines) * (2 * sizeof(*p)));
382     for(f = 0; f < 2; ++f)
383     {
384         filevec[f].undiscarded = p;
385         p += filevec[f].buffered_lines;
386         filevec[f].realindexes = p;
387         p += filevec[f].buffered_lines;
388     }
389 
390     /* Set up equiv_count[F][I] as the number of lines in file F
391      that fall in equivalence class I.  */
392 
393     p = (GNULineRef *)zalloc(filevec[0].equiv_max * (2 * sizeof(*p)));
394     equiv_count[0] = p;
395     equiv_count[1] = p + filevec[0].equiv_max;
396 
397     for(i = 0; i < filevec[0].buffered_lines; ++i)
398         ++equiv_count[0][filevec[0].equivs[i]];
399     for(i = 0; i < filevec[1].buffered_lines; ++i)
400         ++equiv_count[1][filevec[1].equivs[i]];
401 
402     /* Set up tables of which lines are going to be discarded.  */
403 
404     discarded[0] = (char *)zalloc(filevec[0].buffered_lines + filevec[1].buffered_lines);
405     discarded[1] = discarded[0] + filevec[0].buffered_lines;
406 
407     /* Mark to be discarded each line that matches no line of the other file.
408      If a line matches many lines, mark it as provisionally discardable.  */
409 
410     for(f = 0; f < 2; ++f)
411     {
412         size_t end = filevec[f].buffered_lines;
413         char *discards = discarded[f];
414         GNULineRef *counts = equiv_count[1 - f];
415         GNULineRef *equivs = filevec[f].equivs;
416         size_t many = 5;
417         size_t tem = end / 64;
418 
419         /* Multiply MANY by approximate square root of number of lines.
420      That is the threshold for provisionally discardable lines.  */
421         while((tem = tem >> 2) > 0)
422             many *= 2;
423 
424         for(i = 0; i < (GNULineRef)end; ++i)
425         {
426             GNULineRef nmatch;
427             if(equivs[i] == 0)
428                 continue;
429             nmatch = counts[equivs[i]];
430             if(nmatch == 0)
431                 discards[i] = 1;
432             else if(nmatch > (GNULineRef)many)
433                 discards[i] = 2;
434         }
435     }
436 
437     /* Don't really discard the provisional lines except when they occur
438      in a run of discardables, with nonprovisionals at the beginning
439      and end.  */
440 
441     for(f = 0; f < 2; ++f)
442     {
443         GNULineRef end = filevec[f].buffered_lines;
444         char *discards = discarded[f];
445 
446         for(i = 0; i < end; ++i)
447         {
448             /* Cancel provisional discards not in middle of run of discards.  */
449             if(discards[i] == 2)
450                 discards[i] = 0;
451             else if(discards[i] != 0)
452             {
453                 /* We have found a nonprovisional discard.  */
454                 GNULineRef j;
455                 GNULineRef length;
456                 GNULineRef provisional = 0;
457 
458                 /* Find end of this run of discardable lines.
459          Count how many are provisionally discardable.  */
460                 for(j = i; j < end; ++j)
461                 {
462                     if(discards[j] == 0)
463                         break;
464                     if(discards[j] == 2)
465                         ++provisional;
466                 }
467 
468                 /* Cancel provisional discards at end, and shrink the run.  */
469                 while(j > i && discards[j - 1] == 2)
470                     discards[--j] = 0, --provisional;
471 
472                 /* Now we have the length of a run of discardable lines
473          whose first and last are not provisional.  */
474                 length = j - i;
475 
476                 /* If 1/4 of the lines in the run are provisional,
477          cancel discarding of all provisional lines in the run.  */
478                 if(provisional * 4 > length)
479                 {
480                     while(j > i)
481                         if(discards[--j] == 2)
482                             discards[j] = 0;
483                 }
484                 else
485                 {
486                     GNULineRef consec;
487                     GNULineRef minimum = 1;
488                     GNULineRef tem = length >> 2;
489 
490                     /* MINIMUM is approximate square root of LENGTH/4.
491              A subrun of two or more provisionals can stand
492              when LENGTH is at least 16.
493              A subrun of 4 or more can stand when LENGTH >= 64.  */
494                     while(0 < (tem >>= 2))
495                         minimum <<= 1;
496                     minimum++;
497 
498                     /* Cancel any subrun of MINIMUM or more provisionals
499              within the larger run.  */
500                     for(j = 0, consec = 0; j < length; ++j)
501                         if(discards[i + j] != 2)
502                             consec = 0;
503                         else if(minimum == ++consec)
504                             /* Back up to start of subrun, to cancel it all.  */
505                             j -= consec;
506                         else if(minimum < consec)
507                             discards[i + j] = 0;
508 
509                     /* Scan from beginning of run
510              until we find 3 or more nonprovisionals in a row
511              or until the first nonprovisional at least 8 lines in.
512              Until that point, cancel any provisionals.  */
513                     for(j = 0, consec = 0; j < length; ++j)
514                     {
515                         if(j >= 8 && discards[i + j] == 1)
516                             break;
517                         if(discards[i + j] == 2)
518                             consec = 0, discards[i + j] = 0;
519                         else if(discards[i + j] == 0)
520                             consec = 0;
521                         else
522                             consec++;
523                         if(consec == 3)
524                             break;
525                     }
526 
527                     /* I advances to the last line of the run.  */
528                     i += length - 1;
529 
530                     /* Same thing, from end.  */
531                     for(j = 0, consec = 0; j < length; ++j)
532                     {
533                         if(j >= 8 && discards[i - j] == 1)
534                             break;
535                         if(discards[i - j] == 2)
536                             consec = 0, discards[i - j] = 0;
537                         else if(discards[i - j] == 0)
538                             consec = 0;
539                         else
540                             consec++;
541                         if(consec == 3)
542                             break;
543                     }
544                 }
545             }
546         }
547     }
548 
549     /* Actually discard the lines. */
550     for(f = 0; f < 2; ++f)
551     {
552         char *discards = discarded[f];
553         GNULineRef end = filevec[f].buffered_lines;
554         GNULineRef j = 0;
555         for(i = 0; i < end; ++i)
556             if(minimal || discards[i] == 0)
557             {
558                 filevec[f].undiscarded[j] = filevec[f].equivs[i];
559                 filevec[f].realindexes[j++] = i;
560             }
561             else
562                 filevec[f].changed[i] = true;
563         filevec[f].nondiscarded_lines = j;
564     }
565 
566     free(discarded[0]);
567     free(equiv_count[0]);
568 }
569 
570 /* Adjust inserts/deletes of identical lines to join changes
571    as much as possible.
572 
573    We do something when a run of changed lines include a
574    line at one end and have an excluded, identical line at the other.
575    We are free to choose which identical line is included.
576    `compareseq' usually chooses the one at the beginning,
577    but usually it is cleaner to consider the following identical line
578    to be the "change".  */
579 
shift_boundaries(file_data filevec[])580 void GnuDiff::shift_boundaries(file_data filevec[])
581 {
582     int f;
583 
584     for(f = 0; f < 2; ++f)
585     {
586         bool *changed = filevec[f].changed;
587         bool const *other_changed = filevec[1 - f].changed;
588         GNULineRef const *equivs = filevec[f].equivs;
589         GNULineRef i = 0;
590         GNULineRef j = 0;
591         GNULineRef i_end = filevec[f].buffered_lines;
592 
593         while(true)
594         {
595             GNULineRef runlength, start, corresponding;
596 
597             /* Scan forwards to find beginning of another run of changes.
598          Also keep track of the corresponding point in the other file.  */
599 
600             while(i < i_end && !changed[i])
601             {
602                 while(other_changed[j++])
603                     continue;
604                 i++;
605             }
606 
607             if(i == i_end)
608                 break;
609 
610             start = i;
611 
612             /* Find the end of this run of changes.  */
613 
614             while(changed[++i])
615                 continue;
616             while(other_changed[j])
617                 j++;
618 
619             do
620             {
621                 /* Record the length of this run of changes, so that
622          we can later determine whether the run has grown.  */
623                 runlength = i - start;
624 
625                 /* Move the changed region back, so long as the
626          previous unchanged line matches the last changed one.
627          This merges with previous changed regions.  */
628 
629                 while(start && equivs[start - 1] == equivs[i - 1])
630                 {
631                     changed[--start] = true;
632                     changed[--i] = false;
633                     while(changed[start - 1])
634                         start--;
635                     while(other_changed[--j])
636                         continue;
637                 }
638 
639                 /* Set CORRESPONDING to the end of the changed run, at the last
640          point where it corresponds to a changed run in the other file.
641          CORRESPONDING == I_END means no such point has been found.  */
642                 corresponding = other_changed[j - 1] ? i : i_end;
643 
644                 /* Move the changed region forward, so long as the
645          first changed line matches the following unchanged one.
646          This merges with following changed regions.
647          Do this second, so that if there are no merges,
648          the changed region is moved forward as far as possible.  */
649 
650                 while(i != i_end && equivs[start] == equivs[i])
651                 {
652                     changed[start++] = false;
653                     changed[i++] = true;
654                     while(changed[i])
655                         i++;
656                     while(other_changed[++j])
657                         corresponding = i;
658                 }
659             } while(runlength != i - start);
660 
661             /* If possible, move the fully-merged run of changes
662          back to a corresponding run in the other file.  */
663 
664             while(corresponding < i)
665             {
666                 changed[--start] = true;
667                 changed[--i] = false;
668                 while(other_changed[--j])
669                     continue;
670             }
671         }
672     }
673 }
674 
675 /* Cons an additional entry onto the front of an edit script OLD.
676    LINE0 and LINE1 are the first affected lines in the two files (origin 0).
677    DELETED is the number of lines deleted here from file 0.
678    INSERTED is the number of lines inserted here in file 1.
679 
680    If DELETED is 0 then LINE0 is the number of the line before
681    which the insertion was done; vice versa for INSERTED and LINE1.  */
682 
add_change(GNULineRef line0,GNULineRef line1,GNULineRef deleted,GNULineRef inserted,change * old)683 GnuDiff::change *GnuDiff::add_change(GNULineRef line0, GNULineRef line1, GNULineRef deleted, GNULineRef inserted, change *old)
684 {
685     change *newChange = (change *)xmalloc(sizeof(*newChange));
686 
687     newChange->line0 = line0;
688     newChange->line1 = line1;
689     newChange->inserted = inserted;
690     newChange->deleted = deleted;
691     newChange->link = old;
692     return newChange;
693 }
694 
695 /* Scan the tables of which lines are inserted and deleted,
696    producing an edit script in reverse order.  */
697 
build_reverse_script(file_data const filevec[])698 GnuDiff::change *GnuDiff::build_reverse_script(file_data const filevec[])
699 {
700     change *script = nullptr;
701     bool *changed0 = filevec[0].changed;
702     bool *changed1 = filevec[1].changed;
703     GNULineRef len0 = filevec[0].buffered_lines;
704     GNULineRef len1 = filevec[1].buffered_lines;
705 
706     /* Note that changedN[len0] does exist, and is 0.  */
707 
708     GNULineRef i0 = 0, i1 = 0;
709 
710     while(i0 < len0 || i1 < len1)
711     {
712         if(changed0[i0] | changed1[i1])
713         {
714             GNULineRef line0 = i0, line1 = i1;
715 
716             /* Find # lines changed here in each file.  */
717             while(changed0[i0]) ++i0;
718             while(changed1[i1]) ++i1;
719 
720             /* Record this change.  */
721             script = add_change(line0, line1, i0 - line0, i1 - line1, script);
722         }
723 
724         /* We have reached lines in the two files that match each other.  */
725         i0++, i1++;
726     }
727 
728     return script;
729 }
730 
731 /* Scan the tables of which lines are inserted and deleted,
732    producing an edit script in forward order.  */
733 
build_script(file_data const filevec[])734 GnuDiff::change *GnuDiff::build_script(file_data const filevec[])
735 {
736     change *script = nullptr;
737     bool *changed0 = filevec[0].changed;
738     bool *changed1 = filevec[1].changed;
739     GNULineRef i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
740 
741     /* Note that changedN[-1] does exist, and is 0.  */
742 
743     while(i0 >= 0 || i1 >= 0)
744     {
745         if(changed0[i0 - 1] | changed1[i1 - 1])
746         {
747             GNULineRef line0 = i0, line1 = i1;
748 
749             /* Find # lines changed here in each file.  */
750             while(changed0[i0 - 1]) --i0;
751             while(changed1[i1 - 1]) --i1;
752 
753             /* Record this change.  */
754             script = add_change(i0, i1, line0 - i0, line1 - i1, script);
755         }
756 
757         /* We have reached lines in the two files that match each other.  */
758         i0--, i1--;
759     }
760 
761     return script;
762 }
763 
764 /* Report the differences of two files.  */
diff_2_files(comparison * cmp)765 GnuDiff::change *GnuDiff::diff_2_files(comparison *cmp)
766 {
767     GNULineRef diags;
768     int f;
769     change *script;
770 
771     read_files(cmp->file, files_can_be_treated_as_binary);
772 
773     {
774         /* Allocate vectors for the results of comparison:
775      a flag for each line of each file, saying whether that line
776      is an insertion or deletion.
777      Allocate an extra element, always 0, at each end of each vector.  */
778 
779         size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
780         bool *flag_space = (bool *)zalloc(s * sizeof(*flag_space));
781         cmp->file[0].changed = flag_space + 1;
782         cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
783 
784         /* Some lines are obviously insertions or deletions
785      because they don't match anything.  Detect them now, and
786      avoid even thinking about them in the main comparison algorithm.  */
787 
788         discard_confusing_lines(cmp->file);
789 
790         /* Now do the main comparison algorithm, considering just the
791      undiscarded lines.  */
792 
793         xvec = cmp->file[0].undiscarded;
794         yvec = cmp->file[1].undiscarded;
795         diags = (cmp->file[0].nondiscarded_lines + cmp->file[1].nondiscarded_lines + 3);
796         fdiag = (GNULineRef *)xmalloc(diags * (2 * sizeof(*fdiag)));
797         bdiag = fdiag + diags;
798         fdiag += cmp->file[1].nondiscarded_lines + 1;
799         bdiag += cmp->file[1].nondiscarded_lines + 1;
800 
801         /* Set TOO_EXPENSIVE to be approximate square root of input size,
802      bounded below by 256.  */
803         too_expensive = 1;
804         for(; diags != 0; diags >>= 2)
805             too_expensive <<= 1;
806         too_expensive = std::max((GNULineRef)256, too_expensive);
807 
808         files[0] = cmp->file[0];
809         files[1] = cmp->file[1];
810 
811         compareseq(0, cmp->file[0].nondiscarded_lines,
812                    0, cmp->file[1].nondiscarded_lines, minimal);
813 
814         free(fdiag - (cmp->file[1].nondiscarded_lines + 1));
815 
816         /* Modify the results slightly to make them prettier
817      in cases where that can validly be done.  */
818 
819         shift_boundaries(cmp->file);
820 
821         /* Get the results of comparison in the form of a chain
822          of `change's -- an edit script.  */
823 
824         script = build_script(cmp->file);
825 
826         free(cmp->file[0].undiscarded);
827 
828         free(flag_space);
829 
830         for(f = 0; f < 2; ++f)
831         {
832             free(cmp->file[f].equivs);
833             free(cmp->file[f].linbuf + cmp->file[f].linbuf_base);
834         }
835     }
836 
837     return script;
838 }
839