1 /* Analyze file differences for GNU DIFF.
2
3 Modified for KDiff3 by Joachim Eibl <joachim.eibl at gmx.de> 2003.
4 The original file was part of GNU DIFF.
5
6
7 Part of KDiff3 - Text Diff And Merge Tool
8
9 SPDX-FileCopyrightText: 1988-2002 Free Software Foundation, Inc.
10 SPDX-FileCopyrightText: 2002-2011 Joachim Eibl, joachim.eibl at gmx.de
11 SPDX-FileCopyrightText: 2018-2020 Michael Reeves reeves.87@gmail.com
12 SPDX-License-Identifier: GPL-2.0-or-later
13 */
14
15 /* The basic algorithm is described in:
16 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
17 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
18 see especially section 4.2, which describes the variation used below.
19 Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
20 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
21 at the price of producing suboptimal output for large inputs with
22 many differences.
23
24 The basic algorithm was independently discovered as described in:
25 "Algorithms for Approximate String Matching", E. Ukkonen,
26 Information and Control Vol. 64, 1985, pp. 100-118. */
27
28 #define GDIFF_MAIN
29
30 #include "gnudiff_diff.h"
31
32 #include <algorithm> // for max, min
33 #include <stdlib.h>
34
35
36 static GNULineRef *xvec, *yvec; /* Vectors being compared. */
37 static GNULineRef *fdiag; /* Vector, indexed by diagonal, containing
38 1 + the X coordinate of the point furthest
39 along the given diagonal in the forward
40 search of the edit matrix. */
41 static GNULineRef *bdiag; /* Vector, indexed by diagonal, containing
42 the X coordinate of the point furthest
43 along the given diagonal in the backward
44 search of the edit matrix. */
45 static GNULineRef too_expensive; /* Edit scripts longer than this are too
46 expensive to compute. */
47
48 #define SNAKE_LIMIT 20 /* Snakes bigger than this are considered `big'. */
49
50 struct partition {
51 GNULineRef xmid, ymid; /* Midpoints of this partition. */
52 bool lo_minimal; /* Nonzero if low half will be analyzed minimally. */
53 bool hi_minimal; /* Likewise for high half. */
54 };
55
56 /* Find the midpoint of the shortest edit script for a specified
57 portion of the two files.
58
59 Scan from the beginnings of the files, and simultaneously from the ends,
60 doing a breadth-first search through the space of edit-sequence.
61 When the two searches meet, we have found the midpoint of the shortest
62 edit sequence.
63
64 If FIND_MINIMAL is nonzero, find the minimal edit script regardless
65 of expense. Otherwise, if the search is too expensive, use
66 heuristics to stop the search and report a suboptimal answer.
67
68 Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
69 XMID - YMID equals the number of inserted lines minus the number
70 of deleted lines (counting only lines before the midpoint).
71 Return the approximate edit cost; this is the total number of
72 lines inserted or deleted (counting only lines before the midpoint),
73 unless a heuristic is used to terminate the search prematurely.
74
75 Set PART->lo_minimal to true iff the minimal edit script for the
76 left half of the partition is known; similarly for PART->hi_minimal.
77
78 This function assumes that the first lines of the specified portions
79 of the two files do not match, and likewise that the last lines do not
80 match. The caller must trim matching lines from the beginning and end
81 of the portions it is going to specify.
82
83 If we return the "wrong" partitions,
84 the worst this can do is cause suboptimal diff output.
85 It cannot cause incorrect diff output. */
86
diag(GNULineRef xoff,GNULineRef xlim,GNULineRef yoff,GNULineRef ylim,bool find_minimal,partition * part) const87 GNULineRef GnuDiff::diag(GNULineRef xoff, GNULineRef xlim, GNULineRef yoff, GNULineRef ylim, bool find_minimal,
88 partition *part) const
89 {
90 GNULineRef *const fd = fdiag; /* Give the compiler a chance. */
91 GNULineRef *const bd = bdiag; /* Additional help for the compiler. */
92 GNULineRef const *const xv = xvec; /* Still more help for the compiler. */
93 GNULineRef const *const yv = yvec; /* And more and more . . . */
94 GNULineRef const dmin = xoff - ylim; /* Minimum valid diagonal. */
95 GNULineRef const dmax = xlim - yoff; /* Maximum valid diagonal. */
96 GNULineRef const fmid = xoff - yoff; /* Center diagonal of top-down search. */
97 GNULineRef const bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
98 GNULineRef fmin = fmid, fmax = fmid; /* Limits of top-down search. */
99 GNULineRef bmin = bmid, bmax = bmid; /* Limits of bottom-up search. */
100 GNULineRef c; /* Cost. */
101 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
102 diagonal with respect to the northwest. */
103
104 fd[fmid] = xoff;
105 bd[bmid] = xlim;
106
107 for(c = 1;; ++c)
108 {
109 GNULineRef d; /* Active diagonal. */
110 bool big_snake = false;
111
112 /* Extend the top-down search by an edit step in each diagonal. */
113 fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
114 fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
115 for(d = fmax; d >= fmin; d -= 2)
116 {
117 GNULineRef x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
118
119 if(tlo >= thi)
120 x = tlo + 1;
121 else
122 x = thi;
123 oldx = x;
124 y = x - d;
125 while(x < xlim && y < ylim && xv[x] == yv[y])
126 ++x, ++y;
127 if(x - oldx > SNAKE_LIMIT)
128 big_snake = true;
129 fd[d] = x;
130 if(odd && bmin <= d && d <= bmax && bd[d] <= x)
131 {
132 part->xmid = x;
133 part->ymid = y;
134 part->lo_minimal = part->hi_minimal = true;
135 return 2 * c - 1;
136 }
137 }
138
139 /* Similarly extend the bottom-up search. */
140 bmin > dmin ? bd[--bmin - 1] = GNULINEREF_MAX : ++bmin;
141 bmax < dmax ? bd[++bmax + 1] = GNULINEREF_MAX : --bmax;
142 for(d = bmax; d >= bmin; d -= 2)
143 {
144 GNULineRef x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
145
146 if(tlo < thi)
147 x = tlo;
148 else
149 x = thi - 1;
150 oldx = x;
151 y = x - d;
152 while(x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
153 --x, --y;
154 if(oldx - x > SNAKE_LIMIT)
155 big_snake = true;
156 bd[d] = x;
157 if(!odd && fmin <= d && d <= fmax && x <= fd[d])
158 {
159 part->xmid = x;
160 part->ymid = y;
161 part->lo_minimal = part->hi_minimal = true;
162 return 2 * c;
163 }
164 }
165
166 if(find_minimal)
167 continue;
168
169 /* Heuristic: check occasionally for a diagonal that has made
170 lots of progress compared with the edit distance.
171 If we have any such, find the one that has made the most
172 progress and return it as if it had succeeded.
173
174 With this heuristic, for files with a constant small density
175 of changes, the algorithm is linear in the file size. */
176
177 if(200 < c && big_snake && speed_large_files)
178 {
179 GNULineRef best;
180
181 best = 0;
182 for(d = fmax; d >= fmin; d -= 2)
183 {
184 GNULineRef dd = d - fmid;
185 GNULineRef x = fd[d];
186 GNULineRef y = x - d;
187 GNULineRef v = (x - xoff) * 2 - dd;
188 if(v > 12 * (c + (dd < 0 ? -dd : dd)))
189 {
190 if(v > best && xoff + SNAKE_LIMIT <= x && x < xlim && yoff + SNAKE_LIMIT <= y && y < ylim)
191 {
192 /* We have a good enough best diagonal;
193 now insist that it end with a significant snake. */
194 int k;
195
196 for(k = 1; xv[x - k] == yv[y - k]; k++)
197 if(k == SNAKE_LIMIT)
198 {
199 best = v;
200 part->xmid = x;
201 part->ymid = y;
202 break;
203 }
204 }
205 }
206 }
207 if(best > 0)
208 {
209 part->lo_minimal = true;
210 part->hi_minimal = false;
211 return 2 * c - 1;
212 }
213
214 best = 0;
215 for(d = bmax; d >= bmin; d -= 2)
216 {
217 GNULineRef dd = d - bmid;
218 GNULineRef x = bd[d];
219 GNULineRef y = x - d;
220 GNULineRef v = (xlim - x) * 2 + dd;
221 if(v > 12 * (c + (dd < 0 ? -dd : dd)))
222 {
223 if(v > best && xoff < x && x <= xlim - SNAKE_LIMIT && yoff < y && y <= ylim - SNAKE_LIMIT)
224 {
225 /* We have a good enough best diagonal;
226 now insist that it end with a significant snake. */
227 int k;
228
229 for(k = 0; xv[x + k] == yv[y + k]; k++)
230 if(k == SNAKE_LIMIT - 1)
231 {
232 best = v;
233 part->xmid = x;
234 part->ymid = y;
235 break;
236 }
237 }
238 }
239 }
240 if(best > 0)
241 {
242 part->lo_minimal = false;
243 part->hi_minimal = true;
244 return 2 * c - 1;
245 }
246 }
247
248 /* Heuristic: if we've gone well beyond the call of duty,
249 give up and report halfway between our best results so far. */
250 if(c >= too_expensive)
251 {
252 GNULineRef fxybest, fxbest;
253 GNULineRef bxybest, bxbest;
254
255 fxbest = bxbest = 0; /* Pacify `gcc -Wall'. */
256
257 /* Find forward diagonal that maximizes X + Y. */
258 fxybest = -1;
259 for(d = fmax; d >= fmin; d -= 2)
260 {
261 GNULineRef x = std::min(fd[d], xlim);
262 GNULineRef y = x - d;
263 if(ylim < y)
264 x = ylim + d, y = ylim;
265 if(fxybest < x + y)
266 {
267 fxybest = x + y;
268 fxbest = x;
269 }
270 }
271
272 /* Find backward diagonal that minimizes X + Y. */
273 bxybest = GNULINEREF_MAX;
274 for(d = bmax; d >= bmin; d -= 2)
275 {
276 GNULineRef x = std::max(xoff, bd[d]);
277 GNULineRef y = x - d;
278 if(y < yoff)
279 x = yoff + d, y = yoff;
280 if(x + y < bxybest)
281 {
282 bxybest = x + y;
283 bxbest = x;
284 }
285 }
286
287 /* Use the better of the two diagonals. */
288 if((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
289 {
290 part->xmid = fxbest;
291 part->ymid = fxybest - fxbest;
292 part->lo_minimal = true;
293 part->hi_minimal = false;
294 }
295 else
296 {
297 part->xmid = bxbest;
298 part->ymid = bxybest - bxbest;
299 part->lo_minimal = false;
300 part->hi_minimal = true;
301 }
302 return 2 * c - 1;
303 }
304 }
305 }
306
307 /* Compare in detail contiguous subsequences of the two files
308 which are known, as a whole, to match each other.
309
310 The results are recorded in the vectors files[N].changed, by
311 storing 1 in the element for each line that is an insertion or deletion.
312
313 The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
314
315 Note that XLIM, YLIM are exclusive bounds.
316 All line numbers are origin-0 and discarded lines are not counted.
317
318 If FIND_MINIMAL, find a minimal difference no matter how
319 expensive it is. */
320
compareseq(GNULineRef xoff,GNULineRef xlim,GNULineRef yoff,GNULineRef ylim,bool find_minimal)321 void GnuDiff::compareseq(GNULineRef xoff, GNULineRef xlim, GNULineRef yoff, GNULineRef ylim, bool find_minimal)
322 {
323 GNULineRef *const xv = xvec; /* Help the compiler. */
324 GNULineRef *const yv = yvec;
325
326 /* Slide down the bottom initial diagonal. */
327 while(xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
328 ++xoff, ++yoff;
329 /* Slide up the top initial diagonal. */
330 while(xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
331 --xlim, --ylim;
332
333 /* Handle simple cases. */
334 if(xoff == xlim)
335 while(yoff < ylim)
336 files[1].changed[files[1].realindexes[yoff++]] = true;
337 else if(yoff == ylim)
338 while(xoff < xlim)
339 files[0].changed[files[0].realindexes[xoff++]] = true;
340 else
341 {
342 GNULineRef c;
343 partition part;
344
345 /* Find a point of correspondence in the middle of the files. */
346
347 c = diag(xoff, xlim, yoff, ylim, find_minimal, &part);
348
349 /* This should be impossible, because it implies that
350 one of the two subsequences is empty,
351 and that case was handled above without calling `diag'. */
352 Q_ASSERT(c != 1);
353
354 /* Use the partitions to split this problem into subproblems. */
355 compareseq(xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
356 compareseq(part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
357 }
358 }
359
360 /* Discard lines from one file that have no matches in the other file.
361
362 A line which is discarded will not be considered by the actual
363 comparison algorithm; it will be as if that line were not in the file.
364 The file's `realindexes' table maps virtual line numbers
365 (which don't count the discarded lines) into real line numbers;
366 this is how the actual comparison algorithm produces results
367 that are comprehensible when the discarded lines are counted.
368
369 When we discard a line, we also mark it as a deletion or insertion
370 so that it will be printed in the output. */
371
discard_confusing_lines(file_data filevec[])372 void GnuDiff::discard_confusing_lines(file_data filevec[])
373 {
374 int f;
375 GNULineRef i;
376 char *discarded[2];
377 GNULineRef *equiv_count[2];
378 GNULineRef *p;
379
380 /* Allocate our results. */
381 p = (GNULineRef *)xmalloc((filevec[0].buffered_lines + filevec[1].buffered_lines) * (2 * sizeof(*p)));
382 for(f = 0; f < 2; ++f)
383 {
384 filevec[f].undiscarded = p;
385 p += filevec[f].buffered_lines;
386 filevec[f].realindexes = p;
387 p += filevec[f].buffered_lines;
388 }
389
390 /* Set up equiv_count[F][I] as the number of lines in file F
391 that fall in equivalence class I. */
392
393 p = (GNULineRef *)zalloc(filevec[0].equiv_max * (2 * sizeof(*p)));
394 equiv_count[0] = p;
395 equiv_count[1] = p + filevec[0].equiv_max;
396
397 for(i = 0; i < filevec[0].buffered_lines; ++i)
398 ++equiv_count[0][filevec[0].equivs[i]];
399 for(i = 0; i < filevec[1].buffered_lines; ++i)
400 ++equiv_count[1][filevec[1].equivs[i]];
401
402 /* Set up tables of which lines are going to be discarded. */
403
404 discarded[0] = (char *)zalloc(filevec[0].buffered_lines + filevec[1].buffered_lines);
405 discarded[1] = discarded[0] + filevec[0].buffered_lines;
406
407 /* Mark to be discarded each line that matches no line of the other file.
408 If a line matches many lines, mark it as provisionally discardable. */
409
410 for(f = 0; f < 2; ++f)
411 {
412 size_t end = filevec[f].buffered_lines;
413 char *discards = discarded[f];
414 GNULineRef *counts = equiv_count[1 - f];
415 GNULineRef *equivs = filevec[f].equivs;
416 size_t many = 5;
417 size_t tem = end / 64;
418
419 /* Multiply MANY by approximate square root of number of lines.
420 That is the threshold for provisionally discardable lines. */
421 while((tem = tem >> 2) > 0)
422 many *= 2;
423
424 for(i = 0; i < (GNULineRef)end; ++i)
425 {
426 GNULineRef nmatch;
427 if(equivs[i] == 0)
428 continue;
429 nmatch = counts[equivs[i]];
430 if(nmatch == 0)
431 discards[i] = 1;
432 else if(nmatch > (GNULineRef)many)
433 discards[i] = 2;
434 }
435 }
436
437 /* Don't really discard the provisional lines except when they occur
438 in a run of discardables, with nonprovisionals at the beginning
439 and end. */
440
441 for(f = 0; f < 2; ++f)
442 {
443 GNULineRef end = filevec[f].buffered_lines;
444 char *discards = discarded[f];
445
446 for(i = 0; i < end; ++i)
447 {
448 /* Cancel provisional discards not in middle of run of discards. */
449 if(discards[i] == 2)
450 discards[i] = 0;
451 else if(discards[i] != 0)
452 {
453 /* We have found a nonprovisional discard. */
454 GNULineRef j;
455 GNULineRef length;
456 GNULineRef provisional = 0;
457
458 /* Find end of this run of discardable lines.
459 Count how many are provisionally discardable. */
460 for(j = i; j < end; ++j)
461 {
462 if(discards[j] == 0)
463 break;
464 if(discards[j] == 2)
465 ++provisional;
466 }
467
468 /* Cancel provisional discards at end, and shrink the run. */
469 while(j > i && discards[j - 1] == 2)
470 discards[--j] = 0, --provisional;
471
472 /* Now we have the length of a run of discardable lines
473 whose first and last are not provisional. */
474 length = j - i;
475
476 /* If 1/4 of the lines in the run are provisional,
477 cancel discarding of all provisional lines in the run. */
478 if(provisional * 4 > length)
479 {
480 while(j > i)
481 if(discards[--j] == 2)
482 discards[j] = 0;
483 }
484 else
485 {
486 GNULineRef consec;
487 GNULineRef minimum = 1;
488 GNULineRef tem = length >> 2;
489
490 /* MINIMUM is approximate square root of LENGTH/4.
491 A subrun of two or more provisionals can stand
492 when LENGTH is at least 16.
493 A subrun of 4 or more can stand when LENGTH >= 64. */
494 while(0 < (tem >>= 2))
495 minimum <<= 1;
496 minimum++;
497
498 /* Cancel any subrun of MINIMUM or more provisionals
499 within the larger run. */
500 for(j = 0, consec = 0; j < length; ++j)
501 if(discards[i + j] != 2)
502 consec = 0;
503 else if(minimum == ++consec)
504 /* Back up to start of subrun, to cancel it all. */
505 j -= consec;
506 else if(minimum < consec)
507 discards[i + j] = 0;
508
509 /* Scan from beginning of run
510 until we find 3 or more nonprovisionals in a row
511 or until the first nonprovisional at least 8 lines in.
512 Until that point, cancel any provisionals. */
513 for(j = 0, consec = 0; j < length; ++j)
514 {
515 if(j >= 8 && discards[i + j] == 1)
516 break;
517 if(discards[i + j] == 2)
518 consec = 0, discards[i + j] = 0;
519 else if(discards[i + j] == 0)
520 consec = 0;
521 else
522 consec++;
523 if(consec == 3)
524 break;
525 }
526
527 /* I advances to the last line of the run. */
528 i += length - 1;
529
530 /* Same thing, from end. */
531 for(j = 0, consec = 0; j < length; ++j)
532 {
533 if(j >= 8 && discards[i - j] == 1)
534 break;
535 if(discards[i - j] == 2)
536 consec = 0, discards[i - j] = 0;
537 else if(discards[i - j] == 0)
538 consec = 0;
539 else
540 consec++;
541 if(consec == 3)
542 break;
543 }
544 }
545 }
546 }
547 }
548
549 /* Actually discard the lines. */
550 for(f = 0; f < 2; ++f)
551 {
552 char *discards = discarded[f];
553 GNULineRef end = filevec[f].buffered_lines;
554 GNULineRef j = 0;
555 for(i = 0; i < end; ++i)
556 if(minimal || discards[i] == 0)
557 {
558 filevec[f].undiscarded[j] = filevec[f].equivs[i];
559 filevec[f].realindexes[j++] = i;
560 }
561 else
562 filevec[f].changed[i] = true;
563 filevec[f].nondiscarded_lines = j;
564 }
565
566 free(discarded[0]);
567 free(equiv_count[0]);
568 }
569
570 /* Adjust inserts/deletes of identical lines to join changes
571 as much as possible.
572
573 We do something when a run of changed lines include a
574 line at one end and have an excluded, identical line at the other.
575 We are free to choose which identical line is included.
576 `compareseq' usually chooses the one at the beginning,
577 but usually it is cleaner to consider the following identical line
578 to be the "change". */
579
shift_boundaries(file_data filevec[])580 void GnuDiff::shift_boundaries(file_data filevec[])
581 {
582 int f;
583
584 for(f = 0; f < 2; ++f)
585 {
586 bool *changed = filevec[f].changed;
587 bool const *other_changed = filevec[1 - f].changed;
588 GNULineRef const *equivs = filevec[f].equivs;
589 GNULineRef i = 0;
590 GNULineRef j = 0;
591 GNULineRef i_end = filevec[f].buffered_lines;
592
593 while(true)
594 {
595 GNULineRef runlength, start, corresponding;
596
597 /* Scan forwards to find beginning of another run of changes.
598 Also keep track of the corresponding point in the other file. */
599
600 while(i < i_end && !changed[i])
601 {
602 while(other_changed[j++])
603 continue;
604 i++;
605 }
606
607 if(i == i_end)
608 break;
609
610 start = i;
611
612 /* Find the end of this run of changes. */
613
614 while(changed[++i])
615 continue;
616 while(other_changed[j])
617 j++;
618
619 do
620 {
621 /* Record the length of this run of changes, so that
622 we can later determine whether the run has grown. */
623 runlength = i - start;
624
625 /* Move the changed region back, so long as the
626 previous unchanged line matches the last changed one.
627 This merges with previous changed regions. */
628
629 while(start && equivs[start - 1] == equivs[i - 1])
630 {
631 changed[--start] = true;
632 changed[--i] = false;
633 while(changed[start - 1])
634 start--;
635 while(other_changed[--j])
636 continue;
637 }
638
639 /* Set CORRESPONDING to the end of the changed run, at the last
640 point where it corresponds to a changed run in the other file.
641 CORRESPONDING == I_END means no such point has been found. */
642 corresponding = other_changed[j - 1] ? i : i_end;
643
644 /* Move the changed region forward, so long as the
645 first changed line matches the following unchanged one.
646 This merges with following changed regions.
647 Do this second, so that if there are no merges,
648 the changed region is moved forward as far as possible. */
649
650 while(i != i_end && equivs[start] == equivs[i])
651 {
652 changed[start++] = false;
653 changed[i++] = true;
654 while(changed[i])
655 i++;
656 while(other_changed[++j])
657 corresponding = i;
658 }
659 } while(runlength != i - start);
660
661 /* If possible, move the fully-merged run of changes
662 back to a corresponding run in the other file. */
663
664 while(corresponding < i)
665 {
666 changed[--start] = true;
667 changed[--i] = false;
668 while(other_changed[--j])
669 continue;
670 }
671 }
672 }
673 }
674
675 /* Cons an additional entry onto the front of an edit script OLD.
676 LINE0 and LINE1 are the first affected lines in the two files (origin 0).
677 DELETED is the number of lines deleted here from file 0.
678 INSERTED is the number of lines inserted here in file 1.
679
680 If DELETED is 0 then LINE0 is the number of the line before
681 which the insertion was done; vice versa for INSERTED and LINE1. */
682
add_change(GNULineRef line0,GNULineRef line1,GNULineRef deleted,GNULineRef inserted,change * old)683 GnuDiff::change *GnuDiff::add_change(GNULineRef line0, GNULineRef line1, GNULineRef deleted, GNULineRef inserted, change *old)
684 {
685 change *newChange = (change *)xmalloc(sizeof(*newChange));
686
687 newChange->line0 = line0;
688 newChange->line1 = line1;
689 newChange->inserted = inserted;
690 newChange->deleted = deleted;
691 newChange->link = old;
692 return newChange;
693 }
694
695 /* Scan the tables of which lines are inserted and deleted,
696 producing an edit script in reverse order. */
697
build_reverse_script(file_data const filevec[])698 GnuDiff::change *GnuDiff::build_reverse_script(file_data const filevec[])
699 {
700 change *script = nullptr;
701 bool *changed0 = filevec[0].changed;
702 bool *changed1 = filevec[1].changed;
703 GNULineRef len0 = filevec[0].buffered_lines;
704 GNULineRef len1 = filevec[1].buffered_lines;
705
706 /* Note that changedN[len0] does exist, and is 0. */
707
708 GNULineRef i0 = 0, i1 = 0;
709
710 while(i0 < len0 || i1 < len1)
711 {
712 if(changed0[i0] | changed1[i1])
713 {
714 GNULineRef line0 = i0, line1 = i1;
715
716 /* Find # lines changed here in each file. */
717 while(changed0[i0]) ++i0;
718 while(changed1[i1]) ++i1;
719
720 /* Record this change. */
721 script = add_change(line0, line1, i0 - line0, i1 - line1, script);
722 }
723
724 /* We have reached lines in the two files that match each other. */
725 i0++, i1++;
726 }
727
728 return script;
729 }
730
731 /* Scan the tables of which lines are inserted and deleted,
732 producing an edit script in forward order. */
733
build_script(file_data const filevec[])734 GnuDiff::change *GnuDiff::build_script(file_data const filevec[])
735 {
736 change *script = nullptr;
737 bool *changed0 = filevec[0].changed;
738 bool *changed1 = filevec[1].changed;
739 GNULineRef i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
740
741 /* Note that changedN[-1] does exist, and is 0. */
742
743 while(i0 >= 0 || i1 >= 0)
744 {
745 if(changed0[i0 - 1] | changed1[i1 - 1])
746 {
747 GNULineRef line0 = i0, line1 = i1;
748
749 /* Find # lines changed here in each file. */
750 while(changed0[i0 - 1]) --i0;
751 while(changed1[i1 - 1]) --i1;
752
753 /* Record this change. */
754 script = add_change(i0, i1, line0 - i0, line1 - i1, script);
755 }
756
757 /* We have reached lines in the two files that match each other. */
758 i0--, i1--;
759 }
760
761 return script;
762 }
763
764 /* Report the differences of two files. */
diff_2_files(comparison * cmp)765 GnuDiff::change *GnuDiff::diff_2_files(comparison *cmp)
766 {
767 GNULineRef diags;
768 int f;
769 change *script;
770
771 read_files(cmp->file, files_can_be_treated_as_binary);
772
773 {
774 /* Allocate vectors for the results of comparison:
775 a flag for each line of each file, saying whether that line
776 is an insertion or deletion.
777 Allocate an extra element, always 0, at each end of each vector. */
778
779 size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
780 bool *flag_space = (bool *)zalloc(s * sizeof(*flag_space));
781 cmp->file[0].changed = flag_space + 1;
782 cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
783
784 /* Some lines are obviously insertions or deletions
785 because they don't match anything. Detect them now, and
786 avoid even thinking about them in the main comparison algorithm. */
787
788 discard_confusing_lines(cmp->file);
789
790 /* Now do the main comparison algorithm, considering just the
791 undiscarded lines. */
792
793 xvec = cmp->file[0].undiscarded;
794 yvec = cmp->file[1].undiscarded;
795 diags = (cmp->file[0].nondiscarded_lines + cmp->file[1].nondiscarded_lines + 3);
796 fdiag = (GNULineRef *)xmalloc(diags * (2 * sizeof(*fdiag)));
797 bdiag = fdiag + diags;
798 fdiag += cmp->file[1].nondiscarded_lines + 1;
799 bdiag += cmp->file[1].nondiscarded_lines + 1;
800
801 /* Set TOO_EXPENSIVE to be approximate square root of input size,
802 bounded below by 256. */
803 too_expensive = 1;
804 for(; diags != 0; diags >>= 2)
805 too_expensive <<= 1;
806 too_expensive = std::max((GNULineRef)256, too_expensive);
807
808 files[0] = cmp->file[0];
809 files[1] = cmp->file[1];
810
811 compareseq(0, cmp->file[0].nondiscarded_lines,
812 0, cmp->file[1].nondiscarded_lines, minimal);
813
814 free(fdiag - (cmp->file[1].nondiscarded_lines + 1));
815
816 /* Modify the results slightly to make them prettier
817 in cases where that can validly be done. */
818
819 shift_boundaries(cmp->file);
820
821 /* Get the results of comparison in the form of a chain
822 of `change's -- an edit script. */
823
824 script = build_script(cmp->file);
825
826 free(cmp->file[0].undiscarded);
827
828 free(flag_space);
829
830 for(f = 0; f < 2; ++f)
831 {
832 free(cmp->file[f].equivs);
833 free(cmp->file[f].linbuf + cmp->file[f].linbuf_base);
834 }
835 }
836
837 return script;
838 }
839