1 /* Analyze file differences for GNU DIFF.
2 Copyright (C) 1988, 1989, 1992, 1993, 1997 Free Software Foundation, Inc.
3
4 This file is part of GNU DIFF.
5
6 GNU DIFF is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 GNU DIFF is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 */
17
18 /* The basic algorithm is described in:
19 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
20 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
21 see especially section 4.2, which describes the variation used below.
22 Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
23 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
24 at the price of producing suboptimal output for large inputs with
25 many differences.
26
27 The basic algorithm was independently discovered as described in:
28 "Algorithms for Approximate String Matching", E. Ukkonen,
29 Information and Control Vol. 64, 1985, pp. 100-118. */
30
31 #include "diff.h"
32 #include "cmpbuf.h"
33
34 extern int no_discards;
35
36 static int *xvec, *yvec; /* Vectors being compared. */
37 static int *fdiag; /* Vector, indexed by diagonal, containing
38 1 + the X coordinate of the point furthest
39 along the given diagonal in the forward
40 search of the edit matrix. */
41 static int *bdiag; /* Vector, indexed by diagonal, containing
42 the X coordinate of the point furthest
43 along the given diagonal in the backward
44 search of the edit matrix. */
45 static int too_expensive; /* Edit scripts longer than this are too
46 expensive to compute. */
47
48 #define SNAKE_LIMIT 20 /* Snakes bigger than this are considered `big'. */
49
50 struct partition
51 {
52 int xmid, ymid; /* Midpoints of this partition. */
53 int lo_minimal; /* Nonzero if low half will be analyzed minimally. */
54 int hi_minimal; /* Likewise for high half. */
55 };
56
57 static int diag PARAMS((int, int, int, int, int, struct partition *));
58 static struct change *add_change PARAMS((int, int, int, int, struct change *));
59 static struct change *build_reverse_script PARAMS((struct file_data const[]));
60 static struct change *build_script PARAMS((struct file_data const[]));
61 static void briefly_report PARAMS((int, struct file_data const[]));
62 static void compareseq PARAMS((int, int, int, int, int));
63 static void discard_confusing_lines PARAMS((struct file_data[]));
64 static void shift_boundaries PARAMS((struct file_data[]));
65
66 /* Find the midpoint of the shortest edit script for a specified
67 portion of the two files.
68
69 Scan from the beginnings of the files, and simultaneously from the ends,
70 doing a breadth-first search through the space of edit-sequence.
71 When the two searches meet, we have found the midpoint of the shortest
72 edit sequence.
73
74 If MINIMAL is nonzero, find the minimal edit script regardless
75 of expense. Otherwise, if the search is too expensive, use
76 heuristics to stop the search and report a suboptimal answer.
77
78 Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal number
79 XMID - YMID equals the number of inserted lines minus the number
80 of deleted lines (counting only lines before the midpoint).
81 Return the approximate edit cost; this is the total number of
82 lines inserted or deleted (counting only lines before the midpoint),
83 unless a heuristic is used to terminate the search prematurely.
84
85 Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script for the
86 left half of the partition is known; similarly for PART->RIGHT_MINIMAL.
87
88 This function assumes that the first lines of the specified portions
89 of the two files do not match, and likewise that the last lines do not
90 match. The caller must trim matching lines from the beginning and end
91 of the portions it is going to specify.
92
93 If we return the "wrong" partitions,
94 the worst this can do is cause suboptimal diff output.
95 It cannot cause incorrect diff output. */
96
97 static int
diag(xoff,xlim,yoff,ylim,minimal,part)98 diag (xoff, xlim, yoff, ylim, minimal, part)
99 int xoff, xlim, yoff, ylim, minimal;
100 struct partition *part;
101 {
102 int *const fd = fdiag; /* Give the compiler a chance. */
103 int *const bd = bdiag; /* Additional help for the compiler. */
104 int const *const xv = xvec; /* Still more help for the compiler. */
105 int const *const yv = yvec; /* And more and more . . . */
106 int const dmin = xoff - ylim; /* Minimum valid diagonal. */
107 int const dmax = xlim - yoff; /* Maximum valid diagonal. */
108 int const fmid = xoff - yoff; /* Center diagonal of top-down search. */
109 int const bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
110 int fmin = fmid, fmax = fmid; /* Limits of top-down search. */
111 int bmin = bmid, bmax = bmid; /* Limits of bottom-up search. */
112 int c; /* Cost. */
113 int odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
114 diagonal with respect to the northwest. */
115
116 fd[fmid] = xoff;
117 bd[bmid] = xlim;
118
119 for (c = 1;; ++c)
120 {
121 int d; /* Active diagonal. */
122 int big_snake = 0;
123
124 /* Extend the top-down search by an edit step in each diagonal. */
125 fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
126 fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
127 for (d = fmax; d >= fmin; d -= 2)
128 {
129 int x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
130
131 if (tlo >= thi)
132 x = tlo + 1;
133 else
134 x = thi;
135 oldx = x;
136 y = x - d;
137 while (x < xlim && y < ylim && xv[x] == yv[y])
138 ++x, ++y;
139 if (x - oldx > SNAKE_LIMIT)
140 big_snake = 1;
141 fd[d] = x;
142 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
143 {
144 part->xmid = x;
145 part->ymid = y;
146 part->lo_minimal = part->hi_minimal = 1;
147 return 2 * c - 1;
148 }
149 }
150
151 /* Similarly extend the bottom-up search. */
152 bmin > dmin ? bd[--bmin - 1] = INT_MAX : ++bmin;
153 bmax < dmax ? bd[++bmax + 1] = INT_MAX : --bmax;
154 for (d = bmax; d >= bmin; d -= 2)
155 {
156 int x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
157
158 if (tlo < thi)
159 x = tlo;
160 else
161 x = thi - 1;
162 oldx = x;
163 y = x - d;
164 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
165 --x, --y;
166 if (oldx - x > SNAKE_LIMIT)
167 big_snake = 1;
168 bd[d] = x;
169 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
170 {
171 part->xmid = x;
172 part->ymid = y;
173 part->lo_minimal = part->hi_minimal = 1;
174 return 2 * c;
175 }
176 }
177
178 if (minimal)
179 continue;
180
181 /* Heuristic: check occasionally for a diagonal that has made
182 lots of progress compared with the edit distance.
183 If we have any such, find the one that has made the most
184 progress and return it as if it had succeeded.
185
186 With this heuristic, for files with a constant small density
187 of changes, the algorithm is linear in the file size. */
188
189 if (c > 200 && big_snake && heuristic)
190 {
191 int best;
192
193 best = 0;
194 for (d = fmax; d >= fmin; d -= 2)
195 {
196 int dd = d - fmid;
197 int x = fd[d];
198 int y = x - d;
199 int v = (x - xoff) * 2 - dd;
200 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
201 {
202 if (v > best
203 && xoff + SNAKE_LIMIT <= x && x < xlim
204 && yoff + SNAKE_LIMIT <= y && y < ylim)
205 {
206 /* We have a good enough best diagonal;
207 now insist that it end with a significant snake. */
208 int k;
209
210 for (k = 1; xv[x - k] == yv[y - k]; k++)
211 if (k == SNAKE_LIMIT)
212 {
213 best = v;
214 part->xmid = x;
215 part->ymid = y;
216 break;
217 }
218 }
219 }
220 }
221 if (best > 0)
222 {
223 part->lo_minimal = 1;
224 part->hi_minimal = 0;
225 return 2 * c - 1;
226 }
227
228 best = 0;
229 for (d = bmax; d >= bmin; d -= 2)
230 {
231 int dd = d - bmid;
232 int x = bd[d];
233 int y = x - d;
234 int v = (xlim - x) * 2 + dd;
235 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
236 {
237 if (v > best
238 && xoff < x && x <= xlim - SNAKE_LIMIT
239 && yoff < y && y <= ylim - SNAKE_LIMIT)
240 {
241 /* We have a good enough best diagonal;
242 now insist that it end with a significant snake. */
243 int k;
244
245 for (k = 0; xv[x + k] == yv[y + k]; k++)
246 if (k == SNAKE_LIMIT - 1)
247 {
248 best = v;
249 part->xmid = x;
250 part->ymid = y;
251 break;
252 }
253 }
254 }
255 }
256 if (best > 0)
257 {
258 part->lo_minimal = 0;
259 part->hi_minimal = 1;
260 return 2 * c - 1;
261 }
262 }
263
264 /* Heuristic: if we've gone well beyond the call of duty,
265 give up and report halfway between our best results so far. */
266 if (c >= too_expensive)
267 {
268 int fxybest, fxbest;
269 int bxybest, bxbest;
270
271 fxbest = bxbest = 0; /* Pacify `gcc -Wall'. */
272
273 /* Find forward diagonal that maximizes X + Y. */
274 fxybest = -1;
275 for (d = fmax; d >= fmin; d -= 2)
276 {
277 int x = min (fd[d], xlim);
278 int y = x - d;
279 if (ylim < y)
280 x = ylim + d, y = ylim;
281 if (fxybest < x + y)
282 {
283 fxybest = x + y;
284 fxbest = x;
285 }
286 }
287
288 /* Find backward diagonal that minimizes X + Y. */
289 bxybest = INT_MAX;
290 for (d = bmax; d >= bmin; d -= 2)
291 {
292 int x = max (xoff, bd[d]);
293 int y = x - d;
294 if (y < yoff)
295 x = yoff + d, y = yoff;
296 if (x + y < bxybest)
297 {
298 bxybest = x + y;
299 bxbest = x;
300 }
301 }
302
303 /* Use the better of the two diagonals. */
304 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
305 {
306 part->xmid = fxbest;
307 part->ymid = fxybest - fxbest;
308 part->lo_minimal = 1;
309 part->hi_minimal = 0;
310 }
311 else
312 {
313 part->xmid = bxbest;
314 part->ymid = bxybest - bxbest;
315 part->lo_minimal = 0;
316 part->hi_minimal = 1;
317 }
318 return 2 * c - 1;
319 }
320 }
321 }
322
323 /* Compare in detail contiguous subsequences of the two files
324 which are known, as a whole, to match each other.
325
326 The results are recorded in the vectors files[N].changed_flag, by
327 storing a 1 in the element for each line that is an insertion or deletion.
328
329 The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
330
331 Note that XLIM, YLIM are exclusive bounds.
332 All line numbers are origin-0 and discarded lines are not counted.
333
334 If MINIMAL is nonzero, find a minimal difference no matter how
335 expensive it is. */
336
337 static void
compareseq(xoff,xlim,yoff,ylim,minimal)338 compareseq (xoff, xlim, yoff, ylim, minimal)
339 int xoff, xlim, yoff, ylim, minimal;
340 {
341 int * const xv = xvec; /* Help the compiler. */
342 int * const yv = yvec;
343
344 /* Slide down the bottom initial diagonal. */
345 while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
346 ++xoff, ++yoff;
347 /* Slide up the top initial diagonal. */
348 while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
349 --xlim, --ylim;
350
351 /* Handle simple cases. */
352 if (xoff == xlim)
353 while (yoff < ylim)
354 files[1].changed_flag[files[1].realindexes[yoff++]] = 1;
355 else if (yoff == ylim)
356 while (xoff < xlim)
357 files[0].changed_flag[files[0].realindexes[xoff++]] = 1;
358 else
359 {
360 int c;
361 struct partition part;
362
363 /* Find a point of correspondence in the middle of the files. */
364
365 c = diag (xoff, xlim, yoff, ylim, minimal, &part);
366
367 if (c == 1)
368 {
369 /* This should be impossible, because it implies that
370 one of the two subsequences is empty,
371 and that case was handled above without calling `diag'.
372 Let's verify that this is true. */
373 abort ();
374 #if 0
375 /* The two subsequences differ by a single insert or delete;
376 record it and we are done. */
377 if (part.xmid - part.ymid < xoff - yoff)
378 files[1].changed_flag[files[1].realindexes[part.ymid - 1]] = 1;
379 else
380 files[0].changed_flag[files[0].realindexes[part.xmid]] = 1;
381 #endif
382 }
383 else
384 {
385 /* Use the partitions to split this problem into subproblems. */
386 compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
387 compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
388 }
389 }
390 }
391
392 /* Discard lines from one file that have no matches in the other file.
393
394 A line which is discarded will not be considered by the actual
395 comparison algorithm; it will be as if that line were not in the file.
396 The file's `realindexes' table maps virtual line numbers
397 (which don't count the discarded lines) into real line numbers;
398 this is how the actual comparison algorithm produces results
399 that are comprehensible when the discarded lines are counted.
400
401 When we discard a line, we also mark it as a deletion or insertion
402 so that it will be printed in the output. */
403
404 static void
discard_confusing_lines(filevec)405 discard_confusing_lines (filevec)
406 struct file_data filevec[];
407 {
408 unsigned int f, i;
409 char *discarded[2];
410 int *equiv_count[2];
411 int *p;
412
413 /* Allocate our results. */
414 p = (int *) xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
415 * (2 * sizeof (int)));
416 for (f = 0; f < 2; f++)
417 {
418 filevec[f].undiscarded = p; p += filevec[f].buffered_lines;
419 filevec[f].realindexes = p; p += filevec[f].buffered_lines;
420 }
421
422 /* Set up equiv_count[F][I] as the number of lines in file F
423 that fall in equivalence class I. */
424
425 p = (int *) xmalloc (filevec[0].equiv_max * (2 * sizeof (int)));
426 equiv_count[0] = p;
427 equiv_count[1] = p + filevec[0].equiv_max;
428 bzero (p, filevec[0].equiv_max * (2 * sizeof (int)));
429
430 for (i = 0; i < filevec[0].buffered_lines; ++i)
431 ++equiv_count[0][filevec[0].equivs[i]];
432 for (i = 0; i < filevec[1].buffered_lines; ++i)
433 ++equiv_count[1][filevec[1].equivs[i]];
434
435 /* Set up tables of which lines are going to be discarded. */
436
437 discarded[0] = xmalloc (sizeof (char)
438 * (filevec[0].buffered_lines
439 + filevec[1].buffered_lines));
440 discarded[1] = discarded[0] + filevec[0].buffered_lines;
441 bzero (discarded[0], sizeof (char) * (filevec[0].buffered_lines
442 + filevec[1].buffered_lines));
443
444 /* Mark to be discarded each line that matches no line of the other file.
445 If a line matches many lines, mark it as provisionally discardable. */
446
447 for (f = 0; f < 2; f++)
448 {
449 unsigned int end = filevec[f].buffered_lines;
450 char *discards = discarded[f];
451 int *counts = equiv_count[1 - f];
452 int *equivs = filevec[f].equivs;
453 unsigned int many = 5;
454 unsigned int tem = end / 64;
455
456 /* Multiply MANY by approximate square root of number of lines.
457 That is the threshold for provisionally discardable lines. */
458 while ((tem = tem >> 2) > 0)
459 many *= 2;
460
461 for (i = 0; i < end; i++)
462 {
463 int nmatch;
464 if (equivs[i] == 0)
465 continue;
466 nmatch = counts[equivs[i]];
467 if (nmatch == 0)
468 discards[i] = 1;
469 else if (nmatch > many)
470 discards[i] = 2;
471 }
472 }
473
474 /* Don't really discard the provisional lines except when they occur
475 in a run of discardables, with nonprovisionals at the beginning
476 and end. */
477
478 for (f = 0; f < 2; f++)
479 {
480 unsigned int end = filevec[f].buffered_lines;
481 register char *discards = discarded[f];
482
483 for (i = 0; i < end; i++)
484 {
485 /* Cancel provisional discards not in middle of run of discards. */
486 if (discards[i] == 2)
487 discards[i] = 0;
488 else if (discards[i] != 0)
489 {
490 /* We have found a nonprovisional discard. */
491 register int j;
492 unsigned int length;
493 unsigned int provisional = 0;
494
495 /* Find end of this run of discardable lines.
496 Count how many are provisionally discardable. */
497 for (j = i; j < end; j++)
498 {
499 if (discards[j] == 0)
500 break;
501 if (discards[j] == 2)
502 ++provisional;
503 }
504
505 /* Cancel provisional discards at end, and shrink the run. */
506 while (j > i && discards[j - 1] == 2)
507 discards[--j] = 0, --provisional;
508
509 /* Now we have the length of a run of discardable lines
510 whose first and last are not provisional. */
511 length = j - i;
512
513 /* If 1/4 of the lines in the run are provisional,
514 cancel discarding of all provisional lines in the run. */
515 if (provisional * 4 > length)
516 {
517 while (j > i)
518 if (discards[--j] == 2)
519 discards[j] = 0;
520 }
521 else
522 {
523 register unsigned int consec;
524 unsigned int minimum = 1;
525 unsigned int tem = length / 4;
526
527 /* MINIMUM is approximate square root of LENGTH/4.
528 A subrun of two or more provisionals can stand
529 when LENGTH is at least 16.
530 A subrun of 4 or more can stand when LENGTH >= 64. */
531 while ((tem = tem >> 2) > 0)
532 minimum *= 2;
533 minimum++;
534
535 /* Cancel any subrun of MINIMUM or more provisionals
536 within the larger run. */
537 for (j = 0, consec = 0; j < length; j++)
538 if (discards[i + j] != 2)
539 consec = 0;
540 else if (minimum == ++consec)
541 /* Back up to start of subrun, to cancel it all. */
542 j -= consec;
543 else if (minimum < consec)
544 discards[i + j] = 0;
545
546 /* Scan from beginning of run
547 until we find 3 or more nonprovisionals in a row
548 or until the first nonprovisional at least 8 lines in.
549 Until that point, cancel any provisionals. */
550 for (j = 0, consec = 0; j < length; j++)
551 {
552 if (j >= 8 && discards[i + j] == 1)
553 break;
554 if (discards[i + j] == 2)
555 consec = 0, discards[i + j] = 0;
556 else if (discards[i + j] == 0)
557 consec = 0;
558 else
559 consec++;
560 if (consec == 3)
561 break;
562 }
563
564 /* I advances to the last line of the run. */
565 i += length - 1;
566
567 /* Same thing, from end. */
568 for (j = 0, consec = 0; j < length; j++)
569 {
570 if (j >= 8 && discards[i - j] == 1)
571 break;
572 if (discards[i - j] == 2)
573 consec = 0, discards[i - j] = 0;
574 else if (discards[i - j] == 0)
575 consec = 0;
576 else
577 consec++;
578 if (consec == 3)
579 break;
580 }
581 }
582 }
583 }
584 }
585
586 /* Actually discard the lines. */
587 for (f = 0; f < 2; f++)
588 {
589 char *discards = discarded[f];
590 unsigned int end = filevec[f].buffered_lines;
591 unsigned int j = 0;
592 for (i = 0; i < end; ++i)
593 if (no_discards || discards[i] == 0)
594 {
595 filevec[f].undiscarded[j] = filevec[f].equivs[i];
596 filevec[f].realindexes[j++] = i;
597 }
598 else
599 filevec[f].changed_flag[i] = 1;
600 filevec[f].nondiscarded_lines = j;
601 }
602
603 free (discarded[0]);
604 free (equiv_count[0]);
605 }
606
607 /* Adjust inserts/deletes of identical lines to join changes
608 as much as possible.
609
610 We do something when a run of changed lines include a
611 line at one end and have an excluded, identical line at the other.
612 We are free to choose which identical line is included.
613 `compareseq' usually chooses the one at the beginning,
614 but usually it is cleaner to consider the following identical line
615 to be the "change". */
616
617 int inhibit;
618
619 static void
shift_boundaries(filevec)620 shift_boundaries (filevec)
621 struct file_data filevec[];
622 {
623 int f;
624
625 if (inhibit)
626 return;
627
628 for (f = 0; f < 2; f++)
629 {
630 char *changed = filevec[f].changed_flag;
631 char const *other_changed = filevec[1-f].changed_flag;
632 int const *equivs = filevec[f].equivs;
633 int i = 0;
634 int j = 0;
635 int i_end = filevec[f].buffered_lines;
636
637 while (1)
638 {
639 int runlength, start, corresponding;
640
641 /* Scan forwards to find beginning of another run of changes.
642 Also keep track of the corresponding point in the other file. */
643
644 while (i < i_end && changed[i] == 0)
645 {
646 while (other_changed[j++])
647 continue;
648 i++;
649 }
650
651 if (i == i_end)
652 break;
653
654 start = i;
655
656 /* Find the end of this run of changes. */
657
658 while (changed[++i])
659 continue;
660 while (other_changed[j])
661 j++;
662
663 do
664 {
665 /* Record the length of this run of changes, so that
666 we can later determine whether the run has grown. */
667 runlength = i - start;
668
669 /* Move the changed region back, so long as the
670 previous unchanged line matches the last changed one.
671 This merges with previous changed regions. */
672
673 while (start && equivs[start - 1] == equivs[i - 1])
674 {
675 changed[--start] = 1;
676 changed[--i] = 0;
677 while (changed[start - 1])
678 start--;
679 while (other_changed[--j])
680 continue;
681 }
682
683 /* Set CORRESPONDING to the end of the changed run, at the last
684 point where it corresponds to a changed run in the other file.
685 CORRESPONDING == I_END means no such point has been found. */
686 corresponding = other_changed[j - 1] ? i : i_end;
687
688 /* Move the changed region forward, so long as the
689 first changed line matches the following unchanged one.
690 This merges with following changed regions.
691 Do this second, so that if there are no merges,
692 the changed region is moved forward as far as possible. */
693
694 while (i != i_end && equivs[start] == equivs[i])
695 {
696 changed[start++] = 0;
697 changed[i++] = 1;
698 while (changed[i])
699 i++;
700 while (other_changed[++j])
701 corresponding = i;
702 }
703 }
704 while (runlength != i - start);
705
706 /* If possible, move the fully-merged run of changes
707 back to a corresponding run in the other file. */
708
709 while (corresponding < i)
710 {
711 changed[--start] = 1;
712 changed[--i] = 0;
713 while (other_changed[--j])
714 continue;
715 }
716 }
717 }
718 }
719
720 /* Cons an additional entry onto the front of an edit script OLD.
721 LINE0 and LINE1 are the first affected lines in the two files (origin 0).
722 DELETED is the number of lines deleted here from file 0.
723 INSERTED is the number of lines inserted here in file 1.
724
725 If DELETED is 0 then LINE0 is the number of the line before
726 which the insertion was done; vice versa for INSERTED and LINE1. */
727
728 static struct change *
add_change(line0,line1,deleted,inserted,old)729 add_change (line0, line1, deleted, inserted, old)
730 int line0, line1, deleted, inserted;
731 struct change *old;
732 {
733 struct change *new = (struct change *) xmalloc (sizeof (struct change));
734
735 new->line0 = line0;
736 new->line1 = line1;
737 new->inserted = inserted;
738 new->deleted = deleted;
739 new->link = old;
740 return new;
741 }
742
743 /* Scan the tables of which lines are inserted and deleted,
744 producing an edit script in reverse order. */
745
746 static struct change *
build_reverse_script(filevec)747 build_reverse_script (filevec)
748 struct file_data const filevec[];
749 {
750 struct change *script = 0;
751 char *changed0 = filevec[0].changed_flag;
752 char *changed1 = filevec[1].changed_flag;
753 int len0 = filevec[0].buffered_lines;
754 int len1 = filevec[1].buffered_lines;
755
756 /* Note that changedN[len0] does exist, and contains 0. */
757
758 int i0 = 0, i1 = 0;
759
760 while (i0 < len0 || i1 < len1)
761 {
762 if (changed0[i0] || changed1[i1])
763 {
764 int line0 = i0, line1 = i1;
765
766 /* Find # lines changed here in each file. */
767 while (changed0[i0]) ++i0;
768 while (changed1[i1]) ++i1;
769
770 /* Record this change. */
771 script = add_change (line0, line1, i0 - line0, i1 - line1, script);
772 }
773
774 /* We have reached lines in the two files that match each other. */
775 i0++, i1++;
776 }
777
778 return script;
779 }
780
781 /* Scan the tables of which lines are inserted and deleted,
782 producing an edit script in forward order. */
783
784 static struct change *
build_script(filevec)785 build_script (filevec)
786 struct file_data const filevec[];
787 {
788 struct change *script = 0;
789 char *changed0 = filevec[0].changed_flag;
790 char *changed1 = filevec[1].changed_flag;
791 int i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
792
793 /* Note that changedN[-1] does exist, and contains 0. */
794
795 while (i0 >= 0 || i1 >= 0)
796 {
797 if (changed0[i0 - 1] || changed1[i1 - 1])
798 {
799 int line0 = i0, line1 = i1;
800
801 /* Find # lines changed here in each file. */
802 while (changed0[i0 - 1]) --i0;
803 while (changed1[i1 - 1]) --i1;
804
805 /* Record this change. */
806 script = add_change (i0, i1, line0 - i0, line1 - i1, script);
807 }
808
809 /* We have reached lines in the two files that match each other. */
810 i0--, i1--;
811 }
812
813 return script;
814 }
815
816 /* If CHANGES, briefly report that two files differed. */
817 static void
briefly_report(changes,filevec)818 briefly_report (changes, filevec)
819 int changes;
820 struct file_data const filevec[];
821 {
822 if (changes)
823 message (no_details_flag ? "Files %s and %s differ\n"
824 : "Binary files %s and %s differ\n",
825 filevec[0].name, filevec[1].name);
826 }
827
828 /* Report the differences of two files. DEPTH is the current directory
829 depth. */
830 int
diff_2_files(filevec,depth)831 diff_2_files (filevec, depth)
832 struct file_data filevec[];
833 int depth;
834 {
835 int diags;
836 int i;
837 struct change *e, *p;
838 struct change *script;
839 int changes;
840
841
842 /* If we have detected that either file is binary,
843 compare the two files as binary. This can happen
844 only when the first chunk is read.
845 Also, --brief without any --ignore-* options means
846 we can speed things up by treating the files as binary. */
847
848 if (read_files (filevec, no_details_flag & ~ignore_some_changes))
849 {
850 /* Files with different lengths must be different. */
851 if (filevec[0].stat.st_size != filevec[1].stat.st_size
852 && (filevec[0].desc < 0 || S_ISREG (filevec[0].stat.st_mode))
853 && (filevec[1].desc < 0 || S_ISREG (filevec[1].stat.st_mode)))
854 changes = 1;
855
856 /* Standard input equals itself. */
857 else if (filevec[0].desc == filevec[1].desc)
858 changes = 0;
859
860 else
861 /* Scan both files, a buffer at a time, looking for a difference. */
862 {
863 /* Allocate same-sized buffers for both files. */
864 size_t buffer_size = buffer_lcm (STAT_BLOCKSIZE (filevec[0].stat),
865 STAT_BLOCKSIZE (filevec[1].stat));
866 for (i = 0; i < 2; i++)
867 filevec[i].buffer = xrealloc (filevec[i].buffer, buffer_size);
868
869 for (;; filevec[0].buffered_chars = filevec[1].buffered_chars = 0)
870 {
871 /* Read a buffer's worth from both files. */
872 for (i = 0; i < 2; i++)
873 if (0 <= filevec[i].desc)
874 while (filevec[i].buffered_chars != buffer_size)
875 {
876 int r = read (filevec[i].desc,
877 filevec[i].buffer
878 + filevec[i].buffered_chars,
879 buffer_size - filevec[i].buffered_chars);
880 if (r == 0)
881 break;
882 if (r < 0)
883 pfatal_with_name (filevec[i].name);
884 filevec[i].buffered_chars += r;
885 }
886
887 /* If the buffers differ, the files differ. */
888 if (filevec[0].buffered_chars != filevec[1].buffered_chars
889 || (filevec[0].buffered_chars != 0
890 && memcmp (filevec[0].buffer,
891 filevec[1].buffer,
892 filevec[0].buffered_chars) != 0))
893 {
894 changes = 1;
895 break;
896 }
897
898 /* If we reach end of file, the files are the same. */
899 if (filevec[0].buffered_chars != buffer_size)
900 {
901 changes = 0;
902 break;
903 }
904 }
905 }
906
907 briefly_report (changes, filevec);
908 }
909 else
910 {
911 /* Allocate vectors for the results of comparison:
912 a flag for each line of each file, saying whether that line
913 is an insertion or deletion.
914 Allocate an extra element, always zero, at each end of each vector. */
915
916 size_t s = filevec[0].buffered_lines + filevec[1].buffered_lines + 4;
917 filevec[0].changed_flag = xmalloc (s);
918 bzero (filevec[0].changed_flag, s);
919 filevec[0].changed_flag++;
920 filevec[1].changed_flag = filevec[0].changed_flag
921 + filevec[0].buffered_lines + 2;
922
923 /* Some lines are obviously insertions or deletions
924 because they don't match anything. Detect them now, and
925 avoid even thinking about them in the main comparison algorithm. */
926
927 discard_confusing_lines (filevec);
928
929 /* Now do the main comparison algorithm, considering just the
930 undiscarded lines. */
931
932 xvec = filevec[0].undiscarded;
933 yvec = filevec[1].undiscarded;
934 diags = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3;
935 fdiag = (int *) xmalloc (diags * (2 * sizeof (int)));
936 bdiag = fdiag + diags;
937 fdiag += filevec[1].nondiscarded_lines + 1;
938 bdiag += filevec[1].nondiscarded_lines + 1;
939
940 /* Set TOO_EXPENSIVE to be approximate square root of input size,
941 bounded below by 256. */
942 too_expensive = 1;
943 for (i = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines;
944 i != 0; i >>= 2)
945 too_expensive <<= 1;
946 too_expensive = max (256, too_expensive);
947
948 files[0] = filevec[0];
949 files[1] = filevec[1];
950
951 compareseq (0, filevec[0].nondiscarded_lines,
952 0, filevec[1].nondiscarded_lines, no_discards);
953
954 free (fdiag - (filevec[1].nondiscarded_lines + 1));
955
956 /* Modify the results slightly to make them prettier
957 in cases where that can validly be done. */
958
959 shift_boundaries (filevec);
960
961 /* Get the results of comparison in the form of a chain
962 of `struct change's -- an edit script. */
963
964 if (output_style == OUTPUT_ED)
965 script = build_reverse_script (filevec);
966 else
967 script = build_script (filevec);
968
969 /* Set CHANGES if we had any diffs.
970 If some changes are ignored, we must scan the script to decide. */
971 if (ignore_blank_lines_flag || ignore_regexp_list)
972 {
973 struct change *next = script;
974 changes = 0;
975
976 while (next && changes == 0)
977 {
978 struct change *this, *end;
979 int first0, last0, first1, last1, deletes, inserts;
980
981 /* Find a set of changes that belong together. */
982 this = next;
983 end = find_change (next);
984
985 /* Disconnect them from the rest of the changes, making them
986 a hunk, and remember the rest for next iteration. */
987 next = end->link;
988 end->link = 0;
989
990 /* Determine whether this hunk is really a difference. */
991 analyze_hunk (this, &first0, &last0, &first1, &last1,
992 &deletes, &inserts);
993
994 /* Reconnect the script so it will all be freed properly. */
995 end->link = next;
996
997 if (deletes || inserts)
998 changes = 1;
999 }
1000 }
1001 else
1002 changes = (script != 0);
1003
1004 if (no_details_flag)
1005 briefly_report (changes, filevec);
1006 else
1007 {
1008 if (changes || ! no_diff_means_no_output)
1009 {
1010 /* Record info for starting up output,
1011 to be used if and when we have some output to print. */
1012 setup_output (files[0].name, files[1].name, depth);
1013
1014 switch (output_style)
1015 {
1016 case OUTPUT_CONTEXT:
1017 print_context_script (script, 0);
1018 break;
1019
1020 case OUTPUT_UNIFIED:
1021 print_context_script (script, 1);
1022 break;
1023
1024 case OUTPUT_ED:
1025 print_ed_script (script);
1026 break;
1027
1028 case OUTPUT_FORWARD_ED:
1029 pr_forward_ed_script (script);
1030 break;
1031
1032 case OUTPUT_RCS:
1033 print_rcs_script (script);
1034 break;
1035
1036 case OUTPUT_NORMAL:
1037 print_normal_script (script);
1038 break;
1039
1040 case OUTPUT_IFDEF:
1041 print_ifdef_script (script);
1042 break;
1043
1044 case OUTPUT_SDIFF:
1045 print_sdiff_script (script);
1046 }
1047
1048 finish_output ();
1049 }
1050 }
1051
1052 free (filevec[0].undiscarded);
1053
1054 free (filevec[0].changed_flag - 1);
1055
1056 for (i = 1; i >= 0; --i)
1057 free (filevec[i].equivs);
1058
1059 for (i = 0; i < 2; ++i)
1060 free (filevec[i].linbuf + filevec[i].linbuf_base);
1061
1062 for (e = script; e; e = p)
1063 {
1064 p = e->link;
1065 free (e);
1066 }
1067
1068 if (! ROBUST_OUTPUT_STYLE (output_style))
1069 for (i = 0; i < 2; ++i)
1070 if (filevec[i].missing_newline)
1071 {
1072 diff_error ("No newline at end of file %s", filevec[i].name, "");
1073 changes = 2;
1074 }
1075 }
1076
1077 if (filevec[0].buffer != filevec[1].buffer)
1078 free (filevec[0].buffer);
1079 free (filevec[1].buffer);
1080
1081 return changes;
1082 }
1083