xref: /dragonfly/contrib/cvs-1.12/diff/analyze.c (revision 6e5c5008)
1 /* Analyze file differences for GNU DIFF.
2    Copyright (C) 1988, 1989, 1992, 1993, 1997 Free Software Foundation, Inc.
3 
4 This file is part of GNU DIFF.
5 
6 GNU DIFF is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 GNU DIFF is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 */
17 
18 /* The basic algorithm is described in:
19    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
20    Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
21    see especially section 4.2, which describes the variation used below.
22    Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
23    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
24    at the price of producing suboptimal output for large inputs with
25    many differences.
26 
27    The basic algorithm was independently discovered as described in:
28    "Algorithms for Approximate String Matching", E. Ukkonen,
29    Information and Control Vol. 64, 1985, pp. 100-118.  */
30 
31 #include "diff.h"
32 #include "cmpbuf.h"
33 
34 extern int no_discards;
35 
36 static int *xvec, *yvec;	/* Vectors being compared. */
37 static int *fdiag;		/* Vector, indexed by diagonal, containing
38 				   1 + the X coordinate of the point furthest
39 				   along the given diagonal in the forward
40 				   search of the edit matrix. */
41 static int *bdiag;		/* Vector, indexed by diagonal, containing
42 				   the X coordinate of the point furthest
43 				   along the given diagonal in the backward
44 				   search of the edit matrix. */
45 static int too_expensive;	/* Edit scripts longer than this are too
46 				   expensive to compute.  */
47 
48 #define SNAKE_LIMIT 20	/* Snakes bigger than this are considered `big'.  */
49 
50 struct partition
51 {
52   int xmid, ymid;	/* Midpoints of this partition.  */
53   int lo_minimal;	/* Nonzero if low half will be analyzed minimally.  */
54   int hi_minimal;	/* Likewise for high half.  */
55 };
56 
57 static int diag PARAMS((int, int, int, int, int, struct partition *));
58 static struct change *add_change PARAMS((int, int, int, int, struct change *));
59 static struct change *build_reverse_script PARAMS((struct file_data const[]));
60 static struct change *build_script PARAMS((struct file_data const[]));
61 static void briefly_report PARAMS((int, struct file_data const[]));
62 static void compareseq PARAMS((int, int, int, int, int));
63 static void discard_confusing_lines PARAMS((struct file_data[]));
64 static void shift_boundaries PARAMS((struct file_data[]));
65 
66 /* Find the midpoint of the shortest edit script for a specified
67    portion of the two files.
68 
69    Scan from the beginnings of the files, and simultaneously from the ends,
70    doing a breadth-first search through the space of edit-sequence.
71    When the two searches meet, we have found the midpoint of the shortest
72    edit sequence.
73 
74    If MINIMAL is nonzero, find the minimal edit script regardless
75    of expense.  Otherwise, if the search is too expensive, use
76    heuristics to stop the search and report a suboptimal answer.
77 
78    Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal number
79    XMID - YMID equals the number of inserted lines minus the number
80    of deleted lines (counting only lines before the midpoint).
81    Return the approximate edit cost; this is the total number of
82    lines inserted or deleted (counting only lines before the midpoint),
83    unless a heuristic is used to terminate the search prematurely.
84 
85    Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script for the
86    left half of the partition is known; similarly for PART->RIGHT_MINIMAL.
87 
88    This function assumes that the first lines of the specified portions
89    of the two files do not match, and likewise that the last lines do not
90    match.  The caller must trim matching lines from the beginning and end
91    of the portions it is going to specify.
92 
93    If we return the "wrong" partitions,
94    the worst this can do is cause suboptimal diff output.
95    It cannot cause incorrect diff output.  */
96 
97 static int
98 diag (xoff, xlim, yoff, ylim, minimal, part)
99      int xoff, xlim, yoff, ylim, minimal;
100      struct partition *part;
101 {
102   int *const fd = fdiag;	/* Give the compiler a chance. */
103   int *const bd = bdiag;	/* Additional help for the compiler. */
104   int const *const xv = xvec;	/* Still more help for the compiler. */
105   int const *const yv = yvec;	/* And more and more . . . */
106   int const dmin = xoff - ylim;	/* Minimum valid diagonal. */
107   int const dmax = xlim - yoff;	/* Maximum valid diagonal. */
108   int const fmid = xoff - yoff;	/* Center diagonal of top-down search. */
109   int const bmid = xlim - ylim;	/* Center diagonal of bottom-up search. */
110   int fmin = fmid, fmax = fmid;	/* Limits of top-down search. */
111   int bmin = bmid, bmax = bmid;	/* Limits of bottom-up search. */
112   int c;			/* Cost. */
113   int odd = (fmid - bmid) & 1;	/* True if southeast corner is on an odd
114 				   diagonal with respect to the northwest. */
115 
116   fd[fmid] = xoff;
117   bd[bmid] = xlim;
118 
119   for (c = 1;; ++c)
120     {
121       int d;			/* Active diagonal. */
122       int big_snake = 0;
123 
124       /* Extend the top-down search by an edit step in each diagonal. */
125       fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
126       fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
127       for (d = fmax; d >= fmin; d -= 2)
128 	{
129 	  int x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
130 
131 	  if (tlo >= thi)
132 	    x = tlo + 1;
133 	  else
134 	    x = thi;
135 	  oldx = x;
136 	  y = x - d;
137 	  while (x < xlim && y < ylim && xv[x] == yv[y])
138 	    ++x, ++y;
139 	  if (x - oldx > SNAKE_LIMIT)
140 	    big_snake = 1;
141 	  fd[d] = x;
142 	  if (odd && bmin <= d && d <= bmax && bd[d] <= x)
143 	    {
144 	      part->xmid = x;
145 	      part->ymid = y;
146 	      part->lo_minimal = part->hi_minimal = 1;
147 	      return 2 * c - 1;
148 	    }
149 	}
150 
151       /* Similarly extend the bottom-up search.  */
152       bmin > dmin ? bd[--bmin - 1] = INT_MAX : ++bmin;
153       bmax < dmax ? bd[++bmax + 1] = INT_MAX : --bmax;
154       for (d = bmax; d >= bmin; d -= 2)
155 	{
156 	  int x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
157 
158 	  if (tlo < thi)
159 	    x = tlo;
160 	  else
161 	    x = thi - 1;
162 	  oldx = x;
163 	  y = x - d;
164 	  while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
165 	    --x, --y;
166 	  if (oldx - x > SNAKE_LIMIT)
167 	    big_snake = 1;
168 	  bd[d] = x;
169 	  if (!odd && fmin <= d && d <= fmax && x <= fd[d])
170 	    {
171 	      part->xmid = x;
172 	      part->ymid = y;
173 	      part->lo_minimal = part->hi_minimal = 1;
174 	      return 2 * c;
175 	    }
176 	}
177 
178       if (minimal)
179 	continue;
180 
181       /* Heuristic: check occasionally for a diagonal that has made
182 	 lots of progress compared with the edit distance.
183 	 If we have any such, find the one that has made the most
184 	 progress and return it as if it had succeeded.
185 
186 	 With this heuristic, for files with a constant small density
187 	 of changes, the algorithm is linear in the file size.  */
188 
189       if (c > 200 && big_snake && heuristic)
190 	{
191 	  int best;
192 
193 	  best = 0;
194 	  for (d = fmax; d >= fmin; d -= 2)
195 	    {
196 	      int dd = d - fmid;
197 	      int x = fd[d];
198 	      int y = x - d;
199 	      int v = (x - xoff) * 2 - dd;
200 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
201 		{
202 		  if (v > best
203 		      && xoff + SNAKE_LIMIT <= x && x < xlim
204 		      && yoff + SNAKE_LIMIT <= y && y < ylim)
205 		    {
206 		      /* We have a good enough best diagonal;
207 			 now insist that it end with a significant snake.  */
208 		      int k;
209 
210 		      for (k = 1; xv[x - k] == yv[y - k]; k++)
211 			if (k == SNAKE_LIMIT)
212 			  {
213 			    best = v;
214 			    part->xmid = x;
215 			    part->ymid = y;
216 			    break;
217 			  }
218 		    }
219 		}
220 	    }
221 	  if (best > 0)
222 	    {
223 	      part->lo_minimal = 1;
224 	      part->hi_minimal = 0;
225 	      return 2 * c - 1;
226 	    }
227 
228 	  best = 0;
229 	  for (d = bmax; d >= bmin; d -= 2)
230 	    {
231 	      int dd = d - bmid;
232 	      int x = bd[d];
233 	      int y = x - d;
234 	      int v = (xlim - x) * 2 + dd;
235 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
236 		{
237 		  if (v > best
238 		      && xoff < x && x <= xlim - SNAKE_LIMIT
239 		      && yoff < y && y <= ylim - SNAKE_LIMIT)
240 		    {
241 		      /* We have a good enough best diagonal;
242 			 now insist that it end with a significant snake.  */
243 		      int k;
244 
245 		      for (k = 0; xv[x + k] == yv[y + k]; k++)
246 			if (k == SNAKE_LIMIT - 1)
247 			  {
248 			    best = v;
249 			    part->xmid = x;
250 			    part->ymid = y;
251 			    break;
252 			  }
253 		    }
254 		}
255 	    }
256 	  if (best > 0)
257 	    {
258 	      part->lo_minimal = 0;
259 	      part->hi_minimal = 1;
260 	      return 2 * c - 1;
261 	    }
262 	}
263 
264       /* Heuristic: if we've gone well beyond the call of duty,
265 	 give up and report halfway between our best results so far.  */
266       if (c >= too_expensive)
267 	{
268 	  int fxybest, fxbest;
269 	  int bxybest, bxbest;
270 
271 	  fxbest = bxbest = 0;  /* Pacify `gcc -Wall'.  */
272 
273 	  /* Find forward diagonal that maximizes X + Y.  */
274 	  fxybest = -1;
275 	  for (d = fmax; d >= fmin; d -= 2)
276 	    {
277 	      int x = min (fd[d], xlim);
278 	      int y = x - d;
279 	      if (ylim < y)
280 		x = ylim + d, y = ylim;
281 	      if (fxybest < x + y)
282 		{
283 		  fxybest = x + y;
284 		  fxbest = x;
285 		}
286 	    }
287 
288 	  /* Find backward diagonal that minimizes X + Y.  */
289 	  bxybest = INT_MAX;
290 	  for (d = bmax; d >= bmin; d -= 2)
291 	    {
292 	      int x = max (xoff, bd[d]);
293 	      int y = x - d;
294 	      if (y < yoff)
295 		x = yoff + d, y = yoff;
296 	      if (x + y < bxybest)
297 		{
298 		  bxybest = x + y;
299 		  bxbest = x;
300 		}
301 	    }
302 
303 	  /* Use the better of the two diagonals.  */
304 	  if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
305 	    {
306 	      part->xmid = fxbest;
307 	      part->ymid = fxybest - fxbest;
308 	      part->lo_minimal = 1;
309 	      part->hi_minimal = 0;
310 	    }
311 	  else
312 	    {
313 	      part->xmid = bxbest;
314 	      part->ymid = bxybest - bxbest;
315 	      part->lo_minimal = 0;
316 	      part->hi_minimal = 1;
317 	    }
318 	  return 2 * c - 1;
319 	}
320     }
321 }
322 
323 /* Compare in detail contiguous subsequences of the two files
324    which are known, as a whole, to match each other.
325 
326    The results are recorded in the vectors files[N].changed_flag, by
327    storing a 1 in the element for each line that is an insertion or deletion.
328 
329    The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
330 
331    Note that XLIM, YLIM are exclusive bounds.
332    All line numbers are origin-0 and discarded lines are not counted.
333 
334    If MINIMAL is nonzero, find a minimal difference no matter how
335    expensive it is.  */
336 
337 static void
338 compareseq (xoff, xlim, yoff, ylim, minimal)
339      int xoff, xlim, yoff, ylim, minimal;
340 {
341   int * const xv = xvec; /* Help the compiler.  */
342   int * const yv = yvec;
343 
344   /* Slide down the bottom initial diagonal. */
345   while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
346     ++xoff, ++yoff;
347   /* Slide up the top initial diagonal. */
348   while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
349     --xlim, --ylim;
350 
351   /* Handle simple cases. */
352   if (xoff == xlim)
353     while (yoff < ylim)
354       files[1].changed_flag[files[1].realindexes[yoff++]] = 1;
355   else if (yoff == ylim)
356     while (xoff < xlim)
357       files[0].changed_flag[files[0].realindexes[xoff++]] = 1;
358   else
359     {
360       int c;
361       struct partition part;
362 
363       /* Find a point of correspondence in the middle of the files.  */
364 
365       c = diag (xoff, xlim, yoff, ylim, minimal, &part);
366 
367       if (c == 1)
368 	{
369 	  /* This should be impossible, because it implies that
370 	     one of the two subsequences is empty,
371 	     and that case was handled above without calling `diag'.
372 	     Let's verify that this is true.  */
373 	  abort ();
374 #if 0
375 	  /* The two subsequences differ by a single insert or delete;
376 	     record it and we are done.  */
377 	  if (part.xmid - part.ymid < xoff - yoff)
378 	    files[1].changed_flag[files[1].realindexes[part.ymid - 1]] = 1;
379 	  else
380 	    files[0].changed_flag[files[0].realindexes[part.xmid]] = 1;
381 #endif
382 	}
383       else
384 	{
385 	  /* Use the partitions to split this problem into subproblems.  */
386 	  compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
387 	  compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
388 	}
389     }
390 }
391 
392 /* Discard lines from one file that have no matches in the other file.
393 
394    A line which is discarded will not be considered by the actual
395    comparison algorithm; it will be as if that line were not in the file.
396    The file's `realindexes' table maps virtual line numbers
397    (which don't count the discarded lines) into real line numbers;
398    this is how the actual comparison algorithm produces results
399    that are comprehensible when the discarded lines are counted.
400 
401    When we discard a line, we also mark it as a deletion or insertion
402    so that it will be printed in the output.  */
403 
404 static void
405 discard_confusing_lines (filevec)
406      struct file_data filevec[];
407 {
408   unsigned int f, i;
409   char *discarded[2];
410   int *equiv_count[2];
411   int *p;
412 
413   /* Allocate our results.  */
414   p = (int *) xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
415 		       * (2 * sizeof (int)));
416   for (f = 0; f < 2; f++)
417     {
418       filevec[f].undiscarded = p;  p += filevec[f].buffered_lines;
419       filevec[f].realindexes = p;  p += filevec[f].buffered_lines;
420     }
421 
422   /* Set up equiv_count[F][I] as the number of lines in file F
423      that fall in equivalence class I.  */
424 
425   p = (int *) xmalloc (filevec[0].equiv_max * (2 * sizeof (int)));
426   equiv_count[0] = p;
427   equiv_count[1] = p + filevec[0].equiv_max;
428   bzero (p, filevec[0].equiv_max * (2 * sizeof (int)));
429 
430   for (i = 0; i < filevec[0].buffered_lines; ++i)
431     ++equiv_count[0][filevec[0].equivs[i]];
432   for (i = 0; i < filevec[1].buffered_lines; ++i)
433     ++equiv_count[1][filevec[1].equivs[i]];
434 
435   /* Set up tables of which lines are going to be discarded.  */
436 
437   discarded[0] = xmalloc (sizeof (char)
438 			  * (filevec[0].buffered_lines
439 			     + filevec[1].buffered_lines));
440   discarded[1] = discarded[0] + filevec[0].buffered_lines;
441   bzero (discarded[0], sizeof (char) * (filevec[0].buffered_lines
442 					+ filevec[1].buffered_lines));
443 
444   /* Mark to be discarded each line that matches no line of the other file.
445      If a line matches many lines, mark it as provisionally discardable.  */
446 
447   for (f = 0; f < 2; f++)
448     {
449       unsigned int end = filevec[f].buffered_lines;
450       char *discards = discarded[f];
451       int *counts = equiv_count[1 - f];
452       int *equivs = filevec[f].equivs;
453       unsigned int many = 5;
454       unsigned int tem = end / 64;
455 
456       /* Multiply MANY by approximate square root of number of lines.
457 	 That is the threshold for provisionally discardable lines.  */
458       while ((tem = tem >> 2) > 0)
459 	many *= 2;
460 
461       for (i = 0; i < end; i++)
462 	{
463 	  int nmatch;
464 	  if (equivs[i] == 0)
465 	    continue;
466 	  nmatch = counts[equivs[i]];
467 	  if (nmatch == 0)
468 	    discards[i] = 1;
469 	  else if (nmatch > many)
470 	    discards[i] = 2;
471 	}
472     }
473 
474   /* Don't really discard the provisional lines except when they occur
475      in a run of discardables, with nonprovisionals at the beginning
476      and end.  */
477 
478   for (f = 0; f < 2; f++)
479     {
480       unsigned int end = filevec[f].buffered_lines;
481       register char *discards = discarded[f];
482 
483       for (i = 0; i < end; i++)
484 	{
485 	  /* Cancel provisional discards not in middle of run of discards.  */
486 	  if (discards[i] == 2)
487 	    discards[i] = 0;
488 	  else if (discards[i] != 0)
489 	    {
490 	      /* We have found a nonprovisional discard.  */
491 	      register int j;
492 	      unsigned int length;
493 	      unsigned int provisional = 0;
494 
495 	      /* Find end of this run of discardable lines.
496 		 Count how many are provisionally discardable.  */
497 	      for (j = i; j < end; j++)
498 		{
499 		  if (discards[j] == 0)
500 		    break;
501 		  if (discards[j] == 2)
502 		    ++provisional;
503 		}
504 
505 	      /* Cancel provisional discards at end, and shrink the run.  */
506 	      while (j > i && discards[j - 1] == 2)
507 		discards[--j] = 0, --provisional;
508 
509 	      /* Now we have the length of a run of discardable lines
510 		 whose first and last are not provisional.  */
511 	      length = j - i;
512 
513 	      /* If 1/4 of the lines in the run are provisional,
514 		 cancel discarding of all provisional lines in the run.  */
515 	      if (provisional * 4 > length)
516 		{
517 		  while (j > i)
518 		    if (discards[--j] == 2)
519 		      discards[j] = 0;
520 		}
521 	      else
522 		{
523 		  register unsigned int consec;
524 		  unsigned int minimum = 1;
525 		  unsigned int tem = length / 4;
526 
527 		  /* MINIMUM is approximate square root of LENGTH/4.
528 		     A subrun of two or more provisionals can stand
529 		     when LENGTH is at least 16.
530 		     A subrun of 4 or more can stand when LENGTH >= 64.  */
531 		  while ((tem = tem >> 2) > 0)
532 		    minimum *= 2;
533 		  minimum++;
534 
535 		  /* Cancel any subrun of MINIMUM or more provisionals
536 		     within the larger run.  */
537 		  for (j = 0, consec = 0; j < length; j++)
538 		    if (discards[i + j] != 2)
539 		      consec = 0;
540 		    else if (minimum == ++consec)
541 		      /* Back up to start of subrun, to cancel it all.  */
542 		      j -= consec;
543 		    else if (minimum < consec)
544 		      discards[i + j] = 0;
545 
546 		  /* Scan from beginning of run
547 		     until we find 3 or more nonprovisionals in a row
548 		     or until the first nonprovisional at least 8 lines in.
549 		     Until that point, cancel any provisionals.  */
550 		  for (j = 0, consec = 0; j < length; j++)
551 		    {
552 		      if (j >= 8 && discards[i + j] == 1)
553 			break;
554 		      if (discards[i + j] == 2)
555 			consec = 0, discards[i + j] = 0;
556 		      else if (discards[i + j] == 0)
557 			consec = 0;
558 		      else
559 			consec++;
560 		      if (consec == 3)
561 			break;
562 		    }
563 
564 		  /* I advances to the last line of the run.  */
565 		  i += length - 1;
566 
567 		  /* Same thing, from end.  */
568 		  for (j = 0, consec = 0; j < length; j++)
569 		    {
570 		      if (j >= 8 && discards[i - j] == 1)
571 			break;
572 		      if (discards[i - j] == 2)
573 			consec = 0, discards[i - j] = 0;
574 		      else if (discards[i - j] == 0)
575 			consec = 0;
576 		      else
577 			consec++;
578 		      if (consec == 3)
579 			break;
580 		    }
581 		}
582 	    }
583 	}
584     }
585 
586   /* Actually discard the lines. */
587   for (f = 0; f < 2; f++)
588     {
589       char *discards = discarded[f];
590       unsigned int end = filevec[f].buffered_lines;
591       unsigned int j = 0;
592       for (i = 0; i < end; ++i)
593 	if (no_discards || discards[i] == 0)
594 	  {
595 	    filevec[f].undiscarded[j] = filevec[f].equivs[i];
596 	    filevec[f].realindexes[j++] = i;
597 	  }
598 	else
599 	  filevec[f].changed_flag[i] = 1;
600       filevec[f].nondiscarded_lines = j;
601     }
602 
603   free (discarded[0]);
604   free (equiv_count[0]);
605 }
606 
607 /* Adjust inserts/deletes of identical lines to join changes
608    as much as possible.
609 
610    We do something when a run of changed lines include a
611    line at one end and have an excluded, identical line at the other.
612    We are free to choose which identical line is included.
613    `compareseq' usually chooses the one at the beginning,
614    but usually it is cleaner to consider the following identical line
615    to be the "change".  */
616 
617 int inhibit;
618 
619 static void
620 shift_boundaries (filevec)
621      struct file_data filevec[];
622 {
623   int f;
624 
625   if (inhibit)
626     return;
627 
628   for (f = 0; f < 2; f++)
629     {
630       char *changed = filevec[f].changed_flag;
631       char const *other_changed = filevec[1-f].changed_flag;
632       int const *equivs = filevec[f].equivs;
633       int i = 0;
634       int j = 0;
635       int i_end = filevec[f].buffered_lines;
636 
637       while (1)
638 	{
639 	  int runlength, start, corresponding;
640 
641 	  /* Scan forwards to find beginning of another run of changes.
642 	     Also keep track of the corresponding point in the other file.  */
643 
644 	  while (i < i_end && changed[i] == 0)
645 	    {
646 	      while (other_changed[j++])
647 		continue;
648 	      i++;
649 	    }
650 
651 	  if (i == i_end)
652 	    break;
653 
654 	  start = i;
655 
656 	  /* Find the end of this run of changes.  */
657 
658 	  while (changed[++i])
659 	    continue;
660 	  while (other_changed[j])
661 	    j++;
662 
663 	  do
664 	    {
665 	      /* Record the length of this run of changes, so that
666 		 we can later determine whether the run has grown.  */
667 	      runlength = i - start;
668 
669 	      /* Move the changed region back, so long as the
670 		 previous unchanged line matches the last changed one.
671 		 This merges with previous changed regions.  */
672 
673 	      while (start && equivs[start - 1] == equivs[i - 1])
674 		{
675 		  changed[--start] = 1;
676 		  changed[--i] = 0;
677 		  while (changed[start - 1])
678 		    start--;
679 		  while (other_changed[--j])
680 		    continue;
681 		}
682 
683 	      /* Set CORRESPONDING to the end of the changed run, at the last
684 		 point where it corresponds to a changed run in the other file.
685 		 CORRESPONDING == I_END means no such point has been found.  */
686 	      corresponding = other_changed[j - 1] ? i : i_end;
687 
688 	      /* Move the changed region forward, so long as the
689 		 first changed line matches the following unchanged one.
690 		 This merges with following changed regions.
691 		 Do this second, so that if there are no merges,
692 		 the changed region is moved forward as far as possible.  */
693 
694 	      while (i != i_end && equivs[start] == equivs[i])
695 		{
696 		  changed[start++] = 0;
697 		  changed[i++] = 1;
698 		  while (changed[i])
699 		    i++;
700 		  while (other_changed[++j])
701 		    corresponding = i;
702 		}
703 	    }
704 	  while (runlength != i - start);
705 
706 	  /* If possible, move the fully-merged run of changes
707 	     back to a corresponding run in the other file.  */
708 
709 	  while (corresponding < i)
710 	    {
711 	      changed[--start] = 1;
712 	      changed[--i] = 0;
713 	      while (other_changed[--j])
714 		continue;
715 	    }
716 	}
717     }
718 }
719 
720 /* Cons an additional entry onto the front of an edit script OLD.
721    LINE0 and LINE1 are the first affected lines in the two files (origin 0).
722    DELETED is the number of lines deleted here from file 0.
723    INSERTED is the number of lines inserted here in file 1.
724 
725    If DELETED is 0 then LINE0 is the number of the line before
726    which the insertion was done; vice versa for INSERTED and LINE1.  */
727 
728 static struct change *
729 add_change (line0, line1, deleted, inserted, old)
730      int line0, line1, deleted, inserted;
731      struct change *old;
732 {
733   struct change *new = (struct change *) xmalloc (sizeof (struct change));
734 
735   new->line0 = line0;
736   new->line1 = line1;
737   new->inserted = inserted;
738   new->deleted = deleted;
739   new->link = old;
740   return new;
741 }
742 
743 /* Scan the tables of which lines are inserted and deleted,
744    producing an edit script in reverse order.  */
745 
746 static struct change *
747 build_reverse_script (filevec)
748      struct file_data const filevec[];
749 {
750   struct change *script = 0;
751   char *changed0 = filevec[0].changed_flag;
752   char *changed1 = filevec[1].changed_flag;
753   int len0 = filevec[0].buffered_lines;
754   int len1 = filevec[1].buffered_lines;
755 
756   /* Note that changedN[len0] does exist, and contains 0.  */
757 
758   int i0 = 0, i1 = 0;
759 
760   while (i0 < len0 || i1 < len1)
761     {
762       if (changed0[i0] || changed1[i1])
763 	{
764 	  int line0 = i0, line1 = i1;
765 
766 	  /* Find # lines changed here in each file.  */
767 	  while (changed0[i0]) ++i0;
768 	  while (changed1[i1]) ++i1;
769 
770 	  /* Record this change.  */
771 	  script = add_change (line0, line1, i0 - line0, i1 - line1, script);
772 	}
773 
774       /* We have reached lines in the two files that match each other.  */
775       i0++, i1++;
776     }
777 
778   return script;
779 }
780 
781 /* Scan the tables of which lines are inserted and deleted,
782    producing an edit script in forward order.  */
783 
784 static struct change *
785 build_script (filevec)
786      struct file_data const filevec[];
787 {
788   struct change *script = 0;
789   char *changed0 = filevec[0].changed_flag;
790   char *changed1 = filevec[1].changed_flag;
791   int i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
792 
793   /* Note that changedN[-1] does exist, and contains 0.  */
794 
795   while (i0 >= 0 || i1 >= 0)
796     {
797       if (changed0[i0 - 1] || changed1[i1 - 1])
798 	{
799 	  int line0 = i0, line1 = i1;
800 
801 	  /* Find # lines changed here in each file.  */
802 	  while (changed0[i0 - 1]) --i0;
803 	  while (changed1[i1 - 1]) --i1;
804 
805 	  /* Record this change.  */
806 	  script = add_change (i0, i1, line0 - i0, line1 - i1, script);
807 	}
808 
809       /* We have reached lines in the two files that match each other.  */
810       i0--, i1--;
811     }
812 
813   return script;
814 }
815 
816 /* If CHANGES, briefly report that two files differed.  */
817 static void
818 briefly_report (changes, filevec)
819      int changes;
820      struct file_data const filevec[];
821 {
822   if (changes)
823     message (no_details_flag ? "Files %s and %s differ\n"
824 	     : "Binary files %s and %s differ\n",
825 	     filevec[0].name, filevec[1].name);
826 }
827 
828 /* Report the differences of two files.  DEPTH is the current directory
829    depth. */
830 int
831 diff_2_files (filevec, depth)
832      struct file_data filevec[];
833      int depth;
834 {
835   int diags;
836   int i;
837   struct change *e, *p;
838   struct change *script;
839   int changes;
840 
841 
842   /* If we have detected that either file is binary,
843      compare the two files as binary.  This can happen
844      only when the first chunk is read.
845      Also, --brief without any --ignore-* options means
846      we can speed things up by treating the files as binary.  */
847 
848   if (read_files (filevec, no_details_flag & ~ignore_some_changes))
849     {
850       /* Files with different lengths must be different.  */
851       if (filevec[0].stat.st_size != filevec[1].stat.st_size
852 	  && (filevec[0].desc < 0 || S_ISREG (filevec[0].stat.st_mode))
853 	  && (filevec[1].desc < 0 || S_ISREG (filevec[1].stat.st_mode)))
854 	changes = 1;
855 
856       /* Standard input equals itself.  */
857       else if (filevec[0].desc == filevec[1].desc)
858 	changes = 0;
859 
860       else
861 	/* Scan both files, a buffer at a time, looking for a difference.  */
862 	{
863 	  /* Allocate same-sized buffers for both files.  */
864 	  size_t buffer_size = buffer_lcm (STAT_BLOCKSIZE (filevec[0].stat),
865 					   STAT_BLOCKSIZE (filevec[1].stat));
866 	  for (i = 0; i < 2; i++)
867 	    filevec[i].buffer = xrealloc (filevec[i].buffer, buffer_size);
868 
869 	  for (;;  filevec[0].buffered_chars = filevec[1].buffered_chars = 0)
870 	    {
871 	      /* Read a buffer's worth from both files.  */
872 	      for (i = 0; i < 2; i++)
873 		if (0 <= filevec[i].desc)
874 		  while (filevec[i].buffered_chars != buffer_size)
875 		    {
876 		      int r = read (filevec[i].desc,
877 				    filevec[i].buffer
878 				    + filevec[i].buffered_chars,
879 				    buffer_size - filevec[i].buffered_chars);
880 		      if (r == 0)
881 			break;
882 		      if (r < 0)
883 			pfatal_with_name (filevec[i].name);
884 		      filevec[i].buffered_chars += r;
885 		    }
886 
887 	      /* If the buffers differ, the files differ.  */
888 	      if (filevec[0].buffered_chars != filevec[1].buffered_chars
889 		  || (filevec[0].buffered_chars != 0
890 		      && memcmp (filevec[0].buffer,
891 				 filevec[1].buffer,
892 				 filevec[0].buffered_chars) != 0))
893 		{
894 		  changes = 1;
895 		  break;
896 		}
897 
898 	      /* If we reach end of file, the files are the same.  */
899 	      if (filevec[0].buffered_chars != buffer_size)
900 		{
901 		  changes = 0;
902 		  break;
903 		}
904 	    }
905 	}
906 
907       briefly_report (changes, filevec);
908     }
909   else
910     {
911       /* Allocate vectors for the results of comparison:
912 	 a flag for each line of each file, saying whether that line
913 	 is an insertion or deletion.
914 	 Allocate an extra element, always zero, at each end of each vector.  */
915 
916       size_t s = filevec[0].buffered_lines + filevec[1].buffered_lines + 4;
917       filevec[0].changed_flag = xmalloc (s);
918       bzero (filevec[0].changed_flag, s);
919       filevec[0].changed_flag++;
920       filevec[1].changed_flag = filevec[0].changed_flag
921 				+ filevec[0].buffered_lines + 2;
922 
923       /* Some lines are obviously insertions or deletions
924 	 because they don't match anything.  Detect them now, and
925 	 avoid even thinking about them in the main comparison algorithm.  */
926 
927       discard_confusing_lines (filevec);
928 
929       /* Now do the main comparison algorithm, considering just the
930 	 undiscarded lines.  */
931 
932       xvec = filevec[0].undiscarded;
933       yvec = filevec[1].undiscarded;
934       diags = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3;
935       fdiag = (int *) xmalloc (diags * (2 * sizeof (int)));
936       bdiag = fdiag + diags;
937       fdiag += filevec[1].nondiscarded_lines + 1;
938       bdiag += filevec[1].nondiscarded_lines + 1;
939 
940       /* Set TOO_EXPENSIVE to be approximate square root of input size,
941 	 bounded below by 256.  */
942       too_expensive = 1;
943       for (i = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines;
944 	   i != 0; i >>= 2)
945 	too_expensive <<= 1;
946       too_expensive = max (256, too_expensive);
947 
948       files[0] = filevec[0];
949       files[1] = filevec[1];
950 
951       compareseq (0, filevec[0].nondiscarded_lines,
952 		  0, filevec[1].nondiscarded_lines, no_discards);
953 
954       free (fdiag - (filevec[1].nondiscarded_lines + 1));
955 
956       /* Modify the results slightly to make them prettier
957 	 in cases where that can validly be done.  */
958 
959       shift_boundaries (filevec);
960 
961       /* Get the results of comparison in the form of a chain
962 	 of `struct change's -- an edit script.  */
963 
964       if (output_style == OUTPUT_ED)
965 	script = build_reverse_script (filevec);
966       else
967 	script = build_script (filevec);
968 
969       /* Set CHANGES if we had any diffs.
970 	 If some changes are ignored, we must scan the script to decide.  */
971       if (ignore_blank_lines_flag || ignore_regexp_list)
972 	{
973 	  struct change *next = script;
974 	  changes = 0;
975 
976 	  while (next && changes == 0)
977 	    {
978 	      struct change *this, *end;
979 	      int first0, last0, first1, last1, deletes, inserts;
980 
981 	      /* Find a set of changes that belong together.  */
982 	      this = next;
983 	      end = find_change (next);
984 
985 	      /* Disconnect them from the rest of the changes, making them
986 		 a hunk, and remember the rest for next iteration.  */
987 	      next = end->link;
988 	      end->link = 0;
989 
990 	      /* Determine whether this hunk is really a difference.  */
991 	      analyze_hunk (this, &first0, &last0, &first1, &last1,
992 			    &deletes, &inserts);
993 
994 	      /* Reconnect the script so it will all be freed properly.  */
995 	      end->link = next;
996 
997 	      if (deletes || inserts)
998 		changes = 1;
999 	    }
1000 	}
1001       else
1002 	changes = (script != 0);
1003 
1004       if (no_details_flag)
1005 	briefly_report (changes, filevec);
1006       else
1007 	{
1008 	  if (changes || ! no_diff_means_no_output)
1009 	    {
1010 	      /* Record info for starting up output,
1011 		 to be used if and when we have some output to print.  */
1012 	      setup_output (files[0].name, files[1].name, depth);
1013 
1014 	      switch (output_style)
1015 		{
1016 		case OUTPUT_CONTEXT:
1017 		  print_context_script (script, 0);
1018 		  break;
1019 
1020 		case OUTPUT_UNIFIED:
1021 		  print_context_script (script, 1);
1022 		  break;
1023 
1024 		case OUTPUT_ED:
1025 		  print_ed_script (script);
1026 		  break;
1027 
1028 		case OUTPUT_FORWARD_ED:
1029 		  pr_forward_ed_script (script);
1030 		  break;
1031 
1032 		case OUTPUT_RCS:
1033 		  print_rcs_script (script);
1034 		  break;
1035 
1036 		case OUTPUT_NORMAL:
1037 		  print_normal_script (script);
1038 		  break;
1039 
1040 		case OUTPUT_IFDEF:
1041 		  print_ifdef_script (script);
1042 		  break;
1043 
1044 		case OUTPUT_SDIFF:
1045 		  print_sdiff_script (script);
1046 		}
1047 
1048 	      finish_output ();
1049 	    }
1050 	}
1051 
1052       free (filevec[0].undiscarded);
1053 
1054       free (filevec[0].changed_flag - 1);
1055 
1056       for (i = 1; i >= 0; --i)
1057 	free (filevec[i].equivs);
1058 
1059       for (i = 0; i < 2; ++i)
1060 	free (filevec[i].linbuf + filevec[i].linbuf_base);
1061 
1062       for (e = script; e; e = p)
1063 	{
1064 	  p = e->link;
1065 	  free (e);
1066 	}
1067 
1068       if (! ROBUST_OUTPUT_STYLE (output_style))
1069 	for (i = 0; i < 2; ++i)
1070 	  if (filevec[i].missing_newline)
1071 	    {
1072 	      diff_error ("No newline at end of file %s", filevec[i].name, "");
1073 	      changes = 2;
1074 	    }
1075     }
1076 
1077   if (filevec[0].buffer != filevec[1].buffer)
1078     free (filevec[0].buffer);
1079   free (filevec[1].buffer);
1080 
1081   return changes;
1082 }
1083