xref: /dragonfly/contrib/diffutils/src/analyze.c (revision 0db87cb7)
1 /* Analyze file differences for GNU DIFF.
2 
3    Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006-2007,
4    2009-2013 Free Software Foundation, Inc.
5 
6    This file is part of GNU DIFF.
7 
8    This program is free software: you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation, either version 3 of the License, or
11    (at your option) any later version.
12 
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
20 
21 #include "diff.h"
22 #include <cmpbuf.h>
23 #include <error.h>
24 #include <file-type.h>
25 #include <xalloc.h>
26 
27 /* The core of the Diff algorithm.  */
28 #define ELEMENT lin
29 #define EQUAL(x,y) ((x) == (y))
30 #define OFFSET lin
31 #define EXTRA_CONTEXT_FIELDS /* none */
32 #define NOTE_DELETE(c, xoff) (files[0].changed[files[0].realindexes[xoff]] = 1)
33 #define NOTE_INSERT(c, yoff) (files[1].changed[files[1].realindexes[yoff]] = 1)
34 #define USE_HEURISTIC 1
35 #include <diffseq.h>
36 
37 /* Discard lines from one file that have no matches in the other file.
38 
39    A line which is discarded will not be considered by the actual
40    comparison algorithm; it will be as if that line were not in the file.
41    The file's 'realindexes' table maps virtual line numbers
42    (which don't count the discarded lines) into real line numbers;
43    this is how the actual comparison algorithm produces results
44    that are comprehensible when the discarded lines are counted.
45 
46    When we discard a line, we also mark it as a deletion or insertion
47    so that it will be printed in the output.  */
48 
49 static void
50 discard_confusing_lines (struct file_data filevec[])
51 {
52   int f;
53   lin i;
54   char *discarded[2];
55   lin *equiv_count[2];
56   lin *p;
57 
58   /* Allocate our results.  */
59   p = xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
60 	       * (2 * sizeof *p));
61   for (f = 0; f < 2; f++)
62     {
63       filevec[f].undiscarded = p;  p += filevec[f].buffered_lines;
64       filevec[f].realindexes = p;  p += filevec[f].buffered_lines;
65     }
66 
67   /* Set up equiv_count[F][I] as the number of lines in file F
68      that fall in equivalence class I.  */
69 
70   p = zalloc (filevec[0].equiv_max * (2 * sizeof *p));
71   equiv_count[0] = p;
72   equiv_count[1] = p + filevec[0].equiv_max;
73 
74   for (i = 0; i < filevec[0].buffered_lines; ++i)
75     ++equiv_count[0][filevec[0].equivs[i]];
76   for (i = 0; i < filevec[1].buffered_lines; ++i)
77     ++equiv_count[1][filevec[1].equivs[i]];
78 
79   /* Set up tables of which lines are going to be discarded.  */
80 
81   discarded[0] = zalloc (filevec[0].buffered_lines
82 			 + filevec[1].buffered_lines);
83   discarded[1] = discarded[0] + filevec[0].buffered_lines;
84 
85   /* Mark to be discarded each line that matches no line of the other file.
86      If a line matches many lines, mark it as provisionally discardable.  */
87 
88   for (f = 0; f < 2; f++)
89     {
90       size_t end = filevec[f].buffered_lines;
91       char *discards = discarded[f];
92       lin *counts = equiv_count[1 - f];
93       lin *equivs = filevec[f].equivs;
94       size_t many = 5;
95       size_t tem = end / 64;
96 
97       /* Multiply MANY by approximate square root of number of lines.
98 	 That is the threshold for provisionally discardable lines.  */
99       while ((tem = tem >> 2) > 0)
100 	many *= 2;
101 
102       for (i = 0; i < end; i++)
103 	{
104 	  lin nmatch;
105 	  if (equivs[i] == 0)
106 	    continue;
107 	  nmatch = counts[equivs[i]];
108 	  if (nmatch == 0)
109 	    discards[i] = 1;
110 	  else if (nmatch > many)
111 	    discards[i] = 2;
112 	}
113     }
114 
115   /* Don't really discard the provisional lines except when they occur
116      in a run of discardables, with nonprovisionals at the beginning
117      and end.  */
118 
119   for (f = 0; f < 2; f++)
120     {
121       lin end = filevec[f].buffered_lines;
122       register char *discards = discarded[f];
123 
124       for (i = 0; i < end; i++)
125 	{
126 	  /* Cancel provisional discards not in middle of run of discards.  */
127 	  if (discards[i] == 2)
128 	    discards[i] = 0;
129 	  else if (discards[i] != 0)
130 	    {
131 	      /* We have found a nonprovisional discard.  */
132 	      register lin j;
133 	      lin length;
134 	      lin provisional = 0;
135 
136 	      /* Find end of this run of discardable lines.
137 		 Count how many are provisionally discardable.  */
138 	      for (j = i; j < end; j++)
139 		{
140 		  if (discards[j] == 0)
141 		    break;
142 		  if (discards[j] == 2)
143 		    ++provisional;
144 		}
145 
146 	      /* Cancel provisional discards at end, and shrink the run.  */
147 	      while (j > i && discards[j - 1] == 2)
148 		discards[--j] = 0, --provisional;
149 
150 	      /* Now we have the length of a run of discardable lines
151 		 whose first and last are not provisional.  */
152 	      length = j - i;
153 
154 	      /* If 1/4 of the lines in the run are provisional,
155 		 cancel discarding of all provisional lines in the run.  */
156 	      if (provisional * 4 > length)
157 		{
158 		  while (j > i)
159 		    if (discards[--j] == 2)
160 		      discards[j] = 0;
161 		}
162 	      else
163 		{
164 		  register lin consec;
165 		  lin minimum = 1;
166 		  lin tem = length >> 2;
167 
168 		  /* MINIMUM is approximate square root of LENGTH/4.
169 		     A subrun of two or more provisionals can stand
170 		     when LENGTH is at least 16.
171 		     A subrun of 4 or more can stand when LENGTH >= 64.  */
172 		  while (0 < (tem >>= 2))
173 		    minimum <<= 1;
174 		  minimum++;
175 
176 		  /* Cancel any subrun of MINIMUM or more provisionals
177 		     within the larger run.  */
178 		  for (j = 0, consec = 0; j < length; j++)
179 		    if (discards[i + j] != 2)
180 		      consec = 0;
181 		    else if (minimum == ++consec)
182 		      /* Back up to start of subrun, to cancel it all.  */
183 		      j -= consec;
184 		    else if (minimum < consec)
185 		      discards[i + j] = 0;
186 
187 		  /* Scan from beginning of run
188 		     until we find 3 or more nonprovisionals in a row
189 		     or until the first nonprovisional at least 8 lines in.
190 		     Until that point, cancel any provisionals.  */
191 		  for (j = 0, consec = 0; j < length; j++)
192 		    {
193 		      if (j >= 8 && discards[i + j] == 1)
194 			break;
195 		      if (discards[i + j] == 2)
196 			consec = 0, discards[i + j] = 0;
197 		      else if (discards[i + j] == 0)
198 			consec = 0;
199 		      else
200 			consec++;
201 		      if (consec == 3)
202 			break;
203 		    }
204 
205 		  /* I advances to the last line of the run.  */
206 		  i += length - 1;
207 
208 		  /* Same thing, from end.  */
209 		  for (j = 0, consec = 0; j < length; j++)
210 		    {
211 		      if (j >= 8 && discards[i - j] == 1)
212 			break;
213 		      if (discards[i - j] == 2)
214 			consec = 0, discards[i - j] = 0;
215 		      else if (discards[i - j] == 0)
216 			consec = 0;
217 		      else
218 			consec++;
219 		      if (consec == 3)
220 			break;
221 		    }
222 		}
223 	    }
224 	}
225     }
226 
227   /* Actually discard the lines. */
228   for (f = 0; f < 2; f++)
229     {
230       char *discards = discarded[f];
231       lin end = filevec[f].buffered_lines;
232       lin j = 0;
233       for (i = 0; i < end; ++i)
234 	if (minimal || discards[i] == 0)
235 	  {
236 	    filevec[f].undiscarded[j] = filevec[f].equivs[i];
237 	    filevec[f].realindexes[j++] = i;
238 	  }
239 	else
240 	  filevec[f].changed[i] = 1;
241       filevec[f].nondiscarded_lines = j;
242     }
243 
244   free (discarded[0]);
245   free (equiv_count[0]);
246 }
247 
248 /* Adjust inserts/deletes of identical lines to join changes
249    as much as possible.
250 
251    We do something when a run of changed lines include a
252    line at one end and have an excluded, identical line at the other.
253    We are free to choose which identical line is included.
254    'compareseq' usually chooses the one at the beginning,
255    but usually it is cleaner to consider the following identical line
256    to be the "change".  */
257 
258 static void
259 shift_boundaries (struct file_data filevec[])
260 {
261   int f;
262 
263   for (f = 0; f < 2; f++)
264     {
265       char *changed = filevec[f].changed;
266       char *other_changed = filevec[1 - f].changed;
267       lin const *equivs = filevec[f].equivs;
268       lin i = 0;
269       lin j = 0;
270       lin i_end = filevec[f].buffered_lines;
271 
272       while (1)
273 	{
274 	  lin runlength, start, corresponding;
275 
276 	  /* Scan forwards to find beginning of another run of changes.
277 	     Also keep track of the corresponding point in the other file.  */
278 
279 	  while (i < i_end && !changed[i])
280 	    {
281 	      while (other_changed[j++])
282 		continue;
283 	      i++;
284 	    }
285 
286 	  if (i == i_end)
287 	    break;
288 
289 	  start = i;
290 
291 	  /* Find the end of this run of changes.  */
292 
293 	  while (changed[++i])
294 	    continue;
295 	  while (other_changed[j])
296 	    j++;
297 
298 	  do
299 	    {
300 	      /* Record the length of this run of changes, so that
301 		 we can later determine whether the run has grown.  */
302 	      runlength = i - start;
303 
304 	      /* Move the changed region back, so long as the
305 		 previous unchanged line matches the last changed one.
306 		 This merges with previous changed regions.  */
307 
308 	      while (start && equivs[start - 1] == equivs[i - 1])
309 		{
310 		  changed[--start] = 1;
311 		  changed[--i] = 0;
312 		  while (changed[start - 1])
313 		    start--;
314 		  while (other_changed[--j])
315 		    continue;
316 		}
317 
318 	      /* Set CORRESPONDING to the end of the changed run, at the last
319 		 point where it corresponds to a changed run in the other file.
320 		 CORRESPONDING == I_END means no such point has been found.  */
321 	      corresponding = other_changed[j - 1] ? i : i_end;
322 
323 	      /* Move the changed region forward, so long as the
324 		 first changed line matches the following unchanged one.
325 		 This merges with following changed regions.
326 		 Do this second, so that if there are no merges,
327 		 the changed region is moved forward as far as possible.  */
328 
329 	      while (i != i_end && equivs[start] == equivs[i])
330 		{
331 		  changed[start++] = 0;
332 		  changed[i++] = 1;
333 		  while (changed[i])
334 		    i++;
335 		  while (other_changed[++j])
336 		    corresponding = i;
337 		}
338 	    }
339 	  while (runlength != i - start);
340 
341 	  /* If possible, move the fully-merged run of changes
342 	     back to a corresponding run in the other file.  */
343 
344 	  while (corresponding < i)
345 	    {
346 	      changed[--start] = 1;
347 	      changed[--i] = 0;
348 	      while (other_changed[--j])
349 		continue;
350 	    }
351 	}
352     }
353 }
354 
355 /* Cons an additional entry onto the front of an edit script OLD.
356    LINE0 and LINE1 are the first affected lines in the two files (origin 0).
357    DELETED is the number of lines deleted here from file 0.
358    INSERTED is the number of lines inserted here in file 1.
359 
360    If DELETED is 0 then LINE0 is the number of the line before
361    which the insertion was done; vice versa for INSERTED and LINE1.  */
362 
363 static struct change *
364 add_change (lin line0, lin line1, lin deleted, lin inserted,
365 	    struct change *old)
366 {
367   struct change *new = xmalloc (sizeof *new);
368 
369   new->line0 = line0;
370   new->line1 = line1;
371   new->inserted = inserted;
372   new->deleted = deleted;
373   new->link = old;
374   return new;
375 }
376 
377 /* Scan the tables of which lines are inserted and deleted,
378    producing an edit script in reverse order.  */
379 
380 static struct change *
381 build_reverse_script (struct file_data const filevec[])
382 {
383   struct change *script = 0;
384   char *changed0 = filevec[0].changed;
385   char *changed1 = filevec[1].changed;
386   lin len0 = filevec[0].buffered_lines;
387   lin len1 = filevec[1].buffered_lines;
388 
389   /* Note that changedN[lenN] does exist, and is 0.  */
390 
391   lin i0 = 0, i1 = 0;
392 
393   while (i0 < len0 || i1 < len1)
394     {
395       if (changed0[i0] | changed1[i1])
396 	{
397 	  lin line0 = i0, line1 = i1;
398 
399 	  /* Find # lines changed here in each file.  */
400 	  while (changed0[i0]) ++i0;
401 	  while (changed1[i1]) ++i1;
402 
403 	  /* Record this change.  */
404 	  script = add_change (line0, line1, i0 - line0, i1 - line1, script);
405 	}
406 
407       /* We have reached lines in the two files that match each other.  */
408       i0++, i1++;
409     }
410 
411   return script;
412 }
413 
414 /* Scan the tables of which lines are inserted and deleted,
415    producing an edit script in forward order.  */
416 
417 static struct change *
418 build_script (struct file_data const filevec[])
419 {
420   struct change *script = 0;
421   char *changed0 = filevec[0].changed;
422   char *changed1 = filevec[1].changed;
423   lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
424 
425   /* Note that changedN[-1] does exist, and is 0.  */
426 
427   while (i0 >= 0 || i1 >= 0)
428     {
429       if (changed0[i0 - 1] | changed1[i1 - 1])
430 	{
431 	  lin line0 = i0, line1 = i1;
432 
433 	  /* Find # lines changed here in each file.  */
434 	  while (changed0[i0 - 1]) --i0;
435 	  while (changed1[i1 - 1]) --i1;
436 
437 	  /* Record this change.  */
438 	  script = add_change (i0, i1, line0 - i0, line1 - i1, script);
439 	}
440 
441       /* We have reached lines in the two files that match each other.  */
442       i0--, i1--;
443     }
444 
445   return script;
446 }
447 
448 /* If CHANGES, briefly report that two files differed.
449    Return 2 if trouble, CHANGES otherwise.  */
450 static int
451 briefly_report (int changes, struct file_data const filevec[])
452 {
453   if (changes)
454     {
455       char const *label0 = file_label[0] ? file_label[0] : filevec[0].name;
456       char const *label1 = file_label[1] ? file_label[1] : filevec[1].name;
457 
458       if (brief)
459 	message ("Files %s and %s differ\n", label0, label1);
460       else
461 	{
462 	  message ("Binary files %s and %s differ\n", label0, label1);
463 	  changes = 2;
464 	}
465     }
466 
467   return changes;
468 }
469 
470 /* Report the differences of two files.  */
471 int
472 diff_2_files (struct comparison *cmp)
473 {
474   int f;
475   struct change *e, *p;
476   struct change *script;
477   int changes;
478 
479 
480   /* If we have detected that either file is binary,
481      compare the two files as binary.  This can happen
482      only when the first chunk is read.
483      Also, --brief without any --ignore-* options means
484      we can speed things up by treating the files as binary.  */
485 
486   if (read_files (cmp->file, files_can_be_treated_as_binary))
487     {
488       /* Files with different lengths must be different.  */
489       if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
490 	  && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode))
491 	  && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode)))
492 	changes = 1;
493 
494       /* Standard input equals itself.  */
495       else if (cmp->file[0].desc == cmp->file[1].desc)
496 	changes = 0;
497 
498       else
499 	/* Scan both files, a buffer at a time, looking for a difference.  */
500 	{
501 	  /* Allocate same-sized buffers for both files.  */
502 	  size_t lcm_max = PTRDIFF_MAX - 1;
503 	  size_t buffer_size =
504 	    buffer_lcm (sizeof (word),
505 			buffer_lcm (STAT_BLOCKSIZE (cmp->file[0].stat),
506 				    STAT_BLOCKSIZE (cmp->file[1].stat),
507 				    lcm_max),
508 			lcm_max);
509 	  for (f = 0; f < 2; f++)
510 	    cmp->file[f].buffer = xrealloc (cmp->file[f].buffer, buffer_size);
511 
512 	  for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0)
513 	    {
514 	      /* Read a buffer's worth from both files.  */
515 	      for (f = 0; f < 2; f++)
516 		if (0 <= cmp->file[f].desc)
517 		  file_block_read (&cmp->file[f],
518 				   buffer_size - cmp->file[f].buffered);
519 
520 	      /* If the buffers differ, the files differ.  */
521 	      if (cmp->file[0].buffered != cmp->file[1].buffered
522 		  || memcmp (cmp->file[0].buffer,
523 			     cmp->file[1].buffer,
524 			     cmp->file[0].buffered))
525 		{
526 		  changes = 1;
527 		  break;
528 		}
529 
530 	      /* If we reach end of file, the files are the same.  */
531 	      if (cmp->file[0].buffered != buffer_size)
532 		{
533 		  changes = 0;
534 		  break;
535 		}
536 	    }
537 	}
538 
539       changes = briefly_report (changes, cmp->file);
540     }
541   else
542     {
543       struct context ctxt;
544       lin diags;
545       lin too_expensive;
546 
547       /* Allocate vectors for the results of comparison:
548 	 a flag for each line of each file, saying whether that line
549 	 is an insertion or deletion.
550 	 Allocate an extra element, always 0, at each end of each vector.  */
551 
552       size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
553       char *flag_space = zalloc (s);
554       cmp->file[0].changed = flag_space + 1;
555       cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
556 
557       /* Some lines are obviously insertions or deletions
558 	 because they don't match anything.  Detect them now, and
559 	 avoid even thinking about them in the main comparison algorithm.  */
560 
561       discard_confusing_lines (cmp->file);
562 
563       /* Now do the main comparison algorithm, considering just the
564 	 undiscarded lines.  */
565 
566       ctxt.xvec = cmp->file[0].undiscarded;
567       ctxt.yvec = cmp->file[1].undiscarded;
568       diags = (cmp->file[0].nondiscarded_lines
569 	       + cmp->file[1].nondiscarded_lines + 3);
570       ctxt.fdiag = xmalloc (diags * (2 * sizeof *ctxt.fdiag));
571       ctxt.bdiag = ctxt.fdiag + diags;
572       ctxt.fdiag += cmp->file[1].nondiscarded_lines + 1;
573       ctxt.bdiag += cmp->file[1].nondiscarded_lines + 1;
574 
575       ctxt.heuristic = speed_large_files;
576 
577       /* Set TOO_EXPENSIVE to be approximate square root of input size,
578 	 bounded below by 256.  */
579       too_expensive = 1;
580       for (;  diags != 0;  diags >>= 2)
581 	too_expensive <<= 1;
582       ctxt.too_expensive = MAX (256, too_expensive);
583 
584       files[0] = cmp->file[0];
585       files[1] = cmp->file[1];
586 
587       compareseq (0, cmp->file[0].nondiscarded_lines,
588 		  0, cmp->file[1].nondiscarded_lines, minimal, &ctxt);
589 
590       free (ctxt.fdiag - (cmp->file[1].nondiscarded_lines + 1));
591 
592       /* Modify the results slightly to make them prettier
593 	 in cases where that can validly be done.  */
594 
595       shift_boundaries (cmp->file);
596 
597       /* Get the results of comparison in the form of a chain
598 	 of 'struct change's -- an edit script.  */
599 
600       if (output_style == OUTPUT_ED)
601 	script = build_reverse_script (cmp->file);
602       else
603 	script = build_script (cmp->file);
604 
605       /* Set CHANGES if we had any diffs.
606 	 If some changes are ignored, we must scan the script to decide.  */
607       if (ignore_blank_lines || ignore_regexp.fastmap)
608 	{
609 	  struct change *next = script;
610 	  changes = 0;
611 
612 	  while (next && changes == 0)
613 	    {
614 	      struct change *this, *end;
615 	      lin first0, last0, first1, last1;
616 
617 	      /* Find a set of changes that belong together.  */
618 	      this = next;
619 	      end = find_change (next);
620 
621 	      /* Disconnect them from the rest of the changes, making them
622 		 a hunk, and remember the rest for next iteration.  */
623 	      next = end->link;
624 	      end->link = 0;
625 
626 	      /* Determine whether this hunk is really a difference.  */
627 	      if (analyze_hunk (this, &first0, &last0, &first1, &last1))
628 		changes = 1;
629 
630 	      /* Reconnect the script so it will all be freed properly.  */
631 	      end->link = next;
632 	    }
633 	}
634       else
635 	changes = (script != 0);
636 
637       if (brief)
638 	changes = briefly_report (changes, cmp->file);
639       else
640 	{
641 	  if (changes || !no_diff_means_no_output)
642 	    {
643 	      /* Record info for starting up output,
644 		 to be used if and when we have some output to print.  */
645 	      setup_output (file_label[0] ? file_label[0] : cmp->file[0].name,
646 			    file_label[1] ? file_label[1] : cmp->file[1].name,
647 			    cmp->parent != 0);
648 
649 	      switch (output_style)
650 		{
651 		case OUTPUT_CONTEXT:
652 		  print_context_script (script, false);
653 		  break;
654 
655 		case OUTPUT_UNIFIED:
656 		  print_context_script (script, true);
657 		  break;
658 
659 		case OUTPUT_ED:
660 		  print_ed_script (script);
661 		  break;
662 
663 		case OUTPUT_FORWARD_ED:
664 		  pr_forward_ed_script (script);
665 		  break;
666 
667 		case OUTPUT_RCS:
668 		  print_rcs_script (script);
669 		  break;
670 
671 		case OUTPUT_NORMAL:
672 		  print_normal_script (script);
673 		  break;
674 
675 		case OUTPUT_IFDEF:
676 		  print_ifdef_script (script);
677 		  break;
678 
679 		case OUTPUT_SDIFF:
680 		  print_sdiff_script (script);
681 		  break;
682 
683 		default:
684 		  abort ();
685 		}
686 
687 	      finish_output ();
688 	    }
689 	}
690 
691       free (cmp->file[0].undiscarded);
692 
693       free (flag_space);
694 
695       for (f = 0; f < 2; f++)
696 	{
697 	  free (cmp->file[f].equivs);
698 	  free (cmp->file[f].linbuf + cmp->file[f].linbuf_base);
699 	}
700 
701       for (e = script; e; e = p)
702 	{
703 	  p = e->link;
704 	  free (e);
705 	}
706 
707       if (! ROBUST_OUTPUT_STYLE (output_style))
708 	for (f = 0; f < 2; ++f)
709 	  if (cmp->file[f].missing_newline)
710 	    {
711 	      error (0, 0, "%s: %s\n",
712 		     file_label[f] ? file_label[f] : cmp->file[f].name,
713 		     _("No newline at end of file"));
714 	      changes = 2;
715 	    }
716     }
717 
718   if (cmp->file[0].buffer != cmp->file[1].buffer)
719     free (cmp->file[0].buffer);
720   free (cmp->file[1].buffer);
721 
722   return changes;
723 }
724