xref: /dragonfly/contrib/diffutils/src/analyze.c (revision e6d22e9b)
1 /* Analyze file differences for GNU DIFF.
2 
3    Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006-2007,
4    2009-2013, 2015-2018 Free Software Foundation, Inc.
5 
6    This file is part of GNU DIFF.
7 
8    This program is free software: you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation, either version 3 of the License, or
11    (at your option) any later version.
12 
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
20 
21 #include "diff.h"
22 #include <cmpbuf.h>
23 #include <error.h>
24 #include <file-type.h>
25 #include <xalloc.h>
26 
27 /* The core of the Diff algorithm.  */
28 #define ELEMENT lin
29 #define EQUAL(x,y) ((x) == (y))
30 #define OFFSET lin
31 #define EXTRA_CONTEXT_FIELDS /* none */
32 #define NOTE_DELETE(c, xoff) (files[0].changed[files[0].realindexes[xoff]] = 1)
33 #define NOTE_INSERT(c, yoff) (files[1].changed[files[1].realindexes[yoff]] = 1)
34 #define USE_HEURISTIC 1
35 #include <diffseq.h>
36 
37 /* Discard lines from one file that have no matches in the other file.
38 
39    A line which is discarded will not be considered by the actual
40    comparison algorithm; it will be as if that line were not in the file.
41    The file's 'realindexes' table maps virtual line numbers
42    (which don't count the discarded lines) into real line numbers;
43    this is how the actual comparison algorithm produces results
44    that are comprehensible when the discarded lines are counted.
45 
46    When we discard a line, we also mark it as a deletion or insertion
47    so that it will be printed in the output.  */
48 
49 static void
50 discard_confusing_lines (struct file_data filevec[])
51 {
52   int f;
53   lin i;
54   char *discarded[2];
55   lin *equiv_count[2];
56   lin *p;
57 
58   /* Allocate our results.  */
59   p = xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
60 	       * (2 * sizeof *p));
61   for (f = 0; f < 2; f++)
62     {
63       filevec[f].undiscarded = p;  p += filevec[f].buffered_lines;
64       filevec[f].realindexes = p;  p += filevec[f].buffered_lines;
65     }
66 
67   /* Set up equiv_count[F][I] as the number of lines in file F
68      that fall in equivalence class I.  */
69 
70   p = zalloc (filevec[0].equiv_max * (2 * sizeof *p));
71   equiv_count[0] = p;
72   equiv_count[1] = p + filevec[0].equiv_max;
73 
74   for (i = 0; i < filevec[0].buffered_lines; ++i)
75     ++equiv_count[0][filevec[0].equivs[i]];
76   for (i = 0; i < filevec[1].buffered_lines; ++i)
77     ++equiv_count[1][filevec[1].equivs[i]];
78 
79   /* Set up tables of which lines are going to be discarded.  */
80 
81   discarded[0] = zalloc (filevec[0].buffered_lines
82 			 + filevec[1].buffered_lines);
83   discarded[1] = discarded[0] + filevec[0].buffered_lines;
84 
85   /* Mark to be discarded each line that matches no line of the other file.
86      If a line matches many lines, mark it as provisionally discardable.  */
87 
88   for (f = 0; f < 2; f++)
89     {
90       size_t end = filevec[f].buffered_lines;
91       char *discards = discarded[f];
92       lin *counts = equiv_count[1 - f];
93       lin *equivs = filevec[f].equivs;
94       size_t many = 5;
95       size_t tem = end / 64;
96 
97       /* Multiply MANY by approximate square root of number of lines.
98 	 That is the threshold for provisionally discardable lines.  */
99       while ((tem = tem >> 2) > 0)
100 	many *= 2;
101 
102       for (i = 0; i < end; i++)
103 	{
104 	  lin nmatch;
105 	  if (equivs[i] == 0)
106 	    continue;
107 	  nmatch = counts[equivs[i]];
108 	  if (nmatch == 0)
109 	    discards[i] = 1;
110 	  else if (nmatch > many)
111 	    discards[i] = 2;
112 	}
113     }
114 
115   /* Don't really discard the provisional lines except when they occur
116      in a run of discardables, with nonprovisionals at the beginning
117      and end.  */
118 
119   for (f = 0; f < 2; f++)
120     {
121       lin end = filevec[f].buffered_lines;
122       register char *discards = discarded[f];
123 
124       for (i = 0; i < end; i++)
125 	{
126 	  /* Cancel provisional discards not in middle of run of discards.  */
127 	  if (discards[i] == 2)
128 	    discards[i] = 0;
129 	  else if (discards[i] != 0)
130 	    {
131 	      /* We have found a nonprovisional discard.  */
132 	      register lin j;
133 	      lin length;
134 	      lin provisional = 0;
135 
136 	      /* Find end of this run of discardable lines.
137 		 Count how many are provisionally discardable.  */
138 	      for (j = i; j < end; j++)
139 		{
140 		  if (discards[j] == 0)
141 		    break;
142 		  if (discards[j] == 2)
143 		    ++provisional;
144 		}
145 
146 	      /* Cancel provisional discards at end, and shrink the run.  */
147 	      while (j > i && discards[j - 1] == 2)
148 		discards[--j] = 0, --provisional;
149 
150 	      /* Now we have the length of a run of discardable lines
151 		 whose first and last are not provisional.  */
152 	      length = j - i;
153 
154 	      /* If 1/4 of the lines in the run are provisional,
155 		 cancel discarding of all provisional lines in the run.  */
156 	      if (provisional * 4 > length)
157 		{
158 		  while (j > i)
159 		    if (discards[--j] == 2)
160 		      discards[j] = 0;
161 		}
162 	      else
163 		{
164 		  register lin consec;
165 		  lin minimum = 1;
166 		  lin tem = length >> 2;
167 
168 		  /* MINIMUM is approximate square root of LENGTH/4.
169 		     A subrun of two or more provisionals can stand
170 		     when LENGTH is at least 16.
171 		     A subrun of 4 or more can stand when LENGTH >= 64.  */
172 		  while (0 < (tem >>= 2))
173 		    minimum <<= 1;
174 		  minimum++;
175 
176 		  /* Cancel any subrun of MINIMUM or more provisionals
177 		     within the larger run.  */
178 		  for (j = 0, consec = 0; j < length; j++)
179 		    if (discards[i + j] != 2)
180 		      consec = 0;
181 		    else if (minimum == ++consec)
182 		      /* Back up to start of subrun, to cancel it all.  */
183 		      j -= consec;
184 		    else if (minimum < consec)
185 		      discards[i + j] = 0;
186 
187 		  /* Scan from beginning of run
188 		     until we find 3 or more nonprovisionals in a row
189 		     or until the first nonprovisional at least 8 lines in.
190 		     Until that point, cancel any provisionals.  */
191 		  for (j = 0, consec = 0; j < length; j++)
192 		    {
193 		      if (j >= 8 && discards[i + j] == 1)
194 			break;
195 		      if (discards[i + j] == 2)
196 			consec = 0, discards[i + j] = 0;
197 		      else if (discards[i + j] == 0)
198 			consec = 0;
199 		      else
200 			consec++;
201 		      if (consec == 3)
202 			break;
203 		    }
204 
205 		  /* I advances to the last line of the run.  */
206 		  i += length - 1;
207 
208 		  /* Same thing, from end.  */
209 		  for (j = 0, consec = 0; j < length; j++)
210 		    {
211 		      if (j >= 8 && discards[i - j] == 1)
212 			break;
213 		      if (discards[i - j] == 2)
214 			consec = 0, discards[i - j] = 0;
215 		      else if (discards[i - j] == 0)
216 			consec = 0;
217 		      else
218 			consec++;
219 		      if (consec == 3)
220 			break;
221 		    }
222 		}
223 	    }
224 	}
225     }
226 
227   /* Actually discard the lines. */
228   for (f = 0; f < 2; f++)
229     {
230       char *discards = discarded[f];
231       lin end = filevec[f].buffered_lines;
232       lin j = 0;
233       for (i = 0; i < end; ++i)
234 	if (minimal || discards[i] == 0)
235 	  {
236 	    filevec[f].undiscarded[j] = filevec[f].equivs[i];
237 	    filevec[f].realindexes[j++] = i;
238 	  }
239 	else
240 	  filevec[f].changed[i] = 1;
241       filevec[f].nondiscarded_lines = j;
242     }
243 
244   free (discarded[0]);
245   free (equiv_count[0]);
246 }
247 
248 /* Adjust inserts/deletes of identical lines to join changes
249    as much as possible.
250 
251    We do something when a run of changed lines include a
252    line at one end and have an excluded, identical line at the other.
253    We are free to choose which identical line is included.
254    'compareseq' usually chooses the one at the beginning,
255    but usually it is cleaner to consider the following identical line
256    to be the "change".  */
257 
258 static void
259 shift_boundaries (struct file_data filevec[])
260 {
261   int f;
262 
263   for (f = 0; f < 2; f++)
264     {
265       char *changed = filevec[f].changed;
266       char *other_changed = filevec[1 - f].changed;
267       lin const *equivs = filevec[f].equivs;
268       lin i = 0;
269       lin j = 0;
270       lin i_end = filevec[f].buffered_lines;
271 
272       while (1)
273 	{
274 	  lin runlength, start, corresponding;
275 
276 	  /* Scan forwards to find beginning of another run of changes.
277 	     Also keep track of the corresponding point in the other file.  */
278 
279 	  while (i < i_end && !changed[i])
280 	    {
281 	      while (other_changed[j++])
282 		continue;
283 	      i++;
284 	    }
285 
286 	  if (i == i_end)
287 	    break;
288 
289 	  start = i;
290 
291 	  /* Find the end of this run of changes.  */
292 
293 	  while (changed[++i])
294 	    continue;
295 	  while (other_changed[j])
296 	    j++;
297 
298 	  do
299 	    {
300 	      /* Record the length of this run of changes, so that
301 		 we can later determine whether the run has grown.  */
302 	      runlength = i - start;
303 
304 	      /* Move the changed region back, so long as the
305 		 previous unchanged line matches the last changed one.
306 		 This merges with previous changed regions.  */
307 
308 	      while (start && equivs[start - 1] == equivs[i - 1])
309 		{
310 		  changed[--start] = 1;
311 		  changed[--i] = 0;
312 		  while (changed[start - 1])
313 		    start--;
314 		  while (other_changed[--j])
315 		    continue;
316 		}
317 
318 	      /* Set CORRESPONDING to the end of the changed run, at the last
319 		 point where it corresponds to a changed run in the other file.
320 		 CORRESPONDING == I_END means no such point has been found.  */
321 	      corresponding = other_changed[j - 1] ? i : i_end;
322 
323 	      /* Move the changed region forward, so long as the
324 		 first changed line matches the following unchanged one.
325 		 This merges with following changed regions.
326 		 Do this second, so that if there are no merges,
327 		 the changed region is moved forward as far as possible.  */
328 
329 	      while (i != i_end && equivs[start] == equivs[i])
330 		{
331 		  changed[start++] = 0;
332 		  changed[i++] = 1;
333 		  while (changed[i])
334 		    i++;
335 		  while (other_changed[++j])
336 		    corresponding = i;
337 		}
338 	    }
339 	  while (runlength != i - start);
340 
341 	  /* If possible, move the fully-merged run of changes
342 	     back to a corresponding run in the other file.  */
343 
344 	  while (corresponding < i)
345 	    {
346 	      changed[--start] = 1;
347 	      changed[--i] = 0;
348 	      while (other_changed[--j])
349 		continue;
350 	    }
351 	}
352     }
353 }
354 
355 /* Cons an additional entry onto the front of an edit script OLD.
356    LINE0 and LINE1 are the first affected lines in the two files (origin 0).
357    DELETED is the number of lines deleted here from file 0.
358    INSERTED is the number of lines inserted here in file 1.
359 
360    If DELETED is 0 then LINE0 is the number of the line before
361    which the insertion was done; vice versa for INSERTED and LINE1.  */
362 
363 static struct change *
364 add_change (lin line0, lin line1, lin deleted, lin inserted,
365 	    struct change *old)
366 {
367   struct change *new = xmalloc (sizeof *new);
368 
369   new->line0 = line0;
370   new->line1 = line1;
371   new->inserted = inserted;
372   new->deleted = deleted;
373   new->link = old;
374   return new;
375 }
376 
377 /* Scan the tables of which lines are inserted and deleted,
378    producing an edit script in reverse order.  */
379 
380 static struct change *
381 build_reverse_script (struct file_data const filevec[])
382 {
383   struct change *script = 0;
384   char *changed0 = filevec[0].changed;
385   char *changed1 = filevec[1].changed;
386   lin len0 = filevec[0].buffered_lines;
387   lin len1 = filevec[1].buffered_lines;
388 
389   /* Note that changedN[lenN] does exist, and is 0.  */
390 
391   lin i0 = 0, i1 = 0;
392 
393   while (i0 < len0 || i1 < len1)
394     {
395       if (changed0[i0] | changed1[i1])
396 	{
397 	  lin line0 = i0, line1 = i1;
398 
399 	  /* Find # lines changed here in each file.  */
400 	  while (changed0[i0]) ++i0;
401 	  while (changed1[i1]) ++i1;
402 
403 	  /* Record this change.  */
404 	  script = add_change (line0, line1, i0 - line0, i1 - line1, script);
405 	}
406 
407       /* We have reached lines in the two files that match each other.  */
408       i0++, i1++;
409     }
410 
411   return script;
412 }
413 
414 /* Scan the tables of which lines are inserted and deleted,
415    producing an edit script in forward order.  */
416 
417 static struct change *
418 build_script (struct file_data const filevec[])
419 {
420   struct change *script = 0;
421   char *changed0 = filevec[0].changed;
422   char *changed1 = filevec[1].changed;
423   lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
424 
425   /* Note that changedN[-1] does exist, and is 0.  */
426 
427   while (i0 >= 0 || i1 >= 0)
428     {
429       if (changed0[i0 - 1] | changed1[i1 - 1])
430 	{
431 	  lin line0 = i0, line1 = i1;
432 
433 	  /* Find # lines changed here in each file.  */
434 	  while (changed0[i0 - 1]) --i0;
435 	  while (changed1[i1 - 1]) --i1;
436 
437 	  /* Record this change.  */
438 	  script = add_change (i0, i1, line0 - i0, line1 - i1, script);
439 	}
440 
441       /* We have reached lines in the two files that match each other.  */
442       i0--, i1--;
443     }
444 
445   return script;
446 }
447 
448 /* If CHANGES, briefly report that two files differed.  */
449 static void
450 briefly_report (int changes, struct file_data const filevec[])
451 {
452   if (changes)
453     message ((brief
454 	      ? _("Files %s and %s differ\n")
455 	      : _("Binary files %s and %s differ\n")),
456 	     file_label[0] ? file_label[0] : filevec[0].name,
457 	     file_label[1] ? file_label[1] : filevec[1].name);
458 }
459 
460 /* Report the differences of two files.  */
461 int
462 diff_2_files (struct comparison *cmp)
463 {
464   int f;
465   struct change *e, *p;
466   struct change *script;
467   int changes;
468 
469 
470   /* If we have detected that either file is binary,
471      compare the two files as binary.  This can happen
472      only when the first chunk is read.
473      Also, --brief without any --ignore-* options means
474      we can speed things up by treating the files as binary.  */
475 
476   if (read_files (cmp->file, files_can_be_treated_as_binary))
477     {
478       /* Files with different lengths must be different.  */
479       if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
480 	  && 0 < cmp->file[0].stat.st_size
481 	  && 0 < cmp->file[1].stat.st_size
482 	  && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode))
483 	  && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode)))
484 	changes = 1;
485 
486       /* Standard input equals itself.  */
487       else if (cmp->file[0].desc == cmp->file[1].desc)
488 	changes = 0;
489 
490       else
491 	/* Scan both files, a buffer at a time, looking for a difference.  */
492 	{
493 	  /* Allocate same-sized buffers for both files.  */
494 	  size_t lcm_max = PTRDIFF_MAX - 1;
495 	  size_t buffer_size =
496 	    buffer_lcm (sizeof (word),
497 			buffer_lcm (STAT_BLOCKSIZE (cmp->file[0].stat),
498 				    STAT_BLOCKSIZE (cmp->file[1].stat),
499 				    lcm_max),
500 			lcm_max);
501 	  for (f = 0; f < 2; f++)
502 	    cmp->file[f].buffer = xrealloc (cmp->file[f].buffer, buffer_size);
503 
504 	  for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0)
505 	    {
506 	      /* Read a buffer's worth from both files.  */
507 	      for (f = 0; f < 2; f++)
508 		if (0 <= cmp->file[f].desc)
509 		  file_block_read (&cmp->file[f],
510 				   buffer_size - cmp->file[f].buffered);
511 
512 	      /* If the buffers differ, the files differ.  */
513 	      if (cmp->file[0].buffered != cmp->file[1].buffered
514 		  || memcmp (cmp->file[0].buffer,
515 			     cmp->file[1].buffer,
516 			     cmp->file[0].buffered))
517 		{
518 		  changes = 1;
519 		  break;
520 		}
521 
522 	      /* If we reach end of file, the files are the same.  */
523 	      if (cmp->file[0].buffered != buffer_size)
524 		{
525 		  changes = 0;
526 		  break;
527 		}
528 	    }
529 	}
530 
531       briefly_report (changes, cmp->file);
532     }
533   else
534     {
535       struct context ctxt;
536       lin diags;
537       lin too_expensive;
538 
539       /* Allocate vectors for the results of comparison:
540 	 a flag for each line of each file, saying whether that line
541 	 is an insertion or deletion.
542 	 Allocate an extra element, always 0, at each end of each vector.  */
543 
544       size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
545       char *flag_space = zalloc (s);
546       cmp->file[0].changed = flag_space + 1;
547       cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
548 
549       /* Some lines are obviously insertions or deletions
550 	 because they don't match anything.  Detect them now, and
551 	 avoid even thinking about them in the main comparison algorithm.  */
552 
553       discard_confusing_lines (cmp->file);
554 
555       /* Now do the main comparison algorithm, considering just the
556 	 undiscarded lines.  */
557 
558       ctxt.xvec = cmp->file[0].undiscarded;
559       ctxt.yvec = cmp->file[1].undiscarded;
560       diags = (cmp->file[0].nondiscarded_lines
561 	       + cmp->file[1].nondiscarded_lines + 3);
562       ctxt.fdiag = xmalloc (diags * (2 * sizeof *ctxt.fdiag));
563       ctxt.bdiag = ctxt.fdiag + diags;
564       ctxt.fdiag += cmp->file[1].nondiscarded_lines + 1;
565       ctxt.bdiag += cmp->file[1].nondiscarded_lines + 1;
566 
567       ctxt.heuristic = speed_large_files;
568 
569       /* Set TOO_EXPENSIVE to be the approximate square root of the
570 	 input size, bounded below by 4096.  4096 seems to be good for
571 	 circa-2016 CPUs; see Bug#16848 and Bug#24715.  */
572       too_expensive = 1;
573       for (;  diags != 0;  diags >>= 2)
574 	too_expensive <<= 1;
575       ctxt.too_expensive = MAX (4096, too_expensive);
576 
577       files[0] = cmp->file[0];
578       files[1] = cmp->file[1];
579 
580       compareseq (0, cmp->file[0].nondiscarded_lines,
581 		  0, cmp->file[1].nondiscarded_lines, minimal, &ctxt);
582 
583       free (ctxt.fdiag - (cmp->file[1].nondiscarded_lines + 1));
584 
585       /* Modify the results slightly to make them prettier
586 	 in cases where that can validly be done.  */
587 
588       shift_boundaries (cmp->file);
589 
590       /* Get the results of comparison in the form of a chain
591 	 of 'struct change's -- an edit script.  */
592 
593       if (output_style == OUTPUT_ED)
594 	script = build_reverse_script (cmp->file);
595       else
596 	script = build_script (cmp->file);
597 
598       /* Set CHANGES if we had any diffs.
599 	 If some changes are ignored, we must scan the script to decide.  */
600       if (ignore_blank_lines || ignore_regexp.fastmap)
601 	{
602 	  struct change *next = script;
603 	  changes = 0;
604 
605 	  while (next && changes == 0)
606 	    {
607 	      struct change *this, *end;
608 	      lin first0, last0, first1, last1;
609 
610 	      /* Find a set of changes that belong together.  */
611 	      this = next;
612 	      end = find_change (next);
613 
614 	      /* Disconnect them from the rest of the changes, making them
615 		 a hunk, and remember the rest for next iteration.  */
616 	      next = end->link;
617 	      end->link = 0;
618 
619 	      /* Determine whether this hunk is really a difference.  */
620 	      if (analyze_hunk (this, &first0, &last0, &first1, &last1))
621 		changes = 1;
622 
623 	      /* Reconnect the script so it will all be freed properly.  */
624 	      end->link = next;
625 	    }
626 	}
627       else
628 	changes = (script != 0);
629 
630       if (brief)
631 	briefly_report (changes, cmp->file);
632       else
633 	{
634 	  if (changes || !no_diff_means_no_output)
635 	    {
636 	      /* Record info for starting up output,
637 		 to be used if and when we have some output to print.  */
638 	      setup_output (file_label[0] ? file_label[0] : cmp->file[0].name,
639 			    file_label[1] ? file_label[1] : cmp->file[1].name,
640 			    cmp->parent != 0);
641 
642 	      switch (output_style)
643 		{
644 		case OUTPUT_CONTEXT:
645 		  print_context_script (script, false);
646 		  break;
647 
648 		case OUTPUT_UNIFIED:
649 		  print_context_script (script, true);
650 		  break;
651 
652 		case OUTPUT_ED:
653 		  print_ed_script (script);
654 		  break;
655 
656 		case OUTPUT_FORWARD_ED:
657 		  pr_forward_ed_script (script);
658 		  break;
659 
660 		case OUTPUT_RCS:
661 		  print_rcs_script (script);
662 		  break;
663 
664 		case OUTPUT_NORMAL:
665 		  print_normal_script (script);
666 		  break;
667 
668 		case OUTPUT_IFDEF:
669 		  print_ifdef_script (script);
670 		  break;
671 
672 		case OUTPUT_SDIFF:
673 		  print_sdiff_script (script);
674 		  break;
675 
676 		default:
677 		  abort ();
678 		}
679 
680 	      finish_output ();
681 	    }
682 	}
683 
684       free (cmp->file[0].undiscarded);
685 
686       free (flag_space);
687 
688       for (f = 0; f < 2; f++)
689 	{
690 	  free (cmp->file[f].equivs);
691 	  free (cmp->file[f].linbuf + cmp->file[f].linbuf_base);
692 	}
693 
694       for (e = script; e; e = p)
695 	{
696 	  p = e->link;
697 	  free (e);
698 	}
699 
700       if (! ROBUST_OUTPUT_STYLE (output_style))
701 	for (f = 0; f < 2; ++f)
702 	  if (cmp->file[f].missing_newline)
703 	    {
704 	      error (0, 0, "%s: %s\n",
705 		     file_label[f] ? file_label[f] : cmp->file[f].name,
706 		     _("No newline at end of file"));
707 	      changes = 2;
708 	    }
709     }
710 
711   if (cmp->file[0].buffer != cmp->file[1].buffer)
712     free (cmp->file[0].buffer);
713   free (cmp->file[1].buffer);
714 
715   return changes;
716 }
717