xref: /dragonfly/contrib/diffutils/src/io.c (revision 52f9f0d9)
1 /* File I/O for GNU DIFF.
2 
3    Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006, 2009-2011
4    Free Software Foundation, Inc.
5 
6    This file is part of GNU DIFF.
7 
8    This program is free software: you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation, either version 3 of the License, or
11    (at your option) any later version.
12 
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
20 
21 #include "diff.h"
22 #include <cmpbuf.h>
23 #include <file-type.h>
24 #include <xalloc.h>
25 
26 /* Rotate an unsigned value to the left.  */
27 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
28 
29 /* Given a hash value and a new character, return a new hash value.  */
30 #define HASH(h, c) ((c) + ROL (h, 7))
31 
32 /* The type of a hash value.  */
33 typedef size_t hash_value;
34 verify (! TYPE_SIGNED (hash_value));
35 
36 /* Lines are put into equivalence classes of lines that match in lines_differ.
37    Each equivalence class is represented by one of these structures,
38    but only while the classes are being computed.
39    Afterward, each class is represented by a number.  */
40 struct equivclass
41 {
42   lin next;		/* Next item in this bucket.  */
43   hash_value hash;	/* Hash of lines in this class.  */
44   char const *line;	/* A line that fits this class.  */
45   size_t length;	/* That line's length, not counting its newline.  */
46 };
47 
48 /* Hash-table: array of buckets, each being a chain of equivalence classes.
49    buckets[-1] is reserved for incomplete lines.  */
50 static lin *buckets;
51 
52 /* Number of buckets in the hash table array, not counting buckets[-1].  */
53 static size_t nbuckets;
54 
55 /* Array in which the equivalence classes are allocated.
56    The bucket-chains go through the elements in this array.
57    The number of an equivalence class is its index in this array.  */
58 static struct equivclass *equivs;
59 
60 /* Index of first free element in the array `equivs'.  */
61 static lin equivs_index;
62 
63 /* Number of elements allocated in the array `equivs'.  */
64 static lin equivs_alloc;
65 
66 /* Read a block of data into a file buffer, checking for EOF and error.  */
67 
68 void
69 file_block_read (struct file_data *current, size_t size)
70 {
71   if (size && ! current->eof)
72     {
73       size_t s = block_read (current->desc,
74 			     FILE_BUFFER (current) + current->buffered, size);
75       if (s == SIZE_MAX)
76 	pfatal_with_name (current->name);
77       current->buffered += s;
78       current->eof = s < size;
79     }
80 }
81 
82 /* Check for binary files and compare them for exact identity.  */
83 
84 /* Return 1 if BUF contains a non text character.
85    SIZE is the number of characters in BUF.  */
86 
87 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
88 
89 /* Get ready to read the current file.
90    Return nonzero if SKIP_TEST is zero,
91    and if it appears to be a binary file.  */
92 
93 static bool
94 sip (struct file_data *current, bool skip_test)
95 {
96   /* If we have a nonexistent file at this stage, treat it as empty.  */
97   if (current->desc < 0)
98     {
99       /* Leave room for a sentinel.  */
100       current->bufsize = sizeof (word);
101       current->buffer = xmalloc (current->bufsize);
102     }
103   else
104     {
105       current->bufsize = buffer_lcm (sizeof (word),
106 				     STAT_BLOCKSIZE (current->stat),
107 				     PTRDIFF_MAX - 2 * sizeof (word));
108       current->buffer = xmalloc (current->bufsize);
109 
110       if (! skip_test)
111 	{
112 	  /* Check first part of file to see if it's a binary file.  */
113 
114 	  /* FIXME: if O_BINARY, this should revert to text mode
115 	     if the file is not binary.  */
116 
117 	  file_block_read (current, current->bufsize);
118 	  return binary_file_p (current->buffer, current->buffered);
119 	}
120     }
121 
122   current->buffered = 0;
123   current->eof = false;
124   return false;
125 }
126 
127 /* Slurp the rest of the current file completely into memory.  */
128 
129 static void
130 slurp (struct file_data *current)
131 {
132   size_t cc;
133 
134   if (current->desc < 0)
135     {
136       /* The file is nonexistent.  */
137       return;
138     }
139 
140   if (S_ISREG (current->stat.st_mode))
141     {
142       /* It's a regular file; slurp in the rest all at once.  */
143 
144       /* Get the size out of the stat block.
145 	 Allocate just enough room for appended newline plus word sentinel,
146 	 plus word-alignment since we want the buffer word-aligned.  */
147       size_t file_size = current->stat.st_size;
148       cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
149       if (file_size != current->stat.st_size || cc < file_size
150 	  || PTRDIFF_MAX <= cc)
151 	xalloc_die ();
152 
153       if (current->bufsize < cc)
154 	{
155 	  current->bufsize = cc;
156 	  current->buffer = xrealloc (current->buffer, cc);
157 	}
158 
159       /* Try to read at least 1 more byte than the size indicates, to
160 	 detect whether the file is growing.  This is a nicety for
161 	 users who run 'diff' on files while they are changing.  */
162 
163       if (current->buffered <= file_size)
164 	{
165 	  file_block_read (current, file_size + 1 - current->buffered);
166 	  if (current->buffered <= file_size)
167 	    return;
168 	}
169     }
170 
171   /* It's not a regular file, or it's a growing regular file; read it,
172      growing the buffer as needed.  */
173 
174   file_block_read (current, current->bufsize - current->buffered);
175 
176   if (current->buffered)
177     {
178       while (current->buffered == current->bufsize)
179 	{
180 	  if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
181 	    xalloc_die ();
182 	  current->bufsize *= 2;
183 	  current->buffer = xrealloc (current->buffer, current->bufsize);
184 	  file_block_read (current, current->bufsize - current->buffered);
185 	}
186 
187       /* Allocate just enough room for appended newline plus word
188 	 sentinel, plus word-alignment.  */
189       cc = current->buffered + 2 * sizeof (word);
190       current->bufsize = cc - cc % sizeof (word);
191       current->buffer = xrealloc (current->buffer, current->bufsize);
192     }
193 }
194 
195 /* Split the file into lines, simultaneously computing the equivalence
196    class for each line.  */
197 
198 static void
199 find_and_hash_each_line (struct file_data *current)
200 {
201   char const *p = current->prefix_end;
202   lin i, *bucket;
203   size_t length;
204 
205   /* Cache often-used quantities in local variables to help the compiler.  */
206   char const **linbuf = current->linbuf;
207   lin alloc_lines = current->alloc_lines;
208   lin line = 0;
209   lin linbuf_base = current->linbuf_base;
210   lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
211   struct equivclass *eqs = equivs;
212   lin eqs_index = equivs_index;
213   lin eqs_alloc = equivs_alloc;
214   char const *suffix_begin = current->suffix_begin;
215   char const *bufend = FILE_BUFFER (current) + current->buffered;
216   bool ig_case = ignore_case;
217   enum DIFF_white_space ig_white_space = ignore_white_space;
218   bool diff_length_compare_anyway =
219     ig_white_space != IGNORE_NO_WHITE_SPACE;
220   bool same_length_diff_contents_compare_anyway =
221     diff_length_compare_anyway | ig_case;
222 
223   while (p < suffix_begin)
224     {
225       char const *ip = p;
226       hash_value h = 0;
227       unsigned char c;
228 
229       /* Hash this line until we find a newline.  */
230       switch (ig_white_space)
231 	{
232 	case IGNORE_ALL_SPACE:
233 	  while ((c = *p++) != '\n')
234 	    if (! isspace (c))
235 	      h = HASH (h, ig_case ? tolower (c) : c);
236 	  break;
237 
238 	case IGNORE_SPACE_CHANGE:
239 	  while ((c = *p++) != '\n')
240 	    {
241 	      if (isspace (c))
242 		{
243 		  do
244 		    if ((c = *p++) == '\n')
245 		      goto hashing_done;
246 		  while (isspace (c));
247 
248 		  h = HASH (h, ' ');
249 		}
250 
251 	      /* C is now the first non-space.  */
252 	      h = HASH (h, ig_case ? tolower (c) : c);
253 	    }
254 	  break;
255 
256 	case IGNORE_TAB_EXPANSION:
257 	case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
258 	case IGNORE_TRAILING_SPACE:
259 	  {
260 	    size_t column = 0;
261 	    while ((c = *p++) != '\n')
262 	      {
263 		if (ig_white_space & IGNORE_TRAILING_SPACE
264 		    && isspace (c))
265 		  {
266 		    char const *p1 = p;
267 		    unsigned char c1;
268 		    do
269 		      if ((c1 = *p1++) == '\n')
270 			{
271 			  p = p1;
272 			  goto hashing_done;
273 			}
274 		    while (isspace (c1));
275 		  }
276 
277 		size_t repetitions = 1;
278 
279 		if (ig_white_space & IGNORE_TAB_EXPANSION)
280 		  switch (c)
281 		    {
282 		    case '\b':
283 		      column -= 0 < column;
284 		      break;
285 
286 		    case '\t':
287 		      c = ' ';
288 		      repetitions = tabsize - column % tabsize;
289 		      column = (column + repetitions < column
290 				? 0
291 				: column + repetitions);
292 		      break;
293 
294 		    case '\r':
295 		      column = 0;
296 		      break;
297 
298 		    default:
299 		      column++;
300 		      break;
301 		    }
302 
303 		if (ig_case)
304 		  c = tolower (c);
305 
306 		do
307 		  h = HASH (h, c);
308 		while (--repetitions != 0);
309 	      }
310 	  }
311 	  break;
312 
313 	default:
314 	  if (ig_case)
315 	    while ((c = *p++) != '\n')
316 	      h = HASH (h, tolower (c));
317 	  else
318 	    while ((c = *p++) != '\n')
319 	      h = HASH (h, c);
320 	  break;
321 	}
322 
323    hashing_done:;
324 
325       bucket = &buckets[h % nbuckets];
326       length = p - ip - 1;
327 
328       if (p == bufend
329 	  && current->missing_newline
330 	  && ROBUST_OUTPUT_STYLE (output_style))
331 	{
332 	  /* The last line is incomplete and we do not silently
333 	     complete lines.  If the line cannot compare equal to any
334 	     complete line, put it into buckets[-1] so that it can
335 	     compare equal only to the other file's incomplete line
336 	     (if one exists).  */
337 	  if (ig_white_space < IGNORE_TRAILING_SPACE)
338 	    bucket = &buckets[-1];
339 	}
340 
341       for (i = *bucket;  ;  i = eqs[i].next)
342 	if (!i)
343 	  {
344 	    /* Create a new equivalence class in this bucket.  */
345 	    i = eqs_index++;
346 	    if (i == eqs_alloc)
347 	      {
348 		if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
349 		  xalloc_die ();
350 		eqs_alloc *= 2;
351 		eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
352 	      }
353 	    eqs[i].next = *bucket;
354 	    eqs[i].hash = h;
355 	    eqs[i].line = ip;
356 	    eqs[i].length = length;
357 	    *bucket = i;
358 	    break;
359 	  }
360 	else if (eqs[i].hash == h)
361 	  {
362 	    char const *eqline = eqs[i].line;
363 
364 	    /* Reuse existing class if lines_differ reports the lines
365                equal.  */
366 	    if (eqs[i].length == length)
367 	      {
368 		/* Reuse existing equivalence class if the lines are identical.
369 		   This detects the common case of exact identity
370 		   faster than lines_differ would.  */
371 		if (memcmp (eqline, ip, length) == 0)
372 		  break;
373 		if (!same_length_diff_contents_compare_anyway)
374 		  continue;
375 	      }
376 	    else if (!diff_length_compare_anyway)
377 	      continue;
378 
379 	    if (! lines_differ (eqline, ip))
380 	      break;
381 	  }
382 
383       /* Maybe increase the size of the line table.  */
384       if (line == alloc_lines)
385 	{
386 	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
387 	  if (PTRDIFF_MAX / 3 <= alloc_lines
388 	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
389 	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
390 	    xalloc_die ();
391 	  alloc_lines = 2 * alloc_lines - linbuf_base;
392 	  cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
393 	  linbuf += linbuf_base;
394 	  linbuf = xrealloc (linbuf,
395 			     (alloc_lines - linbuf_base) * sizeof *linbuf);
396 	  linbuf -= linbuf_base;
397 	}
398       linbuf[line] = ip;
399       cureqs[line] = i;
400       ++line;
401     }
402 
403   current->buffered_lines = line;
404 
405   for (i = 0;  ;  i++)
406     {
407       /* Record the line start for lines in the suffix that we care about.
408 	 Record one more line start than lines,
409 	 so that we can compute the length of any buffered line.  */
410       if (line == alloc_lines)
411 	{
412 	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
413 	  if (PTRDIFF_MAX / 3 <= alloc_lines
414 	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
415 	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
416 	    xalloc_die ();
417 	  alloc_lines = 2 * alloc_lines - linbuf_base;
418 	  linbuf += linbuf_base;
419 	  linbuf = xrealloc (linbuf,
420 			     (alloc_lines - linbuf_base) * sizeof *linbuf);
421 	  linbuf -= linbuf_base;
422 	}
423       linbuf[line] = p;
424 
425       if (p == bufend)
426 	{
427 	  /* If the last line is incomplete and we do not silently
428 	     complete lines, don't count its appended newline.  */
429 	  if (current->missing_newline && ROBUST_OUTPUT_STYLE (output_style))
430 	    linbuf[line]--;
431 	  break;
432 	}
433 
434       if (context <= i && no_diff_means_no_output)
435 	break;
436 
437       line++;
438 
439       while (*p++ != '\n')
440 	continue;
441     }
442 
443   /* Done with cache in local variables.  */
444   current->linbuf = linbuf;
445   current->valid_lines = line;
446   current->alloc_lines = alloc_lines;
447   current->equivs = cureqs;
448   equivs = eqs;
449   equivs_alloc = eqs_alloc;
450   equivs_index = eqs_index;
451 }
452 
453 /* Prepare the text.  Make sure the text end is initialized.
454    Make sure text ends in a newline,
455    but remember that we had to add one.
456    Strip trailing CRs, if that was requested.  */
457 
458 static void
459 prepare_text (struct file_data *current)
460 {
461   size_t buffered = current->buffered;
462   char *p = FILE_BUFFER (current);
463   char *dst;
464 
465   if (buffered == 0 || p[buffered - 1] == '\n')
466     current->missing_newline = false;
467   else
468     {
469       p[buffered++] = '\n';
470       current->missing_newline = true;
471     }
472 
473   if (!p)
474     return;
475 
476   /* Don't use uninitialized storage when planting or using sentinels.  */
477   memset (p + buffered, 0, sizeof (word));
478 
479   if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
480     {
481       char const *src = dst;
482       char const *srclim = p + buffered;
483 
484       do
485 	dst += ! ((*dst = *src++) == '\r' && *src == '\n');
486       while (src < srclim);
487 
488       buffered -= src - dst;
489     }
490 
491   current->buffered = buffered;
492 }
493 
494 /* We have found N lines in a buffer of size S; guess the
495    proportionate number of lines that will be found in a buffer of
496    size T.  However, do not guess a number of lines so large that the
497    resulting line table might cause overflow in size calculations.  */
498 static lin
499 guess_lines (lin n, size_t s, size_t t)
500 {
501   size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
502   lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
503   return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
504 }
505 
506 /* Given a vector of two file_data objects, find the identical
507    prefixes and suffixes of each object.  */
508 
509 static void
510 find_identical_ends (struct file_data filevec[])
511 {
512   word *w0, *w1;
513   char *p0, *p1, *buffer0, *buffer1;
514   char const *end0, *beg0;
515   char const **linbuf0, **linbuf1;
516   lin i, lines;
517   size_t n0, n1;
518   lin alloc_lines0, alloc_lines1;
519   lin buffered_prefix, prefix_count, prefix_mask;
520   lin middle_guess, suffix_guess;
521 
522   slurp (&filevec[0]);
523   prepare_text (&filevec[0]);
524   if (filevec[0].desc != filevec[1].desc)
525     {
526       slurp (&filevec[1]);
527       prepare_text (&filevec[1]);
528     }
529   else
530     {
531       filevec[1].buffer = filevec[0].buffer;
532       filevec[1].bufsize = filevec[0].bufsize;
533       filevec[1].buffered = filevec[0].buffered;
534       filevec[1].missing_newline = filevec[0].missing_newline;
535     }
536 
537   /* Find identical prefix.  */
538 
539   w0 = filevec[0].buffer;
540   w1 = filevec[1].buffer;
541   p0 = buffer0 = (char *) w0;
542   p1 = buffer1 = (char *) w1;
543   n0 = filevec[0].buffered;
544   n1 = filevec[1].buffered;
545 
546   if (p0 == p1)
547     /* The buffers are the same; sentinels won't work.  */
548     p0 = p1 += n1;
549   else
550     {
551       /* Insert end sentinels, in this case characters that are guaranteed
552 	 to make the equality test false, and thus terminate the loop.  */
553 
554       if (n0 < n1)
555 	p0[n0] = ~p1[n0];
556       else
557 	p1[n1] = ~p0[n1];
558 
559       /* Loop until first mismatch, or to the sentinel characters.  */
560 
561       /* Compare a word at a time for speed.  */
562       while (*w0 == *w1)
563 	w0++, w1++;
564 
565       /* Do the last few bytes of comparison a byte at a time.  */
566       p0 = (char *) w0;
567       p1 = (char *) w1;
568       while (*p0 == *p1)
569 	p0++, p1++;
570 
571       /* Don't mistakenly count missing newline as part of prefix.  */
572       if (ROBUST_OUTPUT_STYLE (output_style)
573 	  && ((buffer0 + n0 - filevec[0].missing_newline < p0)
574 	      !=
575 	      (buffer1 + n1 - filevec[1].missing_newline < p1)))
576 	p0--, p1--;
577     }
578 
579   /* Now P0 and P1 point at the first nonmatching characters.  */
580 
581   /* Skip back to last line-beginning in the prefix,
582      and then discard up to HORIZON_LINES lines from the prefix.  */
583   i = horizon_lines;
584   while (p0 != buffer0 && (p0[-1] != '\n' || i--))
585     p0--, p1--;
586 
587   /* Record the prefix.  */
588   filevec[0].prefix_end = p0;
589   filevec[1].prefix_end = p1;
590 
591   /* Find identical suffix.  */
592 
593   /* P0 and P1 point beyond the last chars not yet compared.  */
594   p0 = buffer0 + n0;
595   p1 = buffer1 + n1;
596 
597   if (! ROBUST_OUTPUT_STYLE (output_style)
598       || filevec[0].missing_newline == filevec[1].missing_newline)
599     {
600       end0 = p0;	/* Addr of last char in file 0.  */
601 
602       /* Get value of P0 at which we should stop scanning backward:
603 	 this is when either P0 or P1 points just past the last char
604 	 of the identical prefix.  */
605       beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
606 
607       /* Scan back until chars don't match or we reach that point.  */
608       while (p0 != beg0)
609 	if (*--p0 != *--p1)
610 	  {
611 	    /* Point at the first char of the matching suffix.  */
612 	    ++p0, ++p1;
613 	    beg0 = p0;
614 	    break;
615 	  }
616 
617       /* Are we at a line-beginning in both files?  If not, add the rest of
618 	 this line to the main body.  Discard up to HORIZON_LINES lines from
619 	 the identical suffix.  Also, discard one extra line,
620 	 because shift_boundaries may need it.  */
621       i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
622 			    &&
623 			    (buffer1 == p1 || p1[-1] == '\n'));
624       while (i-- && p0 != end0)
625 	while (*p0++ != '\n')
626 	  continue;
627 
628       p1 += p0 - beg0;
629     }
630 
631   /* Record the suffix.  */
632   filevec[0].suffix_begin = p0;
633   filevec[1].suffix_begin = p1;
634 
635   /* Calculate number of lines of prefix to save.
636 
637      prefix_count == 0 means save the whole prefix;
638      we need this for options like -D that output the whole file,
639      or for enormous contexts (to avoid worrying about arithmetic overflow).
640      We also need it for options like -F that output some preceding line;
641      at least we will need to find the last few lines,
642      but since we don't know how many, it's easiest to find them all.
643 
644      Otherwise, prefix_count != 0.  Save just prefix_count lines at start
645      of the line buffer; they'll be moved to the proper location later.
646      Handle 1 more line than the context says (because we count 1 too many),
647      rounded up to the next power of 2 to speed index computation.  */
648 
649   if (no_diff_means_no_output && ! function_regexp.fastmap
650       && context < LIN_MAX / 4 && context < n0)
651     {
652       middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
653       suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
654       for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
655 	continue;
656       alloc_lines0 = (prefix_count + middle_guess
657 		      + MIN (context, suffix_guess));
658     }
659   else
660     {
661       prefix_count = 0;
662       alloc_lines0 = guess_lines (0, 0, n0);
663     }
664 
665   prefix_mask = prefix_count - 1;
666   lines = 0;
667   linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
668   p0 = buffer0;
669 
670   /* If the prefix is needed, find the prefix lines.  */
671   if (! (no_diff_means_no_output
672 	 && filevec[0].prefix_end == p0
673 	 && filevec[1].prefix_end == p1))
674     {
675       end0 = filevec[0].prefix_end;
676       while (p0 != end0)
677 	{
678 	  lin l = lines++ & prefix_mask;
679 	  if (l == alloc_lines0)
680 	    {
681 	      if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
682 		xalloc_die ();
683 	      alloc_lines0 *= 2;
684 	      linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
685 	    }
686 	  linbuf0[l] = p0;
687 	  while (*p0++ != '\n')
688 	    continue;
689 	}
690     }
691   buffered_prefix = prefix_count && context < lines ? context : lines;
692 
693   /* Allocate line buffer 1.  */
694 
695   middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
696   suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
697   alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
698   if (alloc_lines1 < buffered_prefix
699       || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
700     xalloc_die ();
701   linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
702 
703   if (buffered_prefix != lines)
704     {
705       /* Rotate prefix lines to proper location.  */
706       for (i = 0;  i < buffered_prefix;  i++)
707 	linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
708       for (i = 0;  i < buffered_prefix;  i++)
709 	linbuf0[i] = linbuf1[i];
710     }
711 
712   /* Initialize line buffer 1 from line buffer 0.  */
713   for (i = 0; i < buffered_prefix; i++)
714     linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
715 
716   /* Record the line buffer, adjusted so that
717      linbuf[0] points at the first differing line.  */
718   filevec[0].linbuf = linbuf0 + buffered_prefix;
719   filevec[1].linbuf = linbuf1 + buffered_prefix;
720   filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
721   filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
722   filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
723   filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
724 }
725 
726 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
727    than 2**k.  This table is derived from Chris K. Caldwell's list
728    <http://www.utm.edu/research/primes/lists/2small/>.  */
729 
730 static unsigned char const prime_offset[] =
731 {
732   0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
733   15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
734   11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
735   55, 93, 1, 57, 25
736 };
737 
738 /* Verify that this host's size_t is not too wide for the above table.  */
739 
740 verify (sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
741 
742 /* Given a vector of two file_data objects, read the file associated
743    with each one, and build the table of equivalence classes.
744    Return nonzero if either file appears to be a binary file.
745    If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
746 
747 bool
748 read_files (struct file_data filevec[], bool pretend_binary)
749 {
750   int i;
751   bool skip_test = text | pretend_binary;
752   bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
753 
754   if (filevec[0].desc != filevec[1].desc)
755     appears_binary |= sip (&filevec[1], skip_test | appears_binary);
756   else
757     {
758       filevec[1].buffer = filevec[0].buffer;
759       filevec[1].bufsize = filevec[0].bufsize;
760       filevec[1].buffered = filevec[0].buffered;
761     }
762   if (appears_binary)
763     {
764       /* FIXME: If O_BINARY, this should set both files to binary mode.  */
765       return true;
766     }
767 
768   find_identical_ends (filevec);
769 
770   equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
771   if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
772     xalloc_die ();
773   equivs = xmalloc (equivs_alloc * sizeof *equivs);
774   /* Equivalence class 0 is permanently safe for lines that were not
775      hashed.  Real equivalence classes start at 1.  */
776   equivs_index = 1;
777 
778   /* Allocate (one plus) a prime number of hash buckets.  Use a prime
779      number between 1/3 and 2/3 of the value of equiv_allocs,
780      approximately.  */
781   for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
782     continue;
783   nbuckets = ((size_t) 1 << i) - prime_offset[i];
784   if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
785     xalloc_die ();
786   buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
787   buckets++;
788 
789   for (i = 0; i < 2; i++)
790     find_and_hash_each_line (&filevec[i]);
791 
792   filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
793 
794   free (equivs);
795   free (buckets - 1);
796 
797   return false;
798 }
799