xref: /dragonfly/contrib/diffutils/src/io.c (revision d19ef5a2)
1 /* File I/O for GNU DIFF.
2 
3    Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006, 2009-2013,
4    2015-2018 Free Software Foundation, Inc.
5 
6    This file is part of GNU DIFF.
7 
8    This program is free software: you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation, either version 3 of the License, or
11    (at your option) any later version.
12 
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
20 
21 #include "diff.h"
22 #include <binary-io.h>
23 #include <cmpbuf.h>
24 #include <file-type.h>
25 #include <xalloc.h>
26 
27 /* Rotate an unsigned value to the left.  */
28 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
29 
30 /* Given a hash value and a new character, return a new hash value.  */
31 #define HASH(h, c) ((c) + ROL (h, 7))
32 
33 /* The type of a hash value.  */
34 typedef size_t hash_value;
35 verify (! TYPE_SIGNED (hash_value));
36 
37 /* Lines are put into equivalence classes of lines that match in lines_differ.
38    Each equivalence class is represented by one of these structures,
39    but only while the classes are being computed.
40    Afterward, each class is represented by a number.  */
41 struct equivclass
42 {
43   lin next;		/* Next item in this bucket.  */
44   hash_value hash;	/* Hash of lines in this class.  */
45   char const *line;	/* A line that fits this class.  */
46   size_t length;	/* That line's length, not counting its newline.  */
47 };
48 
49 /* Hash-table: array of buckets, each being a chain of equivalence classes.
50    buckets[-1] is reserved for incomplete lines.  */
51 static lin *buckets;
52 
53 /* Number of buckets in the hash table array, not counting buckets[-1].  */
54 static size_t nbuckets;
55 
56 /* Array in which the equivalence classes are allocated.
57    The bucket-chains go through the elements in this array.
58    The number of an equivalence class is its index in this array.  */
59 static struct equivclass *equivs;
60 
61 /* Index of first free element in the array 'equivs'.  */
62 static lin equivs_index;
63 
64 /* Number of elements allocated in the array 'equivs'.  */
65 static lin equivs_alloc;
66 
67 /* Read a block of data into a file buffer, checking for EOF and error.  */
68 
69 void
70 file_block_read (struct file_data *current, size_t size)
71 {
72   if (size && ! current->eof)
73     {
74       size_t s = block_read (current->desc,
75 			     FILE_BUFFER (current) + current->buffered, size);
76       if (s == SIZE_MAX)
77 	pfatal_with_name (current->name);
78       current->buffered += s;
79       current->eof = s < size;
80     }
81 }
82 
83 /* Check for binary files and compare them for exact identity.  */
84 
85 /* Return 1 if BUF contains a non text character.
86    SIZE is the number of characters in BUF.  */
87 
88 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
89 
90 /* Get ready to read the current file.
91    Return nonzero if SKIP_TEST is zero,
92    and if it appears to be a binary file.  */
93 
94 static bool
95 sip (struct file_data *current, bool skip_test)
96 {
97   /* If we have a nonexistent file at this stage, treat it as empty.  */
98   if (current->desc < 0)
99     {
100       /* Leave room for a sentinel.  */
101       current->bufsize = sizeof (word);
102       current->buffer = xmalloc (current->bufsize);
103     }
104   else
105     {
106       current->bufsize = buffer_lcm (sizeof (word),
107 				     STAT_BLOCKSIZE (current->stat),
108 				     PTRDIFF_MAX - 2 * sizeof (word));
109       current->buffer = xmalloc (current->bufsize);
110 
111 #ifdef __KLIBC__
112       /* Skip test if seek is not possible */
113       skip_test = skip_test
114 		  || (lseek (current->desc, 0, SEEK_CUR) < 0
115 		      && errno == ESPIPE);
116 #endif
117 
118       if (! skip_test)
119 	{
120 	  /* Check first part of file to see if it's a binary file.  */
121 
122 	  int prev_mode = set_binary_mode (current->desc, O_BINARY);
123 	  off_t buffered;
124 	  file_block_read (current, current->bufsize);
125 	  buffered = current->buffered;
126 
127 	  if (prev_mode != O_BINARY)
128 	    {
129 	      /* Revert to text mode and seek back to the start to reread
130 		 the file.  Use relative seek, since file descriptors
131 		 like stdin might not start at offset zero.  */
132 	      if (lseek (current->desc, - buffered, SEEK_CUR) < 0)
133 		pfatal_with_name (current->name);
134 	      set_binary_mode (current->desc, prev_mode);
135 	      current->buffered = 0;
136 	      current->eof = false;
137 	    }
138 
139 	  return binary_file_p (current->buffer, buffered);
140 	}
141     }
142 
143   current->buffered = 0;
144   current->eof = false;
145   return false;
146 }
147 
148 /* Slurp the rest of the current file completely into memory.  */
149 
150 static void
151 slurp (struct file_data *current)
152 {
153   size_t cc;
154 
155   if (current->desc < 0)
156     {
157       /* The file is nonexistent.  */
158       return;
159     }
160 
161   if (S_ISREG (current->stat.st_mode))
162     {
163       /* It's a regular file; slurp in the rest all at once.  */
164 
165       /* Get the size out of the stat block.
166 	 Allocate just enough room for appended newline plus word sentinel,
167 	 plus word-alignment since we want the buffer word-aligned.  */
168       size_t file_size = current->stat.st_size;
169       cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
170       if (file_size != current->stat.st_size || cc < file_size
171 	  || PTRDIFF_MAX <= cc)
172 	xalloc_die ();
173 
174       if (current->bufsize < cc)
175 	{
176 	  current->bufsize = cc;
177 	  current->buffer = xrealloc (current->buffer, cc);
178 	}
179 
180       /* Try to read at least 1 more byte than the size indicates, to
181 	 detect whether the file is growing.  This is a nicety for
182 	 users who run 'diff' on files while they are changing.  */
183 
184       if (current->buffered <= file_size)
185 	{
186 	  file_block_read (current, file_size + 1 - current->buffered);
187 	  if (current->buffered <= file_size)
188 	    return;
189 	}
190     }
191 
192   /* It's not a regular file, or it's a growing regular file; read it,
193      growing the buffer as needed.  */
194 
195   file_block_read (current, current->bufsize - current->buffered);
196 
197   if (current->buffered)
198     {
199       while (current->buffered == current->bufsize)
200 	{
201 	  if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
202 	    xalloc_die ();
203 	  current->bufsize *= 2;
204 	  current->buffer = xrealloc (current->buffer, current->bufsize);
205 	  file_block_read (current, current->bufsize - current->buffered);
206 	}
207 
208       /* Allocate just enough room for appended newline plus word
209 	 sentinel, plus word-alignment.  */
210       cc = current->buffered + 2 * sizeof (word);
211       current->bufsize = cc - cc % sizeof (word);
212       current->buffer = xrealloc (current->buffer, current->bufsize);
213     }
214 }
215 
216 /* Split the file into lines, simultaneously computing the equivalence
217    class for each line.  */
218 
219 static void
220 find_and_hash_each_line (struct file_data *current)
221 {
222   char const *p = current->prefix_end;
223   lin i, *bucket;
224   size_t length;
225 
226   /* Cache often-used quantities in local variables to help the compiler.  */
227   char const **linbuf = current->linbuf;
228   lin alloc_lines = current->alloc_lines;
229   lin line = 0;
230   lin linbuf_base = current->linbuf_base;
231   lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
232   struct equivclass *eqs = equivs;
233   lin eqs_index = equivs_index;
234   lin eqs_alloc = equivs_alloc;
235   char const *suffix_begin = current->suffix_begin;
236   char const *bufend = FILE_BUFFER (current) + current->buffered;
237   bool ig_case = ignore_case;
238   enum DIFF_white_space ig_white_space = ignore_white_space;
239   bool diff_length_compare_anyway =
240     ig_white_space != IGNORE_NO_WHITE_SPACE;
241   bool same_length_diff_contents_compare_anyway =
242     diff_length_compare_anyway | ig_case;
243 
244   while (p < suffix_begin)
245     {
246       char const *ip = p;
247       hash_value h = 0;
248       unsigned char c;
249 
250       /* Hash this line until we find a newline.  */
251       switch (ig_white_space)
252 	{
253 	case IGNORE_ALL_SPACE:
254 	  while ((c = *p++) != '\n')
255 	    if (! isspace (c))
256 	      h = HASH (h, ig_case ? tolower (c) : c);
257 	  break;
258 
259 	case IGNORE_SPACE_CHANGE:
260 	  while ((c = *p++) != '\n')
261 	    {
262 	      if (isspace (c))
263 		{
264 		  do
265 		    if ((c = *p++) == '\n')
266 		      goto hashing_done;
267 		  while (isspace (c));
268 
269 		  h = HASH (h, ' ');
270 		}
271 
272 	      /* C is now the first non-space.  */
273 	      h = HASH (h, ig_case ? tolower (c) : c);
274 	    }
275 	  break;
276 
277 	case IGNORE_TAB_EXPANSION:
278 	case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
279 	case IGNORE_TRAILING_SPACE:
280 	  {
281 	    size_t column = 0;
282 	    while ((c = *p++) != '\n')
283 	      {
284 		if (ig_white_space & IGNORE_TRAILING_SPACE
285 		    && isspace (c))
286 		  {
287 		    char const *p1 = p;
288 		    unsigned char c1;
289 		    do
290 		      if ((c1 = *p1++) == '\n')
291 			{
292 			  p = p1;
293 			  goto hashing_done;
294 			}
295 		    while (isspace (c1));
296 		  }
297 
298 		size_t repetitions = 1;
299 
300 		if (ig_white_space & IGNORE_TAB_EXPANSION)
301 		  switch (c)
302 		    {
303 		    case '\b':
304 		      column -= 0 < column;
305 		      break;
306 
307 		    case '\t':
308 		      c = ' ';
309 		      repetitions = tabsize - column % tabsize;
310 		      column = (column + repetitions < column
311 				? 0
312 				: column + repetitions);
313 		      break;
314 
315 		    case '\r':
316 		      column = 0;
317 		      break;
318 
319 		    default:
320 		      column++;
321 		      break;
322 		    }
323 
324 		if (ig_case)
325 		  c = tolower (c);
326 
327 		do
328 		  h = HASH (h, c);
329 		while (--repetitions != 0);
330 	      }
331 	  }
332 	  break;
333 
334 	default:
335 	  if (ig_case)
336 	    while ((c = *p++) != '\n')
337 	      h = HASH (h, tolower (c));
338 	  else
339 	    while ((c = *p++) != '\n')
340 	      h = HASH (h, c);
341 	  break;
342 	}
343 
344    hashing_done:;
345 
346       bucket = &buckets[h % nbuckets];
347       length = p - ip - 1;
348 
349       if (p == bufend
350 	  && current->missing_newline
351 	  && ROBUST_OUTPUT_STYLE (output_style))
352 	{
353 	  /* The last line is incomplete and we do not silently
354 	     complete lines.  If the line cannot compare equal to any
355 	     complete line, put it into buckets[-1] so that it can
356 	     compare equal only to the other file's incomplete line
357 	     (if one exists).  */
358 	  if (ig_white_space < IGNORE_TRAILING_SPACE)
359 	    bucket = &buckets[-1];
360 	}
361 
362       for (i = *bucket;  ;  i = eqs[i].next)
363 	if (!i)
364 	  {
365 	    /* Create a new equivalence class in this bucket.  */
366 	    i = eqs_index++;
367 	    if (i == eqs_alloc)
368 	      {
369 		if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
370 		  xalloc_die ();
371 		eqs_alloc *= 2;
372 		eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
373 	      }
374 	    eqs[i].next = *bucket;
375 	    eqs[i].hash = h;
376 	    eqs[i].line = ip;
377 	    eqs[i].length = length;
378 	    *bucket = i;
379 	    break;
380 	  }
381 	else if (eqs[i].hash == h)
382 	  {
383 	    char const *eqline = eqs[i].line;
384 
385 	    /* Reuse existing class if lines_differ reports the lines
386                equal.  */
387 	    if (eqs[i].length == length)
388 	      {
389 		/* Reuse existing equivalence class if the lines are identical.
390 		   This detects the common case of exact identity
391 		   faster than lines_differ would.  */
392 		if (memcmp (eqline, ip, length) == 0)
393 		  break;
394 		if (!same_length_diff_contents_compare_anyway)
395 		  continue;
396 	      }
397 	    else if (!diff_length_compare_anyway)
398 	      continue;
399 
400 	    if (! lines_differ (eqline, ip))
401 	      break;
402 	  }
403 
404       /* Maybe increase the size of the line table.  */
405       if (line == alloc_lines)
406 	{
407 	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
408 	  if (PTRDIFF_MAX / 3 <= alloc_lines
409 	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
410 	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
411 	    xalloc_die ();
412 	  alloc_lines = 2 * alloc_lines - linbuf_base;
413 	  cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
414 	  linbuf += linbuf_base;
415 	  linbuf = xrealloc (linbuf,
416 			     (alloc_lines - linbuf_base) * sizeof *linbuf);
417 	  linbuf -= linbuf_base;
418 	}
419       linbuf[line] = ip;
420       cureqs[line] = i;
421       ++line;
422     }
423 
424   current->buffered_lines = line;
425 
426   for (i = 0;  ;  i++)
427     {
428       /* Record the line start for lines in the suffix that we care about.
429 	 Record one more line start than lines,
430 	 so that we can compute the length of any buffered line.  */
431       if (line == alloc_lines)
432 	{
433 	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
434 	  if (PTRDIFF_MAX / 3 <= alloc_lines
435 	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
436 	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
437 	    xalloc_die ();
438 	  alloc_lines = 2 * alloc_lines - linbuf_base;
439 	  linbuf += linbuf_base;
440 	  linbuf = xrealloc (linbuf,
441 			     (alloc_lines - linbuf_base) * sizeof *linbuf);
442 	  linbuf -= linbuf_base;
443 	}
444       linbuf[line] = p;
445 
446       if (p == bufend)
447 	{
448 	  /* If the last line is incomplete and we do not silently
449 	     complete lines, don't count its appended newline.  */
450 	  if (current->missing_newline && ROBUST_OUTPUT_STYLE (output_style))
451 	    linbuf[line]--;
452 	  break;
453 	}
454 
455       if (context <= i && no_diff_means_no_output)
456 	break;
457 
458       line++;
459 
460       while (*p++ != '\n')
461 	continue;
462     }
463 
464   /* Done with cache in local variables.  */
465   current->linbuf = linbuf;
466   current->valid_lines = line;
467   current->alloc_lines = alloc_lines;
468   current->equivs = cureqs;
469   equivs = eqs;
470   equivs_alloc = eqs_alloc;
471   equivs_index = eqs_index;
472 }
473 
474 /* Prepare the text.  Make sure the text end is initialized.
475    Make sure text ends in a newline,
476    but remember that we had to add one.
477    Strip trailing CRs, if that was requested.  */
478 
479 static void
480 prepare_text (struct file_data *current)
481 {
482   size_t buffered = current->buffered;
483   char *p = FILE_BUFFER (current);
484   if (!p)
485     return;
486 
487   if (strip_trailing_cr)
488     {
489       char *srclim = p + buffered;
490       *srclim = '\r';
491       char *dst = rawmemchr (p, '\r');
492 
493       for (char const *src = dst; src != srclim; src++)
494 	{
495 	  src += *src == '\r' && src[1] == '\n';
496 	  *dst++ = *src;
497 	}
498 
499       buffered -= srclim - dst;
500     }
501 
502   if (buffered != 0 && p[buffered - 1] != '\n')
503     {
504       p[buffered++] = '\n';
505       current->missing_newline = true;
506     }
507 
508   /* Don't use uninitialized storage when planting or using sentinels.  */
509   memset (p + buffered, 0, sizeof (word));
510 
511   current->buffered = buffered;
512 }
513 
514 /* We have found N lines in a buffer of size S; guess the
515    proportionate number of lines that will be found in a buffer of
516    size T.  However, do not guess a number of lines so large that the
517    resulting line table might cause overflow in size calculations.  */
518 static lin
519 guess_lines (lin n, size_t s, size_t t)
520 {
521   size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
522   lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
523   return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
524 }
525 
526 /* Given a vector of two file_data objects, find the identical
527    prefixes and suffixes of each object.  */
528 
529 static void
530 find_identical_ends (struct file_data filevec[])
531 {
532   word *w0, *w1;
533   char *p0, *p1, *buffer0, *buffer1;
534   char const *end0, *beg0;
535   char const **linbuf0, **linbuf1;
536   lin i, lines;
537   size_t n0, n1;
538   lin alloc_lines0, alloc_lines1;
539   bool prefix_needed;
540   lin buffered_prefix, prefix_count, prefix_mask;
541   lin middle_guess, suffix_guess;
542 
543   slurp (&filevec[0]);
544   prepare_text (&filevec[0]);
545   if (filevec[0].desc != filevec[1].desc)
546     {
547       slurp (&filevec[1]);
548       prepare_text (&filevec[1]);
549     }
550   else
551     {
552       filevec[1].buffer = filevec[0].buffer;
553       filevec[1].bufsize = filevec[0].bufsize;
554       filevec[1].buffered = filevec[0].buffered;
555       filevec[1].missing_newline = filevec[0].missing_newline;
556     }
557 
558   /* Find identical prefix.  */
559 
560   w0 = filevec[0].buffer;
561   w1 = filevec[1].buffer;
562   p0 = buffer0 = (char *) w0;
563   p1 = buffer1 = (char *) w1;
564   n0 = filevec[0].buffered;
565   n1 = filevec[1].buffered;
566 
567   if (p0 == p1)
568     /* The buffers are the same; sentinels won't work.  */
569     p0 = p1 += n1;
570   else
571     {
572       /* Insert end sentinels, in this case characters that are guaranteed
573 	 to make the equality test false, and thus terminate the loop.  */
574 
575       if (n0 < n1)
576 	p0[n0] = ~p1[n0];
577       else
578 	p1[n1] = ~p0[n1];
579 
580       /* Loop until first mismatch, or to the sentinel characters.  */
581 
582       /* Compare a word at a time for speed.  */
583       while (*w0 == *w1)
584 	w0++, w1++;
585 
586       /* Do the last few bytes of comparison a byte at a time.  */
587       p0 = (char *) w0;
588       p1 = (char *) w1;
589       while (*p0 == *p1)
590 	p0++, p1++;
591 
592       /* Don't mistakenly count missing newline as part of prefix.  */
593       if (ROBUST_OUTPUT_STYLE (output_style)
594 	  && ((buffer0 + n0 - filevec[0].missing_newline < p0)
595 	      !=
596 	      (buffer1 + n1 - filevec[1].missing_newline < p1)))
597 	p0--, p1--;
598     }
599 
600   /* Now P0 and P1 point at the first nonmatching characters.  */
601 
602   /* Skip back to last line-beginning in the prefix,
603      and then discard up to HORIZON_LINES lines from the prefix.  */
604   i = horizon_lines;
605   while (p0 != buffer0 && (p0[-1] != '\n' || i--))
606     p0--, p1--;
607 
608   /* Record the prefix.  */
609   filevec[0].prefix_end = p0;
610   filevec[1].prefix_end = p1;
611 
612   /* Find identical suffix.  */
613 
614   /* P0 and P1 point beyond the last chars not yet compared.  */
615   p0 = buffer0 + n0;
616   p1 = buffer1 + n1;
617 
618   if (! ROBUST_OUTPUT_STYLE (output_style)
619       || filevec[0].missing_newline == filevec[1].missing_newline)
620     {
621       end0 = p0;	/* Addr of last char in file 0.  */
622 
623       /* Get value of P0 at which we should stop scanning backward:
624 	 this is when either P0 or P1 points just past the last char
625 	 of the identical prefix.  */
626       beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
627 
628       /* Scan back until chars don't match or we reach that point.  */
629       while (p0 != beg0)
630 	if (*--p0 != *--p1)
631 	  {
632 	    /* Point at the first char of the matching suffix.  */
633 	    ++p0, ++p1;
634 	    beg0 = p0;
635 	    break;
636 	  }
637 
638       /* Are we at a line-beginning in both files?  If not, add the rest of
639 	 this line to the main body.  Discard up to HORIZON_LINES lines from
640 	 the identical suffix.  Also, discard one extra line,
641 	 because shift_boundaries may need it.  */
642       i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
643 			    &&
644 			    (buffer1 == p1 || p1[-1] == '\n'));
645       while (i-- && p0 != end0)
646 	while (*p0++ != '\n')
647 	  continue;
648 
649       p1 += p0 - beg0;
650     }
651 
652   /* Record the suffix.  */
653   filevec[0].suffix_begin = p0;
654   filevec[1].suffix_begin = p1;
655 
656   /* Calculate number of lines of prefix to save.
657 
658      prefix_count == 0 means save the whole prefix;
659      we need this for options like -D that output the whole file,
660      or for enormous contexts (to avoid worrying about arithmetic overflow).
661      We also need it for options like -F that output some preceding line;
662      at least we will need to find the last few lines,
663      but since we don't know how many, it's easiest to find them all.
664 
665      Otherwise, prefix_count != 0.  Save just prefix_count lines at start
666      of the line buffer; they'll be moved to the proper location later.
667      Handle 1 more line than the context says (because we count 1 too many),
668      rounded up to the next power of 2 to speed index computation.  */
669 
670   if (no_diff_means_no_output && ! function_regexp.fastmap
671       && context < LIN_MAX / 4 && context < n0)
672     {
673       middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
674       suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
675       for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
676 	continue;
677       alloc_lines0 = (prefix_count + middle_guess
678 		      + MIN (context, suffix_guess));
679     }
680   else
681     {
682       prefix_count = 0;
683       alloc_lines0 = guess_lines (0, 0, n0);
684     }
685 
686   prefix_mask = prefix_count - 1;
687   lines = 0;
688   linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
689   prefix_needed = ! (no_diff_means_no_output
690 		     && filevec[0].prefix_end == p0
691 		     && filevec[1].prefix_end == p1);
692   p0 = buffer0;
693 
694   /* If the prefix is needed, find the prefix lines.  */
695   if (prefix_needed)
696     {
697       end0 = filevec[0].prefix_end;
698       while (p0 != end0)
699 	{
700 	  lin l = lines++ & prefix_mask;
701 	  if (l == alloc_lines0)
702 	    {
703 	      if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
704 		xalloc_die ();
705 	      alloc_lines0 *= 2;
706 	      linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
707 	    }
708 	  linbuf0[l] = p0;
709 	  while (*p0++ != '\n')
710 	    continue;
711 	}
712     }
713   buffered_prefix = prefix_count && context < lines ? context : lines;
714 
715   /* Allocate line buffer 1.  */
716 
717   middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
718   suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
719   alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
720   if (alloc_lines1 < buffered_prefix
721       || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
722     xalloc_die ();
723   linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
724 
725   if (buffered_prefix != lines)
726     {
727       /* Rotate prefix lines to proper location.  */
728       for (i = 0;  i < buffered_prefix;  i++)
729 	linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
730       for (i = 0;  i < buffered_prefix;  i++)
731 	linbuf0[i] = linbuf1[i];
732     }
733 
734   /* Initialize line buffer 1 from line buffer 0.  */
735   for (i = 0; i < buffered_prefix; i++)
736     linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
737 
738   /* Record the line buffer, adjusted so that
739      linbuf[0] points at the first differing line.  */
740   filevec[0].linbuf = linbuf0 + buffered_prefix;
741   filevec[1].linbuf = linbuf1 + buffered_prefix;
742   filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
743   filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
744   filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
745   filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
746 }
747 
748 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
749    than 2**k.  This table is derived from Chris K. Caldwell's list
750    <http://www.utm.edu/research/primes/lists/2small/>.  */
751 
752 static unsigned char const prime_offset[] =
753 {
754   0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
755   15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
756   11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
757   55, 93, 1, 57, 25
758 };
759 
760 /* Verify that this host's size_t is not too wide for the above table.  */
761 
762 verify (sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
763 
764 /* Given a vector of two file_data objects, read the file associated
765    with each one, and build the table of equivalence classes.
766    Return nonzero if either file appears to be a binary file.
767    If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
768 
769 bool
770 read_files (struct file_data filevec[], bool pretend_binary)
771 {
772   int i;
773   bool skip_test = text | pretend_binary;
774   bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
775 
776   if (filevec[0].desc != filevec[1].desc)
777     appears_binary |= sip (&filevec[1], skip_test | appears_binary);
778   else
779     {
780       filevec[1].buffer = filevec[0].buffer;
781       filevec[1].bufsize = filevec[0].bufsize;
782       filevec[1].buffered = filevec[0].buffered;
783     }
784   if (appears_binary)
785     {
786       set_binary_mode (filevec[0].desc, O_BINARY);
787       set_binary_mode (filevec[1].desc, O_BINARY);
788       return true;
789     }
790 
791   find_identical_ends (filevec);
792 
793   equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
794   if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
795     xalloc_die ();
796   equivs = xmalloc (equivs_alloc * sizeof *equivs);
797   /* Equivalence class 0 is permanently safe for lines that were not
798      hashed.  Real equivalence classes start at 1.  */
799   equivs_index = 1;
800 
801   /* Allocate (one plus) a prime number of hash buckets.  Use a prime
802      number between 1/3 and 2/3 of the value of equiv_allocs,
803      approximately.  */
804   for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
805     continue;
806   nbuckets = ((size_t) 1 << i) - prime_offset[i];
807   if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
808     xalloc_die ();
809   buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
810   buckets++;
811 
812   for (i = 0; i < 2; i++)
813     find_and_hash_each_line (&filevec[i]);
814 
815   filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
816 
817   free (equivs);
818   free (buckets - 1);
819 
820   return false;
821 }
822