xref: /dragonfly/contrib/diffutils/src/io.c (revision 10f4bf95)
1 /* File I/O for GNU DIFF.
2 
3    Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006, 2009-2010
4    Free Software Foundation, Inc.
5 
6    This file is part of GNU DIFF.
7 
8    This program is free software: you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation, either version 3 of the License, or
11    (at your option) any later version.
12 
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
20 
21 #include "diff.h"
22 #include <cmpbuf.h>
23 #include <file-type.h>
24 #include <xalloc.h>
25 
26 /* Rotate an unsigned value to the left.  */
27 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
28 
29 /* Given a hash value and a new character, return a new hash value.  */
30 #define HASH(h, c) ((c) + ROL (h, 7))
31 
32 /* The type of a hash value.  */
33 typedef size_t hash_value;
34 verify (! TYPE_SIGNED (hash_value));
35 
36 /* Lines are put into equivalence classes of lines that match in lines_differ.
37    Each equivalence class is represented by one of these structures,
38    but only while the classes are being computed.
39    Afterward, each class is represented by a number.  */
40 struct equivclass
41 {
42   lin next;		/* Next item in this bucket.  */
43   hash_value hash;	/* Hash of lines in this class.  */
44   char const *line;	/* A line that fits this class.  */
45   size_t length;	/* That line's length, not counting its newline.  */
46 };
47 
48 /* Hash-table: array of buckets, each being a chain of equivalence classes.
49    buckets[-1] is reserved for incomplete lines.  */
50 static lin *buckets;
51 
52 /* Number of buckets in the hash table array, not counting buckets[-1].  */
53 static size_t nbuckets;
54 
55 /* Array in which the equivalence classes are allocated.
56    The bucket-chains go through the elements in this array.
57    The number of an equivalence class is its index in this array.  */
58 static struct equivclass *equivs;
59 
60 /* Index of first free element in the array `equivs'.  */
61 static lin equivs_index;
62 
63 /* Number of elements allocated in the array `equivs'.  */
64 static lin equivs_alloc;
65 
66 /* Read a block of data into a file buffer, checking for EOF and error.  */
67 
68 void
69 file_block_read (struct file_data *current, size_t size)
70 {
71   if (size && ! current->eof)
72     {
73       size_t s = block_read (current->desc,
74 			     FILE_BUFFER (current) + current->buffered, size);
75       if (s == SIZE_MAX)
76 	pfatal_with_name (current->name);
77       current->buffered += s;
78       current->eof = s < size;
79     }
80 }
81 
82 /* Check for binary files and compare them for exact identity.  */
83 
84 /* Return 1 if BUF contains a non text character.
85    SIZE is the number of characters in BUF.  */
86 
87 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
88 
89 /* Get ready to read the current file.
90    Return nonzero if SKIP_TEST is zero,
91    and if it appears to be a binary file.  */
92 
93 static bool
94 sip (struct file_data *current, bool skip_test)
95 {
96   /* If we have a nonexistent file at this stage, treat it as empty.  */
97   if (current->desc < 0)
98     {
99       /* Leave room for a sentinel.  */
100       current->bufsize = sizeof (word);
101       current->buffer = xmalloc (current->bufsize);
102     }
103   else
104     {
105       current->bufsize = buffer_lcm (sizeof (word),
106 				     STAT_BLOCKSIZE (current->stat),
107 				     PTRDIFF_MAX - 2 * sizeof (word));
108       current->buffer = xmalloc (current->bufsize);
109 
110       if (! skip_test)
111 	{
112 	  /* Check first part of file to see if it's a binary file.  */
113 
114 	  /* FIXME: if O_BINARY, this should revert to text mode
115 	     if the file is not binary.  */
116 
117 	  file_block_read (current, current->bufsize);
118 	  return binary_file_p (current->buffer, current->buffered);
119 	}
120     }
121 
122   current->buffered = 0;
123   current->eof = false;
124   return false;
125 }
126 
127 /* Slurp the rest of the current file completely into memory.  */
128 
129 static void
130 slurp (struct file_data *current)
131 {
132   size_t cc;
133 
134   if (current->desc < 0)
135     {
136       /* The file is nonexistent.  */
137       return;
138     }
139 
140   if (S_ISREG (current->stat.st_mode))
141     {
142       /* It's a regular file; slurp in the rest all at once.  */
143 
144       /* Get the size out of the stat block.
145 	 Allocate just enough room for appended newline plus word sentinel,
146 	 plus word-alignment since we want the buffer word-aligned.  */
147       size_t file_size = current->stat.st_size;
148       cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
149       if (file_size != current->stat.st_size || cc < file_size
150 	  || PTRDIFF_MAX <= cc)
151 	xalloc_die ();
152 
153       if (current->bufsize < cc)
154 	{
155 	  current->bufsize = cc;
156 	  current->buffer = xrealloc (current->buffer, cc);
157 	}
158 
159       /* Try to read at least 1 more byte than the size indicates, to
160 	 detect whether the file is growing.  This is a nicety for
161 	 users who run 'diff' on files while they are changing.  */
162 
163       if (current->buffered <= file_size)
164 	{
165 	  file_block_read (current, file_size + 1 - current->buffered);
166 	  if (current->buffered <= file_size)
167 	    return;
168 	}
169     }
170 
171   /* It's not a regular file, or it's a growing regular file; read it,
172      growing the buffer as needed.  */
173 
174   file_block_read (current, current->bufsize - current->buffered);
175 
176   if (current->buffered)
177     {
178       while (current->buffered == current->bufsize)
179 	{
180 	  if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
181 	    xalloc_die ();
182 	  current->bufsize *= 2;
183 	  current->buffer = xrealloc (current->buffer, current->bufsize);
184 	  file_block_read (current, current->bufsize - current->buffered);
185 	}
186 
187       /* Allocate just enough room for appended newline plus word
188 	 sentinel, plus word-alignment.  */
189       cc = current->buffered + 2 * sizeof (word);
190       current->bufsize = cc - cc % sizeof (word);
191       current->buffer = xrealloc (current->buffer, current->bufsize);
192     }
193 }
194 
195 /* Split the file into lines, simultaneously computing the equivalence
196    class for each line.  */
197 
198 static void
199 find_and_hash_each_line (struct file_data *current)
200 {
201   hash_value h;
202   char const *p = current->prefix_end;
203   unsigned char c;
204   lin i, *bucket;
205   size_t length;
206 
207   /* Cache often-used quantities in local variables to help the compiler.  */
208   char const **linbuf = current->linbuf;
209   lin alloc_lines = current->alloc_lines;
210   lin line = 0;
211   lin linbuf_base = current->linbuf_base;
212   lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
213   struct equivclass *eqs = equivs;
214   lin eqs_index = equivs_index;
215   lin eqs_alloc = equivs_alloc;
216   char const *suffix_begin = current->suffix_begin;
217   char const *bufend = FILE_BUFFER (current) + current->buffered;
218   bool diff_length_compare_anyway =
219     ignore_white_space != IGNORE_NO_WHITE_SPACE;
220   bool same_length_diff_contents_compare_anyway =
221     diff_length_compare_anyway | ignore_case;
222 
223   while (p < suffix_begin)
224     {
225       char const *ip = p;
226 
227       h = 0;
228 
229       /* Hash this line until we find a newline.  */
230       if (ignore_case)
231 	switch (ignore_white_space)
232 	  {
233 	  case IGNORE_ALL_SPACE:
234 	    while ((c = *p++) != '\n')
235 	      if (! isspace (c))
236 		h = HASH (h, tolower (c));
237 	    break;
238 
239 	  case IGNORE_SPACE_CHANGE:
240 	    while ((c = *p++) != '\n')
241 	      {
242 		if (isspace (c))
243 		  {
244 		    do
245 		      if ((c = *p++) == '\n')
246 			goto hashing_done;
247 		    while (isspace (c));
248 
249 		    h = HASH (h, ' ');
250 		  }
251 
252 		/* C is now the first non-space.  */
253 		h = HASH (h, tolower (c));
254 	      }
255 	    break;
256 
257 	  case IGNORE_TAB_EXPANSION:
258 	    {
259 	      size_t column = 0;
260 	      while ((c = *p++) != '\n')
261 		{
262 		  size_t repetitions = 1;
263 
264 		  switch (c)
265 		    {
266 		    case '\b':
267 		      column -= 0 < column;
268 		      break;
269 
270 		    case '\t':
271 		      c = ' ';
272 		      repetitions = tabsize - column % tabsize;
273 		      column = (column + repetitions < column
274 				? 0
275 				: column + repetitions);
276 		      break;
277 
278 		    case '\r':
279 		      column = 0;
280 		      break;
281 
282 		    default:
283 		      c = tolower (c);
284 		      column++;
285 		      break;
286 		    }
287 
288 		  do
289 		    h = HASH (h, c);
290 		  while (--repetitions != 0);
291 		}
292 	    }
293 	    break;
294 
295 	  default:
296 	    while ((c = *p++) != '\n')
297 	      h = HASH (h, tolower (c));
298 	    break;
299 	  }
300       else
301 	switch (ignore_white_space)
302 	  {
303 	  case IGNORE_ALL_SPACE:
304 	    while ((c = *p++) != '\n')
305 	      if (! isspace (c))
306 		h = HASH (h, c);
307 	    break;
308 
309 	  case IGNORE_SPACE_CHANGE:
310 	    while ((c = *p++) != '\n')
311 	      {
312 		if (isspace (c))
313 		  {
314 		    do
315 		      if ((c = *p++) == '\n')
316 			goto hashing_done;
317 		    while (isspace (c));
318 
319 		    h = HASH (h, ' ');
320 		  }
321 
322 		/* C is now the first non-space.  */
323 		h = HASH (h, c);
324 	      }
325 	    break;
326 
327 	  case IGNORE_TAB_EXPANSION:
328 	    {
329 	      size_t column = 0;
330 	      while ((c = *p++) != '\n')
331 		{
332 		  size_t repetitions = 1;
333 
334 		  switch (c)
335 		    {
336 		    case '\b':
337 		      column -= 0 < column;
338 		      break;
339 
340 		    case '\t':
341 		      c = ' ';
342 		      repetitions = tabsize - column % tabsize;
343 		      column = (column + repetitions < column
344 				? 0
345 				: column + repetitions);
346 		      break;
347 
348 		    case '\r':
349 		      column = 0;
350 		      break;
351 
352 		    default:
353 		      column++;
354 		      break;
355 		    }
356 
357 		  do
358 		    h = HASH (h, c);
359 		  while (--repetitions != 0);
360 		}
361 	    }
362 	    break;
363 
364 	  default:
365 	    while ((c = *p++) != '\n')
366 	      h = HASH (h, c);
367 	    break;
368 	  }
369 
370    hashing_done:;
371 
372       bucket = &buckets[h % nbuckets];
373       length = p - ip - 1;
374 
375       if (p == bufend
376 	  && current->missing_newline
377 	  && ROBUST_OUTPUT_STYLE (output_style))
378 	{
379 	  /* The last line is incomplete and we do not silently
380 	     complete lines.  If the line cannot compare equal to any
381 	     complete line, put it into buckets[-1] so that it can
382 	     compare equal only to the other file's incomplete line
383 	     (if one exists).  */
384 	  if (ignore_white_space < IGNORE_SPACE_CHANGE)
385 	    bucket = &buckets[-1];
386 	}
387 
388       for (i = *bucket;  ;  i = eqs[i].next)
389 	if (!i)
390 	  {
391 	    /* Create a new equivalence class in this bucket.  */
392 	    i = eqs_index++;
393 	    if (i == eqs_alloc)
394 	      {
395 		if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
396 		  xalloc_die ();
397 		eqs_alloc *= 2;
398 		eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
399 	      }
400 	    eqs[i].next = *bucket;
401 	    eqs[i].hash = h;
402 	    eqs[i].line = ip;
403 	    eqs[i].length = length;
404 	    *bucket = i;
405 	    break;
406 	  }
407 	else if (eqs[i].hash == h)
408 	  {
409 	    char const *eqline = eqs[i].line;
410 
411 	    /* Reuse existing class if lines_differ reports the lines
412                equal.  */
413 	    if (eqs[i].length == length)
414 	      {
415 		/* Reuse existing equivalence class if the lines are identical.
416 		   This detects the common case of exact identity
417 		   faster than lines_differ would.  */
418 		if (memcmp (eqline, ip, length) == 0)
419 		  break;
420 		if (!same_length_diff_contents_compare_anyway)
421 		  continue;
422 	      }
423 	    else if (!diff_length_compare_anyway)
424 	      continue;
425 
426 	    if (! lines_differ (eqline, ip))
427 	      break;
428 	  }
429 
430       /* Maybe increase the size of the line table.  */
431       if (line == alloc_lines)
432 	{
433 	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
434 	  if (PTRDIFF_MAX / 3 <= alloc_lines
435 	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
436 	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
437 	    xalloc_die ();
438 	  alloc_lines = 2 * alloc_lines - linbuf_base;
439 	  cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
440 	  linbuf += linbuf_base;
441 	  linbuf = xrealloc (linbuf,
442 			     (alloc_lines - linbuf_base) * sizeof *linbuf);
443 	  linbuf -= linbuf_base;
444 	}
445       linbuf[line] = ip;
446       cureqs[line] = i;
447       ++line;
448     }
449 
450   current->buffered_lines = line;
451 
452   for (i = 0;  ;  i++)
453     {
454       /* Record the line start for lines in the suffix that we care about.
455 	 Record one more line start than lines,
456 	 so that we can compute the length of any buffered line.  */
457       if (line == alloc_lines)
458 	{
459 	  /* Double (alloc_lines - linbuf_base) by adding to alloc_lines.  */
460 	  if (PTRDIFF_MAX / 3 <= alloc_lines
461 	      || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
462 	      || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
463 	    xalloc_die ();
464 	  alloc_lines = 2 * alloc_lines - linbuf_base;
465 	  linbuf += linbuf_base;
466 	  linbuf = xrealloc (linbuf,
467 			     (alloc_lines - linbuf_base) * sizeof *linbuf);
468 	  linbuf -= linbuf_base;
469 	}
470       linbuf[line] = p;
471 
472       if (p == bufend)
473 	{
474 	  /* If the last line is incomplete and we do not silently
475 	     complete lines, don't count its appended newline.  */
476 	  if (current->missing_newline && ROBUST_OUTPUT_STYLE (output_style))
477 	    linbuf[line]--;
478 	  break;
479 	}
480 
481       if (context <= i && no_diff_means_no_output)
482 	break;
483 
484       line++;
485 
486       while (*p++ != '\n')
487 	continue;
488     }
489 
490   /* Done with cache in local variables.  */
491   current->linbuf = linbuf;
492   current->valid_lines = line;
493   current->alloc_lines = alloc_lines;
494   current->equivs = cureqs;
495   equivs = eqs;
496   equivs_alloc = eqs_alloc;
497   equivs_index = eqs_index;
498 }
499 
500 /* Prepare the text.  Make sure the text end is initialized.
501    Make sure text ends in a newline,
502    but remember that we had to add one.
503    Strip trailing CRs, if that was requested.  */
504 
505 static void
506 prepare_text (struct file_data *current)
507 {
508   size_t buffered = current->buffered;
509   char *p = FILE_BUFFER (current);
510   char *dst;
511 
512   if (buffered == 0 || p[buffered - 1] == '\n')
513     current->missing_newline = false;
514   else
515     {
516       p[buffered++] = '\n';
517       current->missing_newline = true;
518     }
519 
520   if (!p)
521     return;
522 
523   /* Don't use uninitialized storage when planting or using sentinels.  */
524   memset (p + buffered, 0, sizeof (word));
525 
526   if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
527     {
528       char const *src = dst;
529       char const *srclim = p + buffered;
530 
531       do
532 	dst += ! ((*dst = *src++) == '\r' && *src == '\n');
533       while (src < srclim);
534 
535       buffered -= src - dst;
536     }
537 
538   current->buffered = buffered;
539 }
540 
541 /* We have found N lines in a buffer of size S; guess the
542    proportionate number of lines that will be found in a buffer of
543    size T.  However, do not guess a number of lines so large that the
544    resulting line table might cause overflow in size calculations.  */
545 static lin
546 guess_lines (lin n, size_t s, size_t t)
547 {
548   size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
549   lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
550   return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
551 }
552 
553 /* Given a vector of two file_data objects, find the identical
554    prefixes and suffixes of each object.  */
555 
556 static void
557 find_identical_ends (struct file_data filevec[])
558 {
559   word *w0, *w1;
560   char *p0, *p1, *buffer0, *buffer1;
561   char const *end0, *beg0;
562   char const **linbuf0, **linbuf1;
563   lin i, lines;
564   size_t n0, n1;
565   lin alloc_lines0, alloc_lines1;
566   lin buffered_prefix, prefix_count, prefix_mask;
567   lin middle_guess, suffix_guess;
568 
569   slurp (&filevec[0]);
570   prepare_text (&filevec[0]);
571   if (filevec[0].desc != filevec[1].desc)
572     {
573       slurp (&filevec[1]);
574       prepare_text (&filevec[1]);
575     }
576   else
577     {
578       filevec[1].buffer = filevec[0].buffer;
579       filevec[1].bufsize = filevec[0].bufsize;
580       filevec[1].buffered = filevec[0].buffered;
581       filevec[1].missing_newline = filevec[0].missing_newline;
582     }
583 
584   /* Find identical prefix.  */
585 
586   w0 = filevec[0].buffer;
587   w1 = filevec[1].buffer;
588   p0 = buffer0 = (char *) w0;
589   p1 = buffer1 = (char *) w1;
590   n0 = filevec[0].buffered;
591   n1 = filevec[1].buffered;
592 
593   if (p0 == p1)
594     /* The buffers are the same; sentinels won't work.  */
595     p0 = p1 += n1;
596   else
597     {
598       /* Insert end sentinels, in this case characters that are guaranteed
599 	 to make the equality test false, and thus terminate the loop.  */
600 
601       if (n0 < n1)
602 	p0[n0] = ~p1[n0];
603       else
604 	p1[n1] = ~p0[n1];
605 
606       /* Loop until first mismatch, or to the sentinel characters.  */
607 
608       /* Compare a word at a time for speed.  */
609       while (*w0 == *w1)
610 	w0++, w1++;
611 
612       /* Do the last few bytes of comparison a byte at a time.  */
613       p0 = (char *) w0;
614       p1 = (char *) w1;
615       while (*p0 == *p1)
616 	p0++, p1++;
617 
618       /* Don't mistakenly count missing newline as part of prefix.  */
619       if (ROBUST_OUTPUT_STYLE (output_style)
620 	  && ((buffer0 + n0 - filevec[0].missing_newline < p0)
621 	      !=
622 	      (buffer1 + n1 - filevec[1].missing_newline < p1)))
623 	p0--, p1--;
624     }
625 
626   /* Now P0 and P1 point at the first nonmatching characters.  */
627 
628   /* Skip back to last line-beginning in the prefix,
629      and then discard up to HORIZON_LINES lines from the prefix.  */
630   i = horizon_lines;
631   while (p0 != buffer0 && (p0[-1] != '\n' || i--))
632     p0--, p1--;
633 
634   /* Record the prefix.  */
635   filevec[0].prefix_end = p0;
636   filevec[1].prefix_end = p1;
637 
638   /* Find identical suffix.  */
639 
640   /* P0 and P1 point beyond the last chars not yet compared.  */
641   p0 = buffer0 + n0;
642   p1 = buffer1 + n1;
643 
644   if (! ROBUST_OUTPUT_STYLE (output_style)
645       || filevec[0].missing_newline == filevec[1].missing_newline)
646     {
647       end0 = p0;	/* Addr of last char in file 0.  */
648 
649       /* Get value of P0 at which we should stop scanning backward:
650 	 this is when either P0 or P1 points just past the last char
651 	 of the identical prefix.  */
652       beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
653 
654       /* Scan back until chars don't match or we reach that point.  */
655       while (p0 != beg0)
656 	if (*--p0 != *--p1)
657 	  {
658 	    /* Point at the first char of the matching suffix.  */
659 	    ++p0, ++p1;
660 	    beg0 = p0;
661 	    break;
662 	  }
663 
664       /* Are we at a line-beginning in both files?  If not, add the rest of
665 	 this line to the main body.  Discard up to HORIZON_LINES lines from
666 	 the identical suffix.  Also, discard one extra line,
667 	 because shift_boundaries may need it.  */
668       i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
669 			    &&
670 			    (buffer1 == p1 || p1[-1] == '\n'));
671       while (i-- && p0 != end0)
672 	while (*p0++ != '\n')
673 	  continue;
674 
675       p1 += p0 - beg0;
676     }
677 
678   /* Record the suffix.  */
679   filevec[0].suffix_begin = p0;
680   filevec[1].suffix_begin = p1;
681 
682   /* Calculate number of lines of prefix to save.
683 
684      prefix_count == 0 means save the whole prefix;
685      we need this for options like -D that output the whole file,
686      or for enormous contexts (to avoid worrying about arithmetic overflow).
687      We also need it for options like -F that output some preceding line;
688      at least we will need to find the last few lines,
689      but since we don't know how many, it's easiest to find them all.
690 
691      Otherwise, prefix_count != 0.  Save just prefix_count lines at start
692      of the line buffer; they'll be moved to the proper location later.
693      Handle 1 more line than the context says (because we count 1 too many),
694      rounded up to the next power of 2 to speed index computation.  */
695 
696   if (no_diff_means_no_output && ! function_regexp.fastmap
697       && context < LIN_MAX / 4 && context < n0)
698     {
699       middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
700       suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
701       for (prefix_count = 1;  prefix_count <= context;  prefix_count *= 2)
702 	continue;
703       alloc_lines0 = (prefix_count + middle_guess
704 		      + MIN (context, suffix_guess));
705     }
706   else
707     {
708       prefix_count = 0;
709       alloc_lines0 = guess_lines (0, 0, n0);
710     }
711 
712   prefix_mask = prefix_count - 1;
713   lines = 0;
714   linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
715   p0 = buffer0;
716 
717   /* If the prefix is needed, find the prefix lines.  */
718   if (! (no_diff_means_no_output
719 	 && filevec[0].prefix_end == p0
720 	 && filevec[1].prefix_end == p1))
721     {
722       end0 = filevec[0].prefix_end;
723       while (p0 != end0)
724 	{
725 	  lin l = lines++ & prefix_mask;
726 	  if (l == alloc_lines0)
727 	    {
728 	      if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
729 		xalloc_die ();
730 	      alloc_lines0 *= 2;
731 	      linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
732 	    }
733 	  linbuf0[l] = p0;
734 	  while (*p0++ != '\n')
735 	    continue;
736 	}
737     }
738   buffered_prefix = prefix_count && context < lines ? context : lines;
739 
740   /* Allocate line buffer 1.  */
741 
742   middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
743   suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
744   alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
745   if (alloc_lines1 < buffered_prefix
746       || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
747     xalloc_die ();
748   linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
749 
750   if (buffered_prefix != lines)
751     {
752       /* Rotate prefix lines to proper location.  */
753       for (i = 0;  i < buffered_prefix;  i++)
754 	linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
755       for (i = 0;  i < buffered_prefix;  i++)
756 	linbuf0[i] = linbuf1[i];
757     }
758 
759   /* Initialize line buffer 1 from line buffer 0.  */
760   for (i = 0; i < buffered_prefix; i++)
761     linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
762 
763   /* Record the line buffer, adjusted so that
764      linbuf[0] points at the first differing line.  */
765   filevec[0].linbuf = linbuf0 + buffered_prefix;
766   filevec[1].linbuf = linbuf1 + buffered_prefix;
767   filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
768   filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
769   filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
770   filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
771 }
772 
773 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
774    than 2**k.  This table is derived from Chris K. Caldwell's list
775    <http://www.utm.edu/research/primes/lists/2small/>.  */
776 
777 static unsigned char const prime_offset[] =
778 {
779   0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
780   15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
781   11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
782   55, 93, 1, 57, 25
783 };
784 
785 /* Verify that this host's size_t is not too wide for the above table.  */
786 
787 verify (sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
788 
789 /* Given a vector of two file_data objects, read the file associated
790    with each one, and build the table of equivalence classes.
791    Return nonzero if either file appears to be a binary file.
792    If PRETEND_BINARY is nonzero, pretend they are binary regardless.  */
793 
794 bool
795 read_files (struct file_data filevec[], bool pretend_binary)
796 {
797   int i;
798   bool skip_test = text | pretend_binary;
799   bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
800 
801   if (filevec[0].desc != filevec[1].desc)
802     appears_binary |= sip (&filevec[1], skip_test | appears_binary);
803   else
804     {
805       filevec[1].buffer = filevec[0].buffer;
806       filevec[1].bufsize = filevec[0].bufsize;
807       filevec[1].buffered = filevec[0].buffered;
808     }
809   if (appears_binary)
810     {
811       /* FIXME: If O_BINARY, this should set both files to binary mode.  */
812       return true;
813     }
814 
815   find_identical_ends (filevec);
816 
817   equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
818   if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
819     xalloc_die ();
820   equivs = xmalloc (equivs_alloc * sizeof *equivs);
821   /* Equivalence class 0 is permanently safe for lines that were not
822      hashed.  Real equivalence classes start at 1.  */
823   equivs_index = 1;
824 
825   /* Allocate (one plus) a prime number of hash buckets.  Use a prime
826      number between 1/3 and 2/3 of the value of equiv_allocs,
827      approximately.  */
828   for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
829     continue;
830   nbuckets = ((size_t) 1 << i) - prime_offset[i];
831   if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
832     xalloc_die ();
833   buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
834   buckets++;
835 
836   for (i = 0; i < 2; i++)
837     find_and_hash_each_line (&filevec[i]);
838 
839   filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
840 
841   free (equivs);
842   free (buckets - 1);
843 
844   return false;
845 }
846