1 /* File I/O for GNU DIFF. 2 3 Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006, 2009-2011 4 Free Software Foundation, Inc. 5 6 This file is part of GNU DIFF. 7 8 This program is free software: you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation, either version 3 of the License, or 11 (at your option) any later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 20 21 #include "diff.h" 22 #include <cmpbuf.h> 23 #include <file-type.h> 24 #include <xalloc.h> 25 26 /* Rotate an unsigned value to the left. */ 27 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n))) 28 29 /* Given a hash value and a new character, return a new hash value. */ 30 #define HASH(h, c) ((c) + ROL (h, 7)) 31 32 /* The type of a hash value. */ 33 typedef size_t hash_value; 34 verify (! TYPE_SIGNED (hash_value)); 35 36 /* Lines are put into equivalence classes of lines that match in lines_differ. 37 Each equivalence class is represented by one of these structures, 38 but only while the classes are being computed. 39 Afterward, each class is represented by a number. */ 40 struct equivclass 41 { 42 lin next; /* Next item in this bucket. */ 43 hash_value hash; /* Hash of lines in this class. */ 44 char const *line; /* A line that fits this class. */ 45 size_t length; /* That line's length, not counting its newline. */ 46 }; 47 48 /* Hash-table: array of buckets, each being a chain of equivalence classes. 49 buckets[-1] is reserved for incomplete lines. */ 50 static lin *buckets; 51 52 /* Number of buckets in the hash table array, not counting buckets[-1]. */ 53 static size_t nbuckets; 54 55 /* Array in which the equivalence classes are allocated. 56 The bucket-chains go through the elements in this array. 57 The number of an equivalence class is its index in this array. */ 58 static struct equivclass *equivs; 59 60 /* Index of first free element in the array `equivs'. */ 61 static lin equivs_index; 62 63 /* Number of elements allocated in the array `equivs'. */ 64 static lin equivs_alloc; 65 66 /* Read a block of data into a file buffer, checking for EOF and error. */ 67 68 void 69 file_block_read (struct file_data *current, size_t size) 70 { 71 if (size && ! current->eof) 72 { 73 size_t s = block_read (current->desc, 74 FILE_BUFFER (current) + current->buffered, size); 75 if (s == SIZE_MAX) 76 pfatal_with_name (current->name); 77 current->buffered += s; 78 current->eof = s < size; 79 } 80 } 81 82 /* Check for binary files and compare them for exact identity. */ 83 84 /* Return 1 if BUF contains a non text character. 85 SIZE is the number of characters in BUF. */ 86 87 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0) 88 89 /* Get ready to read the current file. 90 Return nonzero if SKIP_TEST is zero, 91 and if it appears to be a binary file. */ 92 93 static bool 94 sip (struct file_data *current, bool skip_test) 95 { 96 /* If we have a nonexistent file at this stage, treat it as empty. */ 97 if (current->desc < 0) 98 { 99 /* Leave room for a sentinel. */ 100 current->bufsize = sizeof (word); 101 current->buffer = xmalloc (current->bufsize); 102 } 103 else 104 { 105 current->bufsize = buffer_lcm (sizeof (word), 106 STAT_BLOCKSIZE (current->stat), 107 PTRDIFF_MAX - 2 * sizeof (word)); 108 current->buffer = xmalloc (current->bufsize); 109 110 if (! skip_test) 111 { 112 /* Check first part of file to see if it's a binary file. */ 113 114 /* FIXME: if O_BINARY, this should revert to text mode 115 if the file is not binary. */ 116 117 file_block_read (current, current->bufsize); 118 return binary_file_p (current->buffer, current->buffered); 119 } 120 } 121 122 current->buffered = 0; 123 current->eof = false; 124 return false; 125 } 126 127 /* Slurp the rest of the current file completely into memory. */ 128 129 static void 130 slurp (struct file_data *current) 131 { 132 size_t cc; 133 134 if (current->desc < 0) 135 { 136 /* The file is nonexistent. */ 137 return; 138 } 139 140 if (S_ISREG (current->stat.st_mode)) 141 { 142 /* It's a regular file; slurp in the rest all at once. */ 143 144 /* Get the size out of the stat block. 145 Allocate just enough room for appended newline plus word sentinel, 146 plus word-alignment since we want the buffer word-aligned. */ 147 size_t file_size = current->stat.st_size; 148 cc = file_size + 2 * sizeof (word) - file_size % sizeof (word); 149 if (file_size != current->stat.st_size || cc < file_size 150 || PTRDIFF_MAX <= cc) 151 xalloc_die (); 152 153 if (current->bufsize < cc) 154 { 155 current->bufsize = cc; 156 current->buffer = xrealloc (current->buffer, cc); 157 } 158 159 /* Try to read at least 1 more byte than the size indicates, to 160 detect whether the file is growing. This is a nicety for 161 users who run 'diff' on files while they are changing. */ 162 163 if (current->buffered <= file_size) 164 { 165 file_block_read (current, file_size + 1 - current->buffered); 166 if (current->buffered <= file_size) 167 return; 168 } 169 } 170 171 /* It's not a regular file, or it's a growing regular file; read it, 172 growing the buffer as needed. */ 173 174 file_block_read (current, current->bufsize - current->buffered); 175 176 if (current->buffered) 177 { 178 while (current->buffered == current->bufsize) 179 { 180 if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize) 181 xalloc_die (); 182 current->bufsize *= 2; 183 current->buffer = xrealloc (current->buffer, current->bufsize); 184 file_block_read (current, current->bufsize - current->buffered); 185 } 186 187 /* Allocate just enough room for appended newline plus word 188 sentinel, plus word-alignment. */ 189 cc = current->buffered + 2 * sizeof (word); 190 current->bufsize = cc - cc % sizeof (word); 191 current->buffer = xrealloc (current->buffer, current->bufsize); 192 } 193 } 194 195 /* Split the file into lines, simultaneously computing the equivalence 196 class for each line. */ 197 198 static void 199 find_and_hash_each_line (struct file_data *current) 200 { 201 char const *p = current->prefix_end; 202 lin i, *bucket; 203 size_t length; 204 205 /* Cache often-used quantities in local variables to help the compiler. */ 206 char const **linbuf = current->linbuf; 207 lin alloc_lines = current->alloc_lines; 208 lin line = 0; 209 lin linbuf_base = current->linbuf_base; 210 lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs); 211 struct equivclass *eqs = equivs; 212 lin eqs_index = equivs_index; 213 lin eqs_alloc = equivs_alloc; 214 char const *suffix_begin = current->suffix_begin; 215 char const *bufend = FILE_BUFFER (current) + current->buffered; 216 bool ig_case = ignore_case; 217 enum DIFF_white_space ig_white_space = ignore_white_space; 218 bool diff_length_compare_anyway = 219 ig_white_space != IGNORE_NO_WHITE_SPACE; 220 bool same_length_diff_contents_compare_anyway = 221 diff_length_compare_anyway | ig_case; 222 223 while (p < suffix_begin) 224 { 225 char const *ip = p; 226 hash_value h = 0; 227 unsigned char c; 228 229 /* Hash this line until we find a newline. */ 230 switch (ig_white_space) 231 { 232 case IGNORE_ALL_SPACE: 233 while ((c = *p++) != '\n') 234 if (! isspace (c)) 235 h = HASH (h, ig_case ? tolower (c) : c); 236 break; 237 238 case IGNORE_SPACE_CHANGE: 239 while ((c = *p++) != '\n') 240 { 241 if (isspace (c)) 242 { 243 do 244 if ((c = *p++) == '\n') 245 goto hashing_done; 246 while (isspace (c)); 247 248 h = HASH (h, ' '); 249 } 250 251 /* C is now the first non-space. */ 252 h = HASH (h, ig_case ? tolower (c) : c); 253 } 254 break; 255 256 case IGNORE_TAB_EXPANSION: 257 case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE: 258 case IGNORE_TRAILING_SPACE: 259 { 260 size_t column = 0; 261 while ((c = *p++) != '\n') 262 { 263 if (ig_white_space & IGNORE_TRAILING_SPACE 264 && isspace (c)) 265 { 266 char const *p1 = p; 267 unsigned char c1; 268 do 269 if ((c1 = *p1++) == '\n') 270 { 271 p = p1; 272 goto hashing_done; 273 } 274 while (isspace (c1)); 275 } 276 277 size_t repetitions = 1; 278 279 if (ig_white_space & IGNORE_TAB_EXPANSION) 280 switch (c) 281 { 282 case '\b': 283 column -= 0 < column; 284 break; 285 286 case '\t': 287 c = ' '; 288 repetitions = tabsize - column % tabsize; 289 column = (column + repetitions < column 290 ? 0 291 : column + repetitions); 292 break; 293 294 case '\r': 295 column = 0; 296 break; 297 298 default: 299 column++; 300 break; 301 } 302 303 if (ig_case) 304 c = tolower (c); 305 306 do 307 h = HASH (h, c); 308 while (--repetitions != 0); 309 } 310 } 311 break; 312 313 default: 314 if (ig_case) 315 while ((c = *p++) != '\n') 316 h = HASH (h, tolower (c)); 317 else 318 while ((c = *p++) != '\n') 319 h = HASH (h, c); 320 break; 321 } 322 323 hashing_done:; 324 325 bucket = &buckets[h % nbuckets]; 326 length = p - ip - 1; 327 328 if (p == bufend 329 && current->missing_newline 330 && ROBUST_OUTPUT_STYLE (output_style)) 331 { 332 /* The last line is incomplete and we do not silently 333 complete lines. If the line cannot compare equal to any 334 complete line, put it into buckets[-1] so that it can 335 compare equal only to the other file's incomplete line 336 (if one exists). */ 337 if (ig_white_space < IGNORE_TRAILING_SPACE) 338 bucket = &buckets[-1]; 339 } 340 341 for (i = *bucket; ; i = eqs[i].next) 342 if (!i) 343 { 344 /* Create a new equivalence class in this bucket. */ 345 i = eqs_index++; 346 if (i == eqs_alloc) 347 { 348 if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc) 349 xalloc_die (); 350 eqs_alloc *= 2; 351 eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs); 352 } 353 eqs[i].next = *bucket; 354 eqs[i].hash = h; 355 eqs[i].line = ip; 356 eqs[i].length = length; 357 *bucket = i; 358 break; 359 } 360 else if (eqs[i].hash == h) 361 { 362 char const *eqline = eqs[i].line; 363 364 /* Reuse existing class if lines_differ reports the lines 365 equal. */ 366 if (eqs[i].length == length) 367 { 368 /* Reuse existing equivalence class if the lines are identical. 369 This detects the common case of exact identity 370 faster than lines_differ would. */ 371 if (memcmp (eqline, ip, length) == 0) 372 break; 373 if (!same_length_diff_contents_compare_anyway) 374 continue; 375 } 376 else if (!diff_length_compare_anyway) 377 continue; 378 379 if (! lines_differ (eqline, ip)) 380 break; 381 } 382 383 /* Maybe increase the size of the line table. */ 384 if (line == alloc_lines) 385 { 386 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 387 if (PTRDIFF_MAX / 3 <= alloc_lines 388 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base 389 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base) 390 xalloc_die (); 391 alloc_lines = 2 * alloc_lines - linbuf_base; 392 cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs); 393 linbuf += linbuf_base; 394 linbuf = xrealloc (linbuf, 395 (alloc_lines - linbuf_base) * sizeof *linbuf); 396 linbuf -= linbuf_base; 397 } 398 linbuf[line] = ip; 399 cureqs[line] = i; 400 ++line; 401 } 402 403 current->buffered_lines = line; 404 405 for (i = 0; ; i++) 406 { 407 /* Record the line start for lines in the suffix that we care about. 408 Record one more line start than lines, 409 so that we can compute the length of any buffered line. */ 410 if (line == alloc_lines) 411 { 412 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 413 if (PTRDIFF_MAX / 3 <= alloc_lines 414 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base 415 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base) 416 xalloc_die (); 417 alloc_lines = 2 * alloc_lines - linbuf_base; 418 linbuf += linbuf_base; 419 linbuf = xrealloc (linbuf, 420 (alloc_lines - linbuf_base) * sizeof *linbuf); 421 linbuf -= linbuf_base; 422 } 423 linbuf[line] = p; 424 425 if (p == bufend) 426 { 427 /* If the last line is incomplete and we do not silently 428 complete lines, don't count its appended newline. */ 429 if (current->missing_newline && ROBUST_OUTPUT_STYLE (output_style)) 430 linbuf[line]--; 431 break; 432 } 433 434 if (context <= i && no_diff_means_no_output) 435 break; 436 437 line++; 438 439 while (*p++ != '\n') 440 continue; 441 } 442 443 /* Done with cache in local variables. */ 444 current->linbuf = linbuf; 445 current->valid_lines = line; 446 current->alloc_lines = alloc_lines; 447 current->equivs = cureqs; 448 equivs = eqs; 449 equivs_alloc = eqs_alloc; 450 equivs_index = eqs_index; 451 } 452 453 /* Prepare the text. Make sure the text end is initialized. 454 Make sure text ends in a newline, 455 but remember that we had to add one. 456 Strip trailing CRs, if that was requested. */ 457 458 static void 459 prepare_text (struct file_data *current) 460 { 461 size_t buffered = current->buffered; 462 char *p = FILE_BUFFER (current); 463 char *dst; 464 465 if (buffered == 0 || p[buffered - 1] == '\n') 466 current->missing_newline = false; 467 else 468 { 469 p[buffered++] = '\n'; 470 current->missing_newline = true; 471 } 472 473 if (!p) 474 return; 475 476 /* Don't use uninitialized storage when planting or using sentinels. */ 477 memset (p + buffered, 0, sizeof (word)); 478 479 if (strip_trailing_cr && (dst = memchr (p, '\r', buffered))) 480 { 481 char const *src = dst; 482 char const *srclim = p + buffered; 483 484 do 485 dst += ! ((*dst = *src++) == '\r' && *src == '\n'); 486 while (src < srclim); 487 488 buffered -= src - dst; 489 } 490 491 current->buffered = buffered; 492 } 493 494 /* We have found N lines in a buffer of size S; guess the 495 proportionate number of lines that will be found in a buffer of 496 size T. However, do not guess a number of lines so large that the 497 resulting line table might cause overflow in size calculations. */ 498 static lin 499 guess_lines (lin n, size_t s, size_t t) 500 { 501 size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1); 502 lin guessed_lines = MAX (1, t / guessed_bytes_per_line); 503 return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5; 504 } 505 506 /* Given a vector of two file_data objects, find the identical 507 prefixes and suffixes of each object. */ 508 509 static void 510 find_identical_ends (struct file_data filevec[]) 511 { 512 word *w0, *w1; 513 char *p0, *p1, *buffer0, *buffer1; 514 char const *end0, *beg0; 515 char const **linbuf0, **linbuf1; 516 lin i, lines; 517 size_t n0, n1; 518 lin alloc_lines0, alloc_lines1; 519 lin buffered_prefix, prefix_count, prefix_mask; 520 lin middle_guess, suffix_guess; 521 522 slurp (&filevec[0]); 523 prepare_text (&filevec[0]); 524 if (filevec[0].desc != filevec[1].desc) 525 { 526 slurp (&filevec[1]); 527 prepare_text (&filevec[1]); 528 } 529 else 530 { 531 filevec[1].buffer = filevec[0].buffer; 532 filevec[1].bufsize = filevec[0].bufsize; 533 filevec[1].buffered = filevec[0].buffered; 534 filevec[1].missing_newline = filevec[0].missing_newline; 535 } 536 537 /* Find identical prefix. */ 538 539 w0 = filevec[0].buffer; 540 w1 = filevec[1].buffer; 541 p0 = buffer0 = (char *) w0; 542 p1 = buffer1 = (char *) w1; 543 n0 = filevec[0].buffered; 544 n1 = filevec[1].buffered; 545 546 if (p0 == p1) 547 /* The buffers are the same; sentinels won't work. */ 548 p0 = p1 += n1; 549 else 550 { 551 /* Insert end sentinels, in this case characters that are guaranteed 552 to make the equality test false, and thus terminate the loop. */ 553 554 if (n0 < n1) 555 p0[n0] = ~p1[n0]; 556 else 557 p1[n1] = ~p0[n1]; 558 559 /* Loop until first mismatch, or to the sentinel characters. */ 560 561 /* Compare a word at a time for speed. */ 562 while (*w0 == *w1) 563 w0++, w1++; 564 565 /* Do the last few bytes of comparison a byte at a time. */ 566 p0 = (char *) w0; 567 p1 = (char *) w1; 568 while (*p0 == *p1) 569 p0++, p1++; 570 571 /* Don't mistakenly count missing newline as part of prefix. */ 572 if (ROBUST_OUTPUT_STYLE (output_style) 573 && ((buffer0 + n0 - filevec[0].missing_newline < p0) 574 != 575 (buffer1 + n1 - filevec[1].missing_newline < p1))) 576 p0--, p1--; 577 } 578 579 /* Now P0 and P1 point at the first nonmatching characters. */ 580 581 /* Skip back to last line-beginning in the prefix, 582 and then discard up to HORIZON_LINES lines from the prefix. */ 583 i = horizon_lines; 584 while (p0 != buffer0 && (p0[-1] != '\n' || i--)) 585 p0--, p1--; 586 587 /* Record the prefix. */ 588 filevec[0].prefix_end = p0; 589 filevec[1].prefix_end = p1; 590 591 /* Find identical suffix. */ 592 593 /* P0 and P1 point beyond the last chars not yet compared. */ 594 p0 = buffer0 + n0; 595 p1 = buffer1 + n1; 596 597 if (! ROBUST_OUTPUT_STYLE (output_style) 598 || filevec[0].missing_newline == filevec[1].missing_newline) 599 { 600 end0 = p0; /* Addr of last char in file 0. */ 601 602 /* Get value of P0 at which we should stop scanning backward: 603 this is when either P0 or P1 points just past the last char 604 of the identical prefix. */ 605 beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1); 606 607 /* Scan back until chars don't match or we reach that point. */ 608 while (p0 != beg0) 609 if (*--p0 != *--p1) 610 { 611 /* Point at the first char of the matching suffix. */ 612 ++p0, ++p1; 613 beg0 = p0; 614 break; 615 } 616 617 /* Are we at a line-beginning in both files? If not, add the rest of 618 this line to the main body. Discard up to HORIZON_LINES lines from 619 the identical suffix. Also, discard one extra line, 620 because shift_boundaries may need it. */ 621 i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n') 622 && 623 (buffer1 == p1 || p1[-1] == '\n')); 624 while (i-- && p0 != end0) 625 while (*p0++ != '\n') 626 continue; 627 628 p1 += p0 - beg0; 629 } 630 631 /* Record the suffix. */ 632 filevec[0].suffix_begin = p0; 633 filevec[1].suffix_begin = p1; 634 635 /* Calculate number of lines of prefix to save. 636 637 prefix_count == 0 means save the whole prefix; 638 we need this for options like -D that output the whole file, 639 or for enormous contexts (to avoid worrying about arithmetic overflow). 640 We also need it for options like -F that output some preceding line; 641 at least we will need to find the last few lines, 642 but since we don't know how many, it's easiest to find them all. 643 644 Otherwise, prefix_count != 0. Save just prefix_count lines at start 645 of the line buffer; they'll be moved to the proper location later. 646 Handle 1 more line than the context says (because we count 1 too many), 647 rounded up to the next power of 2 to speed index computation. */ 648 649 if (no_diff_means_no_output && ! function_regexp.fastmap 650 && context < LIN_MAX / 4 && context < n0) 651 { 652 middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end); 653 suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0); 654 for (prefix_count = 1; prefix_count <= context; prefix_count *= 2) 655 continue; 656 alloc_lines0 = (prefix_count + middle_guess 657 + MIN (context, suffix_guess)); 658 } 659 else 660 { 661 prefix_count = 0; 662 alloc_lines0 = guess_lines (0, 0, n0); 663 } 664 665 prefix_mask = prefix_count - 1; 666 lines = 0; 667 linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0); 668 p0 = buffer0; 669 670 /* If the prefix is needed, find the prefix lines. */ 671 if (! (no_diff_means_no_output 672 && filevec[0].prefix_end == p0 673 && filevec[1].prefix_end == p1)) 674 { 675 end0 = filevec[0].prefix_end; 676 while (p0 != end0) 677 { 678 lin l = lines++ & prefix_mask; 679 if (l == alloc_lines0) 680 { 681 if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0) 682 xalloc_die (); 683 alloc_lines0 *= 2; 684 linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0); 685 } 686 linbuf0[l] = p0; 687 while (*p0++ != '\n') 688 continue; 689 } 690 } 691 buffered_prefix = prefix_count && context < lines ? context : lines; 692 693 /* Allocate line buffer 1. */ 694 695 middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end); 696 suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1); 697 alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess); 698 if (alloc_lines1 < buffered_prefix 699 || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1) 700 xalloc_die (); 701 linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1); 702 703 if (buffered_prefix != lines) 704 { 705 /* Rotate prefix lines to proper location. */ 706 for (i = 0; i < buffered_prefix; i++) 707 linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask]; 708 for (i = 0; i < buffered_prefix; i++) 709 linbuf0[i] = linbuf1[i]; 710 } 711 712 /* Initialize line buffer 1 from line buffer 0. */ 713 for (i = 0; i < buffered_prefix; i++) 714 linbuf1[i] = linbuf0[i] - buffer0 + buffer1; 715 716 /* Record the line buffer, adjusted so that 717 linbuf[0] points at the first differing line. */ 718 filevec[0].linbuf = linbuf0 + buffered_prefix; 719 filevec[1].linbuf = linbuf1 + buffered_prefix; 720 filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix; 721 filevec[0].alloc_lines = alloc_lines0 - buffered_prefix; 722 filevec[1].alloc_lines = alloc_lines1 - buffered_prefix; 723 filevec[0].prefix_lines = filevec[1].prefix_lines = lines; 724 } 725 726 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less 727 than 2**k. This table is derived from Chris K. Caldwell's list 728 <http://www.utm.edu/research/primes/lists/2small/>. */ 729 730 static unsigned char const prime_offset[] = 731 { 732 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3, 733 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21, 734 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27, 735 55, 93, 1, 57, 25 736 }; 737 738 /* Verify that this host's size_t is not too wide for the above table. */ 739 740 verify (sizeof (size_t) * CHAR_BIT <= sizeof prime_offset); 741 742 /* Given a vector of two file_data objects, read the file associated 743 with each one, and build the table of equivalence classes. 744 Return nonzero if either file appears to be a binary file. 745 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */ 746 747 bool 748 read_files (struct file_data filevec[], bool pretend_binary) 749 { 750 int i; 751 bool skip_test = text | pretend_binary; 752 bool appears_binary = pretend_binary | sip (&filevec[0], skip_test); 753 754 if (filevec[0].desc != filevec[1].desc) 755 appears_binary |= sip (&filevec[1], skip_test | appears_binary); 756 else 757 { 758 filevec[1].buffer = filevec[0].buffer; 759 filevec[1].bufsize = filevec[0].bufsize; 760 filevec[1].buffered = filevec[0].buffered; 761 } 762 if (appears_binary) 763 { 764 /* FIXME: If O_BINARY, this should set both files to binary mode. */ 765 return true; 766 } 767 768 find_identical_ends (filevec); 769 770 equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1; 771 if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc) 772 xalloc_die (); 773 equivs = xmalloc (equivs_alloc * sizeof *equivs); 774 /* Equivalence class 0 is permanently safe for lines that were not 775 hashed. Real equivalence classes start at 1. */ 776 equivs_index = 1; 777 778 /* Allocate (one plus) a prime number of hash buckets. Use a prime 779 number between 1/3 and 2/3 of the value of equiv_allocs, 780 approximately. */ 781 for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++) 782 continue; 783 nbuckets = ((size_t) 1 << i) - prime_offset[i]; 784 if (PTRDIFF_MAX / sizeof *buckets <= nbuckets) 785 xalloc_die (); 786 buckets = zalloc ((nbuckets + 1) * sizeof *buckets); 787 buckets++; 788 789 for (i = 0; i < 2; i++) 790 find_and_hash_each_line (&filevec[i]); 791 792 filevec[0].equiv_max = filevec[1].equiv_max = equivs_index; 793 794 free (equivs); 795 free (buckets - 1); 796 797 return false; 798 } 799