1 /* File I/O for GNU DIFF. 2 3 Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006, 2009-2010 4 Free Software Foundation, Inc. 5 6 This file is part of GNU DIFF. 7 8 This program is free software: you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation, either version 3 of the License, or 11 (at your option) any later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 20 21 #include "diff.h" 22 #include <cmpbuf.h> 23 #include <file-type.h> 24 #include <xalloc.h> 25 26 /* Rotate an unsigned value to the left. */ 27 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n))) 28 29 /* Given a hash value and a new character, return a new hash value. */ 30 #define HASH(h, c) ((c) + ROL (h, 7)) 31 32 /* The type of a hash value. */ 33 typedef size_t hash_value; 34 verify (! TYPE_SIGNED (hash_value)); 35 36 /* Lines are put into equivalence classes of lines that match in lines_differ. 37 Each equivalence class is represented by one of these structures, 38 but only while the classes are being computed. 39 Afterward, each class is represented by a number. */ 40 struct equivclass 41 { 42 lin next; /* Next item in this bucket. */ 43 hash_value hash; /* Hash of lines in this class. */ 44 char const *line; /* A line that fits this class. */ 45 size_t length; /* That line's length, not counting its newline. */ 46 }; 47 48 /* Hash-table: array of buckets, each being a chain of equivalence classes. 49 buckets[-1] is reserved for incomplete lines. */ 50 static lin *buckets; 51 52 /* Number of buckets in the hash table array, not counting buckets[-1]. */ 53 static size_t nbuckets; 54 55 /* Array in which the equivalence classes are allocated. 56 The bucket-chains go through the elements in this array. 57 The number of an equivalence class is its index in this array. */ 58 static struct equivclass *equivs; 59 60 /* Index of first free element in the array `equivs'. */ 61 static lin equivs_index; 62 63 /* Number of elements allocated in the array `equivs'. */ 64 static lin equivs_alloc; 65 66 /* Read a block of data into a file buffer, checking for EOF and error. */ 67 68 void 69 file_block_read (struct file_data *current, size_t size) 70 { 71 if (size && ! current->eof) 72 { 73 size_t s = block_read (current->desc, 74 FILE_BUFFER (current) + current->buffered, size); 75 if (s == SIZE_MAX) 76 pfatal_with_name (current->name); 77 current->buffered += s; 78 current->eof = s < size; 79 } 80 } 81 82 /* Check for binary files and compare them for exact identity. */ 83 84 /* Return 1 if BUF contains a non text character. 85 SIZE is the number of characters in BUF. */ 86 87 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0) 88 89 /* Get ready to read the current file. 90 Return nonzero if SKIP_TEST is zero, 91 and if it appears to be a binary file. */ 92 93 static bool 94 sip (struct file_data *current, bool skip_test) 95 { 96 /* If we have a nonexistent file at this stage, treat it as empty. */ 97 if (current->desc < 0) 98 { 99 /* Leave room for a sentinel. */ 100 current->bufsize = sizeof (word); 101 current->buffer = xmalloc (current->bufsize); 102 } 103 else 104 { 105 current->bufsize = buffer_lcm (sizeof (word), 106 STAT_BLOCKSIZE (current->stat), 107 PTRDIFF_MAX - 2 * sizeof (word)); 108 current->buffer = xmalloc (current->bufsize); 109 110 if (! skip_test) 111 { 112 /* Check first part of file to see if it's a binary file. */ 113 114 /* FIXME: if O_BINARY, this should revert to text mode 115 if the file is not binary. */ 116 117 file_block_read (current, current->bufsize); 118 return binary_file_p (current->buffer, current->buffered); 119 } 120 } 121 122 current->buffered = 0; 123 current->eof = false; 124 return false; 125 } 126 127 /* Slurp the rest of the current file completely into memory. */ 128 129 static void 130 slurp (struct file_data *current) 131 { 132 size_t cc; 133 134 if (current->desc < 0) 135 { 136 /* The file is nonexistent. */ 137 return; 138 } 139 140 if (S_ISREG (current->stat.st_mode)) 141 { 142 /* It's a regular file; slurp in the rest all at once. */ 143 144 /* Get the size out of the stat block. 145 Allocate just enough room for appended newline plus word sentinel, 146 plus word-alignment since we want the buffer word-aligned. */ 147 size_t file_size = current->stat.st_size; 148 cc = file_size + 2 * sizeof (word) - file_size % sizeof (word); 149 if (file_size != current->stat.st_size || cc < file_size 150 || PTRDIFF_MAX <= cc) 151 xalloc_die (); 152 153 if (current->bufsize < cc) 154 { 155 current->bufsize = cc; 156 current->buffer = xrealloc (current->buffer, cc); 157 } 158 159 /* Try to read at least 1 more byte than the size indicates, to 160 detect whether the file is growing. This is a nicety for 161 users who run 'diff' on files while they are changing. */ 162 163 if (current->buffered <= file_size) 164 { 165 file_block_read (current, file_size + 1 - current->buffered); 166 if (current->buffered <= file_size) 167 return; 168 } 169 } 170 171 /* It's not a regular file, or it's a growing regular file; read it, 172 growing the buffer as needed. */ 173 174 file_block_read (current, current->bufsize - current->buffered); 175 176 if (current->buffered) 177 { 178 while (current->buffered == current->bufsize) 179 { 180 if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize) 181 xalloc_die (); 182 current->bufsize *= 2; 183 current->buffer = xrealloc (current->buffer, current->bufsize); 184 file_block_read (current, current->bufsize - current->buffered); 185 } 186 187 /* Allocate just enough room for appended newline plus word 188 sentinel, plus word-alignment. */ 189 cc = current->buffered + 2 * sizeof (word); 190 current->bufsize = cc - cc % sizeof (word); 191 current->buffer = xrealloc (current->buffer, current->bufsize); 192 } 193 } 194 195 /* Split the file into lines, simultaneously computing the equivalence 196 class for each line. */ 197 198 static void 199 find_and_hash_each_line (struct file_data *current) 200 { 201 hash_value h; 202 char const *p = current->prefix_end; 203 unsigned char c; 204 lin i, *bucket; 205 size_t length; 206 207 /* Cache often-used quantities in local variables to help the compiler. */ 208 char const **linbuf = current->linbuf; 209 lin alloc_lines = current->alloc_lines; 210 lin line = 0; 211 lin linbuf_base = current->linbuf_base; 212 lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs); 213 struct equivclass *eqs = equivs; 214 lin eqs_index = equivs_index; 215 lin eqs_alloc = equivs_alloc; 216 char const *suffix_begin = current->suffix_begin; 217 char const *bufend = FILE_BUFFER (current) + current->buffered; 218 bool diff_length_compare_anyway = 219 ignore_white_space != IGNORE_NO_WHITE_SPACE; 220 bool same_length_diff_contents_compare_anyway = 221 diff_length_compare_anyway | ignore_case; 222 223 while (p < suffix_begin) 224 { 225 char const *ip = p; 226 227 h = 0; 228 229 /* Hash this line until we find a newline. */ 230 if (ignore_case) 231 switch (ignore_white_space) 232 { 233 case IGNORE_ALL_SPACE: 234 while ((c = *p++) != '\n') 235 if (! isspace (c)) 236 h = HASH (h, tolower (c)); 237 break; 238 239 case IGNORE_SPACE_CHANGE: 240 while ((c = *p++) != '\n') 241 { 242 if (isspace (c)) 243 { 244 do 245 if ((c = *p++) == '\n') 246 goto hashing_done; 247 while (isspace (c)); 248 249 h = HASH (h, ' '); 250 } 251 252 /* C is now the first non-space. */ 253 h = HASH (h, tolower (c)); 254 } 255 break; 256 257 case IGNORE_TAB_EXPANSION: 258 { 259 size_t column = 0; 260 while ((c = *p++) != '\n') 261 { 262 size_t repetitions = 1; 263 264 switch (c) 265 { 266 case '\b': 267 column -= 0 < column; 268 break; 269 270 case '\t': 271 c = ' '; 272 repetitions = tabsize - column % tabsize; 273 column = (column + repetitions < column 274 ? 0 275 : column + repetitions); 276 break; 277 278 case '\r': 279 column = 0; 280 break; 281 282 default: 283 c = tolower (c); 284 column++; 285 break; 286 } 287 288 do 289 h = HASH (h, c); 290 while (--repetitions != 0); 291 } 292 } 293 break; 294 295 default: 296 while ((c = *p++) != '\n') 297 h = HASH (h, tolower (c)); 298 break; 299 } 300 else 301 switch (ignore_white_space) 302 { 303 case IGNORE_ALL_SPACE: 304 while ((c = *p++) != '\n') 305 if (! isspace (c)) 306 h = HASH (h, c); 307 break; 308 309 case IGNORE_SPACE_CHANGE: 310 while ((c = *p++) != '\n') 311 { 312 if (isspace (c)) 313 { 314 do 315 if ((c = *p++) == '\n') 316 goto hashing_done; 317 while (isspace (c)); 318 319 h = HASH (h, ' '); 320 } 321 322 /* C is now the first non-space. */ 323 h = HASH (h, c); 324 } 325 break; 326 327 case IGNORE_TAB_EXPANSION: 328 { 329 size_t column = 0; 330 while ((c = *p++) != '\n') 331 { 332 size_t repetitions = 1; 333 334 switch (c) 335 { 336 case '\b': 337 column -= 0 < column; 338 break; 339 340 case '\t': 341 c = ' '; 342 repetitions = tabsize - column % tabsize; 343 column = (column + repetitions < column 344 ? 0 345 : column + repetitions); 346 break; 347 348 case '\r': 349 column = 0; 350 break; 351 352 default: 353 column++; 354 break; 355 } 356 357 do 358 h = HASH (h, c); 359 while (--repetitions != 0); 360 } 361 } 362 break; 363 364 default: 365 while ((c = *p++) != '\n') 366 h = HASH (h, c); 367 break; 368 } 369 370 hashing_done:; 371 372 bucket = &buckets[h % nbuckets]; 373 length = p - ip - 1; 374 375 if (p == bufend 376 && current->missing_newline 377 && ROBUST_OUTPUT_STYLE (output_style)) 378 { 379 /* The last line is incomplete and we do not silently 380 complete lines. If the line cannot compare equal to any 381 complete line, put it into buckets[-1] so that it can 382 compare equal only to the other file's incomplete line 383 (if one exists). */ 384 if (ignore_white_space < IGNORE_SPACE_CHANGE) 385 bucket = &buckets[-1]; 386 } 387 388 for (i = *bucket; ; i = eqs[i].next) 389 if (!i) 390 { 391 /* Create a new equivalence class in this bucket. */ 392 i = eqs_index++; 393 if (i == eqs_alloc) 394 { 395 if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc) 396 xalloc_die (); 397 eqs_alloc *= 2; 398 eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs); 399 } 400 eqs[i].next = *bucket; 401 eqs[i].hash = h; 402 eqs[i].line = ip; 403 eqs[i].length = length; 404 *bucket = i; 405 break; 406 } 407 else if (eqs[i].hash == h) 408 { 409 char const *eqline = eqs[i].line; 410 411 /* Reuse existing class if lines_differ reports the lines 412 equal. */ 413 if (eqs[i].length == length) 414 { 415 /* Reuse existing equivalence class if the lines are identical. 416 This detects the common case of exact identity 417 faster than lines_differ would. */ 418 if (memcmp (eqline, ip, length) == 0) 419 break; 420 if (!same_length_diff_contents_compare_anyway) 421 continue; 422 } 423 else if (!diff_length_compare_anyway) 424 continue; 425 426 if (! lines_differ (eqline, ip)) 427 break; 428 } 429 430 /* Maybe increase the size of the line table. */ 431 if (line == alloc_lines) 432 { 433 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 434 if (PTRDIFF_MAX / 3 <= alloc_lines 435 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base 436 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base) 437 xalloc_die (); 438 alloc_lines = 2 * alloc_lines - linbuf_base; 439 cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs); 440 linbuf += linbuf_base; 441 linbuf = xrealloc (linbuf, 442 (alloc_lines - linbuf_base) * sizeof *linbuf); 443 linbuf -= linbuf_base; 444 } 445 linbuf[line] = ip; 446 cureqs[line] = i; 447 ++line; 448 } 449 450 current->buffered_lines = line; 451 452 for (i = 0; ; i++) 453 { 454 /* Record the line start for lines in the suffix that we care about. 455 Record one more line start than lines, 456 so that we can compute the length of any buffered line. */ 457 if (line == alloc_lines) 458 { 459 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 460 if (PTRDIFF_MAX / 3 <= alloc_lines 461 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base 462 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base) 463 xalloc_die (); 464 alloc_lines = 2 * alloc_lines - linbuf_base; 465 linbuf += linbuf_base; 466 linbuf = xrealloc (linbuf, 467 (alloc_lines - linbuf_base) * sizeof *linbuf); 468 linbuf -= linbuf_base; 469 } 470 linbuf[line] = p; 471 472 if (p == bufend) 473 { 474 /* If the last line is incomplete and we do not silently 475 complete lines, don't count its appended newline. */ 476 if (current->missing_newline && ROBUST_OUTPUT_STYLE (output_style)) 477 linbuf[line]--; 478 break; 479 } 480 481 if (context <= i && no_diff_means_no_output) 482 break; 483 484 line++; 485 486 while (*p++ != '\n') 487 continue; 488 } 489 490 /* Done with cache in local variables. */ 491 current->linbuf = linbuf; 492 current->valid_lines = line; 493 current->alloc_lines = alloc_lines; 494 current->equivs = cureqs; 495 equivs = eqs; 496 equivs_alloc = eqs_alloc; 497 equivs_index = eqs_index; 498 } 499 500 /* Prepare the text. Make sure the text end is initialized. 501 Make sure text ends in a newline, 502 but remember that we had to add one. 503 Strip trailing CRs, if that was requested. */ 504 505 static void 506 prepare_text (struct file_data *current) 507 { 508 size_t buffered = current->buffered; 509 char *p = FILE_BUFFER (current); 510 char *dst; 511 512 if (buffered == 0 || p[buffered - 1] == '\n') 513 current->missing_newline = false; 514 else 515 { 516 p[buffered++] = '\n'; 517 current->missing_newline = true; 518 } 519 520 if (!p) 521 return; 522 523 /* Don't use uninitialized storage when planting or using sentinels. */ 524 memset (p + buffered, 0, sizeof (word)); 525 526 if (strip_trailing_cr && (dst = memchr (p, '\r', buffered))) 527 { 528 char const *src = dst; 529 char const *srclim = p + buffered; 530 531 do 532 dst += ! ((*dst = *src++) == '\r' && *src == '\n'); 533 while (src < srclim); 534 535 buffered -= src - dst; 536 } 537 538 current->buffered = buffered; 539 } 540 541 /* We have found N lines in a buffer of size S; guess the 542 proportionate number of lines that will be found in a buffer of 543 size T. However, do not guess a number of lines so large that the 544 resulting line table might cause overflow in size calculations. */ 545 static lin 546 guess_lines (lin n, size_t s, size_t t) 547 { 548 size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1); 549 lin guessed_lines = MAX (1, t / guessed_bytes_per_line); 550 return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5; 551 } 552 553 /* Given a vector of two file_data objects, find the identical 554 prefixes and suffixes of each object. */ 555 556 static void 557 find_identical_ends (struct file_data filevec[]) 558 { 559 word *w0, *w1; 560 char *p0, *p1, *buffer0, *buffer1; 561 char const *end0, *beg0; 562 char const **linbuf0, **linbuf1; 563 lin i, lines; 564 size_t n0, n1; 565 lin alloc_lines0, alloc_lines1; 566 lin buffered_prefix, prefix_count, prefix_mask; 567 lin middle_guess, suffix_guess; 568 569 slurp (&filevec[0]); 570 prepare_text (&filevec[0]); 571 if (filevec[0].desc != filevec[1].desc) 572 { 573 slurp (&filevec[1]); 574 prepare_text (&filevec[1]); 575 } 576 else 577 { 578 filevec[1].buffer = filevec[0].buffer; 579 filevec[1].bufsize = filevec[0].bufsize; 580 filevec[1].buffered = filevec[0].buffered; 581 filevec[1].missing_newline = filevec[0].missing_newline; 582 } 583 584 /* Find identical prefix. */ 585 586 w0 = filevec[0].buffer; 587 w1 = filevec[1].buffer; 588 p0 = buffer0 = (char *) w0; 589 p1 = buffer1 = (char *) w1; 590 n0 = filevec[0].buffered; 591 n1 = filevec[1].buffered; 592 593 if (p0 == p1) 594 /* The buffers are the same; sentinels won't work. */ 595 p0 = p1 += n1; 596 else 597 { 598 /* Insert end sentinels, in this case characters that are guaranteed 599 to make the equality test false, and thus terminate the loop. */ 600 601 if (n0 < n1) 602 p0[n0] = ~p1[n0]; 603 else 604 p1[n1] = ~p0[n1]; 605 606 /* Loop until first mismatch, or to the sentinel characters. */ 607 608 /* Compare a word at a time for speed. */ 609 while (*w0 == *w1) 610 w0++, w1++; 611 612 /* Do the last few bytes of comparison a byte at a time. */ 613 p0 = (char *) w0; 614 p1 = (char *) w1; 615 while (*p0 == *p1) 616 p0++, p1++; 617 618 /* Don't mistakenly count missing newline as part of prefix. */ 619 if (ROBUST_OUTPUT_STYLE (output_style) 620 && ((buffer0 + n0 - filevec[0].missing_newline < p0) 621 != 622 (buffer1 + n1 - filevec[1].missing_newline < p1))) 623 p0--, p1--; 624 } 625 626 /* Now P0 and P1 point at the first nonmatching characters. */ 627 628 /* Skip back to last line-beginning in the prefix, 629 and then discard up to HORIZON_LINES lines from the prefix. */ 630 i = horizon_lines; 631 while (p0 != buffer0 && (p0[-1] != '\n' || i--)) 632 p0--, p1--; 633 634 /* Record the prefix. */ 635 filevec[0].prefix_end = p0; 636 filevec[1].prefix_end = p1; 637 638 /* Find identical suffix. */ 639 640 /* P0 and P1 point beyond the last chars not yet compared. */ 641 p0 = buffer0 + n0; 642 p1 = buffer1 + n1; 643 644 if (! ROBUST_OUTPUT_STYLE (output_style) 645 || filevec[0].missing_newline == filevec[1].missing_newline) 646 { 647 end0 = p0; /* Addr of last char in file 0. */ 648 649 /* Get value of P0 at which we should stop scanning backward: 650 this is when either P0 or P1 points just past the last char 651 of the identical prefix. */ 652 beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1); 653 654 /* Scan back until chars don't match or we reach that point. */ 655 while (p0 != beg0) 656 if (*--p0 != *--p1) 657 { 658 /* Point at the first char of the matching suffix. */ 659 ++p0, ++p1; 660 beg0 = p0; 661 break; 662 } 663 664 /* Are we at a line-beginning in both files? If not, add the rest of 665 this line to the main body. Discard up to HORIZON_LINES lines from 666 the identical suffix. Also, discard one extra line, 667 because shift_boundaries may need it. */ 668 i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n') 669 && 670 (buffer1 == p1 || p1[-1] == '\n')); 671 while (i-- && p0 != end0) 672 while (*p0++ != '\n') 673 continue; 674 675 p1 += p0 - beg0; 676 } 677 678 /* Record the suffix. */ 679 filevec[0].suffix_begin = p0; 680 filevec[1].suffix_begin = p1; 681 682 /* Calculate number of lines of prefix to save. 683 684 prefix_count == 0 means save the whole prefix; 685 we need this for options like -D that output the whole file, 686 or for enormous contexts (to avoid worrying about arithmetic overflow). 687 We also need it for options like -F that output some preceding line; 688 at least we will need to find the last few lines, 689 but since we don't know how many, it's easiest to find them all. 690 691 Otherwise, prefix_count != 0. Save just prefix_count lines at start 692 of the line buffer; they'll be moved to the proper location later. 693 Handle 1 more line than the context says (because we count 1 too many), 694 rounded up to the next power of 2 to speed index computation. */ 695 696 if (no_diff_means_no_output && ! function_regexp.fastmap 697 && context < LIN_MAX / 4 && context < n0) 698 { 699 middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end); 700 suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0); 701 for (prefix_count = 1; prefix_count <= context; prefix_count *= 2) 702 continue; 703 alloc_lines0 = (prefix_count + middle_guess 704 + MIN (context, suffix_guess)); 705 } 706 else 707 { 708 prefix_count = 0; 709 alloc_lines0 = guess_lines (0, 0, n0); 710 } 711 712 prefix_mask = prefix_count - 1; 713 lines = 0; 714 linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0); 715 p0 = buffer0; 716 717 /* If the prefix is needed, find the prefix lines. */ 718 if (! (no_diff_means_no_output 719 && filevec[0].prefix_end == p0 720 && filevec[1].prefix_end == p1)) 721 { 722 end0 = filevec[0].prefix_end; 723 while (p0 != end0) 724 { 725 lin l = lines++ & prefix_mask; 726 if (l == alloc_lines0) 727 { 728 if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0) 729 xalloc_die (); 730 alloc_lines0 *= 2; 731 linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0); 732 } 733 linbuf0[l] = p0; 734 while (*p0++ != '\n') 735 continue; 736 } 737 } 738 buffered_prefix = prefix_count && context < lines ? context : lines; 739 740 /* Allocate line buffer 1. */ 741 742 middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end); 743 suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1); 744 alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess); 745 if (alloc_lines1 < buffered_prefix 746 || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1) 747 xalloc_die (); 748 linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1); 749 750 if (buffered_prefix != lines) 751 { 752 /* Rotate prefix lines to proper location. */ 753 for (i = 0; i < buffered_prefix; i++) 754 linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask]; 755 for (i = 0; i < buffered_prefix; i++) 756 linbuf0[i] = linbuf1[i]; 757 } 758 759 /* Initialize line buffer 1 from line buffer 0. */ 760 for (i = 0; i < buffered_prefix; i++) 761 linbuf1[i] = linbuf0[i] - buffer0 + buffer1; 762 763 /* Record the line buffer, adjusted so that 764 linbuf[0] points at the first differing line. */ 765 filevec[0].linbuf = linbuf0 + buffered_prefix; 766 filevec[1].linbuf = linbuf1 + buffered_prefix; 767 filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix; 768 filevec[0].alloc_lines = alloc_lines0 - buffered_prefix; 769 filevec[1].alloc_lines = alloc_lines1 - buffered_prefix; 770 filevec[0].prefix_lines = filevec[1].prefix_lines = lines; 771 } 772 773 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less 774 than 2**k. This table is derived from Chris K. Caldwell's list 775 <http://www.utm.edu/research/primes/lists/2small/>. */ 776 777 static unsigned char const prime_offset[] = 778 { 779 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3, 780 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21, 781 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27, 782 55, 93, 1, 57, 25 783 }; 784 785 /* Verify that this host's size_t is not too wide for the above table. */ 786 787 verify (sizeof (size_t) * CHAR_BIT <= sizeof prime_offset); 788 789 /* Given a vector of two file_data objects, read the file associated 790 with each one, and build the table of equivalence classes. 791 Return nonzero if either file appears to be a binary file. 792 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */ 793 794 bool 795 read_files (struct file_data filevec[], bool pretend_binary) 796 { 797 int i; 798 bool skip_test = text | pretend_binary; 799 bool appears_binary = pretend_binary | sip (&filevec[0], skip_test); 800 801 if (filevec[0].desc != filevec[1].desc) 802 appears_binary |= sip (&filevec[1], skip_test | appears_binary); 803 else 804 { 805 filevec[1].buffer = filevec[0].buffer; 806 filevec[1].bufsize = filevec[0].bufsize; 807 filevec[1].buffered = filevec[0].buffered; 808 } 809 if (appears_binary) 810 { 811 /* FIXME: If O_BINARY, this should set both files to binary mode. */ 812 return true; 813 } 814 815 find_identical_ends (filevec); 816 817 equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1; 818 if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc) 819 xalloc_die (); 820 equivs = xmalloc (equivs_alloc * sizeof *equivs); 821 /* Equivalence class 0 is permanently safe for lines that were not 822 hashed. Real equivalence classes start at 1. */ 823 equivs_index = 1; 824 825 /* Allocate (one plus) a prime number of hash buckets. Use a prime 826 number between 1/3 and 2/3 of the value of equiv_allocs, 827 approximately. */ 828 for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++) 829 continue; 830 nbuckets = ((size_t) 1 << i) - prime_offset[i]; 831 if (PTRDIFF_MAX / sizeof *buckets <= nbuckets) 832 xalloc_die (); 833 buckets = zalloc ((nbuckets + 1) * sizeof *buckets); 834 buckets++; 835 836 for (i = 0; i < 2; i++) 837 find_and_hash_each_line (&filevec[i]); 838 839 filevec[0].equiv_max = filevec[1].equiv_max = equivs_index; 840 841 free (equivs); 842 free (buckets - 1); 843 844 return false; 845 } 846