1 /* File I/O for GNU DIFF. 2 3 Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006, 2009-2013 4 Free Software Foundation, Inc. 5 6 This file is part of GNU DIFF. 7 8 This program is free software: you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation, either version 3 of the License, or 11 (at your option) any later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 20 21 #include "diff.h" 22 #include <binary-io.h> 23 #include <cmpbuf.h> 24 #include <file-type.h> 25 #include <xalloc.h> 26 27 /* Rotate an unsigned value to the left. */ 28 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n))) 29 30 /* Given a hash value and a new character, return a new hash value. */ 31 #define HASH(h, c) ((c) + ROL (h, 7)) 32 33 /* The type of a hash value. */ 34 typedef size_t hash_value; 35 verify (! TYPE_SIGNED (hash_value)); 36 37 /* Lines are put into equivalence classes of lines that match in lines_differ. 38 Each equivalence class is represented by one of these structures, 39 but only while the classes are being computed. 40 Afterward, each class is represented by a number. */ 41 struct equivclass 42 { 43 lin next; /* Next item in this bucket. */ 44 hash_value hash; /* Hash of lines in this class. */ 45 char const *line; /* A line that fits this class. */ 46 size_t length; /* That line's length, not counting its newline. */ 47 }; 48 49 /* Hash-table: array of buckets, each being a chain of equivalence classes. 50 buckets[-1] is reserved for incomplete lines. */ 51 static lin *buckets; 52 53 /* Number of buckets in the hash table array, not counting buckets[-1]. */ 54 static size_t nbuckets; 55 56 /* Array in which the equivalence classes are allocated. 57 The bucket-chains go through the elements in this array. 58 The number of an equivalence class is its index in this array. */ 59 static struct equivclass *equivs; 60 61 /* Index of first free element in the array 'equivs'. */ 62 static lin equivs_index; 63 64 /* Number of elements allocated in the array 'equivs'. */ 65 static lin equivs_alloc; 66 67 /* Read a block of data into a file buffer, checking for EOF and error. */ 68 69 void 70 file_block_read (struct file_data *current, size_t size) 71 { 72 if (size && ! current->eof) 73 { 74 size_t s = block_read (current->desc, 75 FILE_BUFFER (current) + current->buffered, size); 76 if (s == SIZE_MAX) 77 pfatal_with_name (current->name); 78 current->buffered += s; 79 current->eof = s < size; 80 } 81 } 82 83 /* Check for binary files and compare them for exact identity. */ 84 85 /* Return 1 if BUF contains a non text character. 86 SIZE is the number of characters in BUF. */ 87 88 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0) 89 90 /* Get ready to read the current file. 91 Return nonzero if SKIP_TEST is zero, 92 and if it appears to be a binary file. */ 93 94 static bool 95 sip (struct file_data *current, bool skip_test) 96 { 97 /* If we have a nonexistent file at this stage, treat it as empty. */ 98 if (current->desc < 0) 99 { 100 /* Leave room for a sentinel. */ 101 current->bufsize = sizeof (word); 102 current->buffer = xmalloc (current->bufsize); 103 } 104 else 105 { 106 current->bufsize = buffer_lcm (sizeof (word), 107 STAT_BLOCKSIZE (current->stat), 108 PTRDIFF_MAX - 2 * sizeof (word)); 109 current->buffer = xmalloc (current->bufsize); 110 111 if (! skip_test) 112 { 113 /* Check first part of file to see if it's a binary file. */ 114 115 int prev_mode = set_binary_mode (current->desc, O_BINARY); 116 off_t buffered; 117 file_block_read (current, current->bufsize); 118 buffered = current->buffered; 119 120 if (prev_mode != O_BINARY) 121 { 122 /* Revert to text mode and seek back to the start to reread 123 the file. Use relative seek, since file descriptors 124 like stdin might not start at offset zero. */ 125 if (lseek (current->desc, - buffered, SEEK_CUR) < 0) 126 pfatal_with_name (current->name); 127 set_binary_mode (current->desc, prev_mode); 128 current->buffered = 0; 129 current->eof = false; 130 } 131 132 return binary_file_p (current->buffer, buffered); 133 } 134 } 135 136 current->buffered = 0; 137 current->eof = false; 138 return false; 139 } 140 141 /* Slurp the rest of the current file completely into memory. */ 142 143 static void 144 slurp (struct file_data *current) 145 { 146 size_t cc; 147 148 if (current->desc < 0) 149 { 150 /* The file is nonexistent. */ 151 return; 152 } 153 154 if (S_ISREG (current->stat.st_mode)) 155 { 156 /* It's a regular file; slurp in the rest all at once. */ 157 158 /* Get the size out of the stat block. 159 Allocate just enough room for appended newline plus word sentinel, 160 plus word-alignment since we want the buffer word-aligned. */ 161 size_t file_size = current->stat.st_size; 162 cc = file_size + 2 * sizeof (word) - file_size % sizeof (word); 163 if (file_size != current->stat.st_size || cc < file_size 164 || PTRDIFF_MAX <= cc) 165 xalloc_die (); 166 167 if (current->bufsize < cc) 168 { 169 current->bufsize = cc; 170 current->buffer = xrealloc (current->buffer, cc); 171 } 172 173 /* Try to read at least 1 more byte than the size indicates, to 174 detect whether the file is growing. This is a nicety for 175 users who run 'diff' on files while they are changing. */ 176 177 if (current->buffered <= file_size) 178 { 179 file_block_read (current, file_size + 1 - current->buffered); 180 if (current->buffered <= file_size) 181 return; 182 } 183 } 184 185 /* It's not a regular file, or it's a growing regular file; read it, 186 growing the buffer as needed. */ 187 188 file_block_read (current, current->bufsize - current->buffered); 189 190 if (current->buffered) 191 { 192 while (current->buffered == current->bufsize) 193 { 194 if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize) 195 xalloc_die (); 196 current->bufsize *= 2; 197 current->buffer = xrealloc (current->buffer, current->bufsize); 198 file_block_read (current, current->bufsize - current->buffered); 199 } 200 201 /* Allocate just enough room for appended newline plus word 202 sentinel, plus word-alignment. */ 203 cc = current->buffered + 2 * sizeof (word); 204 current->bufsize = cc - cc % sizeof (word); 205 current->buffer = xrealloc (current->buffer, current->bufsize); 206 } 207 } 208 209 /* Split the file into lines, simultaneously computing the equivalence 210 class for each line. */ 211 212 static void 213 find_and_hash_each_line (struct file_data *current) 214 { 215 char const *p = current->prefix_end; 216 lin i, *bucket; 217 size_t length; 218 219 /* Cache often-used quantities in local variables to help the compiler. */ 220 char const **linbuf = current->linbuf; 221 lin alloc_lines = current->alloc_lines; 222 lin line = 0; 223 lin linbuf_base = current->linbuf_base; 224 lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs); 225 struct equivclass *eqs = equivs; 226 lin eqs_index = equivs_index; 227 lin eqs_alloc = equivs_alloc; 228 char const *suffix_begin = current->suffix_begin; 229 char const *bufend = FILE_BUFFER (current) + current->buffered; 230 bool ig_case = ignore_case; 231 enum DIFF_white_space ig_white_space = ignore_white_space; 232 bool diff_length_compare_anyway = 233 ig_white_space != IGNORE_NO_WHITE_SPACE; 234 bool same_length_diff_contents_compare_anyway = 235 diff_length_compare_anyway | ig_case; 236 237 while (p < suffix_begin) 238 { 239 char const *ip = p; 240 hash_value h = 0; 241 unsigned char c; 242 243 /* Hash this line until we find a newline. */ 244 switch (ig_white_space) 245 { 246 case IGNORE_ALL_SPACE: 247 while ((c = *p++) != '\n') 248 if (! isspace (c)) 249 h = HASH (h, ig_case ? tolower (c) : c); 250 break; 251 252 case IGNORE_SPACE_CHANGE: 253 while ((c = *p++) != '\n') 254 { 255 if (isspace (c)) 256 { 257 do 258 if ((c = *p++) == '\n') 259 goto hashing_done; 260 while (isspace (c)); 261 262 h = HASH (h, ' '); 263 } 264 265 /* C is now the first non-space. */ 266 h = HASH (h, ig_case ? tolower (c) : c); 267 } 268 break; 269 270 case IGNORE_TAB_EXPANSION: 271 case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE: 272 case IGNORE_TRAILING_SPACE: 273 { 274 size_t column = 0; 275 while ((c = *p++) != '\n') 276 { 277 if (ig_white_space & IGNORE_TRAILING_SPACE 278 && isspace (c)) 279 { 280 char const *p1 = p; 281 unsigned char c1; 282 do 283 if ((c1 = *p1++) == '\n') 284 { 285 p = p1; 286 goto hashing_done; 287 } 288 while (isspace (c1)); 289 } 290 291 size_t repetitions = 1; 292 293 if (ig_white_space & IGNORE_TAB_EXPANSION) 294 switch (c) 295 { 296 case '\b': 297 column -= 0 < column; 298 break; 299 300 case '\t': 301 c = ' '; 302 repetitions = tabsize - column % tabsize; 303 column = (column + repetitions < column 304 ? 0 305 : column + repetitions); 306 break; 307 308 case '\r': 309 column = 0; 310 break; 311 312 default: 313 column++; 314 break; 315 } 316 317 if (ig_case) 318 c = tolower (c); 319 320 do 321 h = HASH (h, c); 322 while (--repetitions != 0); 323 } 324 } 325 break; 326 327 default: 328 if (ig_case) 329 while ((c = *p++) != '\n') 330 h = HASH (h, tolower (c)); 331 else 332 while ((c = *p++) != '\n') 333 h = HASH (h, c); 334 break; 335 } 336 337 hashing_done:; 338 339 bucket = &buckets[h % nbuckets]; 340 length = p - ip - 1; 341 342 if (p == bufend 343 && current->missing_newline 344 && ROBUST_OUTPUT_STYLE (output_style)) 345 { 346 /* The last line is incomplete and we do not silently 347 complete lines. If the line cannot compare equal to any 348 complete line, put it into buckets[-1] so that it can 349 compare equal only to the other file's incomplete line 350 (if one exists). */ 351 if (ig_white_space < IGNORE_TRAILING_SPACE) 352 bucket = &buckets[-1]; 353 } 354 355 for (i = *bucket; ; i = eqs[i].next) 356 if (!i) 357 { 358 /* Create a new equivalence class in this bucket. */ 359 i = eqs_index++; 360 if (i == eqs_alloc) 361 { 362 if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc) 363 xalloc_die (); 364 eqs_alloc *= 2; 365 eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs); 366 } 367 eqs[i].next = *bucket; 368 eqs[i].hash = h; 369 eqs[i].line = ip; 370 eqs[i].length = length; 371 *bucket = i; 372 break; 373 } 374 else if (eqs[i].hash == h) 375 { 376 char const *eqline = eqs[i].line; 377 378 /* Reuse existing class if lines_differ reports the lines 379 equal. */ 380 if (eqs[i].length == length) 381 { 382 /* Reuse existing equivalence class if the lines are identical. 383 This detects the common case of exact identity 384 faster than lines_differ would. */ 385 if (memcmp (eqline, ip, length) == 0) 386 break; 387 if (!same_length_diff_contents_compare_anyway) 388 continue; 389 } 390 else if (!diff_length_compare_anyway) 391 continue; 392 393 if (! lines_differ (eqline, ip)) 394 break; 395 } 396 397 /* Maybe increase the size of the line table. */ 398 if (line == alloc_lines) 399 { 400 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 401 if (PTRDIFF_MAX / 3 <= alloc_lines 402 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base 403 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base) 404 xalloc_die (); 405 alloc_lines = 2 * alloc_lines - linbuf_base; 406 cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs); 407 linbuf += linbuf_base; 408 linbuf = xrealloc (linbuf, 409 (alloc_lines - linbuf_base) * sizeof *linbuf); 410 linbuf -= linbuf_base; 411 } 412 linbuf[line] = ip; 413 cureqs[line] = i; 414 ++line; 415 } 416 417 current->buffered_lines = line; 418 419 for (i = 0; ; i++) 420 { 421 /* Record the line start for lines in the suffix that we care about. 422 Record one more line start than lines, 423 so that we can compute the length of any buffered line. */ 424 if (line == alloc_lines) 425 { 426 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */ 427 if (PTRDIFF_MAX / 3 <= alloc_lines 428 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base 429 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base) 430 xalloc_die (); 431 alloc_lines = 2 * alloc_lines - linbuf_base; 432 linbuf += linbuf_base; 433 linbuf = xrealloc (linbuf, 434 (alloc_lines - linbuf_base) * sizeof *linbuf); 435 linbuf -= linbuf_base; 436 } 437 linbuf[line] = p; 438 439 if (p == bufend) 440 { 441 /* If the last line is incomplete and we do not silently 442 complete lines, don't count its appended newline. */ 443 if (current->missing_newline && ROBUST_OUTPUT_STYLE (output_style)) 444 linbuf[line]--; 445 break; 446 } 447 448 if (context <= i && no_diff_means_no_output) 449 break; 450 451 line++; 452 453 while (*p++ != '\n') 454 continue; 455 } 456 457 /* Done with cache in local variables. */ 458 current->linbuf = linbuf; 459 current->valid_lines = line; 460 current->alloc_lines = alloc_lines; 461 current->equivs = cureqs; 462 equivs = eqs; 463 equivs_alloc = eqs_alloc; 464 equivs_index = eqs_index; 465 } 466 467 /* Prepare the text. Make sure the text end is initialized. 468 Make sure text ends in a newline, 469 but remember that we had to add one. 470 Strip trailing CRs, if that was requested. */ 471 472 static void 473 prepare_text (struct file_data *current) 474 { 475 size_t buffered = current->buffered; 476 char *p = FILE_BUFFER (current); 477 char *dst; 478 479 if (buffered == 0 || p[buffered - 1] == '\n') 480 current->missing_newline = false; 481 else 482 { 483 p[buffered++] = '\n'; 484 current->missing_newline = true; 485 } 486 487 if (!p) 488 return; 489 490 /* Don't use uninitialized storage when planting or using sentinels. */ 491 memset (p + buffered, 0, sizeof (word)); 492 493 if (strip_trailing_cr && (dst = memchr (p, '\r', buffered))) 494 { 495 char const *src = dst; 496 char const *srclim = p + buffered; 497 498 do 499 dst += ! ((*dst = *src++) == '\r' && *src == '\n'); 500 while (src < srclim); 501 502 buffered -= src - dst; 503 } 504 505 current->buffered = buffered; 506 } 507 508 /* We have found N lines in a buffer of size S; guess the 509 proportionate number of lines that will be found in a buffer of 510 size T. However, do not guess a number of lines so large that the 511 resulting line table might cause overflow in size calculations. */ 512 static lin 513 guess_lines (lin n, size_t s, size_t t) 514 { 515 size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1); 516 lin guessed_lines = MAX (1, t / guessed_bytes_per_line); 517 return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5; 518 } 519 520 /* Given a vector of two file_data objects, find the identical 521 prefixes and suffixes of each object. */ 522 523 static void 524 find_identical_ends (struct file_data filevec[]) 525 { 526 word *w0, *w1; 527 char *p0, *p1, *buffer0, *buffer1; 528 char const *end0, *beg0; 529 char const **linbuf0, **linbuf1; 530 lin i, lines; 531 size_t n0, n1; 532 lin alloc_lines0, alloc_lines1; 533 lin buffered_prefix, prefix_count, prefix_mask; 534 lin middle_guess, suffix_guess; 535 536 slurp (&filevec[0]); 537 prepare_text (&filevec[0]); 538 if (filevec[0].desc != filevec[1].desc) 539 { 540 slurp (&filevec[1]); 541 prepare_text (&filevec[1]); 542 } 543 else 544 { 545 filevec[1].buffer = filevec[0].buffer; 546 filevec[1].bufsize = filevec[0].bufsize; 547 filevec[1].buffered = filevec[0].buffered; 548 filevec[1].missing_newline = filevec[0].missing_newline; 549 } 550 551 /* Find identical prefix. */ 552 553 w0 = filevec[0].buffer; 554 w1 = filevec[1].buffer; 555 p0 = buffer0 = (char *) w0; 556 p1 = buffer1 = (char *) w1; 557 n0 = filevec[0].buffered; 558 n1 = filevec[1].buffered; 559 560 if (p0 == p1) 561 /* The buffers are the same; sentinels won't work. */ 562 p0 = p1 += n1; 563 else 564 { 565 /* Insert end sentinels, in this case characters that are guaranteed 566 to make the equality test false, and thus terminate the loop. */ 567 568 if (n0 < n1) 569 p0[n0] = ~p1[n0]; 570 else 571 p1[n1] = ~p0[n1]; 572 573 /* Loop until first mismatch, or to the sentinel characters. */ 574 575 /* Compare a word at a time for speed. */ 576 while (*w0 == *w1) 577 w0++, w1++; 578 579 /* Do the last few bytes of comparison a byte at a time. */ 580 p0 = (char *) w0; 581 p1 = (char *) w1; 582 while (*p0 == *p1) 583 p0++, p1++; 584 585 /* Don't mistakenly count missing newline as part of prefix. */ 586 if (ROBUST_OUTPUT_STYLE (output_style) 587 && ((buffer0 + n0 - filevec[0].missing_newline < p0) 588 != 589 (buffer1 + n1 - filevec[1].missing_newline < p1))) 590 p0--, p1--; 591 } 592 593 /* Now P0 and P1 point at the first nonmatching characters. */ 594 595 /* Skip back to last line-beginning in the prefix, 596 and then discard up to HORIZON_LINES lines from the prefix. */ 597 i = horizon_lines; 598 while (p0 != buffer0 && (p0[-1] != '\n' || i--)) 599 p0--, p1--; 600 601 /* Record the prefix. */ 602 filevec[0].prefix_end = p0; 603 filevec[1].prefix_end = p1; 604 605 /* Find identical suffix. */ 606 607 /* P0 and P1 point beyond the last chars not yet compared. */ 608 p0 = buffer0 + n0; 609 p1 = buffer1 + n1; 610 611 if (! ROBUST_OUTPUT_STYLE (output_style) 612 || filevec[0].missing_newline == filevec[1].missing_newline) 613 { 614 end0 = p0; /* Addr of last char in file 0. */ 615 616 /* Get value of P0 at which we should stop scanning backward: 617 this is when either P0 or P1 points just past the last char 618 of the identical prefix. */ 619 beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1); 620 621 /* Scan back until chars don't match or we reach that point. */ 622 while (p0 != beg0) 623 if (*--p0 != *--p1) 624 { 625 /* Point at the first char of the matching suffix. */ 626 ++p0, ++p1; 627 beg0 = p0; 628 break; 629 } 630 631 /* Are we at a line-beginning in both files? If not, add the rest of 632 this line to the main body. Discard up to HORIZON_LINES lines from 633 the identical suffix. Also, discard one extra line, 634 because shift_boundaries may need it. */ 635 i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n') 636 && 637 (buffer1 == p1 || p1[-1] == '\n')); 638 while (i-- && p0 != end0) 639 while (*p0++ != '\n') 640 continue; 641 642 p1 += p0 - beg0; 643 } 644 645 /* Record the suffix. */ 646 filevec[0].suffix_begin = p0; 647 filevec[1].suffix_begin = p1; 648 649 /* Calculate number of lines of prefix to save. 650 651 prefix_count == 0 means save the whole prefix; 652 we need this for options like -D that output the whole file, 653 or for enormous contexts (to avoid worrying about arithmetic overflow). 654 We also need it for options like -F that output some preceding line; 655 at least we will need to find the last few lines, 656 but since we don't know how many, it's easiest to find them all. 657 658 Otherwise, prefix_count != 0. Save just prefix_count lines at start 659 of the line buffer; they'll be moved to the proper location later. 660 Handle 1 more line than the context says (because we count 1 too many), 661 rounded up to the next power of 2 to speed index computation. */ 662 663 if (no_diff_means_no_output && ! function_regexp.fastmap 664 && context < LIN_MAX / 4 && context < n0) 665 { 666 middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end); 667 suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0); 668 for (prefix_count = 1; prefix_count <= context; prefix_count *= 2) 669 continue; 670 alloc_lines0 = (prefix_count + middle_guess 671 + MIN (context, suffix_guess)); 672 } 673 else 674 { 675 prefix_count = 0; 676 alloc_lines0 = guess_lines (0, 0, n0); 677 } 678 679 prefix_mask = prefix_count - 1; 680 lines = 0; 681 linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0); 682 p0 = buffer0; 683 684 /* If the prefix is needed, find the prefix lines. */ 685 if (! (no_diff_means_no_output 686 && filevec[0].prefix_end == p0 687 && filevec[1].prefix_end == p1)) 688 { 689 end0 = filevec[0].prefix_end; 690 while (p0 != end0) 691 { 692 lin l = lines++ & prefix_mask; 693 if (l == alloc_lines0) 694 { 695 if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0) 696 xalloc_die (); 697 alloc_lines0 *= 2; 698 linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0); 699 } 700 linbuf0[l] = p0; 701 while (*p0++ != '\n') 702 continue; 703 } 704 } 705 buffered_prefix = prefix_count && context < lines ? context : lines; 706 707 /* Allocate line buffer 1. */ 708 709 middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end); 710 suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1); 711 alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess); 712 if (alloc_lines1 < buffered_prefix 713 || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1) 714 xalloc_die (); 715 linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1); 716 717 if (buffered_prefix != lines) 718 { 719 /* Rotate prefix lines to proper location. */ 720 for (i = 0; i < buffered_prefix; i++) 721 linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask]; 722 for (i = 0; i < buffered_prefix; i++) 723 linbuf0[i] = linbuf1[i]; 724 } 725 726 /* Initialize line buffer 1 from line buffer 0. */ 727 for (i = 0; i < buffered_prefix; i++) 728 linbuf1[i] = linbuf0[i] - buffer0 + buffer1; 729 730 /* Record the line buffer, adjusted so that 731 linbuf[0] points at the first differing line. */ 732 filevec[0].linbuf = linbuf0 + buffered_prefix; 733 filevec[1].linbuf = linbuf1 + buffered_prefix; 734 filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix; 735 filevec[0].alloc_lines = alloc_lines0 - buffered_prefix; 736 filevec[1].alloc_lines = alloc_lines1 - buffered_prefix; 737 filevec[0].prefix_lines = filevec[1].prefix_lines = lines; 738 } 739 740 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less 741 than 2**k. This table is derived from Chris K. Caldwell's list 742 <http://www.utm.edu/research/primes/lists/2small/>. */ 743 744 static unsigned char const prime_offset[] = 745 { 746 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3, 747 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21, 748 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27, 749 55, 93, 1, 57, 25 750 }; 751 752 /* Verify that this host's size_t is not too wide for the above table. */ 753 754 verify (sizeof (size_t) * CHAR_BIT <= sizeof prime_offset); 755 756 /* Given a vector of two file_data objects, read the file associated 757 with each one, and build the table of equivalence classes. 758 Return nonzero if either file appears to be a binary file. 759 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */ 760 761 bool 762 read_files (struct file_data filevec[], bool pretend_binary) 763 { 764 int i; 765 bool skip_test = text | pretend_binary; 766 bool appears_binary = pretend_binary | sip (&filevec[0], skip_test); 767 768 if (filevec[0].desc != filevec[1].desc) 769 appears_binary |= sip (&filevec[1], skip_test | appears_binary); 770 else 771 { 772 filevec[1].buffer = filevec[0].buffer; 773 filevec[1].bufsize = filevec[0].bufsize; 774 filevec[1].buffered = filevec[0].buffered; 775 } 776 if (appears_binary) 777 { 778 set_binary_mode (filevec[0].desc, O_BINARY); 779 set_binary_mode (filevec[1].desc, O_BINARY); 780 return true; 781 } 782 783 find_identical_ends (filevec); 784 785 equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1; 786 if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc) 787 xalloc_die (); 788 equivs = xmalloc (equivs_alloc * sizeof *equivs); 789 /* Equivalence class 0 is permanently safe for lines that were not 790 hashed. Real equivalence classes start at 1. */ 791 equivs_index = 1; 792 793 /* Allocate (one plus) a prime number of hash buckets. Use a prime 794 number between 1/3 and 2/3 of the value of equiv_allocs, 795 approximately. */ 796 for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++) 797 continue; 798 nbuckets = ((size_t) 1 << i) - prime_offset[i]; 799 if (PTRDIFF_MAX / sizeof *buckets <= nbuckets) 800 xalloc_die (); 801 buckets = zalloc ((nbuckets + 1) * sizeof *buckets); 802 buckets++; 803 804 for (i = 0; i < 2; i++) 805 find_and_hash_each_line (&filevec[i]); 806 807 filevec[0].equiv_max = filevec[1].equiv_max = equivs_index; 808 809 free (equivs); 810 free (buckets - 1); 811 812 return false; 813 } 814