1 /* Analyze file differences for GNU DIFF. 2 Copyright (C) 1988, 1989, 1992, 1993, 1997 Free Software Foundation, Inc. 3 4 This file is part of GNU DIFF. 5 6 GNU DIFF is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 GNU DIFF is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 */ 17 18 /* The basic algorithm is described in: 19 "An O(ND) Difference Algorithm and its Variations", Eugene Myers, 20 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266; 21 see especially section 4.2, which describes the variation used below. 22 Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE 23 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) 24 at the price of producing suboptimal output for large inputs with 25 many differences. 26 27 The basic algorithm was independently discovered as described in: 28 "Algorithms for Approximate String Matching", E. Ukkonen, 29 Information and Control Vol. 64, 1985, pp. 100-118. */ 30 31 #include "diff.h" 32 #include "cmpbuf.h" 33 34 extern int no_discards; 35 36 static int *xvec, *yvec; /* Vectors being compared. */ 37 static int *fdiag; /* Vector, indexed by diagonal, containing 38 1 + the X coordinate of the point furthest 39 along the given diagonal in the forward 40 search of the edit matrix. */ 41 static int *bdiag; /* Vector, indexed by diagonal, containing 42 the X coordinate of the point furthest 43 along the given diagonal in the backward 44 search of the edit matrix. */ 45 static int too_expensive; /* Edit scripts longer than this are too 46 expensive to compute. */ 47 48 #define SNAKE_LIMIT 20 /* Snakes bigger than this are considered `big'. */ 49 50 struct partition 51 { 52 int xmid, ymid; /* Midpoints of this partition. */ 53 int lo_minimal; /* Nonzero if low half will be analyzed minimally. */ 54 int hi_minimal; /* Likewise for high half. */ 55 }; 56 57 static int diag PARAMS((int, int, int, int, int, struct partition *)); 58 static struct change *add_change PARAMS((int, int, int, int, struct change *)); 59 static struct change *build_reverse_script PARAMS((struct file_data const[])); 60 static struct change *build_script PARAMS((struct file_data const[])); 61 static void briefly_report PARAMS((int, struct file_data const[])); 62 static void compareseq PARAMS((int, int, int, int, int)); 63 static void discard_confusing_lines PARAMS((struct file_data[])); 64 static void shift_boundaries PARAMS((struct file_data[])); 65 66 /* Find the midpoint of the shortest edit script for a specified 67 portion of the two files. 68 69 Scan from the beginnings of the files, and simultaneously from the ends, 70 doing a breadth-first search through the space of edit-sequence. 71 When the two searches meet, we have found the midpoint of the shortest 72 edit sequence. 73 74 If MINIMAL is nonzero, find the minimal edit script regardless 75 of expense. Otherwise, if the search is too expensive, use 76 heuristics to stop the search and report a suboptimal answer. 77 78 Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal number 79 XMID - YMID equals the number of inserted lines minus the number 80 of deleted lines (counting only lines before the midpoint). 81 Return the approximate edit cost; this is the total number of 82 lines inserted or deleted (counting only lines before the midpoint), 83 unless a heuristic is used to terminate the search prematurely. 84 85 Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script for the 86 left half of the partition is known; similarly for PART->RIGHT_MINIMAL. 87 88 This function assumes that the first lines of the specified portions 89 of the two files do not match, and likewise that the last lines do not 90 match. The caller must trim matching lines from the beginning and end 91 of the portions it is going to specify. 92 93 If we return the "wrong" partitions, 94 the worst this can do is cause suboptimal diff output. 95 It cannot cause incorrect diff output. */ 96 97 static int 98 diag (xoff, xlim, yoff, ylim, minimal, part) 99 int xoff, xlim, yoff, ylim, minimal; 100 struct partition *part; 101 { 102 int *const fd = fdiag; /* Give the compiler a chance. */ 103 int *const bd = bdiag; /* Additional help for the compiler. */ 104 int const *const xv = xvec; /* Still more help for the compiler. */ 105 int const *const yv = yvec; /* And more and more . . . */ 106 int const dmin = xoff - ylim; /* Minimum valid diagonal. */ 107 int const dmax = xlim - yoff; /* Maximum valid diagonal. */ 108 int const fmid = xoff - yoff; /* Center diagonal of top-down search. */ 109 int const bmid = xlim - ylim; /* Center diagonal of bottom-up search. */ 110 int fmin = fmid, fmax = fmid; /* Limits of top-down search. */ 111 int bmin = bmid, bmax = bmid; /* Limits of bottom-up search. */ 112 int c; /* Cost. */ 113 int odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd 114 diagonal with respect to the northwest. */ 115 116 fd[fmid] = xoff; 117 bd[bmid] = xlim; 118 119 for (c = 1;; ++c) 120 { 121 int d; /* Active diagonal. */ 122 int big_snake = 0; 123 124 /* Extend the top-down search by an edit step in each diagonal. */ 125 fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin; 126 fmax < dmax ? fd[++fmax + 1] = -1 : --fmax; 127 for (d = fmax; d >= fmin; d -= 2) 128 { 129 int x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1]; 130 131 if (tlo >= thi) 132 x = tlo + 1; 133 else 134 x = thi; 135 oldx = x; 136 y = x - d; 137 while (x < xlim && y < ylim && xv[x] == yv[y]) 138 ++x, ++y; 139 if (x - oldx > SNAKE_LIMIT) 140 big_snake = 1; 141 fd[d] = x; 142 if (odd && bmin <= d && d <= bmax && bd[d] <= x) 143 { 144 part->xmid = x; 145 part->ymid = y; 146 part->lo_minimal = part->hi_minimal = 1; 147 return 2 * c - 1; 148 } 149 } 150 151 /* Similarly extend the bottom-up search. */ 152 bmin > dmin ? bd[--bmin - 1] = INT_MAX : ++bmin; 153 bmax < dmax ? bd[++bmax + 1] = INT_MAX : --bmax; 154 for (d = bmax; d >= bmin; d -= 2) 155 { 156 int x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1]; 157 158 if (tlo < thi) 159 x = tlo; 160 else 161 x = thi - 1; 162 oldx = x; 163 y = x - d; 164 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) 165 --x, --y; 166 if (oldx - x > SNAKE_LIMIT) 167 big_snake = 1; 168 bd[d] = x; 169 if (!odd && fmin <= d && d <= fmax && x <= fd[d]) 170 { 171 part->xmid = x; 172 part->ymid = y; 173 part->lo_minimal = part->hi_minimal = 1; 174 return 2 * c; 175 } 176 } 177 178 if (minimal) 179 continue; 180 181 /* Heuristic: check occasionally for a diagonal that has made 182 lots of progress compared with the edit distance. 183 If we have any such, find the one that has made the most 184 progress and return it as if it had succeeded. 185 186 With this heuristic, for files with a constant small density 187 of changes, the algorithm is linear in the file size. */ 188 189 if (c > 200 && big_snake && heuristic) 190 { 191 int best; 192 193 best = 0; 194 for (d = fmax; d >= fmin; d -= 2) 195 { 196 int dd = d - fmid; 197 int x = fd[d]; 198 int y = x - d; 199 int v = (x - xoff) * 2 - dd; 200 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 201 { 202 if (v > best 203 && xoff + SNAKE_LIMIT <= x && x < xlim 204 && yoff + SNAKE_LIMIT <= y && y < ylim) 205 { 206 /* We have a good enough best diagonal; 207 now insist that it end with a significant snake. */ 208 int k; 209 210 for (k = 1; xv[x - k] == yv[y - k]; k++) 211 if (k == SNAKE_LIMIT) 212 { 213 best = v; 214 part->xmid = x; 215 part->ymid = y; 216 break; 217 } 218 } 219 } 220 } 221 if (best > 0) 222 { 223 part->lo_minimal = 1; 224 part->hi_minimal = 0; 225 return 2 * c - 1; 226 } 227 228 best = 0; 229 for (d = bmax; d >= bmin; d -= 2) 230 { 231 int dd = d - bmid; 232 int x = bd[d]; 233 int y = x - d; 234 int v = (xlim - x) * 2 + dd; 235 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 236 { 237 if (v > best 238 && xoff < x && x <= xlim - SNAKE_LIMIT 239 && yoff < y && y <= ylim - SNAKE_LIMIT) 240 { 241 /* We have a good enough best diagonal; 242 now insist that it end with a significant snake. */ 243 int k; 244 245 for (k = 0; xv[x + k] == yv[y + k]; k++) 246 if (k == SNAKE_LIMIT - 1) 247 { 248 best = v; 249 part->xmid = x; 250 part->ymid = y; 251 break; 252 } 253 } 254 } 255 } 256 if (best > 0) 257 { 258 part->lo_minimal = 0; 259 part->hi_minimal = 1; 260 return 2 * c - 1; 261 } 262 } 263 264 /* Heuristic: if we've gone well beyond the call of duty, 265 give up and report halfway between our best results so far. */ 266 if (c >= too_expensive) 267 { 268 int fxybest, fxbest; 269 int bxybest, bxbest; 270 271 fxbest = bxbest = 0; /* Pacify `gcc -Wall'. */ 272 273 /* Find forward diagonal that maximizes X + Y. */ 274 fxybest = -1; 275 for (d = fmax; d >= fmin; d -= 2) 276 { 277 int x = min (fd[d], xlim); 278 int y = x - d; 279 if (ylim < y) 280 x = ylim + d, y = ylim; 281 if (fxybest < x + y) 282 { 283 fxybest = x + y; 284 fxbest = x; 285 } 286 } 287 288 /* Find backward diagonal that minimizes X + Y. */ 289 bxybest = INT_MAX; 290 for (d = bmax; d >= bmin; d -= 2) 291 { 292 int x = max (xoff, bd[d]); 293 int y = x - d; 294 if (y < yoff) 295 x = yoff + d, y = yoff; 296 if (x + y < bxybest) 297 { 298 bxybest = x + y; 299 bxbest = x; 300 } 301 } 302 303 /* Use the better of the two diagonals. */ 304 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) 305 { 306 part->xmid = fxbest; 307 part->ymid = fxybest - fxbest; 308 part->lo_minimal = 1; 309 part->hi_minimal = 0; 310 } 311 else 312 { 313 part->xmid = bxbest; 314 part->ymid = bxybest - bxbest; 315 part->lo_minimal = 0; 316 part->hi_minimal = 1; 317 } 318 return 2 * c - 1; 319 } 320 } 321 } 322 323 /* Compare in detail contiguous subsequences of the two files 324 which are known, as a whole, to match each other. 325 326 The results are recorded in the vectors files[N].changed_flag, by 327 storing a 1 in the element for each line that is an insertion or deletion. 328 329 The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1. 330 331 Note that XLIM, YLIM are exclusive bounds. 332 All line numbers are origin-0 and discarded lines are not counted. 333 334 If MINIMAL is nonzero, find a minimal difference no matter how 335 expensive it is. */ 336 337 static void 338 compareseq (xoff, xlim, yoff, ylim, minimal) 339 int xoff, xlim, yoff, ylim, minimal; 340 { 341 int * const xv = xvec; /* Help the compiler. */ 342 int * const yv = yvec; 343 344 /* Slide down the bottom initial diagonal. */ 345 while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) 346 ++xoff, ++yoff; 347 /* Slide up the top initial diagonal. */ 348 while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) 349 --xlim, --ylim; 350 351 /* Handle simple cases. */ 352 if (xoff == xlim) 353 while (yoff < ylim) 354 files[1].changed_flag[files[1].realindexes[yoff++]] = 1; 355 else if (yoff == ylim) 356 while (xoff < xlim) 357 files[0].changed_flag[files[0].realindexes[xoff++]] = 1; 358 else 359 { 360 int c; 361 struct partition part; 362 363 /* Find a point of correspondence in the middle of the files. */ 364 365 c = diag (xoff, xlim, yoff, ylim, minimal, &part); 366 367 if (c == 1) 368 { 369 /* This should be impossible, because it implies that 370 one of the two subsequences is empty, 371 and that case was handled above without calling `diag'. 372 Let's verify that this is true. */ 373 abort (); 374 #if 0 375 /* The two subsequences differ by a single insert or delete; 376 record it and we are done. */ 377 if (part.xmid - part.ymid < xoff - yoff) 378 files[1].changed_flag[files[1].realindexes[part.ymid - 1]] = 1; 379 else 380 files[0].changed_flag[files[0].realindexes[part.xmid]] = 1; 381 #endif 382 } 383 else 384 { 385 /* Use the partitions to split this problem into subproblems. */ 386 compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal); 387 compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal); 388 } 389 } 390 } 391 392 /* Discard lines from one file that have no matches in the other file. 393 394 A line which is discarded will not be considered by the actual 395 comparison algorithm; it will be as if that line were not in the file. 396 The file's `realindexes' table maps virtual line numbers 397 (which don't count the discarded lines) into real line numbers; 398 this is how the actual comparison algorithm produces results 399 that are comprehensible when the discarded lines are counted. 400 401 When we discard a line, we also mark it as a deletion or insertion 402 so that it will be printed in the output. */ 403 404 static void 405 discard_confusing_lines (filevec) 406 struct file_data filevec[]; 407 { 408 unsigned int f, i; 409 char *discarded[2]; 410 int *equiv_count[2]; 411 int *p; 412 413 /* Allocate our results. */ 414 p = (int *) xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines) 415 * (2 * sizeof (int))); 416 for (f = 0; f < 2; f++) 417 { 418 filevec[f].undiscarded = p; p += filevec[f].buffered_lines; 419 filevec[f].realindexes = p; p += filevec[f].buffered_lines; 420 } 421 422 /* Set up equiv_count[F][I] as the number of lines in file F 423 that fall in equivalence class I. */ 424 425 p = (int *) xmalloc (filevec[0].equiv_max * (2 * sizeof (int))); 426 equiv_count[0] = p; 427 equiv_count[1] = p + filevec[0].equiv_max; 428 bzero (p, filevec[0].equiv_max * (2 * sizeof (int))); 429 430 for (i = 0; i < filevec[0].buffered_lines; ++i) 431 ++equiv_count[0][filevec[0].equivs[i]]; 432 for (i = 0; i < filevec[1].buffered_lines; ++i) 433 ++equiv_count[1][filevec[1].equivs[i]]; 434 435 /* Set up tables of which lines are going to be discarded. */ 436 437 discarded[0] = xmalloc (sizeof (char) 438 * (filevec[0].buffered_lines 439 + filevec[1].buffered_lines)); 440 discarded[1] = discarded[0] + filevec[0].buffered_lines; 441 bzero (discarded[0], sizeof (char) * (filevec[0].buffered_lines 442 + filevec[1].buffered_lines)); 443 444 /* Mark to be discarded each line that matches no line of the other file. 445 If a line matches many lines, mark it as provisionally discardable. */ 446 447 for (f = 0; f < 2; f++) 448 { 449 unsigned int end = filevec[f].buffered_lines; 450 char *discards = discarded[f]; 451 int *counts = equiv_count[1 - f]; 452 int *equivs = filevec[f].equivs; 453 unsigned int many = 5; 454 unsigned int tem = end / 64; 455 456 /* Multiply MANY by approximate square root of number of lines. 457 That is the threshold for provisionally discardable lines. */ 458 while ((tem = tem >> 2) > 0) 459 many *= 2; 460 461 for (i = 0; i < end; i++) 462 { 463 int nmatch; 464 if (equivs[i] == 0) 465 continue; 466 nmatch = counts[equivs[i]]; 467 if (nmatch == 0) 468 discards[i] = 1; 469 else if (nmatch > many) 470 discards[i] = 2; 471 } 472 } 473 474 /* Don't really discard the provisional lines except when they occur 475 in a run of discardables, with nonprovisionals at the beginning 476 and end. */ 477 478 for (f = 0; f < 2; f++) 479 { 480 unsigned int end = filevec[f].buffered_lines; 481 register char *discards = discarded[f]; 482 483 for (i = 0; i < end; i++) 484 { 485 /* Cancel provisional discards not in middle of run of discards. */ 486 if (discards[i] == 2) 487 discards[i] = 0; 488 else if (discards[i] != 0) 489 { 490 /* We have found a nonprovisional discard. */ 491 register int j; 492 unsigned int length; 493 unsigned int provisional = 0; 494 495 /* Find end of this run of discardable lines. 496 Count how many are provisionally discardable. */ 497 for (j = i; j < end; j++) 498 { 499 if (discards[j] == 0) 500 break; 501 if (discards[j] == 2) 502 ++provisional; 503 } 504 505 /* Cancel provisional discards at end, and shrink the run. */ 506 while (j > i && discards[j - 1] == 2) 507 discards[--j] = 0, --provisional; 508 509 /* Now we have the length of a run of discardable lines 510 whose first and last are not provisional. */ 511 length = j - i; 512 513 /* If 1/4 of the lines in the run are provisional, 514 cancel discarding of all provisional lines in the run. */ 515 if (provisional * 4 > length) 516 { 517 while (j > i) 518 if (discards[--j] == 2) 519 discards[j] = 0; 520 } 521 else 522 { 523 register unsigned int consec; 524 unsigned int minimum = 1; 525 unsigned int tem = length / 4; 526 527 /* MINIMUM is approximate square root of LENGTH/4. 528 A subrun of two or more provisionals can stand 529 when LENGTH is at least 16. 530 A subrun of 4 or more can stand when LENGTH >= 64. */ 531 while ((tem = tem >> 2) > 0) 532 minimum *= 2; 533 minimum++; 534 535 /* Cancel any subrun of MINIMUM or more provisionals 536 within the larger run. */ 537 for (j = 0, consec = 0; j < length; j++) 538 if (discards[i + j] != 2) 539 consec = 0; 540 else if (minimum == ++consec) 541 /* Back up to start of subrun, to cancel it all. */ 542 j -= consec; 543 else if (minimum < consec) 544 discards[i + j] = 0; 545 546 /* Scan from beginning of run 547 until we find 3 or more nonprovisionals in a row 548 or until the first nonprovisional at least 8 lines in. 549 Until that point, cancel any provisionals. */ 550 for (j = 0, consec = 0; j < length; j++) 551 { 552 if (j >= 8 && discards[i + j] == 1) 553 break; 554 if (discards[i + j] == 2) 555 consec = 0, discards[i + j] = 0; 556 else if (discards[i + j] == 0) 557 consec = 0; 558 else 559 consec++; 560 if (consec == 3) 561 break; 562 } 563 564 /* I advances to the last line of the run. */ 565 i += length - 1; 566 567 /* Same thing, from end. */ 568 for (j = 0, consec = 0; j < length; j++) 569 { 570 if (j >= 8 && discards[i - j] == 1) 571 break; 572 if (discards[i - j] == 2) 573 consec = 0, discards[i - j] = 0; 574 else if (discards[i - j] == 0) 575 consec = 0; 576 else 577 consec++; 578 if (consec == 3) 579 break; 580 } 581 } 582 } 583 } 584 } 585 586 /* Actually discard the lines. */ 587 for (f = 0; f < 2; f++) 588 { 589 char *discards = discarded[f]; 590 unsigned int end = filevec[f].buffered_lines; 591 unsigned int j = 0; 592 for (i = 0; i < end; ++i) 593 if (no_discards || discards[i] == 0) 594 { 595 filevec[f].undiscarded[j] = filevec[f].equivs[i]; 596 filevec[f].realindexes[j++] = i; 597 } 598 else 599 filevec[f].changed_flag[i] = 1; 600 filevec[f].nondiscarded_lines = j; 601 } 602 603 free (discarded[0]); 604 free (equiv_count[0]); 605 } 606 607 /* Adjust inserts/deletes of identical lines to join changes 608 as much as possible. 609 610 We do something when a run of changed lines include a 611 line at one end and have an excluded, identical line at the other. 612 We are free to choose which identical line is included. 613 `compareseq' usually chooses the one at the beginning, 614 but usually it is cleaner to consider the following identical line 615 to be the "change". */ 616 617 int inhibit; 618 619 static void 620 shift_boundaries (filevec) 621 struct file_data filevec[]; 622 { 623 int f; 624 625 if (inhibit) 626 return; 627 628 for (f = 0; f < 2; f++) 629 { 630 char *changed = filevec[f].changed_flag; 631 char const *other_changed = filevec[1-f].changed_flag; 632 int const *equivs = filevec[f].equivs; 633 int i = 0; 634 int j = 0; 635 int i_end = filevec[f].buffered_lines; 636 637 while (1) 638 { 639 int runlength, start, corresponding; 640 641 /* Scan forwards to find beginning of another run of changes. 642 Also keep track of the corresponding point in the other file. */ 643 644 while (i < i_end && changed[i] == 0) 645 { 646 while (other_changed[j++]) 647 continue; 648 i++; 649 } 650 651 if (i == i_end) 652 break; 653 654 start = i; 655 656 /* Find the end of this run of changes. */ 657 658 while (changed[++i]) 659 continue; 660 while (other_changed[j]) 661 j++; 662 663 do 664 { 665 /* Record the length of this run of changes, so that 666 we can later determine whether the run has grown. */ 667 runlength = i - start; 668 669 /* Move the changed region back, so long as the 670 previous unchanged line matches the last changed one. 671 This merges with previous changed regions. */ 672 673 while (start && equivs[start - 1] == equivs[i - 1]) 674 { 675 changed[--start] = 1; 676 changed[--i] = 0; 677 while (changed[start - 1]) 678 start--; 679 while (other_changed[--j]) 680 continue; 681 } 682 683 /* Set CORRESPONDING to the end of the changed run, at the last 684 point where it corresponds to a changed run in the other file. 685 CORRESPONDING == I_END means no such point has been found. */ 686 corresponding = other_changed[j - 1] ? i : i_end; 687 688 /* Move the changed region forward, so long as the 689 first changed line matches the following unchanged one. 690 This merges with following changed regions. 691 Do this second, so that if there are no merges, 692 the changed region is moved forward as far as possible. */ 693 694 while (i != i_end && equivs[start] == equivs[i]) 695 { 696 changed[start++] = 0; 697 changed[i++] = 1; 698 while (changed[i]) 699 i++; 700 while (other_changed[++j]) 701 corresponding = i; 702 } 703 } 704 while (runlength != i - start); 705 706 /* If possible, move the fully-merged run of changes 707 back to a corresponding run in the other file. */ 708 709 while (corresponding < i) 710 { 711 changed[--start] = 1; 712 changed[--i] = 0; 713 while (other_changed[--j]) 714 continue; 715 } 716 } 717 } 718 } 719 720 /* Cons an additional entry onto the front of an edit script OLD. 721 LINE0 and LINE1 are the first affected lines in the two files (origin 0). 722 DELETED is the number of lines deleted here from file 0. 723 INSERTED is the number of lines inserted here in file 1. 724 725 If DELETED is 0 then LINE0 is the number of the line before 726 which the insertion was done; vice versa for INSERTED and LINE1. */ 727 728 static struct change * 729 add_change (line0, line1, deleted, inserted, old) 730 int line0, line1, deleted, inserted; 731 struct change *old; 732 { 733 struct change *new = (struct change *) xmalloc (sizeof (struct change)); 734 735 new->line0 = line0; 736 new->line1 = line1; 737 new->inserted = inserted; 738 new->deleted = deleted; 739 new->link = old; 740 return new; 741 } 742 743 /* Scan the tables of which lines are inserted and deleted, 744 producing an edit script in reverse order. */ 745 746 static struct change * 747 build_reverse_script (filevec) 748 struct file_data const filevec[]; 749 { 750 struct change *script = 0; 751 char *changed0 = filevec[0].changed_flag; 752 char *changed1 = filevec[1].changed_flag; 753 int len0 = filevec[0].buffered_lines; 754 int len1 = filevec[1].buffered_lines; 755 756 /* Note that changedN[len0] does exist, and contains 0. */ 757 758 int i0 = 0, i1 = 0; 759 760 while (i0 < len0 || i1 < len1) 761 { 762 if (changed0[i0] || changed1[i1]) 763 { 764 int line0 = i0, line1 = i1; 765 766 /* Find # lines changed here in each file. */ 767 while (changed0[i0]) ++i0; 768 while (changed1[i1]) ++i1; 769 770 /* Record this change. */ 771 script = add_change (line0, line1, i0 - line0, i1 - line1, script); 772 } 773 774 /* We have reached lines in the two files that match each other. */ 775 i0++, i1++; 776 } 777 778 return script; 779 } 780 781 /* Scan the tables of which lines are inserted and deleted, 782 producing an edit script in forward order. */ 783 784 static struct change * 785 build_script (filevec) 786 struct file_data const filevec[]; 787 { 788 struct change *script = 0; 789 char *changed0 = filevec[0].changed_flag; 790 char *changed1 = filevec[1].changed_flag; 791 int i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines; 792 793 /* Note that changedN[-1] does exist, and contains 0. */ 794 795 while (i0 >= 0 || i1 >= 0) 796 { 797 if (changed0[i0 - 1] || changed1[i1 - 1]) 798 { 799 int line0 = i0, line1 = i1; 800 801 /* Find # lines changed here in each file. */ 802 while (changed0[i0 - 1]) --i0; 803 while (changed1[i1 - 1]) --i1; 804 805 /* Record this change. */ 806 script = add_change (i0, i1, line0 - i0, line1 - i1, script); 807 } 808 809 /* We have reached lines in the two files that match each other. */ 810 i0--, i1--; 811 } 812 813 return script; 814 } 815 816 /* If CHANGES, briefly report that two files differed. */ 817 static void 818 briefly_report (changes, filevec) 819 int changes; 820 struct file_data const filevec[]; 821 { 822 if (changes) 823 message (no_details_flag ? "Files %s and %s differ\n" 824 : "Binary files %s and %s differ\n", 825 filevec[0].name, filevec[1].name); 826 } 827 828 /* Report the differences of two files. DEPTH is the current directory 829 depth. */ 830 int 831 diff_2_files (filevec, depth) 832 struct file_data filevec[]; 833 int depth; 834 { 835 int diags; 836 int i; 837 struct change *e, *p; 838 struct change *script; 839 int changes; 840 841 842 /* If we have detected that either file is binary, 843 compare the two files as binary. This can happen 844 only when the first chunk is read. 845 Also, --brief without any --ignore-* options means 846 we can speed things up by treating the files as binary. */ 847 848 if (read_files (filevec, no_details_flag & ~ignore_some_changes)) 849 { 850 /* Files with different lengths must be different. */ 851 if (filevec[0].stat.st_size != filevec[1].stat.st_size 852 && (filevec[0].desc < 0 || S_ISREG (filevec[0].stat.st_mode)) 853 && (filevec[1].desc < 0 || S_ISREG (filevec[1].stat.st_mode))) 854 changes = 1; 855 856 /* Standard input equals itself. */ 857 else if (filevec[0].desc == filevec[1].desc) 858 changes = 0; 859 860 else 861 /* Scan both files, a buffer at a time, looking for a difference. */ 862 { 863 /* Allocate same-sized buffers for both files. */ 864 size_t buffer_size = buffer_lcm (STAT_BLOCKSIZE (filevec[0].stat), 865 STAT_BLOCKSIZE (filevec[1].stat)); 866 for (i = 0; i < 2; i++) 867 filevec[i].buffer = xrealloc (filevec[i].buffer, buffer_size); 868 869 for (;; filevec[0].buffered_chars = filevec[1].buffered_chars = 0) 870 { 871 /* Read a buffer's worth from both files. */ 872 for (i = 0; i < 2; i++) 873 if (0 <= filevec[i].desc) 874 while (filevec[i].buffered_chars != buffer_size) 875 { 876 int r = read (filevec[i].desc, 877 filevec[i].buffer 878 + filevec[i].buffered_chars, 879 buffer_size - filevec[i].buffered_chars); 880 if (r == 0) 881 break; 882 if (r < 0) 883 pfatal_with_name (filevec[i].name); 884 filevec[i].buffered_chars += r; 885 } 886 887 /* If the buffers differ, the files differ. */ 888 if (filevec[0].buffered_chars != filevec[1].buffered_chars 889 || (filevec[0].buffered_chars != 0 890 && memcmp (filevec[0].buffer, 891 filevec[1].buffer, 892 filevec[0].buffered_chars) != 0)) 893 { 894 changes = 1; 895 break; 896 } 897 898 /* If we reach end of file, the files are the same. */ 899 if (filevec[0].buffered_chars != buffer_size) 900 { 901 changes = 0; 902 break; 903 } 904 } 905 } 906 907 briefly_report (changes, filevec); 908 } 909 else 910 { 911 /* Allocate vectors for the results of comparison: 912 a flag for each line of each file, saying whether that line 913 is an insertion or deletion. 914 Allocate an extra element, always zero, at each end of each vector. */ 915 916 size_t s = filevec[0].buffered_lines + filevec[1].buffered_lines + 4; 917 filevec[0].changed_flag = xmalloc (s); 918 bzero (filevec[0].changed_flag, s); 919 filevec[0].changed_flag++; 920 filevec[1].changed_flag = filevec[0].changed_flag 921 + filevec[0].buffered_lines + 2; 922 923 /* Some lines are obviously insertions or deletions 924 because they don't match anything. Detect them now, and 925 avoid even thinking about them in the main comparison algorithm. */ 926 927 discard_confusing_lines (filevec); 928 929 /* Now do the main comparison algorithm, considering just the 930 undiscarded lines. */ 931 932 xvec = filevec[0].undiscarded; 933 yvec = filevec[1].undiscarded; 934 diags = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3; 935 fdiag = (int *) xmalloc (diags * (2 * sizeof (int))); 936 bdiag = fdiag + diags; 937 fdiag += filevec[1].nondiscarded_lines + 1; 938 bdiag += filevec[1].nondiscarded_lines + 1; 939 940 /* Set TOO_EXPENSIVE to be approximate square root of input size, 941 bounded below by 256. */ 942 too_expensive = 1; 943 for (i = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines; 944 i != 0; i >>= 2) 945 too_expensive <<= 1; 946 too_expensive = max (256, too_expensive); 947 948 files[0] = filevec[0]; 949 files[1] = filevec[1]; 950 951 compareseq (0, filevec[0].nondiscarded_lines, 952 0, filevec[1].nondiscarded_lines, no_discards); 953 954 free (fdiag - (filevec[1].nondiscarded_lines + 1)); 955 956 /* Modify the results slightly to make them prettier 957 in cases where that can validly be done. */ 958 959 shift_boundaries (filevec); 960 961 /* Get the results of comparison in the form of a chain 962 of `struct change's -- an edit script. */ 963 964 if (output_style == OUTPUT_ED) 965 script = build_reverse_script (filevec); 966 else 967 script = build_script (filevec); 968 969 /* Set CHANGES if we had any diffs. 970 If some changes are ignored, we must scan the script to decide. */ 971 if (ignore_blank_lines_flag || ignore_regexp_list) 972 { 973 struct change *next = script; 974 changes = 0; 975 976 while (next && changes == 0) 977 { 978 struct change *this, *end; 979 int first0, last0, first1, last1, deletes, inserts; 980 981 /* Find a set of changes that belong together. */ 982 this = next; 983 end = find_change (next); 984 985 /* Disconnect them from the rest of the changes, making them 986 a hunk, and remember the rest for next iteration. */ 987 next = end->link; 988 end->link = 0; 989 990 /* Determine whether this hunk is really a difference. */ 991 analyze_hunk (this, &first0, &last0, &first1, &last1, 992 &deletes, &inserts); 993 994 /* Reconnect the script so it will all be freed properly. */ 995 end->link = next; 996 997 if (deletes || inserts) 998 changes = 1; 999 } 1000 } 1001 else 1002 changes = (script != 0); 1003 1004 if (no_details_flag) 1005 briefly_report (changes, filevec); 1006 else 1007 { 1008 if (changes || ! no_diff_means_no_output) 1009 { 1010 /* Record info for starting up output, 1011 to be used if and when we have some output to print. */ 1012 setup_output (files[0].name, files[1].name, depth); 1013 1014 switch (output_style) 1015 { 1016 case OUTPUT_CONTEXT: 1017 print_context_script (script, 0); 1018 break; 1019 1020 case OUTPUT_UNIFIED: 1021 print_context_script (script, 1); 1022 break; 1023 1024 case OUTPUT_ED: 1025 print_ed_script (script); 1026 break; 1027 1028 case OUTPUT_FORWARD_ED: 1029 pr_forward_ed_script (script); 1030 break; 1031 1032 case OUTPUT_RCS: 1033 print_rcs_script (script); 1034 break; 1035 1036 case OUTPUT_NORMAL: 1037 print_normal_script (script); 1038 break; 1039 1040 case OUTPUT_IFDEF: 1041 print_ifdef_script (script); 1042 break; 1043 1044 case OUTPUT_SDIFF: 1045 print_sdiff_script (script); 1046 } 1047 1048 finish_output (); 1049 } 1050 } 1051 1052 free (filevec[0].undiscarded); 1053 1054 free (filevec[0].changed_flag - 1); 1055 1056 for (i = 1; i >= 0; --i) 1057 free (filevec[i].equivs); 1058 1059 for (i = 0; i < 2; ++i) 1060 free (filevec[i].linbuf + filevec[i].linbuf_base); 1061 1062 for (e = script; e; e = p) 1063 { 1064 p = e->link; 1065 free (e); 1066 } 1067 1068 if (! ROBUST_OUTPUT_STYLE (output_style)) 1069 for (i = 0; i < 2; ++i) 1070 if (filevec[i].missing_newline) 1071 { 1072 diff_error ("No newline at end of file %s", filevec[i].name, ""); 1073 changes = 2; 1074 } 1075 } 1076 1077 if (filevec[0].buffer != filevec[1].buffer) 1078 free (filevec[0].buffer); 1079 free (filevec[1].buffer); 1080 1081 return changes; 1082 } 1083