1 static char sccsid[] = "@(#)diffreg.c 4.6 11/08/82"; 2 3 #include "diff.h" 4 /* 5 * diff - compare two files. 6 */ 7 8 /* 9 * Uses an algorithm due to Harold Stone, which finds 10 * a pair of longest identical subsequences in the two 11 * files. 12 * 13 * The major goal is to generate the match vector J. 14 * J[i] is the index of the line in file1 corresponding 15 * to line i file0. J[i] = 0 if there is no 16 * such line in file1. 17 * 18 * Lines are hashed so as to work in core. All potential 19 * matches are located by sorting the lines of each file 20 * on the hash (called ``value''). In particular, this 21 * collects the equivalence classes in file1 together. 22 * Subroutine equiv replaces the value of each line in 23 * file0 by the index of the first element of its 24 * matching equivalence in (the reordered) file1. 25 * To save space equiv squeezes file1 into a single 26 * array member in which the equivalence classes 27 * are simply concatenated, except that their first 28 * members are flagged by changing sign. 29 * 30 * Next the indices that point into member are unsorted into 31 * array class according to the original order of file0. 32 * 33 * The cleverness lies in routine stone. This marches 34 * through the lines of file0, developing a vector klist 35 * of "k-candidates". At step i a k-candidate is a matched 36 * pair of lines x,y (x in file0 y in file1) such that 37 * there is a common subsequence of length k 38 * between the first i lines of file0 and the first y 39 * lines of file1, but there is no such subsequence for 40 * any smaller y. x is the earliest possible mate to y 41 * that occurs in such a subsequence. 42 * 43 * Whenever any of the members of the equivalence class of 44 * lines in file1 matable to a line in file0 has serial number 45 * less than the y of some k-candidate, that k-candidate 46 * with the smallest such y is replaced. The new 47 * k-candidate is chained (via pred) to the current 48 * k-1 candidate so that the actual subsequence can 49 * be recovered. When a member has serial number greater 50 * that the y of all k-candidates, the klist is extended. 51 * At the end, the longest subsequence is pulled out 52 * and placed in the array J by unravel 53 * 54 * With J in hand, the matches there recorded are 55 * check'ed against reality to assure that no spurious 56 * matches have crept in due to hashing. If they have, 57 * they are broken, and "jackpot" is recorded--a harmless 58 * matter except that a true match for a spuriously 59 * mated line may now be unnecessarily reported as a change. 60 * 61 * Much of the complexity of the program comes simply 62 * from trying to minimize core utilization and 63 * maximize the range of doable problems by dynamically 64 * allocating what is needed and reusing what is not. 65 * The core requirements for problems larger than somewhat 66 * are (in words) 2*length(file0) + length(file1) + 67 * 3*(number of k-candidates installed), typically about 68 * 6n words for files of length n. 69 */ 70 71 #define prints(s) fputs(s,stdout) 72 73 FILE *input[2]; 74 FILE *fopen(); 75 76 struct cand { 77 int x; 78 int y; 79 int pred; 80 } cand; 81 struct line { 82 int serial; 83 int value; 84 } *file[2], line; 85 int len[2]; 86 struct line *sfile[2]; /* shortened by pruning common prefix and suffix */ 87 int slen[2]; 88 int pref, suff; /* length of prefix and suffix */ 89 int *class; /* will be overlaid on file[0] */ 90 int *member; /* will be overlaid on file[1] */ 91 int *klist; /* will be overlaid on file[0] after class */ 92 struct cand *clist; /* merely a free storage pot for candidates */ 93 int clen = 0; 94 int *J; /* will be overlaid on class */ 95 long *ixold; /* will be overlaid on klist */ 96 long *ixnew; /* will be overlaid on file[1] */ 97 98 diffreg() 99 { 100 register int i, j; 101 FILE *f1, *f2; 102 char buf1[BUFSIZ], buf2[BUFSIZ]; 103 104 if (hflag) { 105 diffargv[0] = "diffh"; 106 execv(diffh, diffargv); 107 fprintf(stderr, "diff: "); 108 perror(diffh); 109 done(); 110 } 111 dummy = malloc(1); 112 if ((stb1.st_mode & S_IFMT) == S_IFDIR) 113 file1 = splice(file1, file2); 114 else if ((stb2.st_mode & S_IFMT) == S_IFDIR) 115 file2 = splice(file2, file1); 116 else if (!strcmp(file1, "-")) { 117 if (!strcmp(file2, "-")) { 118 fprintf(stderr, "diff: can't specify - -\n"); 119 done(); 120 } 121 file1 = copytemp(); 122 } else if (!strcmp(file2, "-")) 123 file2 = copytemp(); 124 if ((f1 = fopen(file1, "r")) == NULL) { 125 fprintf(stderr, "diff: "); 126 perror(file1); 127 fclose(f1); 128 done(); 129 } 130 if ((f2 = fopen(file2, "r")) == NULL) { 131 fprintf(stderr, "diff: "); 132 perror(file2); 133 fclose(f1); 134 fclose(f2); 135 done(); 136 } 137 if (stb1.st_size != stb2.st_size) 138 goto notsame; 139 for (;;) { 140 i = fread(buf1, 1, BUFSIZ, f1); 141 j = fread(buf2, 1, BUFSIZ, f2); 142 if (i < 0 || j < 0 || i != j) 143 goto notsame; 144 if (i == 0 && j == 0) { 145 fclose(f1); 146 fclose(f2); 147 status = 0; /* files don't differ */ 148 goto same; 149 } 150 for (j = 0; j < i; j++) 151 if (buf1[j] != buf2[j]) 152 goto notsame; 153 } 154 notsame: 155 /* 156 * Files certainly differ at this point; set status accordingly 157 */ 158 status = 1; 159 if (!asciifile(f1) || !asciifile(f2)) { 160 printf("Binary files %s and %s differ\n", file1, file2); 161 fclose(f1); 162 fclose(f2); 163 done(); 164 } 165 prepare(0, f1); 166 prepare(1, f2); 167 fclose(f1); 168 fclose(f2); 169 prune(); 170 sort(sfile[0],slen[0]); 171 sort(sfile[1],slen[1]); 172 173 member = (int *)file[1]; 174 equiv(sfile[0], slen[0], sfile[1], slen[1], member); 175 member = (int *)ralloc((char *)member,(slen[1]+2)*sizeof(int)); 176 177 class = (int *)file[0]; 178 unsort(sfile[0], slen[0], class); 179 class = (int *)ralloc((char *)class,(slen[0]+2)*sizeof(int)); 180 181 klist = (int *)talloc((slen[0]+2)*sizeof(int)); 182 clist = (struct cand *)talloc(sizeof(cand)); 183 i = stone(class, slen[0], member, klist); 184 free((char *)member); 185 free((char *)class); 186 187 J = (int *)talloc((len[0]+2)*sizeof(int)); 188 unravel(klist[i]); 189 free((char *)clist); 190 free((char *)klist); 191 192 ixold = (long *)talloc((len[0]+2)*sizeof(long)); 193 ixnew = (long *)talloc((len[1]+2)*sizeof(long)); 194 check(); 195 output(); 196 status = anychange; 197 same: 198 if (opt == D_CONTEXT && anychange == 0) 199 printf("No differences encountered\n"); 200 done(); 201 } 202 203 char * 204 copytemp() 205 { 206 char buf[BUFSIZ]; 207 register int i, f; 208 209 signal(SIGHUP,done); 210 signal(SIGINT,done); 211 signal(SIGPIPE,done); 212 signal(SIGTERM,done); 213 tempfile = mktemp("/tmp/dXXXXX"); 214 f = creat(tempfile,0600); 215 if (f < 0) { 216 fprintf(stderr, "diff: "); 217 perror(tempfile); 218 done(); 219 } 220 while ((i = read(0,buf,BUFSIZ)) > 0) 221 if (write(f,buf,i) != i) { 222 fprintf(stderr, "diff: "); 223 perror(tempfile); 224 done(); 225 } 226 close(f); 227 return (tempfile); 228 } 229 230 char * 231 splice(dir, file) 232 char *dir, *file; 233 { 234 char *tail; 235 char buf[BUFSIZ]; 236 237 if (!strcmp(file, "-")) { 238 fprintf(stderr, "diff: can't specify - with other arg directory\n"); 239 done(); 240 } 241 tail = rindex(file, '/'); 242 if (tail == 0) 243 tail = file; 244 else 245 tail++; 246 sprintf(buf, "%s/%s", dir, tail); 247 return (savestr(buf)); 248 } 249 250 prepare(i, fd) 251 int i; 252 FILE *fd; 253 { 254 register struct line *p; 255 register j,h; 256 257 fseek(fd, (long)0, 0); 258 p = (struct line *)talloc(3*sizeof(line)); 259 for(j=0; h=readhash(fd);) { 260 p = (struct line *)ralloc((char *)p,(++j+3)*sizeof(line)); 261 p[j].value = h; 262 } 263 len[i] = j; 264 file[i] = p; 265 } 266 267 prune() 268 { 269 register i,j; 270 for(pref=0;pref<len[0]&&pref<len[1]&& 271 file[0][pref+1].value==file[1][pref+1].value; 272 pref++ ) ; 273 for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&& 274 file[0][len[0]-suff].value==file[1][len[1]-suff].value; 275 suff++) ; 276 for(j=0;j<2;j++) { 277 sfile[j] = file[j]+pref; 278 slen[j] = len[j]-pref-suff; 279 for(i=0;i<=slen[j];i++) 280 sfile[j][i].serial = i; 281 } 282 } 283 284 equiv(a,n,b,m,c) 285 struct line *a, *b; 286 int *c; 287 { 288 register int i, j; 289 i = j = 1; 290 while(i<=n && j<=m) { 291 if(a[i].value <b[j].value) 292 a[i++].value = 0; 293 else if(a[i].value == b[j].value) 294 a[i++].value = j; 295 else 296 j++; 297 } 298 while(i <= n) 299 a[i++].value = 0; 300 b[m+1].value = 0; 301 j = 0; 302 while(++j <= m) { 303 c[j] = -b[j].serial; 304 while(b[j+1].value == b[j].value) { 305 j++; 306 c[j] = b[j].serial; 307 } 308 } 309 c[j] = -1; 310 } 311 312 stone(a,n,b,c) 313 int *a; 314 int *b; 315 int *c; 316 { 317 register int i, k,y; 318 int j, l; 319 int oldc, tc; 320 int oldl; 321 k = 0; 322 c[0] = newcand(0,0,0); 323 for(i=1; i<=n; i++) { 324 j = a[i]; 325 if(j==0) 326 continue; 327 y = -b[j]; 328 oldl = 0; 329 oldc = c[0]; 330 do { 331 if(y <= clist[oldc].y) 332 continue; 333 l = search(c, k, y); 334 if(l!=oldl+1) 335 oldc = c[l-1]; 336 if(l<=k) { 337 if(clist[c[l]].y <= y) 338 continue; 339 tc = c[l]; 340 c[l] = newcand(i,y,oldc); 341 oldc = tc; 342 oldl = l; 343 } else { 344 c[l] = newcand(i,y,oldc); 345 k++; 346 break; 347 } 348 } while((y=b[++j]) > 0); 349 } 350 return(k); 351 } 352 353 newcand(x,y,pred) 354 { 355 register struct cand *q; 356 clist = (struct cand *)ralloc((char *)clist,++clen*sizeof(cand)); 357 q = clist + clen -1; 358 q->x = x; 359 q->y = y; 360 q->pred = pred; 361 return(clen-1); 362 } 363 364 search(c, k, y) 365 int *c; 366 { 367 register int i, j, l; 368 int t; 369 if(clist[c[k]].y<y) /*quick look for typical case*/ 370 return(k+1); 371 i = 0; 372 j = k+1; 373 while((l=(i+j)/2) > i) { 374 t = clist[c[l]].y; 375 if(t > y) 376 j = l; 377 else if(t < y) 378 i = l; 379 else 380 return(l); 381 } 382 return(l+1); 383 } 384 385 unravel(p) 386 { 387 register int i; 388 register struct cand *q; 389 for(i=0; i<=len[0]; i++) 390 J[i] = i<=pref ? i: 391 i>len[0]-suff ? i+len[1]-len[0]: 392 0; 393 for(q=clist+p;q->y!=0;q=clist+q->pred) 394 J[q->x+pref] = q->y+pref; 395 } 396 397 /* check does double duty: 398 1. ferret out any fortuitous correspondences due 399 to confounding by hashing (which result in "jackpot") 400 2. collect random access indexes to the two files */ 401 402 check() 403 { 404 register int i, j; 405 int jackpot; 406 long ctold, ctnew; 407 char c,d; 408 409 if ((input[0] = fopen(file1,"r")) == NULL) { 410 perror(file1); 411 done(); 412 } 413 if ((input[1] = fopen(file2,"r")) == NULL) { 414 perror(file2); 415 done(); 416 } 417 j = 1; 418 ixold[0] = ixnew[0] = 0; 419 jackpot = 0; 420 ctold = ctnew = 0; 421 for(i=1;i<=len[0];i++) { 422 if(J[i]==0) { 423 ixold[i] = ctold += skipline(0); 424 continue; 425 } 426 while(j<J[i]) { 427 ixnew[j] = ctnew += skipline(1); 428 j++; 429 } 430 for(;;) { 431 c = getc(input[0]); 432 d = getc(input[1]); 433 ctold++; 434 ctnew++; 435 if(bflag && isspace(c) && isspace(d)) { 436 do { 437 if(c=='\n') break; 438 ctold++; 439 } while(isspace(c=getc(input[0]))); 440 do { 441 if(d=='\n') break; 442 ctnew++; 443 } while(isspace(d=getc(input[1]))); 444 } 445 if(c!=d) { 446 jackpot++; 447 J[i] = 0; 448 if(c!='\n') 449 ctold += skipline(0); 450 if(d!='\n') 451 ctnew += skipline(1); 452 break; 453 } 454 if(c=='\n') 455 break; 456 } 457 ixold[i] = ctold; 458 ixnew[j] = ctnew; 459 j++; 460 } 461 for(;j<=len[1];j++) { 462 ixnew[j] = ctnew += skipline(1); 463 } 464 fclose(input[0]); 465 fclose(input[1]); 466 /* 467 if(jackpot) 468 fprintf(stderr, "jackpot\n"); 469 */ 470 } 471 472 sort(a,n) /*shellsort CACM #201*/ 473 struct line *a; 474 { 475 struct line w; 476 register int j,m; 477 struct line *ai; 478 register struct line *aim; 479 int k; 480 for(j=1;j<=n;j*= 2) 481 m = 2*j - 1; 482 for(m/=2;m!=0;m/=2) { 483 k = n-m; 484 for(j=1;j<=k;j++) { 485 for(ai = &a[j]; ai > a; ai -= m) { 486 aim = &ai[m]; 487 if(aim < ai) 488 break; /*wraparound*/ 489 if(aim->value > ai[0].value || 490 aim->value == ai[0].value && 491 aim->serial > ai[0].serial) 492 break; 493 w.value = ai[0].value; 494 ai[0].value = aim->value; 495 aim->value = w.value; 496 w.serial = ai[0].serial; 497 ai[0].serial = aim->serial; 498 aim->serial = w.serial; 499 } 500 } 501 } 502 } 503 504 unsort(f, l, b) 505 struct line *f; 506 int *b; 507 { 508 register int *a; 509 register int i; 510 a = (int *)talloc((l+1)*sizeof(int)); 511 for(i=1;i<=l;i++) 512 a[f[i].serial] = f[i].value; 513 for(i=1;i<=l;i++) 514 b[i] = a[i]; 515 free((char *)a); 516 } 517 518 skipline(f) 519 { 520 register i; 521 char c; 522 523 for(i=1;(c=getc(input[f]))!='\n';i++) 524 if (c < 0) 525 return(i); 526 return(i); 527 } 528 529 output() 530 { 531 int m; 532 register int i0, i1, j1; 533 int j0; 534 input[0] = fopen(file1,"r"); 535 input[1] = fopen(file2,"r"); 536 m = len[0]; 537 J[0] = 0; 538 J[m+1] = len[1]+1; 539 if(opt!=D_EDIT) for(i0=1;i0<=m;i0=i1+1) { 540 while(i0<=m&&J[i0]==J[i0-1]+1) i0++; 541 j0 = J[i0-1]+1; 542 i1 = i0-1; 543 while(i1<m&&J[i1+1]==0) i1++; 544 j1 = J[i1+1]-1; 545 J[i1] = j1; 546 change(i0,i1,j0,j1); 547 } else for(i0=m;i0>=1;i0=i1-1) { 548 while(i0>=1&&J[i0]==J[i0+1]-1&&J[i0]!=0) i0--; 549 j0 = J[i0+1]-1; 550 i1 = i0+1; 551 while(i1>1&&J[i1-1]==0) i1--; 552 j1 = J[i1-1]+1; 553 J[i1] = j1; 554 change(i1,i0,j1,j0); 555 } 556 if(m==0) 557 change(1,0,1,len[1]); 558 if (opt==D_IFDEF) { 559 for (;;) { 560 #define c i0 561 c = getc(input[0]); 562 if (c < 0) 563 return; 564 putchar(c); 565 } 566 #undef c 567 } 568 } 569 570 /* indicate that there is a difference between lines a and b of the from file 571 to get to lines c to d of the to file. 572 If a is greater then b then there are no lines in the from file involved 573 and this means that there were lines appended (beginning at b). 574 If c is greater than d then there are lines missing from the to file. 575 */ 576 change(a,b,c,d) 577 { 578 char ch; 579 int lowa,upb,lowc,upd; 580 struct stat stbuf; 581 582 if (opt != D_IFDEF && a>b && c>d) 583 return; 584 if (anychange == 0) { 585 anychange = 1; 586 if(opt == D_CONTEXT) { 587 printf("*** %s ", file1); 588 stat(file1, &stbuf); 589 printf("%s--- %s ", 590 ctime(&stbuf.st_mtime), file2); 591 stat(file2, &stbuf); 592 printf("%s", ctime(&stbuf.st_mtime)); 593 } 594 } 595 if (a <= b && c <= d) 596 ch = 'c'; 597 else 598 ch = (a <= b) ? 'd' : 'a'; 599 if(opt == D_CONTEXT) { 600 lowa = max(1, a-context); 601 upb = min(len[0], b+context); 602 lowc = max(1, c-context); 603 upd = min(len[1], d+context); 604 605 /* print out from file */ 606 printf("***************\n*** "); 607 range(lowa,upb,","); 608 printf("\n"); 609 if (ch == 'a') 610 fetch(ixold,lowa,upb,input[0]," "); 611 else { 612 fetch(ixold,lowa,a-1,input[0]," "); 613 fetch(ixold,a,b,input[0],ch == 'c' ? "! " : "- "); 614 fetch(ixold,b+1,upb,input[0]," "); 615 } 616 putchar('\n'); 617 printf("--- "); 618 range(lowc,upd,","); 619 printf(" -----\n"); 620 if (ch == 'd') 621 fetch(ixnew,lowc,upd,input[1]," "); 622 else { 623 fetch(ixnew,lowc,c-1,input[1]," "); 624 fetch(ixnew,c,d,input[1],ch == 'c' ? "! " : "+ "); 625 fetch(ixnew,d+1,upd,input[1]," "); 626 } 627 return; 628 } 629 switch (opt) { 630 631 case D_NORMAL: 632 case D_EDIT: 633 range(a,b,","); 634 putchar(a>b?'a':c>d?'d':'c'); 635 if(opt==D_NORMAL) 636 range(c,d,","); 637 putchar('\n'); 638 break; 639 case D_REVERSE: 640 putchar(a>b?'a':c>d?'d':'c'); 641 range(a,b," "); 642 putchar('\n'); 643 break; 644 } 645 if(opt == D_NORMAL || opt == D_IFDEF) { 646 fetch(ixold,a,b,input[0],"< ", 1); 647 if(a<=b&&c<=d && opt == D_NORMAL) 648 prints("---\n"); 649 } 650 fetch(ixnew,c,d,input[1],opt==D_NORMAL?"> ":"", 0); 651 if ((opt ==D_EDIT || opt == D_REVERSE) && c<=d) 652 prints(".\n"); 653 if (inifdef) { 654 fprintf(stdout, "#endif %s\n", endifname); 655 inifdef = 0; 656 } 657 } 658 659 range(a,b,separator) 660 char *separator; 661 { 662 printf("%d", a>b?b:a); 663 if(a<b) { 664 printf("%s%d", separator, b); 665 } 666 } 667 668 fetch(f,a,b,lb,s,oldfile) 669 long *f; 670 FILE *lb; 671 char *s; 672 { 673 register int i, j; 674 register int nc; 675 int oneflag = (*ifdef1!='\0') != (*ifdef2!='\0'); 676 677 /* 678 * When doing #ifdef's, copy down to current line 679 * if this is the first file, so that stuff makes it to output. 680 */ 681 if (opt == D_IFDEF && oldfile){ 682 long curpos = ftell(lb); 683 /* print through if append (a>b), else to (nb: 0 vs 1 orig) */ 684 nc = f[a>b? b : a-1 ] - curpos; 685 for (i = 0; i < nc; i++) 686 putchar(getc(lb)); 687 } 688 if (a > b) 689 return; 690 if (opt == D_IFDEF) { 691 if (inifdef) 692 fprintf(stdout, "#else %s%s\n", oneflag && oldfile==1 ? "!" : "", ifdef2); 693 else { 694 if (oneflag) { 695 /* There was only one ifdef given */ 696 endifname = ifdef2; 697 if (oldfile) 698 fprintf(stdout, "#ifndef %s\n", endifname); 699 else 700 fprintf(stdout, "#ifdef %s\n", endifname); 701 } 702 else { 703 endifname = oldfile ? ifdef1 : ifdef2; 704 fprintf(stdout, "#ifdef %s\n", endifname); 705 } 706 } 707 inifdef = 1+oldfile; 708 } 709 for(i=a;i<=b;i++) { 710 fseek(lb,f[i-1],0); 711 nc = f[i]-f[i-1]; 712 if (opt != D_IFDEF) 713 prints(s); 714 for(j=0;j<nc;j++) 715 putchar(getc(lb)); 716 } 717 if (inifdef && !wantelses) { 718 fprintf(stdout, "#endif %s\n", endifname); 719 inifdef = 0; 720 } 721 } 722 723 #define HALFLONG 16 724 #define low(x) (x&((1L<<HALFLONG)-1)) 725 #define high(x) (x>>HALFLONG) 726 727 /* 728 * hashing has the effect of 729 * arranging line in 7-bit bytes and then 730 * summing 1-s complement in 16-bit hunks 731 */ 732 readhash(f) 733 FILE *f; 734 { 735 long sum; 736 register unsigned shift; 737 register space; 738 register t; 739 sum = 1; 740 space = 0; 741 if(!bflag) for(shift=0;(t=getc(f))!='\n';shift+=7) { 742 if(t==-1) 743 return(0); 744 sum += (long)t << (shift%=HALFLONG); 745 } 746 else for(shift=0;;) { 747 switch(t=getc(f)) { 748 case -1: 749 return(0); 750 case '\t': 751 case ' ': 752 space++; 753 continue; 754 default: 755 if(space) { 756 shift += 7; 757 space = 0; 758 } 759 sum += (long)t << (shift%=HALFLONG); 760 shift += 7; 761 continue; 762 case '\n': 763 break; 764 } 765 break; 766 } 767 sum = low(sum) + high(sum); 768 return((short)low(sum) + (short)high(sum)); 769 } 770 771 #include <a.out.h> 772 773 asciifile(f) 774 FILE *f; 775 { 776 char buf[BUFSIZ]; 777 register int cnt; 778 register char *cp; 779 780 fseek(f, (long)0, 0); 781 cnt = fread(buf, 1, BUFSIZ, f); 782 if (cnt >= sizeof (struct exec)) { 783 struct exec hdr; 784 hdr = *(struct exec *)buf; 785 if (!N_BADMAG(hdr)) 786 return (0); 787 } 788 cp = buf; 789 while (--cnt >= 0) 790 if (*cp++ & 0200) 791 return (0); 792 return (1); 793 } 794