|
|
1.1 ! root 1: # ! 2: /* diff - differential file comparison ! 3: * ! 4: * Uses an algorithm due to Harold Stone, which finds ! 5: * a pair of longest identical subsequences in the two ! 6: * files. ! 7: * ! 8: * The major goal is to generate the match vector J. ! 9: * J[i] is the index of the line in file1 corresponding ! 10: * to line i file0. J[i] = 0 if there is no ! 11: * such line in file1. ! 12: * ! 13: * Lines are hashed so as to work in core. All potential ! 14: * matches are located by sorting the lines of each file ! 15: * on the hash (called value_____). In particular, this ! 16: * collects the equivalence classes in file1 together. ! 17: * Subroutine equiv____ replaces the value of each line in ! 18: * file0 by the index of the first element of its ! 19: * matching equivalence in (the reordered) file1. ! 20: * To save space equiv_____ squeezes file1 into a single ! 21: * array member______ in which the equivalence classes ! 22: * are simply concatenated, except that their first ! 23: * members are flagged by changing sign. ! 24: * ! 25: * Next the indices that point into member______ are unsorted_______ into ! 26: * array class_____ according to the original order of file0. ! 27: * ! 28: * The cleverness lies in routine stone______. This marches ! 29: * through the lines of file0, developing a vector klist_____ ! 30: * of "k-candidates". At step i a k-candidate is a matched ! 31: * pair of lines x,y (x in file0 y in file1) such that ! 32: * there is a common subsequence of lenght k ! 33: * between the first i lines of file0 and the first y ! 34: * lines of file1, but there is no such subsequence for ! 35: * any smaller y. x is the earliest possible mate to y ! 36: * that occurs in such a subsequence. ! 37: * ! 38: * Whenever any of the members of the equivalence class of ! 39: * lines in file1 matable to a line in file0 has serial number ! 40: * less than the y of some k-candidate, that k-candidate ! 41: * with the smallest such y is replaced. The new ! 42: * k-candidate is chained (via pred____) to the current ! 43: * k-1 candidate so that the actual subsequence can ! 44: * be recovered. When a member has serial number greater ! 45: * that the y of all k-candidates, the klist is extended. ! 46: * At the end, the longest subsequence is pulled out ! 47: * and placed in the array J by unravel_______. ! 48: * ! 49: * With J in hand, the matches there recorded are ! 50: * check_____ed against reality to assure that no spurious ! 51: * matches have crept in due to hashing. If they have, ! 52: * they are broken, and "jackpot " is recorded--a harmless ! 53: * matter except that a true match for a spuriously ! 54: * mated line may now be unnecessarily reported as a change. ! 55: * ! 56: * Much of the complexity of the program comes simply ! 57: * from trying to minimize core utilization and ! 58: * maximize the range of doable problems by dynamically ! 59: * allocating what is needed and reusing what is not. ! 60: * The core requirements for problems larger than somewhat ! 61: * are (in words) 2*length(file0) + length(file1) + ! 62: * 3*(number of k-candidates installed), typically about ! 63: * 6n words for files of length n. ! 64: */ ! 65: #include <stdio.h> ! 66: #include <ctype.h> ! 67: #include <sys/types.h> ! 68: #include <sys/stat.h> ! 69: #include <signal.h> ! 70: #define prints(s) fputs(s,stdout) ! 71: ! 72: #define HALFLONG 16 ! 73: #define low(x) (x&((1L<<HALFLONG)-1)) ! 74: #define high(x) (x>>HALFLONG) ! 75: FILE *input[2]; ! 76: FILE *fopen(); ! 77: ! 78: struct cand { ! 79: int x; ! 80: int y; ! 81: int pred; ! 82: } cand; ! 83: struct line { ! 84: int serial; ! 85: int value; ! 86: } *file[2], line; ! 87: int len[2]; ! 88: struct line *sfile[2]; /*shortened by pruning common prefix and suffix*/ ! 89: int slen[2]; ! 90: int pref, suff; /*length of prefix and suffix*/ ! 91: int *class; /*will be overlaid on file[0]*/ ! 92: int *member; /*will be overlaid on file[1]*/ ! 93: int *klist; /*will be overlaid on file[0] after class*/ ! 94: struct cand *clist; /* merely a free storage pot for candidates */ ! 95: int clen = 0; ! 96: int *J; /*will be overlaid on class*/ ! 97: long *ixold; /*will be overlaid on klist*/ ! 98: long *ixnew; /*will be overlaid on file[1]*/ ! 99: int opt; /* -1,0,1 = -e,normal,-f */ ! 100: int status = 2; ! 101: int anychange = 0; ! 102: char *empty = ""; ! 103: int bflag; ! 104: ! 105: char *tempfile; /*used when comparing against std input*/ ! 106: char *mktemp(); ! 107: char *dummy; /*used in resetting storage search ptr*/ ! 108: ! 109: done() ! 110: { ! 111: unlink(tempfile); ! 112: exit(status); ! 113: } ! 114: ! 115: char *talloc(n) ! 116: { ! 117: extern char *malloc(); ! 118: register char *p; ! 119: p = malloc((unsigned)n); ! 120: if(p!=NULL) ! 121: return(p); ! 122: noroom(); ! 123: } ! 124: ! 125: char *ralloc(p,n) /*compacting reallocation */ ! 126: char *p; ! 127: { ! 128: register char *q; ! 129: char *realloc(); ! 130: free(p); ! 131: free(dummy); ! 132: dummy = malloc(1); ! 133: q = realloc(p, (unsigned)n); ! 134: if(q==NULL) ! 135: noroom(); ! 136: return(q); ! 137: } ! 138: ! 139: noroom() ! 140: { ! 141: mesg("files too big, try -h\n",empty); ! 142: done(); ! 143: } ! 144: ! 145: sort(a,n) /*shellsort CACM #201*/ ! 146: struct line *a; ! 147: { ! 148: struct line w; ! 149: register int j,m; ! 150: struct line *ai; ! 151: register struct line *aim; ! 152: int k; ! 153: for(j=1;j<=n;j*= 2) ! 154: m = 2*j - 1; ! 155: for(m/=2;m!=0;m/=2) { ! 156: k = n-m; ! 157: for(j=1;j<=k;j++) { ! 158: for(ai = &a[j]; ai > a; ai -= m) { ! 159: aim = &ai[m]; ! 160: if(aim < ai) ! 161: break; /*wraparound*/ ! 162: if(aim->value > ai[0].value || ! 163: aim->value == ai[0].value && ! 164: aim->serial > ai[0].serial) ! 165: break; ! 166: w.value = ai[0].value; ! 167: ai[0].value = aim->value; ! 168: aim->value = w.value; ! 169: w.serial = ai[0].serial; ! 170: ai[0].serial = aim->serial; ! 171: aim->serial = w.serial; ! 172: } ! 173: } ! 174: } ! 175: } ! 176: ! 177: unsort(f, l, b) ! 178: struct line *f; ! 179: int *b; ! 180: { ! 181: register int *a; ! 182: register int i; ! 183: a = (int *)talloc((l+1)*sizeof(int)); ! 184: for(i=1;i<=l;i++) ! 185: a[f[i].serial] = f[i].value; ! 186: for(i=1;i<=l;i++) ! 187: b[i] = a[i]; ! 188: free((char *)a); ! 189: } ! 190: ! 191: filename(pa1, pa2) ! 192: char **pa1, **pa2; ! 193: { ! 194: register char *a1, *b1, *a2; ! 195: char buf[BUFSIZ]; ! 196: struct stat stbuf; ! 197: int i, f; ! 198: a1 = *pa1; ! 199: a2 = *pa2; ! 200: if(stat(a1,&stbuf)!=-1 && ((stbuf.st_mode&S_IFMT)==S_IFDIR)) { ! 201: b1 = *pa1 = malloc(100); ! 202: while(*b1++ = *a1++) ; ! 203: b1[-1] = '/'; ! 204: a1 = b1; ! 205: while(*a1++ = *a2++) ! 206: if(*a2 && *a2!='/' && a2[-1]=='/') ! 207: a1 = b1; ! 208: } ! 209: else if(a1[0]=='-'&&a1[1]==0&&tempfile==0) { ! 210: signal(SIGHUP,done); ! 211: signal(SIGINT,done); ! 212: signal(SIGPIPE,done); ! 213: signal(SIGTERM,done); ! 214: *pa1 = tempfile = mktemp("/tmp/dXXXXX"); ! 215: if((f=creat(tempfile,0600)) < 0) { ! 216: mesg("cannot create ",tempfile); ! 217: done(); ! 218: } ! 219: while((i=read(0,buf,BUFSIZ))>0) ! 220: write(f,buf,i); ! 221: close(f); ! 222: } ! 223: } ! 224: ! 225: prepare(i, arg) ! 226: char *arg; ! 227: { ! 228: register struct line *p; ! 229: register j,h; ! 230: if((input[i] = fopen(arg,"r")) == NULL){ ! 231: mesg("cannot open ", arg); ! 232: done(); ! 233: } ! 234: p = (struct line *)talloc(3*sizeof(line)); ! 235: for(j=0; h=readhash(input[i]);) { ! 236: p = (struct line *)ralloc((char *)p,(++j+3)*sizeof(line)); ! 237: p[j].value = h; ! 238: } ! 239: len[i] = j; ! 240: file[i] = p; ! 241: fclose(input[i]); ! 242: } ! 243: ! 244: prune() ! 245: { ! 246: register i,j; ! 247: for(pref=0;pref<len[0]&&pref<len[1]&& ! 248: file[0][pref+1].value==file[1][pref+1].value; ! 249: pref++ ) ; ! 250: for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&& ! 251: file[0][len[0]-suff].value==file[1][len[1]-suff].value; ! 252: suff++) ; ! 253: for(j=0;j<2;j++) { ! 254: sfile[j] = file[j]+pref; ! 255: slen[j] = len[j]-pref-suff; ! 256: for(i=0;i<=slen[j];i++) ! 257: sfile[j][i].serial = i; ! 258: } ! 259: } ! 260: ! 261: equiv(a,n,b,m,c) ! 262: struct line *a, *b; ! 263: int *c; ! 264: { ! 265: register int i, j; ! 266: i = j = 1; ! 267: while(i<=n && j<=m) { ! 268: if(a[i].value <b[j].value) ! 269: a[i++].value = 0; ! 270: else if(a[i].value == b[j].value) ! 271: a[i++].value = j; ! 272: else ! 273: j++; ! 274: } ! 275: while(i <= n) ! 276: a[i++].value = 0; ! 277: b[m+1].value = 0; ! 278: j = 0; ! 279: while(++j <= m) { ! 280: c[j] = -b[j].serial; ! 281: while(b[j+1].value == b[j].value) { ! 282: j++; ! 283: c[j] = b[j].serial; ! 284: } ! 285: } ! 286: c[j] = -1; ! 287: } ! 288: ! 289: main(argc, argv) ! 290: char **argv; ! 291: { ! 292: register int k; ! 293: char **args; ! 294: ! 295: args = argv; ! 296: if(argc>3 && *argv[1]=='-') { ! 297: argc--; ! 298: argv++; ! 299: for(k=1;argv[0][k];k++) { ! 300: switch(argv[0][k]) { ! 301: case 'e': ! 302: opt = -1; ! 303: break; ! 304: case 'f': ! 305: opt = 1; ! 306: break; ! 307: case 'b': ! 308: bflag = 1; ! 309: break; ! 310: case 'h': ! 311: execv("/usr/lib/diffh",args); ! 312: mesg("cannot find diffh",empty); ! 313: done(); ! 314: } ! 315: } ! 316: } ! 317: if(argc!=3) { ! 318: mesg("arg count",empty); ! 319: done(); ! 320: } ! 321: dummy = malloc(1); ! 322: ! 323: filename(&argv[1], &argv[2]); ! 324: filename(&argv[2], &argv[1]); ! 325: prepare(0, argv[1]); ! 326: prepare(1, argv[2]); ! 327: prune(); ! 328: sort(sfile[0],slen[0]); ! 329: sort(sfile[1],slen[1]); ! 330: ! 331: member = (int *)file[1]; ! 332: equiv(sfile[0], slen[0], sfile[1], slen[1], member); ! 333: member = (int *)ralloc((char *)member,(slen[1]+2)*sizeof(int)); ! 334: ! 335: class = (int *)file[0]; ! 336: unsort(sfile[0], slen[0], class); ! 337: class = (int *)ralloc((char *)class,(slen[0]+2)*sizeof(int)); ! 338: ! 339: klist = (int *)talloc((slen[0]+2)*sizeof(int)); ! 340: clist = (struct cand *)talloc(sizeof(cand)); ! 341: k = stone(class, slen[0], member, klist); ! 342: free((char *)member); ! 343: free((char *)class); ! 344: ! 345: J = (int *)talloc((len[0]+2)*sizeof(int)); ! 346: unravel(klist[k]); ! 347: free((char *)clist); ! 348: free((char *)klist); ! 349: ! 350: ixold = (long *)talloc((len[0]+2)*sizeof(long)); ! 351: ixnew = (long *)talloc((len[1]+2)*sizeof(long)); ! 352: check(argv); ! 353: output(argv); ! 354: status = anychange; ! 355: done(); ! 356: } ! 357: ! 358: stone(a,n,b,c) ! 359: int *a; ! 360: int *b; ! 361: int *c; ! 362: { ! 363: register int i, k,y; ! 364: int j, l; ! 365: int oldc, tc; ! 366: int oldl; ! 367: k = 0; ! 368: c[0] = newcand(0,0,0); ! 369: for(i=1; i<=n; i++) { ! 370: j = a[i]; ! 371: if(j==0) ! 372: continue; ! 373: y = -b[j]; ! 374: oldl = 0; ! 375: oldc = c[0]; ! 376: do { ! 377: if(y <= clist[oldc].y) ! 378: continue; ! 379: l = search(c, k, y); ! 380: if(l!=oldl+1) ! 381: oldc = c[l-1]; ! 382: if(l<=k) { ! 383: if(clist[c[l]].y <= y) ! 384: continue; ! 385: tc = c[l]; ! 386: c[l] = newcand(i,y,oldc); ! 387: oldc = tc; ! 388: oldl = l; ! 389: } else { ! 390: c[l] = newcand(i,y,oldc); ! 391: k++; ! 392: break; ! 393: } ! 394: } while((y=b[++j]) > 0); ! 395: } ! 396: return(k); ! 397: } ! 398: ! 399: newcand(x,y,pred) ! 400: { ! 401: register struct cand *q; ! 402: clist = (struct cand *)ralloc((char *)clist,++clen*sizeof(cand)); ! 403: q = clist + clen -1; ! 404: q->x = x; ! 405: q->y = y; ! 406: q->pred = pred; ! 407: return(clen-1); ! 408: } ! 409: ! 410: search(c, k, y) ! 411: int *c; ! 412: { ! 413: register int i, j, l; ! 414: int t; ! 415: if(clist[c[k]].y<y) /*quick look for typical case*/ ! 416: return(k+1); ! 417: i = 0; ! 418: j = k+1; ! 419: while((l=(i+j)/2) > i) { ! 420: t = clist[c[l]].y; ! 421: if(t > y) ! 422: j = l; ! 423: else if(t < y) ! 424: i = l; ! 425: else ! 426: return(l); ! 427: } ! 428: return(l+1); ! 429: } ! 430: ! 431: unravel(p) ! 432: { ! 433: register int i; ! 434: register struct cand *q; ! 435: for(i=0; i<=len[0]; i++) ! 436: J[i] = i<=pref ? i: ! 437: i>len[0]-suff ? i+len[1]-len[0]: ! 438: 0; ! 439: for(q=clist+p;q->y!=0;q=clist+q->pred) ! 440: J[q->x+pref] = q->y+pref; ! 441: } ! 442: ! 443: /* check does double duty: ! 444: 1. ferret out any fortuitous correspondences due ! 445: to confounding by hashing (which result in "jackpot") ! 446: 2. collect random access indexes to the two files */ ! 447: ! 448: check(argv) ! 449: char **argv; ! 450: { ! 451: register int i, j; ! 452: int jackpot; ! 453: long ctold, ctnew; ! 454: char c,d; ! 455: input[0] = fopen(argv[1],"r"); ! 456: input[1] = fopen(argv[2],"r"); ! 457: j = 1; ! 458: ixold[0] = ixnew[0] = 0; ! 459: jackpot = 0; ! 460: ctold = ctnew = 0; ! 461: for(i=1;i<=len[0];i++) { ! 462: if(J[i]==0) { ! 463: ixold[i] = ctold += skipline(0); ! 464: continue; ! 465: } ! 466: while(j<J[i]) { ! 467: ixnew[j] = ctnew += skipline(1); ! 468: j++; ! 469: } ! 470: for(;;) { ! 471: c = getc(input[0]); ! 472: d = getc(input[1]); ! 473: ctold++; ! 474: ctnew++; ! 475: if(bflag && isspace(c) && isspace(d)) { ! 476: do { ! 477: if(c=='\n') break; ! 478: ctold++; ! 479: } while(isspace(c=getc(input[0]))); ! 480: do { ! 481: if(d=='\n') break; ! 482: ctnew++; ! 483: } while(isspace(d=getc(input[1]))); ! 484: } ! 485: if(c!=d) { ! 486: jackpot++; ! 487: J[i] = 0; ! 488: if(c!='\n') ! 489: ctold += skipline(0); ! 490: if(d!='\n') ! 491: ctnew += skipline(1); ! 492: break; ! 493: } ! 494: if(c=='\n') ! 495: break; ! 496: } ! 497: ixold[i] = ctold; ! 498: ixnew[j] = ctnew; ! 499: j++; ! 500: } ! 501: for(;j<=len[1];j++) { ! 502: ixnew[j] = ctnew += skipline(1); ! 503: } ! 504: fclose(input[0]); ! 505: fclose(input[1]); ! 506: /* ! 507: if(jackpot) ! 508: mesg("jackpot",empty); ! 509: */ ! 510: } ! 511: ! 512: skipline(f) ! 513: { ! 514: register i; ! 515: for(i=1;getc(input[f])!='\n';i++) ; ! 516: return(i); ! 517: } ! 518: ! 519: output(argv) ! 520: char **argv; ! 521: { ! 522: int m; ! 523: register int i0, i1, j1; ! 524: int j0; ! 525: input[0] = fopen(argv[1],"r"); ! 526: input[1] = fopen(argv[2],"r"); ! 527: m = len[0]; ! 528: J[0] = 0; ! 529: J[m+1] = len[1]+1; ! 530: if(opt!=-1) for(i0=1;i0<=m;i0=i1+1) { ! 531: while(i0<=m&&J[i0]==J[i0-1]+1) i0++; ! 532: j0 = J[i0-1]+1; ! 533: i1 = i0-1; ! 534: while(i1<m&&J[i1+1]==0) i1++; ! 535: j1 = J[i1+1]-1; ! 536: J[i1] = j1; ! 537: change(i0,i1,j0,j1); ! 538: } else for(i0=m;i0>=1;i0=i1-1) { ! 539: while(i0>=1&&J[i0]==J[i0+1]-1&&J[i0]!=0) i0--; ! 540: j0 = J[i0+1]-1; ! 541: i1 = i0+1; ! 542: while(i1>1&&J[i1-1]==0) i1--; ! 543: j1 = J[i1-1]+1; ! 544: J[i1] = j1; ! 545: change(i1,i0,j1,j0); ! 546: } ! 547: if(m==0) ! 548: change(1,0,1,len[1]); ! 549: } ! 550: ! 551: change(a,b,c,d) ! 552: { ! 553: if(a>b&&c>d) return; ! 554: anychange = 1; ! 555: if(opt!=1) { ! 556: range(a,b,","); ! 557: putchar(a>b?'a':c>d?'d':'c'); ! 558: if(opt!=-1) range(c,d,","); ! 559: } else { ! 560: putchar(a>b?'a':c>d?'d':'c'); ! 561: range(a,b," "); ! 562: } ! 563: putchar('\n'); ! 564: if(opt==0) { ! 565: fetch(ixold,a,b,input[0],"< "); ! 566: if(a<=b&&c<=d) prints("---\n"); ! 567: } ! 568: fetch(ixnew,c,d,input[1],opt==0?"> ":empty); ! 569: if(opt!=0&&c<=d) prints(".\n"); ! 570: } ! 571: ! 572: range(a,b,separator) ! 573: char *separator; ! 574: { ! 575: printf("%d", a>b?b:a); ! 576: if(a<b) { ! 577: printf("%s%d", separator, b); ! 578: } ! 579: } ! 580: ! 581: fetch(f,a,b,lb,s) ! 582: long *f; ! 583: FILE *lb; ! 584: char *s; ! 585: { ! 586: register int i, j; ! 587: register int nc; ! 588: for(i=a;i<=b;i++) { ! 589: fseek(lb,f[i-1],0); ! 590: nc = f[i]-f[i-1]; ! 591: prints(s); ! 592: for(j=0;j<nc;j++) ! 593: putchar(getc(lb)); ! 594: } ! 595: } ! 596: ! 597: /* hashing has the effect of ! 598: * arranging line in 7-bit bytes and then ! 599: * summing 1-s complement in 16-bit hunks ! 600: */ ! 601: ! 602: readhash(f) ! 603: FILE *f; ! 604: { ! 605: long sum; ! 606: register unsigned shift; ! 607: register space; ! 608: register t; ! 609: sum = 1; ! 610: space = 0; ! 611: if(!bflag) for(shift=0;(t=getc(f))!='\n';shift+=7) { ! 612: if(t==-1) ! 613: return(0); ! 614: sum += (long)t << (shift%=HALFLONG); ! 615: } ! 616: else for(shift=0;;) { ! 617: switch(t=getc(f)) { ! 618: case -1: ! 619: return(0); ! 620: case '\t': ! 621: case ' ': ! 622: space++; ! 623: continue; ! 624: default: ! 625: if(space) { ! 626: shift += 7; ! 627: space = 0; ! 628: } ! 629: sum += (long)t << (shift%=HALFLONG); ! 630: shift += 7; ! 631: continue; ! 632: case '\n': ! 633: break; ! 634: } ! 635: break; ! 636: } ! 637: sum = low(sum) + high(sum); ! 638: return((short)low(sum) + (short)high(sum)); ! 639: } ! 640: ! 641: mesg(s,t) ! 642: char *s, *t; ! 643: { ! 644: fprintf(stderr,"diff: %s%s\n",s,t); ! 645: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.