|
|
1.1 root 1: #
2: /* diff - differential file comparison
3: *
4: * Uses an algorithm due to Harold Stone, which finds
5: * a pair of longest identical subsequences in the two
6: * files.
7: *
8: * The major goal is to generate the match vector J.
9: * J[i] is the index of the line in file1 corresponding
10: * to line i file0. J[i] = 0 if there is no
11: * such line in file1.
12: *
13: * Lines are hashed so as to work in core. All potential
14: * matches are located by sorting the lines of each file
15: * on the hash (called value_____). In particular, this
16: * collects the equivalence classes in file1 together.
17: * Subroutine equiv____ replaces the value of each line in
18: * file0 by the index of the first element of its
19: * matching equivalence in (the reordered) file1.
20: * To save space equiv_____ squeezes file1 into a single
21: * array member______ in which the equivalence classes
22: * are simply concatenated, except that their first
23: * members are flagged by changing sign.
24: *
25: * Next the indices that point into member______ are unsorted_______ into
26: * array class_____ according to the original order of file0.
27: *
28: * The cleverness lies in routine stone______. This marches
29: * through the lines of file0, developing a vector klist_____
30: * of "k-candidates". At step i a k-candidate is a matched
31: * pair of lines x,y (x in file0 y in file1) such that
32: * there is a common subsequence of lenght k
33: * between the first i lines of file0 and the first y
34: * lines of file1, but there is no such subsequence for
35: * any smaller y. x is the earliest possible mate to y
36: * that occurs in such a subsequence.
37: *
38: * Whenever any of the members of the equivalence class of
39: * lines in file1 matable to a line in file0 has serial number
40: * less than the y of some k-candidate, that k-candidate
41: * with the smallest such y is replaced. The new
42: * k-candidate is chained (via pred____) to the current
43: * k-1 candidate so that the actual subsequence can
44: * be recovered. When a member has serial number greater
45: * that the y of all k-candidates, the klist is extended.
46: * At the end, the longest subsequence is pulled out
47: * and placed in the array J by unravel_______.
48: *
49: * With J in hand, the matches there recorded are
50: * check_____ed against reality to assure that no spurious
51: * matches have crept in due to hashing. If they have,
52: * they are broken, and "jackpot " is recorded--a harmless
53: * matter except that a true match for a spuriously
54: * mated line may now be unnecessarily reported as a change.
55: *
56: * Much of the complexity of the program comes simply
57: * from trying to minimize core utilization and
58: * maximize the range of doable problems by dynamically
59: * allocating what is needed and reusing what is not.
60: * The core requirements for problems larger than somewhat
61: * are (in words) 2*length(file0) + length(file1) +
62: * 3*(number of k-candidates installed), typically about
63: * 6n words for files of length n.
64: */
65: #include <stdio.h>
66: #include <ctype.h>
67: #include <sys/types.h>
68: #include <sys/stat.h>
69: #include <signal.h>
70: #define prints(s) fputs(s,stdout)
71:
72: #define HALFLONG 16
73: #define low(x) (x&((1L<<HALFLONG)-1))
74: #define high(x) (x>>HALFLONG)
75: FILE *input[2];
76: FILE *fopen();
77:
78: struct cand {
79: int x;
80: int y;
81: int pred;
82: } cand;
83: struct line {
84: int serial;
85: int value;
86: } *file[2], line;
87: int len[2];
88: struct line *sfile[2]; /*shortened by pruning common prefix and suffix*/
89: int slen[2];
90: int pref, suff; /*length of prefix and suffix*/
91: int *class; /*will be overlaid on file[0]*/
92: int *member; /*will be overlaid on file[1]*/
93: int *klist; /*will be overlaid on file[0] after class*/
94: struct cand *clist; /* merely a free storage pot for candidates */
95: int clen = 0;
96: int *J; /*will be overlaid on class*/
97: long *ixold; /*will be overlaid on klist*/
98: long *ixnew; /*will be overlaid on file[1]*/
99: int opt; /* -1,0,1 = -e,normal,-f */
100: int status = 2;
101: int anychange = 0;
102: char *empty = "";
103: int bflag;
104:
105: char *tempfile; /*used when comparing against std input*/
106: char *mktemp();
107: char *dummy; /*used in resetting storage search ptr*/
108:
109: done()
110: {
111: unlink(tempfile);
112: exit(status);
113: }
114:
115: char *talloc(n)
116: {
117: extern char *malloc();
118: register char *p;
119: p = malloc((unsigned)n);
120: if(p!=NULL)
121: return(p);
122: noroom();
123: }
124:
125: char *ralloc(p,n) /*compacting reallocation */
126: char *p;
127: {
128: register char *q;
129: char *realloc();
130: free(p);
131: free(dummy);
132: dummy = malloc(1);
133: q = realloc(p, (unsigned)n);
134: if(q==NULL)
135: noroom();
136: return(q);
137: }
138:
139: noroom()
140: {
141: mesg("files too big, try -h\n",empty);
142: done();
143: }
144:
145: sort(a,n) /*shellsort CACM #201*/
146: struct line *a;
147: {
148: struct line w;
149: register int j,m;
150: struct line *ai;
151: register struct line *aim;
152: int k;
153: for(j=1;j<=n;j*= 2)
154: m = 2*j - 1;
155: for(m/=2;m!=0;m/=2) {
156: k = n-m;
157: for(j=1;j<=k;j++) {
158: for(ai = &a[j]; ai > a; ai -= m) {
159: aim = &ai[m];
160: if(aim < ai)
161: break; /*wraparound*/
162: if(aim->value > ai[0].value ||
163: aim->value == ai[0].value &&
164: aim->serial > ai[0].serial)
165: break;
166: w.value = ai[0].value;
167: ai[0].value = aim->value;
168: aim->value = w.value;
169: w.serial = ai[0].serial;
170: ai[0].serial = aim->serial;
171: aim->serial = w.serial;
172: }
173: }
174: }
175: }
176:
177: unsort(f, l, b)
178: struct line *f;
179: int *b;
180: {
181: register int *a;
182: register int i;
183: a = (int *)talloc((l+1)*sizeof(int));
184: for(i=1;i<=l;i++)
185: a[f[i].serial] = f[i].value;
186: for(i=1;i<=l;i++)
187: b[i] = a[i];
188: free((char *)a);
189: }
190:
191: filename(pa1, pa2)
192: char **pa1, **pa2;
193: {
194: register char *a1, *b1, *a2;
195: char buf[BUFSIZ];
196: struct stat stbuf;
197: int i, f;
198: a1 = *pa1;
199: a2 = *pa2;
200: if(stat(a1,&stbuf)!=-1 && ((stbuf.st_mode&S_IFMT)==S_IFDIR)) {
201: b1 = *pa1 = malloc(100);
202: while(*b1++ = *a1++) ;
203: b1[-1] = '/';
204: a1 = b1;
205: while(*a1++ = *a2++)
206: if(*a2 && *a2!='/' && a2[-1]=='/')
207: a1 = b1;
208: }
209: else if(a1[0]=='-'&&a1[1]==0&&tempfile==0) {
210: signal(SIGHUP,done);
211: signal(SIGINT,done);
212: signal(SIGPIPE,done);
213: signal(SIGTERM,done);
214: *pa1 = tempfile = mktemp("/tmp/dXXXXX");
215: if((f=creat(tempfile,0600)) < 0) {
216: mesg("cannot create ",tempfile);
217: done();
218: }
219: while((i=read(0,buf,BUFSIZ))>0)
220: write(f,buf,i);
221: close(f);
222: }
223: }
224:
225: prepare(i, arg)
226: char *arg;
227: {
228: register struct line *p;
229: register j,h;
230: if((input[i] = fopen(arg,"r")) == NULL){
231: mesg("cannot open ", arg);
232: done();
233: }
234: p = (struct line *)talloc(3*sizeof(line));
235: for(j=0; h=readhash(input[i]);) {
236: p = (struct line *)ralloc((char *)p,(++j+3)*sizeof(line));
237: p[j].value = h;
238: }
239: len[i] = j;
240: file[i] = p;
241: fclose(input[i]);
242: }
243:
244: prune()
245: {
246: register i,j;
247: for(pref=0;pref<len[0]&&pref<len[1]&&
248: file[0][pref+1].value==file[1][pref+1].value;
249: pref++ ) ;
250: for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&&
251: file[0][len[0]-suff].value==file[1][len[1]-suff].value;
252: suff++) ;
253: for(j=0;j<2;j++) {
254: sfile[j] = file[j]+pref;
255: slen[j] = len[j]-pref-suff;
256: for(i=0;i<=slen[j];i++)
257: sfile[j][i].serial = i;
258: }
259: }
260:
261: equiv(a,n,b,m,c)
262: struct line *a, *b;
263: int *c;
264: {
265: register int i, j;
266: i = j = 1;
267: while(i<=n && j<=m) {
268: if(a[i].value <b[j].value)
269: a[i++].value = 0;
270: else if(a[i].value == b[j].value)
271: a[i++].value = j;
272: else
273: j++;
274: }
275: while(i <= n)
276: a[i++].value = 0;
277: b[m+1].value = 0;
278: j = 0;
279: while(++j <= m) {
280: c[j] = -b[j].serial;
281: while(b[j+1].value == b[j].value) {
282: j++;
283: c[j] = b[j].serial;
284: }
285: }
286: c[j] = -1;
287: }
288:
289: main(argc, argv)
290: char **argv;
291: {
292: register int k;
293: char **args;
294:
295: args = argv;
296: if(argc>3 && *argv[1]=='-') {
297: argc--;
298: argv++;
299: for(k=1;argv[0][k];k++) {
300: switch(argv[0][k]) {
301: case 'e':
302: opt = -1;
303: break;
304: case 'f':
305: opt = 1;
306: break;
307: case 'b':
308: bflag = 1;
309: break;
310: case 'h':
311: execv("/usr/lib/diffh",args);
312: mesg("cannot find diffh",empty);
313: done();
314: }
315: }
316: }
317: if(argc!=3) {
318: mesg("arg count",empty);
319: done();
320: }
321: dummy = malloc(1);
322:
323: filename(&argv[1], &argv[2]);
324: filename(&argv[2], &argv[1]);
325: prepare(0, argv[1]);
326: prepare(1, argv[2]);
327: prune();
328: sort(sfile[0],slen[0]);
329: sort(sfile[1],slen[1]);
330:
331: member = (int *)file[1];
332: equiv(sfile[0], slen[0], sfile[1], slen[1], member);
333: member = (int *)ralloc((char *)member,(slen[1]+2)*sizeof(int));
334:
335: class = (int *)file[0];
336: unsort(sfile[0], slen[0], class);
337: class = (int *)ralloc((char *)class,(slen[0]+2)*sizeof(int));
338:
339: klist = (int *)talloc((slen[0]+2)*sizeof(int));
340: clist = (struct cand *)talloc(sizeof(cand));
341: k = stone(class, slen[0], member, klist);
342: free((char *)member);
343: free((char *)class);
344:
345: J = (int *)talloc((len[0]+2)*sizeof(int));
346: unravel(klist[k]);
347: free((char *)clist);
348: free((char *)klist);
349:
350: ixold = (long *)talloc((len[0]+2)*sizeof(long));
351: ixnew = (long *)talloc((len[1]+2)*sizeof(long));
352: check(argv);
353: output(argv);
354: status = anychange;
355: done();
356: }
357:
358: stone(a,n,b,c)
359: int *a;
360: int *b;
361: int *c;
362: {
363: register int i, k,y;
364: int j, l;
365: int oldc, tc;
366: int oldl;
367: k = 0;
368: c[0] = newcand(0,0,0);
369: for(i=1; i<=n; i++) {
370: j = a[i];
371: if(j==0)
372: continue;
373: y = -b[j];
374: oldl = 0;
375: oldc = c[0];
376: do {
377: if(y <= clist[oldc].y)
378: continue;
379: l = search(c, k, y);
380: if(l!=oldl+1)
381: oldc = c[l-1];
382: if(l<=k) {
383: if(clist[c[l]].y <= y)
384: continue;
385: tc = c[l];
386: c[l] = newcand(i,y,oldc);
387: oldc = tc;
388: oldl = l;
389: } else {
390: c[l] = newcand(i,y,oldc);
391: k++;
392: break;
393: }
394: } while((y=b[++j]) > 0);
395: }
396: return(k);
397: }
398:
399: newcand(x,y,pred)
400: {
401: register struct cand *q;
402: clist = (struct cand *)ralloc((char *)clist,++clen*sizeof(cand));
403: q = clist + clen -1;
404: q->x = x;
405: q->y = y;
406: q->pred = pred;
407: return(clen-1);
408: }
409:
410: search(c, k, y)
411: int *c;
412: {
413: register int i, j, l;
414: int t;
415: if(clist[c[k]].y<y) /*quick look for typical case*/
416: return(k+1);
417: i = 0;
418: j = k+1;
419: while((l=(i+j)/2) > i) {
420: t = clist[c[l]].y;
421: if(t > y)
422: j = l;
423: else if(t < y)
424: i = l;
425: else
426: return(l);
427: }
428: return(l+1);
429: }
430:
431: unravel(p)
432: {
433: register int i;
434: register struct cand *q;
435: for(i=0; i<=len[0]; i++)
436: J[i] = i<=pref ? i:
437: i>len[0]-suff ? i+len[1]-len[0]:
438: 0;
439: for(q=clist+p;q->y!=0;q=clist+q->pred)
440: J[q->x+pref] = q->y+pref;
441: }
442:
443: /* check does double duty:
444: 1. ferret out any fortuitous correspondences due
445: to confounding by hashing (which result in "jackpot")
446: 2. collect random access indexes to the two files */
447:
448: check(argv)
449: char **argv;
450: {
451: register int i, j;
452: int jackpot;
453: long ctold, ctnew;
454: char c,d;
455: input[0] = fopen(argv[1],"r");
456: input[1] = fopen(argv[2],"r");
457: j = 1;
458: ixold[0] = ixnew[0] = 0;
459: jackpot = 0;
460: ctold = ctnew = 0;
461: for(i=1;i<=len[0];i++) {
462: if(J[i]==0) {
463: ixold[i] = ctold += skipline(0);
464: continue;
465: }
466: while(j<J[i]) {
467: ixnew[j] = ctnew += skipline(1);
468: j++;
469: }
470: for(;;) {
471: c = getc(input[0]);
472: d = getc(input[1]);
473: ctold++;
474: ctnew++;
475: if(bflag && isspace(c) && isspace(d)) {
476: do {
477: if(c=='\n') break;
478: ctold++;
479: } while(isspace(c=getc(input[0])));
480: do {
481: if(d=='\n') break;
482: ctnew++;
483: } while(isspace(d=getc(input[1])));
484: }
485: if(c!=d) {
486: jackpot++;
487: J[i] = 0;
488: if(c!='\n')
489: ctold += skipline(0);
490: if(d!='\n')
491: ctnew += skipline(1);
492: break;
493: }
494: if(c=='\n')
495: break;
496: }
497: ixold[i] = ctold;
498: ixnew[j] = ctnew;
499: j++;
500: }
501: for(;j<=len[1];j++) {
502: ixnew[j] = ctnew += skipline(1);
503: }
504: fclose(input[0]);
505: fclose(input[1]);
506: /*
507: if(jackpot)
508: mesg("jackpot",empty);
509: */
510: }
511:
512: skipline(f)
513: {
514: register i;
515: for(i=1;getc(input[f])!='\n';i++) ;
516: return(i);
517: }
518:
519: output(argv)
520: char **argv;
521: {
522: int m;
523: register int i0, i1, j1;
524: int j0;
525: input[0] = fopen(argv[1],"r");
526: input[1] = fopen(argv[2],"r");
527: m = len[0];
528: J[0] = 0;
529: J[m+1] = len[1]+1;
530: if(opt!=-1) for(i0=1;i0<=m;i0=i1+1) {
531: while(i0<=m&&J[i0]==J[i0-1]+1) i0++;
532: j0 = J[i0-1]+1;
533: i1 = i0-1;
534: while(i1<m&&J[i1+1]==0) i1++;
535: j1 = J[i1+1]-1;
536: J[i1] = j1;
537: change(i0,i1,j0,j1);
538: } else for(i0=m;i0>=1;i0=i1-1) {
539: while(i0>=1&&J[i0]==J[i0+1]-1&&J[i0]!=0) i0--;
540: j0 = J[i0+1]-1;
541: i1 = i0+1;
542: while(i1>1&&J[i1-1]==0) i1--;
543: j1 = J[i1-1]+1;
544: J[i1] = j1;
545: change(i1,i0,j1,j0);
546: }
547: if(m==0)
548: change(1,0,1,len[1]);
549: }
550:
551: change(a,b,c,d)
552: {
553: if(a>b&&c>d) return;
554: anychange = 1;
555: if(opt!=1) {
556: range(a,b,",");
557: putchar(a>b?'a':c>d?'d':'c');
558: if(opt!=-1) range(c,d,",");
559: } else {
560: putchar(a>b?'a':c>d?'d':'c');
561: range(a,b," ");
562: }
563: putchar('\n');
564: if(opt==0) {
565: fetch(ixold,a,b,input[0],"< ");
566: if(a<=b&&c<=d) prints("---\n");
567: }
568: fetch(ixnew,c,d,input[1],opt==0?"> ":empty);
569: if(opt!=0&&c<=d) prints(".\n");
570: }
571:
572: range(a,b,separator)
573: char *separator;
574: {
575: printf("%d", a>b?b:a);
576: if(a<b) {
577: printf("%s%d", separator, b);
578: }
579: }
580:
581: fetch(f,a,b,lb,s)
582: long *f;
583: FILE *lb;
584: char *s;
585: {
586: register int i, j;
587: register int nc;
588: for(i=a;i<=b;i++) {
589: fseek(lb,f[i-1],0);
590: nc = f[i]-f[i-1];
591: prints(s);
592: for(j=0;j<nc;j++)
593: putchar(getc(lb));
594: }
595: }
596:
597: /* hashing has the effect of
598: * arranging line in 7-bit bytes and then
599: * summing 1-s complement in 16-bit hunks
600: */
601:
602: readhash(f)
603: FILE *f;
604: {
605: long sum;
606: register unsigned shift;
607: register space;
608: register t;
609: sum = 1;
610: space = 0;
611: if(!bflag) for(shift=0;(t=getc(f))!='\n';shift+=7) {
612: if(t==-1)
613: return(0);
614: sum += (long)t << (shift%=HALFLONG);
615: }
616: else for(shift=0;;) {
617: switch(t=getc(f)) {
618: case -1:
619: return(0);
620: case '\t':
621: case ' ':
622: space++;
623: continue;
624: default:
625: if(space) {
626: shift += 7;
627: space = 0;
628: }
629: sum += (long)t << (shift%=HALFLONG);
630: shift += 7;
631: continue;
632: case '\n':
633: break;
634: }
635: break;
636: }
637: sum = low(sum) + high(sum);
638: return((short)low(sum) + (short)high(sum));
639: }
640:
641: mesg(s,t)
642: char *s, *t;
643: {
644: fprintf(stderr,"diff: %s%s\n",s,t);
645: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.