|
|
1.1 root 1: static char sccsid[] = "@(#)diffreg.c 4.1 10/9/80";
2:
3: #include "diff.h"
4: /*
5: * diff - compare two files.
6: */
7:
8: /*
9: * Uses an algorithm due to Harold Stone, which finds
10: * a pair of longest identical subsequences in the two
11: * files.
12: *
13: * The major goal is to generate the match vector J.
14: * J[i] is the index of the line in file1 corresponding
15: * to line i file0. J[i] = 0 if there is no
16: * such line in file1.
17: *
18: * Lines are hashed so as to work in core. All potential
19: * matches are located by sorting the lines of each file
20: * on the hash (called ``value''). In particular, this
21: * collects the equivalence classes in file1 together.
22: * Subroutine equiv replaces the value of each line in
23: * file0 by the index of the first element of its
24: * matching equivalence in (the reordered) file1.
25: * To save space equiv squeezes file1 into a single
26: * array called "member" in which the equivalence classes
27: * are simply concatenated, except that their first
28: * members are flagged by changing sign.
29: *
30: * Next the indices that point into "member" are unsorted into
31: * array "class"(?) according to the original order of file0.
32: *
33: * The cleverness lies in routine stone. This marches
34: * through the lines of file0, developing a vector klist
35: * of "k-candidates". At step i a k-candidate is a matched
36: * pair of lines x,y (x in file0 y in file1) such that
37: * there is a common subsequence of length k
38: * between the first i lines of file0 and the first y
39: * lines of file1, but there is no such subsequence for
40: * any smaller y. x is the earliest possible mate to y
41: * that occurs in such a subsequence.
42: *
43: * Whenever any of the members of the equivalence class of
44: * lines in file1 matable to a line in file0 has serial number
45: * less than the y of some k-candidate, that k-candidate
46: * with the smallest such y is replaced. The new
47: * k-candidate is chained (via pred) to the current
48: * k-1 candidate so that the actual subsequence can
49: * be recovered. When a member has serial number greater
50: * that the y of all k-candidates, the klist is extended.
51: * At the end, the longest subsequence is pulled out
52: * and placed in the array J by unravel
53: *
54: * With J in hand, the matches there recorded are
55: * check'ed against reality to assure that no spurious
56: * matches have crept in due to hashing. If they have,
57: * they are broken, and "jackpot" is recorded--a harmless
58: * matter except that a true match for a spuriously
59: * mated line may now be unnecessarily reported as a change.
60: *
61: * Much of the complexity of the program comes simply
62: * from trying to minimize core utilization and
63: * maximize the range of doable problems by dynamically
64: * allocating what is needed and reusing what is not.
65: * The core requirements for problems larger than somewhat
66: * are (in words) 2*length(file0) + length(file1) +
67: * 3*(number of k-candidates installed), typically about
68: * 6n words for files of length n.
69: */
70:
71: #define prints(s) fputs(s,stdout)
72:
73: FILE *input[2];
74: FILE *fopen();
75:
76: struct cand {
77: int x;
78: int y;
79: int pred;
80: } cand;
81: struct line {
82: int serial;
83: int value;
84: } *file[2], line;
85: int len[2];
86: struct line *sfile[2]; /* shortened by pruning common prefix and suffix */
87: int slen[2];
88: int pref, suff; /* length of prefix and suffix */
89: int *class; /* will be overlaid on file[0] */
90: int *member; /* will be overlaid on file[1] */
91: int *klist; /* will be overlaid on file[0] after class */
92: struct cand *clist; /* merely a free storage pot for candidates */
93: int clen = 0;
94: int *J; /* will be overlaid on class */
95: long *ixold; /* will be overlaid on klist */
96: long *ixnew; /* will be overlaid on file[1] */
97:
98: diffreg()
99: {
100: register int k;
101:
102: if (hflag) {
103: diffargv[0] = "diffh";
104: execv(diffh, diffargv);
105: fprintf(stderr, "diff: ");
106: perror(diffh);
107: done();
108: }
109: dummy = malloc(1);
110: if ((stb1.st_mode & S_IFMT) == S_IFDIR)
111: file1 = splice(file1, file2);
112: else if ((stb2.st_mode & S_IFMT) == S_IFDIR)
113: file2 = splice(file2, file1);
114: else if (!strcmp(file1, "-")) {
115: if (!strcmp(file2, "-")) {
116: fprintf(stderr, "diff: can't specify - -\n");
117: done();
118: }
119: file1 = copytemp("/dev/stdin");
120: } else if (!strcmp(file2, "-"))
121: file2 = copytemp("/dev/stdin");
122: else {
123: if ((stb1.st_mode & S_IFMT) == S_IFCHR) /* buffer it */
124: file1 = copytemp(file1);
125: if ((stb2.st_mode & S_IFMT) == S_IFCHR)
126: file2 = copytemp(file2);
127: }
128: prepare(0, file1);
129: prepare(1, file2);
130: prune();
131: sort(sfile[0],slen[0]);
132: sort(sfile[1],slen[1]);
133:
134: member = (int *)file[1];
135: equiv(sfile[0], slen[0], sfile[1], slen[1], member);
136: member = (int *)ralloc((char *)member,(slen[1]+2)*sizeof(int));
137:
138: class = (int *)file[0];
139: unsort(sfile[0], slen[0], class);
140: class = (int *)ralloc((char *)class,(slen[0]+2)*sizeof(int));
141:
142: klist = (int *)talloc((slen[0]+2)*sizeof(int));
143: clist = (struct cand *)talloc(sizeof(cand));
144: k = stone(class, slen[0], member, klist);
145: free((char *)member);
146: free((char *)class);
147:
148: J = (int *)talloc((len[0]+2)*sizeof(int));
149: unravel(klist[k]);
150: free((char *)clist);
151: free((char *)klist);
152:
153: ixold = (long *)talloc((len[0]+2)*sizeof(long));
154: ixnew = (long *)talloc((len[1]+2)*sizeof(long));
155: check();
156: output();
157: status = anychange;
158: if (opt == D_CONTEXT && anychange == 0)
159: printf("No differences encountered\n");
160: done();
161: }
162:
163: char *
164: copytemp(fn) /* special dev output (inc. /dev/fd) gets saved here */
165: char *fn;
166: {
167: char buf[BUFSIZ];
168: char *temp;
169: char template[13]; /* temp file */
170: register int i, f, ifd;
171:
172: if ((ifd = open(fn, 0)) == -1) {
173: fprintf(stderr, "diff: cannot read %s\n", fn);
174: done();
175: }
176: signal(SIGHUP,done);
177: signal(SIGINT,done);
178: signal(SIGPIPE,done);
179: signal(SIGTERM,done);
180: strcpy(tempfile[whichtemp++],
181: temp=mktemp(strcpy(template, "/tmp/dXXXXXX")));
182: f = creat(temp,0600);
183: if (f < 0) {
184: fprintf(stderr, "diff: ");
185: perror(temp);
186: done();
187: }
188: while ((i = read(ifd,buf,BUFSIZ)) > 0)
189: if (write(f,buf,i) != i) {
190: fprintf(stderr, "diff: ");
191: perror(temp);
192: done();
193: }
194: close(f);
195: close(ifd);
196: return (tempfile[whichtemp-1]);
197: }
198:
199: char *
200: splice(dir, file)
201: char *dir, *file;
202: {
203: char *tail;
204: char buf[BUFSIZ];
205:
206: if (!strcmp(file, "-")) {
207: fprintf(stderr, "diff: can't specify - with other arg directory\n");
208: done();
209: }
210: tail = strrchr(file, '/');
211: if (tail == 0)
212: tail = file;
213: else
214: tail++;
215: sprintf(buf, "%s/%s", dir, tail);
216: return (savestr(buf));
217: }
218:
219: prepare(i, arg)
220: char *arg;
221: {
222: register struct line *p;
223: register j,h;
224: if((input[i] = fopen(arg,"r")) == NULL){
225: fprintf(stderr, "diff: ");
226: perror(arg);
227: done();
228: }
229: p = (struct line *)talloc(3*sizeof(line));
230: for(j=0; h=readhash(input[i]);) {
231: p = (struct line *)realloc((char *)p,(++j+3)*sizeof(line));
232: p[j].value = h;
233: }
234: len[i] = j;
235: file[i] = p;
236: fclose(input[i]);
237: }
238:
239: prune()
240: {
241: register i,j;
242: for(pref=0;pref<len[0]&&pref<len[1]&&
243: file[0][pref+1].value==file[1][pref+1].value;
244: pref++ ) ;
245: for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&&
246: file[0][len[0]-suff].value==file[1][len[1]-suff].value;
247: suff++) ;
248: for(j=0;j<2;j++) {
249: sfile[j] = file[j]+pref;
250: slen[j] = len[j]-pref-suff;
251: for(i=0;i<=slen[j];i++)
252: sfile[j][i].serial = i;
253: }
254: }
255:
256: equiv(a,n,b,m,c)
257: struct line *a, *b;
258: int *c;
259: {
260: register int i, j;
261: i = j = 1;
262: while(i<=n && j<=m) {
263: if(a[i].value <b[j].value)
264: a[i++].value = 0;
265: else if(a[i].value == b[j].value)
266: a[i++].value = j;
267: else
268: j++;
269: }
270: while(i <= n)
271: a[i++].value = 0;
272: b[m+1].value = 0;
273: j = 0;
274: while(++j <= m) {
275: c[j] = -b[j].serial;
276: while(b[j+1].value == b[j].value) {
277: j++;
278: c[j] = b[j].serial;
279: }
280: }
281: c[j] = -1;
282: }
283:
284: stone(a,n,b,c)
285: int *a;
286: int *b;
287: int *c;
288: {
289: register int i, k,y;
290: int j, l;
291: int oldc, tc;
292: int oldl;
293: k = 0;
294: c[0] = newcand(0,0,0);
295: for(i=1; i<=n; i++) {
296: j = a[i];
297: if(j==0)
298: continue;
299: y = -b[j];
300: oldl = 0;
301: oldc = c[0];
302: do {
303: if(y <= clist[oldc].y)
304: continue;
305: l = search(c, k, y);
306: if(l!=oldl+1)
307: oldc = c[l-1];
308: if(l<=k) {
309: if(clist[c[l]].y <= y)
310: continue;
311: tc = c[l];
312: c[l] = newcand(i,y,oldc);
313: oldc = tc;
314: oldl = l;
315: } else {
316: c[l] = newcand(i,y,oldc);
317: k++;
318: break;
319: }
320: } while((y=b[++j]) > 0);
321: }
322: return(k);
323: }
324:
325: newcand(x,y,pred)
326: {
327: register struct cand *q;
328: clist = (struct cand *)realloc((char *)clist,++clen*sizeof(cand));
329: q = clist + clen -1;
330: q->x = x;
331: q->y = y;
332: q->pred = pred;
333: return(clen-1);
334: }
335:
336: search(c, k, y)
337: int *c;
338: {
339: register int i, j, l;
340: int t;
341: if(clist[c[k]].y<y) /*quick look for typical case*/
342: return(k+1);
343: i = 0;
344: j = k+1;
345: while((l=(i+j)/2) > i) {
346: t = clist[c[l]].y;
347: if(t > y)
348: j = l;
349: else if(t < y)
350: i = l;
351: else
352: return(l);
353: }
354: return(l+1);
355: }
356:
357: unravel(p)
358: {
359: register int i;
360: register struct cand *q;
361: for(i=0; i<=len[0]; i++)
362: J[i] = i<=pref ? i:
363: i>len[0]-suff ? i+len[1]-len[0]:
364: 0;
365: for(q=clist+p;q->y!=0;q=clist+q->pred)
366: J[q->x+pref] = q->y+pref;
367: }
368:
369: /* check does double duty:
370: 1. ferret out any fortuitous correspondences due
371: to confounding by hashing (which result in "jackpot")
372: 2. collect random access indexes to the two files */
373:
374: check()
375: {
376: register int i, j;
377: int jackpot;
378: long ctold, ctnew;
379: char c,d;
380: input[0] = fopen(file1,"r");
381: input[1] = fopen(file2,"r");
382: j = 1;
383: ixold[0] = ixnew[0] = 0;
384: jackpot = 0;
385: ctold = ctnew = 0;
386: for(i=1;i<=len[0];i++) {
387: if(J[i]==0) {
388: ixold[i] = ctold += skipline(0);
389: continue;
390: }
391: while(j<J[i]) {
392: ixnew[j] = ctnew += skipline(1);
393: j++;
394: }
395: for(;;) {
396: c = getc(input[0]);
397: d = getc(input[1]);
398: ctold++;
399: ctnew++;
400: if(bflag==1 && isspace(c) && isspace(d)) {
401: do {
402: if(c=='\n') break;
403: ctold++;
404: } while(isspace(c=getc(input[0])));
405: do {
406: if(d=='\n') break;
407: ctnew++;
408: } while(isspace(d=getc(input[1])));
409: }
410: if(bflag==2 && isspace(c)) {
411: do {
412: if(c=='\n') break;
413: ctold++;
414: } while(isspace(c=getc(input[0])));
415: }
416: if(bflag==2 && isspace(d)) {
417: do {
418: if(d=='\n') break;
419: ctnew++;
420: } while(isspace(d=getc(input[1])));
421: }
422: if(c!=d) {
423: jackpot++;
424: J[i] = 0;
425: if(c!='\n')
426: ctold += skipline(0);
427: if(d!='\n')
428: ctnew += skipline(1);
429: break;
430: }
431: if(c=='\n')
432: break;
433: }
434: ixold[i] = ctold;
435: ixnew[j] = ctnew;
436: j++;
437: }
438: for(;j<=len[1];j++) {
439: ixnew[j] = ctnew += skipline(1);
440: }
441: fclose(input[0]);
442: fclose(input[1]);
443: /*
444: if(jackpot)
445: fprintf(stderr, "jackpot\n");
446: */
447: }
448:
449: sort(a,n) /*shellsort CACM #201*/
450: struct line *a;
451: {
452: struct line w;
453: register int j,m;
454: struct line *ai;
455: register struct line *aim;
456: int k;
457:
458: m = 0;
459: for(j=1;j<=n;j*= 2)
460: m = 2*j - 1;
461: for(m/=2;m!=0;m/=2) {
462: k = n-m;
463: for(j=1;j<=k;j++) {
464: for(ai = &a[j]; ai > a; ai -= m) {
465: aim = &ai[m];
466: if(aim < ai)
467: break; /*wraparound*/
468: if(aim->value > ai[0].value ||
469: aim->value == ai[0].value &&
470: aim->serial > ai[0].serial)
471: break;
472: w.value = ai[0].value;
473: ai[0].value = aim->value;
474: aim->value = w.value;
475: w.serial = ai[0].serial;
476: ai[0].serial = aim->serial;
477: aim->serial = w.serial;
478: }
479: }
480: }
481: }
482:
483: unsort(f, l, b)
484: struct line *f;
485: int *b;
486: {
487: register int *a;
488: register int i;
489: a = (int *)talloc((l+1)*sizeof(int));
490: for(i=1;i<=l;i++)
491: a[f[i].serial] = f[i].value;
492: for(i=1;i<=l;i++)
493: b[i] = a[i];
494: free((char *)a);
495: }
496:
497: skipline(f)
498: {
499: register i;
500: for(i=1;getc(input[f])!='\n';i++) ;
501: return(i);
502: }
503:
504: output()
505: {
506: int m;
507: register int i0, i1, j1;
508: int j0;
509: input[0] = fopen(file1,"r");
510: input[1] = fopen(file2,"r");
511: m = len[0];
512: J[0] = 0;
513: J[m+1] = len[1]+1;
514: if(opt!=D_EDIT) for(i0=1;i0<=m;i0=i1+1) {
515: while(i0<=m&&J[i0]==J[i0-1]+1) i0++;
516: j0 = J[i0-1]+1;
517: i1 = i0-1;
518: while(i1<m&&J[i1+1]==0) i1++;
519: j1 = J[i1+1]-1;
520: J[i1] = j1;
521: change(i0,i1,j0,j1);
522: } else for(i0=m;i0>=1;i0=i1-1) {
523: while(i0>=1&&J[i0]==J[i0+1]-1&&J[i0]!=0) i0--;
524: j0 = J[i0+1]-1;
525: i1 = i0+1;
526: while(i1>1&&J[i1-1]==0) i1--;
527: j1 = J[i1-1]+1;
528: J[i1] = j1;
529: change(i1,i0,j1,j0);
530: }
531: if(m==0)
532: change(1,0,1,len[1]);
533: }
534:
535: /* indicate that there is a difference between lines a and b of the from file
536: to get to lines c to d of the to file.
537: If a is greater then b then there are no lines in the from file involved
538: and this means that there were lines appended (beginning at b).
539: If c is greater than d then there are lines missing from the to file.
540: */
541: change(a,b,c,d)
542: {
543: char ch;
544: int lowa,upb,lowc,upd;
545: struct stat stbuf;
546:
547: if (a>b && c>d)
548: return;
549: if (anychange == 0) {
550: anychange = 1;
551: if(opt == D_CONTEXT) {
552: printf("*** %s ", file1);
553: stat(file1, &stbuf);
554: printf("%s--- %s ",
555: ctime(&stbuf.st_mtime), file2);
556: stat(file2, &stbuf);
557: printf("%s", ctime(&stbuf.st_mtime));
558: }
559: }
560: if (a <= b && c <= d)
561: ch = 'c';
562: else
563: ch = (a <= b) ? 'd' : 'a';
564: if(opt == D_CONTEXT) {
565: lowa = max(1, a-context);
566: upb = min(len[0], b+context);
567: lowc = max(1, c-context);
568: upd = min(len[1], d+context);
569:
570: /* print out from file */
571: printf("***************\n*** ");
572: range(lowa,upb,",");
573: printf("\n");
574: if (ch == 'a')
575: fetch(ixold,lowa,upb,input[0]," ");
576: else {
577: fetch(ixold,lowa,a-1,input[0]," ");
578: fetch(ixold,a,b,input[0],ch == 'c' ? "! " : "- ");
579: fetch(ixold,b+1,upb,input[0]," ");
580: }
581: putchar('\n');
582: printf("--- ");
583: range(lowc,upd,",");
584: printf(" -----\n");
585: if (ch == 'd')
586: fetch(ixnew,lowc,upd,input[1]," ");
587: else {
588: fetch(ixnew,lowc,c-1,input[1]," ");
589: fetch(ixnew,c,d,input[1],ch == 'c' ? "! " : "+ ");
590: fetch(ixnew,d+1,upd,input[1]," ");
591: }
592: return;
593: }
594: switch (opt) {
595:
596: case D_NORMAL:
597: case D_EDIT:
598: range(a,b,",");
599: putchar(a>b?'a':c>d?'d':'c');
600: if(opt==D_NORMAL)
601: range(c,d,",");
602: putchar('\n');
603: break;
604: case D_REVERSE:
605: putchar(a>b?'a':c>d?'d':'c');
606: range(a,b," ");
607: putchar('\n');
608: break;
609: }
610: if(opt == D_NORMAL) {
611: fetch(ixold,a,b,input[0],"< ", 1);
612: if(a<=b&&c<=d)
613: prints("---\n");
614: }
615: fetch(ixnew,c,d,input[1],opt==D_NORMAL?"> ":"", 0);
616: if ((opt ==D_EDIT || opt == D_REVERSE) && c<=d)
617: prints(".\n");
618: }
619:
620: range(a,b,separator)
621: char *separator;
622: {
623: printf("%d", a>b?b:a);
624: if(a<b) {
625: printf("%s%d", separator, b);
626: }
627: }
628:
629: fetch(f,a,b,lb,s,oldfile)
630: long *f;
631: FILE *lb;
632: char *s;
633: {
634: register int i, j;
635: register int nc;
636:
637: if (a > b)
638: return;
639: for(i=a;i<=b;i++) {
640: fseek(lb,f[i-1],0);
641: nc = f[i]-f[i-1];
642: prints(s);
643: for(j=0;j<nc;j++)
644: putchar(getc(lb));
645: }
646: }
647:
648: #define HALFLONG 16
649: #define low(x) (x&((1L<<HALFLONG)-1))
650: #define high(x) (x>>HALFLONG)
651:
652: /*
653: * hashing has the effect of
654: * arranging line in 7-bit bytes and then
655: * summing 1-s complement in 16-bit hunks
656: */
657: readhash(f)
658: FILE *f;
659: {
660: long sum;
661: register unsigned shift;
662: register space;
663: register t;
664: sum = 1;
665: space = 0;
666: if(bflag==0) for(shift=0;(t=getc(f))!='\n';shift+=7) {
667: if(t==-1)
668: return(0);
669: sum += (long)t << (shift &= (HALFLONG-1));
670: }
671: else if(bflag==1) for(shift=0;;) {
672: switch(t=getc(f)) {
673: case -1:
674: return(0);
675: case '\t':
676: case ' ':
677: space++;
678: continue;
679: default:
680: if(space) {
681: shift += 7;
682: space = 0;
683: }
684: sum += (long)t << (shift &= (HALFLONG-1));
685: shift += 7;
686: continue;
687: case '\n':
688: break;
689: }
690: break;
691: }
692: else for(shift=0;;) { /* bflag==2 */
693: switch(t=getc(f)) {
694: case -1:
695: return(0);
696: case '\t':
697: case ' ':
698: continue;
699: default:
700: sum += (long)t << (shift &= (HALFLONG-1));
701: shift += 7;
702: continue;
703: case '\n':
704: break;
705: }
706: break;
707: }
708: sum = low(sum) + high(sum);
709: return((short)low(sum) + (short)high(sum));
710: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.