|
|
1.1 root 1: static char sccsid[] = "@(#)diffreg.c 4.1 10/9/80";
2:
3: #include "diff.h"
4: /*
5: * diff - compare two files.
6: */
7:
8: /*
9: * Uses an algorithm due to Harold Stone, which finds
10: * a pair of longest identical subsequences in the two
11: * files.
12: *
13: * The major goal is to generate the match vector J.
14: * J[i] is the index of the line in file1 corresponding
15: * to line i file0. J[i] = 0 if there is no
16: * such line in file1.
17: *
18: * Lines are hashed so as to work in core. All potential
19: * matches are located by sorting the lines of each file
20: * on the hash (called ``value''). In particular, this
21: * collects the equivalence classes in file1 together.
22: * Subroutine equiv replaces the value of each line in
23: * file0 by the index of the first element of its
24: * matching equivalence in (the reordered) file1.
25: * To save space equiv squeezes file1 into a single
26: * array member in which the equivalence classes
27: * are simply concatenated, except that their first
28: * members are flagged by changing sign.
29: *
30: * Next the indices that point into member are unsorted into
31: * array class according to the original order of file0.
32: *
33: * The cleverness lies in routine stone. This marches
34: * through the lines of file0, developing a vector klist
35: * of "k-candidates". At step i a k-candidate is a matched
36: * pair of lines x,y (x in file0 y in file1) such that
37: * there is a common subsequence of length k
38: * between the first i lines of file0 and the first y
39: * lines of file1, but there is no such subsequence for
40: * any smaller y. x is the earliest possible mate to y
41: * that occurs in such a subsequence.
42: *
43: * Whenever any of the members of the equivalence class of
44: * lines in file1 matable to a line in file0 has serial number
45: * less than the y of some k-candidate, that k-candidate
46: * with the smallest such y is replaced. The new
47: * k-candidate is chained (via pred) to the current
48: * k-1 candidate so that the actual subsequence can
49: * be recovered. When a member has serial number greater
50: * that the y of all k-candidates, the klist is extended.
51: * At the end, the longest subsequence is pulled out
52: * and placed in the array J by unravel
53: *
54: * With J in hand, the matches there recorded are
55: * check'ed against reality to assure that no spurious
56: * matches have crept in due to hashing. If they have,
57: * they are broken, and "jackpot" is recorded--a harmless
58: * matter except that a true match for a spuriously
59: * mated line may now be unnecessarily reported as a change.
60: *
61: * Much of the complexity of the program comes simply
62: * from trying to minimize core utilization and
63: * maximize the range of doable problems by dynamically
64: * allocating what is needed and reusing what is not.
65: * The core requirements for problems larger than somewhat
66: * are (in words) 2*length(file0) + length(file1) +
67: * 3*(number of k-candidates installed), typically about
68: * 6n words for files of length n.
69: */
70:
71: #define prints(s) fputs(s,stdout)
72:
73: FILE *input[2];
74: FILE *fopen();
75:
76: struct cand {
77: int x;
78: int y;
79: int pred;
80: } cand;
81: struct line {
82: int serial;
83: int value;
84: } *file[2], line;
85: int len[2];
86: struct line *sfile[2]; /* shortened by pruning common prefix and suffix */
87: int slen[2];
88: int pref, suff; /* length of prefix and suffix */
89: int *class; /* will be overlaid on file[0] */
90: int *member; /* will be overlaid on file[1] */
91: int *klist; /* will be overlaid on file[0] after class */
92: struct cand *clist; /* merely a free storage pot for candidates */
93: int clen = 0;
94: int *J; /* will be overlaid on class */
95: long *ixold; /* will be overlaid on klist */
96: long *ixnew; /* will be overlaid on file[1] */
97:
98: diffreg()
99: {
100: register int k;
101:
102: if (hflag) {
103: diffargv[0] = "diffh";
104: execv(diffh, diffargv);
105: fprintf(stderr, "diff: ");
106: perror(diffh);
107: done();
108: }
109: dummy = malloc(1);
110: if ((stb1.st_mode & S_IFMT) == S_IFDIR)
111: file1 = splice(file1, file2);
112: else if ((stb2.st_mode & S_IFMT) == S_IFDIR)
113: file2 = splice(file2, file1);
114: else if (!strcmp(file1, "-")) {
115: if (!strcmp(file2, "-")) {
116: fprintf(stderr, "diff: can't specify - -\n");
117: done();
118: }
119: file1 = copytemp();
120: } else if (!strcmp(file2, "-"))
121: file2 = copytemp();
122: prepare(0, file1);
123: prepare(1, file2);
124: prune();
125: sort(sfile[0],slen[0]);
126: sort(sfile[1],slen[1]);
127:
128: member = (int *)file[1];
129: equiv(sfile[0], slen[0], sfile[1], slen[1], member);
130: member = (int *)ralloc((char *)member,(slen[1]+2)*sizeof(int));
131:
132: class = (int *)file[0];
133: unsort(sfile[0], slen[0], class);
134: class = (int *)ralloc((char *)class,(slen[0]+2)*sizeof(int));
135:
136: klist = (int *)talloc((slen[0]+2)*sizeof(int));
137: clist = (struct cand *)talloc(sizeof(cand));
138: k = stone(class, slen[0], member, klist);
139: free((char *)member);
140: free((char *)class);
141:
142: J = (int *)talloc((len[0]+2)*sizeof(int));
143: unravel(klist[k]);
144: free((char *)clist);
145: free((char *)klist);
146:
147: ixold = (long *)talloc((len[0]+2)*sizeof(long));
148: ixnew = (long *)talloc((len[1]+2)*sizeof(long));
149: check();
150: output();
151: status = anychange;
152: if (opt == D_CONTEXT && anychange == 0)
153: printf("No differences encountered\n");
154: done();
155: }
156:
157: char *
158: copytemp()
159: {
160: char buf[BUFSIZ];
161: register int i, f;
162:
163: signal(SIGHUP,done);
164: signal(SIGINT,done);
165: signal(SIGPIPE,done);
166: signal(SIGTERM,done);
167: tempfile = mktemp("/tmp/dXXXXX");
168: f = creat(tempfile,0600);
169: if (f < 0) {
170: fprintf("diff: ");
171: perror(tempfile);
172: done();
173: }
174: while ((i = read(0,buf,BUFSIZ)) > 0)
175: if (write(f,buf,i) != i) {
176: fprintf(stderr, "diff: ");
177: perror(tempfile);
178: done();
179: }
180: close(f);
181: return (tempfile);
182: }
183:
184: char *
185: splice(dir, file)
186: char *dir, *file;
187: {
188: char *tail;
189: char buf[BUFSIZ];
190:
191: if (!strcmp(file, "-")) {
192: fprintf(stderr, "diff: can't specify - with other arg directory\n");
193: done();
194: }
195: tail = rindex(file, '/');
196: if (tail == 0)
197: tail = file;
198: else
199: tail++;
200: sprintf(buf, "%s/%s", dir, tail);
201: return (savestr(buf));
202: }
203:
204: prepare(i, arg)
205: char *arg;
206: {
207: register struct line *p;
208: register j,h;
209: if((input[i] = fopen(arg,"r")) == NULL){
210: fprintf(stderr, "diff: ");
211: perror(arg);
212: done();
213: }
214: p = (struct line *)talloc(3*sizeof(line));
215: for(j=0; h=readhash(input[i]);) {
216: p = (struct line *)ralloc((char *)p,(++j+3)*sizeof(line));
217: p[j].value = h;
218: }
219: len[i] = j;
220: file[i] = p;
221: fclose(input[i]);
222: }
223:
224: prune()
225: {
226: register i,j;
227: for(pref=0;pref<len[0]&&pref<len[1]&&
228: file[0][pref+1].value==file[1][pref+1].value;
229: pref++ ) ;
230: for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&&
231: file[0][len[0]-suff].value==file[1][len[1]-suff].value;
232: suff++) ;
233: for(j=0;j<2;j++) {
234: sfile[j] = file[j]+pref;
235: slen[j] = len[j]-pref-suff;
236: for(i=0;i<=slen[j];i++)
237: sfile[j][i].serial = i;
238: }
239: }
240:
241: equiv(a,n,b,m,c)
242: struct line *a, *b;
243: int *c;
244: {
245: register int i, j;
246: i = j = 1;
247: while(i<=n && j<=m) {
248: if(a[i].value <b[j].value)
249: a[i++].value = 0;
250: else if(a[i].value == b[j].value)
251: a[i++].value = j;
252: else
253: j++;
254: }
255: while(i <= n)
256: a[i++].value = 0;
257: b[m+1].value = 0;
258: j = 0;
259: while(++j <= m) {
260: c[j] = -b[j].serial;
261: while(b[j+1].value == b[j].value) {
262: j++;
263: c[j] = b[j].serial;
264: }
265: }
266: c[j] = -1;
267: }
268:
269: stone(a,n,b,c)
270: int *a;
271: int *b;
272: int *c;
273: {
274: register int i, k,y;
275: int j, l;
276: int oldc, tc;
277: int oldl;
278: k = 0;
279: c[0] = newcand(0,0,0);
280: for(i=1; i<=n; i++) {
281: j = a[i];
282: if(j==0)
283: continue;
284: y = -b[j];
285: oldl = 0;
286: oldc = c[0];
287: do {
288: if(y <= clist[oldc].y)
289: continue;
290: l = search(c, k, y);
291: if(l!=oldl+1)
292: oldc = c[l-1];
293: if(l<=k) {
294: if(clist[c[l]].y <= y)
295: continue;
296: tc = c[l];
297: c[l] = newcand(i,y,oldc);
298: oldc = tc;
299: oldl = l;
300: } else {
301: c[l] = newcand(i,y,oldc);
302: k++;
303: break;
304: }
305: } while((y=b[++j]) > 0);
306: }
307: return(k);
308: }
309:
310: newcand(x,y,pred)
311: {
312: register struct cand *q;
313: clist = (struct cand *)ralloc((char *)clist,++clen*sizeof(cand));
314: q = clist + clen -1;
315: q->x = x;
316: q->y = y;
317: q->pred = pred;
318: return(clen-1);
319: }
320:
321: search(c, k, y)
322: int *c;
323: {
324: register int i, j, l;
325: int t;
326: if(clist[c[k]].y<y) /*quick look for typical case*/
327: return(k+1);
328: i = 0;
329: j = k+1;
330: while((l=(i+j)/2) > i) {
331: t = clist[c[l]].y;
332: if(t > y)
333: j = l;
334: else if(t < y)
335: i = l;
336: else
337: return(l);
338: }
339: return(l+1);
340: }
341:
342: unravel(p)
343: {
344: register int i;
345: register struct cand *q;
346: for(i=0; i<=len[0]; i++)
347: J[i] = i<=pref ? i:
348: i>len[0]-suff ? i+len[1]-len[0]:
349: 0;
350: for(q=clist+p;q->y!=0;q=clist+q->pred)
351: J[q->x+pref] = q->y+pref;
352: }
353:
354: /* check does double duty:
355: 1. ferret out any fortuitous correspondences due
356: to confounding by hashing (which result in "jackpot")
357: 2. collect random access indexes to the two files */
358:
359: check()
360: {
361: register int i, j;
362: int jackpot;
363: long ctold, ctnew;
364: char c,d;
365: input[0] = fopen(file1,"r");
366: input[1] = fopen(file2,"r");
367: j = 1;
368: ixold[0] = ixnew[0] = 0;
369: jackpot = 0;
370: ctold = ctnew = 0;
371: for(i=1;i<=len[0];i++) {
372: if(J[i]==0) {
373: ixold[i] = ctold += skipline(0);
374: continue;
375: }
376: while(j<J[i]) {
377: ixnew[j] = ctnew += skipline(1);
378: j++;
379: }
380: for(;;) {
381: c = getc(input[0]);
382: d = getc(input[1]);
383: ctold++;
384: ctnew++;
385: if(bflag && isspace(c) && isspace(d)) {
386: do {
387: if(c=='\n') break;
388: ctold++;
389: } while(isspace(c=getc(input[0])));
390: do {
391: if(d=='\n') break;
392: ctnew++;
393: } while(isspace(d=getc(input[1])));
394: }
395: if(c!=d) {
396: jackpot++;
397: J[i] = 0;
398: if(c!='\n')
399: ctold += skipline(0);
400: if(d!='\n')
401: ctnew += skipline(1);
402: break;
403: }
404: if(c=='\n')
405: break;
406: }
407: ixold[i] = ctold;
408: ixnew[j] = ctnew;
409: j++;
410: }
411: for(;j<=len[1];j++) {
412: ixnew[j] = ctnew += skipline(1);
413: }
414: fclose(input[0]);
415: fclose(input[1]);
416: /*
417: if(jackpot)
418: fprintf(stderr, "jackpot\n");
419: */
420: }
421:
422: sort(a,n) /*shellsort CACM #201*/
423: struct line *a;
424: {
425: struct line w;
426: register int j,m;
427: struct line *ai;
428: register struct line *aim;
429: int k;
430: for(j=1;j<=n;j*= 2)
431: m = 2*j - 1;
432: for(m/=2;m!=0;m/=2) {
433: k = n-m;
434: for(j=1;j<=k;j++) {
435: for(ai = &a[j]; ai > a; ai -= m) {
436: aim = &ai[m];
437: if(aim < ai)
438: break; /*wraparound*/
439: if(aim->value > ai[0].value ||
440: aim->value == ai[0].value &&
441: aim->serial > ai[0].serial)
442: break;
443: w.value = ai[0].value;
444: ai[0].value = aim->value;
445: aim->value = w.value;
446: w.serial = ai[0].serial;
447: ai[0].serial = aim->serial;
448: aim->serial = w.serial;
449: }
450: }
451: }
452: }
453:
454: unsort(f, l, b)
455: struct line *f;
456: int *b;
457: {
458: register int *a;
459: register int i;
460: a = (int *)talloc((l+1)*sizeof(int));
461: for(i=1;i<=l;i++)
462: a[f[i].serial] = f[i].value;
463: for(i=1;i<=l;i++)
464: b[i] = a[i];
465: free((char *)a);
466: }
467:
468: skipline(f)
469: {
470: register i;
471: for(i=1;getc(input[f])!='\n';i++) ;
472: return(i);
473: }
474:
475: output()
476: {
477: int m;
478: register int i0, i1, j1;
479: int j0;
480: input[0] = fopen(file1,"r");
481: input[1] = fopen(file2,"r");
482: m = len[0];
483: J[0] = 0;
484: J[m+1] = len[1]+1;
485: if(opt!=D_EDIT) for(i0=1;i0<=m;i0=i1+1) {
486: while(i0<=m&&J[i0]==J[i0-1]+1) i0++;
487: j0 = J[i0-1]+1;
488: i1 = i0-1;
489: while(i1<m&&J[i1+1]==0) i1++;
490: j1 = J[i1+1]-1;
491: J[i1] = j1;
492: change(i0,i1,j0,j1);
493: } else for(i0=m;i0>=1;i0=i1-1) {
494: while(i0>=1&&J[i0]==J[i0+1]-1&&J[i0]!=0) i0--;
495: j0 = J[i0+1]-1;
496: i1 = i0+1;
497: while(i1>1&&J[i1-1]==0) i1--;
498: j1 = J[i1-1]+1;
499: J[i1] = j1;
500: change(i1,i0,j1,j0);
501: }
502: if(m==0)
503: change(1,0,1,len[1]);
504: if (opt==D_IFDEF) {
505: for (;;) {
506: #define c i0
507: c = getc(input[0]);
508: if (c < 0)
509: return;
510: putchar(c);
511: }
512: #undef c
513: }
514: }
515:
516: /* indicate that there is a difference between lines a and b of the from file
517: to get to lines c to d of the to file.
518: If a is greater then b then there are no lines in the from file involved
519: and this means that there were lines appended (beginning at b).
520: If c is greater than d then there are lines missing from the to file.
521: */
522: change(a,b,c,d)
523: {
524: char ch;
525: int lowa,upb,lowc,upd;
526: struct stat stbuf;
527:
528: if (opt != D_IFDEF && a>b && c>d)
529: return;
530: if (anychange == 0) {
531: anychange = 1;
532: if(opt == D_CONTEXT) {
533: printf("*** %s ", file1);
534: stat(file1, &stbuf);
535: printf("%s--- %s ",
536: ctime(&stbuf.st_mtime), file2);
537: stat(file2, &stbuf);
538: printf("%s", ctime(&stbuf.st_mtime));
539: }
540: }
541: if (a <= b && c <= d)
542: ch = 'c';
543: else
544: ch = (a <= b) ? 'd' : 'a';
545: if(opt == D_CONTEXT) {
546: lowa = max(1, a-context);
547: upb = min(len[0], b+context);
548: lowc = max(1, c-context);
549: upd = min(len[1], d+context);
550:
551: /* print out from file */
552: printf("***************\n*** ");
553: range(lowa,upb,",");
554: printf("\n");
555: if (ch == 'a')
556: fetch(ixold,lowa,upb,input[0]," ");
557: else {
558: fetch(ixold,lowa,a-1,input[0]," ");
559: fetch(ixold,a,b,input[0],ch == 'c' ? "! " : "- ");
560: fetch(ixold,b+1,upb,input[0]," ");
561: }
562: putchar('\n');
563: printf("--- ");
564: range(lowc,upd,",");
565: printf(" -----\n");
566: if (ch == 'd')
567: fetch(ixnew,lowc,upd,input[1]," ");
568: else {
569: fetch(ixnew,lowc,c-1,input[1]," ");
570: fetch(ixnew,c,d,input[1],ch == 'c' ? "! " : "+ ");
571: fetch(ixnew,d+1,upd,input[1]," ");
572: }
573: return;
574: }
575: switch (opt) {
576:
577: case D_NORMAL:
578: case D_EDIT:
579: range(a,b,",");
580: putchar(a>b?'a':c>d?'d':'c');
581: if(opt==D_NORMAL)
582: range(c,d,",");
583: putchar('\n');
584: break;
585: case D_REVERSE:
586: putchar(a>b?'a':c>d?'d':'c');
587: range(a,b," ");
588: putchar('\n');
589: break;
590: }
591: if(opt == D_NORMAL || opt == D_IFDEF) {
592: fetch(ixold,a,b,input[0],"< ", 1);
593: if(a<=b&&c<=d && opt == D_NORMAL)
594: prints("---\n");
595: }
596: fetch(ixnew,c,d,input[1],opt==D_NORMAL?"> ":"", 0);
597: if ((opt ==D_EDIT || opt == D_REVERSE) && c<=d)
598: prints(".\n");
599: if (inifdef) {
600: fprintf(stdout, "#endif %s\n", endifname);
601: inifdef = 0;
602: }
603: }
604:
605: range(a,b,separator)
606: char *separator;
607: {
608: printf("%d", a>b?b:a);
609: if(a<b) {
610: printf("%s%d", separator, b);
611: }
612: }
613:
614: fetch(f,a,b,lb,s,oldfile)
615: long *f;
616: FILE *lb;
617: char *s;
618: {
619: register int i, j;
620: register int nc;
621: int oneflag = (*ifdef1!='\0') != (*ifdef2!='\0');
622:
623: /*
624: * When doing #ifdef's, copy down to current line
625: * if this is the first file, so that stuff makes it to output.
626: */
627: if (opt == D_IFDEF && oldfile){
628: long curpos = ftell(lb);
629: /* print through if append (a>b), else to (nb: 0 vs 1 orig) */
630: nc = f[a>b? b : a-1 ] - curpos;
631: for (i = 0; i < nc; i++)
632: putchar(getc(lb));
633: }
634: if (a > b)
635: return;
636: if (opt == D_IFDEF) {
637: if (inifdef)
638: fprintf(stdout, "#else %s%s\n", oneflag && oldfile==1 ? "!" : "", ifdef2);
639: else {
640: if (oneflag) {
641: /* There was only one ifdef given */
642: endifname = ifdef2;
643: if (oldfile)
644: fprintf(stdout, "#ifndef %s\n", endifname);
645: else
646: fprintf(stdout, "#ifdef %s\n", endifname);
647: }
648: else {
649: endifname = oldfile ? ifdef1 : ifdef2;
650: fprintf(stdout, "#ifdef %s\n", endifname);
651: }
652: }
653: inifdef = 1+oldfile;
654: }
655: for(i=a;i<=b;i++) {
656: fseek(lb,f[i-1],0);
657: nc = f[i]-f[i-1];
658: if (opt != D_IFDEF)
659: prints(s);
660: for(j=0;j<nc;j++)
661: putchar(getc(lb));
662: }
663: if (inifdef && !wantelses) {
664: fprintf(stdout, "#endif %s\n", endifname);
665: inifdef = 0;
666: }
667: }
668:
669: #define HALFLONG 16
670: #define low(x) (x&((1L<<HALFLONG)-1))
671: #define high(x) (x>>HALFLONG)
672:
673: /*
674: * hashing has the effect of
675: * arranging line in 7-bit bytes and then
676: * summing 1-s complement in 16-bit hunks
677: */
678: readhash(f)
679: FILE *f;
680: {
681: long sum;
682: register unsigned shift;
683: register space;
684: register t;
685: sum = 1;
686: space = 0;
687: if(!bflag) for(shift=0;(t=getc(f))!='\n';shift+=7) {
688: if(t==-1)
689: return(0);
690: sum += (long)t << (shift%=HALFLONG);
691: }
692: else for(shift=0;;) {
693: switch(t=getc(f)) {
694: case -1:
695: return(0);
696: case '\t':
697: case ' ':
698: space++;
699: continue;
700: default:
701: if(space) {
702: shift += 7;
703: space = 0;
704: }
705: sum += (long)t << (shift%=HALFLONG);
706: shift += 7;
707: continue;
708: case '\n':
709: break;
710: }
711: break;
712: }
713: sum = low(sum) + high(sum);
714: return((short)low(sum) + (short)high(sum));
715: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.