|
|
1.1 root 1: #define DEBUG
2:
3: #include "awk.h"
4: #include <ctype.h>
5: #include <stdio.h>
6: #include "y.tab.h"
7:
8: #define HAT (NCHARS-1) /* matches ^ in regular expr */
9: /* NCHARS is 2**n */
10: #define MAXLIN 256
11:
12: #define type(v) (v)->nobj
13: #define left(v) (v)->narg[0]
14: #define right(v) (v)->narg[1]
15: #define parent(v) (v)->nnext
16:
17: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
18: #define UNARY case STAR: case PLUS: case QUEST:
19:
20: /* encoding in tree Nodes:
21: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
22: left is index, right contains value or pointer to value
23: unary (STAR, PLUS, QUEST): left is child, right is null
24: binary (CAT, OR): left and right are children
25: parent contains pointer to parent
26: */
27:
28:
29: uchar chars[MAXLIN];
30: int setvec[MAXLIN];
31: int tmpset[MAXLIN];
32: Node *point[MAXLIN];
33:
34: int rtok; /* next token in current re */
35: int rlxval;
36: uchar *rlxstr;
37: uchar *prestr; /* current position in current re */
38: uchar *lastre; /* origin of last re */
39:
40: static int setcnt;
41: static int poscnt;
42:
43: uchar *patbeg;
44: int patlen;
45:
46: #define NFA 20 /* cache this many dynamic fa's */
47: fa *fatab[NFA];
48: int nfatab = 0; /* entries in fatab */
49: fa *mkdfa();
50:
51: fa *makedfa(s, anchor) /* returns dfa for reg expr s */
52: uchar *s;
53: int anchor;
54: {
55: int i, use, nuse;
56: fa *pfa;
57:
58: if (compile_time) /* a constant for sure */
59: return mkdfa(s, anchor);
60: for (i = 0; i < nfatab; i++) /* is it there already? */
61: if (fatab[i]->anchor == anchor && strcmp(fatab[i]->restr,s) == 0) {
62: fatab[i]->use++;
63: return fatab[i];
64: }
65: pfa = mkdfa(s, anchor);
66: if (nfatab < NFA) { /* room for another */
67: fatab[nfatab] = pfa;
68: fatab[nfatab]->use = 1;
69: nfatab++;
70: return pfa;
71: }
72: use = fatab[0]->use; /* replace least-recently used */
73: nuse = 0;
74: for (i = 1; i < nfatab; i++)
75: if (fatab[i]->use < use) {
76: use = fatab[i]->use;
77: nuse = i;
78: }
79: freefa(fatab[nuse]);
80: fatab[nuse] = pfa;
81: pfa->use = 1;
82: return pfa;
83: }
84:
85: fa *mkdfa(s, anchor) /* does the real work of making a dfa */
86: uchar *s;
87: int anchor; /* anchor = 1 for anchored matches, else 0 */
88: {
89: Node *p, *p1, *reparse();
90: fa *f;
91:
92: p = reparse(s);
93: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
94: /* put ALL STAR in front of reg. exp. */
95: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
96: /* put FINAL after reg. exp. */
97:
98: poscnt = 0;
99: penter(p1); /* enter parent pointers and leaf indices */
100: if ((f = (fa *) Calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
101: overflo("no room for fa");
102: f->accept = poscnt-1; /* penter has computed number of positions in re */
103: cfoll(f, p1); /* set up follow sets */
104: freetr(p1);
105: if ((f->posns[0] = (int *) Calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
106: overflo("out of space in makedfa");
107: if ((f->posns[1] = (int *) Calloc(1, sizeof(int))) == NULL)
108: overflo("out of space in makedfa");
109: *f->posns[1] = 0;
110: f->initstat = makeinit(f, anchor);
111: f->reset = 0;
112: f->anchor = anchor;
113: f->restr = tostring(s);
114: return f;
115: }
116:
117: int makeinit(f, anchor)
118: fa *f;
119: int anchor;
120: {
121: register i, k;
122:
123: f->curstat = 2;
124: f->out[2] = 0;
125: k = *(f->re[0].lfollow);
126: xfree(f->posns[2]);
127: if ((f->posns[2] = (int *) Calloc(1, (k+1)*sizeof(int))) == NULL)
128: overflo("out of space in makeinit");
129: for (i=0; i<=k; i++) {
130: (f->posns[2])[i] = (f->re[0].lfollow)[i];
131: }
132: if ((f->posns[2])[1] == f->accept)
133: f->out[2] = 1;
134: for (i=0; i<NCHARS; i++)
135: f->gototab[2][i] = 0;
136: f->curstat = cgoto(f, 2, HAT);
137: if (anchor) {
138: *f->posns[2] = k-1; /* leave out position 0 */
139: for (i=0; i<k; i++) {
140: (f->posns[0])[i] = (f->posns[2])[i];
141: }
142:
143: f->out[0] = f->out[2];
144: if (f->curstat != 2)
145: --(*f->posns[f->curstat]);
146: }
147: return f->curstat;
148: }
149:
150: penter(p) /* set up parent pointers and leaf indices */
151: Node *p;
152: {
153: switch(type(p)) {
154: LEAF
155: left(p) = (Node *) poscnt;
156: point[poscnt++] = p;
157: break;
158: UNARY
159: penter(left(p));
160: parent(left(p)) = p;
161: break;
162: case CAT:
163: case OR:
164: penter(left(p));
165: penter(right(p));
166: parent(left(p)) = p;
167: parent(right(p)) = p;
168: break;
169: default:
170: error(FATAL, "unknown type %d in penter\n", type(p));
171: break;
172: }
173: }
174:
175: freetr(p) /* free parse tree */
176: Node *p;
177: {
178: switch (type(p)) {
179: LEAF
180: xfree(p);
181: break;
182: UNARY
183: freetr(left(p));
184: xfree(p);
185: break;
186: case CAT:
187: case OR:
188: freetr(left(p));
189: freetr(right(p));
190: xfree(p);
191: break;
192: default:
193: error(FATAL, "unknown type %d in freetr", type(p));
194: break;
195: }
196: }
197:
198: uchar *cclenter(p)
199: register uchar *p;
200: {
201: register int i, c;
202: uchar *op;
203:
204: op = p;
205: i = 0;
206: while ((c = *p++) != 0) {
207: if (c == '\\') {
208: if ((c = *p++) == 't')
209: c = '\t';
210: else if (c == 'n')
211: c = '\n';
212: else if (c == 'f')
213: c = '\f';
214: else if (c == 'r')
215: c = '\r';
216: else if (c == 'b')
217: c = '\b';
218: else if (c == '\\')
219: c = '\\';
220: else if (isdigit(c)) {
221: int n = c - '0';
222: if (isdigit(*p)) {
223: n = 8 * n + *p++ - '0';
224: if (isdigit(*p))
225: n = 8 * n + *p++ - '0';
226: }
227: c = n;
228: } /* else */
229: /* c = c; */
230: } else if (c == '-' && i > 0 && chars[i-1] != 0) {
231: if (*p != 0) {
232: c = chars[i-1];
233: while (c < *p) { /* fails if *p is \\ */
234: if (i >= MAXLIN-1)
235: overflo("character class too big");
236: chars[i++] = ++c;
237: }
238: p++;
239: continue;
240: }
241: }
242: if (i >= MAXLIN-1)
243: overflo("character class too big");
244: chars[i++] = c;
245: }
246: chars[i++] = '\0';
247: dprintf("cclenter: in = |%s|, out = |%s|\n", op, chars, NULL);
248: xfree(op);
249: return(tostring(chars));
250: }
251:
252: overflo(s)
253: uchar *s;
254: {
255: error(FATAL, "regular expression too big: %s", s);
256: }
257:
258: cfoll(f, v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
259: fa *f;
260: register Node *v;
261: {
262: register int i;
263: register int *p;
264:
265: switch(type(v)) {
266: LEAF
267: f->re[(int) left(v)].ltype = type(v);
268: f->re[(int) left(v)].lval = (int) right(v);
269: for (i=0; i<=f->accept; i++)
270: setvec[i] = 0;
271: setcnt = 0;
272: follow(v); /* computes setvec and setcnt */
273: if ((p = (int *) Calloc(1, (setcnt+1)*sizeof(int))) == NULL)
274: overflo("follow set overflow");
275: f->re[(int) left(v)].lfollow = p;
276: *p = setcnt;
277: for (i = f->accept; i >= 0; i--)
278: if (setvec[i] == 1) *++p = i;
279: break;
280: UNARY
281: cfoll(f,left(v));
282: break;
283: case CAT:
284: case OR:
285: cfoll(f,left(v));
286: cfoll(f,right(v));
287: break;
288: default:
289: error(FATAL, "unknown type %d in cfoll", type(v));
290: }
291: }
292:
293: first(p) /* collects initially active leaves of p into setvec */
294: register Node *p; /* returns 0 or 1 depending on whether p matches empty string */
295: {
296: register int b;
297:
298: switch(type(p)) {
299: LEAF
300: if (setvec[(int) left(p)] != 1) {
301: setvec[(int) left(p)] = 1;
302: setcnt++;
303: }
304: if (type(p) == CCL && (*(uchar *) right(p)) == '\0')
305: return(0); /* empty CCL */
306: else return(1);
307: case PLUS:
308: if (first(left(p)) == 0) return(0);
309: return(1);
310: case STAR:
311: case QUEST:
312: first(left(p));
313: return(0);
314: case CAT:
315: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
316: return(1);
317: case OR:
318: b = first(right(p));
319: if (first(left(p)) == 0 || b == 0) return(0);
320: return(1);
321: }
322: error(FATAL, "unknown type %d in first\n", type(p));
323: return(-1);
324: }
325:
326: follow(v)
327: Node *v; /* collects leaves that can follow v into setvec */
328: {
329: Node *p;
330:
331: if (type(v) == FINAL)
332: return;
333: p = parent(v);
334: switch (type(p)) {
335: case STAR:
336: case PLUS:
337: first(v);
338: follow(p);
339: return;
340:
341: case OR:
342: case QUEST:
343: follow(p);
344: return;
345:
346: case CAT:
347: if (v == left(p)) { /* v is left child of p */
348: if (first(right(p)) == 0) {
349: follow(p);
350: return;
351: }
352: }
353: else /* v is right child */
354: follow(p);
355: return;
356: }
357: }
358:
359: member(c, s) /* is c in s? */
360: register uchar c, *s;
361: {
362: while (*s)
363: if (c == *s++)
364: return(1);
365: return(0);
366: }
367:
368:
369: match(f, p)
370: register fa *f;
371: register uchar *p;
372: {
373: register int s, ns;
374:
375: s = f->reset?makeinit(f,0):f->initstat;
376: if (f->out[s])
377: return(1);
378: do {
379: if (ns=f->gototab[s][*p])
380: s=ns;
381: else
382: s=cgoto(f,s,*p);
383: if (f->out[s])
384: return(1);
385: } while (*p++ != 0);
386: return(0);
387: }
388:
389: pmatch(f, p)
390: register fa *f;
391: register uchar *p;
392: {
393: register s, ns;
394: register uchar *q;
395: int i, k;
396:
397: s = f->reset?makeinit(f,1):f->initstat;
398: patbeg = p;
399: patlen = -1;
400: do {
401: q = p;
402: do {
403: if (f->out[s]) /* final state */
404: patlen = q-p;
405: if (ns=f->gototab[s][*q])
406: s=ns;
407: else
408: s=cgoto(f,s,*q);
409: if (s==1) /* no transition */
410: if (patlen >= 0) {
411: patbeg = p;
412: return(1);
413: }
414: else
415: goto nextin; /* no match */
416: } while (*q++ != 0);
417: if (f->out[s])
418: patlen = q-p-1; /* don't count $ */
419: if (patlen >= 0) {
420: patbeg = p;
421: return(1);
422: }
423: nextin:
424: s = 2;
425: if (f->reset) {
426: for (i=2; i<=f->curstat; i++)
427: Free(f->posns[i]);
428: k = *f->posns[0];
429: if ((f->posns[2] = (int *) Calloc(1, (k+1)*sizeof(int))) == NULL)
430: overflo("out of space in pmatch");
431: for (i=0; i<=k; i++)
432: (f->posns[2])[i] = (f->posns[0])[i];
433: f->initstat = f->curstat = 2;
434: f->out[2] = f->out[0];
435: for (i=0; i<NCHARS; i++)
436: f->gototab[2][i] = 0;
437: }
438: } while (*p++ != 0);
439: return (0);
440: }
441:
442: nematch(f, p)
443: register fa *f;
444: register uchar *p;
445: {
446: register int s, ns;
447: register uchar *q;
448: int i, k;
449:
450: s = f->reset?makeinit(f,1):f->initstat;
451: patlen = -1;
452: while (*p) {
453: q = p;
454: do {
455: if (f->out[s]) /* final state */
456: patlen = q-p;
457: if (ns=f->gototab[s][*q])
458: s=ns;
459: else
460: s=cgoto(f,s,*q);
461: if (s==1) /* no transition */
462: if (patlen > 0) {
463: patbeg = p;
464: return(1);
465: }
466: else
467: goto nnextin; /* no nonempty match */
468: } while (*q++ != 0);
469: if (f->out[s])
470: patlen = q-p-1; /* don't count $ */
471: if (patlen > 0 ) {
472: patbeg = p;
473: return(1);
474: }
475: nnextin:
476: s = 2;
477: if (f->reset) {
478: for (i=2; i<=f->curstat; i++)
479: Free(f->posns[i]);
480: k = *f->posns[0];
481: if ((f->posns[2] = (int *) Calloc(1, (k+1)*sizeof(int))) == NULL)
482: overflo("out of state space");
483: for (i=0; i<=k; i++)
484: (f->posns[2])[i] = (f->posns[0])[i];
485: f->initstat = f->curstat = 2;
486: f->out[2] = f->out[0];
487: for (i=0; i<NCHARS; i++)
488: f->gototab[2][i] = 0;
489: }
490: p++;
491: }
492: return (0);
493: }
494:
495: Node *regexp(), *primary(), *concat(), *alt(), *unary();
496:
497: Node *reparse(p)
498: uchar *p;
499: {
500: /* parses regular expression pointed to by p */
501: /* uses relex() to scan regular expression */
502: Node *np;
503:
504: dprintf("reparse <%s>\n", p);
505: lastre = prestr = p; /* prestr points to string to be parsed */
506: rtok = relex();
507: if (rtok == '\0')
508: error(FATAL, "empty regular expression");
509: np = regexp();
510: if (rtok == '\0')
511: return(np);
512: else
513: error(FATAL, "syntax error in regular expression %s at %s", lastre, prestr);
514: }
515:
516: Node *regexp()
517: {
518: return (alt(concat(primary())));
519: }
520:
521: Node *primary()
522: {
523: Node *np;
524:
525: switch (rtok) {
526: case CHAR:
527: np = op2(CHAR, NIL, (Node *) rlxval);
528: rtok = relex();
529: return (unary(np));
530: case ALL:
531: rtok = relex();
532: return (unary(op2(ALL, NIL, NIL)));
533: case DOT:
534: rtok = relex();
535: return (unary(op2(DOT, NIL, NIL)));
536: case CCL:
537: np = op2(CCL, NIL, cclenter(rlxstr));
538: rtok = relex();
539: return (unary(np));
540: case NCCL:
541: np = op2(NCCL, NIL, cclenter(rlxstr));
542: rtok = relex();
543: return (unary(np));
544: case '^':
545: rtok = relex();
546: return (unary(op2(CHAR, NIL, (Node *) HAT)));
547: case '$':
548: rtok = relex();
549: return (unary(op2(CHAR, NIL, NIL)));
550: case '(':
551: rtok = relex();
552: if (rtok == ')') { /* special pleading for () */
553: rtok = relex();
554: return unary(op2(CCL, NIL, tostring("")));
555: }
556: np = regexp();
557: if (rtok == ')') {
558: rtok = relex();
559: return (unary(np));
560: }
561: else
562: error(FATAL, "syntax error in regular expression %s at %s", lastre, prestr);
563: default:
564: error(FATAL, "illegal primary in regular expression %s at %s", lastre, prestr);
565: }
566: }
567:
568: Node *concat(np)
569: Node *np;
570: {
571: switch (rtok) {
572: case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
573: return (concat(op2(CAT, np, primary())));
574: default:
575: return (np);
576: }
577: }
578:
579: Node *alt(np)
580: Node *np;
581: {
582: if (rtok == OR) {
583: rtok = relex();
584: return (alt(op2(OR, np, concat(primary()))));
585: }
586: return (np);
587: }
588:
589: Node *unary(np)
590: Node *np;
591: {
592: switch (rtok) {
593: case STAR:
594: rtok = relex();
595: return (unary(op2(STAR, np, NIL)));
596: case PLUS:
597: rtok = relex();
598: return (unary(op2(PLUS, np, NIL)));
599: case QUEST:
600: rtok = relex();
601: return (unary(op2(QUEST, np, NIL)));
602: default:
603: return (np);
604: }
605: }
606:
607: relex() /* lexical analyzer for reparse */
608: {
609: register int c;
610: uchar cbuf[150];
611: int clen, cflag;
612:
613: switch (c = *prestr++) {
614: case '|': return OR;
615: case '*': return STAR;
616: case '+': return PLUS;
617: case '?': return QUEST;
618: case '.': return DOT;
619: case '\0': prestr--; return '\0';
620: case '^':
621: case '$':
622: case '(':
623: case ')':
624: return c;
625: case '\\':
626: if ((c = *prestr++) == 't')
627: c = '\t';
628: else if (c == 'n')
629: c = '\n';
630: else if (c == 'f')
631: c = '\f';
632: else if (c == 'r')
633: c = '\r';
634: else if (c == 'b')
635: c = '\b';
636: else if (c == '\\')
637: c = '\\';
638: else if (isdigit(c)) {
639: int n = c - '0';
640: if (isdigit(*prestr)) {
641: n = 8 * n + *prestr++ - '0';
642: if (isdigit(*prestr))
643: n = 8 * n + *prestr++ - '0';
644: }
645: c = n;
646: } /* else it's now in c */
647: rlxval = c;
648: return CHAR;
649: default:
650: rlxval = c;
651: return CHAR;
652: case '[':
653: clen = 0;
654: if (*prestr == '^') {
655: cflag = 1;
656: prestr++;
657: }
658: else
659: cflag = 0;
660: for (;;) {
661: if ((c = *prestr++) == '\\') {
662: cbuf[clen++] = '\\';
663: if ((c = *prestr++) == '\0')
664: error(FATAL, "nonterminated character class %s", lastre);
665: cbuf[clen++] = c;
666: } else if (c == ']') {
667: cbuf[clen] = 0;
668: rlxstr = tostring(cbuf);
669: if (cflag == 0)
670: return CCL;
671: else
672: return NCCL;
673: } else if (c == '\n') {
674: error(FATAL, "newline in character class %s...", lastre);
675: } else if (c == '\0') {
676: error(FATAL, "nonterminated character class %s", lastre);
677: } else
678: cbuf[clen++] = c;
679: }
680: }
681: }
682:
683: int cgoto(f, s, c)
684: fa *f;
685: int s, c;
686: {
687: register int i, j, k;
688: register int *p, *q;
689:
690: for (i=0; i<=f->accept; i++)
691: setvec[i] = 0;
692: setcnt = 0;
693: /* compute positions of gototab[s,c] into setvec */
694: p = f->posns[s];
695: for (i=1; i<=*p; i++) {
696: if ((k = f->re[p[i]].ltype) != FINAL) {
697: if (k == CHAR && c == f->re[p[i]].lval
698: || k == DOT && c != 0 && c != HAT
699: || k == ALL && c != 0
700: || k == CCL && member(c, (uchar *) f->re[p[i]].lval)
701: || k == NCCL && !member(c, (uchar *) f->re[p[i]].lval) && c != 0 && c != HAT) {
702: q = f->re[p[i]].lfollow;
703: for (j=1; j<=*q; j++) {
704: if (setvec[q[j]] == 0) {
705: setcnt++;
706: setvec[q[j]] = 1;
707: }
708: }
709: }
710: }
711: }
712: /* determine if setvec is a previous state */
713: tmpset[0] = setcnt;
714: j = 1;
715: for (i = f->accept; i >= 0; i--)
716: if (setvec[i]) {
717: tmpset[j++] = i;
718: }
719: /* tmpset == previous state? */
720: for (i=1; i<= f->curstat; i++) {
721: p = f->posns[i];
722: if ((k = tmpset[0]) != p[0])
723: goto different;
724: for (j = 1; j <= k; j++)
725: if (tmpset[j] != p[j])
726: goto different;
727: /* setvec is state i */
728: f->gototab[s][c] = i;
729: return i;
730: different:;
731: }
732:
733: /* add tmpset to current set of states */
734: if (f->curstat >= NSTATES-1) {
735: f->curstat = 2;
736: f->reset = 1;
737: for (i=2; i<NSTATES; i++)
738: Free(f->posns[i]);
739: }
740: else
741: ++(f->curstat);
742: for (i=0; i<NCHARS; i++)
743: f->gototab[f->curstat][i] = 0;
744: if ((p = (int *) Calloc(1, (setcnt+1)*sizeof(int))) == NULL)
745: overflo("out of space in cgoto");
746:
747: f->posns[f->curstat] = p;
748: f->gototab[s][c] = f->curstat;
749: for (i = 0; i <= setcnt; i++)
750: p[i] = tmpset[i];
751: if (setvec[f->accept])
752: f->out[f->curstat] = 1;
753: else
754: f->out[f->curstat] = 0;
755: return f->curstat;
756: }
757:
758:
759: freefa(f)
760: struct fa *f;
761: {
762:
763: register int i;
764:
765: if (f == NULL)
766: return;
767: for (i=0; i<=f->curstat; i++)
768: Free(f->posns[i]);
769: for (i=0; i<=f->accept; i++)
770: Free(f->re[i].lfollow);
771: Free(f->restr);
772: Free(f);
773: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.