|
|
1.1 root 1: /*
2: Copyright (c) 1989 AT&T
3: All Rights Reserved
4:
5: THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF AT&T.
6:
7: The copyright notice above does not evidence any
8: actual or intended publication of such source code.
9: */
10:
11: #define DEBUG
12:
13: #include <ctype.h>
14: #include <stdio.h>
15: #include <string.h>
16: #include <stdlib.h>
17: #include "awk.h"
18: #include "y.tab.h"
19:
20: #define HAT (NCHARS-1) /* matches ^ in regular expr */
21: /* NCHARS is 2**n */
22: #define MAXLIN 512
23:
24: #define type(v) (v)->nobj
25: #define left(v) (v)->narg[0]
26: #define right(v) (v)->narg[1]
27: #define parent(v) (v)->nnext
28:
29: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
30: #define UNARY case STAR: case PLUS: case QUEST:
31:
32: /* encoding in tree Nodes:
33: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
34: left is index, right contains value or pointer to value
35: unary (STAR, PLUS, QUEST): left is child, right is null
36: binary (CAT, OR): left and right are children
37: parent contains pointer to parent
38: */
39:
40:
41: uchar chars[MAXLIN];
42: int setvec[MAXLIN];
43: int tmpset[MAXLIN];
44: Node *point[MAXLIN];
45:
46: int rtok; /* next token in current re */
47: int rlxval;
48: uchar *rlxstr;
49: uchar *prestr; /* current position in current re */
50: uchar *lastre; /* origin of last re */
51:
52: static int setcnt;
53: static int poscnt;
54:
55: uchar *patbeg;
56: int patlen;
57:
58: #define NFA 20 /* cache this many dynamic fa's */
59: fa *fatab[NFA];
60: int nfatab = 0; /* entries in fatab */
61:
62: fa *makedfa(uchar *s, int anchor) /* returns dfa for reg expr s */
63: {
64: int i, use, nuse;
65: fa *pfa;
66:
67: if (compile_time) /* a constant for sure */
68: return mkdfa(s, anchor);
69: for (i = 0; i < nfatab; i++) /* is it there already? */
70: if (fatab[i]->anchor == anchor && strcmp(fatab[i]->restr,s) == 0) {
71: fatab[i]->use++;
72: return fatab[i];
73: }
74: pfa = mkdfa(s, anchor);
75: if (nfatab < NFA) { /* room for another */
76: fatab[nfatab] = pfa;
77: fatab[nfatab]->use = 1;
78: nfatab++;
79: return pfa;
80: }
81: use = fatab[0]->use; /* replace least-recently used */
82: nuse = 0;
83: for (i = 1; i < nfatab; i++)
84: if (fatab[i]->use < use) {
85: use = fatab[i]->use;
86: nuse = i;
87: }
88: freefa(fatab[nuse]);
89: fatab[nuse] = pfa;
90: pfa->use = 1;
91: return pfa;
92: }
93:
94: fa *mkdfa(uchar *s, int anchor) /* does the real work of making a dfa */
95: /* anchor = 1 for anchored matches, else 0 */
96: {
97: Node *p, *p1;
98: fa *f;
99:
100: p = reparse(s);
101: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
102: /* put ALL STAR in front of reg. exp. */
103: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
104: /* put FINAL after reg. exp. */
105:
106: poscnt = 0;
107: penter(p1); /* enter parent pointers and leaf indices */
108: if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
109: overflo("out of space for fa");
110: f->accept = poscnt-1; /* penter has computed number of positions in re */
111: cfoll(f, p1); /* set up follow sets */
112: freetr(p1);
113: if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
114: overflo("out of space in makedfa");
115: if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
116: overflo("out of space in makedfa");
117: *f->posns[1] = 0;
118: f->initstat = makeinit(f, anchor);
119: f->anchor = anchor;
120: f->restr = tostring(s);
121: return f;
122: }
123:
124: int makeinit(fa *f, int anchor)
125: {
126: register int i, k;
127:
128: f->curstat = 2;
129: f->out[2] = 0;
130: f->reset = 0;
131: k = *(f->re[0].lfollow);
132: xfree(f->posns[2]);
133: if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
134: overflo("out of space in makeinit");
135: for (i=0; i<=k; i++) {
136: (f->posns[2])[i] = (f->re[0].lfollow)[i];
137: }
138: if ((f->posns[2])[1] == f->accept)
139: f->out[2] = 1;
140: for (i=0; i<NCHARS; i++)
141: f->gototab[2][i] = 0;
142: f->curstat = cgoto(f, 2, HAT);
143: if (anchor) {
144: *f->posns[2] = k-1; /* leave out position 0 */
145: for (i=0; i<k; i++) {
146: (f->posns[0])[i] = (f->posns[2])[i];
147: }
148:
149: f->out[0] = f->out[2];
150: if (f->curstat != 2)
151: --(*f->posns[f->curstat]);
152: }
153: return f->curstat;
154: }
155:
156: void penter(Node *p) /* set up parent pointers and leaf indices */
157: {
158: switch(type(p)) {
159: LEAF
160: left(p) = (Node *) poscnt;
161: if (poscnt >= MAXLIN)
162: overflo("leaf index overflow in penter");
163: point[poscnt++] = p;
164: break;
165: UNARY
166: penter(left(p));
167: parent(left(p)) = p;
168: break;
169: case CAT:
170: case OR:
171: penter(left(p));
172: penter(right(p));
173: parent(left(p)) = p;
174: parent(right(p)) = p;
175: break;
176: default: /* can't happen */
177: ERROR "unknown type %d in penter", type(p) FATAL;
178: break;
179: }
180: }
181:
182: void freetr(Node *p) /* free parse tree */
183: {
184: switch (type(p)) {
185: LEAF
186: xfree(p);
187: break;
188: UNARY
189: freetr(left(p));
190: xfree(p);
191: break;
192: case CAT:
193: case OR:
194: freetr(left(p));
195: freetr(right(p));
196: xfree(p);
197: break;
198: default: /* can't happen */
199: ERROR "unknown type %d in freetr", type(p) FATAL;
200: break;
201: }
202: }
203:
204: /* in the parsing of regular expressions, metacharacters like . have */
205: /* to be seen literally; \056 is not a metacharacter. */
206:
207: hexstr(char **pp) /* find and eval hex string at pp, return new p */
208: {
209: char *p;
210: int n = 0;
211:
212: for (p = *pp; isxdigit(*p); p++) {
213: if (isdigit(*p))
214: n = 16 * n + *p - '0';
215: else if (*p >= 'a' && *p <= 'f')
216: n = 16 * n + *p - 'a' + 10;
217: else if (*p >= 'A' && *p <= 'F')
218: n = 16 * n + *p - 'A' + 10;
219: }
220: *pp = p;
221: return n;
222: }
223:
224: #define isoctdigit(c) ((c) >= '0' && (c) <= '8') /* multiple use of arg */
225:
226: quoted(char **pp) /* pick up next thing after a \\ */
227: /* and increment *pp */
228: {
229: char *p = *pp;
230: int c;
231:
232: if ((c = *p++) == 't')
233: c = '\t';
234: else if (c == 'n')
235: c = '\n';
236: else if (c == 'f')
237: c = '\f';
238: else if (c == 'r')
239: c = '\r';
240: else if (c == 'b')
241: c = '\b';
242: else if (c == '\\')
243: c = '\\';
244: else if (c == 'x') { /* hexadecimal goo follows */
245: c = hexstr(&p);
246: } else if (isoctdigit(c)) { /* \d \dd \ddd */
247: int n = c - '0';
248: if (isoctdigit(*p)) {
249: n = 8 * n + *p++ - '0';
250: if (isoctdigit(*p))
251: n = 8 * n + *p++ - '0';
252: }
253: c = n;
254: } /* else */
255: /* c = c; */
256: *pp = p;
257: return c;
258: }
259:
260: uchar *cclenter(uchar *p)
261: {
262: register int i, c, c2;
263: uchar *op;
264:
265: op = p;
266: i = 0;
267: while ((c = *p++) != 0) {
268: if (c == '\\') {
269: c = quoted(&p);
270: } else if (c == '-' && i > 0 && chars[i-1] != 0) {
271: if (*p != 0) {
272: c = chars[i-1];
273: c2 = *p++;
274: if (c2 == '\\')
275: c2 = quoted(&p);
276: while (c < c2) {
277: if (i >= MAXLIN-1)
278: overflo("character class too big");
279: chars[i++] = ++c;
280: }
281: continue;
282: }
283: }
284: if (i >= MAXLIN-1)
285: overflo("character class too big");
286: chars[i++] = c;
287: }
288: chars[i++] = '\0';
289: dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, chars) );
290: xfree(op);
291: return(tostring(chars));
292: }
293:
294: void overflo(uchar *s)
295: {
296: ERROR "regular expression too big: %s", s FATAL;
297: }
298:
299: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
300: {
301: register int i;
302: register int *p;
303:
304: switch(type(v)) {
305: LEAF
306: f->re[(int) left(v)].ltype = type(v);
307: f->re[(int) left(v)].lval = (long) right(v); /* assumes ptr & long fit */
308: for (i=0; i<=f->accept; i++)
309: setvec[i] = 0;
310: setcnt = 0;
311: follow(v); /* computes setvec and setcnt */
312: if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
313: overflo("out of space building follow set");
314: f->re[(int) left(v)].lfollow = p;
315: *p = setcnt;
316: for (i = f->accept; i >= 0; i--)
317: if (setvec[i] == 1) *++p = i;
318: break;
319: UNARY
320: cfoll(f,left(v));
321: break;
322: case CAT:
323: case OR:
324: cfoll(f,left(v));
325: cfoll(f,right(v));
326: break;
327: default: /* can't happen */
328: ERROR "unknown type %d in cfoll", type(v) FATAL;
329: }
330: }
331:
332: first(Node *p) /* collects initially active leaves of p into setvec */
333: /* returns 0 or 1 depending on whether p matches empty string */
334: {
335: register int b;
336:
337: switch(type(p)) {
338: LEAF
339: if (setvec[(int) left(p)] != 1) {
340: setvec[(int) left(p)] = 1;
341: setcnt++;
342: }
343: if (type(p) == CCL && (*(uchar *) right(p)) == '\0')
344: return(0); /* empty CCL */
345: else return(1);
346: case PLUS:
347: if (first(left(p)) == 0) return(0);
348: return(1);
349: case STAR:
350: case QUEST:
351: first(left(p));
352: return(0);
353: case CAT:
354: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
355: return(1);
356: case OR:
357: b = first(right(p));
358: if (first(left(p)) == 0 || b == 0) return(0);
359: return(1);
360: }
361: ERROR "unknown type %d in first", type(p) FATAL; /* can't happen */
362: return(-1);
363: }
364:
365: void follow(Node *v) /* collects leaves that can follow v into setvec */
366: {
367: Node *p;
368:
369: if (type(v) == FINAL)
370: return;
371: p = parent(v);
372: switch (type(p)) {
373: case STAR:
374: case PLUS:
375: first(v);
376: follow(p);
377: return;
378:
379: case OR:
380: case QUEST:
381: follow(p);
382: return;
383:
384: case CAT:
385: if (v == left(p)) { /* v is left child of p */
386: if (first(right(p)) == 0) {
387: follow(p);
388: return;
389: }
390: }
391: else /* v is right child */
392: follow(p);
393: return;
394: }
395: }
396:
397: member(int c, uchar *s) /* is c in s? */
398: {
399: while (*s)
400: if (c == *s++)
401: return(1);
402: return(0);
403: }
404:
405:
406: match(fa *f, uchar *p)
407: {
408: register int s, ns;
409:
410: s = f->reset?makeinit(f,0):f->initstat;
411: if (f->out[s])
412: return(1);
413: do {
414: if (ns=f->gototab[s][*p])
415: s=ns;
416: else
417: s=cgoto(f,s,*p);
418: if (f->out[s])
419: return(1);
420: } while (*p++ != 0);
421: return(0);
422: }
423:
424: pmatch(fa *f, uchar *p)
425: {
426: register int s, ns;
427: register uchar *q;
428: int i, k;
429:
430: s = f->reset?makeinit(f,1):f->initstat;
431: patbeg = p;
432: patlen = -1;
433: do {
434: q = p;
435: do {
436: if (f->out[s]) /* final state */
437: patlen = q-p;
438: if (ns=f->gototab[s][*q])
439: s=ns;
440: else
441: s=cgoto(f,s,*q);
442: if (s==1) /* no transition */
443: if (patlen >= 0) {
444: patbeg = p;
445: return(1);
446: }
447: else
448: goto nextin; /* no match */
449: } while (*q++ != 0);
450: if (f->out[s])
451: patlen = q-p-1; /* don't count $ */
452: if (patlen >= 0) {
453: patbeg = p;
454: return(1);
455: }
456: nextin:
457: s = 2;
458: if (f->reset) {
459: for (i=2; i<=f->curstat; i++)
460: xfree(f->posns[i]);
461: k = *f->posns[0];
462: if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
463: overflo("out of space in pmatch");
464: for (i=0; i<=k; i++)
465: (f->posns[2])[i] = (f->posns[0])[i];
466: f->initstat = f->curstat = 2;
467: f->out[2] = f->out[0];
468: for (i=0; i<NCHARS; i++)
469: f->gototab[2][i] = 0;
470: }
471: } while (*p++ != 0);
472: return (0);
473: }
474:
475: nematch(fa *f, uchar *p)
476: {
477: register int s, ns;
478: register uchar *q;
479: int i, k;
480:
481: s = f->reset?makeinit(f,1):f->initstat;
482: patlen = -1;
483: while (*p) {
484: q = p;
485: do {
486: if (f->out[s]) /* final state */
487: patlen = q-p;
488: if (ns=f->gototab[s][*q])
489: s=ns;
490: else
491: s=cgoto(f,s,*q);
492: if (s==1) /* no transition */
493: if (patlen > 0) {
494: patbeg = p;
495: return(1);
496: }
497: else
498: goto nnextin; /* no nonempty match */
499: } while (*q++ != 0);
500: if (f->out[s])
501: patlen = q-p-1; /* don't count $ */
502: if (patlen > 0 ) {
503: patbeg = p;
504: return(1);
505: }
506: nnextin:
507: s = 2;
508: if (f->reset) {
509: for (i=2; i<=f->curstat; i++)
510: xfree(f->posns[i]);
511: k = *f->posns[0];
512: if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
513: overflo("out of state space");
514: for (i=0; i<=k; i++)
515: (f->posns[2])[i] = (f->posns[0])[i];
516: f->initstat = f->curstat = 2;
517: f->out[2] = f->out[0];
518: for (i=0; i<NCHARS; i++)
519: f->gototab[2][i] = 0;
520: }
521: p++;
522: }
523: return (0);
524: }
525:
526: Node *reparse(uchar *p)
527: {
528: /* parses regular expression pointed to by p */
529: /* uses relex() to scan regular expression */
530: Node *np;
531:
532: dprintf( ("reparse <%s>\n", p) );
533: lastre = prestr = p; /* prestr points to string to be parsed */
534: rtok = relex();
535: if (rtok == '\0')
536: ERROR "empty regular expression" FATAL;
537: np = regexp();
538: if (rtok != '\0')
539: ERROR "syntax error in regular expression %s at %s", lastre, prestr FATAL;
540: return(np);
541: }
542:
543: Node *regexp(void)
544: {
545: return (alt(concat(primary())));
546: }
547:
548: Node *primary(void)
549: {
550: Node *np;
551:
552: switch (rtok) {
553: case CHAR:
554: np = op2(CHAR, NIL, (Node *) rlxval);
555: rtok = relex();
556: return (unary(np));
557: case ALL:
558: rtok = relex();
559: return (unary(op2(ALL, NIL, NIL)));
560: case DOT:
561: rtok = relex();
562: return (unary(op2(DOT, NIL, NIL)));
563: case CCL:
564: np = op2(CCL, NIL, (Node*) cclenter(rlxstr));
565: rtok = relex();
566: return (unary(np));
567: case NCCL:
568: np = op2(NCCL, NIL, (Node *) cclenter(rlxstr));
569: rtok = relex();
570: return (unary(np));
571: case '^':
572: rtok = relex();
573: return (unary(op2(CHAR, NIL, (Node *) HAT)));
574: case '$':
575: rtok = relex();
576: return (unary(op2(CHAR, NIL, NIL)));
577: case '(':
578: rtok = relex();
579: if (rtok == ')') { /* special pleading for () */
580: rtok = relex();
581: return unary(op2(CCL, NIL, (Node *) tostring("")));
582: }
583: np = regexp();
584: if (rtok == ')') {
585: rtok = relex();
586: return (unary(np));
587: }
588: else
589: ERROR "syntax error in regular expression %s at %s", lastre, prestr FATAL;
590: default:
591: ERROR "illegal primary in regular expression %s at %s", lastre, prestr FATAL;
592: }
593: return 0; /*NOTREACHED*/
594: }
595:
596: Node *concat(Node *np)
597: {
598: switch (rtok) {
599: case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
600: return (concat(op2(CAT, np, primary())));
601: }
602: return (np);
603: }
604:
605: Node *alt(Node *np)
606: {
607: if (rtok == OR) {
608: rtok = relex();
609: return (alt(op2(OR, np, concat(primary()))));
610: }
611: return (np);
612: }
613:
614: Node *unary(Node *np)
615: {
616: switch (rtok) {
617: case STAR:
618: rtok = relex();
619: return (unary(op2(STAR, np, NIL)));
620: case PLUS:
621: rtok = relex();
622: return (unary(op2(PLUS, np, NIL)));
623: case QUEST:
624: rtok = relex();
625: return (unary(op2(QUEST, np, NIL)));
626: default:
627: return (np);
628: }
629: }
630:
631: relex(void) /* lexical analyzer for reparse */
632: {
633: register int c;
634: uchar cbuf[MAXLIN];
635: int clen, cflag;
636:
637: switch (c = *prestr++) {
638: case '|': return OR;
639: case '*': return STAR;
640: case '+': return PLUS;
641: case '?': return QUEST;
642: case '.': return DOT;
643: case '\0': prestr--; return '\0';
644: case '^':
645: case '$':
646: case '(':
647: case ')':
648: return c;
649: case '\\':
650: rlxval = quoted(&prestr);
651: return CHAR;
652: default:
653: rlxval = c;
654: return CHAR;
655: case '[':
656: clen = 0;
657: if (*prestr == '^') {
658: cflag = 1;
659: prestr++;
660: }
661: else
662: cflag = 0;
663: for ( ; clen < MAXLIN-1; ) {
664: if ((c = *prestr++) == '\\') {
665: cbuf[clen++] = '\\';
666: if ((c = *prestr++) == '\0')
667: ERROR "nonterminated character class %.20s...", lastre FATAL;
668: cbuf[clen++] = c;
669: } else if (c == ']') {
670: cbuf[clen] = 0;
671: rlxstr = tostring(cbuf);
672: if (cflag == 0)
673: return CCL;
674: else
675: return NCCL;
676: } else if (c == '\n') {
677: ERROR "newline in character class %.20s...", lastre FATAL;
678: } else if (c == '\0') {
679: ERROR "nonterminated character class %.20s", lastre FATAL;
680: } else
681: cbuf[clen++] = c;
682: }
683: if (clen >= MAXLIN-1)
684: ERROR "character class %.20s... too long", cbuf FATAL;
685: }
686: /* can't happen */
687: return 0;
688: }
689:
690: int cgoto(fa *f, int s, int c)
691: {
692: register int i, j, k;
693: register int *p, *q;
694:
695: for (i=0; i<=f->accept; i++)
696: setvec[i] = 0;
697: setcnt = 0;
698: /* compute positions of gototab[s,c] into setvec */
699: p = f->posns[s];
700: for (i=1; i<=*p; i++) {
701: if ((k = f->re[p[i]].ltype) != FINAL) {
702: if (k == CHAR && c == f->re[p[i]].lval
703: || k == DOT && c != 0 && c != HAT
704: || k == ALL && c != 0
705: || k == CCL && member(c, (uchar *) f->re[p[i]].lval)
706: || k == NCCL && !member(c, (uchar *) f->re[p[i]].lval) && c != 0 && c != HAT) {
707: q = f->re[p[i]].lfollow;
708: for (j=1; j<=*q; j++) {
709: if (setvec[q[j]] == 0) {
710: setcnt++;
711: setvec[q[j]] = 1;
712: }
713: }
714: }
715: }
716: }
717: /* determine if setvec is a previous state */
718: tmpset[0] = setcnt;
719: j = 1;
720: for (i = f->accept; i >= 0; i--)
721: if (setvec[i]) {
722: tmpset[j++] = i;
723: }
724: /* tmpset == previous state? */
725: for (i=1; i<= f->curstat; i++) {
726: p = f->posns[i];
727: if ((k = tmpset[0]) != p[0])
728: goto different;
729: for (j = 1; j <= k; j++)
730: if (tmpset[j] != p[j])
731: goto different;
732: /* setvec is state i */
733: f->gototab[s][c] = i;
734: return i;
735: different:;
736: }
737:
738: /* add tmpset to current set of states */
739: if (f->curstat >= NSTATES-1) {
740: f->curstat = 2;
741: f->reset = 1;
742: for (i=2; i<NSTATES; i++)
743: xfree(f->posns[i]);
744: }
745: else
746: ++(f->curstat);
747: for (i=0; i<NCHARS; i++)
748: f->gototab[f->curstat][i] = 0;
749: xfree(f->posns[f->curstat]);
750: if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
751: overflo("out of space in cgoto");
752:
753: f->posns[f->curstat] = p;
754: f->gototab[s][c] = f->curstat;
755: for (i = 0; i <= setcnt; i++)
756: p[i] = tmpset[i];
757: if (setvec[f->accept])
758: f->out[f->curstat] = 1;
759: else
760: f->out[f->curstat] = 0;
761: return f->curstat;
762: }
763:
764:
765: void freefa(fa *f)
766: {
767:
768: register int i;
769:
770: if (f == NULL)
771: return;
772: for (i=0; i<=f->curstat; i++)
773: xfree(f->posns[i]);
774: for (i=0; i<=f->accept; i++) {
775: xfree(f->re[i].lfollow);
776: if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
777: xfree(f->re[i].lval);
778: }
779: xfree(f->restr);
780: xfree(f);
781: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.