|
|
1.1 root 1: #include "re.h"
2: #include "lre.h"
3: #include "hdr.h"
4:
5: static Exprtype toktype;
6: static int toklit;
7: static char *beg, *end;
8: static int maxid;
9: static Expr *e0(void);
10: static Expr *d0(void);
11: static Expr *r18(void);
12: static void err(char*);
13: static int parno;
14: static jmp_buf gohome;
15: static unsigned char *mymap;
16:
17: void
18: egpost(re_re *r)
19: {
20: r->maxid = maxid;
21: r->backref = r->root->backref;
22: r->parens = r->root->parens;
23: }
24:
25: Expr *
26: eg_newexpr(Exprtype t, int l, Expr *left, Expr *right)
27: {
28: register Expr *e = (Expr *)egmalloc(sizeof(Expr), "eg_newexpr");
29:
30: if (!e)
31: return 0;
32: e->type = t;
33: e->parent = 0;
34: e->lit = l;
35: if(e->lit)
36: e->id = maxid++;
37: else
38: e->id = 0;
39: e->backref = 0;
40: e->parens = 0;
41: if(e->l = left){
42: left->parent = e;
43: if(left->backref) e->backref = 1;
44: if(left->parens) e->parens = 1;
45: }
46: if(e->r = right){
47: right->parent = e;
48: if(right->backref) e->backref = 1;
49: if(right->parens) e->parens = 1;
50: }
51: e->follow = 0;
52: e->flen = 0;
53: e->reallit = 0;
54: return(e);
55: }
56:
57: void
58: eg_lexinit(char *b, char *e)
59: {
60: beg = b;
61: end = e;
62: maxid = 1;
63: parno = 1;
64: }
65:
66: void
67: eg_lex(void)
68: {
69: if(beg == end){
70: toktype = EOP;
71: toklit = -1;
72: } else switch(toklit = *beg++)
73: {
74: case '.': toktype = Dot; break;
75: case '*': toktype = Star; break;
76: case '+': toktype = Plus; break;
77: case '?': toktype = Quest; break;
78: case '^': toktype = Carat; break;
79: case '$': toktype = Dollar; break;
80: case '[': toktype = Charclass; break;
81: case '\n':
82: case '|': toktype = Alternate; break;
83: case '(': toktype = Lpar; break;
84: case ')': toktype = Rpar; break;
85: case '\\': toktype = Backslash;
86: if(beg == end)
87: err("bad \\");
88: else
89: toklit = *beg++;
90: break;
91: default: toktype = Literal; break;
92: }
93: }
94:
95: void
96: eg_epr(register Expr *e, char *res, int doset)
97: {
98: char r1[EPRINTSIZE], r2[EPRINTSIZE], rid[EPRINTSIZE];
99: int ids = 0; /* sort of a debugging flag */
100:
101: if(e == 0){
102: SPR res, "!0!");
103: return;
104: }
105: r1[0] = 0;
106: if(ids)
107: SPR rid, "%d:", e->id);
108: else
109: rid[0] = 0;
110: switch(e->type)
111: {
112: case Literal:
113: if(doset)
114: eg_spr(e->flen, e->follow, r1);
115: SPR res, "%s'%c'%s", rid, e->lit, r1);
116: break;
117: case Dot:
118: case Carat:
119: case Dollar:
120: if(doset)
121: eg_spr(e->flen, e->follow, r1);
122: SPR res, "%s%c%s", rid, e->lit, r1);
123: break;
124: case Compcharclass:
125: case Charclass:
126: *res++ = '[';
127: if(e->type == Compcharclass)
128: *res++ = '^';
129: memmove(res, (char *)e->r, (int)e->l);
130: res += (int)e->l;
131: *res++ = ']';
132: *res = 0;
133: break;
134: case Cat:
135: eg_epr(e->l, r1, doset);
136: eg_epr(e->r, r2, doset);
137: SPR res, "%s%s%s", rid, r1, r2);
138: break;
139: case Alternate:
140: eg_epr(e->l, r1, doset);
141: eg_epr(e->r, r2, doset);
142: SPR res, "%s(%s|%s)", rid, r1, r2);
143: break;
144: case Star:
145: eg_epr(e->l, r1, doset);
146: SPR res, "%s(%s)*", rid, r1);
147: break;
148: case Plus:
149: eg_epr(e->l, r1, doset);
150: SPR res, "%s(%s)+", rid, r1);
151: break;
152: case Quest:
153: eg_epr(e->l, r1, doset);
154: SPR res, "%s(%s)?", rid, r1);
155: break;
156: case Group:
157: eg_epr(e->l, r1, doset);
158: SPR res, "%sG<%s>", rid, r1);
159: break;
160: case EOP:
161: eg_epr(e->l, r1, doset);
162: SPR res, "%s%s<EOP>", rid, r1);
163: break;
164: case Backref:
165: SPR res, "%s\\%d", rid, e->lit);
166: break;
167: default:
168: SPR res, "<undef type %d>", e->type);
169: err(res);
170: break;
171: }
172: }
173:
174: static void
175: ccl(int *count, char **str, int oldrange)
176: {
177: register n;
178: int cnt;
179: char tab[256], *s;
180: int range, lastc, i;
181:
182: #define TSET(b) {if(tab[n = mymap[(b)]] == 0){ tab[n] = 1; cnt++; } }
183:
184: cnt = 0;
185: memset(tab, 0, sizeof tab);
186: lastc = -1;
187: range = 0;
188: /* scan for chars */
189: for(; (beg < end); beg++){
190: toklit = *beg;
191: if(*beg == ']'){
192: if(!oldrange)
193: break;
194: if(lastc >= 0)
195: break;
196: }
197: if(*beg == '-'){
198: if(lastc < 0){
199: TSET('-')
200: lastc = *(unsigned char *)beg;
201: } else
202: range = 1;
203: continue;
204: }
205: if(*beg == '\\'){
206: if(++beg >= end){
207: err("unexpected eop after \\ in char class");
208: beg--;
209: }
210: }
211: if(range){
212: for(i = *(unsigned char *)beg; i >= lastc; i--)
213: TSET(i)
214: range = 0;
215: } else
216: TSET(*(unsigned char *)beg)
217: lastc = *(unsigned char *)beg;
218: }
219: if(range){
220: if(oldrange)
221: TSET('-')
222: else
223: err("unterminated range in []");
224: }
225: if(beg < end)
226: beg++;
227: else
228: err("eop during ccl");
229: if(cnt == 0)
230: err("empty charclass");
231: eg_lex();
232: *count = cnt;
233: *str = s = (char *)egmalloc(cnt, "charclass defn");
234: if (!s)
235: return;
236: for(n = 0; n < 256; n++)
237: if(tab[n])
238: *s++ = n;
239: }
240:
241: /*
242: gre patterns:
243:
244: e0: e1 { '|' e1 }*
245: e1: e2 { e2 }*
246: e2: e3 { '*' | '?' | '+' | \{ m,n \} }*
247: e3: lit | '.' | '^' | '$' | '(' e0 ')'
248: */
249:
250: static Expr *
251: e3(void)
252: {
253: Expr *e;
254: Exprtype t;
255: int cnt;
256: char *s;
257:
258: switch(toktype)
259: {
260: case Backslash:
261: if((toklit >= '1') && (toklit <= '9')){
262: e = eg_newexpr(Backref, toklit-'0', (Expr *)0, (Expr *)0);
263: e->backref = 1;
264: } else
265: e = eg_newexpr(Literal, toklit, (Expr *)0, (Expr *)0);
266: eg_lex();
267: break;
268: case Literal:
269: e = eg_newexpr(Literal, toklit, (Expr *)0, (Expr *)0);
270: eg_lex();
271: break;
272: case Dot:
273: e = eg_newexpr(Dot, '.', (Expr *)0, (Expr *)0);
274: eg_lex();
275: break;
276: case Carat:
277: e = eg_newexpr(Carat, '^', (Expr *)0, (Expr *)0);
278: eg_lex();
279: break;
280: case Dollar:
281: e = eg_newexpr(Dollar, '$', (Expr *)0, (Expr *)0);
282: eg_lex();
283: break;
284: case Charclass:
285: t = toktype;
286: if(*beg == '^'){
287: t = Compcharclass;
288: beg++;
289: }
290: ccl(&cnt, &s, 0);
291: e = eg_newexpr(t, '[', (Expr *)0, (Expr *)0);
292: e->l = (Expr *)cnt; /* num of chars */
293: e->r = (Expr *)s; /* chars */
294: break;
295: case Lpar:
296: eg_lex();
297: cnt = parno++;
298: e = e0();
299: if(toktype == Rpar)
300: eg_lex();
301: else
302: err("expected a ')'");
303: e = eg_newexpr(Group, cnt, e, (Expr *)0);
304: e->parens = 1;
305: return(e);
306: case EOP:
307: default:
308: err("expected a lit or '('");
309: e = 0;
310: }
311: return(e);
312: }
313:
314: static int
315: integer(void)
316: {
317: int n;
318:
319: n = 0;
320: while((toktype == Literal) && (toklit >= '0') && (toklit <= '9')){
321: n = 10*n + toklit-'0';
322: eg_lex();
323: }
324: return(n);
325: }
326:
327: static Expr *
328: ecopy(Expr *e)
329: {
330: Expr *ee;
331: char res[256];
332:
333: if(e == 0)
334: return(e);
335: switch(e->type)
336: {
337: case Literal:
338: case Dot:
339: case Carat:
340: case Dollar:
341: case Backref:
342: return(eg_newexpr(e->type, e->lit, (Expr *)0, (Expr *)0));
343: case Compcharclass:
344: case Charclass:
345: ee = eg_newexpr(e->type, e->lit, (Expr *)0, (Expr *)0);
346: ee->r = (Expr *)egmalloc((int)e->l, "expr copy");
347: if (!ee->r)
348: return 0;
349: ee->l = e->l;
350: memmove((char *)ee->r, (char *)e->r, (int)e->l);
351: return(ee);
352: case Cat:
353: case Alternate:
354: return(eg_newexpr(e->type, e->lit, ecopy(e->l), ecopy(e->r)));
355: case Star:
356: case Plus:
357: case Quest:
358: case Group:
359: case EOP:
360: return(eg_newexpr(e->type, e->lit, ecopy(e->l), (Expr *)0));
361: default:
362: SPR res, "<undef type %d>", e->type);
363: err(res);
364: return((Expr *)0);
365: }
366: }
367:
368: static Expr *
369: edup(Expr *expr, int n, int opt)
370: {
371: if(n == 1){
372: expr = ecopy(expr);
373: if(opt)
374: expr = eg_newexpr(Quest, 0, expr, (Expr *)0);
375: return(expr);
376: }
377: return(eg_newexpr(Cat, 0, edup(expr, n-n/2, opt), edup(expr, n/2, opt)));
378: }
379:
380: static Expr *
381: range(Expr *expr)
382: {
383: int beg, end;
384: Expr *e, *e1;
385:
386: if((toktype == Literal) && (toklit >= '0') && (toklit <= '9'))
387: beg = integer();
388: else
389: err("expected a number in range");
390: if((toktype == Literal) && (toklit == ',')){
391: end = -1;
392: eg_lex();
393: } else
394: end = -2;
395: if((toktype == Literal) && (toklit >= '0') && (toklit <= '9'))
396: end = integer();
397: if((toktype == Backslash) && (toklit == '}'))
398: eg_lex();
399: else
400: err("expected \\} in range");
401: e1 = edup(expr, beg, 0);
402: if(end == -2)
403: e = e1;
404: else if(end == -1)
405: e = eg_newexpr(Cat, 0, e1, eg_newexpr(Star, 0, expr, (Expr *)0));
406: else {
407: if(end < beg)
408: err("bad range specification");
409: e = (end > beg)? eg_newexpr(Cat, 0, e1, edup(expr, end-beg, 1)) : e1;
410: }
411: return(e);
412: }
413:
414: static Expr *
415: e2(void)
416: {
417: Expr *e;
418: Exprtype t;
419:
420: e = e3();
421: while((toktype == Star) || (toktype == Plus) || (toktype == Quest)
422: || ((toktype == Backslash) && (toklit == '{'))){
423: if((toktype == Backslash) && (toklit == '{')){
424: eg_lex();
425: e = range(e);
426: } else {
427: t = toktype;
428: eg_lex();
429: e = eg_newexpr(t, 0, e, (Expr *)0);
430: }
431: }
432: return(e);
433: }
434:
435: static Expr *
436: e1(void)
437: {
438: Expr *e, *f;
439:
440: e = e2();
441: while((toktype == Literal) || (toktype == Dot) || (toktype == Lpar)
442: || (toktype == Backslash) || (toktype == Dollar)
443: || (toktype == Carat) || (toktype == Charclass)){
444: f = e2();
445: e = eg_newexpr(Cat, 0, e, f);
446: }
447: return(e);
448: }
449:
450: static Expr *
451: e0(void)
452: {
453: Expr *e, *f;
454:
455: e = e1();
456: while(toktype == Alternate){
457: eg_lex();
458: if(toktype == EOP)
459: continue;
460: f = e1();
461: e = eg_newexpr(Alternate, 0, e, f);
462: }
463: return(e);
464: }
465:
466: /*
467: egrep patterns:
468:
469: d0: d1 { '|' d1 }*
470: d1: d2 { d2 }*
471: d2: d3 { '*' | '?' | '+' }
472: d3: lit | '.' | '^' | '$' | '(' d0 ')'
473: */
474:
475: static Expr *
476: d3(void)
477: {
478: Expr *e;
479: Exprtype t;
480: int cnt;
481: char *s;
482:
483: switch(toktype)
484: {
485: case Backslash:
486: case Literal:
487: e = eg_newexpr(Literal, toklit, (Expr *)0, (Expr *)0);
488: eg_lex();
489: break;
490: case Dot:
491: e = eg_newexpr(Dot, '.', (Expr *)0, (Expr *)0);
492: eg_lex();
493: break;
494: case Carat:
495: e = eg_newexpr(Carat, '^', (Expr *)0, (Expr *)0);
496: eg_lex();
497: break;
498: case Dollar:
499: e = eg_newexpr(Dollar, '$', (Expr *)0, (Expr *)0);
500: eg_lex();
501: break;
502: case Charclass:
503: t = toktype;
504: if(*beg == '^'){
505: t = Compcharclass;
506: beg++;
507: }
508: ccl(&cnt, &s, 1);
509: e = eg_newexpr(t, '[', (Expr *)0, (Expr *)0);
510: e->l = (Expr *)cnt; /* num of chars */
511: e->r = (Expr *)s; /* chars */
512: break;
513: case Lpar:
514: eg_lex();
515: e = d0();
516: if(toktype == Rpar)
517: eg_lex();
518: else
519: err("expected a ')'");
520: return(e);
521: default:
522: err("expected a lit or '('");
523: e = 0;
524: }
525: return(e);
526: }
527:
528: static Expr *
529: d2(void)
530: {
531: Expr *e;
532: Exprtype t;
533:
534: e = d3();
535: while((toktype == Star) || (toktype == Plus) || (toktype == Quest)){
536: t = toktype;
537: eg_lex();
538: e = eg_newexpr(t, 0, e, (Expr *)0);
539: }
540: return(e);
541: }
542:
543: static Expr *
544: d1(void)
545: {
546: Expr *e, *f;
547:
548: e = d2();
549: while((toktype == Literal) || (toktype == Dot) || (toktype == Lpar)
550: || (toktype == Dollar) || (toktype == Backslash)
551: || (toktype == Carat) || (toktype == Charclass)){
552: f = d2();
553: e = eg_newexpr(Cat, 0, e, f);
554: }
555: return(e);
556: }
557:
558: static Expr *
559: d0(void)
560: {
561: Expr *e, *f;
562:
563: e = d1();
564: while(toktype == Alternate){
565: eg_lex();
566: if(toktype == EOP)
567: continue;
568: f = d1();
569: e = eg_newexpr(Alternate, 0, e, f);
570: }
571: return(e);
572: }
573:
574: /*
575: grep patterns:
576:
577: r0: r18 | '^' r18 | '^' r18 '$' | r18 '$'
578: r18: r17 { r17 }*
579: r17: r14 | r14 '*' | '\(' r18 '\)'
580: r14: lit | '.' | '*' | '\' d
581: */
582:
583: static Expr *
584: r14(void)
585: {
586: Expr *e;
587: Exprtype t;
588: int cnt;
589: char *s;
590:
591: switch(toktype)
592: {
593: case Alternate:
594: case Plus:
595: case Quest:
596: case Star:
597: case Lpar:
598: case Rpar:
599: case Dollar:
600: case Carat:
601: case Literal:
602: e = eg_newexpr(Literal, toklit, (Expr *)0, (Expr *)0);
603: eg_lex();
604: break;
605: case Backslash:
606: if((toklit >= '1') && (toklit <= '9')){
607: e = eg_newexpr(Backref, toklit-'0', (Expr *)0, (Expr *)0);
608: e->backref = 1;
609: } else {
610: e = eg_newexpr(Literal, toklit, (Expr *)0, (Expr *)0);
611: e->reallit = 1;
612: }
613: eg_lex();
614: break;
615: case Dot:
616: e = eg_newexpr(Dot, '.', (Expr *)0, (Expr *)0);
617: eg_lex();
618: break;
619: case Charclass:
620: t = toktype;
621: if(*beg == '^'){
622: t = Compcharclass;
623: beg++;
624: }
625: ccl(&cnt, &s, 1);
626: e = eg_newexpr(t, '[', (Expr *)0, (Expr *)0);
627: e->l = (Expr *)cnt; /* num of chars */
628: e->r = (Expr *)s; /* chars */
629: break;
630: default:
631: err("expected a one-char RE");
632: eg_lex(); /* make sure we don't loop */
633: e = 0;
634: }
635: return(e);
636: }
637:
638: static Expr *
639: r17(void)
640: {
641: Expr *e;
642: int cnt;
643:
644: if((toktype == Backslash) && (toklit == '(')){
645: eg_lex();
646: cnt = parno++;
647: e = r18();
648: if((toktype == Backslash) && (toklit == ')'))
649: eg_lex();
650: else
651: err("expected a closing \\)");
652: e = eg_newexpr(Group, cnt, e, (Expr *)0);
653: e->parens = 1;
654: } else {
655: e = r14();
656: if(toktype == Star){
657: e = eg_newexpr(Star, 0, e, (Expr *)0);
658: eg_lex();
659: }
660: }
661: return(e);
662: }
663:
664: static Expr *
665: r18(void)
666: {
667: Expr *e, *f;
668:
669: e = r17();
670: while(toktype != EOP){
671: if((toktype == Backslash) && (toklit == ')'))
672: break;
673: f = r17();
674: e = eg_newexpr(Cat, 0, e, f);
675: }
676: return(e);
677: }
678:
679: static Expr *
680: r0(void)
681: {
682: Expr *e, *e1;
683:
684: if(toktype == Carat){
685: e1 = eg_newexpr(Carat, '^', (Expr *)0, (Expr *)0);
686: eg_lex();
687: } else
688: e1 = 0;
689: if(toktype == EOP)
690: e = e1;
691: else {
692: e = r18();
693: /* did we see a dollar that is not a literal? */
694: if(e && (toktype == EOP)){
695: /* singleton dollar */
696: if((e->type == Literal) && (e->lit == '$'))
697: e->type = Dollar;
698: /* any other dollar */
699: if((e->type == Cat) && !e->r->reallit && (e->r->type == Literal)
700: && (e->r->lit == '$'))
701: e->r->type = Dollar;
702: }
703: if(e1){
704: if(e)
705: e = eg_newexpr(Cat, 0, e1, e);
706: else
707: e = e1;
708: }
709: }
710: if(toktype == Dollar){
711: if(e)
712: e = eg_newexpr(Cat, 0, e, eg_newexpr(Dollar, '$', (Expr *)0, (Expr *)0));
713: else
714: e = eg_newexpr(Dollar, '$', (Expr *)0, (Expr *)0);
715: eg_lex();
716: }
717: return(e);
718: }
719:
720: Expr *
721: eg_eall(enum Parsetype type, unsigned char *map)
722: {
723: Expr *e;
724:
725: mymap = map;
726: if(setjmp(gohome) == 0){
727: if(type == egrepparse)
728: while(toktype == Alternate) /* bogus but user-friendly */
729: eg_lex();
730: switch(type)
731: {
732: case greparse: e = e0(); break;
733: case grepparse: e = r0(); break;
734: case egrepparse: e = d0(); break;
735: }
736: if(type == egrepparse)
737: while(toktype == Alternate) /* bogus but user-friendly */
738: eg_lex();
739: if(toktype != EOP)
740: err("expected end of pattern");
741: } else
742: e = 0;
743: /*{char buf1[4096]; e, buf1, 0); print("e='%s'\n", buf1);}/**/
744: return(e);
745: }
746:
747: void
748: eg_spr(long c, int *p, register char *buf)
749: {
750: if(c > 0){
751: *buf++ = '{';
752: *buf = 0;
753: while(--c > 0){
754: SPR buf, "%d,", *p++);
755: buf = strchr(buf, 0);
756: }
757: SPR buf, "%d}", *p);
758: } else
759: SPR buf, "{}");
760: }
761:
762: static void
763: err(char *s)
764: {
765: char buf[4096];
766:
767: if(toklit < 0)
768: SPR buf, "expression error: %s but got end of expression", s);
769: else
770: SPR buf, "expression error: %s near '%c'", s, toklit);
771: re_error(buf);
772: longjmp(gohome, 1);
773: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.