|
|
1.1 root 1: #ifdef DEBUG
2: #include <stdio.h>
3: #endif
4: #include <libc.h>
5: #include "regprog.h"
6:
7: /*
8: * Parser Information
9: */
10: typedef struct Node{
11: Inst *first;
12: Inst *last;
13: }Node;
14: #define NSTACK 20
15: static Node andstack[NSTACK];
16: static Node *andp;
17: static int atorstack[NSTACK];
18: static int *atorp;
19: static int cursubid; /* id of current subexpression */
20: static int subidstack[NSTACK]; /* parallel to atorstack */
21: static int *subidp;
22: static int lastwasand; /* Last token was operand */
23: static int nbra;
24: static char *exprp; /* pointer to next character in source expression */
25: static int nclass;
26: static Class *classp;
27: static Inst *freep;
28: static int errors;
29:
30: /* predeclared crap */
31: static void operator();
32: static void pushand();
33: static void pushator();
34: static void evaluntil();
35: static void bldcclass();
36:
37: static void
38: rcerror(s)
39: char *s;
40: {
41: errors++;
42: regerror(s);
43: }
44:
45: static Inst *
46: newinst(t)
47: int t;
48: {
49: freep->type=t;
50: freep->left=0;
51: freep->right=0;
52: return freep++;
53: }
54:
55: static void
56: operand(t)
57: int t;
58: {
59: register Inst *i;
60: if(lastwasand)
61: operator(CAT); /* catenate is implicit */
62: i=newinst(t);
63: if(t==CCLASS) /* ugh */
64: i->right=(Inst *)&(classp[nclass-1]); /* UGH! */
65: pushand(i, i);
66: lastwasand=TRUE;
67: }
68:
69: static void
70: operator(t)
71: int t;
72: {
73: if(t==RBRA && --nbra<0)
74: rcerror("unmatched right paren");
75: if(t==LBRA) {
76: if (++cursubid >= NSUBEXP)
77: rcerror ("too many subexpressions");
78: nbra++;
79: if (lastwasand)
80: operator(CAT);
81: } else
82: evaluntil(t);
83: if(t!=RBRA)
84: pushator(t);
85: lastwasand=FALSE;
86: if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
87: lastwasand=TRUE; /* these look like operands */
88: }
89:
90: static void
91: regerr2(s, c)
92: char *s;
93: {
94: char buf[100];
95: char *cp = buf;
96: while(*s)
97: *cp++ = *s++;
98: *cp++ = c;
99: *cp = '\0';
100: rcerror(buf);
101: }
102:
103: static void
104: cant(s)
105: char *s;
106: {
107: char buf[100];
108: strcpy(buf, "can't happen: ");
109: strcat(buf, s);
110: rcerror(buf);
111: }
112:
113: static void
114: pushand(f, l)
115: Inst *f, *l;
116: {
117: if(andp >= &andstack[NSTACK])
118: cant("operand stack overflow");
119: andp->first=f;
120: andp->last=l;
121: andp++;
122: }
123:
124: static void
125: pushator(t)
126: int t;
127: {
128: if(atorp >= &atorstack[NSTACK])
129: cant("operator stack overflow");
130: *atorp++=t;
131: *subidp++=cursubid;
132: }
133:
134: static Node *
135: popand(op)
136: {
137: register Inst *inst;
138:
139: if(andp <= &andstack[0]) {
140: regerr2("missing operand for ", op);
141: inst=newinst(NOP);
142: pushand(inst,inst);
143: }
144: return --andp;
145: }
146:
147: static int
148: popator()
149: {
150: if(atorp <= &atorstack[0])
151: cant("operator stack underflow");
152: --subidp;
153: return *--atorp;
154: }
155:
156: static void
157: evaluntil(pri)
158: register pri;
159: {
160: register Node *op1, *op2;
161: register Inst *inst1, *inst2;
162:
163: while(pri==RBRA || atorp[-1]>=pri){
164: switch(popator()){
165: default:
166: rcerror("unknown operator in evaluntil");
167: break;
168: case LBRA: /* must have been RBRA */
169: op1=popand('(');
170: inst2=newinst(RBRA);
171: inst2->subid = *subidp;
172: op1->last->next = inst2;
173: inst1=newinst(LBRA);
174: inst1->subid = *subidp;
175: inst1->next=op1->first;
176: pushand(inst1, inst2);
177: return;
178: case OR:
179: op2=popand('|');
180: op1=popand('|');
181: inst2=newinst(NOP);
182: op2->last->next=inst2;
183: op1->last->next=inst2;
184: inst1=newinst(OR);
185: inst1->right=op1->first;
186: inst1->left=op2->first;
187: pushand(inst1, inst2);
188: break;
189: case CAT:
190: op2=popand(0);
191: op1=popand(0);
192: op1->last->next=op2->first;
193: pushand(op1->first, op2->last);
194: break;
195: case STAR:
196: op2=popand('*');
197: inst1=newinst(OR);
198: op2->last->next=inst1;
199: inst1->right=op2->first;
200: pushand(inst1, inst1);
201: break;
202: case PLUS:
203: op2=popand('+');
204: inst1=newinst(OR);
205: op2->last->next=inst1;
206: inst1->right=op2->first;
207: pushand(op2->first, inst1);
208: break;
209: case QUEST:
210: op2=popand('?');
211: inst1=newinst(OR);
212: inst2=newinst(NOP);
213: inst1->left=inst2;
214: inst1->right=op2->first;
215: op2->last->next=inst2;
216: pushand(inst1, inst2);
217: break;
218: }
219: }
220: }
221:
222: static Prog *
223: optimize(pp)
224: Prog *pp;
225: {
226: register Inst *inst, *target;
227: int size;
228: Prog *npp;
229:
230: /*
231: * get rid of NOOP chains
232: */
233: for(inst=pp->firstinst; inst->type!=END; inst++){
234: target=inst->next;
235: while(target->type == NOP)
236: target=target->next;
237: inst->next=target;
238: }
239:
240: /*
241: * The original allocation is for an area larger than
242: * necessary. Reallocate to the actual space used
243: * and then relocate the code.
244: */
245: size = sizeof(Prog) + (freep - pp->firstinst)*sizeof(Inst);
246: npp = (Prog *)realloc((char *)pp, size);
247: if(npp==NULL || npp==pp)
248: return(pp);
249: freep = &npp->firstinst[freep - pp->firstinst];
250: npp->startinst = &npp->firstinst[pp->startinst - pp->firstinst];
251: for(inst=npp->firstinst; inst<freep; inst++){
252: switch(inst->type){
253: case OR:
254: case STAR:
255: case PLUS:
256: case QUEST:
257: inst->right = &npp->firstinst[inst->right - pp->firstinst];
258: break;
259: case CCLASS:
260: inst->right = (Inst *) &npp->class[(Class*)inst->right - pp->class];
261: break;
262: }
263: inst->left = &npp->firstinst[inst->left - pp->firstinst];
264: }
265: return(npp);
266: }
267:
268: #ifdef DEBUG
269: static char *
270: dumptype(t){
271: static char ordinary[4] = "'.'";
272:
273: switch(t){
274: case START: return "START";
275: case RBRA: return "RBRA";
276: case LBRA: return "LBRA";
277: case OR: return "OR";
278: case CAT: return "CAT";
279: case STAR: return "STAR";
280: case PLUS: return "PLUS";
281: case QUEST: return "QUEST";
282: case ANY: return "ANY";
283: case NOP: return "NOP";
284: case BOL: return "BOL";
285: case EOL: return "EOL";
286: case CCLASS: return "CCLASS";
287: case END: return "END";
288: default:
289: ordinary[1] = t;
290: return ordinary;
291: }
292: }
293:
294: static void
295: dumpstack(){
296: Node *stk;
297: int *ip;
298:
299: printf("operators\n");
300: for(ip=atorstack; ip<atorp; ip++)
301: printf("0%o\n", *ip);
302: printf("operands\n");
303: for(stk=andstack; stk<andp; stk++){
304: printf("%s\t", dumptype(stk->first->type));
305: printf("%s\n", dumptype(stk->last->type));
306: }
307: }
308:
309: static void
310: putC(c)
311: {
312: if(c < ' ' || '~' < c){
313: switch(c){
314: default:
315: putchar('^');
316: putchar((c + '@') & 0x7f);
317: return;
318: case '\b':
319: c = 'b';
320: break;
321: case '\t':
322: c = 't';
323: break;
324: case '\n':
325: c = 'n';
326: break;
327: case '\v':
328: c = 'v';
329: break;
330: case '\f':
331: c = 'f';
332: break;
333: case '\r':
334: c = 'r';
335: break;
336: }
337: putchar('\\');
338: }
339: putchar(c);
340: }
341:
342: static void
343: putCHAR(from, to)
344: {
345: if(from != 0)
346: if(from == to)
347: putC(from);
348: else{
349: putC(from);
350: putchar('-');
351: putC(to);
352: }
353: }
354:
355: static void
356: dump(pp)
357: Prog *pp;
358: {
359: int c, from;
360: Inst *l;
361: Class *classp;
362:
363: l=pp->firstinst;
364: do{
365: printf("%d:\t%s\t%d\t", l-pp->firstinst, dumptype(l->type),
366: l->left-pp->firstinst);
367: if(l->type == CCLASS){
368: classp = (Class*) l->right;
369: putchar('[');
370: if(classp->map[0] & 02){ /* ^A? */
371: putchar('^'); /* assume negation */
372: for(from=0, c=1; c < 128; ++c){
373: if(classp->map[c/8] & (1<<(c&07))){
374: putCHAR(from, c-1);
375: from = 0;
376: } else {
377: if(from == 0)
378: from = c;
379: }
380: }
381: putCHAR(from, c-1);
382: } else {
383: for(from=0, c=1; c < 128; ++c){
384: if(classp->map[c/8] & (1<<(c&07))){
385: if(from == 0)
386: from = c;
387: } else {
388: putCHAR(from, c-1);
389: from = 0;
390: }
391: }
392: putCHAR(from, c-1);
393: }
394: putchar(']');
395: } else
396: printf("%d", l->right-pp->firstinst);
397: putchar('\n');
398: }while(l++->type);
399: }
400: #endif
401:
402: static void
403: startlex(s)
404: char *s;
405: {
406: exprp=s;
407: nclass=0;
408: nbra=0;
409: }
410:
411: static Class *
412: newclass(){
413: register Class *p;
414: register n;
415:
416: if(nclass >= NCLASS)
417: regerr2("too many character classes; limit", NCLASS+'0');
418: p = &(classp[nclass++]);
419: for(n=0; n<16; n++)
420: p->map[n]=0;
421: return p;
422: }
423:
424: static int
425: lex(){
426: register c= *exprp++;
427:
428: switch(c){
429: case '\\':
430: if(*exprp)
431: c= *exprp++;
432: break;
433: case 0:
434: c=END;
435: --exprp; /* In case we come here again */
436: break;
437: case '*':
438: c=STAR;
439: break;
440: case '?':
441: c=QUEST;
442: break;
443: case '+':
444: c=PLUS;
445: break;
446: case '|':
447: c=OR;
448: break;
449: case '.':
450: c=ANY;
451: break;
452: case '(':
453: c=LBRA;
454: break;
455: case ')':
456: c=RBRA;
457: break;
458: case '^':
459: c=BOL;
460: break;
461: case '$':
462: c=EOL;
463: break;
464: case '[':
465: c=CCLASS;
466: bldcclass();
467: break;
468: }
469: return c;
470: }
471:
472: static int
473: nextc(){
474: if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
475: rcerror("malformed '[]'");
476: if(exprp[0]=='\\'){
477: exprp++;
478: return *exprp++|0200;
479: }
480: return *exprp++;
481: }
482:
483: static void
484: bldcclass(){
485: register c1, c2;
486: register Class *classp;
487: register negate=FALSE;
488:
489: classp=newclass();
490: /* we have already seen the '[' */
491: if(*exprp=='^'){
492: negate=TRUE;
493: exprp++;
494: }
495: while((c1=c2=nextc()) != ']'){
496: if(*exprp=='-'){
497: exprp++; /* eat '-' */
498: if((c2=nextc()) == ']')
499: rcerror("malformed '[]'");
500: }
501: for((c1&=0177), (c2&=0177); c1<=c2; c1++)
502: classp->map[c1/8] |= 1<<(c1&07);
503: }
504: if(negate)
505: for(c1=0; c1<16; c1++)
506: classp->map[c1]^=0377;
507: classp->map[0] &= 0376; /* exclude NUL */
508: }
509:
510: extern regexp *
511: regcomp(s)
512: char *s;
513: {
514: register token;
515: Prog *pp;
516:
517: /* get memory for the program */
518: pp = (Prog *)malloc(sizeof(Prog) + 3*sizeof(Inst)*strlen(s));
519: if (pp == NULL) {
520: rcerror("out of memory");
521: return NULL;
522: }
523: freep = pp->firstinst;
524: classp = pp->class;
525: errors = 0;
526:
527: /* go compile the sucker */
528: startlex(s);
529: atorp=atorstack;
530: andp=andstack;
531: subidp=subidstack;
532: lastwasand=FALSE;
533: cursubid=0;
534:
535: /* Start with a low priority operator to prime parser */
536: pushator(START-1);
537: while((token=lex()) != END){
538: if((token&0300) == OPERATOR)
539: operator(token);
540: else
541: operand(token);
542: }
543:
544: /* Close with a low priority operator */
545: evaluntil(START);
546:
547: /* Force END */
548: operand(END);
549: evaluntil(START);
550: #ifdef DEBUG
551: dumpstack();
552: #endif
553: if(nbra)
554: rcerror("unmatched left paren");
555: --andp; /* points to first and only operand */
556: pp->startinst=andp->first;
557: #ifdef DEBUG
558: dump(pp);
559: #endif
560: pp = optimize(pp);
561: #ifdef DEBUG
562: printf("start: %d\n", pp->startinst-pp->firstinst);
563: dump(pp);
564: fflush(stdout);
565: #endif
566: if (errors) {
567: free((char *)pp);
568: pp = NULL;
569: }
570: return (regexp *)pp;
571: }
572:
573: #ifdef DEBUG
574: #include <setjmp.h>
575:
576: jmp_buf jmpbuf;
577:
578: main( argc, argv )
579: char **argv;
580: {
581: regexp *prog = NULL;
582: regsubexp match[NSUBEXP];
583: char line[256];
584: int i;
585:
586: while( TRUE ) {
587: setjmp( jmpbuf );
588:
589: if(prog)
590: free( prog );
591:
592: do {
593: fputs( "Enter re: ", stdout );
594: if ( gets(line) == NULL )
595: exit(0);
596: } while( (prog = regcomp(line)) == NULL );
597:
598: fputs( "Enter string: ", stdout );
599: while( gets(line) != NULL ) {
600: if ( !regexec(prog,line,match,NSUBEXP) )
601: puts( "*** NO MATCH ***" );
602: else
603: for( i = 0; i < NSUBEXP; ++i )
604: if ( match[i].sp )
605: printf( "match[%d] = \"%.*s\"\n", i, match[i].ep - match[i].sp, match[i].sp );
606: }
607: }
608: }
609:
610: regerror( s )
611: char *s;
612: {
613: puts( s );
614: longjmp( jmpbuf );
615: }
616:
617: #endif
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.