|
|
1.1 root 1: #include <libc.h>
2: #include "regprog.h"
3:
4: /*
5: * Parser Information
6: */
7: typedef struct Node{
8: Inst *first;
9: Inst *last;
10: }Node;
11: #define NSTACK 20
12: static Node andstack[NSTACK];
13: static Node *andp;
14: static int atorstack[NSTACK];
15: static int *atorp;
16: static int cursubid; /* id of current subexpression */
17: static int subidstack[NSTACK]; /* parallel to atorstack */
18: static int *subidp;
19: static int lastwasand; /* Last token was operand */
20: static int nbra;
21: static char *exprp; /* pointer to next character in source expression */
22: static int nclass;
23: static Class *classp;
24: static Inst *freep;
25: static int errors;
26:
27: /* predeclared crap */
28: static void operator();
29: static void pushand();
30: static void pushator();
31: static void evaluntil();
32: static void bldcclass();
33:
34: static void
35: rcerror(s)
36: char *s;
37: {
38: errors++;
39: regerror(s);
40: }
41:
42: static Inst *
43: newinst(t)
44: int t;
45: {
46: freep->type=t;
47: freep->left=0;
48: freep->right=0;
49: return freep++;
50: }
51:
52: static void
53: operand(t)
54: int t;
55: {
56: register Inst *i;
57: if(lastwasand)
58: operator(CAT); /* catenate is implicit */
59: i=newinst(t);
60: if(t==CCLASS) /* ugh */
61: i->right=(Inst *)&(classp[nclass-1]); /* UGH! */
62: pushand(i, i);
63: lastwasand=TRUE;
64: }
65:
66: static void
67: operator(t)
68: int t;
69: {
70: if(t==RBRA && --nbra<0)
71: rcerror("unmatched right paren");
72: if(t==LBRA) {
73: if (++cursubid >= NSUBEXP)
74: rcerror ("too many subexpressions");
75: nbra++;
76: if (lastwasand)
77: operator(CAT);
78: } else
79: evaluntil(t);
80: if(t!=RBRA)
81: pushator(t);
82: lastwasand=FALSE;
83: if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
84: lastwasand=TRUE; /* these look like operands */
85: }
86:
87: static void
88: regerr2(s, c)
89: char *s;
90: {
91: char buf[100];
92: char *cp = buf;
93: while(*s)
94: *cp++ = *s++;
95: *cp++ = c;
96: *cp = '\0';
97: rcerror(buf);
98: }
99:
100: static void
101: cant(s)
102: char *s;
103: {
104: char buf[100];
105: strcpy(buf, "can't happen: ");
106: strcat(buf, s);
107: rcerror(buf);
108: }
109:
110: static void
111: pushand(f, l)
112: Inst *f, *l;
113: {
114: if(andp >= &andstack[NSTACK])
115: cant("operand stack overflow");
116: andp->first=f;
117: andp->last=l;
118: andp++;
119: }
120:
121: static void
122: pushator(t)
123: int t;
124: {
125: if(atorp >= &atorstack[NSTACK])
126: cant("operator stack overflow");
127: *atorp++=t;
128: *subidp++=cursubid;
129: }
130:
131: static Node *
132: popand(op)
133: {
134: register Inst *inst;
135:
136: if(andp <= &andstack[0]) {
137: regerr2("missing operand for ", op);
138: inst=newinst(NOP);
139: pushand(inst,inst);
140: }
141: return --andp;
142: }
143:
144: static int
145: popator()
146: {
147: if(atorp <= &atorstack[0])
148: cant("operator stack underflow");
149: --subidp;
150: return *--atorp;
151: }
152:
153: static void
154: evaluntil(pri)
155: register pri;
156: {
157: register Node *op1, *op2;
158: register Inst *inst1, *inst2;
159:
160: while(pri==RBRA || atorp[-1]>=pri){
161: switch(popator()){
162: default:
163: rcerror("unknown operator in evaluntil");
164: break;
165: case LBRA: /* must have been RBRA */
166: op1=popand('(');
167: inst2=newinst(RBRA);
168: inst2->subid = *subidp;
169: op1->last->next = inst2;
170: inst1=newinst(LBRA);
171: inst1->subid = *subidp;
172: inst1->next=op1->first;
173: pushand(inst1, inst2);
174: return;
175: case OR:
176: op2=popand('|');
177: op1=popand('|');
178: inst2=newinst(NOP);
179: op2->last->next=inst2;
180: op1->last->next=inst2;
181: inst1=newinst(OR);
182: inst1->right=op1->first;
183: inst1->left=op2->first;
184: pushand(inst1, inst2);
185: break;
186: case CAT:
187: op2=popand(0);
188: op1=popand(0);
189: op1->last->next=op2->first;
190: pushand(op1->first, op2->last);
191: break;
192: case STAR:
193: op2=popand('*');
194: inst1=newinst(OR);
195: op2->last->next=inst1;
196: inst1->right=op2->first;
197: pushand(inst1, inst1);
198: break;
199: case PLUS:
200: op2=popand('+');
201: inst1=newinst(OR);
202: op2->last->next=inst1;
203: inst1->right=op2->first;
204: pushand(op2->first, inst1);
205: break;
206: case QUEST:
207: op2=popand('?');
208: inst1=newinst(OR);
209: inst2=newinst(NOP);
210: inst1->left=inst2;
211: inst1->right=op2->first;
212: op2->last->next=inst2;
213: pushand(inst1, inst2);
214: break;
215: }
216: }
217: }
218:
219: static Prog *
220: optimize(pp)
221: Prog *pp;
222: {
223: register Inst *inst, *target;
224: int size;
225: Prog *npp;
226: int diff;
227:
228: /*
229: * get rid of NOOP chains
230: */
231: for(inst=pp->firstinst; inst->type!=END; inst++){
232: target=inst->next;
233: while(target->type == NOP)
234: target=target->next;
235: inst->next=target;
236: }
237:
238: /*
239: * The original allocation is for an area larger than
240: * necessary. Reallocate to the actual space used
241: * and then relocate the code.
242: */
243: size = sizeof(Prog) + (freep - pp->firstinst)*sizeof(Inst);
244: npp = (Prog *)realloc((char *)pp, size);
245: if(npp==NULL || npp==pp)
246: return(pp);
247: diff = (char *)npp - (char *)pp;
248: freep = (Inst *)((char *)freep + diff);
249: for(inst=npp->firstinst; inst<freep; inst++){
250: switch(inst->type){
251: case OR:
252: case STAR:
253: case PLUS:
254: case QUEST:
255: case CCLASS:
256: *(char **)&inst->right += diff;
257: break;
258: }
259: *(char **)&inst->left += diff;
260: }
261: *(char **)&npp->startinst += diff;
262: return(npp);
263: }
264:
265: #ifdef DEBUG
266: static void
267: dumpstack(){
268: Node *stk;
269: int *ip;
270:
271: printf("operators\n");
272: for(ip=atorstack; ip<atorp; ip++)
273: printf("0%o\n", *ip);
274: printf("operands\n");
275: for(stk=andstack; stk<andp; stk++)
276: printf("0%o\t0%o\n", stk->first->type, stk->last->type);
277: }
278:
279: static void
280: dump(pp)
281: Prog *pp;
282: {
283: Inst *l;
284:
285: l=pp->firstinst;
286: do{
287: printf("%d:\t0%o\t%d\t%d\n", l-pp->firstinst, l->type,
288: l->left-pp->firstinst, l->right-pp->firstinst);
289: }while(l++->type);
290: }
291: #endif
292:
293: static void
294: startlex(s)
295: char *s;
296: {
297: exprp=s;
298: nclass=0;
299: nbra=0;
300: }
301:
302: static Class *
303: newclass(){
304: register Class *p;
305: register n;
306:
307: if(nclass >= NCLASS)
308: regerr2("too many character classes; limit", NCLASS+'0');
309: p = &(classp[nclass++]);
310: for(n=0; n<16; n++)
311: p->map[n]=0;
312: return p;
313: }
314:
315: static int
316: lex(){
317: register c= *exprp++;
318:
319: switch(c){
320: case '\\':
321: if(*exprp)
322: c= *exprp++;
323: break;
324: case 0:
325: c=END;
326: --exprp; /* In case we come here again */
327: break;
328: case '*':
329: c=STAR;
330: break;
331: case '?':
332: c=QUEST;
333: break;
334: case '+':
335: c=PLUS;
336: break;
337: case '|':
338: c=OR;
339: break;
340: case '.':
341: c=ANY;
342: break;
343: case '(':
344: c=LBRA;
345: break;
346: case ')':
347: c=RBRA;
348: break;
349: case '^':
350: c=BOL;
351: break;
352: case '$':
353: c=EOL;
354: break;
355: case '[':
356: c=CCLASS;
357: bldcclass();
358: break;
359: }
360: return c;
361: }
362:
363: static int
364: nextc(){
365: if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
366: rcerror("malformed '[]'");
367: if(exprp[0]=='\\'){
368: exprp++;
369: return *exprp++|0200;
370: }
371: return *exprp++;
372: }
373:
374: static void
375: bldcclass(){
376: register c1, c2;
377: register Class *classp;
378: register negate=FALSE;
379:
380: classp=newclass();
381: /* we have already seen the '[' */
382: if(*exprp=='^'){
383: negate=TRUE;
384: exprp++;
385: }
386: while((c1=c2=nextc()) != ']'){
387: if(*exprp=='-'){
388: exprp++; /* eat '-' */
389: if((c2=nextc()) == ']')
390: rcerror("malformed '[]'");
391: }
392: for((c1&=0177), (c2&=0177); c1<=c2; c1++)
393: classp->map[c1/8] |= 1<<(c1&07);
394: }
395: if(negate)
396: for(c1=0; c1<16; c1++)
397: classp->map[c1]^=0377;
398: classp->map[0] &= 0376; /* exclude NUL */
399: }
400:
401: extern regexp *
402: regcomp(s)
403: char *s;
404: {
405: register token;
406: Prog *pp;
407:
408: /* get memory for the program */
409: pp = (Prog *)malloc(sizeof(Prog) + 3*sizeof(Inst)*strlen(s));
410: if (pp == NULL) {
411: rcerror("out of memory");
412: return NULL;
413: }
414: freep = pp->firstinst;
415: classp = pp->class;
416: errors = 0;
417:
418: /* go compile the sucker */
419: startlex(s);
420: atorp=atorstack;
421: andp=andstack;
422: subidp=subidstack;
423: lastwasand=FALSE;
424: cursubid=0;
425:
426: /* Start with a low priority operator to prime parser */
427: pushator(START-1);
428: while((token=lex()) != END){
429: if((token&0300) == OPERATOR)
430: operator(token);
431: else
432: operand(token);
433: }
434:
435: /* Close with a low priority operator */
436: evaluntil(START);
437:
438: /* Force END */
439: operand(END);
440: evaluntil(START);
441: #ifdef DEBUG
442: dumpstack();
443: #endif
444: if(nbra)
445: rcerror("unmatched left paren");
446: --andp; /* points to first and only operand */
447: pp->startinst=andp->first;
448: #ifdef DEBUG
449: dump(pp);
450: #endif
451: pp = optimize(pp);
452: #ifdef DEBUG
453: printf("start: %d\n", andp->first-pp->firstinst);
454: dump(pp);
455: #endif
456: if (errors) {
457: free((char *)pp);
458: pp = NULL;
459: }
460: return (regexp *)pp;
461: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.