|
|
1.1 root 1: #include "sam.h"
2:
3: Range sel;
4: String lastregexp;
5: /*
6: * Machine Information
7: */
8: typedef struct Inst{
9: int type; /* < 0x200 ==> literal, otherwise action */
10: struct Inst *right;
11: struct Inst *left;
12: }Inst;
13: #define next left /* Left branch is usually just next ptr */
14: #define NPROG 1024
15: Inst program[NPROG];
16: Inst *progp;
17: Inst *startinst; /* First inst. of program; might not be program[0] */
18: Inst *bstartinst; /* same for backwards machine */
19:
20: typedef struct Ilist{
21: Inst *inst; /* Instruction of the thread */
22: Posn startp; /* first char of match */
23: }Ilist;
24: #define NLIST 128
25: Ilist *tl, *nl; /* This list, next list */
26: Ilist list[2][NLIST];
27:
28: /*
29: * Actions and Tokens
30: *
31: * 0x2xx are operators, value == precedence
32: * 0x3xx are tokens, i.e. operands for operators
33: */
34: #define OPERATOR 0x200 /* Bitmask of all operators */
35: #define START 0x200 /* Start, used for marker on stack */
36: #define RBRA 0x201 /* Right bracket, ) */
37: #define LBRA 0x202 /* Left bracket, ( */
38: #define OR 0x203 /* Alternation, | */
39: #define CAT 0x204 /* Concatentation, implicit operator */
40: #define STAR 0x205 /* Closure, * */
41: #define PLUS 0x206 /* a+ == aa* */
42: #define QUEST 0x207 /* a? == a|nothing, i.e. 0 or 1 a's */
43: #define ANY 0x300 /* Any character but newline, . */
44: #define ANYNL 0x301 /* Any character, including newline, @ */
45: #define NOP 0x302 /* No operation, internal use only */
46: #define BOL 0x303 /* Beginning of line, ^ */
47: #define EOL 0x304 /* End of line, $ */
48: #define CCLASS 0x305 /* Character class, [] */
49: #define END 0x377 /* Terminate: match found */
50:
51: /*
52: * Parser Information
53: */
54: typedef struct Node{
55: Inst *first;
56: Inst *last;
57: }Node;
58: #define NSTACK 20
59: Node andstack[NSTACK];
60: Node *andp;
61: int atorstack[NSTACK];
62: int *atorp;
63: int lastwasand; /* Last token was operand */
64: int backwards;
65: int nbra;
66: uchar *exprp; /* pointer to next character in source expression */
67: #define NCLASS 8
68: int nclass;
69: char class[NCLASS][32]; /* 32 bytes == 256 bits, one bit per char */
70:
71: Inst *
72: newinst(t)
73: int t;
74: {
75: if(progp>= &program[NPROG])
76: error(Etoolong);
77: progp->type=t;
78: progp->left=0;
79: progp->right=0;
80: return progp++;
81: }
82: Inst *
83: realcompile(s)
84: uchar *s;
85: {
86: register token;
87:
88: startlex(s);
89: atorp=atorstack;
90: andp=andstack;
91: lastwasand=FALSE;
92: /* Start with a low priority operator to prime parser */
93: pushator(START-1);
94: while((token=lex()) != END){
95: if((token&0x300) == OPERATOR)
96: operator(token);
97: else
98: operand(token);
99: }
100: /* Close with a low priority operator */
101: evaluntil(START);
102: /* Force END */
103: operand(END);
104: evaluntil(START);
105: if(nbra)
106: error(Eleftpar);
107: --andp; /* points to first and only operand */
108: return andp->first;
109: }
110: compile(r)
111: Regexp *r;
112: {
113: register String *s= &r->text;
114: Inst *oprogp;
115: if(s->n==lastregexp.n &&
116: strncmp(s->s, lastregexp.s, lastregexp.n)==0)
117: return;
118: progp=program;
119: backwards=FALSE;
120: startinst=realcompile(s->s);
121: optimize(program);
122: oprogp=progp;
123: backwards=TRUE;
124: bstartinst=realcompile(s->s);
125: optimize(oprogp);
126: strdupstr(&lastregexp, s);
127: }
128: operand(t)
129: int t;
130: {
131: register Inst *i;
132: if(lastwasand)
133: operator(CAT); /* catenate is implicit */
134: i=newinst(t);
135: if(t==CCLASS) /* ugh */
136: i->right=(Inst *)class[nclass-1]; /* UGH! */
137: pushand(i, i);
138: lastwasand=TRUE;
139: }
140: operator(t)
141: int t;
142: {
143: if(t==RBRA && --nbra<0)
144: error(Erightpar);
145: if(t==LBRA){
146: nbra++;
147: if(lastwasand)
148: operator(CAT);
149: }else
150: evaluntil(t);
151: if(t!=RBRA)
152: pushator(t);
153: lastwasand=FALSE;
154: if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
155: lastwasand=TRUE; /* these look like operands */
156: }
157: cant(s)
158: char *s;
159: {
160: char buf[100];
161: sprint(buf, "can't happen: %s", s);
162: panic(buf);
163: }
164: pushand(f, l)
165: Inst *f, *l;
166: {
167: if(andp >= &andstack[NSTACK])
168: cant("operand stack overflow");
169: andp->first=f;
170: andp->last=l;
171: andp++;
172: }
173: pushator(t)
174: int t;
175: {
176: if(atorp >= &atorstack[NSTACK])
177: cant("operator stack overflow");
178: *atorp++=t;
179: }
180: Node *
181: popand(op)
182: {
183: if(andp <= &andstack[0])
184: error_c(Emissop, op);
185: return --andp;
186: }
187: popator()
188: {
189: if(atorp <= &atorstack[0])
190: cant("operator stack underflow");
191: return *--atorp;
192: }
193: evaluntil(pri)
194: register pri;
195: {
196: register Node *op1, *op2, *t;
197: register Inst *inst1, *inst2;
198: while(pri==RBRA || atorp[-1]>=pri){
199: switch(popator()){
200: case LBRA:
201: return; /* must have been RBRA */
202: default:
203: panic("unknown regexp operator");
204: break;
205: case OR:
206: op2=popand('|');
207: op1=popand('|');
208: inst2=newinst(NOP);
209: op2->last->next=inst2;
210: op1->last->next=inst2;
211: inst1=newinst(OR);
212: inst1->right=op1->first;
213: inst1->left=op2->first;
214: pushand(inst1, inst2);
215: break;
216: case CAT:
217: op2=popand(0);
218: op1=popand(0);
219: if(backwards && op2->first->type!=END)
220: t=op1, op1=op2, op2=t;
221: op1->last->next=op2->first;
222: pushand(op1->first, op2->last);
223: break;
224: case STAR:
225: op2=popand('*');
226: inst1=newinst(OR);
227: op2->last->next=inst1;
228: inst1->right=op2->first;
229: pushand(inst1, inst1);
230: break;
231: case PLUS:
232: op2=popand('+');
233: inst1=newinst(OR);
234: op2->last->next=inst1;
235: inst1->right=op2->first;
236: pushand(op2->first, inst1);
237: break;
238: case QUEST:
239: op2=popand('?');
240: inst1=newinst(OR);
241: inst2=newinst(NOP);
242: inst1->left=inst2;
243: inst1->right=op2->first;
244: op2->last->next=inst2;
245: pushand(inst1, inst2);
246: break;
247: }
248: }
249: }
250: optimize(start)
251: register Inst *start;
252: {
253: register Inst *inst, *target;
254:
255: for(inst=start; inst->type!=END; inst++){
256: target=inst->next;
257: while(target->type == NOP)
258: target=target->next;
259: inst->next=target;
260: }
261: }
262: #ifdef DEBUG
263: dumpstack(){
264: Node *stk;
265: int *ip;
266:
267: dprint("operators\n");
268: for(ip=atorstack; ip<atorp; ip++)
269: dprint("0%o\n", *ip);
270: dprint("operands\n");
271: for(stk=andstack; stk<andp; stk++)
272: dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
273: }
274: dump(){
275: Inst *l;
276:
277: l=program;
278: do{
279: dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
280: l->left-program, l->right-program);
281: }while(l++->type);
282: }
283: #endif
284: startlex(s)
285: uchar *s;
286: {
287: exprp=s;
288: nclass=0;
289: nbra=0;
290: }
291: char *
292: newclass(){
293: register char *p;
294: register n;
295:
296: if(nclass >= NCLASS)
297: error(Eclass);
298: p=class[nclass++];
299: for(n=0; n<32; n++)
300: p[n]=0;
301: return p;
302: }
303: lex(){
304: register c= *exprp++;
305:
306: switch(c){
307: case '\\':
308: if(*exprp)
309: if((c= *exprp++)=='n')
310: c='\n';
311: break;
312: case 0:
313: c=END;
314: --exprp; /* In case we come here again */
315: break;
316: case '*':
317: c=STAR;
318: break;
319: case '?':
320: c=QUEST;
321: break;
322: case '+':
323: c=PLUS;
324: break;
325: case '|':
326: c=OR;
327: break;
328: case '.':
329: c=ANY;
330: break;
331: case '@':
332: c=ANYNL;
333: break;
334: case '(':
335: c=LBRA;
336: break;
337: case ')':
338: c=RBRA;
339: break;
340: case '^':
341: c=BOL;
342: break;
343: case '$':
344: c=EOL;
345: break;
346: case '[':
347: c=CCLASS;
348: bldcclass();
349: break;
350: }
351: return c;
352: }
353: nextrec(){
354: if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
355: error(Ebadclass);
356: if(exprp[0]=='\\'){
357: exprp++;
358: if(*exprp=='n'){
359: exprp++;
360: return '\n';
361: }
362: return *exprp++|0x100;
363: }
364: return *exprp++;
365: }
366: bldcclass(){
367: register c1, c2;
368: register char *classp;
369: register negate=FALSE;
370:
371: classp=newclass();
372: /* we have already seen the '[' */
373: if(*exprp=='^'){
374: negate=TRUE;
375: exprp++;
376: }
377: while((c1=c2=nextrec()) != ']'){
378: if(*exprp=='-'){
379: exprp++; /* eat '-' */
380: if((c2=nextrec()) == ']')
381: error(Ebadclass);
382: }
383: for((c1&=0xFF), (c2&=0xFF); c1<=c2; c1++)
384: classp[c1/8] |= 1<<(c1&07);
385: }
386: if(negate){
387: for(c1=0; c1<32; c1++)
388: classp[c1]^=0xFF;
389: classp[1]&=~(1<<('\n'-8)); /* exclude '\n' */
390: }
391: classp[0]&=0xFE; /* exclude NUL */
392: }
393: /*
394: * Note optimization in addinst:
395: * *l must be pending when addinst called; if *l has been looked
396: * at already, the optimization is a bug.
397: */
398: addinst(l, inst, startp)
399: register Ilist *l;
400: register Inst *inst;
401: Posn startp;
402: {
403: register Ilist *p;
404: for(p=l; p->inst; p++){
405: if(p->inst==inst){
406: if(startp < p->startp)
407: p->startp=startp; /* this would be bug */
408: return; /* It's already there */
409: }
410: }
411: p->inst=inst;
412: p->startp=startp;
413: (++p)->inst=0;
414: }
415: execute(f, startp, eof)
416: File *f;
417: Posn startp, eof;
418: {
419: register flag=0;
420: register Inst *inst;
421: register Ilist *tlp;
422: register Posn p=startp;
423: int nnl=0, ntl;
424: register c;
425: int wrapped=0;
426: register startchar=startinst->type<OPERATOR? startinst->type : 0;
427:
428: list[0][0].inst=list[1][0].inst=0;
429: sel.p1= -1;
430: Fgetcset(f, startp);
431: /* Execute machine once for each character */
432: for(;;p++){
433: doloop:
434: c=Fgetc(f);
435: if(p>=eof || c<0){
436: switch(wrapped++){
437: case 0: /* let loop run one more click */
438: case 2:
439: break;
440: case 1: /* expired; wrap to beginning */
441: if(sel.p1>=0 || eof!=INFINITY)
442: goto Return;
443: list[0][0].inst=list[1][0].inst=0;
444: Fgetcset(f, (Posn)0);
445: p=0;
446: goto doloop;
447: default:
448: goto Return;
449: }
450: }else if(((wrapped && p>=startp) || sel.p1>0) && nnl==0)
451: break;
452: /* fast check for first char */
453: if(startchar && nnl==0 && c!=startchar)
454: continue;
455: tl=list[flag];
456: nl=list[flag^=1];
457: nl->inst=0;
458: ntl=nnl;
459: nnl=0;
460: if(sel.p1<0 && (!wrapped || p<startp || startp==eof)){
461: /* Add first instruction to this list */
462: if(++ntl >= NLIST)
463: Overflow:
464: error(Eoverflow);
465: addinst(tl, startinst, p);
466: }
467: /* Execute machine until this list is empty */
468: for(tlp=tl; inst=tlp->inst; tlp++){ /* assignment = */
469: Switchstmt:
470: switch(inst->type){
471: default: /* regular character */
472: if(inst->type==c){
473: Addinst:
474: if(++nnl >= NLIST)
475: goto Overflow;
476: addinst(nl, inst->next, tlp->startp);
477: }
478: break;
479: case ANYNL:
480: goto Addinst;
481: case ANY:
482: if(c!='\n')
483: goto Addinst;
484: break;
485: case BOL:
486: if(p==0){
487: Step:
488: inst=inst->next;
489: goto Switchstmt;
490: }
491: if(f->getci>1){
492: if(f->getcbuf[f->getci-2]=='\n')
493: goto Step;
494: }else{
495: uchar c;
496: if(Fchars(f, &c, p-1, p)==1 && c=='\n')
497: goto Step;
498: }
499: break;
500: case EOL:
501: if(c=='\n')
502: goto Step;
503: break;
504: case CCLASS:
505: if(c>=0 && ((char *)inst->right)[c/8]&(1<<(c&07)))
506: goto Addinst;
507: break;
508: case OR:
509: /* evaluate right choice later */
510: if(++ntl >= NLIST)
511: goto Overflow;
512: addinst(tlp, inst->right, tlp->startp);
513: /* efficiency: advance and re-evaluate */
514: inst=inst->left;
515: goto Switchstmt;
516: case END: /* Match! */
517: newmatch(tlp->startp, p);
518: break;
519: }
520: }
521: }
522: Return:
523: return sel.p1>=0;
524: }
525: newmatch(p1, p2)
526: Posn p1, p2;
527: {
528: if(sel.p1<0 || p1<sel.p1 || (p1==sel.p1 && p2>sel.p2)){
529: sel.p1=p1;
530: sel.p2=p2;
531: }
532: }
533: bexecute(f, startp)
534: register File *f;
535: Posn startp;
536: {
537: register flag=0;
538: register Inst *inst;
539: register Ilist *tlp;
540: register Posn p=startp;
541: int nnl=0, ntl;
542: register c;
543: int wrapped=0;
544: register startchar=bstartinst->type<OPERATOR? bstartinst->type : 0;
545:
546: list[0][0].inst=list[1][0].inst=0;
547: sel.p1= -1;
548: Fgetcset(f, startp);
549: /* Execute machine once for each character, including terminal NUL */
550: for(;;--p){
551: doloop:
552: if((c=Fbgetc(f))==-1){
553: switch(wrapped++){
554: case 0: /* let loop run one more click */
555: case 2:
556: break;
557: case 1: /* expired; wrap to end */
558: if(sel.p1>=0)
559: case 3:
560: goto Return;
561: list[0][0].inst=list[1][0].inst=0;
562: Fgetcset(f, f->nbytes);
563: p=f->nbytes;
564: goto doloop;
565: default:
566: goto Return;
567: }
568: }else if(((wrapped && p<=startp) || sel.p1>0) && nnl==0)
569: break;
570: /* fast check for first char */
571: if(startchar && nnl==0 && c!=startchar)
572: continue;
573: tl=list[flag];
574: nl=list[flag^=1];
575: nl->inst=0;
576: ntl=nnl;
577: nnl=0;
578: if(sel.p1<0 && (!wrapped || p>startp)){
579: /* Add first instruction to this list */
580: if(++ntl >= NLIST)
581: Overflow:
582: error(Eoverflow);
583: /* the minus is so the optimizations in addinst work */
584: addinst(tl, bstartinst, -p);
585: }
586: /* Execute machine until this list is empty */
587: for(tlp=tl; inst=tlp->inst; tlp++){ /* assignment = */
588: Switchstmt:
589: switch(inst->type){
590: default: /* regular character */
591: if(inst->type==c){
592: Addinst:
593: if(++nnl >= NLIST)
594: goto Overflow;
595: addinst(nl, inst->next, tlp->startp);
596: }
597: break;
598: case ANYNL:
599: goto Addinst;
600: case ANY:
601: if(c!='\n')
602: goto Addinst;
603: break;
604: case BOL:
605: if(c=='\n'){
606: Step:
607: inst=inst->next;
608: goto Switchstmt;
609: }
610: break;
611: case EOL:
612: if(f->getci<f->ngetc-1){
613: if(f->getcbuf[f->getci+1]=='\n')
614: goto Step;
615: }else if(p<f->nbytes-1){
616: uchar c;
617: if(Fchars(f, &c, p+1, p+2)==1 && c=='\n')
618: goto Step;
619: }
620: break;
621: case CCLASS:
622: if(((char *)inst->right)[c/8]&(1<<(c&07)))
623: goto Addinst;
624: break;
625: case OR:
626: /* evaluate right choice later */
627: if(++ntl >= NLIST)
628: goto Overflow;
629: addinst(tlp, inst->right, tlp->startp);
630: /* efficiency: advance and re-evaluate */
631: inst=inst->left;
632: goto Switchstmt;
633: case END: /* Match! */
634: bnewmatch(-tlp->startp, p); /* minus sign */
635: break;
636: }
637: }
638: }
639: Return:
640: return sel.p1>=0;
641: }
642: bnewmatch(p2, p1) /* note the reversal; p1<=p2 */
643: Posn p1, p2;
644: {
645: if(sel.p1<0 || p2>sel.p2 || (p2==sel.p2 && p1<sel.p1)){
646: sel.p1=p1;
647: sel.p2=p2;
648: }
649: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.