|
|
1.1 ! root 1: #include "sam.h" ! 2: ! 3: Range sel; ! 4: String lastregexp; ! 5: /* ! 6: * Machine Information ! 7: */ ! 8: typedef struct Inst{ ! 9: int type; /* < 0x200 ==> literal, otherwise action */ ! 10: struct Inst *right; ! 11: struct Inst *left; ! 12: }Inst; ! 13: #define next left /* Left branch is usually just next ptr */ ! 14: #define NPROG 1024 ! 15: Inst program[NPROG]; ! 16: Inst *progp; ! 17: Inst *startinst; /* First inst. of program; might not be program[0] */ ! 18: Inst *bstartinst; /* same for backwards machine */ ! 19: ! 20: typedef struct Ilist{ ! 21: Inst *inst; /* Instruction of the thread */ ! 22: Posn startp; /* first char of match */ ! 23: }Ilist; ! 24: #define NLIST 128 ! 25: Ilist *tl, *nl; /* This list, next list */ ! 26: Ilist list[2][NLIST]; ! 27: ! 28: /* ! 29: * Actions and Tokens ! 30: * ! 31: * 0x2xx are operators, value == precedence ! 32: * 0x3xx are tokens, i.e. operands for operators ! 33: */ ! 34: #define OPERATOR 0x200 /* Bitmask of all operators */ ! 35: #define START 0x200 /* Start, used for marker on stack */ ! 36: #define RBRA 0x201 /* Right bracket, ) */ ! 37: #define LBRA 0x202 /* Left bracket, ( */ ! 38: #define OR 0x203 /* Alternation, | */ ! 39: #define CAT 0x204 /* Concatentation, implicit operator */ ! 40: #define STAR 0x205 /* Closure, * */ ! 41: #define PLUS 0x206 /* a+ == aa* */ ! 42: #define QUEST 0x207 /* a? == a|nothing, i.e. 0 or 1 a's */ ! 43: #define ANY 0x300 /* Any character but newline, . */ ! 44: #define ANYNL 0x301 /* Any character, including newline, @ */ ! 45: #define NOP 0x302 /* No operation, internal use only */ ! 46: #define BOL 0x303 /* Beginning of line, ^ */ ! 47: #define EOL 0x304 /* End of line, $ */ ! 48: #define CCLASS 0x305 /* Character class, [] */ ! 49: #define END 0x377 /* Terminate: match found */ ! 50: ! 51: /* ! 52: * Parser Information ! 53: */ ! 54: typedef struct Node{ ! 55: Inst *first; ! 56: Inst *last; ! 57: }Node; ! 58: #define NSTACK 20 ! 59: Node andstack[NSTACK]; ! 60: Node *andp; ! 61: int atorstack[NSTACK]; ! 62: int *atorp; ! 63: int lastwasand; /* Last token was operand */ ! 64: int backwards; ! 65: int nbra; ! 66: uchar *exprp; /* pointer to next character in source expression */ ! 67: #define NCLASS 8 ! 68: int nclass; ! 69: char class[NCLASS][32]; /* 32 bytes == 256 bits, one bit per char */ ! 70: ! 71: Inst * ! 72: newinst(t) ! 73: int t; ! 74: { ! 75: if(progp>= &program[NPROG]) ! 76: error(Etoolong); ! 77: progp->type=t; ! 78: progp->left=0; ! 79: progp->right=0; ! 80: return progp++; ! 81: } ! 82: Inst * ! 83: realcompile(s) ! 84: uchar *s; ! 85: { ! 86: register token; ! 87: ! 88: startlex(s); ! 89: atorp=atorstack; ! 90: andp=andstack; ! 91: lastwasand=FALSE; ! 92: /* Start with a low priority operator to prime parser */ ! 93: pushator(START-1); ! 94: while((token=lex()) != END){ ! 95: if((token&0x300) == OPERATOR) ! 96: operator(token); ! 97: else ! 98: operand(token); ! 99: } ! 100: /* Close with a low priority operator */ ! 101: evaluntil(START); ! 102: /* Force END */ ! 103: operand(END); ! 104: evaluntil(START); ! 105: if(nbra) ! 106: error(Eleftpar); ! 107: --andp; /* points to first and only operand */ ! 108: return andp->first; ! 109: } ! 110: compile(r) ! 111: Regexp *r; ! 112: { ! 113: register String *s= &r->text; ! 114: Inst *oprogp; ! 115: if(s->n==lastregexp.n && ! 116: strncmp(s->s, lastregexp.s, lastregexp.n)==0) ! 117: return; ! 118: progp=program; ! 119: backwards=FALSE; ! 120: startinst=realcompile(s->s); ! 121: optimize(program); ! 122: oprogp=progp; ! 123: backwards=TRUE; ! 124: bstartinst=realcompile(s->s); ! 125: optimize(oprogp); ! 126: strdupstr(&lastregexp, s); ! 127: } ! 128: operand(t) ! 129: int t; ! 130: { ! 131: register Inst *i; ! 132: if(lastwasand) ! 133: operator(CAT); /* catenate is implicit */ ! 134: i=newinst(t); ! 135: if(t==CCLASS) /* ugh */ ! 136: i->right=(Inst *)class[nclass-1]; /* UGH! */ ! 137: pushand(i, i); ! 138: lastwasand=TRUE; ! 139: } ! 140: operator(t) ! 141: int t; ! 142: { ! 143: if(t==RBRA && --nbra<0) ! 144: error(Erightpar); ! 145: if(t==LBRA){ ! 146: nbra++; ! 147: if(lastwasand) ! 148: operator(CAT); ! 149: }else ! 150: evaluntil(t); ! 151: if(t!=RBRA) ! 152: pushator(t); ! 153: lastwasand=FALSE; ! 154: if(t==STAR || t==QUEST || t==PLUS || t==RBRA) ! 155: lastwasand=TRUE; /* these look like operands */ ! 156: } ! 157: cant(s) ! 158: char *s; ! 159: { ! 160: char buf[100]; ! 161: sprint(buf, "can't happen: %s", s); ! 162: panic(buf); ! 163: } ! 164: pushand(f, l) ! 165: Inst *f, *l; ! 166: { ! 167: if(andp >= &andstack[NSTACK]) ! 168: cant("operand stack overflow"); ! 169: andp->first=f; ! 170: andp->last=l; ! 171: andp++; ! 172: } ! 173: pushator(t) ! 174: int t; ! 175: { ! 176: if(atorp >= &atorstack[NSTACK]) ! 177: cant("operator stack overflow"); ! 178: *atorp++=t; ! 179: } ! 180: Node * ! 181: popand(op) ! 182: { ! 183: if(andp <= &andstack[0]) ! 184: error_c(Emissop, op); ! 185: return --andp; ! 186: } ! 187: popator() ! 188: { ! 189: if(atorp <= &atorstack[0]) ! 190: cant("operator stack underflow"); ! 191: return *--atorp; ! 192: } ! 193: evaluntil(pri) ! 194: register pri; ! 195: { ! 196: register Node *op1, *op2, *t; ! 197: register Inst *inst1, *inst2; ! 198: while(pri==RBRA || atorp[-1]>=pri){ ! 199: switch(popator()){ ! 200: case LBRA: ! 201: return; /* must have been RBRA */ ! 202: default: ! 203: panic("unknown regexp operator"); ! 204: break; ! 205: case OR: ! 206: op2=popand('|'); ! 207: op1=popand('|'); ! 208: inst2=newinst(NOP); ! 209: op2->last->next=inst2; ! 210: op1->last->next=inst2; ! 211: inst1=newinst(OR); ! 212: inst1->right=op1->first; ! 213: inst1->left=op2->first; ! 214: pushand(inst1, inst2); ! 215: break; ! 216: case CAT: ! 217: op2=popand(0); ! 218: op1=popand(0); ! 219: if(backwards && op2->first->type!=END) ! 220: t=op1, op1=op2, op2=t; ! 221: op1->last->next=op2->first; ! 222: pushand(op1->first, op2->last); ! 223: break; ! 224: case STAR: ! 225: op2=popand('*'); ! 226: inst1=newinst(OR); ! 227: op2->last->next=inst1; ! 228: inst1->right=op2->first; ! 229: pushand(inst1, inst1); ! 230: break; ! 231: case PLUS: ! 232: op2=popand('+'); ! 233: inst1=newinst(OR); ! 234: op2->last->next=inst1; ! 235: inst1->right=op2->first; ! 236: pushand(op2->first, inst1); ! 237: break; ! 238: case QUEST: ! 239: op2=popand('?'); ! 240: inst1=newinst(OR); ! 241: inst2=newinst(NOP); ! 242: inst1->left=inst2; ! 243: inst1->right=op2->first; ! 244: op2->last->next=inst2; ! 245: pushand(inst1, inst2); ! 246: break; ! 247: } ! 248: } ! 249: } ! 250: optimize(start) ! 251: register Inst *start; ! 252: { ! 253: register Inst *inst, *target; ! 254: ! 255: for(inst=start; inst->type!=END; inst++){ ! 256: target=inst->next; ! 257: while(target->type == NOP) ! 258: target=target->next; ! 259: inst->next=target; ! 260: } ! 261: } ! 262: #ifdef DEBUG ! 263: dumpstack(){ ! 264: Node *stk; ! 265: int *ip; ! 266: ! 267: dprint("operators\n"); ! 268: for(ip=atorstack; ip<atorp; ip++) ! 269: dprint("0%o\n", *ip); ! 270: dprint("operands\n"); ! 271: for(stk=andstack; stk<andp; stk++) ! 272: dprint("0%o\t0%o\n", stk->first->type, stk->last->type); ! 273: } ! 274: dump(){ ! 275: Inst *l; ! 276: ! 277: l=program; ! 278: do{ ! 279: dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type, ! 280: l->left-program, l->right-program); ! 281: }while(l++->type); ! 282: } ! 283: #endif ! 284: startlex(s) ! 285: uchar *s; ! 286: { ! 287: exprp=s; ! 288: nclass=0; ! 289: nbra=0; ! 290: } ! 291: char * ! 292: newclass(){ ! 293: register char *p; ! 294: register n; ! 295: ! 296: if(nclass >= NCLASS) ! 297: error(Eclass); ! 298: p=class[nclass++]; ! 299: for(n=0; n<32; n++) ! 300: p[n]=0; ! 301: return p; ! 302: } ! 303: lex(){ ! 304: register c= *exprp++; ! 305: ! 306: switch(c){ ! 307: case '\\': ! 308: if(*exprp) ! 309: if((c= *exprp++)=='n') ! 310: c='\n'; ! 311: break; ! 312: case 0: ! 313: c=END; ! 314: --exprp; /* In case we come here again */ ! 315: break; ! 316: case '*': ! 317: c=STAR; ! 318: break; ! 319: case '?': ! 320: c=QUEST; ! 321: break; ! 322: case '+': ! 323: c=PLUS; ! 324: break; ! 325: case '|': ! 326: c=OR; ! 327: break; ! 328: case '.': ! 329: c=ANY; ! 330: break; ! 331: case '@': ! 332: c=ANYNL; ! 333: break; ! 334: case '(': ! 335: c=LBRA; ! 336: break; ! 337: case ')': ! 338: c=RBRA; ! 339: break; ! 340: case '^': ! 341: c=BOL; ! 342: break; ! 343: case '$': ! 344: c=EOL; ! 345: break; ! 346: case '[': ! 347: c=CCLASS; ! 348: bldcclass(); ! 349: break; ! 350: } ! 351: return c; ! 352: } ! 353: nextrec(){ ! 354: if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0)) ! 355: error(Ebadclass); ! 356: if(exprp[0]=='\\'){ ! 357: exprp++; ! 358: if(*exprp=='n'){ ! 359: exprp++; ! 360: return '\n'; ! 361: } ! 362: return *exprp++|0x100; ! 363: } ! 364: return *exprp++; ! 365: } ! 366: bldcclass(){ ! 367: register c1, c2; ! 368: register char *classp; ! 369: register negate=FALSE; ! 370: ! 371: classp=newclass(); ! 372: /* we have already seen the '[' */ ! 373: if(*exprp=='^'){ ! 374: negate=TRUE; ! 375: exprp++; ! 376: } ! 377: while((c1=c2=nextrec()) != ']'){ ! 378: if(*exprp=='-'){ ! 379: exprp++; /* eat '-' */ ! 380: if((c2=nextrec()) == ']') ! 381: error(Ebadclass); ! 382: } ! 383: for((c1&=0xFF), (c2&=0xFF); c1<=c2; c1++) ! 384: classp[c1/8] |= 1<<(c1&07); ! 385: } ! 386: if(negate){ ! 387: for(c1=0; c1<32; c1++) ! 388: classp[c1]^=0xFF; ! 389: classp[1]&=~(1<<('\n'-8)); /* exclude '\n' */ ! 390: } ! 391: classp[0]&=0xFE; /* exclude NUL */ ! 392: } ! 393: /* ! 394: * Note optimization in addinst: ! 395: * *l must be pending when addinst called; if *l has been looked ! 396: * at already, the optimization is a bug. ! 397: */ ! 398: addinst(l, inst, startp) ! 399: register Ilist *l; ! 400: register Inst *inst; ! 401: Posn startp; ! 402: { ! 403: register Ilist *p; ! 404: for(p=l; p->inst; p++){ ! 405: if(p->inst==inst){ ! 406: if(startp < p->startp) ! 407: p->startp=startp; /* this would be bug */ ! 408: return; /* It's already there */ ! 409: } ! 410: } ! 411: p->inst=inst; ! 412: p->startp=startp; ! 413: (++p)->inst=0; ! 414: } ! 415: execute(f, startp, eof) ! 416: File *f; ! 417: Posn startp, eof; ! 418: { ! 419: register flag=0; ! 420: register Inst *inst; ! 421: register Ilist *tlp; ! 422: register Posn p=startp; ! 423: int nnl=0, ntl; ! 424: register c; ! 425: int wrapped=0; ! 426: register startchar=startinst->type<OPERATOR? startinst->type : 0; ! 427: ! 428: list[0][0].inst=list[1][0].inst=0; ! 429: sel.p1= -1; ! 430: Fgetcset(f, startp); ! 431: /* Execute machine once for each character */ ! 432: for(;;p++){ ! 433: doloop: ! 434: c=Fgetc(f); ! 435: if(p>=eof || c<0){ ! 436: switch(wrapped++){ ! 437: case 0: /* let loop run one more click */ ! 438: case 2: ! 439: break; ! 440: case 1: /* expired; wrap to beginning */ ! 441: if(sel.p1>=0 || eof!=INFINITY) ! 442: goto Return; ! 443: list[0][0].inst=list[1][0].inst=0; ! 444: Fgetcset(f, (Posn)0); ! 445: p=0; ! 446: goto doloop; ! 447: default: ! 448: goto Return; ! 449: } ! 450: }else if(((wrapped && p>=startp) || sel.p1>0) && nnl==0) ! 451: break; ! 452: /* fast check for first char */ ! 453: if(startchar && nnl==0 && c!=startchar) ! 454: continue; ! 455: tl=list[flag]; ! 456: nl=list[flag^=1]; ! 457: nl->inst=0; ! 458: ntl=nnl; ! 459: nnl=0; ! 460: if(sel.p1<0 && (!wrapped || p<startp || startp==eof)){ ! 461: /* Add first instruction to this list */ ! 462: if(++ntl >= NLIST) ! 463: Overflow: ! 464: error(Eoverflow); ! 465: addinst(tl, startinst, p); ! 466: } ! 467: /* Execute machine until this list is empty */ ! 468: for(tlp=tl; inst=tlp->inst; tlp++){ /* assignment = */ ! 469: Switchstmt: ! 470: switch(inst->type){ ! 471: default: /* regular character */ ! 472: if(inst->type==c){ ! 473: Addinst: ! 474: if(++nnl >= NLIST) ! 475: goto Overflow; ! 476: addinst(nl, inst->next, tlp->startp); ! 477: } ! 478: break; ! 479: case ANYNL: ! 480: goto Addinst; ! 481: case ANY: ! 482: if(c!='\n') ! 483: goto Addinst; ! 484: break; ! 485: case BOL: ! 486: if(p==0){ ! 487: Step: ! 488: inst=inst->next; ! 489: goto Switchstmt; ! 490: } ! 491: if(f->getci>1){ ! 492: if(f->getcbuf[f->getci-2]=='\n') ! 493: goto Step; ! 494: }else{ ! 495: uchar c; ! 496: if(Fchars(f, &c, p-1, p)==1 && c=='\n') ! 497: goto Step; ! 498: } ! 499: break; ! 500: case EOL: ! 501: if(c=='\n') ! 502: goto Step; ! 503: break; ! 504: case CCLASS: ! 505: if(c>=0 && ((char *)inst->right)[c/8]&(1<<(c&07))) ! 506: goto Addinst; ! 507: break; ! 508: case OR: ! 509: /* evaluate right choice later */ ! 510: if(++ntl >= NLIST) ! 511: goto Overflow; ! 512: addinst(tlp, inst->right, tlp->startp); ! 513: /* efficiency: advance and re-evaluate */ ! 514: inst=inst->left; ! 515: goto Switchstmt; ! 516: case END: /* Match! */ ! 517: newmatch(tlp->startp, p); ! 518: break; ! 519: } ! 520: } ! 521: } ! 522: Return: ! 523: return sel.p1>=0; ! 524: } ! 525: newmatch(p1, p2) ! 526: Posn p1, p2; ! 527: { ! 528: if(sel.p1<0 || p1<sel.p1 || (p1==sel.p1 && p2>sel.p2)){ ! 529: sel.p1=p1; ! 530: sel.p2=p2; ! 531: } ! 532: } ! 533: bexecute(f, startp) ! 534: register File *f; ! 535: Posn startp; ! 536: { ! 537: register flag=0; ! 538: register Inst *inst; ! 539: register Ilist *tlp; ! 540: register Posn p=startp; ! 541: int nnl=0, ntl; ! 542: register c; ! 543: int wrapped=0; ! 544: register startchar=bstartinst->type<OPERATOR? bstartinst->type : 0; ! 545: ! 546: list[0][0].inst=list[1][0].inst=0; ! 547: sel.p1= -1; ! 548: Fgetcset(f, startp); ! 549: /* Execute machine once for each character, including terminal NUL */ ! 550: for(;;--p){ ! 551: doloop: ! 552: if((c=Fbgetc(f))==-1){ ! 553: switch(wrapped++){ ! 554: case 0: /* let loop run one more click */ ! 555: case 2: ! 556: break; ! 557: case 1: /* expired; wrap to end */ ! 558: if(sel.p1>=0) ! 559: case 3: ! 560: goto Return; ! 561: list[0][0].inst=list[1][0].inst=0; ! 562: Fgetcset(f, f->nbytes); ! 563: p=f->nbytes; ! 564: goto doloop; ! 565: default: ! 566: goto Return; ! 567: } ! 568: }else if(((wrapped && p<=startp) || sel.p1>0) && nnl==0) ! 569: break; ! 570: /* fast check for first char */ ! 571: if(startchar && nnl==0 && c!=startchar) ! 572: continue; ! 573: tl=list[flag]; ! 574: nl=list[flag^=1]; ! 575: nl->inst=0; ! 576: ntl=nnl; ! 577: nnl=0; ! 578: if(sel.p1<0 && (!wrapped || p>startp)){ ! 579: /* Add first instruction to this list */ ! 580: if(++ntl >= NLIST) ! 581: Overflow: ! 582: error(Eoverflow); ! 583: /* the minus is so the optimizations in addinst work */ ! 584: addinst(tl, bstartinst, -p); ! 585: } ! 586: /* Execute machine until this list is empty */ ! 587: for(tlp=tl; inst=tlp->inst; tlp++){ /* assignment = */ ! 588: Switchstmt: ! 589: switch(inst->type){ ! 590: default: /* regular character */ ! 591: if(inst->type==c){ ! 592: Addinst: ! 593: if(++nnl >= NLIST) ! 594: goto Overflow; ! 595: addinst(nl, inst->next, tlp->startp); ! 596: } ! 597: break; ! 598: case ANYNL: ! 599: goto Addinst; ! 600: case ANY: ! 601: if(c!='\n') ! 602: goto Addinst; ! 603: break; ! 604: case BOL: ! 605: if(c=='\n'){ ! 606: Step: ! 607: inst=inst->next; ! 608: goto Switchstmt; ! 609: } ! 610: break; ! 611: case EOL: ! 612: if(f->getci<f->ngetc-1){ ! 613: if(f->getcbuf[f->getci+1]=='\n') ! 614: goto Step; ! 615: }else if(p<f->nbytes-1){ ! 616: uchar c; ! 617: if(Fchars(f, &c, p+1, p+2)==1 && c=='\n') ! 618: goto Step; ! 619: } ! 620: break; ! 621: case CCLASS: ! 622: if(((char *)inst->right)[c/8]&(1<<(c&07))) ! 623: goto Addinst; ! 624: break; ! 625: case OR: ! 626: /* evaluate right choice later */ ! 627: if(++ntl >= NLIST) ! 628: goto Overflow; ! 629: addinst(tlp, inst->right, tlp->startp); ! 630: /* efficiency: advance and re-evaluate */ ! 631: inst=inst->left; ! 632: goto Switchstmt; ! 633: case END: /* Match! */ ! 634: bnewmatch(-tlp->startp, p); /* minus sign */ ! 635: break; ! 636: } ! 637: } ! 638: } ! 639: Return: ! 640: return sel.p1>=0; ! 641: } ! 642: bnewmatch(p2, p1) /* note the reversal; p1<=p2 */ ! 643: Posn p1, p2; ! 644: { ! 645: if(sel.p1<0 || p2>sel.p2 || (p2==sel.p2 && p1<sel.p1)){ ! 646: sel.p1=p1; ! 647: sel.p2=p2; ! 648: } ! 649: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.