|
|
1.1 ! root 1: #ifdef DEBUG ! 2: #include <stdio.h> ! 3: #endif ! 4: #include <libc.h> ! 5: #include "regprog.h" ! 6: ! 7: /* ! 8: * Parser Information ! 9: */ ! 10: typedef struct Node{ ! 11: Inst *first; ! 12: Inst *last; ! 13: }Node; ! 14: #define NSTACK 20 ! 15: static Node andstack[NSTACK]; ! 16: static Node *andp; ! 17: static int atorstack[NSTACK]; ! 18: static int *atorp; ! 19: static int cursubid; /* id of current subexpression */ ! 20: static int subidstack[NSTACK]; /* parallel to atorstack */ ! 21: static int *subidp; ! 22: static int lastwasand; /* Last token was operand */ ! 23: static int nbra; ! 24: static char *exprp; /* pointer to next character in source expression */ ! 25: static int nclass; ! 26: static Class *classp; ! 27: static Inst *freep; ! 28: static int errors; ! 29: ! 30: /* predeclared crap */ ! 31: static void operator(); ! 32: static void pushand(); ! 33: static void pushator(); ! 34: static void evaluntil(); ! 35: static void bldcclass(); ! 36: ! 37: static void ! 38: rcerror(s) ! 39: char *s; ! 40: { ! 41: errors++; ! 42: regerror(s); ! 43: } ! 44: ! 45: static Inst * ! 46: newinst(t) ! 47: int t; ! 48: { ! 49: freep->type=t; ! 50: freep->left=0; ! 51: freep->right=0; ! 52: return freep++; ! 53: } ! 54: ! 55: static void ! 56: operand(t) ! 57: int t; ! 58: { ! 59: register Inst *i; ! 60: if(lastwasand) ! 61: operator(CAT); /* catenate is implicit */ ! 62: i=newinst(t); ! 63: if(t==CCLASS) /* ugh */ ! 64: i->right=(Inst *)&(classp[nclass-1]); /* UGH! */ ! 65: pushand(i, i); ! 66: lastwasand=TRUE; ! 67: } ! 68: ! 69: static void ! 70: operator(t) ! 71: int t; ! 72: { ! 73: if(t==RBRA && --nbra<0) ! 74: rcerror("unmatched right paren"); ! 75: if(t==LBRA) { ! 76: if (++cursubid >= NSUBEXP) ! 77: rcerror ("too many subexpressions"); ! 78: nbra++; ! 79: if (lastwasand) ! 80: operator(CAT); ! 81: } else ! 82: evaluntil(t); ! 83: if(t!=RBRA) ! 84: pushator(t); ! 85: lastwasand=FALSE; ! 86: if(t==STAR || t==QUEST || t==PLUS || t==RBRA) ! 87: lastwasand=TRUE; /* these look like operands */ ! 88: } ! 89: ! 90: static void ! 91: regerr2(s, c) ! 92: char *s; ! 93: { ! 94: char buf[100]; ! 95: char *cp = buf; ! 96: while(*s) ! 97: *cp++ = *s++; ! 98: *cp++ = c; ! 99: *cp = '\0'; ! 100: rcerror(buf); ! 101: } ! 102: ! 103: static void ! 104: cant(s) ! 105: char *s; ! 106: { ! 107: char buf[100]; ! 108: strcpy(buf, "can't happen: "); ! 109: strcat(buf, s); ! 110: rcerror(buf); ! 111: } ! 112: ! 113: static void ! 114: pushand(f, l) ! 115: Inst *f, *l; ! 116: { ! 117: if(andp >= &andstack[NSTACK]) ! 118: cant("operand stack overflow"); ! 119: andp->first=f; ! 120: andp->last=l; ! 121: andp++; ! 122: } ! 123: ! 124: static void ! 125: pushator(t) ! 126: int t; ! 127: { ! 128: if(atorp >= &atorstack[NSTACK]) ! 129: cant("operator stack overflow"); ! 130: *atorp++=t; ! 131: *subidp++=cursubid; ! 132: } ! 133: ! 134: static Node * ! 135: popand(op) ! 136: { ! 137: register Inst *inst; ! 138: ! 139: if(andp <= &andstack[0]) { ! 140: regerr2("missing operand for ", op); ! 141: inst=newinst(NOP); ! 142: pushand(inst,inst); ! 143: } ! 144: return --andp; ! 145: } ! 146: ! 147: static int ! 148: popator() ! 149: { ! 150: if(atorp <= &atorstack[0]) ! 151: cant("operator stack underflow"); ! 152: --subidp; ! 153: return *--atorp; ! 154: } ! 155: ! 156: static void ! 157: evaluntil(pri) ! 158: register pri; ! 159: { ! 160: register Node *op1, *op2; ! 161: register Inst *inst1, *inst2; ! 162: ! 163: while(pri==RBRA || atorp[-1]>=pri){ ! 164: switch(popator()){ ! 165: default: ! 166: rcerror("unknown operator in evaluntil"); ! 167: break; ! 168: case LBRA: /* must have been RBRA */ ! 169: op1=popand('('); ! 170: inst2=newinst(RBRA); ! 171: inst2->subid = *subidp; ! 172: op1->last->next = inst2; ! 173: inst1=newinst(LBRA); ! 174: inst1->subid = *subidp; ! 175: inst1->next=op1->first; ! 176: pushand(inst1, inst2); ! 177: return; ! 178: case OR: ! 179: op2=popand('|'); ! 180: op1=popand('|'); ! 181: inst2=newinst(NOP); ! 182: op2->last->next=inst2; ! 183: op1->last->next=inst2; ! 184: inst1=newinst(OR); ! 185: inst1->right=op1->first; ! 186: inst1->left=op2->first; ! 187: pushand(inst1, inst2); ! 188: break; ! 189: case CAT: ! 190: op2=popand(0); ! 191: op1=popand(0); ! 192: op1->last->next=op2->first; ! 193: pushand(op1->first, op2->last); ! 194: break; ! 195: case STAR: ! 196: op2=popand('*'); ! 197: inst1=newinst(OR); ! 198: op2->last->next=inst1; ! 199: inst1->right=op2->first; ! 200: pushand(inst1, inst1); ! 201: break; ! 202: case PLUS: ! 203: op2=popand('+'); ! 204: inst1=newinst(OR); ! 205: op2->last->next=inst1; ! 206: inst1->right=op2->first; ! 207: pushand(op2->first, inst1); ! 208: break; ! 209: case QUEST: ! 210: op2=popand('?'); ! 211: inst1=newinst(OR); ! 212: inst2=newinst(NOP); ! 213: inst1->left=inst2; ! 214: inst1->right=op2->first; ! 215: op2->last->next=inst2; ! 216: pushand(inst1, inst2); ! 217: break; ! 218: } ! 219: } ! 220: } ! 221: ! 222: static Prog * ! 223: optimize(pp) ! 224: Prog *pp; ! 225: { ! 226: register Inst *inst, *target; ! 227: int size; ! 228: Prog *npp; ! 229: ! 230: /* ! 231: * get rid of NOOP chains ! 232: */ ! 233: for(inst=pp->firstinst; inst->type!=END; inst++){ ! 234: target=inst->next; ! 235: while(target->type == NOP) ! 236: target=target->next; ! 237: inst->next=target; ! 238: } ! 239: ! 240: /* ! 241: * The original allocation is for an area larger than ! 242: * necessary. Reallocate to the actual space used ! 243: * and then relocate the code. ! 244: */ ! 245: size = sizeof(Prog) + (freep - pp->firstinst)*sizeof(Inst); ! 246: npp = (Prog *)realloc((char *)pp, size); ! 247: if(npp==NULL || npp==pp) ! 248: return(pp); ! 249: freep = &npp->firstinst[freep - pp->firstinst]; ! 250: npp->startinst = &npp->firstinst[pp->startinst - pp->firstinst]; ! 251: for(inst=npp->firstinst; inst<freep; inst++){ ! 252: switch(inst->type){ ! 253: case OR: ! 254: case STAR: ! 255: case PLUS: ! 256: case QUEST: ! 257: inst->right = &npp->firstinst[inst->right - pp->firstinst]; ! 258: break; ! 259: case CCLASS: ! 260: inst->right = (Inst *) &npp->class[(Class*)inst->right - pp->class]; ! 261: break; ! 262: } ! 263: inst->left = &npp->firstinst[inst->left - pp->firstinst]; ! 264: } ! 265: return(npp); ! 266: } ! 267: ! 268: #ifdef DEBUG ! 269: static char * ! 270: dumptype(t){ ! 271: static char ordinary[4] = "'.'"; ! 272: ! 273: switch(t){ ! 274: case START: return "START"; ! 275: case RBRA: return "RBRA"; ! 276: case LBRA: return "LBRA"; ! 277: case OR: return "OR"; ! 278: case CAT: return "CAT"; ! 279: case STAR: return "STAR"; ! 280: case PLUS: return "PLUS"; ! 281: case QUEST: return "QUEST"; ! 282: case ANY: return "ANY"; ! 283: case NOP: return "NOP"; ! 284: case BOL: return "BOL"; ! 285: case EOL: return "EOL"; ! 286: case CCLASS: return "CCLASS"; ! 287: case END: return "END"; ! 288: default: ! 289: ordinary[1] = t; ! 290: return ordinary; ! 291: } ! 292: } ! 293: ! 294: static void ! 295: dumpstack(){ ! 296: Node *stk; ! 297: int *ip; ! 298: ! 299: printf("operators\n"); ! 300: for(ip=atorstack; ip<atorp; ip++) ! 301: printf("0%o\n", *ip); ! 302: printf("operands\n"); ! 303: for(stk=andstack; stk<andp; stk++){ ! 304: printf("%s\t", dumptype(stk->first->type)); ! 305: printf("%s\n", dumptype(stk->last->type)); ! 306: } ! 307: } ! 308: ! 309: static void ! 310: putC(c) ! 311: { ! 312: if(c < ' ' || '~' < c){ ! 313: switch(c){ ! 314: default: ! 315: putchar('^'); ! 316: putchar((c + '@') & 0x7f); ! 317: return; ! 318: case '\b': ! 319: c = 'b'; ! 320: break; ! 321: case '\t': ! 322: c = 't'; ! 323: break; ! 324: case '\n': ! 325: c = 'n'; ! 326: break; ! 327: case '\v': ! 328: c = 'v'; ! 329: break; ! 330: case '\f': ! 331: c = 'f'; ! 332: break; ! 333: case '\r': ! 334: c = 'r'; ! 335: break; ! 336: } ! 337: putchar('\\'); ! 338: } ! 339: putchar(c); ! 340: } ! 341: ! 342: static void ! 343: putCHAR(from, to) ! 344: { ! 345: if(from != 0) ! 346: if(from == to) ! 347: putC(from); ! 348: else{ ! 349: putC(from); ! 350: putchar('-'); ! 351: putC(to); ! 352: } ! 353: } ! 354: ! 355: static void ! 356: dump(pp) ! 357: Prog *pp; ! 358: { ! 359: int c, from; ! 360: Inst *l; ! 361: Class *classp; ! 362: ! 363: l=pp->firstinst; ! 364: do{ ! 365: printf("%d:\t%s\t%d\t", l-pp->firstinst, dumptype(l->type), ! 366: l->left-pp->firstinst); ! 367: if(l->type == CCLASS){ ! 368: classp = (Class*) l->right; ! 369: putchar('['); ! 370: if(classp->map[0] & 02){ /* ^A? */ ! 371: putchar('^'); /* assume negation */ ! 372: for(from=0, c=1; c < 128; ++c){ ! 373: if(classp->map[c/8] & (1<<(c&07))){ ! 374: putCHAR(from, c-1); ! 375: from = 0; ! 376: } else { ! 377: if(from == 0) ! 378: from = c; ! 379: } ! 380: } ! 381: putCHAR(from, c-1); ! 382: } else { ! 383: for(from=0, c=1; c < 128; ++c){ ! 384: if(classp->map[c/8] & (1<<(c&07))){ ! 385: if(from == 0) ! 386: from = c; ! 387: } else { ! 388: putCHAR(from, c-1); ! 389: from = 0; ! 390: } ! 391: } ! 392: putCHAR(from, c-1); ! 393: } ! 394: putchar(']'); ! 395: } else ! 396: printf("%d", l->right-pp->firstinst); ! 397: putchar('\n'); ! 398: }while(l++->type); ! 399: } ! 400: #endif ! 401: ! 402: static void ! 403: startlex(s) ! 404: char *s; ! 405: { ! 406: exprp=s; ! 407: nclass=0; ! 408: nbra=0; ! 409: } ! 410: ! 411: static Class * ! 412: newclass(){ ! 413: register Class *p; ! 414: register n; ! 415: ! 416: if(nclass >= NCLASS) ! 417: regerr2("too many character classes; limit", NCLASS+'0'); ! 418: p = &(classp[nclass++]); ! 419: for(n=0; n<16; n++) ! 420: p->map[n]=0; ! 421: return p; ! 422: } ! 423: ! 424: static int ! 425: lex(){ ! 426: register c= *exprp++; ! 427: ! 428: switch(c){ ! 429: case '\\': ! 430: if(*exprp) ! 431: c= *exprp++; ! 432: break; ! 433: case 0: ! 434: c=END; ! 435: --exprp; /* In case we come here again */ ! 436: break; ! 437: case '*': ! 438: c=STAR; ! 439: break; ! 440: case '?': ! 441: c=QUEST; ! 442: break; ! 443: case '+': ! 444: c=PLUS; ! 445: break; ! 446: case '|': ! 447: c=OR; ! 448: break; ! 449: case '.': ! 450: c=ANY; ! 451: break; ! 452: case '(': ! 453: c=LBRA; ! 454: break; ! 455: case ')': ! 456: c=RBRA; ! 457: break; ! 458: case '^': ! 459: c=BOL; ! 460: break; ! 461: case '$': ! 462: c=EOL; ! 463: break; ! 464: case '[': ! 465: c=CCLASS; ! 466: bldcclass(); ! 467: break; ! 468: } ! 469: return c; ! 470: } ! 471: ! 472: static int ! 473: nextc(){ ! 474: if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0)) ! 475: rcerror("malformed '[]'"); ! 476: if(exprp[0]=='\\'){ ! 477: exprp++; ! 478: return *exprp++|0200; ! 479: } ! 480: return *exprp++; ! 481: } ! 482: ! 483: static void ! 484: bldcclass(){ ! 485: register c1, c2; ! 486: register Class *classp; ! 487: register negate=FALSE; ! 488: ! 489: classp=newclass(); ! 490: /* we have already seen the '[' */ ! 491: if(*exprp=='^'){ ! 492: negate=TRUE; ! 493: exprp++; ! 494: } ! 495: while((c1=c2=nextc()) != ']'){ ! 496: if(*exprp=='-'){ ! 497: exprp++; /* eat '-' */ ! 498: if((c2=nextc()) == ']') ! 499: rcerror("malformed '[]'"); ! 500: } ! 501: for((c1&=0177), (c2&=0177); c1<=c2; c1++) ! 502: classp->map[c1/8] |= 1<<(c1&07); ! 503: } ! 504: if(negate) ! 505: for(c1=0; c1<16; c1++) ! 506: classp->map[c1]^=0377; ! 507: classp->map[0] &= 0376; /* exclude NUL */ ! 508: } ! 509: ! 510: extern regexp * ! 511: regcomp(s) ! 512: char *s; ! 513: { ! 514: register token; ! 515: Prog *pp; ! 516: ! 517: /* get memory for the program */ ! 518: pp = (Prog *)malloc(sizeof(Prog) + 3*sizeof(Inst)*strlen(s)); ! 519: if (pp == NULL) { ! 520: rcerror("out of memory"); ! 521: return NULL; ! 522: } ! 523: freep = pp->firstinst; ! 524: classp = pp->class; ! 525: errors = 0; ! 526: ! 527: /* go compile the sucker */ ! 528: startlex(s); ! 529: atorp=atorstack; ! 530: andp=andstack; ! 531: subidp=subidstack; ! 532: lastwasand=FALSE; ! 533: cursubid=0; ! 534: ! 535: /* Start with a low priority operator to prime parser */ ! 536: pushator(START-1); ! 537: while((token=lex()) != END){ ! 538: if((token&0300) == OPERATOR) ! 539: operator(token); ! 540: else ! 541: operand(token); ! 542: } ! 543: ! 544: /* Close with a low priority operator */ ! 545: evaluntil(START); ! 546: ! 547: /* Force END */ ! 548: operand(END); ! 549: evaluntil(START); ! 550: #ifdef DEBUG ! 551: dumpstack(); ! 552: #endif ! 553: if(nbra) ! 554: rcerror("unmatched left paren"); ! 555: --andp; /* points to first and only operand */ ! 556: pp->startinst=andp->first; ! 557: #ifdef DEBUG ! 558: dump(pp); ! 559: #endif ! 560: pp = optimize(pp); ! 561: #ifdef DEBUG ! 562: printf("start: %d\n", pp->startinst-pp->firstinst); ! 563: dump(pp); ! 564: fflush(stdout); ! 565: #endif ! 566: if (errors) { ! 567: free((char *)pp); ! 568: pp = NULL; ! 569: } ! 570: return (regexp *)pp; ! 571: } ! 572: ! 573: #ifdef DEBUG ! 574: #include <setjmp.h> ! 575: ! 576: jmp_buf jmpbuf; ! 577: ! 578: main( argc, argv ) ! 579: char **argv; ! 580: { ! 581: regexp *prog = NULL; ! 582: regsubexp match[NSUBEXP]; ! 583: char line[256]; ! 584: int i; ! 585: ! 586: while( TRUE ) { ! 587: setjmp( jmpbuf ); ! 588: ! 589: if(prog) ! 590: free( prog ); ! 591: ! 592: do { ! 593: fputs( "Enter re: ", stdout ); ! 594: if ( gets(line) == NULL ) ! 595: exit(0); ! 596: } while( (prog = regcomp(line)) == NULL ); ! 597: ! 598: fputs( "Enter string: ", stdout ); ! 599: while( gets(line) != NULL ) { ! 600: if ( !regexec(prog,line,match,NSUBEXP) ) ! 601: puts( "*** NO MATCH ***" ); ! 602: else ! 603: for( i = 0; i < NSUBEXP; ++i ) ! 604: if ( match[i].sp ) ! 605: printf( "match[%d] = \"%.*s\"\n", i, match[i].ep - match[i].sp, match[i].sp ); ! 606: } ! 607: } ! 608: } ! 609: ! 610: regerror( s ) ! 611: char *s; ! 612: { ! 613: puts( s ); ! 614: longjmp( jmpbuf ); ! 615: } ! 616: ! 617: #endif
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.