|
|
1.1 ! root 1: #include "re.h" ! 2: #include "lre.h" ! 3: #include "hdr.h" ! 4: ! 5: int re_debug = 0; ! 6: ! 7: #define RESET(r, ps) COPY(r, ps, begin) ! 8: #define SET(r, ps, n) {if(r->ps.base[n] == 0) r->ps.count++, r->ps.base[n] = r->ps.last, r->ps.last = n; } ! 9: #define GET(ps, n) for(n = ps.last; n > 0; n = ps.base[n]) ! 10: #define COPY(r, to, from) memmove((char *)r->to.base, (char *)r->from.base, r->maxid*sizeof(int)), r->to.count = r->from.count, r->to.last = r->from.last ! 11: ! 12: static State *addstate(re_re *r, Positionset *ps); ! 13: static int first(re_re *, Expr *); ! 14: static int match(re_re *, Expr *, int); ! 15: static State *nextstate(re_re *, State *, int); ! 16: #ifdef DEBUG ! 17: static void ppr(Positionset *, char *); ! 18: #endif ! 19: ! 20: typedef enum { NORMAL, ACCEPT, ACCEPT_EOP, FAIL } Out; ! 21: ! 22: static void ! 23: eptr(register re_re *r, register Expr *e) ! 24: { ! 25: if((e->id < 0) || (e->id >= r->maxid)){ ! 26: EPR "eptr abort: r=%ld[maxid=%d] e=%ld[id=%d]\n", r, r->maxid, e, e->id); ! 27: abort(); ! 28: } ! 29: r->ptr[e->id] = e; ! 30: if((e->type != Charclass) && (e->type != Compcharclass)){ ! 31: if(e->l) ! 32: eptr(r, e->l); ! 33: if(e->r) ! 34: eptr(r, e->r); ! 35: } ! 36: } ! 37: ! 38: re_re * ! 39: egprep(enum Parsetype parser, unsigned char *b, unsigned char *e, unsigned char *map, int dotstar) ! 40: { ! 41: register re_re *r; ! 42: Expr *ecomp; ! 43: int i; ! 44: ! 45: r = (re_re *)egmalloc(sizeof (re_re), "allocating re_re"); ! 46: if(r == 0) ! 47: return 0; ! 48: memset((char *)r, 0, sizeof (re_re)); ! 49: eg_lexinit((char *)b, (char *)e); ! 50: if(map) ! 51: memmove(r->mymap, (char *)map, 256); ! 52: else ! 53: for(i = 0; i < 256; i++) ! 54: r->mymap[i] = i; ! 55: eg_lex(); ! 56: if(dotstar){ ! 57: ecomp = eg_newexpr(Star, 0, eg_newexpr(Dot, '.', (Expr *)0, (Expr *)0), (Expr *)0); ! 58: ecomp = eg_newexpr(Cat, 0, ecomp, eg_eall(parser, r->mymap)); ! 59: } else ! 60: ecomp = eg_eall(parser, r->mymap); ! 61: r->root = eg_newexpr(EOP, '#', ecomp, (Expr *)0); ! 62: egpost(r); ! 63: r->carat = 0; ! 64: if(r->backref || r->parens) ! 65: egbr(r); ! 66: else ! 67: eginit(r, dotstar); ! 68: return(r); ! 69: } ! 70: ! 71: void ! 72: eginit(register re_re *r, int dotstar) ! 73: { ! 74: int n; ! 75: ! 76: #ifdef DEBUG ! 77: if(TRACE(6)) ! 78: PR "eginit(r=%d, dotstar=%d)\n", r, dotstar); ! 79: #endif ! 80: #ifdef DEBUG ! 81: if(TRACE(10)){ ! 82: char buf[EPRINTSIZE]; ! 83: eg_epr(r->root, buf, 0); ! 84: PR "eginit: r=%d expr='%s'\n", r, buf); ! 85: } ! 86: #endif ! 87: r->ptr = (Expr **)egmalloc(r->maxid*sizeof(Expr *), "ptr"); ! 88: if (!r->ptr) ! 89: return; ! 90: eptr(r, r->root); ! 91: r->firstpos.base = (int *)egmalloc(n = r->maxid*sizeof(int), "first base"); ! 92: if (!r->firstpos.base){ ! 93: free((char*)r->ptr); ! 94: return; ! 95: } ! 96: r->begin.base = (int *)egmalloc(n, "begin base"); ! 97: if (!r->begin.base){ ! 98: free((char*)r->firstpos.base); ! 99: free((char*)r->ptr); ! 100: return; ! 101: } ! 102: r->tmp.base = (int *)egmalloc(n, "tmp base"); ! 103: if (!r->tmp.base){ ! 104: free((char*)r->begin.base); ! 105: free((char*)r->firstpos.base); ! 106: free((char*)r->ptr); ! 107: return; ! 108: } ! 109: memset((char *)r->begin.base, 0, n); ! 110: r->begin.count = 0; ! 111: r->begin.last = -1; ! 112: r->carat = dotstar; ! 113: RESET(r, firstpos); ! 114: if(first(r, r->root->l) == 0){ ! 115: /* ! 116: nullable, so include me ! 117: */ ! 118: SET(r, firstpos, r->root->id) ! 119: } ! 120: if(dotstar) ! 121: COPY(r, begin, firstpos); ! 122: eg_stateinit(r); ! 123: eg_clrstates(r); ! 124: eg_posinit(r); ! 125: (void)addstate(r, &r->firstpos); ! 126: eg_savestate(r, dotstar? nextstate(r, r->states, RE_CARAT):r->states); ! 127: eg_posset(r); ! 128: } ! 129: ! 130: unsigned char * ! 131: eg_quickmatch(register re_re *r, register unsigned char *b, register unsigned char *e, int endpts) ! 132: { ! 133: register State *s, *t; ! 134: ! 135: #ifdef DEBUG ! 136: if(TRACE(10)){ ! 137: char buf[EPRINTSIZE]; ! 138: eg_epr(r->root, buf, 0); ! 139: PR "qm: r=%d expr='%s' endpts=%d\n", r, buf, endpts); ! 140: } ! 141: #endif ! 142: s = eg_startstate(r); ! 143: if(endpts&RE_BEG){ ! 144: if(!r->carat){ ! 145: if(t = s->tab[RE_CARAT]) ! 146: s = t; ! 147: else ! 148: s = nextstate(r, s, RE_CARAT); ! 149: if(s->out == FAIL) ! 150: s = eg_startstate(r); ! 151: } ! 152: if(s->out){ ! 153: #ifdef DEBUG ! 154: if(TRACE(6)) ! 155: PR "match at ^: '%s' out=%d\n", b, s->out); ! 156: #endif ! 157: return((s->out == FAIL)? 0:b); ! 158: } ! 159: } ! 160: while(b < e){ ! 161: #ifdef DEBUG ! 162: if(TRACE(4)) ! 163: PR "state %d@%d[%d pos]: char '%c'\n", s-r->states, (int)s, s->npos, *b); ! 164: #endif ! 165: if(t = s->tab[*(unsigned char *)b]) ! 166: s = t; ! 167: else ! 168: s = nextstate(r, s, *(unsigned char *)b); ! 169: if(s->out){ ! 170: #ifdef DEBUG ! 171: if(TRACE(6)) ! 172: PR "match at input '%s' out=%d\n", b, s->out); ! 173: #endif ! 174: return((s->out == FAIL)? 0:b); ! 175: } ! 176: b++; ! 177: } ! 178: if(endpts&RE_END){ ! 179: if(t = s->tab[RE_DOLLAR]) ! 180: s = t; ! 181: else ! 182: s = nextstate(r, s, RE_DOLLAR); ! 183: } ! 184: if(s->out){ ! 185: #ifdef DEBUG ! 186: if(TRACE(6)) ! 187: PR "match at $ '%s' out=%d\n", b, s->out); ! 188: #endif ! 189: return((s->out == FAIL)? 0:b); ! 190: } ! 191: #ifdef DEBUG ! 192: if(TRACE(3)){ ! 193: char buf[EPRINTSIZE]; ! 194: ! 195: eg_epr(r->root, buf, 1); ! 196: PR "pat = %s\n", buf); ! 197: } ! 198: #endif ! 199: return(0); ! 200: } ! 201: ! 202: unsigned char * ! 203: eg_lquickmatch(register re_re *r, register unsigned char *b, register unsigned char *e, int endpts) ! 204: { ! 205: register State *s, *t; ! 206: int outedness = 0; ! 207: ! 208: #ifdef DEBUG ! 209: if(TRACE(10)){ ! 210: char buf[EPRINTSIZE]; ! 211: eg_epr(r->root, buf, 0); ! 212: PR "lqm: r=%d carat=%d expr='%s' endpts=%d\n", r, r->carat, buf, endpts); ! 213: } ! 214: #endif ! 215: s = eg_startstate(r); ! 216: if(endpts&RE_BEG){ ! 217: if(!r->carat){ ! 218: if(t = s->tab[RE_CARAT]) ! 219: s = t; ! 220: else ! 221: s = nextstate(r, s, RE_CARAT); ! 222: if(s->out == FAIL) ! 223: s = eg_startstate(r); ! 224: } ! 225: if(s->out){ ! 226: #ifdef DEBUG ! 227: if(TRACE(6)) ! 228: PR "match at ^: '%s' out=%d\n", b, s->out); ! 229: #endif ! 230: if(s->out == ACCEPT_EOP) ! 231: return(b); ! 232: else if(s->out == FAIL) ! 233: return(0); ! 234: outedness = 1; ! 235: } ! 236: } ! 237: /* ! 238: look for first match ! 239: */ ! 240: if(outedness == 0) ! 241: for(; b < e; b++){ ! 242: #ifdef DEBUG ! 243: if(TRACE(4)) ! 244: PR "state %d@%d[%d pos]: char '%c'\n", s-r->states, (int)s, s->npos, *b); ! 245: #endif ! 246: if(t = s->tab[*(unsigned char *)b]) ! 247: s = t; ! 248: else ! 249: s = nextstate(r, s, *(unsigned char *)b); ! 250: if(s->out){ ! 251: #ifdef DEBUG ! 252: if(TRACE(4)) ! 253: PR " out=%d, state %d@%d\n", s->out, ! 254: s-r->states, (int)s); ! 255: #endif ! 256: b++; ! 257: if((s->out == ACCEPT_EOP) || (s->out == FAIL)) ! 258: goto finish; ! 259: outedness = 1; ! 260: break; ! 261: } ! 262: } ! 263: /* ! 264: in the following loop, we have seen a match already ! 265: because only way outedness is zero is if b>=e ! 266: */ ! 267: for(; b < e; b++){ ! 268: #ifdef DEBUG ! 269: if(TRACE(4)) ! 270: PR "statex %d@%d[%d pos]: char '%c'\n", s-r->states, (int)s, s->npos, *b); ! 271: #endif ! 272: if(t = s->tab[*(unsigned char *)b]) ! 273: s = t; ! 274: else ! 275: s = nextstate(r, s, *(unsigned char *)b); ! 276: if(s->out == ACCEPT) ! 277: continue; ! 278: if((s->out == NORMAL) || (s->out == FAIL)){ ! 279: #ifdef DEBUG ! 280: if(TRACE(6)) ! 281: PR "unmatch at input '%s' out=%d\n", b, s->out); ! 282: #endif ! 283: return(b); ! 284: } ! 285: } ! 286: if(endpts&RE_END){ ! 287: if(t = s->tab[RE_DOLLAR]) ! 288: s = t; ! 289: else ! 290: s = nextstate(r, s, RE_DOLLAR); ! 291: } ! 292: finish: ! 293: if((s->out && (s->out != FAIL)) || outedness){ ! 294: #ifdef DEBUG ! 295: if(TRACE(6)) ! 296: PR "match at $ '%s' out=%d\n", b, s->out); ! 297: #endif ! 298: return(b); ! 299: } ! 300: #ifdef DEBUG ! 301: if(TRACE(3)){ ! 302: char buf[EPRINTSIZE]; ! 303: ! 304: eg_epr(r->root, buf, 1); ! 305: PR "lqm('%s') failed; out=%d, s->out=%d \n", buf, outedness, s->out); ! 306: } ! 307: #endif ! 308: return(0); ! 309: } ! 310: ! 311: static ! 312: match(register re_re *r, register Expr *e, int a) ! 313: { ! 314: switch(e->type) ! 315: { ! 316: case Dollar: return(a == RE_DOLLAR); ! 317: case Carat: return(a == RE_CARAT); ! 318: case Dot: return(a < 256); ! 319: case Literal: return(r->mymap[a] == r->mymap[e->lit]); ! 320: case Charclass: if(a >= 256) return(0); ! 321: return(memchr((char *)e->r, r->mymap[a], (int)e->l) != 0); ! 322: case Compcharclass: ! 323: if(a >= 256) return(0); ! 324: return(memchr((char *)e->r, r->mymap[a], (int)e->l) == 0); ! 325: } ! 326: return(0); ! 327: } ! 328: ! 329: /* ! 330: generates the followset for a node in firstpos ! 331: */ ! 332: ! 333: static void ! 334: follow(register re_re *r, register Expr *e) ! 335: { ! 336: register Expr *p; ! 337: ! 338: if(e->type == EOP) ! 339: return; ! 340: else ! 341: p = e->parent; ! 342: switch(p->type) ! 343: { ! 344: case EOP: ! 345: SET(r, firstpos, p->id) ! 346: break; ! 347: case Plus: ! 348: case Star: ! 349: (void)first(r, e); ! 350: follow(r, p); ! 351: break; ! 352: case Quest: ! 353: case Alternate: ! 354: follow(r, p); ! 355: break; ! 356: case Cat: ! 357: if(e == p->l){ ! 358: if(first(r, p->r) == 0){ ! 359: follow(r, p); ! 360: break; ! 361: } ! 362: } else ! 363: follow(r, p); ! 364: break; ! 365: case Group: ! 366: follow(r, p); ! 367: break; ! 368: } ! 369: } ! 370: ! 371: /* ! 372: first returns 0 if e is nullable and in the process, ! 373: sets up firstpos. ! 374: */ ! 375: ! 376: static ! 377: first(register re_re *r, register Expr *e) ! 378: { ! 379: int k; ! 380: ! 381: switch(e->type) ! 382: { ! 383: case Carat: ! 384: case Dollar: ! 385: case Literal: ! 386: case Dot: ! 387: case Charclass: ! 388: case Compcharclass: ! 389: SET(r, firstpos, e->id) ! 390: return(1); ! 391: case EOP: ! 392: case Star: ! 393: case Quest: ! 394: (void)first(r, e->l); ! 395: return(0); ! 396: case Group: ! 397: case Plus: ! 398: return(first(r, e->l)); ! 399: case Cat: ! 400: return(first(r, e->l) || first(r, e->r)); ! 401: case Alternate: ! 402: k = first(r, e->r); ! 403: return(first(r, e->l) && k); ! 404: default: ! 405: EPR "bad type %d\n", e->type); ! 406: abort(); ! 407: return(0); ! 408: } ! 409: } ! 410: ! 411: static void ! 412: efollow(register re_re *r, register Expr *e) ! 413: { ! 414: register i, *p; ! 415: ! 416: RESET(r, firstpos); ! 417: follow(r, e); ! 418: e->flen = r->firstpos.count; ! 419: e->follow = (int *)malloc((unsigned)(e->flen*sizeof(int))); ! 420: if(e->follow == 0){ ! 421: char buf[100]; ! 422: SPR buf, "efollow malloc fail(%ld)\n", e->flen); ! 423: re_error(buf); ! 424: return; ! 425: } ! 426: p = e->follow; ! 427: GET(r->firstpos, i) ! 428: *p++ = i; ! 429: if(p != e->follow+e->flen){ ! 430: char buf[100]; ! 431: SPR buf, "efollow length error: %d %ld\n", p-e->follow, e->flen); ! 432: re_error(buf); ! 433: return; ! 434: } ! 435: } ! 436: ! 437: static State * ! 438: addstate(register re_re *r, register Positionset *ps) ! 439: { ! 440: register *p, j; ! 441: register State *s; ! 442: ! 443: s = r->states + eg_getstate(r); ! 444: memset((char *)s->tab, 0, sizeof(s->tab)); ! 445: s->pos = eg_posalloc(r, (int)ps->count); ! 446: s->npos = ps->count; ! 447: p = r->posbase+s->pos; ! 448: GET((*ps), j) ! 449: *p++ = j; ! 450: if(s->out = (ps->base[r->root->id] != 0)){ ! 451: if((ps->base[r->root->id] <= 0) && (ps->last == r->root->id)) ! 452: s->out = ACCEPT_EOP; /* marker for only the EOP marker */ ! 453: else ! 454: s->out = ACCEPT; ! 455: } ! 456: if(s->npos == 0) ! 457: s->out = FAIL; ! 458: #ifdef DEBUG ! 459: if(TRACE(3)){ ! 460: char buf[2000]; ! 461: eg_spr(s->npos, s->pos+r->posbase, buf); ! 462: PR "new state[%d]@%d %s,pos=%d out=%d\n", s-r->states, s, buf, s->pos, s->out); ! 463: } ! 464: #endif ! 465: return(s); ! 466: } ! 467: ! 468: static State * ! 469: nextstate(register re_re *r, register State *s, int a) ! 470: { ! 471: register int p, *q, *eq; ! 472: register long i; ! 473: State *news; ! 474: Expr *e; ! 475: ! 476: RESET(r, tmp); ! 477: #ifdef DEBUG ! 478: if(TRACE(5)){ ! 479: char buf[2000]; ! 480: ppr(&r->tmp, buf); ! 481: PR "nextstate: reset: %s\n", buf); ! 482: } ! 483: #endif ! 484: for(i = s->npos, p = s->pos; i > 0; i--){ ! 485: e = r->ptr[r->posbase[p++]]; ! 486: #ifdef DEBUG ! 487: if(TRACE(11)){ ! 488: PR "i=%d e->type=%d\n", i, e->type); ! 489: } ! 490: #endif ! 491: if(e->type == EOP) continue; ! 492: if(match(r, e, a)){ ! 493: #ifdef DEBUG ! 494: if(TRACE(11)){PR "matched %c\n", a);} ! 495: #endif ! 496: if(e->follow == 0) ! 497: efollow(r, e); ! 498: for(q = e->follow, eq = q+e->flen; q < eq; q++){ ! 499: SET(r, tmp, *q) ! 500: } ! 501: } ! 502: } ! 503: #ifdef DEBUG ! 504: if(TRACE(5)){ ! 505: char buf[2000]; ! 506: ppr(&r->tmp, buf); ! 507: PR "nextstate(%d, '%c'): found %s\n", s-r->states, a, buf); ! 508: } ! 509: #endif ! 510: if(news = eg_stateof(r, &r->tmp)){ ! 511: if(a <= RE_DOLLAR) ! 512: s->tab[a] = news; ! 513: } else ! 514: news = addstate(r, &r->tmp); ! 515: #ifdef DEBUG ! 516: if(TRACE(5)){ ! 517: PR "nextstate(%d, '%c'): returning %ld %d out=%d\n", s-r->states, a, ! 518: (long)news, news-r->states, news->out); ! 519: } ! 520: #endif ! 521: return(news); ! 522: } ! 523: ! 524: void * ! 525: egmalloc(int n, char *s) ! 526: { ! 527: char *x; ! 528: char buf[256]; ! 529: ! 530: if((x = malloc(n)) == 0){ ! 531: SPR buf, "malloc fail(%d): %s", n, s); ! 532: re_error(buf); ! 533: return 0; ! 534: } ! 535: return((void *)x); ! 536: } ! 537: ! 538: void * ! 539: egrealloc(char *p, int n, char *s) ! 540: { ! 541: char *x; ! 542: char buf[256]; ! 543: ! 544: if((x = realloc(p, n)) == 0){ ! 545: SPR buf, "realloc fail(%d): %s", n, s); ! 546: re_error(buf); ! 547: return 0; ! 548: } ! 549: return((void *)x); ! 550: } ! 551: ! 552: #ifdef DEBUG ! 553: static void ! 554: ppr(register Positionset *ps, register char *p) ! 555: { ! 556: register n; ! 557: ! 558: if(ps->count < 1){ ! 559: SPR p, "{}"); ! 560: return; ! 561: } ! 562: *p++ = '{'; ! 563: GET((*ps), n){ ! 564: SPR p, "%d[=%d],", n, ps->base[n]); ! 565: p = strchr(p, 0); ! 566: } ! 567: p[-1] = '}'; ! 568: } ! 569: #endif
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.