|
|
1.1 ! root 1: /* ! 2: * regcomp and regexec -- regsub and regerror are elsewhere ! 3: * ! 4: * Copyright (c) 1986 by University of Toronto. ! 5: * Written by Henry Spencer. Not derived from licensed software. ! 6: * ! 7: * Permission is granted to anyone to use this software for any ! 8: * purpose on any computer system, and to redistribute it freely, ! 9: * subject to the following restrictions: ! 10: * ! 11: * 1. The author is not responsible for the consequences of use of ! 12: * this software, no matter how awful, even if they arise ! 13: * from defects in it. ! 14: * ! 15: * 2. The origin of this software must not be misrepresented, either ! 16: * by explicit claim or by omission. ! 17: * ! 18: * 3. Altered versions must be plainly marked as such, and must not ! 19: * be misrepresented as being the original software. ! 20: ********* This version is slightly altered to fit cleanly with Coherent ! 21: ********* libmisc.a. The original is freely available on mwcbbs. ! 22: * ! 23: * Beware that some of this code is subtly aware of the way operator ! 24: * precedence is structured in regular expressions. Serious changes in ! 25: * regular-expression syntax might require a total rethink. ! 26: */ ! 27: #include <stdio.h> ! 28: #include "local_misc.h" ! 29: ! 30: /* ! 31: * The "internal use only" fields in regexp.h are present to pass info from ! 32: * compile to execute that permits the execute phase to run lots faster on ! 33: * simple cases. They are: ! 34: * ! 35: * regstart char that must begin a match; '\0' if none obvious ! 36: * reganch is the match anchored (at beginning-of-line only)? ! 37: * regmust string (pointer into program) that match must include, or NULL ! 38: * regmlen length of regmust string ! 39: * ! 40: * Regstart and reganch permit very fast decisions on suitable starting points ! 41: * for a match, cutting down the work a lot. Regmust permits fast rejection ! 42: * of lines that cannot possibly match. The regmust tests are costly enough ! 43: * that regcomp() supplies a regmust only if the r.e. contains something ! 44: * potentially expensive (at present, the only such thing detected is * or + ! 45: * at the start of the r.e., which can involve a lot of backup). Regmlen is ! 46: * supplied because the test in regexec() needs it and regcomp() is computing ! 47: * it anyway. ! 48: */ ! 49: ! 50: /* ! 51: * Structure for regexp "program". This is essentially a linear encoding ! 52: * of a nondeterministic finite-state machine (aka syntax charts or ! 53: * "railroad normal form" in parsing technology). Each node is an opcode ! 54: * plus a "next" pointer, possibly plus an operand. "Next" pointers of ! 55: * all nodes except BRANCH implement concatenation; a "next" pointer with ! 56: * a BRANCH on both ends of it is connecting two alternatives. (Here we ! 57: * have one of the subtle syntax dependencies: an individual BRANCH (as ! 58: * opposed to a collection of them) is never concatenated with anything ! 59: * because of operator precedence.) The operand of some types of node is ! 60: * a literal string; for others, it is a node leading into a sub-FSM. In ! 61: * particular, the operand of a BRANCH node is the first node of the branch. ! 62: * (NB this is *not* a tree structure: the tail of the branch connects ! 63: * to the thing following the set of BRANCHes.) The opcodes are: ! 64: */ ! 65: ! 66: /* definition number opnd? meaning */ ! 67: #define END 0 /* no End of program. */ ! 68: #define BOL 1 /* no Match "" at beginning of line. */ ! 69: #define EOL 2 /* no Match "" at end of line. */ ! 70: #define ANY 3 /* no Match any one character. */ ! 71: #define ANYOF 4 /* str Match any character in this string. */ ! 72: #define ANYBUT 5 /* str Match any character not in this string. */ ! 73: #define BRANCH 6 /* node Match this alternative, or the next... */ ! 74: #define BACK 7 /* no Match "", "next" ptr points backward. */ ! 75: #define EXACTLY 8 /* str Match this string. */ ! 76: #define NOTHING 9 /* no Match empty string. */ ! 77: #define STAR 10 /* node Match this (simple) thing 0 or more times. */ ! 78: #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ ! 79: #define OPEN 20 /* no Mark this point in input as start of #n. */ ! 80: /* OPEN+1 is number 1, etc. */ ! 81: #define CLOSE 30 /* no Analogous to OPEN. */ ! 82: ! 83: /* ! 84: * Opcode notes: ! 85: * ! 86: * BRANCH The set of branches constituting a single choice are hooked ! 87: * together with their "next" pointers, since precedence prevents ! 88: * anything being concatenated to any individual branch. The ! 89: * "next" pointer of the last BRANCH in a choice points to the ! 90: * thing following the whole choice. This is also where the ! 91: * final "next" pointer of each individual branch points; each ! 92: * branch starts with the operand node of a BRANCH node. ! 93: * ! 94: * BACK Normal "next" pointers all implicitly point forward; BACK ! 95: * exists to make loop structures possible. ! 96: * ! 97: * STAR,PLUS '?', and complex '*' and '+', are implemented as circular ! 98: * BRANCH structures using BACK. Simple cases (one character ! 99: * per match) are implemented with STAR and PLUS for speed ! 100: * and to minimize recursive plunges. ! 101: * ! 102: * OPEN,CLOSE ...are numbered at compile time. ! 103: */ ! 104: ! 105: /* ! 106: * A node is one char of opcode followed by two chars of "next" pointer. ! 107: * "Next" pointers are stored as two 8-bit pieces, high order first. The ! 108: * value is a positive offset from the opcode of the node containing it. ! 109: * An operand, if any, simply follows the node. (Note that much of the ! 110: * code generation knows about this implicit relationship.) ! 111: * ! 112: * Using two bytes for the "next" pointer is vast overkill for most things, ! 113: * but allows patterns to get big without disasters. ! 114: */ ! 115: #define OP(p) (*(p)) ! 116: #define NEXT(p) (((*((p)+1)&0377)<<8) + *((p)+2)&0377) ! 117: #define OPERAND(p) ((p) + 3) ! 118: ! 119: /* ! 120: * See regmagic.h for one further detail of program structure. ! 121: */ ! 122: ! 123: ! 124: /* ! 125: * Utility definitions. ! 126: */ ! 127: #ifndef CHARBITS ! 128: #define UCHARAT(p) ((int)*(unsigned char *)(p)) ! 129: #else ! 130: #define UCHARAT(p) ((int)*(p)&CHARBITS) ! 131: #endif ! 132: ! 133: #define FAIL(m) { regerror(m); return(NULL); } ! 134: #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') ! 135: #define META "^$.[()|?+*\\" ! 136: ! 137: /* ! 138: * Flags to be passed up and down. ! 139: */ ! 140: #define HASWIDTH 01 /* Known never to match null string. */ ! 141: #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ ! 142: #define SPSTART 04 /* Starts with * or +. */ ! 143: #define WORST 0 /* Worst case. */ ! 144: ! 145: /* ! 146: * Global work variables for regcomp(). ! 147: */ ! 148: static char *regparse; /* Input-scan pointer. */ ! 149: static int regnpar; /* () count. */ ! 150: static char regdummy; ! 151: static char *regcode; /* Code-emit pointer; ®dummy = don't. */ ! 152: static long regsize; /* Code size. */ ! 153: ! 154: /* ! 155: * Forward declarations for regcomp()'s friends. ! 156: */ ! 157: #ifndef STATIC ! 158: #define STATIC static ! 159: #endif ! 160: STATIC char *reg(); ! 161: STATIC char *regbranch(); ! 162: STATIC char *regpiece(); ! 163: STATIC char *regatom(); ! 164: STATIC char *regnode(); ! 165: STATIC char *regnext(); ! 166: STATIC void regc(); ! 167: STATIC void reginsert(); ! 168: STATIC void regtail(); ! 169: STATIC void regoptail(); ! 170: #ifdef STRCSPN ! 171: STATIC int strcspn(); ! 172: #endif ! 173: ! 174: /* ! 175: - regcomp - compile a regular expression into internal code ! 176: * ! 177: * We can't allocate space until we know how big the compiled form will be, ! 178: * but we can't compile it (and thus know how big it is) until we've got a ! 179: * place to put the code. So we cheat: we compile it twice, once with code ! 180: * generation turned off and size counting turned on, and once "for real". ! 181: * This also means that we don't allocate space until we are sure that the ! 182: * thing really will compile successfully, and we never have to move the ! 183: * code and thus invalidate pointers into it. (Note that it has to be in ! 184: * one piece because free() must be able to free it all.) ! 185: * ! 186: * Beware that the optimization-preparation code in here knows about some ! 187: * of the structure of the compiled regexp. ! 188: */ ! 189: regexp * ! 190: regcomp(exp) ! 191: char *exp; ! 192: { ! 193: register regexp *r; ! 194: register char *scan; ! 195: register char *longest; ! 196: register int len; ! 197: int flags; ! 198: extern char *alloc(); ! 199: ! 200: if (exp == NULL) ! 201: FAIL("NULL argument"); ! 202: ! 203: /* First pass: determine size, legality. */ ! 204: regparse = exp; ! 205: regnpar = 1; ! 206: regsize = 0L; ! 207: regcode = ®dummy; ! 208: regc(REG_MAGIC); ! 209: if (reg(0, &flags) == NULL) ! 210: return(NULL); ! 211: ! 212: /* Small enough for pointer-storage convention? */ ! 213: if (regsize >= 32767L) /* Probably could be 65535L. */ ! 214: FAIL("regexp too big"); ! 215: ! 216: /* Allocate space. */ ! 217: r = (regexp *)alloc(sizeof(regexp) + (unsigned)regsize); ! 218: ! 219: /* Second pass: emit code. */ ! 220: regparse = exp; ! 221: regnpar = 1; ! 222: regcode = r->program; ! 223: regc(REG_MAGIC); ! 224: if (reg(0, &flags) == NULL) ! 225: return(NULL); ! 226: ! 227: /* Dig out information for optimizations. */ ! 228: r->regstart = '\0'; /* Worst-case defaults. */ ! 229: r->reganch = 0; ! 230: r->regmust = NULL; ! 231: r->regmlen = 0; ! 232: scan = r->program+1; /* First BRANCH. */ ! 233: if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ ! 234: scan = OPERAND(scan); ! 235: ! 236: /* Starting-point info. */ ! 237: if (OP(scan) == EXACTLY) ! 238: r->regstart = *OPERAND(scan); ! 239: else if (OP(scan) == BOL) ! 240: r->reganch++; ! 241: ! 242: /* ! 243: * If there's something expensive in the r.e., find the ! 244: * longest literal string that must appear and make it the ! 245: * regmust. Resolve ties in favor of later strings, since ! 246: * the regstart check works with the beginning of the r.e. ! 247: * and avoiding duplication strengthens checking. Not a ! 248: * strong reason, but sufficient in the absence of others. ! 249: */ ! 250: if (flags&SPSTART) { ! 251: longest = NULL; ! 252: len = 0; ! 253: for (; scan != NULL; scan = regnext(scan)) ! 254: if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { ! 255: longest = OPERAND(scan); ! 256: len = strlen(OPERAND(scan)); ! 257: } ! 258: r->regmust = longest; ! 259: r->regmlen = len; ! 260: } ! 261: } ! 262: ! 263: return(r); ! 264: } ! 265: ! 266: /* ! 267: - reg - regular expression, i.e. main body or parenthesized thing ! 268: * ! 269: * Caller must absorb opening parenthesis. ! 270: * ! 271: * Combining parenthesis handling with the base level of regular expression ! 272: * is a trifle forced, but the need to tie the tails of the branches to what ! 273: * follows makes it hard to avoid. ! 274: */ ! 275: static char * ! 276: reg(paren, flagp) ! 277: int paren; /* Parenthesized? */ ! 278: int *flagp; ! 279: { ! 280: register char *ret; ! 281: register char *br; ! 282: register char *ender; ! 283: register int parno; ! 284: int flags; ! 285: ! 286: *flagp = HASWIDTH; /* Tentatively. */ ! 287: ! 288: /* Make an OPEN node, if parenthesized. */ ! 289: if (paren) { ! 290: if (regnpar >= NSUBEXP) ! 291: FAIL("too many ()"); ! 292: parno = regnpar; ! 293: regnpar++; ! 294: ret = regnode(OPEN+parno); ! 295: } else ! 296: ret = NULL; ! 297: ! 298: /* Pick up the branches, linking them together. */ ! 299: br = regbranch(&flags); ! 300: if (br == NULL) ! 301: return(NULL); ! 302: if (ret != NULL) ! 303: regtail(ret, br); /* OPEN -> first. */ ! 304: else ! 305: ret = br; ! 306: if (!(flags&HASWIDTH)) ! 307: *flagp &= ~HASWIDTH; ! 308: *flagp |= flags&SPSTART; ! 309: while (*regparse == '|') { ! 310: regparse++; ! 311: br = regbranch(&flags); ! 312: if (br == NULL) ! 313: return(NULL); ! 314: regtail(ret, br); /* BRANCH -> BRANCH. */ ! 315: if (!(flags&HASWIDTH)) ! 316: *flagp &= ~HASWIDTH; ! 317: *flagp |= flags&SPSTART; ! 318: } ! 319: ! 320: /* Make a closing node, and hook it on the end. */ ! 321: ender = regnode((paren) ? CLOSE+parno : END); ! 322: regtail(ret, ender); ! 323: ! 324: /* Hook the tails of the branches to the closing node. */ ! 325: for (br = ret; br != NULL; br = regnext(br)) ! 326: regoptail(br, ender); ! 327: ! 328: /* Check for proper termination. */ ! 329: if (paren && *regparse++ != ')') { ! 330: FAIL("unmatched ()"); ! 331: } else if (!paren && *regparse != '\0') { ! 332: if (*regparse == ')') { ! 333: FAIL("unmatched ()"); ! 334: } else ! 335: FAIL("junk on end"); /* "Can't happen". */ ! 336: /* NOTREACHED */ ! 337: } ! 338: ! 339: return(ret); ! 340: } ! 341: ! 342: /* ! 343: - regbranch - one alternative of an | operator ! 344: * ! 345: * Implements the concatenation operator. ! 346: */ ! 347: static char * ! 348: regbranch(flagp) ! 349: int *flagp; ! 350: { ! 351: register char *ret; ! 352: register char *chain; ! 353: register char *latest; ! 354: int flags; ! 355: ! 356: *flagp = WORST; /* Tentatively. */ ! 357: ! 358: ret = regnode(BRANCH); ! 359: chain = NULL; ! 360: while (*regparse != '\0' && *regparse != '|' && *regparse != ')') { ! 361: latest = regpiece(&flags); ! 362: if (latest == NULL) ! 363: return(NULL); ! 364: *flagp |= flags&HASWIDTH; ! 365: if (chain == NULL) /* First piece. */ ! 366: *flagp |= flags&SPSTART; ! 367: else ! 368: regtail(chain, latest); ! 369: chain = latest; ! 370: } ! 371: if (chain == NULL) /* Loop ran zero times. */ ! 372: (void) regnode(NOTHING); ! 373: ! 374: return(ret); ! 375: } ! 376: ! 377: /* ! 378: - regpiece - something followed by possible [*+?] ! 379: * ! 380: * Note that the branching code sequences used for ? and the general cases ! 381: * of * and + are somewhat optimized: they use the same NOTHING node as ! 382: * both the endmarker for their branch list and the body of the last branch. ! 383: * It might seem that this node could be dispensed with entirely, but the ! 384: * endmarker role is not redundant. ! 385: */ ! 386: static char * ! 387: regpiece(flagp) ! 388: int *flagp; ! 389: { ! 390: register char *ret; ! 391: register char op; ! 392: register char *next; ! 393: int flags; ! 394: ! 395: ret = regatom(&flags); ! 396: if (ret == NULL) ! 397: return(NULL); ! 398: ! 399: op = *regparse; ! 400: if (!ISMULT(op)) { ! 401: *flagp = flags; ! 402: return(ret); ! 403: } ! 404: ! 405: if (!(flags&HASWIDTH) && op != '?') ! 406: FAIL("*+ operand could be empty"); ! 407: *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); ! 408: ! 409: if (op == '*' && (flags&SIMPLE)) ! 410: reginsert(STAR, ret); ! 411: else if (op == '*') { ! 412: /* Emit x* as (x&|), where & means "self". */ ! 413: reginsert(BRANCH, ret); /* Either x */ ! 414: regoptail(ret, regnode(BACK)); /* and loop */ ! 415: regoptail(ret, ret); /* back */ ! 416: regtail(ret, regnode(BRANCH)); /* or */ ! 417: regtail(ret, regnode(NOTHING)); /* null. */ ! 418: } else if (op == '+' && (flags&SIMPLE)) ! 419: reginsert(PLUS, ret); ! 420: else if (op == '+') { ! 421: /* Emit x+ as x(&|), where & means "self". */ ! 422: next = regnode(BRANCH); /* Either */ ! 423: regtail(ret, next); ! 424: regtail(regnode(BACK), ret); /* loop back */ ! 425: regtail(next, regnode(BRANCH)); /* or */ ! 426: regtail(ret, regnode(NOTHING)); /* null. */ ! 427: } else if (op == '?') { ! 428: /* Emit x? as (x|) */ ! 429: reginsert(BRANCH, ret); /* Either x */ ! 430: regtail(ret, regnode(BRANCH)); /* or */ ! 431: next = regnode(NOTHING); /* null. */ ! 432: regtail(ret, next); ! 433: regoptail(ret, next); ! 434: } ! 435: regparse++; ! 436: if (ISMULT(*regparse)) ! 437: FAIL("nested *?+"); ! 438: ! 439: return(ret); ! 440: } ! 441: ! 442: /* ! 443: - regatom - the lowest level ! 444: * ! 445: * Optimization: gobbles an entire sequence of ordinary characters so that ! 446: * it can turn them into a single node, which is smaller to store and ! 447: * faster to run. Backslashed characters are exceptions, each becoming a ! 448: * separate node; the code is simpler that way and it's not worth fixing. ! 449: */ ! 450: static char * ! 451: regatom(flagp) ! 452: int *flagp; ! 453: { ! 454: register char *ret; ! 455: int flags; ! 456: ! 457: *flagp = WORST; /* Tentatively. */ ! 458: ! 459: switch (*regparse++) { ! 460: case '^': ! 461: ret = regnode(BOL); ! 462: break; ! 463: case '$': ! 464: ret = regnode(EOL); ! 465: break; ! 466: case '.': ! 467: ret = regnode(ANY); ! 468: *flagp |= HASWIDTH|SIMPLE; ! 469: break; ! 470: case '[': { ! 471: register int class; ! 472: register int classend; ! 473: ! 474: if (*regparse == '^') { /* Complement of range. */ ! 475: ret = regnode(ANYBUT); ! 476: regparse++; ! 477: } else ! 478: ret = regnode(ANYOF); ! 479: if (*regparse == ']' || *regparse == '-') ! 480: regc(*regparse++); ! 481: while (*regparse != '\0' && *regparse != ']') { ! 482: if (*regparse == '-') { ! 483: regparse++; ! 484: if (*regparse == ']' || *regparse == '\0') ! 485: regc('-'); ! 486: else { ! 487: class = UCHARAT(regparse-2)+1; ! 488: classend = UCHARAT(regparse); ! 489: if (class > classend+1) ! 490: FAIL("invalid [] range"); ! 491: for (; class <= classend; class++) ! 492: regc(class); ! 493: regparse++; ! 494: } ! 495: } else ! 496: regc(*regparse++); ! 497: } ! 498: regc('\0'); ! 499: if (*regparse != ']') ! 500: FAIL("unmatched []"); ! 501: regparse++; ! 502: *flagp |= HASWIDTH|SIMPLE; ! 503: } ! 504: break; ! 505: case '(': ! 506: ret = reg(1, &flags); ! 507: if (ret == NULL) ! 508: return(NULL); ! 509: *flagp |= flags&(HASWIDTH|SPSTART); ! 510: break; ! 511: case '\0': ! 512: case '|': ! 513: case ')': ! 514: FAIL("internal urp"); /* Supposed to be caught earlier. */ ! 515: break; ! 516: case '?': ! 517: case '+': ! 518: case '*': ! 519: FAIL("?+* follows nothing"); ! 520: break; ! 521: case '\\': ! 522: if (*regparse == '\0') ! 523: FAIL("trailing \\"); ! 524: ret = regnode(EXACTLY); ! 525: regc(*regparse++); ! 526: regc('\0'); ! 527: *flagp |= HASWIDTH|SIMPLE; ! 528: break; ! 529: default: { ! 530: register int len; ! 531: register char ender; ! 532: ! 533: regparse--; ! 534: len = strcspn(regparse, META); ! 535: if (len <= 0) ! 536: FAIL("internal disaster"); ! 537: ender = *(regparse+len); ! 538: if (len > 1 && ISMULT(ender)) ! 539: len--; /* Back off clear of ?+* operand. */ ! 540: *flagp |= HASWIDTH; ! 541: if (len == 1) ! 542: *flagp |= SIMPLE; ! 543: ret = regnode(EXACTLY); ! 544: while (len > 0) { ! 545: regc(*regparse++); ! 546: len--; ! 547: } ! 548: regc('\0'); ! 549: } ! 550: break; ! 551: } ! 552: ! 553: return(ret); ! 554: } ! 555: ! 556: /* ! 557: - regnode - emit a node ! 558: */ ! 559: static char * /* Location. */ ! 560: regnode(op) ! 561: char op; ! 562: { ! 563: register char *ret; ! 564: register char *ptr; ! 565: ! 566: ret = regcode; ! 567: if (ret == ®dummy) { ! 568: regsize += 3; ! 569: return(ret); ! 570: } ! 571: ! 572: ptr = ret; ! 573: *ptr++ = op; ! 574: *ptr++ = '\0'; /* Null "next" pointer. */ ! 575: *ptr++ = '\0'; ! 576: regcode = ptr; ! 577: ! 578: return(ret); ! 579: } ! 580: ! 581: /* ! 582: - regc - emit (if appropriate) a byte of code ! 583: */ ! 584: static void ! 585: regc(b) ! 586: char b; ! 587: { ! 588: if (regcode != ®dummy) ! 589: *regcode++ = b; ! 590: else ! 591: regsize++; ! 592: } ! 593: ! 594: /* ! 595: - reginsert - insert an operator in front of already-emitted operand ! 596: * ! 597: * Means relocating the operand. ! 598: */ ! 599: static void ! 600: reginsert(op, opnd) ! 601: char op; ! 602: char *opnd; ! 603: { ! 604: register char *src; ! 605: register char *dst; ! 606: register char *place; ! 607: ! 608: if (regcode == ®dummy) { ! 609: regsize += 3; ! 610: return; ! 611: } ! 612: ! 613: src = regcode; ! 614: regcode += 3; ! 615: dst = regcode; ! 616: while (src > opnd) ! 617: *--dst = *--src; ! 618: ! 619: place = opnd; /* Op node, where operand used to be. */ ! 620: *place++ = op; ! 621: *place++ = '\0'; ! 622: *place++ = '\0'; ! 623: } ! 624: ! 625: /* ! 626: - regtail - set the next-pointer at the end of a node chain ! 627: */ ! 628: static void ! 629: regtail(p, val) ! 630: char *p; ! 631: char *val; ! 632: { ! 633: register char *scan; ! 634: register char *temp; ! 635: register int offset; ! 636: ! 637: if (p == ®dummy) ! 638: return; ! 639: ! 640: /* Find last node. */ ! 641: scan = p; ! 642: for (;;) { ! 643: temp = regnext(scan); ! 644: if (temp == NULL) ! 645: break; ! 646: scan = temp; ! 647: } ! 648: ! 649: if (OP(scan) == BACK) ! 650: offset = scan - val; ! 651: else ! 652: offset = val - scan; ! 653: *(scan+1) = (offset>>8)&0377; ! 654: *(scan+2) = offset&0377; ! 655: } ! 656: ! 657: /* ! 658: - regoptail - regtail on operand of first argument; nop if operandless ! 659: */ ! 660: static void ! 661: regoptail(p, val) ! 662: char *p; ! 663: char *val; ! 664: { ! 665: /* "Operandless" and "op != BRANCH" are synonymous in practice. */ ! 666: if (p == NULL || p == ®dummy || OP(p) != BRANCH) ! 667: return; ! 668: regtail(OPERAND(p), val); ! 669: } ! 670: ! 671: /* ! 672: * regexec and friends ! 673: */ ! 674: ! 675: /* ! 676: * Global work variables for regexec(). ! 677: */ ! 678: static char *reginput; /* String-input pointer. */ ! 679: static char *regbol; /* Beginning of input, for ^ check. */ ! 680: static char **regstartp; /* Pointer to startp array. */ ! 681: static char **regendp; /* Ditto for endp. */ ! 682: ! 683: /* ! 684: * Forwards. ! 685: */ ! 686: STATIC int regtry(); ! 687: STATIC int regmatch(); ! 688: STATIC int regrepeat(); ! 689: ! 690: #ifdef DEBUG ! 691: int regnarrate = 0; ! 692: void regdump(); ! 693: STATIC char *regprop(); ! 694: #endif ! 695: ! 696: /* ! 697: - regexec - match a regexp against a string ! 698: */ ! 699: int ! 700: regexec(prog, string, fromline) ! 701: register regexp *prog; ! 702: register char *string; ! 703: { ! 704: int result; ! 705: register char *s; ! 706: extern char *strchr(); ! 707: ! 708: /* Be paranoid... */ ! 709: if (prog == NULL || string == NULL) { ! 710: regerror("NULL parameter"); ! 711: return(0); ! 712: } ! 713: ! 714: /* Check validity of program. */ ! 715: if (UCHARAT(prog->program) != (unsigned char)REG_MAGIC) { ! 716: regerror("corrupted program"); ! 717: return(0); ! 718: } ! 719: ! 720: /* If there is a "must appear" string, look for it. */ ! 721: if (prog->regmust != NULL) { ! 722: s = string; ! 723: while ((s = strchr(s, prog->regmust[0])) != NULL) { ! 724: if (strncmp(s, prog->regmust, prog->regmlen) == 0) ! 725: break; /* Found it. */ ! 726: s++; ! 727: } ! 728: if (s == NULL) { /* Not present. */ ! 729: /* printf("return 0 from %d\n", fromline); */ ! 730: return(0); ! 731: } ! 732: } ! 733: ! 734: /* Mark beginning of line for ^ . */ ! 735: regbol = string; ! 736: ! 737: /* Simplest case: anchored match need be tried only once. */ ! 738: if (prog->reganch) { ! 739: result = regtry(prog, string); ! 740: /* printf("return %d from %d\n", result, fromline); */ ! 741: return result; ! 742: } ! 743: ! 744: /* Messy cases: unanchored match. */ ! 745: s = string; ! 746: if (prog->regstart != '\0') ! 747: /* We know what char it must start with. */ ! 748: while ((s = strchr(s, prog->regstart)) != NULL) { ! 749: if (regtry(prog, s)) { ! 750: /* printf("return 1 from %d\n", fromline); */ ! 751: return(1); ! 752: } ! 753: s++; ! 754: } ! 755: else ! 756: /* We don't -- general case. */ ! 757: do { ! 758: if (regtry(prog, s)) { ! 759: /* printf("return 1 from %d\n", fromline); */ ! 760: return(1); ! 761: } ! 762: } while (*s++ != '\0'); ! 763: ! 764: /* Failure. */ ! 765: /* printf("return 0 from %d\n", fromline); */ ! 766: return(0); ! 767: } ! 768: ! 769: /* ! 770: - regtry - try match at specific point ! 771: */ ! 772: static int /* 0 failure, 1 success */ ! 773: regtry(prog, string) ! 774: regexp *prog; ! 775: char *string; ! 776: { ! 777: register int i; ! 778: register char **sp; ! 779: register char **ep; ! 780: ! 781: reginput = string; ! 782: regstartp = prog->startp; ! 783: regendp = prog->endp; ! 784: ! 785: sp = prog->startp; ! 786: ep = prog->endp; ! 787: for (i = NSUBEXP; i > 0; i--) { ! 788: *sp++ = NULL; ! 789: *ep++ = NULL; ! 790: } ! 791: if (regmatch(prog->program + 1)) { ! 792: prog->startp[0] = string; ! 793: prog->endp[0] = reginput; ! 794: return(1); ! 795: } else ! 796: return(0); ! 797: } ! 798: ! 799: /* ! 800: - regmatch - main matching routine ! 801: * ! 802: * Conceptually the strategy is simple: check to see whether the current ! 803: * node matches, call self recursively to see whether the rest matches, ! 804: * and then act accordingly. In practice we make some effort to avoid ! 805: * recursion, in particular by going through "ordinary" nodes (that don't ! 806: * need to know whether the rest of the match failed) by a loop instead of ! 807: * by recursion. ! 808: */ ! 809: static int /* 0 failure, 1 success */ ! 810: regmatch(prog) ! 811: char *prog; ! 812: { ! 813: register char *scan; /* Current node. */ ! 814: char *next; /* Next node. */ ! 815: extern char *strchr(); ! 816: ! 817: scan = prog; ! 818: #ifdef DEBUG ! 819: if (scan != NULL && regnarrate) ! 820: fprintf(stderr, "%s(\n", regprop(scan)); ! 821: #endif ! 822: while (scan != NULL) { ! 823: #ifdef DEBUG ! 824: if (regnarrate) ! 825: fprintf(stderr, "%s...\n", regprop(scan)); ! 826: #endif ! 827: next = regnext(scan); ! 828: ! 829: switch (OP(scan)) { ! 830: case BOL: ! 831: if (reginput != regbol) ! 832: return(0); ! 833: break; ! 834: case EOL: ! 835: if (*reginput != '\0') ! 836: return(0); ! 837: break; ! 838: case ANY: ! 839: if (*reginput == '\0') ! 840: return(0); ! 841: reginput++; ! 842: break; ! 843: case EXACTLY: { ! 844: register int len; ! 845: register char *opnd; ! 846: ! 847: opnd = OPERAND(scan); ! 848: /* Inline the first character, for speed. */ ! 849: if (*opnd != *reginput) ! 850: return(0); ! 851: len = strlen(opnd); ! 852: if (len > 1 && strncmp(opnd, reginput, len) != 0) ! 853: return(0); ! 854: reginput += len; ! 855: } ! 856: break; ! 857: case ANYOF: ! 858: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) ! 859: return(0); ! 860: reginput++; ! 861: break; ! 862: case ANYBUT: ! 863: if (strchr(OPERAND(scan), *reginput) != NULL) ! 864: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) ! 865: return(0); ! 866: reginput++; ! 867: break; ! 868: case NOTHING: ! 869: break; ! 870: case BACK: ! 871: break; ! 872: case OPEN+1: ! 873: case OPEN+2: ! 874: case OPEN+3: ! 875: case OPEN+4: ! 876: case OPEN+5: ! 877: case OPEN+6: ! 878: case OPEN+7: ! 879: case OPEN+8: ! 880: case OPEN+9: { ! 881: register int no; ! 882: register char *save; ! 883: ! 884: no = OP(scan) - OPEN; ! 885: save = reginput; ! 886: ! 887: if (regmatch(next)) { ! 888: /* ! 889: * Don't set startp if some later ! 890: * invocation of the same parentheses ! 891: * already has. ! 892: */ ! 893: if (regstartp[no] == NULL) ! 894: regstartp[no] = save; ! 895: return(1); ! 896: } else ! 897: return(0); ! 898: } ! 899: break; ! 900: case CLOSE+1: ! 901: case CLOSE+2: ! 902: case CLOSE+3: ! 903: case CLOSE+4: ! 904: case CLOSE+5: ! 905: case CLOSE+6: ! 906: case CLOSE+7: ! 907: case CLOSE+8: ! 908: case CLOSE+9: { ! 909: register int no; ! 910: register char *save; ! 911: ! 912: no = OP(scan) - CLOSE; ! 913: save = reginput; ! 914: ! 915: if (regmatch(next)) { ! 916: /* ! 917: * Don't set endp if some later ! 918: * invocation of the same parentheses ! 919: * already has. ! 920: */ ! 921: if (regendp[no] == NULL) ! 922: regendp[no] = save; ! 923: return(1); ! 924: } else ! 925: return(0); ! 926: } ! 927: break; ! 928: case BRANCH: { ! 929: register char *save; ! 930: ! 931: if (OP(next) != BRANCH) /* No choice. */ ! 932: next = OPERAND(scan); /* Avoid recursion. */ ! 933: else { ! 934: do { ! 935: save = reginput; ! 936: if (regmatch(OPERAND(scan))) ! 937: return(1); ! 938: reginput = save; ! 939: scan = regnext(scan); ! 940: } while (scan != NULL && OP(scan) == BRANCH); ! 941: return(0); ! 942: /* NOTREACHED */ ! 943: } ! 944: } ! 945: break; ! 946: case STAR: ! 947: case PLUS: { ! 948: register char nextch; ! 949: register int no; ! 950: register char *save; ! 951: register int min; ! 952: ! 953: /* ! 954: * Lookahead to avoid useless match attempts ! 955: * when we know what character comes next. ! 956: */ ! 957: nextch = '\0'; ! 958: if (OP(next) == EXACTLY) ! 959: nextch = *OPERAND(next); ! 960: min = (OP(scan) == STAR) ? 0 : 1; ! 961: save = reginput; ! 962: no = regrepeat(OPERAND(scan)); ! 963: while (no >= min) { ! 964: /* If it could work, try it. */ ! 965: if (nextch == '\0' || *reginput == nextch) ! 966: if (regmatch(next)) ! 967: return(1); ! 968: /* Couldn't or didn't -- back up. */ ! 969: no--; ! 970: reginput = save + no; ! 971: } ! 972: return(0); ! 973: } ! 974: break; ! 975: case END: ! 976: return(1); /* Success! */ ! 977: break; ! 978: default: ! 979: regerror("memory corruption"); ! 980: return(0); ! 981: break; ! 982: } ! 983: ! 984: scan = next; ! 985: } ! 986: ! 987: /* ! 988: * We get here only if there's trouble -- normally "case END" is ! 989: * the terminating point. ! 990: */ ! 991: regerror("corrupted pointers"); ! 992: return(0); ! 993: } ! 994: ! 995: /* ! 996: - regrepeat - repeatedly match something simple, report how many ! 997: */ ! 998: static int ! 999: regrepeat(p) ! 1000: char *p; ! 1001: { ! 1002: register int count = 0; ! 1003: register char *scan; ! 1004: register char *opnd; ! 1005: ! 1006: scan = reginput; ! 1007: opnd = OPERAND(p); ! 1008: switch (OP(p)) { ! 1009: case ANY: ! 1010: count = strlen(scan); ! 1011: scan += count; ! 1012: break; ! 1013: case EXACTLY: ! 1014: while (*opnd == *scan) { ! 1015: count++; ! 1016: scan++; ! 1017: } ! 1018: break; ! 1019: case ANYOF: ! 1020: while (*scan != '\0' && strchr(opnd, *scan) != NULL) { ! 1021: count++; ! 1022: scan++; ! 1023: } ! 1024: break; ! 1025: case ANYBUT: ! 1026: while (*scan != '\0' && strchr(opnd, *scan) == NULL) { ! 1027: count++; ! 1028: scan++; ! 1029: } ! 1030: break; ! 1031: default: /* Oh dear. Called inappropriately. */ ! 1032: regerror("internal foulup"); ! 1033: count = 0; /* Best compromise. */ ! 1034: break; ! 1035: } ! 1036: reginput = scan; ! 1037: ! 1038: return(count); ! 1039: } ! 1040: ! 1041: /* ! 1042: - regnext - dig the "next" pointer out of a node ! 1043: */ ! 1044: static char * ! 1045: regnext(p) ! 1046: register char *p; ! 1047: { ! 1048: register int offset; ! 1049: ! 1050: if (p == ®dummy) ! 1051: return(NULL); ! 1052: ! 1053: offset = NEXT(p); ! 1054: if (offset == 0) ! 1055: return(NULL); ! 1056: ! 1057: if (OP(p) == BACK) ! 1058: return(p-offset); ! 1059: else ! 1060: return(p+offset); ! 1061: } ! 1062: ! 1063: #ifdef DEBUG ! 1064: ! 1065: STATIC char *regprop(); ! 1066: ! 1067: /* ! 1068: - regdump - dump a regexp onto stdout in vaguely comprehensible form ! 1069: */ ! 1070: void ! 1071: regdump(r) ! 1072: regexp *r; ! 1073: { ! 1074: register char *s; ! 1075: register char op = EXACTLY; /* Arbitrary non-END op. */ ! 1076: register char *next; ! 1077: extern char *strchr(); ! 1078: ! 1079: ! 1080: s = r->program + 1; ! 1081: while (op != END) { /* While that wasn't END last time... */ ! 1082: op = OP(s); ! 1083: printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ ! 1084: next = regnext(s); ! 1085: if (next == NULL) /* Next ptr. */ ! 1086: printf("(0)"); ! 1087: else ! 1088: printf("(%d)", (s-r->program)+(next-s)); ! 1089: s += 3; ! 1090: if (op == ANYOF || op == ANYBUT || op == EXACTLY) { ! 1091: /* Literal string, where present. */ ! 1092: while (*s != '\0') { ! 1093: putchar(*s); ! 1094: s++; ! 1095: } ! 1096: s++; ! 1097: } ! 1098: putchar('\n'); ! 1099: } ! 1100: ! 1101: /* Header fields of interest. */ ! 1102: if (r->regstart != '\0') ! 1103: printf("start `%c' ", r->regstart); ! 1104: if (r->reganch) ! 1105: printf("anchored "); ! 1106: if (r->regmust != NULL) ! 1107: printf("must have \"%s\"", r->regmust); ! 1108: printf("\n"); ! 1109: } ! 1110: ! 1111: /* ! 1112: - regprop - printable representation of opcode ! 1113: */ ! 1114: static char * ! 1115: regprop(op) ! 1116: char *op; ! 1117: { ! 1118: register char *p; ! 1119: static char buf[50]; ! 1120: ! 1121: (void) strcpy(buf, ":"); ! 1122: ! 1123: switch (OP(op)) { ! 1124: case BOL: ! 1125: p = "BOL"; ! 1126: break; ! 1127: case EOL: ! 1128: p = "EOL"; ! 1129: break; ! 1130: case ANY: ! 1131: p = "ANY"; ! 1132: break; ! 1133: case ANYOF: ! 1134: p = "ANYOF"; ! 1135: break; ! 1136: case ANYBUT: ! 1137: p = "ANYBUT"; ! 1138: break; ! 1139: case BRANCH: ! 1140: p = "BRANCH"; ! 1141: break; ! 1142: case EXACTLY: ! 1143: p = "EXACTLY"; ! 1144: break; ! 1145: case NOTHING: ! 1146: p = "NOTHING"; ! 1147: break; ! 1148: case BACK: ! 1149: p = "BACK"; ! 1150: break; ! 1151: case END: ! 1152: p = "END"; ! 1153: break; ! 1154: case OPEN+1: ! 1155: case OPEN+2: ! 1156: case OPEN+3: ! 1157: case OPEN+4: ! 1158: case OPEN+5: ! 1159: case OPEN+6: ! 1160: case OPEN+7: ! 1161: case OPEN+8: ! 1162: case OPEN+9: ! 1163: sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); ! 1164: p = NULL; ! 1165: break; ! 1166: case CLOSE+1: ! 1167: case CLOSE+2: ! 1168: case CLOSE+3: ! 1169: case CLOSE+4: ! 1170: case CLOSE+5: ! 1171: case CLOSE+6: ! 1172: case CLOSE+7: ! 1173: case CLOSE+8: ! 1174: case CLOSE+9: ! 1175: sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); ! 1176: p = NULL; ! 1177: break; ! 1178: case STAR: ! 1179: p = "STAR"; ! 1180: break; ! 1181: case PLUS: ! 1182: p = "PLUS"; ! 1183: break; ! 1184: default: ! 1185: regerror("corrupted opcode"); ! 1186: break; ! 1187: } ! 1188: if (p != NULL) ! 1189: (void) strcat(buf, p); ! 1190: return(buf); ! 1191: } ! 1192: #endif ! 1193: ! 1194: /* ! 1195: * The following is provided for those people who do not have strcspn() in ! 1196: * their C libraries. They should get off their butts and do something ! 1197: * about it; at least one public-domain implementation of those (highly ! 1198: * useful) string routines has been published on Usenet. ! 1199: */ ! 1200: #ifdef STRCSPN ! 1201: /* ! 1202: * strcspn - find length of initial segment of s1 consisting entirely ! 1203: * of characters not from s2 ! 1204: */ ! 1205: ! 1206: static int ! 1207: strcspn(s1, s2) ! 1208: char *s1; ! 1209: char *s2; ! 1210: { ! 1211: register char *scan1; ! 1212: register char *scan2; ! 1213: register int count; ! 1214: ! 1215: count = 0; ! 1216: for (scan1 = s1; *scan1 != '\0'; scan1++) { ! 1217: for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ ! 1218: if (*scan1 == *scan2++) ! 1219: return(count); ! 1220: count++; ! 1221: } ! 1222: return(count); ! 1223: } ! 1224: #endif
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.