|
|
1.1 ! root 1: /* ! 2: * regcomp and regexec -- regsub and regerror are elsewhere ! 3: * ! 4: * Copyright (c) 1986 by University of Toronto. ! 5: * Written by Henry Spencer. Not derived from licensed software. ! 6: * ! 7: * Permission is granted to anyone to use this software for any ! 8: * purpose on any computer system, and to redistribute it freely, ! 9: * subject to the following restrictions: ! 10: * ! 11: * 1. The author is not responsible for the consequences of use of ! 12: * this software, no matter how awful, even if they arise ! 13: * from defects in it. ! 14: * ! 15: * 2. The origin of this software must not be misrepresented, either ! 16: * by explicit claim or by omission. ! 17: * ! 18: * 3. Altered versions must be plainly marked as such, and must not ! 19: * be misrepresented as being the original software. ! 20: *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, ! 21: *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | ! 22: *** to assist in implementing egrep. ! 23: *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, ! 24: *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching ! 25: *** as in BSD grep and ex. ! 26: *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, ! 27: *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. ! 28: *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, ! 29: *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. ! 30: * ! 31: * Beware that some of this code is subtly aware of the way operator ! 32: * precedence is structured in regular expressions. Serious changes in ! 33: * regular-expression syntax might require a total rethink. ! 34: */ ! 35: #include <stdio.h> ! 36: #include <ctype.h> ! 37: #include <regexp.h> ! 38: #include "regmagic.h" ! 39: ! 40: /* ! 41: * The "internal use only" fields in regexp.h are present to pass info from ! 42: * compile to execute that permits the execute phase to run lots faster on ! 43: * simple cases. They are: ! 44: * ! 45: * regstart char that must begin a match; '\0' if none obvious ! 46: * reganch is the match anchored (at beginning-of-line only)? ! 47: * regmust string (pointer into program) that match must include, or NULL ! 48: * regmlen length of regmust string ! 49: * ! 50: * Regstart and reganch permit very fast decisions on suitable starting points ! 51: * for a match, cutting down the work a lot. Regmust permits fast rejection ! 52: * of lines that cannot possibly match. The regmust tests are costly enough ! 53: * that regcomp() supplies a regmust only if the r.e. contains something ! 54: * potentially expensive (at present, the only such thing detected is * or + ! 55: * at the start of the r.e., which can involve a lot of backup). Regmlen is ! 56: * supplied because the test in regexec() needs it and regcomp() is computing ! 57: * it anyway. ! 58: */ ! 59: ! 60: /* ! 61: * Structure for regexp "program". This is essentially a linear encoding ! 62: * of a nondeterministic finite-state machine (aka syntax charts or ! 63: * "railroad normal form" in parsing technology). Each node is an opcode ! 64: * plus a "next" pointer, possibly plus an operand. "Next" pointers of ! 65: * all nodes except BRANCH implement concatenation; a "next" pointer with ! 66: * a BRANCH on both ends of it is connecting two alternatives. (Here we ! 67: * have one of the subtle syntax dependencies: an individual BRANCH (as ! 68: * opposed to a collection of them) is never concatenated with anything ! 69: * because of operator precedence.) The operand of some types of node is ! 70: * a literal string; for others, it is a node leading into a sub-FSM. In ! 71: * particular, the operand of a BRANCH node is the first node of the branch. ! 72: * (NB this is *not* a tree structure: the tail of the branch connects ! 73: * to the thing following the set of BRANCHes.) The opcodes are: ! 74: */ ! 75: ! 76: /* definition number opnd? meaning */ ! 77: #define END 0 /* no End of program. */ ! 78: #define BOL 1 /* no Match "" at beginning of line. */ ! 79: #define EOL 2 /* no Match "" at end of line. */ ! 80: #define ANY 3 /* no Match any one character. */ ! 81: #define ANYOF 4 /* str Match any character in this string. */ ! 82: #define ANYBUT 5 /* str Match any character not in this string. */ ! 83: #define BRANCH 6 /* node Match this alternative, or the next... */ ! 84: #define BACK 7 /* no Match "", "next" ptr points backward. */ ! 85: #define EXACTLY 8 /* str Match this string. */ ! 86: #define NOTHING 9 /* no Match empty string. */ ! 87: #define STAR 10 /* node Match this (simple) thing 0 or more times. */ ! 88: #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ ! 89: #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */ ! 90: #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */ ! 91: #define OPEN 20 /* no Mark this point in input as start of #n. */ ! 92: /* OPEN+1 is number 1, etc. */ ! 93: #define CLOSE 30 /* no Analogous to OPEN. */ ! 94: ! 95: /* ! 96: * Opcode notes: ! 97: * ! 98: * BRANCH The set of branches constituting a single choice are hooked ! 99: * together with their "next" pointers, since precedence prevents ! 100: * anything being concatenated to any individual branch. The ! 101: * "next" pointer of the last BRANCH in a choice points to the ! 102: * thing following the whole choice. This is also where the ! 103: * final "next" pointer of each individual branch points; each ! 104: * branch starts with the operand node of a BRANCH node. ! 105: * ! 106: * BACK Normal "next" pointers all implicitly point forward; BACK ! 107: * exists to make loop structures possible. ! 108: * ! 109: * STAR,PLUS '?', and complex '*' and '+', are implemented as circular ! 110: * BRANCH structures using BACK. Simple cases (one character ! 111: * per match) are implemented with STAR and PLUS for speed ! 112: * and to minimize recursive plunges. ! 113: * ! 114: * OPEN,CLOSE ...are numbered at compile time. ! 115: */ ! 116: ! 117: /* ! 118: * A node is one char of opcode followed by two chars of "next" pointer. ! 119: * "Next" pointers are stored as two 8-bit pieces, high order first. The ! 120: * value is a positive offset from the opcode of the node containing it. ! 121: * An operand, if any, simply follows the node. (Note that much of the ! 122: * code generation knows about this implicit relationship.) ! 123: * ! 124: * Using two bytes for the "next" pointer is vast overkill for most things, ! 125: * but allows patterns to get big without disasters. ! 126: */ ! 127: #define OP(p) (*(p)) ! 128: #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) ! 129: #define OPERAND(p) ((p) + 3) ! 130: ! 131: /* ! 132: * See regmagic.h for one further detail of program structure. ! 133: */ ! 134: ! 135: ! 136: /* ! 137: * Utility definitions. ! 138: */ ! 139: #ifndef CHARBITS ! 140: #define UCHARAT(p) ((int)*(unsigned char *)(p)) ! 141: #else ! 142: #define UCHARAT(p) ((int)*(p)&CHARBITS) ! 143: #endif ! 144: ! 145: #define FAIL(m) { regerror(m); return(NULL); } ! 146: #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') ! 147: ! 148: /* ! 149: * Flags to be passed up and down. ! 150: */ ! 151: #define HASWIDTH 01 /* Known never to match null string. */ ! 152: #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ ! 153: #define SPSTART 04 /* Starts with * or +. */ ! 154: #define WORST 0 /* Worst case. */ ! 155: ! 156: /* ! 157: * Global work variables for regcomp(). ! 158: */ ! 159: static char *regparse; /* Input-scan pointer. */ ! 160: static int regnpar; /* () count. */ ! 161: static char regdummy; ! 162: static char *regcode; /* Code-emit pointer; ®dummy = don't. */ ! 163: static long regsize; /* Code size. */ ! 164: ! 165: /* ! 166: * Forward declarations for regcomp()'s friends. ! 167: */ ! 168: #ifndef STATIC ! 169: #define STATIC static ! 170: #endif ! 171: STATIC char *reg(); ! 172: STATIC char *regbranch(); ! 173: STATIC char *regpiece(); ! 174: STATIC char *regatom(); ! 175: STATIC char *regnode(); ! 176: STATIC char *regnext(); ! 177: STATIC void regc(); ! 178: STATIC void reginsert(); ! 179: STATIC void regtail(); ! 180: STATIC void regoptail(); ! 181: #ifdef STRCSPN ! 182: STATIC int strcspn(); ! 183: #endif ! 184: ! 185: /* ! 186: - regcomp - compile a regular expression into internal code ! 187: * ! 188: * We can't allocate space until we know how big the compiled form will be, ! 189: * but we can't compile it (and thus know how big it is) until we've got a ! 190: * place to put the code. So we cheat: we compile it twice, once with code ! 191: * generation turned off and size counting turned on, and once "for real". ! 192: * This also means that we don't allocate space until we are sure that the ! 193: * thing really will compile successfully, and we never have to move the ! 194: * code and thus invalidate pointers into it. (Note that it has to be in ! 195: * one piece because free() must be able to free it all.) ! 196: * ! 197: * Beware that the optimization-preparation code in here knows about some ! 198: * of the structure of the compiled regexp. ! 199: */ ! 200: regexp * ! 201: regcomp(exp) ! 202: char *exp; ! 203: { ! 204: register regexp *r; ! 205: register char *scan; ! 206: register char *longest; ! 207: register int len; ! 208: int flags; ! 209: extern char *malloc(); ! 210: ! 211: if (exp == NULL) ! 212: FAIL("NULL argument"); ! 213: ! 214: /* First pass: determine size, legality. */ ! 215: if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */ ! 216: regparse = exp; ! 217: regnpar = 1; ! 218: regsize = 0L; ! 219: regcode = ®dummy; ! 220: regc(MAGIC); ! 221: if (reg(0, &flags) == NULL) ! 222: return(NULL); ! 223: ! 224: /* Small enough for pointer-storage convention? */ ! 225: if (regsize >= 32767L) /* Probably could be 65535L. */ ! 226: FAIL("regexp too big"); ! 227: ! 228: /* Allocate space. */ ! 229: r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); ! 230: if (r == NULL) ! 231: FAIL("out of space"); ! 232: ! 233: /* Second pass: emit code. */ ! 234: regparse = exp; ! 235: regnpar = 1; ! 236: regcode = r->program; ! 237: regc(MAGIC); ! 238: if (reg(0, &flags) == NULL) ! 239: return(NULL); ! 240: ! 241: /* Dig out information for optimizations. */ ! 242: r->regstart = '\0'; /* Worst-case defaults. */ ! 243: r->reganch = 0; ! 244: r->regmust = NULL; ! 245: r->regmlen = 0; ! 246: scan = r->program+1; /* First BRANCH. */ ! 247: if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ ! 248: scan = OPERAND(scan); ! 249: ! 250: /* Starting-point info. */ ! 251: if (OP(scan) == EXACTLY) ! 252: r->regstart = *OPERAND(scan); ! 253: else if (OP(scan) == BOL) ! 254: r->reganch++; ! 255: ! 256: /* ! 257: * If there's something expensive in the r.e., find the ! 258: * longest literal string that must appear and make it the ! 259: * regmust. Resolve ties in favor of later strings, since ! 260: * the regstart check works with the beginning of the r.e. ! 261: * and avoiding duplication strengthens checking. Not a ! 262: * strong reason, but sufficient in the absence of others. ! 263: */ ! 264: if (flags&SPSTART) { ! 265: longest = NULL; ! 266: len = 0; ! 267: for (; scan != NULL; scan = regnext(scan)) ! 268: if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { ! 269: longest = OPERAND(scan); ! 270: len = strlen(OPERAND(scan)); ! 271: } ! 272: r->regmust = longest; ! 273: r->regmlen = len; ! 274: } ! 275: } ! 276: ! 277: return(r); ! 278: } ! 279: ! 280: /* ! 281: - reg - regular expression, i.e. main body or parenthesized thing ! 282: * ! 283: * Caller must absorb opening parenthesis. ! 284: * ! 285: * Combining parenthesis handling with the base level of regular expression ! 286: * is a trifle forced, but the need to tie the tails of the branches to what ! 287: * follows makes it hard to avoid. ! 288: */ ! 289: static char * ! 290: reg(paren, flagp) ! 291: int paren; /* Parenthesized? */ ! 292: int *flagp; ! 293: { ! 294: register char *ret; ! 295: register char *br; ! 296: register char *ender; ! 297: register int parno; ! 298: int flags; ! 299: ! 300: *flagp = HASWIDTH; /* Tentatively. */ ! 301: ! 302: /* Make an OPEN node, if parenthesized. */ ! 303: if (paren) { ! 304: if (regnpar >= NSUBEXP) ! 305: FAIL("too many ()"); ! 306: parno = regnpar; ! 307: regnpar++; ! 308: ret = regnode(OPEN+parno); ! 309: } else ! 310: ret = NULL; ! 311: ! 312: /* Pick up the branches, linking them together. */ ! 313: br = regbranch(&flags); ! 314: if (br == NULL) ! 315: return(NULL); ! 316: if (ret != NULL) ! 317: regtail(ret, br); /* OPEN -> first. */ ! 318: else ! 319: ret = br; ! 320: if (!(flags&HASWIDTH)) ! 321: *flagp &= ~HASWIDTH; ! 322: *flagp |= flags&SPSTART; ! 323: while (*regparse == '|' || *regparse == '\n') { ! 324: regparse++; ! 325: br = regbranch(&flags); ! 326: if (br == NULL) ! 327: return(NULL); ! 328: regtail(ret, br); /* BRANCH -> BRANCH. */ ! 329: if (!(flags&HASWIDTH)) ! 330: *flagp &= ~HASWIDTH; ! 331: *flagp |= flags&SPSTART; ! 332: } ! 333: ! 334: /* Make a closing node, and hook it on the end. */ ! 335: ender = regnode((paren) ? CLOSE+parno : END); ! 336: regtail(ret, ender); ! 337: ! 338: /* Hook the tails of the branches to the closing node. */ ! 339: for (br = ret; br != NULL; br = regnext(br)) ! 340: regoptail(br, ender); ! 341: ! 342: /* Check for proper termination. */ ! 343: if (paren && *regparse++ != ')') { ! 344: FAIL("unmatched ()"); ! 345: } else if (!paren && *regparse != '\0') { ! 346: if (*regparse == ')') { ! 347: FAIL("unmatched ()"); ! 348: } else ! 349: FAIL("junk on end"); /* "Can't happen". */ ! 350: /* NOTREACHED */ ! 351: } ! 352: ! 353: return(ret); ! 354: } ! 355: ! 356: /* ! 357: - regbranch - one alternative of an | operator ! 358: * ! 359: * Implements the concatenation operator. ! 360: */ ! 361: static char * ! 362: regbranch(flagp) ! 363: int *flagp; ! 364: { ! 365: register char *ret; ! 366: register char *chain; ! 367: register char *latest; ! 368: int flags; ! 369: ! 370: *flagp = WORST; /* Tentatively. */ ! 371: ! 372: ret = regnode(BRANCH); ! 373: chain = NULL; ! 374: while (*regparse != '\0' && *regparse != ')' && ! 375: *regparse != '\n' && *regparse != '|') { ! 376: latest = regpiece(&flags); ! 377: if (latest == NULL) ! 378: return(NULL); ! 379: *flagp |= flags&HASWIDTH; ! 380: if (chain == NULL) /* First piece. */ ! 381: *flagp |= flags&SPSTART; ! 382: else ! 383: regtail(chain, latest); ! 384: chain = latest; ! 385: } ! 386: if (chain == NULL) /* Loop ran zero times. */ ! 387: (void) regnode(NOTHING); ! 388: ! 389: return(ret); ! 390: } ! 391: ! 392: /* ! 393: - regpiece - something followed by possible [*+?] ! 394: * ! 395: * Note that the branching code sequences used for ? and the general cases ! 396: * of * and + are somewhat optimized: they use the same NOTHING node as ! 397: * both the endmarker for their branch list and the body of the last branch. ! 398: * It might seem that this node could be dispensed with entirely, but the ! 399: * endmarker role is not redundant. ! 400: */ ! 401: static char * ! 402: regpiece(flagp) ! 403: int *flagp; ! 404: { ! 405: register char *ret; ! 406: register char op; ! 407: register char *next; ! 408: int flags; ! 409: ! 410: ret = regatom(&flags); ! 411: if (ret == NULL) ! 412: return(NULL); ! 413: ! 414: op = *regparse; ! 415: if (!ISMULT(op)) { ! 416: *flagp = flags; ! 417: return(ret); ! 418: } ! 419: ! 420: if (!(flags&HASWIDTH) && op != '?') ! 421: FAIL("*+ operand could be empty"); ! 422: *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); ! 423: ! 424: if (op == '*' && (flags&SIMPLE)) ! 425: reginsert(STAR, ret); ! 426: else if (op == '*') { ! 427: /* Emit x* as (x&|), where & means "self". */ ! 428: reginsert(BRANCH, ret); /* Either x */ ! 429: regoptail(ret, regnode(BACK)); /* and loop */ ! 430: regoptail(ret, ret); /* back */ ! 431: regtail(ret, regnode(BRANCH)); /* or */ ! 432: regtail(ret, regnode(NOTHING)); /* null. */ ! 433: } else if (op == '+' && (flags&SIMPLE)) ! 434: reginsert(PLUS, ret); ! 435: else if (op == '+') { ! 436: /* Emit x+ as x(&|), where & means "self". */ ! 437: next = regnode(BRANCH); /* Either */ ! 438: regtail(ret, next); ! 439: regtail(regnode(BACK), ret); /* loop back */ ! 440: regtail(next, regnode(BRANCH)); /* or */ ! 441: regtail(ret, regnode(NOTHING)); /* null. */ ! 442: } else if (op == '?') { ! 443: /* Emit x? as (x|) */ ! 444: reginsert(BRANCH, ret); /* Either x */ ! 445: regtail(ret, regnode(BRANCH)); /* or */ ! 446: next = regnode(NOTHING); /* null. */ ! 447: regtail(ret, next); ! 448: regoptail(ret, next); ! 449: } ! 450: regparse++; ! 451: if (ISMULT(*regparse)) ! 452: FAIL("nested *?+"); ! 453: ! 454: return(ret); ! 455: } ! 456: ! 457: /* ! 458: - regatom - the lowest level ! 459: * ! 460: * Optimization: gobbles an entire sequence of ordinary characters so that ! 461: * it can turn them into a single node, which is smaller to store and ! 462: * faster to run. Backslashed characters are exceptions, each becoming a ! 463: * separate node; the code is simpler that way and it's not worth fixing. ! 464: */ ! 465: static char * ! 466: regatom(flagp) ! 467: int *flagp; ! 468: { ! 469: register char *ret; ! 470: int flags; ! 471: ! 472: *flagp = WORST; /* Tentatively. */ ! 473: ! 474: switch (*regparse++) { ! 475: /* FIXME: these chars only have meaning at beg/end of pat? */ ! 476: case '^': ! 477: ret = regnode(BOL); ! 478: break; ! 479: case '$': ! 480: ret = regnode(EOL); ! 481: break; ! 482: case '.': ! 483: ret = regnode(ANY); ! 484: *flagp |= HASWIDTH|SIMPLE; ! 485: break; ! 486: case '[': { ! 487: register int class; ! 488: register int classend; ! 489: ! 490: if (*regparse == '^') { /* Complement of range. */ ! 491: ret = regnode(ANYBUT); ! 492: regparse++; ! 493: } else ! 494: ret = regnode(ANYOF); ! 495: if (*regparse == ']' || *regparse == '-') ! 496: regc(*regparse++); ! 497: while (*regparse != '\0' && *regparse != ']') { ! 498: if (*regparse == '-') { ! 499: regparse++; ! 500: if (*regparse == ']' || *regparse == '\0') ! 501: regc('-'); ! 502: else { ! 503: class = UCHARAT(regparse-2)+1; ! 504: classend = UCHARAT(regparse); ! 505: if (class > classend+1) ! 506: FAIL("invalid [] range"); ! 507: for (; class <= classend; class++) ! 508: regc(class); ! 509: regparse++; ! 510: } ! 511: } else ! 512: regc(*regparse++); ! 513: } ! 514: regc('\0'); ! 515: if (*regparse != ']') ! 516: FAIL("unmatched []"); ! 517: regparse++; ! 518: *flagp |= HASWIDTH|SIMPLE; ! 519: } ! 520: break; ! 521: case '(': ! 522: ret = reg(1, &flags); ! 523: if (ret == NULL) ! 524: return(NULL); ! 525: *flagp |= flags&(HASWIDTH|SPSTART); ! 526: break; ! 527: case '\0': ! 528: case '|': ! 529: case '\n': ! 530: case ')': ! 531: FAIL("internal urp"); /* Supposed to be caught earlier. */ ! 532: break; ! 533: case '?': ! 534: case '+': ! 535: case '*': ! 536: FAIL("?+* follows nothing"); ! 537: break; ! 538: case '\\': ! 539: switch (*regparse++) { ! 540: case '\0': ! 541: FAIL("trailing \\"); ! 542: break; ! 543: case '<': ! 544: ret = regnode(WORDA); ! 545: break; ! 546: case '>': ! 547: ret = regnode(WORDZ); ! 548: break; ! 549: /* FIXME: Someday handle \1, \2, ... */ ! 550: default: ! 551: /* Handle general quoted chars in exact-match routine */ ! 552: goto de_fault; ! 553: } ! 554: break; ! 555: de_fault: ! 556: default: ! 557: /* ! 558: * Encode a string of characters to be matched exactly. ! 559: * ! 560: * This is a bit tricky due to quoted chars and due to ! 561: * '*', '+', and '?' taking the SINGLE char previous ! 562: * as their operand. ! 563: * ! 564: * On entry, the char at regparse[-1] is going to go ! 565: * into the string, no matter what it is. (It could be ! 566: * following a \ if we are entered from the '\' case.) ! 567: * ! 568: * Basic idea is to pick up a good char in ch and ! 569: * examine the next char. If it's *+? then we twiddle. ! 570: * If it's \ then we frozzle. If it's other magic char ! 571: * we push ch and terminate the string. If none of the ! 572: * above, we push ch on the string and go around again. ! 573: * ! 574: * regprev is used to remember where "the current char" ! 575: * starts in the string, if due to a *+? we need to back ! 576: * up and put the current char in a separate, 1-char, string. ! 577: * When regprev is NULL, ch is the only char in the ! 578: * string; this is used in *+? handling, and in setting ! 579: * flags |= SIMPLE at the end. ! 580: */ ! 581: { ! 582: char *regprev; ! 583: register char ch; ! 584: ! 585: regparse--; /* Look at cur char */ ! 586: ret = regnode(EXACTLY); ! 587: for ( regprev = 0 ; ; ) { ! 588: ch = *regparse++; /* Get current char */ ! 589: switch (*regparse) { /* look at next one */ ! 590: ! 591: default: ! 592: regc(ch); /* Add cur to string */ ! 593: break; ! 594: ! 595: case '.': case '[': case '(': ! 596: case ')': case '|': case '\n': ! 597: case '$': case '^': ! 598: case '\0': ! 599: /* FIXME, $ and ^ should not always be magic */ ! 600: magic: ! 601: regc(ch); /* dump cur char */ ! 602: goto done; /* and we are done */ ! 603: ! 604: case '?': case '+': case '*': ! 605: if (!regprev) /* If just ch in str, */ ! 606: goto magic; /* use it */ ! 607: /* End mult-char string one early */ ! 608: regparse = regprev; /* Back up parse */ ! 609: goto done; ! 610: ! 611: case '\\': ! 612: regc(ch); /* Cur char OK */ ! 613: switch (regparse[1]){ /* Look after \ */ ! 614: case '\0': ! 615: case '<': ! 616: case '>': ! 617: /* FIXME: Someday handle \1, \2, ... */ ! 618: goto done; /* Not quoted */ ! 619: default: ! 620: /* Backup point is \, scan * point is after it. */ ! 621: regprev = regparse; ! 622: regparse++; ! 623: continue; /* NOT break; */ ! 624: } ! 625: } ! 626: regprev = regparse; /* Set backup point */ ! 627: } ! 628: done: ! 629: regc('\0'); ! 630: *flagp |= HASWIDTH; ! 631: if (!regprev) /* One char? */ ! 632: *flagp |= SIMPLE; ! 633: } ! 634: break; ! 635: } ! 636: ! 637: return(ret); ! 638: } ! 639: ! 640: /* ! 641: - regnode - emit a node ! 642: */ ! 643: static char * /* Location. */ ! 644: regnode(op) ! 645: char op; ! 646: { ! 647: register char *ret; ! 648: register char *ptr; ! 649: ! 650: ret = regcode; ! 651: if (ret == ®dummy) { ! 652: regsize += 3; ! 653: return(ret); ! 654: } ! 655: ! 656: ptr = ret; ! 657: *ptr++ = op; ! 658: *ptr++ = '\0'; /* Null "next" pointer. */ ! 659: *ptr++ = '\0'; ! 660: regcode = ptr; ! 661: ! 662: return(ret); ! 663: } ! 664: ! 665: /* ! 666: - regc - emit (if appropriate) a byte of code ! 667: */ ! 668: static void ! 669: regc(b) ! 670: char b; ! 671: { ! 672: if (regcode != ®dummy) ! 673: *regcode++ = b; ! 674: else ! 675: regsize++; ! 676: } ! 677: ! 678: /* ! 679: - reginsert - insert an operator in front of already-emitted operand ! 680: * ! 681: * Means relocating the operand. ! 682: */ ! 683: static void ! 684: reginsert(op, opnd) ! 685: char op; ! 686: char *opnd; ! 687: { ! 688: register char *src; ! 689: register char *dst; ! 690: register char *place; ! 691: ! 692: if (regcode == ®dummy) { ! 693: regsize += 3; ! 694: return; ! 695: } ! 696: ! 697: src = regcode; ! 698: regcode += 3; ! 699: dst = regcode; ! 700: while (src > opnd) ! 701: *--dst = *--src; ! 702: ! 703: place = opnd; /* Op node, where operand used to be. */ ! 704: *place++ = op; ! 705: *place++ = '\0'; ! 706: *place++ = '\0'; ! 707: } ! 708: ! 709: /* ! 710: - regtail - set the next-pointer at the end of a node chain ! 711: */ ! 712: static void ! 713: regtail(p, val) ! 714: char *p; ! 715: char *val; ! 716: { ! 717: register char *scan; ! 718: register char *temp; ! 719: register int offset; ! 720: ! 721: if (p == ®dummy) ! 722: return; ! 723: ! 724: /* Find last node. */ ! 725: scan = p; ! 726: for (;;) { ! 727: temp = regnext(scan); ! 728: if (temp == NULL) ! 729: break; ! 730: scan = temp; ! 731: } ! 732: ! 733: if (OP(scan) == BACK) ! 734: offset = scan - val; ! 735: else ! 736: offset = val - scan; ! 737: *(scan+1) = (offset>>8)&0377; ! 738: *(scan+2) = offset&0377; ! 739: } ! 740: ! 741: /* ! 742: - regoptail - regtail on operand of first argument; nop if operandless ! 743: */ ! 744: static void ! 745: regoptail(p, val) ! 746: char *p; ! 747: char *val; ! 748: { ! 749: /* "Operandless" and "op != BRANCH" are synonymous in practice. */ ! 750: if (p == NULL || p == ®dummy || OP(p) != BRANCH) ! 751: return; ! 752: regtail(OPERAND(p), val); ! 753: } ! 754: ! 755: /* ! 756: * regexec and friends ! 757: */ ! 758: ! 759: /* ! 760: * Global work variables for regexec(). ! 761: */ ! 762: static char *reginput; /* String-input pointer. */ ! 763: static char *regbol; /* Beginning of input, for ^ check. */ ! 764: static char **regstartp; /* Pointer to startp array. */ ! 765: static char **regendp; /* Ditto for endp. */ ! 766: ! 767: /* ! 768: * Forwards. ! 769: */ ! 770: STATIC int regtry(); ! 771: STATIC int regmatch(); ! 772: STATIC int regrepeat(); ! 773: ! 774: #ifdef DEBUG ! 775: int regnarrate = 0; ! 776: void regdump(); ! 777: STATIC char *regprop(); ! 778: #endif ! 779: ! 780: /* ! 781: - regexec - match a regexp against a string ! 782: */ ! 783: int ! 784: regexec(prog, string) ! 785: register regexp *prog; ! 786: register char *string; ! 787: { ! 788: register char *s; ! 789: extern char *strchr(); ! 790: ! 791: /* Be paranoid... */ ! 792: if (prog == NULL || string == NULL) { ! 793: regerror("NULL parameter"); ! 794: return(0); ! 795: } ! 796: ! 797: /* Check validity of program. */ ! 798: if (UCHARAT(prog->program) != MAGIC) { ! 799: regerror("corrupted program"); ! 800: return(0); ! 801: } ! 802: ! 803: /* If there is a "must appear" string, look for it. */ ! 804: if (prog->regmust != NULL) { ! 805: s = string; ! 806: while ((s = strchr(s, prog->regmust[0])) != NULL) { ! 807: if (strncmp(s, prog->regmust, prog->regmlen) == 0) ! 808: break; /* Found it. */ ! 809: s++; ! 810: } ! 811: if (s == NULL) /* Not present. */ ! 812: return(0); ! 813: } ! 814: ! 815: /* Mark beginning of line for ^ . */ ! 816: regbol = string; ! 817: ! 818: /* Simplest case: anchored match need be tried only once. */ ! 819: if (prog->reganch) ! 820: return(regtry(prog, string)); ! 821: ! 822: /* Messy cases: unanchored match. */ ! 823: s = string; ! 824: if (prog->regstart != '\0') ! 825: /* We know what char it must start with. */ ! 826: while ((s = strchr(s, prog->regstart)) != NULL) { ! 827: if (regtry(prog, s)) ! 828: return(1); ! 829: s++; ! 830: } ! 831: else ! 832: /* We don't -- general case. */ ! 833: do { ! 834: if (regtry(prog, s)) ! 835: return(1); ! 836: } while (*s++ != '\0'); ! 837: ! 838: /* Failure. */ ! 839: return(0); ! 840: } ! 841: ! 842: /* ! 843: - regtry - try match at specific point ! 844: */ ! 845: static int /* 0 failure, 1 success */ ! 846: regtry(prog, string) ! 847: regexp *prog; ! 848: char *string; ! 849: { ! 850: register int i; ! 851: register char **sp; ! 852: register char **ep; ! 853: ! 854: reginput = string; ! 855: regstartp = prog->startp; ! 856: regendp = prog->endp; ! 857: ! 858: sp = prog->startp; ! 859: ep = prog->endp; ! 860: for (i = NSUBEXP; i > 0; i--) { ! 861: *sp++ = NULL; ! 862: *ep++ = NULL; ! 863: } ! 864: if (regmatch(prog->program + 1)) { ! 865: prog->startp[0] = string; ! 866: prog->endp[0] = reginput; ! 867: return(1); ! 868: } else ! 869: return(0); ! 870: } ! 871: ! 872: /* ! 873: - regmatch - main matching routine ! 874: * ! 875: * Conceptually the strategy is simple: check to see whether the current ! 876: * node matches, call self recursively to see whether the rest matches, ! 877: * and then act accordingly. In practice we make some effort to avoid ! 878: * recursion, in particular by going through "ordinary" nodes (that don't ! 879: * need to know whether the rest of the match failed) by a loop instead of ! 880: * by recursion. ! 881: */ ! 882: static int /* 0 failure, 1 success */ ! 883: regmatch(prog) ! 884: char *prog; ! 885: { ! 886: register char *scan; /* Current node. */ ! 887: char *next; /* Next node. */ ! 888: extern char *strchr(); ! 889: ! 890: scan = prog; ! 891: #ifdef DEBUG ! 892: if (scan != NULL && regnarrate) ! 893: fprintf(stderr, "%s(\n", regprop(scan)); ! 894: #endif ! 895: while (scan != NULL) { ! 896: #ifdef DEBUG ! 897: if (regnarrate) ! 898: fprintf(stderr, "%s...\n", regprop(scan)); ! 899: #endif ! 900: next = regnext(scan); ! 901: ! 902: switch (OP(scan)) { ! 903: case BOL: ! 904: if (reginput != regbol) ! 905: return(0); ! 906: break; ! 907: case EOL: ! 908: if (*reginput != '\0') ! 909: return(0); ! 910: break; ! 911: case WORDA: ! 912: /* Must be looking at a letter, digit, or _ */ ! 913: if ((!isalnum(*reginput)) && *reginput != '_') ! 914: return(0); ! 915: /* Prev must be BOL or nonword */ ! 916: if (reginput > regbol && ! 917: (isalnum(reginput[-1]) || reginput[-1] == '_')) ! 918: return(0); ! 919: break; ! 920: case WORDZ: ! 921: /* Must be looking at non letter, digit, or _ */ ! 922: if (isalnum(*reginput) || *reginput == '_') ! 923: return(0); ! 924: /* We don't care what the previous char was */ ! 925: break; ! 926: case ANY: ! 927: if (*reginput == '\0') ! 928: return(0); ! 929: reginput++; ! 930: break; ! 931: case EXACTLY: { ! 932: register int len; ! 933: register char *opnd; ! 934: ! 935: opnd = OPERAND(scan); ! 936: /* Inline the first character, for speed. */ ! 937: if (*opnd != *reginput) ! 938: return(0); ! 939: len = strlen(opnd); ! 940: if (len > 1 && strncmp(opnd, reginput, len) != 0) ! 941: return(0); ! 942: reginput += len; ! 943: } ! 944: break; ! 945: case ANYOF: ! 946: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) ! 947: return(0); ! 948: reginput++; ! 949: break; ! 950: case ANYBUT: ! 951: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) ! 952: return(0); ! 953: reginput++; ! 954: break; ! 955: case NOTHING: ! 956: break; ! 957: case BACK: ! 958: break; ! 959: case OPEN+1: ! 960: case OPEN+2: ! 961: case OPEN+3: ! 962: case OPEN+4: ! 963: case OPEN+5: ! 964: case OPEN+6: ! 965: case OPEN+7: ! 966: case OPEN+8: ! 967: case OPEN+9: { ! 968: register int no; ! 969: register char *save; ! 970: ! 971: no = OP(scan) - OPEN; ! 972: save = reginput; ! 973: ! 974: if (regmatch(next)) { ! 975: /* ! 976: * Don't set startp if some later ! 977: * invocation of the same parentheses ! 978: * already has. ! 979: */ ! 980: if (regstartp[no] == NULL) ! 981: regstartp[no] = save; ! 982: return(1); ! 983: } else ! 984: return(0); ! 985: } ! 986: break; ! 987: case CLOSE+1: ! 988: case CLOSE+2: ! 989: case CLOSE+3: ! 990: case CLOSE+4: ! 991: case CLOSE+5: ! 992: case CLOSE+6: ! 993: case CLOSE+7: ! 994: case CLOSE+8: ! 995: case CLOSE+9: { ! 996: register int no; ! 997: register char *save; ! 998: ! 999: no = OP(scan) - CLOSE; ! 1000: save = reginput; ! 1001: ! 1002: if (regmatch(next)) { ! 1003: /* ! 1004: * Don't set endp if some later ! 1005: * invocation of the same parentheses ! 1006: * already has. ! 1007: */ ! 1008: if (regendp[no] == NULL) ! 1009: regendp[no] = save; ! 1010: return(1); ! 1011: } else ! 1012: return(0); ! 1013: } ! 1014: break; ! 1015: case BRANCH: { ! 1016: register char *save; ! 1017: ! 1018: if (OP(next) != BRANCH) /* No choice. */ ! 1019: next = OPERAND(scan); /* Avoid recursion. */ ! 1020: else { ! 1021: do { ! 1022: save = reginput; ! 1023: if (regmatch(OPERAND(scan))) ! 1024: return(1); ! 1025: reginput = save; ! 1026: scan = regnext(scan); ! 1027: } while (scan != NULL && OP(scan) == BRANCH); ! 1028: return(0); ! 1029: /* NOTREACHED */ ! 1030: } ! 1031: } ! 1032: break; ! 1033: case STAR: ! 1034: case PLUS: { ! 1035: register char nextch; ! 1036: register int no; ! 1037: register char *save; ! 1038: register int min; ! 1039: ! 1040: /* ! 1041: * Lookahead to avoid useless match attempts ! 1042: * when we know what character comes next. ! 1043: */ ! 1044: nextch = '\0'; ! 1045: if (OP(next) == EXACTLY) ! 1046: nextch = *OPERAND(next); ! 1047: min = (OP(scan) == STAR) ? 0 : 1; ! 1048: save = reginput; ! 1049: no = regrepeat(OPERAND(scan)); ! 1050: while (no >= min) { ! 1051: /* If it could work, try it. */ ! 1052: if (nextch == '\0' || *reginput == nextch) ! 1053: if (regmatch(next)) ! 1054: return(1); ! 1055: /* Couldn't or didn't -- back up. */ ! 1056: no--; ! 1057: reginput = save + no; ! 1058: } ! 1059: return(0); ! 1060: } ! 1061: break; ! 1062: case END: ! 1063: return(1); /* Success! */ ! 1064: break; ! 1065: default: ! 1066: regerror("memory corruption"); ! 1067: return(0); ! 1068: break; ! 1069: } ! 1070: ! 1071: scan = next; ! 1072: } ! 1073: ! 1074: /* ! 1075: * We get here only if there's trouble -- normally "case END" is ! 1076: * the terminating point. ! 1077: */ ! 1078: regerror("corrupted pointers"); ! 1079: return(0); ! 1080: } ! 1081: ! 1082: /* ! 1083: - regrepeat - repeatedly match something simple, report how many ! 1084: */ ! 1085: static int ! 1086: regrepeat(p) ! 1087: char *p; ! 1088: { ! 1089: register int count = 0; ! 1090: register char *scan; ! 1091: register char *opnd; ! 1092: ! 1093: scan = reginput; ! 1094: opnd = OPERAND(p); ! 1095: switch (OP(p)) { ! 1096: case ANY: ! 1097: count = strlen(scan); ! 1098: scan += count; ! 1099: break; ! 1100: case EXACTLY: ! 1101: while (*opnd == *scan) { ! 1102: count++; ! 1103: scan++; ! 1104: } ! 1105: break; ! 1106: case ANYOF: ! 1107: while (*scan != '\0' && strchr(opnd, *scan) != NULL) { ! 1108: count++; ! 1109: scan++; ! 1110: } ! 1111: break; ! 1112: case ANYBUT: ! 1113: while (*scan != '\0' && strchr(opnd, *scan) == NULL) { ! 1114: count++; ! 1115: scan++; ! 1116: } ! 1117: break; ! 1118: default: /* Oh dear. Called inappropriately. */ ! 1119: regerror("internal foulup"); ! 1120: count = 0; /* Best compromise. */ ! 1121: break; ! 1122: } ! 1123: reginput = scan; ! 1124: ! 1125: return(count); ! 1126: } ! 1127: ! 1128: /* ! 1129: - regnext - dig the "next" pointer out of a node ! 1130: */ ! 1131: static char * ! 1132: regnext(p) ! 1133: register char *p; ! 1134: { ! 1135: register int offset; ! 1136: ! 1137: if (p == ®dummy) ! 1138: return(NULL); ! 1139: ! 1140: offset = NEXT(p); ! 1141: if (offset == 0) ! 1142: return(NULL); ! 1143: ! 1144: if (OP(p) == BACK) ! 1145: return(p-offset); ! 1146: else ! 1147: return(p+offset); ! 1148: } ! 1149: ! 1150: #ifdef DEBUG ! 1151: ! 1152: STATIC char *regprop(); ! 1153: ! 1154: /* ! 1155: - regdump - dump a regexp onto stdout in vaguely comprehensible form ! 1156: */ ! 1157: void ! 1158: regdump(r) ! 1159: regexp *r; ! 1160: { ! 1161: register char *s; ! 1162: register char op = EXACTLY; /* Arbitrary non-END op. */ ! 1163: register char *next; ! 1164: extern char *strchr(); ! 1165: ! 1166: ! 1167: s = r->program + 1; ! 1168: while (op != END) { /* While that wasn't END last time... */ ! 1169: op = OP(s); ! 1170: printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ ! 1171: next = regnext(s); ! 1172: if (next == NULL) /* Next ptr. */ ! 1173: printf("(0)"); ! 1174: else ! 1175: printf("(%d)", (s-r->program)+(next-s)); ! 1176: s += 3; ! 1177: if (op == ANYOF || op == ANYBUT || op == EXACTLY) { ! 1178: /* Literal string, where present. */ ! 1179: while (*s != '\0') { ! 1180: putchar(*s); ! 1181: s++; ! 1182: } ! 1183: s++; ! 1184: } ! 1185: putchar('\n'); ! 1186: } ! 1187: ! 1188: /* Header fields of interest. */ ! 1189: if (r->regstart != '\0') ! 1190: printf("start `%c' ", r->regstart); ! 1191: if (r->reganch) ! 1192: printf("anchored "); ! 1193: if (r->regmust != NULL) ! 1194: printf("must have \"%s\"", r->regmust); ! 1195: printf("\n"); ! 1196: } ! 1197: ! 1198: /* ! 1199: - regprop - printable representation of opcode ! 1200: */ ! 1201: static char * ! 1202: regprop(op) ! 1203: char *op; ! 1204: { ! 1205: register char *p; ! 1206: static char buf[50]; ! 1207: ! 1208: (void) strcpy(buf, ":"); ! 1209: ! 1210: switch (OP(op)) { ! 1211: case BOL: ! 1212: p = "BOL"; ! 1213: break; ! 1214: case EOL: ! 1215: p = "EOL"; ! 1216: break; ! 1217: case ANY: ! 1218: p = "ANY"; ! 1219: break; ! 1220: case ANYOF: ! 1221: p = "ANYOF"; ! 1222: break; ! 1223: case ANYBUT: ! 1224: p = "ANYBUT"; ! 1225: break; ! 1226: case BRANCH: ! 1227: p = "BRANCH"; ! 1228: break; ! 1229: case EXACTLY: ! 1230: p = "EXACTLY"; ! 1231: break; ! 1232: case NOTHING: ! 1233: p = "NOTHING"; ! 1234: break; ! 1235: case BACK: ! 1236: p = "BACK"; ! 1237: break; ! 1238: case END: ! 1239: p = "END"; ! 1240: break; ! 1241: case OPEN+1: ! 1242: case OPEN+2: ! 1243: case OPEN+3: ! 1244: case OPEN+4: ! 1245: case OPEN+5: ! 1246: case OPEN+6: ! 1247: case OPEN+7: ! 1248: case OPEN+8: ! 1249: case OPEN+9: ! 1250: sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); ! 1251: p = NULL; ! 1252: break; ! 1253: case CLOSE+1: ! 1254: case CLOSE+2: ! 1255: case CLOSE+3: ! 1256: case CLOSE+4: ! 1257: case CLOSE+5: ! 1258: case CLOSE+6: ! 1259: case CLOSE+7: ! 1260: case CLOSE+8: ! 1261: case CLOSE+9: ! 1262: sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); ! 1263: p = NULL; ! 1264: break; ! 1265: case STAR: ! 1266: p = "STAR"; ! 1267: break; ! 1268: case PLUS: ! 1269: p = "PLUS"; ! 1270: break; ! 1271: case WORDA: ! 1272: p = "WORDA"; ! 1273: break; ! 1274: case WORDZ: ! 1275: p = "WORDZ"; ! 1276: break; ! 1277: default: ! 1278: regerror("corrupted opcode"); ! 1279: break; ! 1280: } ! 1281: if (p != NULL) ! 1282: (void) strcat(buf, p); ! 1283: return(buf); ! 1284: } ! 1285: #endif ! 1286: ! 1287: /* ! 1288: * The following is provided for those people who do not have strcspn() in ! 1289: * their C libraries. They should get off their butts and do something ! 1290: * about it; at least one public-domain implementation of those (highly ! 1291: * useful) string routines has been published on Usenet. ! 1292: */ ! 1293: #ifdef STRCSPN ! 1294: /* ! 1295: * strcspn - find length of initial segment of s1 consisting entirely ! 1296: * of characters not from s2 ! 1297: */ ! 1298: ! 1299: static int ! 1300: strcspn(s1, s2) ! 1301: char *s1; ! 1302: char *s2; ! 1303: { ! 1304: register char *scan1; ! 1305: register char *scan2; ! 1306: register int count; ! 1307: ! 1308: count = 0; ! 1309: for (scan1 = s1; *scan1 != '\0'; scan1++) { ! 1310: for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ ! 1311: if (*scan1 == *scan2++) ! 1312: return(count); ! 1313: count++; ! 1314: } ! 1315: return(count); ! 1316: } ! 1317: #endif
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.