|
|
1.1 ! root 1: # ! 2: ! 3: /* ! 4: * routines to do regular expression matching ! 5: * ! 6: * Entry points: ! 7: * ! 8: * re_comp(s) ! 9: * char *s; ! 10: * ... returns 0 if the string s was compiled successfully, ! 11: * a pointer to an error message otherwise. ! 12: * If passed 0 or a null string returns without changing ! 13: * the currently compiled re (see note 11 below). ! 14: * ! 15: * re_exec(s) ! 16: * char *s; ! 17: * ... returns 1 if the string s matches the last compiled regular ! 18: * expression, ! 19: * 0 if the string s failed to match the last compiled ! 20: * regular expression, and ! 21: * -1 if the compiled regular expression was invalid ! 22: * (indicating an internal error). ! 23: * ! 24: * The strings passed to both re_comp and re_exec may have trailing or ! 25: * embedded newline characters; they are terminated by nulls. ! 26: * ! 27: * The identity of the author of these routines is lost in antiquity; ! 28: * this is essentially the same as the re code in the original V6 ed. ! 29: * ! 30: * The regular expressions recognized are described below. This description ! 31: * is essentially the same as that for ed. ! 32: * ! 33: * A regular expression specifies a set of strings of characters. ! 34: * A member of this set of strings is said to be matched by ! 35: * the regular expression. In the following specification for ! 36: * regular expressions the word `character' means any character but NUL. ! 37: * ! 38: * 1. Any character except a special character matches itself. ! 39: * Special characters are the regular expression delimiter plus ! 40: * \ [ . and sometimes ^ * $. ! 41: * 2. A . matches any character. ! 42: * 3. A \ followed by any character except a digit or ( ) ! 43: * matches that character. ! 44: * 4. A nonempty string s bracketed [s] (or [^s]) matches any ! 45: * character in (or not in) s. In s, \ has no special meaning, ! 46: * and ] may only appear as the first letter. A substring ! 47: * a-b, with a and b in ascending ASCII order, stands for ! 48: * the inclusive range of ASCII characters. ! 49: * 5. A regular expression of form 1-4 followed by * matches a ! 50: * sequence of 0 or more matches of the regular expression. ! 51: * 6. A regular expression, x, of form 1-8, bracketed \(x\) ! 52: * matches what x matches. ! 53: * 7. A \ followed by a digit n matches a copy of the string that the ! 54: * bracketed regular expression beginning with the nth \( matched. ! 55: * 8. A regular expression of form 1-8, x, followed by a regular ! 56: * expression of form 1-7, y matches a match for x followed by ! 57: * a match for y, with the x match being as long as possible ! 58: * while still permitting a y match. ! 59: * 9. A regular expression of form 1-8 preceded by ^ (or followed ! 60: * by $), is constrained to matches that begin at the left ! 61: * (or end at the right) end of a line. ! 62: * 10. A regular expression of form 1-9 picks out the longest among ! 63: * the leftmost matches in a line. ! 64: * 11. An empty regular expression stands for a copy of the last ! 65: * regular expression encountered. ! 66: */ ! 67: ! 68: /* ! 69: * constants for re's ! 70: */ ! 71: #define CBRA 1 ! 72: #define CCHR 2 ! 73: #define CDOT 4 ! 74: #define CCL 6 ! 75: #define NCCL 8 ! 76: #define CDOL 10 ! 77: #define CEOF 11 ! 78: #define CKET 12 ! 79: #define CBACK 18 ! 80: ! 81: #define CSTAR 01 ! 82: ! 83: #define ESIZE 512 ! 84: #define NBRA 9 ! 85: ! 86: static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; ! 87: static char circf; ! 88: ! 89: /* ! 90: * compile the regular expression argument into a dfa ! 91: */ ! 92: char * ! 93: re_comp(sp) ! 94: register char *sp; ! 95: { ! 96: register int c; ! 97: register char *ep = expbuf; ! 98: int cclcnt, numbra = 0; ! 99: char *lastep = 0; ! 100: char bracket[NBRA]; ! 101: char *bracketp = &bracket[0]; ! 102: static char *retoolong = "Regular expression too long"; ! 103: ! 104: #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } ! 105: ! 106: if (sp == 0 || *sp == '\0') { ! 107: if (*ep == 0) ! 108: return("No previous regular expression"); ! 109: return(0); ! 110: } ! 111: if (*sp == '^') { ! 112: circf = 1; ! 113: sp++; ! 114: } ! 115: else ! 116: circf = 0; ! 117: for (;;) { ! 118: if (ep >= &expbuf[ESIZE]) ! 119: comerr(retoolong); ! 120: if ((c = *sp++) == '\0') { ! 121: if (bracketp != bracket) ! 122: comerr("unmatched \\("); ! 123: *ep++ = CEOF; ! 124: *ep++ = 0; ! 125: return(0); ! 126: } ! 127: if (c != '*') ! 128: lastep = ep; ! 129: switch (c) { ! 130: ! 131: case '.': ! 132: *ep++ = CDOT; ! 133: continue; ! 134: ! 135: case '*': ! 136: if (lastep == 0 || *lastep == CBRA || *lastep == CKET) ! 137: goto defchar; ! 138: *lastep |= CSTAR; ! 139: continue; ! 140: ! 141: case '$': ! 142: if (*sp != '\0') ! 143: goto defchar; ! 144: *ep++ = CDOL; ! 145: continue; ! 146: ! 147: case '[': ! 148: *ep++ = CCL; ! 149: *ep++ = 0; ! 150: cclcnt = 1; ! 151: if ((c = *sp++) == '^') { ! 152: c = *sp++; ! 153: ep[-2] = NCCL; ! 154: } ! 155: do { ! 156: if (c == '\0') ! 157: comerr("missing ]"); ! 158: if (c == '-' && ep [-1] != 0) { ! 159: if ((c = *sp++) == ']') { ! 160: *ep++ = '-'; ! 161: cclcnt++; ! 162: break; ! 163: } ! 164: while (ep[-1] < c) { ! 165: *ep = ep[-1] + 1; ! 166: ep++; ! 167: cclcnt++; ! 168: if (ep >= &expbuf[ESIZE]) ! 169: comerr(retoolong); ! 170: } ! 171: } ! 172: *ep++ = c; ! 173: cclcnt++; ! 174: if (ep >= &expbuf[ESIZE]) ! 175: comerr(retoolong); ! 176: } while ((c = *sp++) != ']'); ! 177: lastep[1] = cclcnt; ! 178: continue; ! 179: ! 180: case '\\': ! 181: if ((c = *sp++) == '(') { ! 182: if (numbra >= NBRA) ! 183: comerr("too many \\(\\) pairs"); ! 184: *bracketp++ = numbra; ! 185: *ep++ = CBRA; ! 186: *ep++ = numbra++; ! 187: continue; ! 188: } ! 189: if (c == ')') { ! 190: if (bracketp <= bracket) ! 191: comerr("unmatched \\)"); ! 192: *ep++ = CKET; ! 193: *ep++ = *--bracketp; ! 194: continue; ! 195: } ! 196: if (c >= '1' && c < ('1' + NBRA)) { ! 197: *ep++ = CBACK; ! 198: *ep++ = c - '1'; ! 199: continue; ! 200: } ! 201: *ep++ = CCHR; ! 202: *ep++ = c; ! 203: continue; ! 204: ! 205: defchar: ! 206: default: ! 207: *ep++ = CCHR; ! 208: *ep++ = c; ! 209: } ! 210: } ! 211: } ! 212: ! 213: /* ! 214: * match the argument string against the compiled re ! 215: */ ! 216: int ! 217: re_exec(p1) ! 218: register char *p1; ! 219: { ! 220: register char *p2 = expbuf; ! 221: register int c; ! 222: int rv; ! 223: ! 224: for (c = 0; c < NBRA; c++) { ! 225: braslist[c] = 0; ! 226: braelist[c] = 0; ! 227: } ! 228: if (circf) ! 229: return((advance(p1, p2))); ! 230: /* ! 231: * fast check for first character ! 232: */ ! 233: if (*p2 == CCHR) { ! 234: c = p2[1]; ! 235: do { ! 236: if (*p1 != c) ! 237: continue; ! 238: if (rv = advance(p1, p2)) ! 239: return(rv); ! 240: } while (*p1++); ! 241: return(0); ! 242: } ! 243: /* ! 244: * regular algorithm ! 245: */ ! 246: do ! 247: if (rv = advance(p1, p2)) ! 248: return(rv); ! 249: while (*p1++); ! 250: return(0); ! 251: } ! 252: ! 253: /* ! 254: * try to match the next thing in the dfa ! 255: */ ! 256: static int ! 257: advance(lp, ep) ! 258: register char *lp, *ep; ! 259: { ! 260: register char *curlp; ! 261: int ct, i; ! 262: int rv; ! 263: ! 264: for (;;) ! 265: switch (*ep++) { ! 266: ! 267: case CCHR: ! 268: if (*ep++ == *lp++) ! 269: continue; ! 270: return(0); ! 271: ! 272: case CDOT: ! 273: if (*lp++) ! 274: continue; ! 275: return(0); ! 276: ! 277: case CDOL: ! 278: if (*lp == '\0') ! 279: continue; ! 280: return(0); ! 281: ! 282: case CEOF: ! 283: return(1); ! 284: ! 285: case CCL: ! 286: if (cclass(ep, *lp++, 1)) { ! 287: ep += *ep; ! 288: continue; ! 289: } ! 290: return(0); ! 291: ! 292: case NCCL: ! 293: if (cclass(ep, *lp++, 0)) { ! 294: ep += *ep; ! 295: continue; ! 296: } ! 297: return(0); ! 298: ! 299: case CBRA: ! 300: braslist[*ep++] = lp; ! 301: continue; ! 302: ! 303: case CKET: ! 304: braelist[*ep++] = lp; ! 305: continue; ! 306: ! 307: case CBACK: ! 308: if (braelist[i = *ep++] == 0) ! 309: return(-1); ! 310: if (backref(i, lp)) { ! 311: lp += braelist[i] - braslist[i]; ! 312: continue; ! 313: } ! 314: return(0); ! 315: ! 316: case CBACK|CSTAR: ! 317: if (braelist[i = *ep++] == 0) ! 318: return(-1); ! 319: curlp = lp; ! 320: ct = braelist[i] - braslist[i]; ! 321: while (backref(i, lp)) ! 322: lp += ct; ! 323: while (lp >= curlp) { ! 324: if (rv = advance(lp, ep)) ! 325: return(rv); ! 326: lp -= ct; ! 327: } ! 328: continue; ! 329: ! 330: case CDOT|CSTAR: ! 331: curlp = lp; ! 332: while (*lp++) ! 333: ; ! 334: goto star; ! 335: ! 336: case CCHR|CSTAR: ! 337: curlp = lp; ! 338: while (*lp++ == *ep) ! 339: ; ! 340: ep++; ! 341: goto star; ! 342: ! 343: case CCL|CSTAR: ! 344: case NCCL|CSTAR: ! 345: curlp = lp; ! 346: while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) ! 347: ; ! 348: ep += *ep; ! 349: goto star; ! 350: ! 351: star: ! 352: do { ! 353: lp--; ! 354: if (rv = advance(lp, ep)) ! 355: return(rv); ! 356: } while (lp > curlp); ! 357: return(0); ! 358: ! 359: default: ! 360: return(-1); ! 361: } ! 362: } ! 363: ! 364: backref(i, lp) ! 365: register int i; ! 366: register char *lp; ! 367: { ! 368: register char *bp; ! 369: ! 370: bp = braslist[i]; ! 371: while (*bp++ == *lp++) ! 372: if (bp >= braelist[i]) ! 373: return(1); ! 374: return(0); ! 375: } ! 376: ! 377: int ! 378: cclass(set, c, af) ! 379: register char *set, c; ! 380: int af; ! 381: { ! 382: register int n; ! 383: ! 384: if (c == 0) ! 385: return(0); ! 386: n = *set++; ! 387: while (--n) ! 388: if (*set++ == c) ! 389: return(af); ! 390: return(! af); ! 391: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.