|
|
1.1 ! root 1: /* @(#)regex.c 4.1 (Berkeley) 12/21/80 */ ! 2: # ! 3: ! 4: /* ! 5: * routines to do regular expression matching ! 6: * ! 7: * Entry points: ! 8: * ! 9: * re_comp(s) ! 10: * char *s; ! 11: * ... returns 0 if the string s was compiled successfully, ! 12: * a pointer to an error message otherwise. ! 13: * If passed 0 or a null string returns without changing ! 14: * the currently compiled re (see note 11 below). ! 15: * ! 16: * re_exec(s) ! 17: * char *s; ! 18: * ... returns 1 if the string s matches the last compiled regular ! 19: * expression, ! 20: * 0 if the string s failed to match the last compiled ! 21: * regular expression, and ! 22: * -1 if the compiled regular expression was invalid ! 23: * (indicating an internal error). ! 24: * ! 25: * The strings passed to both re_comp and re_exec may have trailing or ! 26: * embedded newline characters; they are terminated by nulls. ! 27: * ! 28: * The identity of the author of these routines is lost in antiquity; ! 29: * this is essentially the same as the re code in the original V6 ed. ! 30: * ! 31: * The regular expressions recognized are described below. This description ! 32: * is essentially the same as that for ed. ! 33: * ! 34: * A regular expression specifies a set of strings of characters. ! 35: * A member of this set of strings is said to be matched by ! 36: * the regular expression. In the following specification for ! 37: * regular expressions the word `character' means any character but NUL. ! 38: * ! 39: * 1. Any character except a special character matches itself. ! 40: * Special characters are the regular expression delimiter plus ! 41: * \ [ . and sometimes ^ * $. ! 42: * 2. A . matches any character. ! 43: * 3. A \ followed by any character except a digit or ( ) ! 44: * matches that character. ! 45: * 4. A nonempty string s bracketed [s] (or [^s]) matches any ! 46: * character in (or not in) s. In s, \ has no special meaning, ! 47: * and ] may only appear as the first letter. A substring ! 48: * a-b, with a and b in ascending ASCII order, stands for ! 49: * the inclusive range of ASCII characters. ! 50: * 5. A regular expression of form 1-4 followed by * matches a ! 51: * sequence of 0 or more matches of the regular expression. ! 52: * 6. A regular expression, x, of form 1-8, bracketed \(x\) ! 53: * matches what x matches. ! 54: * 7. A \ followed by a digit n matches a copy of the string that the ! 55: * bracketed regular expression beginning with the nth \( matched. ! 56: * 8. A regular expression of form 1-8, x, followed by a regular ! 57: * expression of form 1-7, y matches a match for x followed by ! 58: * a match for y, with the x match being as long as possible ! 59: * while still permitting a y match. ! 60: * 9. A regular expression of form 1-8 preceded by ^ (or followed ! 61: * by $), is constrained to matches that begin at the left ! 62: * (or end at the right) end of a line. ! 63: * 10. A regular expression of form 1-9 picks out the longest among ! 64: * the leftmost matches in a line. ! 65: * 11. An empty regular expression stands for a copy of the last ! 66: * regular expression encountered. ! 67: */ ! 68: ! 69: /* ! 70: * constants for re's ! 71: */ ! 72: #define CBRA 1 ! 73: #define CCHR 2 ! 74: #define CDOT 4 ! 75: #define CCL 6 ! 76: #define NCCL 8 ! 77: #define CDOL 10 ! 78: #define CEOF 11 ! 79: #define CKET 12 ! 80: #define CBACK 18 ! 81: ! 82: #define CSTAR 01 ! 83: ! 84: #define ESIZE 512 ! 85: #define NBRA 9 ! 86: ! 87: static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; ! 88: static char circf; ! 89: ! 90: /* ! 91: * compile the regular expression argument into a dfa ! 92: */ ! 93: char * ! 94: re_comp(sp) ! 95: register char *sp; ! 96: { ! 97: register int c; ! 98: register char *ep = expbuf; ! 99: int cclcnt, numbra = 0; ! 100: char *lastep = 0; ! 101: char bracket[NBRA]; ! 102: char *bracketp = &bracket[0]; ! 103: static char *retoolong = "Regular expression too long"; ! 104: ! 105: #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } ! 106: ! 107: if (sp == 0 || *sp == '\0') { ! 108: if (*ep == 0) ! 109: return("No previous regular expression"); ! 110: return(0); ! 111: } ! 112: if (*sp == '^') { ! 113: circf = 1; ! 114: sp++; ! 115: } ! 116: else ! 117: circf = 0; ! 118: for (;;) { ! 119: if (ep >= &expbuf[ESIZE]) ! 120: comerr(retoolong); ! 121: if ((c = *sp++) == '\0') { ! 122: if (bracketp != bracket) ! 123: comerr("unmatched \\("); ! 124: *ep++ = CEOF; ! 125: *ep++ = 0; ! 126: return(0); ! 127: } ! 128: if (c != '*') ! 129: lastep = ep; ! 130: switch (c) { ! 131: ! 132: case '.': ! 133: *ep++ = CDOT; ! 134: continue; ! 135: ! 136: case '*': ! 137: if (lastep == 0 || *lastep == CBRA || *lastep == CKET) ! 138: goto defchar; ! 139: *lastep |= CSTAR; ! 140: continue; ! 141: ! 142: case '$': ! 143: if (*sp != '\0') ! 144: goto defchar; ! 145: *ep++ = CDOL; ! 146: continue; ! 147: ! 148: case '[': ! 149: *ep++ = CCL; ! 150: *ep++ = 0; ! 151: cclcnt = 1; ! 152: if ((c = *sp++) == '^') { ! 153: c = *sp++; ! 154: ep[-2] = NCCL; ! 155: } ! 156: do { ! 157: if (c == '\0') ! 158: comerr("missing ]"); ! 159: if (c == '-' && ep [-1] != 0) { ! 160: if ((c = *sp++) == ']') { ! 161: *ep++ = '-'; ! 162: cclcnt++; ! 163: break; ! 164: } ! 165: while (ep[-1] < c) { ! 166: *ep = ep[-1] + 1; ! 167: ep++; ! 168: cclcnt++; ! 169: if (ep >= &expbuf[ESIZE]) ! 170: comerr(retoolong); ! 171: } ! 172: } ! 173: *ep++ = c; ! 174: cclcnt++; ! 175: if (ep >= &expbuf[ESIZE]) ! 176: comerr(retoolong); ! 177: } while ((c = *sp++) != ']'); ! 178: lastep[1] = cclcnt; ! 179: continue; ! 180: ! 181: case '\\': ! 182: if ((c = *sp++) == '(') { ! 183: if (numbra >= NBRA) ! 184: comerr("too many \\(\\) pairs"); ! 185: *bracketp++ = numbra; ! 186: *ep++ = CBRA; ! 187: *ep++ = numbra++; ! 188: continue; ! 189: } ! 190: if (c == ')') { ! 191: if (bracketp <= bracket) ! 192: comerr("unmatched \\)"); ! 193: *ep++ = CKET; ! 194: *ep++ = *--bracketp; ! 195: continue; ! 196: } ! 197: if (c >= '1' && c < ('1' + NBRA)) { ! 198: *ep++ = CBACK; ! 199: *ep++ = c - '1'; ! 200: continue; ! 201: } ! 202: *ep++ = CCHR; ! 203: *ep++ = c; ! 204: continue; ! 205: ! 206: defchar: ! 207: default: ! 208: *ep++ = CCHR; ! 209: *ep++ = c; ! 210: } ! 211: } ! 212: } ! 213: ! 214: /* ! 215: * match the argument string against the compiled re ! 216: */ ! 217: int ! 218: re_exec(p1) ! 219: register char *p1; ! 220: { ! 221: register char *p2 = expbuf; ! 222: register int c; ! 223: int rv; ! 224: ! 225: for (c = 0; c < NBRA; c++) { ! 226: braslist[c] = 0; ! 227: braelist[c] = 0; ! 228: } ! 229: if (circf) ! 230: return((advance(p1, p2))); ! 231: /* ! 232: * fast check for first character ! 233: */ ! 234: if (*p2 == CCHR) { ! 235: c = p2[1]; ! 236: do { ! 237: if (*p1 != c) ! 238: continue; ! 239: if (rv = advance(p1, p2)) ! 240: return(rv); ! 241: } while (*p1++); ! 242: return(0); ! 243: } ! 244: /* ! 245: * regular algorithm ! 246: */ ! 247: do ! 248: if (rv = advance(p1, p2)) ! 249: return(rv); ! 250: while (*p1++); ! 251: return(0); ! 252: } ! 253: ! 254: /* ! 255: * try to match the next thing in the dfa ! 256: */ ! 257: static int ! 258: advance(lp, ep) ! 259: register char *lp, *ep; ! 260: { ! 261: register char *curlp; ! 262: int ct, i; ! 263: int rv; ! 264: ! 265: for (;;) ! 266: switch (*ep++) { ! 267: ! 268: case CCHR: ! 269: if (*ep++ == *lp++) ! 270: continue; ! 271: return(0); ! 272: ! 273: case CDOT: ! 274: if (*lp++) ! 275: continue; ! 276: return(0); ! 277: ! 278: case CDOL: ! 279: if (*lp == '\0') ! 280: continue; ! 281: return(0); ! 282: ! 283: case CEOF: ! 284: return(1); ! 285: ! 286: case CCL: ! 287: if (cclass(ep, *lp++, 1)) { ! 288: ep += *ep; ! 289: continue; ! 290: } ! 291: return(0); ! 292: ! 293: case NCCL: ! 294: if (cclass(ep, *lp++, 0)) { ! 295: ep += *ep; ! 296: continue; ! 297: } ! 298: return(0); ! 299: ! 300: case CBRA: ! 301: braslist[*ep++] = lp; ! 302: continue; ! 303: ! 304: case CKET: ! 305: braelist[*ep++] = lp; ! 306: continue; ! 307: ! 308: case CBACK: ! 309: if (braelist[i = *ep++] == 0) ! 310: return(-1); ! 311: if (backref(i, lp)) { ! 312: lp += braelist[i] - braslist[i]; ! 313: continue; ! 314: } ! 315: return(0); ! 316: ! 317: case CBACK|CSTAR: ! 318: if (braelist[i = *ep++] == 0) ! 319: return(-1); ! 320: curlp = lp; ! 321: ct = braelist[i] - braslist[i]; ! 322: while (backref(i, lp)) ! 323: lp += ct; ! 324: while (lp >= curlp) { ! 325: if (rv = advance(lp, ep)) ! 326: return(rv); ! 327: lp -= ct; ! 328: } ! 329: continue; ! 330: ! 331: case CDOT|CSTAR: ! 332: curlp = lp; ! 333: while (*lp++) ! 334: ; ! 335: goto star; ! 336: ! 337: case CCHR|CSTAR: ! 338: curlp = lp; ! 339: while (*lp++ == *ep) ! 340: ; ! 341: ep++; ! 342: goto star; ! 343: ! 344: case CCL|CSTAR: ! 345: case NCCL|CSTAR: ! 346: curlp = lp; ! 347: while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) ! 348: ; ! 349: ep += *ep; ! 350: goto star; ! 351: ! 352: star: ! 353: do { ! 354: lp--; ! 355: if (rv = advance(lp, ep)) ! 356: return(rv); ! 357: } while (lp > curlp); ! 358: return(0); ! 359: ! 360: default: ! 361: return(-1); ! 362: } ! 363: } ! 364: ! 365: backref(i, lp) ! 366: register int i; ! 367: register char *lp; ! 368: { ! 369: register char *bp; ! 370: ! 371: bp = braslist[i]; ! 372: while (*bp++ == *lp++) ! 373: if (bp >= braelist[i]) ! 374: return(1); ! 375: return(0); ! 376: } ! 377: ! 378: int ! 379: cclass(set, c, af) ! 380: register char *set, c; ! 381: int af; ! 382: { ! 383: register int n; ! 384: ! 385: if (c == 0) ! 386: return(0); ! 387: n = *set++; ! 388: while (--n) ! 389: if (*set++ == c) ! 390: return(af); ! 391: return(! af); ! 392: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.