|
|
1.1 ! root 1: /* ! 2: * Copyright (c) 1980 Regents of the University of California. ! 3: * All rights reserved. The Berkeley software License Agreement ! 4: * specifies the terms and conditions for redistribution. ! 5: */ ! 6: ! 7: #if defined(LIBC_SCCS) && !defined(lint) ! 8: static char sccsid[] = "@(#)regex.c 5.2 (Berkeley) 3/9/86"; ! 9: #endif LIBC_SCCS and not lint ! 10: ! 11: # ! 12: ! 13: /* ! 14: * routines to do regular expression matching ! 15: * ! 16: * Entry points: ! 17: * ! 18: * re_comp(s) ! 19: * char *s; ! 20: * ... returns 0 if the string s was compiled successfully, ! 21: * a pointer to an error message otherwise. ! 22: * If passed 0 or a null string returns without changing ! 23: * the currently compiled re (see note 11 below). ! 24: * ! 25: * re_exec(s) ! 26: * char *s; ! 27: * ... returns 1 if the string s matches the last compiled regular ! 28: * expression, ! 29: * 0 if the string s failed to match the last compiled ! 30: * regular expression, and ! 31: * -1 if the compiled regular expression was invalid ! 32: * (indicating an internal error). ! 33: * ! 34: * The strings passed to both re_comp and re_exec may have trailing or ! 35: * embedded newline characters; they are terminated by nulls. ! 36: * ! 37: * The identity of the author of these routines is lost in antiquity; ! 38: * this is essentially the same as the re code in the original V6 ed. ! 39: * ! 40: * The regular expressions recognized are described below. This description ! 41: * is essentially the same as that for ed. ! 42: * ! 43: * A regular expression specifies a set of strings of characters. ! 44: * A member of this set of strings is said to be matched by ! 45: * the regular expression. In the following specification for ! 46: * regular expressions the word `character' means any character but NUL. ! 47: * ! 48: * 1. Any character except a special character matches itself. ! 49: * Special characters are the regular expression delimiter plus ! 50: * \ [ . and sometimes ^ * $. ! 51: * 2. A . matches any character. ! 52: * 3. A \ followed by any character except a digit or ( ) ! 53: * matches that character. ! 54: * 4. A nonempty string s bracketed [s] (or [^s]) matches any ! 55: * character in (or not in) s. In s, \ has no special meaning, ! 56: * and ] may only appear as the first letter. A substring ! 57: * a-b, with a and b in ascending ASCII order, stands for ! 58: * the inclusive range of ASCII characters. ! 59: * 5. A regular expression of form 1-4 followed by * matches a ! 60: * sequence of 0 or more matches of the regular expression. ! 61: * 6. A regular expression, x, of form 1-8, bracketed \(x\) ! 62: * matches what x matches. ! 63: * 7. A \ followed by a digit n matches a copy of the string that the ! 64: * bracketed regular expression beginning with the nth \( matched. ! 65: * 8. A regular expression of form 1-8, x, followed by a regular ! 66: * expression of form 1-7, y matches a match for x followed by ! 67: * a match for y, with the x match being as long as possible ! 68: * while still permitting a y match. ! 69: * 9. A regular expression of form 1-8 preceded by ^ (or followed ! 70: * by $), is constrained to matches that begin at the left ! 71: * (or end at the right) end of a line. ! 72: * 10. A regular expression of form 1-9 picks out the longest among ! 73: * the leftmost matches in a line. ! 74: * 11. An empty regular expression stands for a copy of the last ! 75: * regular expression encountered. ! 76: */ ! 77: ! 78: /* ! 79: * constants for re's ! 80: */ ! 81: #define CBRA 1 ! 82: #define CCHR 2 ! 83: #define CDOT 4 ! 84: #define CCL 6 ! 85: #define NCCL 8 ! 86: #define CDOL 10 ! 87: #define CEOF 11 ! 88: #define CKET 12 ! 89: #define CBACK 18 ! 90: ! 91: #define CSTAR 01 ! 92: ! 93: #define ESIZE 512 ! 94: #define NBRA 9 ! 95: ! 96: static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA]; ! 97: static char circf; ! 98: ! 99: /* ! 100: * compile the regular expression argument into a dfa ! 101: */ ! 102: char * ! 103: re_comp(sp) ! 104: register char *sp; ! 105: { ! 106: register int c; ! 107: register char *ep = expbuf; ! 108: int cclcnt, numbra = 0; ! 109: char *lastep = 0; ! 110: char bracket[NBRA]; ! 111: char *bracketp = &bracket[0]; ! 112: static char *retoolong = "Regular expression too long"; ! 113: ! 114: #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); } ! 115: ! 116: if (sp == 0 || *sp == '\0') { ! 117: if (*ep == 0) ! 118: return("No previous regular expression"); ! 119: return(0); ! 120: } ! 121: if (*sp == '^') { ! 122: circf = 1; ! 123: sp++; ! 124: } ! 125: else ! 126: circf = 0; ! 127: for (;;) { ! 128: if (ep >= &expbuf[ESIZE]) ! 129: comerr(retoolong); ! 130: if ((c = *sp++) == '\0') { ! 131: if (bracketp != bracket) ! 132: comerr("unmatched \\("); ! 133: *ep++ = CEOF; ! 134: *ep++ = 0; ! 135: return(0); ! 136: } ! 137: if (c != '*') ! 138: lastep = ep; ! 139: switch (c) { ! 140: ! 141: case '.': ! 142: *ep++ = CDOT; ! 143: continue; ! 144: ! 145: case '*': ! 146: if (lastep == 0 || *lastep == CBRA || *lastep == CKET) ! 147: goto defchar; ! 148: *lastep |= CSTAR; ! 149: continue; ! 150: ! 151: case '$': ! 152: if (*sp != '\0') ! 153: goto defchar; ! 154: *ep++ = CDOL; ! 155: continue; ! 156: ! 157: case '[': ! 158: *ep++ = CCL; ! 159: *ep++ = 0; ! 160: cclcnt = 1; ! 161: if ((c = *sp++) == '^') { ! 162: c = *sp++; ! 163: ep[-2] = NCCL; ! 164: } ! 165: do { ! 166: if (c == '\0') ! 167: comerr("missing ]"); ! 168: if (c == '-' && ep [-1] != 0) { ! 169: if ((c = *sp++) == ']') { ! 170: *ep++ = '-'; ! 171: cclcnt++; ! 172: break; ! 173: } ! 174: while (ep[-1] < c) { ! 175: *ep = ep[-1] + 1; ! 176: ep++; ! 177: cclcnt++; ! 178: if (ep >= &expbuf[ESIZE]) ! 179: comerr(retoolong); ! 180: } ! 181: } ! 182: *ep++ = c; ! 183: cclcnt++; ! 184: if (ep >= &expbuf[ESIZE]) ! 185: comerr(retoolong); ! 186: } while ((c = *sp++) != ']'); ! 187: lastep[1] = cclcnt; ! 188: continue; ! 189: ! 190: case '\\': ! 191: if ((c = *sp++) == '(') { ! 192: if (numbra >= NBRA) ! 193: comerr("too many \\(\\) pairs"); ! 194: *bracketp++ = numbra; ! 195: *ep++ = CBRA; ! 196: *ep++ = numbra++; ! 197: continue; ! 198: } ! 199: if (c == ')') { ! 200: if (bracketp <= bracket) ! 201: comerr("unmatched \\)"); ! 202: *ep++ = CKET; ! 203: *ep++ = *--bracketp; ! 204: continue; ! 205: } ! 206: if (c >= '1' && c < ('1' + NBRA)) { ! 207: *ep++ = CBACK; ! 208: *ep++ = c - '1'; ! 209: continue; ! 210: } ! 211: *ep++ = CCHR; ! 212: *ep++ = c; ! 213: continue; ! 214: ! 215: defchar: ! 216: default: ! 217: *ep++ = CCHR; ! 218: *ep++ = c; ! 219: } ! 220: } ! 221: } ! 222: ! 223: /* ! 224: * match the argument string against the compiled re ! 225: */ ! 226: int ! 227: re_exec(p1) ! 228: register char *p1; ! 229: { ! 230: register char *p2 = expbuf; ! 231: register int c; ! 232: int rv; ! 233: ! 234: for (c = 0; c < NBRA; c++) { ! 235: braslist[c] = 0; ! 236: braelist[c] = 0; ! 237: } ! 238: if (circf) ! 239: return((advance(p1, p2))); ! 240: /* ! 241: * fast check for first character ! 242: */ ! 243: if (*p2 == CCHR) { ! 244: c = p2[1]; ! 245: do { ! 246: if (*p1 != c) ! 247: continue; ! 248: if (rv = advance(p1, p2)) ! 249: return(rv); ! 250: } while (*p1++); ! 251: return(0); ! 252: } ! 253: /* ! 254: * regular algorithm ! 255: */ ! 256: do ! 257: if (rv = advance(p1, p2)) ! 258: return(rv); ! 259: while (*p1++); ! 260: return(0); ! 261: } ! 262: ! 263: /* ! 264: * try to match the next thing in the dfa ! 265: */ ! 266: static int ! 267: advance(lp, ep) ! 268: register char *lp, *ep; ! 269: { ! 270: register char *curlp; ! 271: int ct, i; ! 272: int rv; ! 273: ! 274: for (;;) ! 275: switch (*ep++) { ! 276: ! 277: case CCHR: ! 278: if (*ep++ == *lp++) ! 279: continue; ! 280: return(0); ! 281: ! 282: case CDOT: ! 283: if (*lp++) ! 284: continue; ! 285: return(0); ! 286: ! 287: case CDOL: ! 288: if (*lp == '\0') ! 289: continue; ! 290: return(0); ! 291: ! 292: case CEOF: ! 293: return(1); ! 294: ! 295: case CCL: ! 296: if (cclass(ep, *lp++, 1)) { ! 297: ep += *ep; ! 298: continue; ! 299: } ! 300: return(0); ! 301: ! 302: case NCCL: ! 303: if (cclass(ep, *lp++, 0)) { ! 304: ep += *ep; ! 305: continue; ! 306: } ! 307: return(0); ! 308: ! 309: case CBRA: ! 310: braslist[*ep++] = lp; ! 311: continue; ! 312: ! 313: case CKET: ! 314: braelist[*ep++] = lp; ! 315: continue; ! 316: ! 317: case CBACK: ! 318: if (braelist[i = *ep++] == 0) ! 319: return(-1); ! 320: if (backref(i, lp)) { ! 321: lp += braelist[i] - braslist[i]; ! 322: continue; ! 323: } ! 324: return(0); ! 325: ! 326: case CBACK|CSTAR: ! 327: if (braelist[i = *ep++] == 0) ! 328: return(-1); ! 329: curlp = lp; ! 330: ct = braelist[i] - braslist[i]; ! 331: while (backref(i, lp)) ! 332: lp += ct; ! 333: while (lp >= curlp) { ! 334: if (rv = advance(lp, ep)) ! 335: return(rv); ! 336: lp -= ct; ! 337: } ! 338: continue; ! 339: ! 340: case CDOT|CSTAR: ! 341: curlp = lp; ! 342: while (*lp++) ! 343: ; ! 344: goto star; ! 345: ! 346: case CCHR|CSTAR: ! 347: curlp = lp; ! 348: while (*lp++ == *ep) ! 349: ; ! 350: ep++; ! 351: goto star; ! 352: ! 353: case CCL|CSTAR: ! 354: case NCCL|CSTAR: ! 355: curlp = lp; ! 356: while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR))) ! 357: ; ! 358: ep += *ep; ! 359: goto star; ! 360: ! 361: star: ! 362: do { ! 363: lp--; ! 364: if (rv = advance(lp, ep)) ! 365: return(rv); ! 366: } while (lp > curlp); ! 367: return(0); ! 368: ! 369: default: ! 370: return(-1); ! 371: } ! 372: } ! 373: ! 374: backref(i, lp) ! 375: register int i; ! 376: register char *lp; ! 377: { ! 378: register char *bp; ! 379: ! 380: bp = braslist[i]; ! 381: while (*bp++ == *lp++) ! 382: if (bp >= braelist[i]) ! 383: return(1); ! 384: return(0); ! 385: } ! 386: ! 387: int ! 388: cclass(set, c, af) ! 389: register char *set, c; ! 390: int af; ! 391: { ! 392: register int n; ! 393: ! 394: if (c == 0) ! 395: return(0); ! 396: n = *set++; ! 397: while (--n) ! 398: if (*set++ == c) ! 399: return(af); ! 400: return(! af); ! 401: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.