|
|
1.1 ! root 1: static char sccsid[] = "@(#)regexp.c 4.1 (Berkeley) 10/19/82"; ! 2: ! 3: #include <ctype.h> ! 4: ! 5: typedef int boolean; ! 6: #define TRUE 1 ! 7: #define FALSE 0 ! 8: #define NIL 0 ! 9: ! 10: boolean l_onecase; /* true if upper and lower equivalent */ ! 11: ! 12: #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) ! 13: ! 14: /* STRNCMP - like strncmp except that we convert the ! 15: * first string to lower case before comparing ! 16: * if l_onecase is set. ! 17: */ ! 18: ! 19: STRNCMP(s1, s2, len) ! 20: register char *s1,*s2; ! 21: register int len; ! 22: { ! 23: if (l_onecase) { ! 24: do ! 25: if (*s2 - makelower(*s1)) ! 26: return (*s2 - makelower(*s1)); ! 27: else { ! 28: s2++; ! 29: s1++; ! 30: } ! 31: while (--len); ! 32: } else { ! 33: do ! 34: if (*s2 - *s1) ! 35: return (*s2 - *s1); ! 36: else { ! 37: s2++; ! 38: s1++; ! 39: } ! 40: while (--len); ! 41: } ! 42: return(0); ! 43: } ! 44: ! 45: /* The following routine converts an irregular expression to ! 46: * internal format. ! 47: * ! 48: * Either meta symbols (\a \d or \p) or character strings or ! 49: * operations ( alternation or perenthesizing ) can be ! 50: * specified. Each starts with a descriptor byte. The descriptor ! 51: * byte has STR set for strings, META set for meta symbols ! 52: * and OPER set for operations. ! 53: * The descriptor byte can also have the OPT bit set if the object ! 54: * defined is optional. Also ALT can be set to indicate an alternation. ! 55: * ! 56: * For metasymbols the byte following the descriptor byte identities ! 57: * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For ! 58: * strings the byte after the descriptor is a character count for ! 59: * the string: ! 60: * ! 61: * meta symbols := descriptor ! 62: * symbol ! 63: * ! 64: * strings := descriptor ! 65: * character count ! 66: * the string ! 67: * ! 68: * operatins := descriptor ! 69: * symbol ! 70: * character count ! 71: */ ! 72: ! 73: /* ! 74: * handy macros for accessing parts of match blocks ! 75: */ ! 76: #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ ! 77: #define MNEXT(A) (A+2) /* character following a metasymbol block */ ! 78: ! 79: #define OSYM(A) (*(A+1)) /* symbol in an operation block */ ! 80: #define OCNT(A) (*(A+2)) /* character count */ ! 81: #define ONEXT(A) (A+3) /* next character after the operation */ ! 82: #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ ! 83: ! 84: #define SCNT(A) (*(A+1)) /* byte count of a string */ ! 85: #define SSTR(A) (A+2) /* address of the string */ ! 86: #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ ! 87: ! 88: /* ! 89: * bit flags in the descriptor ! 90: */ ! 91: #define OPT 1 ! 92: #define STR 2 ! 93: #define META 4 ! 94: #define ALT 8 ! 95: #define OPER 16 ! 96: ! 97: char *ure; /* pointer current position in unconverted exp */ ! 98: char *ccre; /* pointer to current position in converted exp*/ ! 99: char *malloc(); ! 100: ! 101: char * ! 102: convexp(re) ! 103: char *re; /* unconverted irregular expression */ ! 104: { ! 105: register char *cre; /* pointer to converted regular expression */ ! 106: ! 107: /* allocate room for the converted expression */ ! 108: if (re == NIL) ! 109: return (NIL); ! 110: if (*re == '\0') ! 111: return (NIL); ! 112: cre = malloc (4 * strlen(re) + 3); ! 113: ccre = cre; ! 114: ure = re; ! 115: ! 116: /* start the conversion with a \a */ ! 117: *cre = META | OPT; ! 118: MSYM(cre) = 'a'; ! 119: ccre = MNEXT(cre); ! 120: ! 121: /* start the conversion (its recursive) */ ! 122: expconv (); ! 123: *ccre = 0; ! 124: return (cre); ! 125: } ! 126: ! 127: expconv() ! 128: { ! 129: register char *cs; /* pointer to current symbol in converted exp */ ! 130: register char c; /* character being processed */ ! 131: register char *acs; /* pinter to last alternate */ ! 132: register int temp; ! 133: ! 134: /* let the conversion begin */ ! 135: acs = NIL; ! 136: while (*ure != NIL) { ! 137: switch (c = *ure++) { ! 138: ! 139: case '\\': ! 140: switch (c = *ure++) { ! 141: ! 142: /* escaped characters are just characters */ ! 143: default: ! 144: if ((*cs & STR) == 0) { ! 145: cs = ccre; ! 146: *cs = STR; ! 147: SCNT(cs) = 1; ! 148: ccre += 2; ! 149: } else ! 150: SCNT(cs)++; ! 151: *ccre++ = c; ! 152: break; ! 153: ! 154: /* normal(?) metacharacters */ ! 155: case 'a': ! 156: case 'd': ! 157: case 'e': ! 158: case 'p': ! 159: if (acs != NIL && acs != cs) { ! 160: do { ! 161: temp = OCNT(acs); ! 162: OCNT(acs) = ccre - acs; ! 163: acs -= temp; ! 164: } while (temp != 0); ! 165: acs = NIL; ! 166: } ! 167: cs = ccre; ! 168: *cs = META; ! 169: MSYM(cs) = c; ! 170: ccre = MNEXT(cs); ! 171: break; ! 172: } ! 173: break; ! 174: ! 175: /* just put the symbol in */ ! 176: case '^': ! 177: case '$': ! 178: if (acs != NIL && acs != cs) { ! 179: do { ! 180: temp = OCNT(acs); ! 181: OCNT(acs) = ccre - acs; ! 182: acs -= temp; ! 183: } while (temp != 0); ! 184: acs = NIL; ! 185: } ! 186: cs = ccre; ! 187: *cs = META; ! 188: MSYM(cs) = c; ! 189: ccre = MNEXT(cs); ! 190: break; ! 191: ! 192: /* mark the last match sequence as optional */ ! 193: case '?': ! 194: *cs = *cs | OPT; ! 195: break; ! 196: ! 197: /* recurse and define a subexpression */ ! 198: case '(': ! 199: if (acs != NIL && acs != cs) { ! 200: do { ! 201: temp = OCNT(acs); ! 202: OCNT(acs) = ccre - acs; ! 203: acs -= temp; ! 204: } while (temp != 0); ! 205: acs = NIL; ! 206: } ! 207: cs = ccre; ! 208: *cs = OPER; ! 209: OSYM(cs) = '('; ! 210: ccre = ONEXT(cs); ! 211: expconv (); ! 212: OCNT(cs) = ccre - cs; /* offset to next symbol */ ! 213: break; ! 214: ! 215: /* return from a recursion */ ! 216: case ')': ! 217: if (acs != NIL) { ! 218: do { ! 219: temp = OCNT(acs); ! 220: OCNT(acs) = ccre - acs; ! 221: acs -= temp; ! 222: } while (temp != 0); ! 223: acs = NIL; ! 224: } ! 225: cs = ccre; ! 226: *cs = META; ! 227: MSYM(cs) = c; ! 228: ccre = MNEXT(cs); ! 229: return; ! 230: ! 231: /* mark the last match sequence as having an alternate */ ! 232: /* the third byte will contain an offset to jump over the */ ! 233: /* alternate match in case the first did not fail */ ! 234: case '|': ! 235: if (acs != NIL && acs != cs) ! 236: OCNT(ccre) = ccre - acs; /* make a back pointer */ ! 237: else ! 238: OCNT(ccre) = 0; ! 239: *cs |= ALT; ! 240: cs = ccre; ! 241: *cs = OPER; ! 242: OSYM(cs) = '|'; ! 243: ccre = ONEXT(cs); ! 244: acs = cs; /* remember that the pointer is to be filles */ ! 245: break; ! 246: ! 247: /* if its not a metasymbol just build a scharacter string */ ! 248: default: ! 249: if ((*cs & STR) == 0) { ! 250: cs = ccre; ! 251: *cs = STR; ! 252: SCNT(cs) = 1; ! 253: ccre = SSTR(cs); ! 254: } else ! 255: SCNT(cs)++; ! 256: *ccre++ = c; ! 257: break; ! 258: } ! 259: } ! 260: if (acs != NIL) { ! 261: do { ! 262: temp = OCNT(acs); ! 263: OCNT(acs) = ccre - acs; ! 264: acs -= temp; ! 265: } while (temp != 0); ! 266: acs = NIL; ! 267: } ! 268: return; ! 269: } ! 270: /* end of convertre */ ! 271: ! 272: ! 273: /* ! 274: * The following routine recognises an irregular expresion ! 275: * with the following special characters: ! 276: * ! 277: * \? - means last match was optional ! 278: * \a - matches any number of characters ! 279: * \d - matches any number of spaces and tabs ! 280: * \p - matches any number of alphanumeric ! 281: * characters. The ! 282: * characters matched will be copied into ! 283: * the area pointed to by 'name'. ! 284: * \| - alternation ! 285: * \( \) - grouping used mostly for alternation and ! 286: * optionality ! 287: * ! 288: * The irregular expression must be translated to internal form ! 289: * prior to calling this routine ! 290: * ! 291: * The value returned is the pointer to the first non \a ! 292: * character matched. ! 293: */ ! 294: ! 295: boolean _escaped; /* true if we are currently _escaped */ ! 296: char *_start; /* start of string */ ! 297: ! 298: char * ! 299: expmatch (s, re, mstring) ! 300: register char *s; /* string to check for a match in */ ! 301: register char *re; /* a converted irregular expression */ ! 302: register char *mstring; /* where to put whatever matches a \p */ ! 303: { ! 304: register char *cs; /* the current symbol */ ! 305: register char *ptr,*s1; /* temporary pointer */ ! 306: boolean matched; /* a temporary boolean */ ! 307: ! 308: /* initial conditions */ ! 309: if (re == NIL) ! 310: return (NIL); ! 311: cs = re; ! 312: matched = FALSE; ! 313: ! 314: /* loop till expression string is exhausted (or at least pretty tired) */ ! 315: while (*cs) { ! 316: switch (*cs & (OPER | STR | META)) { ! 317: ! 318: /* try to match a string */ ! 319: case STR: ! 320: matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); ! 321: if (matched) { ! 322: ! 323: /* hoorah it matches */ ! 324: s += SCNT(cs); ! 325: cs = SNEXT(cs); ! 326: } else if (*cs & ALT) { ! 327: ! 328: /* alternation, skip to next expression */ ! 329: cs = SNEXT(cs); ! 330: } else if (*cs & OPT) { ! 331: ! 332: /* the match is optional */ ! 333: cs = SNEXT(cs); ! 334: matched = 1; /* indicate a successful match */ ! 335: } else { ! 336: ! 337: /* no match, error return */ ! 338: return (NIL); ! 339: } ! 340: break; ! 341: ! 342: /* an operator, do something fancy */ ! 343: case OPER: ! 344: switch (OSYM(cs)) { ! 345: ! 346: /* this is an alternation */ ! 347: case '|': ! 348: if (matched) ! 349: ! 350: /* last thing in the alternation was a match, skip ahead */ ! 351: cs = OPTR(cs); ! 352: else ! 353: ! 354: /* no match, keep trying */ ! 355: cs = ONEXT(cs); ! 356: break; ! 357: ! 358: /* this is a grouping, recurse */ ! 359: case '(': ! 360: ptr = expmatch (s, ONEXT(cs), mstring); ! 361: if (ptr != NIL) { ! 362: ! 363: /* the subexpression matched */ ! 364: matched = 1; ! 365: s = ptr; ! 366: } else if (*cs & ALT) { ! 367: ! 368: /* alternation, skip to next expression */ ! 369: matched = 0; ! 370: } else if (*cs & OPT) { ! 371: ! 372: /* the match is optional */ ! 373: matched = 1; /* indicate a successful match */ ! 374: } else { ! 375: ! 376: /* no match, error return */ ! 377: return (NIL); ! 378: } ! 379: cs = OPTR(cs); ! 380: break; ! 381: } ! 382: break; ! 383: ! 384: /* try to match a metasymbol */ ! 385: case META: ! 386: switch (MSYM(cs)) { ! 387: ! 388: /* try to match anything and remember what was matched */ ! 389: case 'p': ! 390: /* ! 391: * This is really the same as trying the match the ! 392: * remaining parts of the expression to any subset ! 393: * of the string. ! 394: */ ! 395: s1 = s; ! 396: do { ! 397: ptr = expmatch (s1, MNEXT(cs), mstring); ! 398: if (ptr != NIL && s1 != s) { ! 399: ! 400: /* we have a match, remember the match */ ! 401: strncpy (mstring, s, s1 - s); ! 402: mstring[s1 - s] = '\0'; ! 403: return (ptr); ! 404: } else if (ptr != NIL && (*cs & OPT)) { ! 405: ! 406: /* it was aoptional so no match is ok */ ! 407: return (ptr); ! 408: } else if (ptr != NIL) { ! 409: ! 410: /* not optional and we still matched */ ! 411: return (NIL); ! 412: } ! 413: if (!isalnum(*s1) && *s1 != '_') ! 414: return (NIL); ! 415: if (*s1 == '\\') ! 416: _escaped = _escaped ? FALSE : TRUE; ! 417: else ! 418: _escaped = FALSE; ! 419: } while (*s1++); ! 420: return (NIL); ! 421: ! 422: /* try to match anything */ ! 423: case 'a': ! 424: /* ! 425: * This is really the same as trying the match the ! 426: * remaining parts of the expression to any subset ! 427: * of the string. ! 428: */ ! 429: s1 = s; ! 430: do { ! 431: ptr = expmatch (s1, MNEXT(cs), mstring); ! 432: if (ptr != NIL && s1 != s) { ! 433: ! 434: /* we have a match */ ! 435: return (ptr); ! 436: } else if (ptr != NIL && (*cs & OPT)) { ! 437: ! 438: /* it was aoptional so no match is ok */ ! 439: return (ptr); ! 440: } else if (ptr != NIL) { ! 441: ! 442: /* not optional and we still matched */ ! 443: return (NIL); ! 444: } ! 445: if (*s1 == '\\') ! 446: _escaped = _escaped ? FALSE : TRUE; ! 447: else ! 448: _escaped = FALSE; ! 449: } while (*s1++); ! 450: return (NIL); ! 451: ! 452: /* fail if we are currently _escaped */ ! 453: case 'e': ! 454: if (_escaped) ! 455: return(NIL); ! 456: cs = MNEXT(cs); ! 457: break; ! 458: ! 459: /* match any number of tabs and spaces */ ! 460: case 'd': ! 461: ptr = s; ! 462: while (*s == ' ' || *s == '\t') ! 463: s++; ! 464: if (s != ptr || s == _start) { ! 465: ! 466: /* match, be happy */ ! 467: matched = 1; ! 468: cs = MNEXT(cs); ! 469: } else if (*s == '\n' || *s == '\0') { ! 470: ! 471: /* match, be happy */ ! 472: matched = 1; ! 473: cs = MNEXT(cs); ! 474: } else if (*cs & ALT) { ! 475: ! 476: /* try the next part */ ! 477: matched = 0; ! 478: cs = MNEXT(cs); ! 479: } else if (*cs & OPT) { ! 480: ! 481: /* doesn't matter */ ! 482: matched = 1; ! 483: cs = MNEXT(cs); ! 484: } else ! 485: ! 486: /* no match, error return */ ! 487: return (NIL); ! 488: break; ! 489: ! 490: /* check for end of line */ ! 491: case '$': ! 492: if (*s == '\0' || *s == '\n') { ! 493: ! 494: /* match, be happy */ ! 495: s++; ! 496: matched = 1; ! 497: cs = MNEXT(cs); ! 498: } else if (*cs & ALT) { ! 499: ! 500: /* try the next part */ ! 501: matched = 0; ! 502: cs = MNEXT(cs); ! 503: } else if (*cs & OPT) { ! 504: ! 505: /* doesn't matter */ ! 506: matched = 1; ! 507: cs = MNEXT(cs); ! 508: } else ! 509: ! 510: /* no match, error return */ ! 511: return (NIL); ! 512: break; ! 513: ! 514: /* check for start of line */ ! 515: case '^': ! 516: if (s == _start) { ! 517: ! 518: /* match, be happy */ ! 519: matched = 1; ! 520: cs = MNEXT(cs); ! 521: } else if (*cs & ALT) { ! 522: ! 523: /* try the next part */ ! 524: matched = 0; ! 525: cs = MNEXT(cs); ! 526: } else if (*cs & OPT) { ! 527: ! 528: /* doesn't matter */ ! 529: matched = 1; ! 530: cs = MNEXT(cs); ! 531: } else ! 532: ! 533: /* no match, error return */ ! 534: return (NIL); ! 535: break; ! 536: ! 537: /* end of a subexpression, return success */ ! 538: case ')': ! 539: return (s); ! 540: } ! 541: break; ! 542: } ! 543: } ! 544: return (s); ! 545: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.