|
|
1.1 ! root 1: /* ! 2: * Copyright (c) 1980 Regents of the University of California. ! 3: * All rights reserved. The Berkeley software License Agreement ! 4: * specifies the terms and conditions for redistribution. ! 5: */ ! 6: ! 7: #ifndef lint ! 8: static char sccsid[] = "@(#)regexp.c 5.1 (Berkeley) 6/5/85"; ! 9: #endif not lint ! 10: ! 11: #include <ctype.h> ! 12: ! 13: typedef int boolean; ! 14: #define TRUE 1 ! 15: #define FALSE 0 ! 16: #define NIL 0 ! 17: ! 18: boolean l_onecase; /* true if upper and lower equivalent */ ! 19: ! 20: #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) ! 21: ! 22: /* STRNCMP - like strncmp except that we convert the ! 23: * first string to lower case before comparing ! 24: * if l_onecase is set. ! 25: */ ! 26: ! 27: STRNCMP(s1, s2, len) ! 28: register char *s1,*s2; ! 29: register int len; ! 30: { ! 31: if (l_onecase) { ! 32: do ! 33: if (*s2 - makelower(*s1)) ! 34: return (*s2 - makelower(*s1)); ! 35: else { ! 36: s2++; ! 37: s1++; ! 38: } ! 39: while (--len); ! 40: } else { ! 41: do ! 42: if (*s2 - *s1) ! 43: return (*s2 - *s1); ! 44: else { ! 45: s2++; ! 46: s1++; ! 47: } ! 48: while (--len); ! 49: } ! 50: return(0); ! 51: } ! 52: ! 53: /* The following routine converts an irregular expression to ! 54: * internal format. ! 55: * ! 56: * Either meta symbols (\a \d or \p) or character strings or ! 57: * operations ( alternation or perenthesizing ) can be ! 58: * specified. Each starts with a descriptor byte. The descriptor ! 59: * byte has STR set for strings, META set for meta symbols ! 60: * and OPER set for operations. ! 61: * The descriptor byte can also have the OPT bit set if the object ! 62: * defined is optional. Also ALT can be set to indicate an alternation. ! 63: * ! 64: * For metasymbols the byte following the descriptor byte identities ! 65: * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For ! 66: * strings the byte after the descriptor is a character count for ! 67: * the string: ! 68: * ! 69: * meta symbols := descriptor ! 70: * symbol ! 71: * ! 72: * strings := descriptor ! 73: * character count ! 74: * the string ! 75: * ! 76: * operatins := descriptor ! 77: * symbol ! 78: * character count ! 79: */ ! 80: ! 81: /* ! 82: * handy macros for accessing parts of match blocks ! 83: */ ! 84: #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ ! 85: #define MNEXT(A) (A+2) /* character following a metasymbol block */ ! 86: ! 87: #define OSYM(A) (*(A+1)) /* symbol in an operation block */ ! 88: #define OCNT(A) (*(A+2)) /* character count */ ! 89: #define ONEXT(A) (A+3) /* next character after the operation */ ! 90: #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ ! 91: ! 92: #define SCNT(A) (*(A+1)) /* byte count of a string */ ! 93: #define SSTR(A) (A+2) /* address of the string */ ! 94: #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ ! 95: ! 96: /* ! 97: * bit flags in the descriptor ! 98: */ ! 99: #define OPT 1 ! 100: #define STR 2 ! 101: #define META 4 ! 102: #define ALT 8 ! 103: #define OPER 16 ! 104: ! 105: char *ure; /* pointer current position in unconverted exp */ ! 106: char *ccre; /* pointer to current position in converted exp*/ ! 107: char *malloc(); ! 108: ! 109: char * ! 110: convexp(re) ! 111: char *re; /* unconverted irregular expression */ ! 112: { ! 113: register char *cre; /* pointer to converted regular expression */ ! 114: ! 115: /* allocate room for the converted expression */ ! 116: if (re == NIL) ! 117: return (NIL); ! 118: if (*re == '\0') ! 119: return (NIL); ! 120: cre = malloc (4 * strlen(re) + 3); ! 121: ccre = cre; ! 122: ure = re; ! 123: ! 124: /* start the conversion with a \a */ ! 125: *cre = META | OPT; ! 126: MSYM(cre) = 'a'; ! 127: ccre = MNEXT(cre); ! 128: ! 129: /* start the conversion (its recursive) */ ! 130: expconv (); ! 131: *ccre = 0; ! 132: return (cre); ! 133: } ! 134: ! 135: expconv() ! 136: { ! 137: register char *cs; /* pointer to current symbol in converted exp */ ! 138: register char c; /* character being processed */ ! 139: register char *acs; /* pinter to last alternate */ ! 140: register int temp; ! 141: ! 142: /* let the conversion begin */ ! 143: acs = NIL; ! 144: cs = NIL; ! 145: while (*ure != NIL) { ! 146: switch (c = *ure++) { ! 147: ! 148: case '\\': ! 149: switch (c = *ure++) { ! 150: ! 151: /* escaped characters are just characters */ ! 152: default: ! 153: if (cs == NIL || (*cs & STR) == 0) { ! 154: cs = ccre; ! 155: *cs = STR; ! 156: SCNT(cs) = 1; ! 157: ccre += 2; ! 158: } else ! 159: SCNT(cs)++; ! 160: *ccre++ = c; ! 161: break; ! 162: ! 163: /* normal(?) metacharacters */ ! 164: case 'a': ! 165: case 'd': ! 166: case 'e': ! 167: case 'p': ! 168: if (acs != NIL && acs != cs) { ! 169: do { ! 170: temp = OCNT(acs); ! 171: OCNT(acs) = ccre - acs; ! 172: acs -= temp; ! 173: } while (temp != 0); ! 174: acs = NIL; ! 175: } ! 176: cs = ccre; ! 177: *cs = META; ! 178: MSYM(cs) = c; ! 179: ccre = MNEXT(cs); ! 180: break; ! 181: } ! 182: break; ! 183: ! 184: /* just put the symbol in */ ! 185: case '^': ! 186: case '$': ! 187: if (acs != NIL && acs != cs) { ! 188: do { ! 189: temp = OCNT(acs); ! 190: OCNT(acs) = ccre - acs; ! 191: acs -= temp; ! 192: } while (temp != 0); ! 193: acs = NIL; ! 194: } ! 195: cs = ccre; ! 196: *cs = META; ! 197: MSYM(cs) = c; ! 198: ccre = MNEXT(cs); ! 199: break; ! 200: ! 201: /* mark the last match sequence as optional */ ! 202: case '?': ! 203: if (cs) ! 204: *cs = *cs | OPT; ! 205: break; ! 206: ! 207: /* recurse and define a subexpression */ ! 208: case '(': ! 209: if (acs != NIL && acs != cs) { ! 210: do { ! 211: temp = OCNT(acs); ! 212: OCNT(acs) = ccre - acs; ! 213: acs -= temp; ! 214: } while (temp != 0); ! 215: acs = NIL; ! 216: } ! 217: cs = ccre; ! 218: *cs = OPER; ! 219: OSYM(cs) = '('; ! 220: ccre = ONEXT(cs); ! 221: expconv (); ! 222: OCNT(cs) = ccre - cs; /* offset to next symbol */ ! 223: break; ! 224: ! 225: /* return from a recursion */ ! 226: case ')': ! 227: if (acs != NIL) { ! 228: do { ! 229: temp = OCNT(acs); ! 230: OCNT(acs) = ccre - acs; ! 231: acs -= temp; ! 232: } while (temp != 0); ! 233: acs = NIL; ! 234: } ! 235: cs = ccre; ! 236: *cs = META; ! 237: MSYM(cs) = c; ! 238: ccre = MNEXT(cs); ! 239: return; ! 240: ! 241: /* mark the last match sequence as having an alternate */ ! 242: /* the third byte will contain an offset to jump over the */ ! 243: /* alternate match in case the first did not fail */ ! 244: case '|': ! 245: if (acs != NIL && acs != cs) ! 246: OCNT(ccre) = ccre - acs; /* make a back pointer */ ! 247: else ! 248: OCNT(ccre) = 0; ! 249: *cs |= ALT; ! 250: cs = ccre; ! 251: *cs = OPER; ! 252: OSYM(cs) = '|'; ! 253: ccre = ONEXT(cs); ! 254: acs = cs; /* remember that the pointer is to be filles */ ! 255: break; ! 256: ! 257: /* if its not a metasymbol just build a scharacter string */ ! 258: default: ! 259: if (cs == NIL || (*cs & STR) == 0) { ! 260: cs = ccre; ! 261: *cs = STR; ! 262: SCNT(cs) = 1; ! 263: ccre = SSTR(cs); ! 264: } else ! 265: SCNT(cs)++; ! 266: *ccre++ = c; ! 267: break; ! 268: } ! 269: } ! 270: if (acs != NIL) { ! 271: do { ! 272: temp = OCNT(acs); ! 273: OCNT(acs) = ccre - acs; ! 274: acs -= temp; ! 275: } while (temp != 0); ! 276: acs = NIL; ! 277: } ! 278: return; ! 279: } ! 280: /* end of convertre */ ! 281: ! 282: ! 283: /* ! 284: * The following routine recognises an irregular expresion ! 285: * with the following special characters: ! 286: * ! 287: * \? - means last match was optional ! 288: * \a - matches any number of characters ! 289: * \d - matches any number of spaces and tabs ! 290: * \p - matches any number of alphanumeric ! 291: * characters. The ! 292: * characters matched will be copied into ! 293: * the area pointed to by 'name'. ! 294: * \| - alternation ! 295: * \( \) - grouping used mostly for alternation and ! 296: * optionality ! 297: * ! 298: * The irregular expression must be translated to internal form ! 299: * prior to calling this routine ! 300: * ! 301: * The value returned is the pointer to the first non \a ! 302: * character matched. ! 303: */ ! 304: ! 305: boolean _escaped; /* true if we are currently _escaped */ ! 306: char *_start; /* start of string */ ! 307: ! 308: char * ! 309: expmatch (s, re, mstring) ! 310: register char *s; /* string to check for a match in */ ! 311: register char *re; /* a converted irregular expression */ ! 312: register char *mstring; /* where to put whatever matches a \p */ ! 313: { ! 314: register char *cs; /* the current symbol */ ! 315: register char *ptr,*s1; /* temporary pointer */ ! 316: boolean matched; /* a temporary boolean */ ! 317: ! 318: /* initial conditions */ ! 319: if (re == NIL) ! 320: return (NIL); ! 321: cs = re; ! 322: matched = FALSE; ! 323: ! 324: /* loop till expression string is exhausted (or at least pretty tired) */ ! 325: while (*cs) { ! 326: switch (*cs & (OPER | STR | META)) { ! 327: ! 328: /* try to match a string */ ! 329: case STR: ! 330: matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); ! 331: if (matched) { ! 332: ! 333: /* hoorah it matches */ ! 334: s += SCNT(cs); ! 335: cs = SNEXT(cs); ! 336: } else if (*cs & ALT) { ! 337: ! 338: /* alternation, skip to next expression */ ! 339: cs = SNEXT(cs); ! 340: } else if (*cs & OPT) { ! 341: ! 342: /* the match is optional */ ! 343: cs = SNEXT(cs); ! 344: matched = 1; /* indicate a successful match */ ! 345: } else { ! 346: ! 347: /* no match, error return */ ! 348: return (NIL); ! 349: } ! 350: break; ! 351: ! 352: /* an operator, do something fancy */ ! 353: case OPER: ! 354: switch (OSYM(cs)) { ! 355: ! 356: /* this is an alternation */ ! 357: case '|': ! 358: if (matched) ! 359: ! 360: /* last thing in the alternation was a match, skip ahead */ ! 361: cs = OPTR(cs); ! 362: else ! 363: ! 364: /* no match, keep trying */ ! 365: cs = ONEXT(cs); ! 366: break; ! 367: ! 368: /* this is a grouping, recurse */ ! 369: case '(': ! 370: ptr = expmatch (s, ONEXT(cs), mstring); ! 371: if (ptr != NIL) { ! 372: ! 373: /* the subexpression matched */ ! 374: matched = 1; ! 375: s = ptr; ! 376: } else if (*cs & ALT) { ! 377: ! 378: /* alternation, skip to next expression */ ! 379: matched = 0; ! 380: } else if (*cs & OPT) { ! 381: ! 382: /* the match is optional */ ! 383: matched = 1; /* indicate a successful match */ ! 384: } else { ! 385: ! 386: /* no match, error return */ ! 387: return (NIL); ! 388: } ! 389: cs = OPTR(cs); ! 390: break; ! 391: } ! 392: break; ! 393: ! 394: /* try to match a metasymbol */ ! 395: case META: ! 396: switch (MSYM(cs)) { ! 397: ! 398: /* try to match anything and remember what was matched */ ! 399: case 'p': ! 400: /* ! 401: * This is really the same as trying the match the ! 402: * remaining parts of the expression to any subset ! 403: * of the string. ! 404: */ ! 405: s1 = s; ! 406: do { ! 407: ptr = expmatch (s1, MNEXT(cs), mstring); ! 408: if (ptr != NIL && s1 != s) { ! 409: ! 410: /* we have a match, remember the match */ ! 411: strncpy (mstring, s, s1 - s); ! 412: mstring[s1 - s] = '\0'; ! 413: return (ptr); ! 414: } else if (ptr != NIL && (*cs & OPT)) { ! 415: ! 416: /* it was aoptional so no match is ok */ ! 417: return (ptr); ! 418: } else if (ptr != NIL) { ! 419: ! 420: /* not optional and we still matched */ ! 421: return (NIL); ! 422: } ! 423: if (!isalnum(*s1) && *s1 != '_') ! 424: return (NIL); ! 425: if (*s1 == '\\') ! 426: _escaped = _escaped ? FALSE : TRUE; ! 427: else ! 428: _escaped = FALSE; ! 429: } while (*s1++); ! 430: return (NIL); ! 431: ! 432: /* try to match anything */ ! 433: case 'a': ! 434: /* ! 435: * This is really the same as trying the match the ! 436: * remaining parts of the expression to any subset ! 437: * of the string. ! 438: */ ! 439: s1 = s; ! 440: do { ! 441: ptr = expmatch (s1, MNEXT(cs), mstring); ! 442: if (ptr != NIL && s1 != s) { ! 443: ! 444: /* we have a match */ ! 445: return (ptr); ! 446: } else if (ptr != NIL && (*cs & OPT)) { ! 447: ! 448: /* it was aoptional so no match is ok */ ! 449: return (ptr); ! 450: } else if (ptr != NIL) { ! 451: ! 452: /* not optional and we still matched */ ! 453: return (NIL); ! 454: } ! 455: if (*s1 == '\\') ! 456: _escaped = _escaped ? FALSE : TRUE; ! 457: else ! 458: _escaped = FALSE; ! 459: } while (*s1++); ! 460: return (NIL); ! 461: ! 462: /* fail if we are currently _escaped */ ! 463: case 'e': ! 464: if (_escaped) ! 465: return(NIL); ! 466: cs = MNEXT(cs); ! 467: break; ! 468: ! 469: /* match any number of tabs and spaces */ ! 470: case 'd': ! 471: ptr = s; ! 472: while (*s == ' ' || *s == '\t') ! 473: s++; ! 474: if (s != ptr || s == _start) { ! 475: ! 476: /* match, be happy */ ! 477: matched = 1; ! 478: cs = MNEXT(cs); ! 479: } else if (*s == '\n' || *s == '\0') { ! 480: ! 481: /* match, be happy */ ! 482: matched = 1; ! 483: cs = MNEXT(cs); ! 484: } else if (*cs & ALT) { ! 485: ! 486: /* try the next part */ ! 487: matched = 0; ! 488: cs = MNEXT(cs); ! 489: } else if (*cs & OPT) { ! 490: ! 491: /* doesn't matter */ ! 492: matched = 1; ! 493: cs = MNEXT(cs); ! 494: } else ! 495: ! 496: /* no match, error return */ ! 497: return (NIL); ! 498: break; ! 499: ! 500: /* check for end of line */ ! 501: case '$': ! 502: if (*s == '\0' || *s == '\n') { ! 503: ! 504: /* match, be happy */ ! 505: s++; ! 506: matched = 1; ! 507: cs = MNEXT(cs); ! 508: } else if (*cs & ALT) { ! 509: ! 510: /* try the next part */ ! 511: matched = 0; ! 512: cs = MNEXT(cs); ! 513: } else if (*cs & OPT) { ! 514: ! 515: /* doesn't matter */ ! 516: matched = 1; ! 517: cs = MNEXT(cs); ! 518: } else ! 519: ! 520: /* no match, error return */ ! 521: return (NIL); ! 522: break; ! 523: ! 524: /* check for start of line */ ! 525: case '^': ! 526: if (s == _start) { ! 527: ! 528: /* match, be happy */ ! 529: matched = 1; ! 530: cs = MNEXT(cs); ! 531: } else if (*cs & ALT) { ! 532: ! 533: /* try the next part */ ! 534: matched = 0; ! 535: cs = MNEXT(cs); ! 536: } else if (*cs & OPT) { ! 537: ! 538: /* doesn't matter */ ! 539: matched = 1; ! 540: cs = MNEXT(cs); ! 541: } else ! 542: ! 543: /* no match, error return */ ! 544: return (NIL); ! 545: break; ! 546: ! 547: /* end of a subexpression, return success */ ! 548: case ')': ! 549: return (s); ! 550: } ! 551: break; ! 552: } ! 553: } ! 554: return (s); ! 555: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.