|
|
1.1 ! root 1: /* ! 2: * Copyright (c) 1980 The Regents of the University of California. ! 3: * All rights reserved. ! 4: * ! 5: * Redistribution and use in source and binary forms are permitted ! 6: * provided that: (1) source distributions retain this entire copyright ! 7: * notice and comment, and (2) distributions including binaries display ! 8: * the following acknowledgement: ``This product includes software ! 9: * developed by the University of California, Berkeley and its contributors'' ! 10: * in the documentation or other materials provided with the distribution ! 11: * and in all advertising materials mentioning features or use of this ! 12: * software. Neither the name of the University nor the names of its ! 13: * contributors may be used to endorse or promote products derived ! 14: * from this software without specific prior written permission. ! 15: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR ! 16: * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED ! 17: * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. ! 18: */ ! 19: ! 20: #ifndef lint ! 21: static char sccsid[] = "@(#)regexp.c 5.3 (Berkeley) 6/1/90"; ! 22: #endif /* not lint */ ! 23: ! 24: #include <ctype.h> ! 25: ! 26: typedef int boolean; ! 27: #define TRUE 1 ! 28: #define FALSE 0 ! 29: #define NIL 0 ! 30: ! 31: boolean l_onecase; /* true if upper and lower equivalent */ ! 32: ! 33: #define makelower(c) (isupper((c)) ? tolower((c)) : (c)) ! 34: ! 35: /* STRNCMP - like strncmp except that we convert the ! 36: * first string to lower case before comparing ! 37: * if l_onecase is set. ! 38: */ ! 39: ! 40: STRNCMP(s1, s2, len) ! 41: register char *s1,*s2; ! 42: register int len; ! 43: { ! 44: if (l_onecase) { ! 45: do ! 46: if (*s2 - makelower(*s1)) ! 47: return (*s2 - makelower(*s1)); ! 48: else { ! 49: s2++; ! 50: s1++; ! 51: } ! 52: while (--len); ! 53: } else { ! 54: do ! 55: if (*s2 - *s1) ! 56: return (*s2 - *s1); ! 57: else { ! 58: s2++; ! 59: s1++; ! 60: } ! 61: while (--len); ! 62: } ! 63: return(0); ! 64: } ! 65: ! 66: /* The following routine converts an irregular expression to ! 67: * internal format. ! 68: * ! 69: * Either meta symbols (\a \d or \p) or character strings or ! 70: * operations ( alternation or perenthesizing ) can be ! 71: * specified. Each starts with a descriptor byte. The descriptor ! 72: * byte has STR set for strings, META set for meta symbols ! 73: * and OPER set for operations. ! 74: * The descriptor byte can also have the OPT bit set if the object ! 75: * defined is optional. Also ALT can be set to indicate an alternation. ! 76: * ! 77: * For metasymbols the byte following the descriptor byte identities ! 78: * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For ! 79: * strings the byte after the descriptor is a character count for ! 80: * the string: ! 81: * ! 82: * meta symbols := descriptor ! 83: * symbol ! 84: * ! 85: * strings := descriptor ! 86: * character count ! 87: * the string ! 88: * ! 89: * operatins := descriptor ! 90: * symbol ! 91: * character count ! 92: */ ! 93: ! 94: /* ! 95: * handy macros for accessing parts of match blocks ! 96: */ ! 97: #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */ ! 98: #define MNEXT(A) (A+2) /* character following a metasymbol block */ ! 99: ! 100: #define OSYM(A) (*(A+1)) /* symbol in an operation block */ ! 101: #define OCNT(A) (*(A+2)) /* character count */ ! 102: #define ONEXT(A) (A+3) /* next character after the operation */ ! 103: #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */ ! 104: ! 105: #define SCNT(A) (*(A+1)) /* byte count of a string */ ! 106: #define SSTR(A) (A+2) /* address of the string */ ! 107: #define SNEXT(A) (A+2+*(A+1)) /* character following the string */ ! 108: ! 109: /* ! 110: * bit flags in the descriptor ! 111: */ ! 112: #define OPT 1 ! 113: #define STR 2 ! 114: #define META 4 ! 115: #define ALT 8 ! 116: #define OPER 16 ! 117: ! 118: char *ure; /* pointer current position in unconverted exp */ ! 119: char *ccre; /* pointer to current position in converted exp*/ ! 120: char *malloc(); ! 121: ! 122: char * ! 123: convexp(re) ! 124: char *re; /* unconverted irregular expression */ ! 125: { ! 126: register char *cre; /* pointer to converted regular expression */ ! 127: ! 128: /* allocate room for the converted expression */ ! 129: if (re == NIL) ! 130: return (NIL); ! 131: if (*re == '\0') ! 132: return (NIL); ! 133: cre = malloc (4 * strlen(re) + 3); ! 134: ccre = cre; ! 135: ure = re; ! 136: ! 137: /* start the conversion with a \a */ ! 138: *cre = META | OPT; ! 139: MSYM(cre) = 'a'; ! 140: ccre = MNEXT(cre); ! 141: ! 142: /* start the conversion (its recursive) */ ! 143: expconv (); ! 144: *ccre = 0; ! 145: return (cre); ! 146: } ! 147: ! 148: expconv() ! 149: { ! 150: register char *cs; /* pointer to current symbol in converted exp */ ! 151: register char c; /* character being processed */ ! 152: register char *acs; /* pinter to last alternate */ ! 153: register int temp; ! 154: ! 155: /* let the conversion begin */ ! 156: acs = NIL; ! 157: cs = NIL; ! 158: while (*ure != NIL) { ! 159: switch (c = *ure++) { ! 160: ! 161: case '\\': ! 162: switch (c = *ure++) { ! 163: ! 164: /* escaped characters are just characters */ ! 165: default: ! 166: if (cs == NIL || (*cs & STR) == 0) { ! 167: cs = ccre; ! 168: *cs = STR; ! 169: SCNT(cs) = 1; ! 170: ccre += 2; ! 171: } else ! 172: SCNT(cs)++; ! 173: *ccre++ = c; ! 174: break; ! 175: ! 176: /* normal(?) metacharacters */ ! 177: case 'a': ! 178: case 'd': ! 179: case 'e': ! 180: case 'p': ! 181: if (acs != NIL && acs != cs) { ! 182: do { ! 183: temp = OCNT(acs); ! 184: OCNT(acs) = ccre - acs; ! 185: acs -= temp; ! 186: } while (temp != 0); ! 187: acs = NIL; ! 188: } ! 189: cs = ccre; ! 190: *cs = META; ! 191: MSYM(cs) = c; ! 192: ccre = MNEXT(cs); ! 193: break; ! 194: } ! 195: break; ! 196: ! 197: /* just put the symbol in */ ! 198: case '^': ! 199: case '$': ! 200: if (acs != NIL && acs != cs) { ! 201: do { ! 202: temp = OCNT(acs); ! 203: OCNT(acs) = ccre - acs; ! 204: acs -= temp; ! 205: } while (temp != 0); ! 206: acs = NIL; ! 207: } ! 208: cs = ccre; ! 209: *cs = META; ! 210: MSYM(cs) = c; ! 211: ccre = MNEXT(cs); ! 212: break; ! 213: ! 214: /* mark the last match sequence as optional */ ! 215: case '?': ! 216: if (cs) ! 217: *cs = *cs | OPT; ! 218: break; ! 219: ! 220: /* recurse and define a subexpression */ ! 221: case '(': ! 222: if (acs != NIL && acs != cs) { ! 223: do { ! 224: temp = OCNT(acs); ! 225: OCNT(acs) = ccre - acs; ! 226: acs -= temp; ! 227: } while (temp != 0); ! 228: acs = NIL; ! 229: } ! 230: cs = ccre; ! 231: *cs = OPER; ! 232: OSYM(cs) = '('; ! 233: ccre = ONEXT(cs); ! 234: expconv (); ! 235: OCNT(cs) = ccre - cs; /* offset to next symbol */ ! 236: break; ! 237: ! 238: /* return from a recursion */ ! 239: case ')': ! 240: if (acs != NIL) { ! 241: do { ! 242: temp = OCNT(acs); ! 243: OCNT(acs) = ccre - acs; ! 244: acs -= temp; ! 245: } while (temp != 0); ! 246: acs = NIL; ! 247: } ! 248: cs = ccre; ! 249: *cs = META; ! 250: MSYM(cs) = c; ! 251: ccre = MNEXT(cs); ! 252: return; ! 253: ! 254: /* mark the last match sequence as having an alternate */ ! 255: /* the third byte will contain an offset to jump over the */ ! 256: /* alternate match in case the first did not fail */ ! 257: case '|': ! 258: if (acs != NIL && acs != cs) ! 259: OCNT(ccre) = ccre - acs; /* make a back pointer */ ! 260: else ! 261: OCNT(ccre) = 0; ! 262: *cs |= ALT; ! 263: cs = ccre; ! 264: *cs = OPER; ! 265: OSYM(cs) = '|'; ! 266: ccre = ONEXT(cs); ! 267: acs = cs; /* remember that the pointer is to be filles */ ! 268: break; ! 269: ! 270: /* if its not a metasymbol just build a scharacter string */ ! 271: default: ! 272: if (cs == NIL || (*cs & STR) == 0) { ! 273: cs = ccre; ! 274: *cs = STR; ! 275: SCNT(cs) = 1; ! 276: ccre = SSTR(cs); ! 277: } else ! 278: SCNT(cs)++; ! 279: *ccre++ = c; ! 280: break; ! 281: } ! 282: } ! 283: if (acs != NIL) { ! 284: do { ! 285: temp = OCNT(acs); ! 286: OCNT(acs) = ccre - acs; ! 287: acs -= temp; ! 288: } while (temp != 0); ! 289: acs = NIL; ! 290: } ! 291: return; ! 292: } ! 293: /* end of convertre */ ! 294: ! 295: ! 296: /* ! 297: * The following routine recognises an irregular expresion ! 298: * with the following special characters: ! 299: * ! 300: * \? - means last match was optional ! 301: * \a - matches any number of characters ! 302: * \d - matches any number of spaces and tabs ! 303: * \p - matches any number of alphanumeric ! 304: * characters. The ! 305: * characters matched will be copied into ! 306: * the area pointed to by 'name'. ! 307: * \| - alternation ! 308: * \( \) - grouping used mostly for alternation and ! 309: * optionality ! 310: * ! 311: * The irregular expression must be translated to internal form ! 312: * prior to calling this routine ! 313: * ! 314: * The value returned is the pointer to the first non \a ! 315: * character matched. ! 316: */ ! 317: ! 318: boolean _escaped; /* true if we are currently _escaped */ ! 319: char *_start; /* start of string */ ! 320: ! 321: char * ! 322: expmatch (s, re, mstring) ! 323: register char *s; /* string to check for a match in */ ! 324: register char *re; /* a converted irregular expression */ ! 325: register char *mstring; /* where to put whatever matches a \p */ ! 326: { ! 327: register char *cs; /* the current symbol */ ! 328: register char *ptr,*s1; /* temporary pointer */ ! 329: boolean matched; /* a temporary boolean */ ! 330: ! 331: /* initial conditions */ ! 332: if (re == NIL) ! 333: return (NIL); ! 334: cs = re; ! 335: matched = FALSE; ! 336: ! 337: /* loop till expression string is exhausted (or at least pretty tired) */ ! 338: while (*cs) { ! 339: switch (*cs & (OPER | STR | META)) { ! 340: ! 341: /* try to match a string */ ! 342: case STR: ! 343: matched = !STRNCMP (s, SSTR(cs), SCNT(cs)); ! 344: if (matched) { ! 345: ! 346: /* hoorah it matches */ ! 347: s += SCNT(cs); ! 348: cs = SNEXT(cs); ! 349: } else if (*cs & ALT) { ! 350: ! 351: /* alternation, skip to next expression */ ! 352: cs = SNEXT(cs); ! 353: } else if (*cs & OPT) { ! 354: ! 355: /* the match is optional */ ! 356: cs = SNEXT(cs); ! 357: matched = 1; /* indicate a successful match */ ! 358: } else { ! 359: ! 360: /* no match, error return */ ! 361: return (NIL); ! 362: } ! 363: break; ! 364: ! 365: /* an operator, do something fancy */ ! 366: case OPER: ! 367: switch (OSYM(cs)) { ! 368: ! 369: /* this is an alternation */ ! 370: case '|': ! 371: if (matched) ! 372: ! 373: /* last thing in the alternation was a match, skip ahead */ ! 374: cs = OPTR(cs); ! 375: else ! 376: ! 377: /* no match, keep trying */ ! 378: cs = ONEXT(cs); ! 379: break; ! 380: ! 381: /* this is a grouping, recurse */ ! 382: case '(': ! 383: ptr = expmatch (s, ONEXT(cs), mstring); ! 384: if (ptr != NIL) { ! 385: ! 386: /* the subexpression matched */ ! 387: matched = 1; ! 388: s = ptr; ! 389: } else if (*cs & ALT) { ! 390: ! 391: /* alternation, skip to next expression */ ! 392: matched = 0; ! 393: } else if (*cs & OPT) { ! 394: ! 395: /* the match is optional */ ! 396: matched = 1; /* indicate a successful match */ ! 397: } else { ! 398: ! 399: /* no match, error return */ ! 400: return (NIL); ! 401: } ! 402: cs = OPTR(cs); ! 403: break; ! 404: } ! 405: break; ! 406: ! 407: /* try to match a metasymbol */ ! 408: case META: ! 409: switch (MSYM(cs)) { ! 410: ! 411: /* try to match anything and remember what was matched */ ! 412: case 'p': ! 413: /* ! 414: * This is really the same as trying the match the ! 415: * remaining parts of the expression to any subset ! 416: * of the string. ! 417: */ ! 418: s1 = s; ! 419: do { ! 420: ptr = expmatch (s1, MNEXT(cs), mstring); ! 421: if (ptr != NIL && s1 != s) { ! 422: ! 423: /* we have a match, remember the match */ ! 424: strncpy (mstring, s, s1 - s); ! 425: mstring[s1 - s] = '\0'; ! 426: return (ptr); ! 427: } else if (ptr != NIL && (*cs & OPT)) { ! 428: ! 429: /* it was aoptional so no match is ok */ ! 430: return (ptr); ! 431: } else if (ptr != NIL) { ! 432: ! 433: /* not optional and we still matched */ ! 434: return (NIL); ! 435: } ! 436: if (!isalnum(*s1) && *s1 != '_') ! 437: return (NIL); ! 438: if (*s1 == '\\') ! 439: _escaped = _escaped ? FALSE : TRUE; ! 440: else ! 441: _escaped = FALSE; ! 442: } while (*s1++); ! 443: return (NIL); ! 444: ! 445: /* try to match anything */ ! 446: case 'a': ! 447: /* ! 448: * This is really the same as trying the match the ! 449: * remaining parts of the expression to any subset ! 450: * of the string. ! 451: */ ! 452: s1 = s; ! 453: do { ! 454: ptr = expmatch (s1, MNEXT(cs), mstring); ! 455: if (ptr != NIL && s1 != s) { ! 456: ! 457: /* we have a match */ ! 458: return (ptr); ! 459: } else if (ptr != NIL && (*cs & OPT)) { ! 460: ! 461: /* it was aoptional so no match is ok */ ! 462: return (ptr); ! 463: } else if (ptr != NIL) { ! 464: ! 465: /* not optional and we still matched */ ! 466: return (NIL); ! 467: } ! 468: if (*s1 == '\\') ! 469: _escaped = _escaped ? FALSE : TRUE; ! 470: else ! 471: _escaped = FALSE; ! 472: } while (*s1++); ! 473: return (NIL); ! 474: ! 475: /* fail if we are currently _escaped */ ! 476: case 'e': ! 477: if (_escaped) ! 478: return(NIL); ! 479: cs = MNEXT(cs); ! 480: break; ! 481: ! 482: /* match any number of tabs and spaces */ ! 483: case 'd': ! 484: ptr = s; ! 485: while (*s == ' ' || *s == '\t') ! 486: s++; ! 487: if (s != ptr || s == _start) { ! 488: ! 489: /* match, be happy */ ! 490: matched = 1; ! 491: cs = MNEXT(cs); ! 492: } else if (*s == '\n' || *s == '\0') { ! 493: ! 494: /* match, be happy */ ! 495: matched = 1; ! 496: cs = MNEXT(cs); ! 497: } else if (*cs & ALT) { ! 498: ! 499: /* try the next part */ ! 500: matched = 0; ! 501: cs = MNEXT(cs); ! 502: } else if (*cs & OPT) { ! 503: ! 504: /* doesn't matter */ ! 505: matched = 1; ! 506: cs = MNEXT(cs); ! 507: } else ! 508: ! 509: /* no match, error return */ ! 510: return (NIL); ! 511: break; ! 512: ! 513: /* check for end of line */ ! 514: case '$': ! 515: if (*s == '\0' || *s == '\n') { ! 516: ! 517: /* match, be happy */ ! 518: s++; ! 519: matched = 1; ! 520: cs = MNEXT(cs); ! 521: } else if (*cs & ALT) { ! 522: ! 523: /* try the next part */ ! 524: matched = 0; ! 525: cs = MNEXT(cs); ! 526: } else if (*cs & OPT) { ! 527: ! 528: /* doesn't matter */ ! 529: matched = 1; ! 530: cs = MNEXT(cs); ! 531: } else ! 532: ! 533: /* no match, error return */ ! 534: return (NIL); ! 535: break; ! 536: ! 537: /* check for start of line */ ! 538: case '^': ! 539: if (s == _start) { ! 540: ! 541: /* match, be happy */ ! 542: matched = 1; ! 543: cs = MNEXT(cs); ! 544: } else if (*cs & ALT) { ! 545: ! 546: /* try the next part */ ! 547: matched = 0; ! 548: cs = MNEXT(cs); ! 549: } else if (*cs & OPT) { ! 550: ! 551: /* doesn't matter */ ! 552: matched = 1; ! 553: cs = MNEXT(cs); ! 554: } else ! 555: ! 556: /* no match, error return */ ! 557: return (NIL); ! 558: break; ! 559: ! 560: /* end of a subexpression, return success */ ! 561: case ')': ! 562: return (s); ! 563: } ! 564: break; ! 565: } ! 566: } ! 567: return (s); ! 568: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.