|
|
1.1 ! root 1: /* $Header: search.c,v 4.3 85/05/01 11:50:16 lwall Exp $ ! 2: * ! 3: * $Log: search.c,v $ ! 4: * Revision 4.3 85/05/01 11:50:16 lwall ! 5: * Baseline for release with 4.3bsd. ! 6: * ! 7: */ ! 8: ! 9: /* string search routines */ ! 10: ! 11: /* Copyright (c) 1981,1980 James Gosling */ ! 12: ! 13: /* Modified Aug. 12, 1981 by Tom London to include regular expressions ! 14: as in ed. RE stuff hacked over by jag to correct a few major problems, ! 15: mainly dealing with searching within the buffer rather than copying ! 16: each line to a separate array. Newlines can now appear in RE's */ ! 17: ! 18: /* Ripped to shreds and glued back together to make a search package, ! 19: * July 6, 1984, by Larry Wall. (If it doesn't work, it's probably my fault.) ! 20: * Changes include: ! 21: * Buffer, window, and mlisp stuff gone. ! 22: * Translation tables reduced to 1 table. ! 23: * Expression buffer is now dynamically allocated. ! 24: * Character classes now implemented with a bitmap. ! 25: */ ! 26: ! 27: #include "EXTERN.h" ! 28: #include "common.h" ! 29: #include "util.h" ! 30: #include "INTERN.h" ! 31: #include "search.h" ! 32: ! 33: #ifndef BITSPERBYTE ! 34: #define BITSPERBYTE 8 ! 35: #endif ! 36: ! 37: #define BMAPSIZ (127 / BITSPERBYTE + 1) ! 38: ! 39: /* meta characters in the "compiled" form of a regular expression */ ! 40: #define CBRA 2 /* \( -- begin bracket */ ! 41: #define CCHR 4 /* a vanilla character */ ! 42: #define CDOT 6 /* . -- match anything except a newline */ ! 43: #define CCL 8 /* [...] -- character class */ ! 44: #define NCCL 10 /* [^...] -- negated character class */ ! 45: #define CDOL 12 /* $ -- matches the end of a line */ ! 46: #define CEND 14 /* The end of the pattern */ ! 47: #define CKET 16 /* \) -- close bracket */ ! 48: #define CBACK 18 /* \N -- backreference to the Nth bracketed ! 49: string */ ! 50: #define CIRC 20 /* ^ matches the beginning of a line */ ! 51: ! 52: #define WORD 32 /* matches word character \w */ ! 53: #define NWORD 34 /* matches non-word characer \W */ ! 54: #define WBOUND 36 /* matches word boundary \b */ ! 55: #define NWBOUND 38 /* matches non-(word boundary) \B */ ! 56: ! 57: #define STAR 01 /* * -- Kleene star, repeats the previous ! 58: REas many times as possible; the value ! 59: ORs with the other operator types */ ! 60: ! 61: #define ASCSIZ 0200 ! 62: typedef char TRANSTABLE[ASCSIZ]; ! 63: ! 64: static TRANSTABLE trans = { ! 65: 0000,0001,0002,0003,0004,0005,0006,0007, ! 66: 0010,0011,0012,0013,0014,0015,0016,0017, ! 67: 0020,0021,0022,0023,0024,0025,0026,0027, ! 68: 0030,0031,0032,0033,0034,0035,0036,0037, ! 69: 0040,0041,0042,0043,0044,0045,0046,0047, ! 70: 0050,0051,0052,0053,0054,0055,0056,0057, ! 71: 0060,0061,0062,0063,0064,0065,0066,0067, ! 72: 0070,0071,0072,0073,0074,0075,0076,0077, ! 73: 0100,0101,0102,0103,0104,0105,0106,0107, ! 74: 0110,0111,0112,0113,0114,0115,0116,0117, ! 75: 0120,0121,0122,0123,0124,0125,0126,0127, ! 76: 0130,0131,0132,0133,0134,0135,0136,0137, ! 77: 0140,0141,0142,0143,0144,0145,0146,0147, ! 78: 0150,0151,0152,0153,0154,0155,0156,0157, ! 79: 0160,0161,0162,0163,0164,0165,0166,0167, ! 80: 0170,0171,0172,0173,0174,0175,0176,0177, ! 81: }; ! 82: static bool folding = FALSE; ! 83: ! 84: static int err; ! 85: static char *FirstCharacter; ! 86: ! 87: void ! 88: search_init() ! 89: { ! 90: #ifdef UNDEF ! 91: register int i; ! 92: ! 93: for (i = 0; i < ASCSIZ; i++) ! 94: trans[i] = i; ! 95: #else ! 96: ; ! 97: #endif ! 98: } ! 99: ! 100: void ! 101: init_compex(compex) ! 102: register COMPEX *compex; ! 103: { ! 104: /* the following must start off zeroed */ ! 105: ! 106: compex->eblen = 0; ! 107: compex->brastr = Nullch; ! 108: } ! 109: ! 110: void ! 111: free_compex(compex) ! 112: register COMPEX *compex; ! 113: { ! 114: if (compex->eblen) { ! 115: free(compex->expbuf); ! 116: compex->eblen = 0; ! 117: } ! 118: if (compex->brastr) { ! 119: free(compex->brastr); ! 120: compex->brastr = Nullch; ! 121: } ! 122: } ! 123: ! 124: static char *gbr_str = Nullch; ! 125: static int gbr_siz = 0; ! 126: ! 127: char * ! 128: getbracket(compex,n) ! 129: register COMPEX *compex; ! 130: int n; ! 131: { ! 132: int length = compex->braelist[n] - compex->braslist[n]; ! 133: ! 134: if (!compex->nbra || n > compex->nbra || !compex->braelist[n] || length<0) ! 135: return nullstr; ! 136: growstr(&gbr_str, &gbr_siz, length+1); ! 137: safecpy(gbr_str, compex->braslist[n], length+1); ! 138: return gbr_str; ! 139: } ! 140: ! 141: void ! 142: case_fold(which) ! 143: int which; ! 144: { ! 145: register int i; ! 146: ! 147: if (which != folding) { ! 148: if (which) { ! 149: for (i = 'A'; i <= 'Z'; i++) ! 150: trans[i] = tolower(i); ! 151: } ! 152: else { ! 153: for (i = 'A'; i <= 'Z'; i++) ! 154: trans[i] = i; ! 155: } ! 156: folding = which; ! 157: } ! 158: } ! 159: ! 160: /* Compile the given regular expression into a [secret] internal format */ ! 161: ! 162: char * ! 163: compile (compex, strp, RE, fold) ! 164: register COMPEX *compex; ! 165: register char *strp; ! 166: int RE; ! 167: int fold; ! 168: { ! 169: register int c; ! 170: register char *ep; ! 171: char *lastep; ! 172: char bracket[NBRA], ! 173: *bracketp; ! 174: char **alt = compex->alternatives; ! 175: char *retmes = "Badly formed search string"; ! 176: ! 177: case_fold(compex->do_folding = fold); ! 178: if (!compex->eblen) { ! 179: compex->expbuf = safemalloc(84); ! 180: compex->eblen = 80; ! 181: } ! 182: ep = compex->expbuf; /* point at expression buffer */ ! 183: *alt++ = ep; /* first alternative starts here */ ! 184: bracketp = bracket; /* first bracket goes here */ ! 185: if (*strp == 0) { /* nothing to compile? */ ! 186: if (*ep == 0) /* nothing there yet? */ ! 187: return "Null search string"; ! 188: return Nullch; /* just keep old expression */ ! 189: } ! 190: compex->nbra = 0; /* no brackets yet */ ! 191: lastep = 0; ! 192: for (;;) { ! 193: if (ep - compex->expbuf >= compex->eblen) ! 194: grow_eb(compex); ! 195: c = *strp++; /* fetch next char of pattern */ ! 196: if (c == 0) { /* end of pattern? */ ! 197: if (bracketp != bracket) { /* balanced brackets? */ ! 198: #ifdef VERBOSE ! 199: retmes = "Unbalanced parens"; ! 200: #endif ! 201: goto cerror; ! 202: } ! 203: *ep++ = CEND; /* terminate expression */ ! 204: *alt++ = 0; /* terminal alternative list */ ! 205: /* ! 206: compex->eblen = ep - compex->expbuf + 1; ! 207: compex->expbuf = saferealloc(compex->expbuf,compex->eblen+4); */ ! 208: return Nullch; /* return success */ ! 209: } ! 210: if (c != '*') ! 211: lastep = ep; ! 212: if (!RE) { /* just a normal search string? */ ! 213: *ep++ = CCHR; /* everything is a normal char */ ! 214: *ep++ = c; ! 215: } ! 216: else /* it is a regular expression */ ! 217: switch (c) { ! 218: ! 219: case '\\': /* meta something */ ! 220: switch (c = *strp++) { ! 221: case '(': ! 222: if (compex->nbra >= NBRA) { ! 223: #ifdef VERBOSE ! 224: retmes = "Too many parens"; ! 225: #endif ! 226: goto cerror; ! 227: } ! 228: *bracketp++ = ++compex->nbra; ! 229: *ep++ = CBRA; ! 230: *ep++ = compex->nbra; ! 231: break; ! 232: case '|': ! 233: if (bracketp>bracket) { ! 234: #ifdef VERBOSE ! 235: retmes = "No \\| in parens"; /* Alas! */ ! 236: #endif ! 237: goto cerror; ! 238: } ! 239: *ep++ = CEND; ! 240: *alt++ = ep; ! 241: break; ! 242: case ')': ! 243: if (bracketp <= bracket) { ! 244: #ifdef VERBOSE ! 245: retmes = "Unmatched right paren"; ! 246: #endif ! 247: goto cerror; ! 248: } ! 249: *ep++ = CKET; ! 250: *ep++ = *--bracketp; ! 251: break; ! 252: case 'w': ! 253: *ep++ = WORD; ! 254: break; ! 255: case 'W': ! 256: *ep++ = NWORD; ! 257: break; ! 258: case 'b': ! 259: *ep++ = WBOUND; ! 260: break; ! 261: case 'B': ! 262: *ep++ = NWBOUND; ! 263: break; ! 264: case '0': case '1': case '2': case '3': case '4': ! 265: case '5': case '6': case '7': case '8': case '9': ! 266: *ep++ = CBACK; ! 267: *ep++ = c - '0'; ! 268: break; ! 269: default: ! 270: *ep++ = CCHR; ! 271: if (c == '\0') ! 272: goto cerror; ! 273: *ep++ = c; ! 274: break; ! 275: } ! 276: break; ! 277: case '.': ! 278: *ep++ = CDOT; ! 279: continue; ! 280: ! 281: case '*': ! 282: if (lastep == 0 || *lastep == CBRA || *lastep == CKET ! 283: || *lastep == CIRC ! 284: || (*lastep&STAR)|| *lastep>NWORD) ! 285: goto defchar; ! 286: *lastep |= STAR; ! 287: continue; ! 288: ! 289: case '^': ! 290: if (ep != compex->expbuf && ep[-1] != CEND) ! 291: goto defchar; ! 292: *ep++ = CIRC; ! 293: continue; ! 294: ! 295: case '$': ! 296: if (*strp != 0 && (*strp != '\\' || strp[1] != '|')) ! 297: goto defchar; ! 298: *ep++ = CDOL; ! 299: continue; ! 300: ! 301: case '[': { /* character class */ ! 302: register int i; ! 303: ! 304: if (ep - compex->expbuf >= compex->eblen - BMAPSIZ) ! 305: grow_eb(compex); /* reserve bitmap */ ! 306: for (i = BMAPSIZ; i; --i) ! 307: ep[i] = 0; ! 308: ! 309: if ((c = *strp++) == '^') { ! 310: c = *strp++; ! 311: *ep++ = NCCL; /* negated */ ! 312: } ! 313: else ! 314: *ep++ = CCL; /* normal */ ! 315: ! 316: i = 0; /* remember oldchar */ ! 317: do { ! 318: if (c == '\0') { ! 319: #ifdef VERBOSE ! 320: retmes = "Missing ]"; ! 321: #endif ! 322: goto cerror; ! 323: } ! 324: if (*strp == '-' && *(++strp)) ! 325: i = *strp++; ! 326: else ! 327: i = c; ! 328: while (c <= i) { ! 329: ep[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE); ! 330: if (fold && isalpha(c)) ! 331: ep[(c ^ 32) / BITSPERBYTE] |= ! 332: 1 << ((c ^ 32) % BITSPERBYTE); ! 333: /* set the other bit too */ ! 334: c++; ! 335: } ! 336: } while ((c = *strp++) != ']'); ! 337: ep += BMAPSIZ; ! 338: continue; ! 339: } ! 340: ! 341: defchar: ! 342: default: ! 343: *ep++ = CCHR; ! 344: *ep++ = c; ! 345: } ! 346: } ! 347: cerror: ! 348: compex->expbuf[0] = 0; ! 349: compex->nbra = 0; ! 350: return retmes; ! 351: } ! 352: ! 353: void ! 354: grow_eb(compex) ! 355: register COMPEX *compex; ! 356: { ! 357: compex->eblen += 80; ! 358: compex->expbuf = saferealloc(compex->expbuf, (MEM_SIZE)compex->eblen + 4); ! 359: } ! 360: ! 361: char * ! 362: execute (compex, addr) ! 363: register COMPEX *compex; ! 364: char *addr; ! 365: { ! 366: register char *p1 = addr; ! 367: register char *trt = trans; ! 368: register int c; ! 369: ! 370: if (addr == Nullch) ! 371: return Nullch; ! 372: if (compex->nbra) { /* any brackets? */ ! 373: for (c = 0; c <= compex->nbra; c++) ! 374: compex->braslist[c] = compex->braelist[c] = Nullch; ! 375: if (compex->brastr) ! 376: free(compex->brastr); ! 377: compex->brastr = savestr(p1); /* in case p1 is not static */ ! 378: p1 = compex->brastr; /* ! */ ! 379: } ! 380: case_fold(compex->do_folding); /* make sure table is correct */ ! 381: FirstCharacter = p1; /* for ^ tests */ ! 382: if (compex->expbuf[0] == CCHR && !compex->alternatives[1]) { ! 383: c = trt[compex->expbuf[1]]; /* fast check for first character */ ! 384: do { ! 385: if (trt[*p1] == c && advance (compex, p1, compex->expbuf)) ! 386: return p1; ! 387: p1++; ! 388: } while (*p1 && !err); ! 389: return Nullch; ! 390: } ! 391: else { /* regular algorithm */ ! 392: do { ! 393: register char **alt = compex->alternatives; ! 394: while (*alt) { ! 395: if (advance (compex, p1, *alt++)) ! 396: return p1; ! 397: } ! 398: p1++; ! 399: } while (*p1 && !err); ! 400: return Nullch; ! 401: } ! 402: } ! 403: ! 404: /* advance the match of the regular expression starting at ep along the ! 405: string lp, simulates an NDFSA */ ! 406: bool ! 407: advance (compex, lp, ep) ! 408: register COMPEX *compex; ! 409: register char *ep; ! 410: register char *lp; ! 411: { ! 412: register char *curlp; ! 413: register char *trt = trans; ! 414: register int i; ! 415: ! 416: while ((*ep & STAR) || *lp || *ep == CIRC || *ep == CKET) ! 417: switch (*ep++) { ! 418: ! 419: case CCHR: ! 420: if (trt[*ep++] != trt[*lp]) return FALSE; ! 421: lp++; ! 422: continue; ! 423: ! 424: case CDOT: ! 425: if (*lp == '\n') return FALSE; ! 426: lp++; ! 427: continue; ! 428: ! 429: case CDOL: ! 430: if (!*lp || *lp == '\n') ! 431: continue; ! 432: return FALSE; ! 433: ! 434: case CIRC: ! 435: if (lp == FirstCharacter || lp[-1]=='\n') ! 436: continue; ! 437: return FALSE; ! 438: ! 439: case WORD: ! 440: if (isalnum(*lp)) { ! 441: lp++; ! 442: continue; ! 443: } ! 444: return FALSE; ! 445: ! 446: case NWORD: ! 447: if (!isalnum(*lp)) { ! 448: lp++; ! 449: continue; ! 450: } ! 451: return FALSE; ! 452: ! 453: case WBOUND: ! 454: if ((lp == FirstCharacter || !isalnum(lp[-1])) != ! 455: (!*lp || !isalnum(*lp)) ) ! 456: continue; ! 457: return FALSE; ! 458: ! 459: case NWBOUND: ! 460: if ((lp == FirstCharacter || !isalnum(lp[-1])) == ! 461: (!*lp || !isalnum(*lp))) ! 462: continue; ! 463: return FALSE; ! 464: ! 465: case CEND: ! 466: return TRUE; ! 467: ! 468: case CCL: ! 469: if (cclass (ep, *lp, 1)) { ! 470: ep += BMAPSIZ; ! 471: lp++; ! 472: continue; ! 473: } ! 474: return FALSE; ! 475: ! 476: case NCCL: ! 477: if (cclass (ep, *lp, 0)) { ! 478: ep += BMAPSIZ; ! 479: lp++; ! 480: continue; ! 481: } ! 482: return FALSE; ! 483: ! 484: case CBRA: ! 485: compex->braslist[*ep++] = lp; ! 486: continue; ! 487: ! 488: case CKET: ! 489: i = *ep++; ! 490: compex->braelist[i] = lp; ! 491: compex->braelist[0] = lp; ! 492: compex->braslist[0] = compex->braslist[i]; ! 493: continue; ! 494: ! 495: case CBACK: ! 496: if (compex->braelist[i = *ep++] == 0) { ! 497: fputs("bad braces\n",stdout) FLUSH; ! 498: err = TRUE; ! 499: return FALSE; ! 500: } ! 501: if (backref (compex, i, lp)) { ! 502: lp += compex->braelist[i] - compex->braslist[i]; ! 503: continue; ! 504: } ! 505: return FALSE; ! 506: ! 507: case CBACK | STAR: ! 508: if (compex->braelist[i = *ep++] == 0) { ! 509: fputs("bad braces\n",stdout) FLUSH; ! 510: err = TRUE; ! 511: return FALSE; ! 512: } ! 513: curlp = lp; ! 514: while (backref (compex, i, lp)) { ! 515: lp += compex->braelist[i] - compex->braslist[i]; ! 516: } ! 517: while (lp >= curlp) { ! 518: if (advance (compex, lp, ep)) ! 519: return TRUE; ! 520: lp -= compex->braelist[i] - compex->braslist[i]; ! 521: } ! 522: continue; ! 523: ! 524: case CDOT | STAR: ! 525: curlp = lp; ! 526: while (*lp++ && lp[-1] != '\n'); ! 527: goto star; ! 528: ! 529: case WORD | STAR: ! 530: curlp = lp; ! 531: while (*lp++ && isalnum(lp[-1])); ! 532: goto star; ! 533: ! 534: case NWORD | STAR: ! 535: curlp = lp; ! 536: while (*lp++ && !isalnum(lp[-1])); ! 537: goto star; ! 538: ! 539: case CCHR | STAR: ! 540: curlp = lp; ! 541: while (*lp++ && trt[lp[-1]] == trt[*ep]); ! 542: ep++; ! 543: goto star; ! 544: ! 545: case CCL | STAR: ! 546: case NCCL | STAR: ! 547: curlp = lp; ! 548: while (*lp++ && cclass (ep, lp[-1], ep[-1] == (CCL | STAR))); ! 549: ep += BMAPSIZ; ! 550: goto star; ! 551: ! 552: star: ! 553: do { ! 554: lp--; ! 555: if (advance (compex, lp, ep)) ! 556: return TRUE; ! 557: } while (lp > curlp); ! 558: return FALSE; ! 559: ! 560: default: ! 561: fputs("Badly compiled pattern\n",stdout) FLUSH; ! 562: err = TRUE; ! 563: return -1; ! 564: } ! 565: if (*ep == CEND || *ep == CDOL) { ! 566: return TRUE; ! 567: } ! 568: return FALSE; ! 569: } ! 570: ! 571: bool ! 572: backref (compex, i, lp) ! 573: register COMPEX *compex; ! 574: register int i; ! 575: register char *lp; ! 576: { ! 577: register char *bp; ! 578: ! 579: bp = compex->braslist[i]; ! 580: while (*lp && *bp == *lp) { ! 581: bp++; ! 582: lp++; ! 583: if (bp >= compex->braelist[i]) ! 584: return TRUE; ! 585: } ! 586: return FALSE; ! 587: } ! 588: ! 589: bool ! 590: cclass (set, c, af) ! 591: register char *set; ! 592: register int c; ! 593: { ! 594: c &= 0177; ! 595: #if BITSPERBYTE == 8 ! 596: if (set[c >> 3] & 1 << (c & 7)) ! 597: #else ! 598: if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE)) ! 599: #endif ! 600: return af; ! 601: return !af; ! 602: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.