|
|
1.1 ! root 1: #ifndef lint ! 2: static char sccsid[] = "@(#)egrep.c 5.12 (Berkeley) 5/11/89"; ! 3: #endif not lint ! 4: ! 5: /* ! 6: Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0 ! 7: table as in original paper (CACM, October, 1977). No delta1 or delta2. ! 8: According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of ! 9: minimal practical value. However, to improve for worst case input, ! 10: integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J. ! 11: Comput., Feb. 1986) deserves consideration. ! 12: ! 13: Method: extract longest metacharacter-free string from expression. ! 14: this is done using a side-effect from henry spencer's regcomp(). ! 15: use boyer-moore to match such, then pass submatching lines ! 16: to either regexp() or standard 'egrep', depending on certain ! 17: criteria within execstrategy() below. [this tradeoff is due ! 18: to the general slowness of the regexp() nondeterministic ! 19: machine on complex expressions, as well as the startup time ! 20: of standard 'egrep' on short files.] alternatively, one may ! 21: change the vendor-supplied 'egrep' automaton to include ! 22: boyer-moore directly. see accompanying writeup for discussion ! 23: of kanji expression treatment. ! 24: ! 25: late addition: apply trickbag for fast match of simple ! 26: alternations (sublinear, in common low-cardinality cases). ! 27: trap fgrep into this lair. ! 28: ! 29: gnu additions: -f, newline as |, \< and \> [in regexec()], more ! 30: comments. inspire better dfa exec() strategy. ! 31: serious testing and help with special cases. ! 32: ! 33: Algorithm amalgam summary: ! 34: ! 35: dfa e?grep (aho/thompson) ! 36: ndfa regexp() (spencer/aho) ! 37: bmg (boyer/moore/gosper) ! 38: "superimposed" bmg (jaw) ! 39: fgrep (aho/corrasick) ! 40: ! 41: sorry, but the knuth/morris/pratt machine, horspool's ! 42: "frequentist" code, and the rabin/karp matcher, however cute, ! 43: just don't cut it for this production. ! 44: ! 45: James A. Woods Copyright (c) 1986 ! 46: NASA Ames Research Center ! 47: */ ! 48: #include <sys/types.h> ! 49: #include <sys/stat.h> ! 50: #include <sys/file.h> ! 51: #include <regexp.h> /* must be henry spencer's version */ ! 52: #include <stdio.h> ! 53: #include <ctype.h> ! 54: #include "pathnames.h" ! 55: ! 56: #define MIN(A, B) ((A) > (B) ? (B) : (A)) ! 57: ! 58: #ifdef SLOWSYS ! 59: #define read xread ! 60: #endif ! 61: ! 62: #define BUFSIZE 8192 /* make higher for cray */ ! 63: #define PATSIZE 6000 ! 64: #define LARGE BUFSIZE + PATSIZE ! 65: ! 66: #define NALT 7 /* tied to scanf() size in alternate() */ ! 67: #define NMUSH 6 /* loosely relates to expected alt length */ ! 68: ! 69: #define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */ ! 70: #define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input ! 71: * was processed by regexp(), exec std egrep. */ ! 72: #define NL '\n' ! 73: #define EOS '\0' ! 74: #define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */ ! 75: #define META "\n^$.[]()?+*|\\" /* egrep meta-characters */ ! 76: #define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */ ! 77: #define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */ ! 78: ! 79: extern char *optarg; ! 80: extern int optind; ! 81: char *progname; ! 82: ! 83: int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */ ! 84: int sflag, hflag; /* v7, v8, bsd */ ! 85: ! 86: int firstflag; /* Stop at first match */ ! 87: int grepflag; /* Called as "grep" */ ! 88: int fgrepflag; /* Called as "fgrep" */ ! 89: int altflag; /* Simple alternation in pattern */ ! 90: int boyonly; /* No regexp needed -- all simple */ ! 91: int flushflag; ! 92: int grepold, egrepold, fgrepold; ! 93: ! 94: int nalt; /* Number of alternatives */ ! 95: int nsuccess; /* 1 for match, 2 for error */ ! 96: int altmin; /* Minimum length of all the alternate ! 97: * strings */ ! 98: int firstfile; /* argv index of first file argument */ ! 99: int patind; /* argv index of pattern */ ! 100: long nmatch; /* Number of matches in this file */ ! 101: long incount, counted; /* Amount of input consumed */ ! 102: long rxcount; /* Bytes of input processed by regexec() */ ! 103: int boyfound; /* accumulated partial matches (tripped by ! 104: * FIRSTFEW) */ ! 105: int prevmatch; /* next three lines aid fast -n */ ! 106: long nline, prevnline; ! 107: char *prevloc; ! 108: ! 109: regexp *rspencer; ! 110: char *pattern; ! 111: char *patboy; /* Pattern for simple Boyer-Moore */ ! 112: char *patfile; /* Filename containing pattern(s) */ ! 113: ! 114: int delta0[256]; /* Boyer-Moore algorithm core */ ! 115: char cmap[256]; /* Usually 0-255, but if -i, maps upper to ! 116: * lower case */ ! 117: char str[BUFSIZE + 2]; ! 118: int nleftover; ! 119: char linetemp[BUFSIZE]; ! 120: char *altpat[NALT]; /* alternation component storage */ ! 121: int altlen[NALT]; ! 122: short altset[NMUSH + 1][256]; ! 123: char preamble[200]; /* match prefix (filename, line no.) */ ! 124: ! 125: int fd; ! 126: char * ! 127: strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc(); ! 128: char * ! 129: grepxlat(), *fold(), *pfile(), *alternate(), *isolate(); ! 130: char *gotamatch(), *kanji(), *linesave(), *submatch(); ! 131: char **args; ! 132: ! 133: main(argc, argv) ! 134: int argc; ! 135: char *argv[]; ! 136: { ! 137: int c, oflag; ! 138: int errflag = 0; ! 139: ! 140: args = argv; ! 141: ! 142: if ((progname = strrchr(argv[0], '/')) != 0) ! 143: progname++; ! 144: else ! 145: progname = argv[0]; ! 146: if (strcmp(progname, "grep") == 0) ! 147: grepflag++; ! 148: else if (strcmp(progname, "fgrep") == 0) ! 149: fgrepflag++; ! 150: ! 151: oflag = 0; ! 152: while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) { ! 153: switch (c) { ! 154: ! 155: case 'f': ! 156: fflag++; ! 157: patfile = optarg; ! 158: continue; ! 159: case 'b': ! 160: case 'v': ! 161: egrepold++; /* boyer-moore of little help here */ ! 162: continue; ! 163: case 'c': ! 164: cflag++; ! 165: continue; ! 166: case 'e': ! 167: eflag++; ! 168: pattern = optarg; ! 169: continue; ! 170: case 'h': ! 171: hflag++; ! 172: continue; ! 173: case 'o': ! 174: oflag++; ! 175: continue; ! 176: case '1': /* Stop at very first match */ ! 177: firstflag++; /* spead freaks only */ ! 178: continue; ! 179: case 'i': ! 180: iflag++; ! 181: continue; ! 182: case 'l': ! 183: lflag++; ! 184: continue; ! 185: case 'n': ! 186: nflag++; ! 187: continue; ! 188: case 's': ! 189: sflag++; ! 190: continue; ! 191: case 'w': ! 192: case 'y': ! 193: if (!grepflag) ! 194: errflag++; ! 195: grepold++; ! 196: continue; ! 197: case 'x': /* needs more work, like -b above */ ! 198: if (!fgrepflag) ! 199: errflag++; ! 200: fgrepold++; ! 201: continue; ! 202: case '?': ! 203: errflag++; ! 204: } ! 205: } ! 206: if (errflag || ((argc <= optind) && !fflag && !eflag)) { ! 207: if (grepflag) ! 208: oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]"); ! 209: else if (fgrepflag) ! 210: oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]"); ! 211: else /* encourage SVID options, though we provide ! 212: * others */ ! 213: oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]"); ! 214: } ! 215: if (fflag) ! 216: pattern = pfile(patfile); ! 217: else if (!eflag) { ! 218: patind = optind; ! 219: pattern = argv[optind++]; ! 220: } ! 221: ! 222: if (!oflag && (argc - optind) <= 1) /* Filename invisible given < 2 files */ ! 223: hflag++; ! 224: if (pattern[0] == EOS) ! 225: kernighan(argv); /* same as it ever was */ ! 226: /* ! 227: * 'grep/egrep' merger -- "old" grep is called to handle: tagged ! 228: * exprs \( \), word matches \< and \>, -w and -y options, char ! 229: * classes with '-' at end (egrep bug?), and patterns beginning with ! 230: * an asterisk (don't ask why). otherwise, characters meaningful to ! 231: * 'egrep' but not to 'grep' are escaped; the entire expr is then ! 232: * passed to 'egrep'. ! 233: */ ! 234: if (grepflag && !grepold) { ! 235: if (strindex(pattern, "\\(") >= 0 || ! 236: strindex(pattern, "\\<") >= 0 || ! 237: strindex(pattern, "\\>") >= 0 || ! 238: strindex(pattern, "-]") >= 0 || ! 239: pattern[0] == '*') /* grep bug */ ! 240: grepold++; ! 241: else ! 242: pattern = grepxlat(pattern); ! 243: } ! 244: if (grepold || egrepold || fgrepold) ! 245: kernighan(argv); ! 246: ! 247: if (iflag) ! 248: strcpy(pattern, fold(pattern)); ! 249: /* ! 250: * If the pattern is a plain string, just run boyer-moore. If it ! 251: * consists of meta-free alternatives, run "superimposed" bmg. ! 252: * Otherwise, find best string, and compile pattern for regexec(). ! 253: */ ! 254: if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */ ! 255: boyonly++; ! 256: patboy = pattern; ! 257: } else { ! 258: if ((patboy = alternate(pattern)) != NULL) ! 259: boyonly++; ! 260: else { ! 261: if ((patboy = isolate(pattern)) == NULL) ! 262: kernighan(argv); /* expr too involved */ ! 263: #ifndef NOKANJI ! 264: for (c = 0; pattern[c] != EOS; c++) ! 265: if (pattern[c] & NONASCII) /* kanji + meta */ ! 266: kernighan(argv); ! 267: #endif ! 268: if ((rspencer = regcomp(pattern)) == NULL) ! 269: oops("regcomp failure"); ! 270: } ! 271: } ! 272: gosper(patboy); /* "pre-conditioning is wonderful" ! 273: * -- v. strassen */ ! 274: ! 275: if ((firstfile = optind) >= argc) { ! 276: /* Grep standard input */ ! 277: if (lflag) /* We don't know its name! */ ! 278: exit(1); ! 279: egsecute((char *) NULL); ! 280: } else { ! 281: while (optind < argc) { ! 282: egsecute(argv[optind]); ! 283: optind++; ! 284: if (firstflag && (nsuccess == 1)) ! 285: break; ! 286: } ! 287: } ! 288: exit((nsuccess == 2) ? 2 : (nsuccess == 0)); ! 289: } ! 290: ! 291: char * ! 292: pfile(pfname) /* absorb expression from file */ ! 293: char *pfname; ! 294: { ! 295: int fd; ! 296: struct stat patstat; ! 297: static char *pat; ! 298: ! 299: if ((fd = open(pfname, O_RDONLY, 0)) < 0) ! 300: oops("can't read pattern file"); ! 301: if (fstat(fd, &patstat) != 0) ! 302: oops("can't stat pattern file"); ! 303: if (patstat.st_size > PATSIZE) { ! 304: if (fgrepflag) { /* defer to unix version */ ! 305: fgrepold++; ! 306: return "dummy"; ! 307: } else ! 308: oops("pattern file too big"); ! 309: } ! 310: if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL) ! 311: oops("out of memory to read pattern file"); ! 312: if (patstat.st_size != read(fd, pat, (int)patstat.st_size)) ! 313: oops("error reading pattern file"); ! 314: (void) close(fd); ! 315: ! 316: pat[patstat.st_size] = EOS; ! 317: if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */ ! 318: pat[patstat.st_size - 1] = EOS; ! 319: ! 320: if (nlcount(pat, &pat[patstat.st_size]) > NALT) { ! 321: if (fgrepflag) ! 322: fgrepold++; /* "what's it all about, alfie?" */ ! 323: else ! 324: egrepold++; ! 325: } ! 326: return (pat); ! 327: } ! 328: ! 329: egsecute(file) ! 330: char *file; ! 331: { ! 332: if (file == NULL) ! 333: fd = 0; ! 334: else if ((fd = open(file, O_RDONLY, 0)) <= 0) { ! 335: fprintf(stderr, "%s: can't open %s\n", progname, file); ! 336: nsuccess = 2; ! 337: return; ! 338: } ! 339: chimaera(file, patboy); ! 340: ! 341: if (!boyonly && !flushflag && file != NULL) ! 342: flushmatches(); ! 343: if (file != NULL) ! 344: close(fd); ! 345: } ! 346: ! 347: chimaera(file, pat) /* "reach out and boyer-moore search someone" */ ! 348: char *file, *pat; /* -- soon-to-be-popular bumper sticker */ ! 349: { ! 350: register char *k, *strend, *s; ! 351: register int j, count; ! 352: register int *deltazero = delta0; ! 353: int patlen = altmin; ! 354: char *t; ! 355: ! 356: nleftover = boyfound = flushflag = 0; ! 357: nline = 1L; ! 358: prevmatch = 0; ! 359: nmatch = counted = rxcount = 0L; ! 360: ! 361: while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) { ! 362: ! 363: counted += count; ! 364: strend = linesave(str, count); ! 365: ! 366: for (k = str + patlen - 1; k < strend;) { ! 367: /* ! 368: * for a large class of patterns, upwards of 80% of ! 369: * match time is spent on the next line. we beat ! 370: * existing microcode (vax 'matchc') this way. ! 371: */ ! 372: while ((k += deltazero[*(unsigned char *) k]) < strend); ! 373: if (k < (str + LARGE)) ! 374: break; ! 375: k -= LARGE; ! 376: ! 377: if (altflag) { ! 378: /* ! 379: * Parallel Boyer-Moore. Check whether each ! 380: * of the previous <altmin> chars COULD be ! 381: * from one of the alternative strings. ! 382: */ ! 383: s = k - 1; ! 384: j = altmin; ! 385: while (altset[--j][(unsigned char) ! 386: cmap[*(unsigned char *) s--]]); ! 387: /* ! 388: * quick test fails. in this life, compare ! 389: * 'em all. but, a "reverse trie" would ! 390: * attenuate worst case (linear w/delta2?). ! 391: */ ! 392: if (--j < 0) { ! 393: count = nalt - 1; ! 394: do { ! 395: s = k; ! 396: j = altlen[count]; ! 397: t = altpat[count]; ! 398: ! 399: while ! 400: (cmap[*(unsigned char *) s--] ! 401: == t[--j]); ! 402: if (j < 0) ! 403: break; ! 404: } ! 405: while (count--); ! 406: } ! 407: } else { ! 408: /* One string -- check it */ ! 409: j = patlen - 1; ! 410: s = k - 1; ! 411: while (cmap[*(unsigned char *) s--] == pat[--j]); ! 412: } ! 413: /* ! 414: * delta-less shortcut for literati. short shrift for ! 415: * genetic engineers? ! 416: */ ! 417: if (j >= 0) { ! 418: k++; /* no match; restart next char */ ! 419: continue; ! 420: } ! 421: k = submatch(file, pat, str, strend, k, count); ! 422: if (k == NULL) ! 423: return; ! 424: } ! 425: if (nflag) { ! 426: if (prevmatch) ! 427: nline = prevnline + nlcount(prevloc, k); ! 428: else ! 429: nline = nline + nlcount(str, k); ! 430: prevmatch = 0; ! 431: } ! 432: strncpy(str, linetemp, nleftover); ! 433: } ! 434: if (cflag) { ! 435: /* Bug from old grep: -c overrides -h. We fix the bug. */ ! 436: if (!hflag) ! 437: printf("%s:", file); ! 438: printf("%ld\n", nmatch); ! 439: } ! 440: } ! 441: ! 442: char * ! 443: linesave(str, count) /* accumulate partial line at end of buffer */ ! 444: char str[]; ! 445: register int count; ! 446: { ! 447: register int j; ! 448: ! 449: count += nleftover; ! 450: if (count != BUFSIZE && fd != 0) ! 451: str[count++] = NL; /* insurance for broken last line */ ! 452: str[count] = EOS; ! 453: for (j = count - 1; str[j] != NL && j >= 0;) ! 454: j--; ! 455: /* ! 456: * break up these lines: long line (> BUFSIZE), last line of file, or ! 457: * short return from read(), as from tee(1) input ! 458: */ ! 459: if (j < 0 && (count == (BUFSIZE - nleftover))) { ! 460: str[count++] = NL; ! 461: str[count] = EOS; ! 462: linetemp[0] = EOS; ! 463: nleftover = 0; ! 464: return (str + count); ! 465: } else { ! 466: nleftover = count - j - 1; ! 467: strncpy(linetemp, str + j + 1, nleftover); ! 468: return (str + j); ! 469: } ! 470: } ! 471: ! 472: /* ! 473: * Process partial match. First check for mis-aligned Kanji, then match line ! 474: * against full compiled r.e. if statistics do not warrant handing off to ! 475: * standard egrep. ! 476: */ ! 477: char * ! 478: submatch(file, pat, str, strend, k, altindex) ! 479: char file[], pat[], str[]; ! 480: register char *strend, *k; ! 481: int altindex; ! 482: { ! 483: register char *s; ! 484: char *t, c; ! 485: ! 486: t = k; ! 487: s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1); ! 488: #ifndef NOKANJI ! 489: c = ((altflag) ? altpat[altindex][0] : pat[0]); ! 490: if (c & NONASCII) ! 491: if ((s = kanji(str, s, k)) == NULL) ! 492: return (++k); /* reject false kanji */ ! 493: #endif ! 494: do; ! 495: while (*s != NL && --s >= str); ! 496: k = s + 1; /* now at line start */ ! 497: ! 498: if (boyonly) ! 499: return (gotamatch(file, k)); ! 500: ! 501: incount = counted - (strend - k); ! 502: if (boyfound++ == FIRSTFEW) ! 503: execstrategy(file); ! 504: ! 505: s = t; ! 506: do ! 507: rxcount++; ! 508: while (*s++ != NL); ! 509: *--s = EOS; ! 510: /* ! 511: * "quick henry -- the flit" (after theodor geisel) ! 512: */ ! 513: if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) { ! 514: *s = NL; ! 515: if (gotamatch(file, k) == NULL) ! 516: return (NULL); ! 517: } ! 518: *s = NL; ! 519: return (s + 1); ! 520: } ! 521: ! 522: #ifndef NOKANJI ! 523: /* ! 524: * EUC code disambiguation -- scan backwards to first 7-bit code, while ! 525: * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern. ! 526: * SS2/3 checks are for intermixed Japanase Katakana or Kanji2. ! 527: */ ! 528: char * ! 529: kanji(str, s, k) ! 530: register char *str, *s, *k; ! 531: { ! 532: register int j = 0; ! 533: ! 534: for (s--; s >= str; s--) { ! 535: if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0) ! 536: break; ! 537: j++; ! 538: } ! 539: #ifndef CHINESE ! 540: if (*s == SS2) ! 541: j -= 1; ! 542: #endif CHINESE ! 543: return ((j & 01) ? NULL : k); ! 544: } ! 545: #endif ! 546: ! 547: /* ! 548: * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c] ! 549: */ ! 550: gosper(pattern) ! 551: char *pattern; /* ... HAKMEM lives ... */ ! 552: { ! 553: register int i, j; ! 554: unsigned char c; ! 555: ! 556: /* Make one-string case look like simple alternatives case */ ! 557: if (!altflag) { ! 558: nalt = 1; ! 559: altmin = altlen[0] = strlen(pattern); ! 560: altpat[0] = pattern; ! 561: } ! 562: /* For chars that aren't in any string, skip by string length. */ ! 563: for (j = 0; j < 256; j++) { ! 564: delta0[j] = altmin; ! 565: cmap[j] = j; /* Sneak in initialization of cmap */ ! 566: } ! 567: ! 568: /* For chars in a string, skip distance from char to end of string. */ ! 569: /* (If char appears more than once, skip minimum distance.) */ ! 570: for (i = 0; i < nalt; i++) ! 571: for (j = 0; j < altlen[i] - 1; j++) { ! 572: c = altpat[i][j]; ! 573: delta0[c] = MIN(delta0[c], altlen[i] - j - 1); ! 574: if (iflag && islower((int) c)) ! 575: delta0[toupper((int) c)] = delta0[c]; ! 576: } ! 577: ! 578: /* For last char of each string, fall out of search loop. */ ! 579: for (i = 0; i < nalt; i++) { ! 580: c = altpat[i][altlen[i] - 1]; ! 581: delta0[c] = LARGE; ! 582: if (iflag && islower((int) c)) ! 583: delta0[toupper((int) c)] = LARGE; ! 584: } ! 585: if (iflag) ! 586: for (j = 'A'; j <= 'Z'; j++) ! 587: cmap[j] = tolower((int) j); ! 588: } ! 589: ! 590: /* ! 591: * Print, count, or stop on full match. Result is either the location for ! 592: * continued search, or NULL to stop. ! 593: */ ! 594: char * ! 595: gotamatch(file, s) ! 596: register char *file, *s; ! 597: { ! 598: char *savematch(); ! 599: int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */ ! 600: ! 601: nmatch++; ! 602: nsuccess = 1; ! 603: if (!boyonly && boyfound <= FIRSTFEW && file != NULL) ! 604: squirrel = 1; ! 605: ! 606: if (sflag) ! 607: return (NULL); /* -s usurps all flags (unlike some versions) */ ! 608: if (cflag) { /* -c overrides -l, we guess */ ! 609: do; ! 610: while (*s++ != NL); ! 611: } else if (lflag) { ! 612: puts(file); ! 613: return (NULL); ! 614: } else { ! 615: if (!hflag) ! 616: if (!squirrel) ! 617: printf("%s:", file); ! 618: else ! 619: (void)sprintf(preamble, "%s:", file); ! 620: if (nflag) { ! 621: if (prevmatch) ! 622: prevnline = prevnline + nlcount(prevloc, s); ! 623: else ! 624: prevnline = nline + nlcount(str, s); ! 625: prevmatch = 1; ! 626: ! 627: if (!squirrel) ! 628: printf("%ld:", prevnline); ! 629: else ! 630: (void)sprintf(preamble + strlen(preamble), ! 631: "%ld:", prevnline); ! 632: } ! 633: if (!squirrel) { ! 634: do ! 635: putchar(*s); ! 636: while (*s++ != NL); ! 637: } else ! 638: s = savematch(s); ! 639: ! 640: if (nflag) ! 641: prevloc = s - 1; ! 642: } ! 643: return ((firstflag && !cflag) ? NULL : s); ! 644: } ! 645: ! 646: char * ! 647: fold(line) ! 648: char *line; ! 649: { ! 650: static char fline[BUFSIZE]; ! 651: register char *s, *t = fline; ! 652: ! 653: for (s = line; *s != EOS; s++) ! 654: *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s); ! 655: *t = EOS; ! 656: return (fline); ! 657: } ! 658: ! 659: strindex(s, t) /* the easy way, as in K&P, p. 192 */ ! 660: char *s, *t; ! 661: { ! 662: int i, n; ! 663: ! 664: n = strlen(t); ! 665: for (i = 0; s[i] != '\0'; i++) ! 666: if (strncmp(s + i, t, n) == 0) ! 667: return (i); ! 668: return (-1); ! 669: } ! 670: ! 671: char * ! 672: grepxlat(pattern) /* grep pattern meta conversion */ ! 673: char *pattern; ! 674: { ! 675: register char *p, *s; ! 676: static char newpat[BUFSIZE]; ! 677: ! 678: for (s = newpat, p = pattern; *p != EOS;) { ! 679: if (*p == '\\') { /* skip escapes ... */ ! 680: *s++ = *p++; ! 681: if (*p) ! 682: *s++ = *p++; ! 683: } else if (*p == '[') { /* ... and char classes */ ! 684: while (*p != EOS && *p != ']') ! 685: *s++ = *p++; ! 686: } else if (strchr("+?|()", *p) != NULL) { ! 687: *s++ = '\\'; /* insert protection */ ! 688: *s++ = *p++; ! 689: } else ! 690: *s++ = *p++; ! 691: } ! 692: *s = EOS; ! 693: grepflag = ((patind) ? 0 : 1); ! 694: return (newpat); ! 695: } ! 696: ! 697: /* ! 698: * Test for simple alternation. Result is NULL if it's not so simple, or is ! 699: * a pointer to the first string if it is. Warning: sscanf size is a ! 700: * fixpoint, beyond which the speedup linearity starts to break down. In the ! 701: * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing ! 702: * altpat[] to arbitrary size is not useful. ! 703: */ ! 704: char * ! 705: alternate(regexpr) ! 706: char *regexpr; ! 707: { ! 708: register int i, j; ! 709: register char *start, *stop; ! 710: unsigned char c; ! 711: ! 712: if (fgrepflag && strchr(regexpr, '|')) ! 713: return (NULL); ! 714: ! 715: /* ! 716: * break pattern up into altpat array; delimit on newline, bar, ! 717: * or EOS. We know we won't overflow, we've already checked the ! 718: * number of patterns we're going to find against NALT. ! 719: * Also, set length of pattern and find minimum pattern length. ! 720: */ ! 721: nalt = 0; ! 722: altmin = NMUSH; ! 723: for (start = stop = regexpr;; ++stop) ! 724: if (!*stop || *stop == '|' || *stop == NL) { ! 725: altlen[nalt] = j = stop - start; ! 726: if (j < altmin) ! 727: altmin = j; ! 728: if (!(altpat[nalt] = malloc((u_int)(j + 1)))) ! 729: oops("out of memory"); ! 730: bcopy(start, altpat[nalt], j); ! 731: altpat[nalt][j] = EOS; ! 732: ++nalt; ! 733: if (!*stop) ! 734: break; ! 735: if (nalt == NALT) ! 736: return(NULL); ! 737: if (*stop == NL) ! 738: *stop = '|'; ! 739: start = stop + 1; ! 740: } ! 741: if (!fgrepflag) { ! 742: if (strchr(regexpr, '|') == NULL || regexpr[0] == '|') ! 743: return (NULL); ! 744: if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL ! 745: || strindex(regexpr, "||") >= 0) ! 746: return (NULL); ! 747: } ! 748: ! 749: if (nalt > 1) { /* build superimposed "pre-match" sets per ! 750: * char */ ! 751: altflag++; ! 752: for (j = 0; j < nalt; j++) ! 753: for (i = 0; i < altmin; i++) { ! 754: c = altpat[j][altlen[j] - altmin + i]; ! 755: altset[i + 1][c] = 1; /* offset for sentinel */ ! 756: } ! 757: } ! 758: return (altpat[0]); ! 759: } ! 760: ! 761: /* ! 762: * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to ! 763: * determine whether to use dfa-based egrep: We do FIRSTFEW matches with ! 764: * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT ! 765: * of the input, the r.e. is likely to be underspecified, so do old *grep, ! 766: * which is faster on complex patterns than regexp(). At FIRSTFEW, ! 767: * dump the saved matches collected by savematch(). They are saved ! 768: * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic, ! 769: * since it's hard to rewind. ! 770: */ ! 771: ! 772: execstrategy(file) ! 773: char *file; ! 774: { ! 775: int pctmatch; ! 776: ! 777: pctmatch = (100 * rxcount) / incount; ! 778: if (pctmatch > PUNTPERCENT && file != NULL) ! 779: kernighan(args); ! 780: if (file != NULL) ! 781: flushmatches(); ! 782: } ! 783: ! 784: nlcount(bstart, bstop) /* flail interval to totalize newlines. */ ! 785: char *bstart, *bstop; ! 786: { ! 787: register char *s = bstart; ! 788: register char *t = bstop; ! 789: register int count = 0; ! 790: ! 791: do { /* loop unroll for older architectures */ ! 792: if (*t == NL) /* ... ask ames!jaw for sample code */ ! 793: count++; ! 794: } while (t-- > s); ! 795: ! 796: return (count); ! 797: } ! 798: ! 799: char * ! 800: isolate(regexpr) /* isolate longest metacharacter-free string */ ! 801: char *regexpr; ! 802: { ! 803: char *dummyexpr; ! 804: ! 805: /* ! 806: * We add (.)* because Henry's regcomp only figures regmust if it ! 807: * sees a leading * pattern. Foo! ! 808: */ ! 809: dummyexpr = malloc((unsigned) strlen(regexpr) + 5); ! 810: (void)sprintf(dummyexpr, "(.)*%s", regexpr); ! 811: if ((rspencer = regcomp(dummyexpr)) == NULL) ! 812: kernighan(args); ! 813: return (rspencer->regmust); ! 814: } ! 815: ! 816: char *matches[FIRSTFEW]; ! 817: static int mcount = 0; ! 818: ! 819: char * ! 820: savematch(s) /* horde matches during statistics gathering */ ! 821: register char *s; ! 822: { ! 823: char *p; ! 824: char *start = s; ! 825: int msize = 0; ! 826: int psize = strlen(preamble); ! 827: ! 828: while (*s++ != NL) ! 829: msize++; ! 830: *--s = EOS; ! 831: ! 832: p = malloc((unsigned) msize + 1 + psize); ! 833: strcpy(p, preamble); ! 834: strcpy(p + psize, start); ! 835: matches[mcount++] = p; ! 836: ! 837: preamble[0] = 0; ! 838: *s = NL; ! 839: return (s); ! 840: } ! 841: ! 842: flushmatches() ! 843: { ! 844: int n; ! 845: ! 846: flushflag = 1; ! 847: for (n = 0; n < mcount; n++) ! 848: printf("%s\n", matches[n]); ! 849: mcount = 0; ! 850: } ! 851: ! 852: oops(message) ! 853: char *message; ! 854: { ! 855: fprintf(stderr, "%s: %s\n", progname, message); ! 856: exit(2); ! 857: } ! 858: ! 859: kernighan(args) /* "let others do the hard part ..." */ ! 860: char *args[]; ! 861: { ! 862: /* ! 863: * We may have already run grep on some of the files; remove them ! 864: * from the arg list we pass on. Note that we can't delete them ! 865: * totally because the number of file names affects the output ! 866: * (automatic -h). ! 867: */ ! 868: /* better would be fork/exec per punted file -- jaw */ ! 869: ! 870: while (firstfile && optind > firstfile) ! 871: args[firstfile++] = _PATH_DEVNULL; ! 872: if (patind) ! 873: args[patind] = pattern; ! 874: (void) fflush(stdout); ! 875: ! 876: if (grepflag) ! 877: execvp(_PATH_GREPSTD, args), oops("can't exec old 'grep'"); ! 878: else if (fgrepflag) ! 879: execvp(_PATH_FGREPSTD, args), oops("can't exec old 'fgrep'"); ! 880: else ! 881: execvp(_PATH_EGREPSTD, args), oops("can't exec old 'egrep'"); ! 882: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.