|
|
1.1 ! root 1: #ifndef lint ! 2: static char sccsid[] = "@(#)egrep.c 5.10 (Berkeley) 6/30/88"; ! 3: #endif not lint ! 4: ! 5: /* ! 6: Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0 ! 7: table as in original paper (CACM, October, 1977). No delta1 or delta2. ! 8: According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of ! 9: minimal practical value. However, to improve for worst case input, ! 10: integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J. ! 11: Comput., Feb. 1986) deserves consideration. ! 12: ! 13: Method: extract longest metacharacter-free string from expression. ! 14: this is done using a side-effect from henry spencer's regcomp(). ! 15: use boyer-moore to match such, then pass submatching lines ! 16: to either regexp() or standard 'egrep', depending on certain ! 17: criteria within execstrategy() below. [this tradeoff is due ! 18: to the general slowness of the regexp() nondeterministic ! 19: machine on complex expressions, as well as the startup time ! 20: of standard 'egrep' on short files.] alternatively, one may ! 21: change the vendor-supplied 'egrep' automaton to include ! 22: boyer-moore directly. see accompanying writeup for discussion ! 23: of kanji expression treatment. ! 24: ! 25: late addition: apply trickbag for fast match of simple ! 26: alternations (sublinear, in common low-cardinality cases). ! 27: trap fgrep into this lair. ! 28: ! 29: gnu additions: -f, newline as |, \< and \> [in regexec()], more ! 30: comments. inspire better dfa exec() strategy. ! 31: serious testing and help with special cases. ! 32: ! 33: Algorithm amalgam summary: ! 34: ! 35: dfa e?grep (aho/thompson) ! 36: ndfa regexp() (spencer/aho) ! 37: bmg (boyer/moore/gosper) ! 38: "superimposed" bmg (jaw) ! 39: fgrep (aho/corrasick) ! 40: ! 41: sorry, but the knuth/morris/pratt machine, horspool's ! 42: "frequentist" code, and the rabin/karp matcher, however cute, ! 43: just don't cut it for this production. ! 44: ! 45: James A. Woods Copyright (c) 1986 ! 46: NASA Ames Research Center ! 47: */ ! 48: #include <stdio.h> ! 49: #include <ctype.h> ! 50: #include <sys/types.h> ! 51: #include <sys/stat.h> ! 52: #include <sys/file.h> ! 53: #include <regexp.h> /* must be henry spencer's version */ ! 54: ! 55: #define MIN(A, B) ((A) > (B) ? (B) : (A)) ! 56: ! 57: #ifdef SLOWSYS ! 58: #define read xread ! 59: #endif ! 60: /* ! 61: * define existing [ef]?grep program locations below for use by execvp(). ! 62: * [execlp() would be used were it not for the possibility of ! 63: * installation-dependent recursion.] ! 64: */ ! 65: #ifndef EGREPSTD ! 66: #define EGREPSTD "/usr/bin/egrep" ! 67: #endif ! 68: #ifndef GREPSTD ! 69: #define GREPSTD "/bin/grep" ! 70: #endif ! 71: #ifndef FGREPSTD ! 72: #define FGREPSTD "/usr/bin/fgrep" ! 73: #endif ! 74: ! 75: #define BUFSIZE 8192 /* make higher for cray */ ! 76: #define PATSIZE 6000 ! 77: #define LARGE BUFSIZE + PATSIZE ! 78: ! 79: #define NALT 7 /* tied to scanf() size in alternate() */ ! 80: #define NMUSH 6 /* loosely relates to expected alt length */ ! 81: ! 82: #define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */ ! 83: #define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input ! 84: * was processed by regexp(), exec std egrep. */ ! 85: #define NL '\n' ! 86: #define EOS '\0' ! 87: #define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */ ! 88: #define META "\n^$.[]()?+*|\\" /* egrep meta-characters */ ! 89: #define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */ ! 90: #define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */ ! 91: ! 92: extern char *optarg; ! 93: extern int optind; ! 94: char *progname; ! 95: ! 96: int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */ ! 97: int sflag, hflag; /* v7, v8, bsd */ ! 98: ! 99: int firstflag; /* Stop at first match */ ! 100: int grepflag; /* Called as "grep" */ ! 101: int fgrepflag; /* Called as "fgrep" */ ! 102: int altflag; /* Simple alternation in pattern */ ! 103: int boyonly; /* No regexp needed -- all simple */ ! 104: int flushflag; ! 105: int grepold, egrepold, fgrepold; ! 106: ! 107: int nalt; /* Number of alternatives */ ! 108: int nsuccess; /* 1 for match, 2 for error */ ! 109: int altmin; /* Minimum length of all the alternate ! 110: * strings */ ! 111: int firstfile; /* argv index of first file argument */ ! 112: int patind; /* argv index of pattern */ ! 113: long nmatch; /* Number of matches in this file */ ! 114: long incount, counted; /* Amount of input consumed */ ! 115: long rxcount; /* Bytes of input processed by regexec() */ ! 116: int boyfound; /* accumulated partial matches (tripped by ! 117: * FIRSTFEW) */ ! 118: int prevmatch; /* next three lines aid fast -n */ ! 119: long nline, prevnline; ! 120: char *prevloc; ! 121: ! 122: regexp *rspencer; ! 123: char *pattern; ! 124: char *patboy; /* Pattern for simple Boyer-Moore */ ! 125: char *patfile; /* Filename containing pattern(s) */ ! 126: ! 127: int delta0[256]; /* Boyer-Moore algorithm core */ ! 128: char cmap[256]; /* Usually 0-255, but if -i, maps upper to ! 129: * lower case */ ! 130: char str[BUFSIZE + 2]; ! 131: int nleftover; ! 132: char linetemp[BUFSIZE]; ! 133: char *altpat[NALT]; /* alternation component storage */ ! 134: int altlen[NALT]; ! 135: short altset[NMUSH + 1][256]; ! 136: char preamble[200]; /* match prefix (filename, line no.) */ ! 137: ! 138: int fd; ! 139: char * ! 140: strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc(); ! 141: char * ! 142: grepxlat(), *fold(), *pfile(), *alternate(), *isolate(); ! 143: char **args; ! 144: ! 145: main(argc, argv) ! 146: int argc; ! 147: char *argv[]; ! 148: { ! 149: int c, oflag; ! 150: int errflag = 0; ! 151: ! 152: args = argv; ! 153: ! 154: if ((progname = strrchr(argv[0], '/')) != 0) ! 155: progname++; ! 156: else ! 157: progname = argv[0]; ! 158: if (strcmp(progname, "grep") == 0) ! 159: grepflag++; ! 160: else if (strcmp(progname, "fgrep") == 0) ! 161: fgrepflag++; ! 162: ! 163: oflag = 0; ! 164: while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) { ! 165: switch (c) { ! 166: ! 167: case 'f': ! 168: fflag++; ! 169: patfile = optarg; ! 170: continue; ! 171: case 'b': ! 172: case 'v': ! 173: egrepold++; /* boyer-moore of little help here */ ! 174: continue; ! 175: case 'c': ! 176: cflag++; ! 177: continue; ! 178: case 'e': ! 179: eflag++; ! 180: pattern = optarg; ! 181: continue; ! 182: case 'h': ! 183: hflag++; ! 184: continue; ! 185: case 'o': ! 186: oflag++; ! 187: continue; ! 188: case '1': /* Stop at very first match */ ! 189: firstflag++; /* spead freaks only */ ! 190: continue; ! 191: case 'i': ! 192: iflag++; ! 193: continue; ! 194: case 'l': ! 195: lflag++; ! 196: continue; ! 197: case 'n': ! 198: nflag++; ! 199: continue; ! 200: case 's': ! 201: sflag++; ! 202: continue; ! 203: case 'w': ! 204: case 'y': ! 205: if (!grepflag) ! 206: errflag++; ! 207: grepold++; ! 208: continue; ! 209: case 'x': /* needs more work, like -b above */ ! 210: if (!fgrepflag) ! 211: errflag++; ! 212: fgrepold++; ! 213: continue; ! 214: case '?': ! 215: errflag++; ! 216: } ! 217: } ! 218: if (errflag || ((argc <= optind) && !fflag && !eflag)) { ! 219: if (grepflag) ! 220: oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]"); ! 221: else if (fgrepflag) ! 222: oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]"); ! 223: else /* encourage SVID options, though we provide ! 224: * others */ ! 225: oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]"); ! 226: } ! 227: if (fflag) ! 228: pattern = pfile(patfile); ! 229: else if (!eflag) { ! 230: patind = optind; ! 231: pattern = argv[optind++]; ! 232: } ! 233: ! 234: if (!oflag && (argc - optind) <= 1) /* Filename invisible given < 2 files */ ! 235: hflag++; ! 236: if (pattern[0] == EOS) ! 237: kernighan(argv); /* same as it ever was */ ! 238: /* ! 239: * 'grep/egrep' merger -- "old" grep is called to handle: tagged ! 240: * exprs \( \), word matches \< and \>, -w and -y options, char ! 241: * classes with '-' at end (egrep bug?), and patterns beginning with ! 242: * an asterisk (don't ask why). otherwise, characters meaningful to ! 243: * 'egrep' but not to 'grep' are escaped; the entire expr is then ! 244: * passed to 'egrep'. ! 245: */ ! 246: if (grepflag && !grepold) { ! 247: if (strindex(pattern, "\\(") >= 0 || ! 248: strindex(pattern, "\\<") >= 0 || ! 249: strindex(pattern, "\\>") >= 0 || ! 250: strindex(pattern, "-]") >= 0 || ! 251: pattern[0] == '*') /* grep bug */ ! 252: grepold++; ! 253: else ! 254: pattern = grepxlat(pattern); ! 255: } ! 256: if (grepold || egrepold || fgrepold) ! 257: kernighan(argv); ! 258: ! 259: if (iflag) ! 260: strcpy(pattern, fold(pattern)); ! 261: /* ! 262: * If the pattern is a plain string, just run boyer-moore. If it ! 263: * consists of meta-free alternatives, run "superimposed" bmg. ! 264: * Otherwise, find best string, and compile pattern for regexec(). ! 265: */ ! 266: if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */ ! 267: boyonly++; ! 268: patboy = pattern; ! 269: } else { ! 270: if ((patboy = alternate(pattern)) != NULL) ! 271: boyonly++; ! 272: else { ! 273: if ((patboy = isolate(pattern)) == NULL) ! 274: kernighan(argv); /* expr too involved */ ! 275: #ifndef NOKANJI ! 276: for (c = 0; pattern[c] != EOS; c++) ! 277: if (pattern[c] & NONASCII) /* kanji + meta */ ! 278: kernighan(argv); ! 279: #endif ! 280: if ((rspencer = regcomp(pattern)) == NULL) ! 281: oops("regcomp failure"); ! 282: } ! 283: } ! 284: gosper(patboy); /* "pre-conditioning is wonderful" ! 285: * -- v. strassen */ ! 286: ! 287: if ((firstfile = optind) >= argc) { ! 288: /* Grep standard input */ ! 289: if (lflag) /* We don't know its name! */ ! 290: exit(1); ! 291: egsecute((char *) NULL); ! 292: } else { ! 293: while (optind < argc) { ! 294: egsecute(argv[optind]); ! 295: optind++; ! 296: if (firstflag && (nsuccess == 1)) ! 297: break; ! 298: } ! 299: } ! 300: exit((nsuccess == 2) ? 2 : (nsuccess == 0)); ! 301: } ! 302: ! 303: char * ! 304: pfile(pfname) /* absorb expression from file */ ! 305: char *pfname; ! 306: { ! 307: int fd; ! 308: struct stat patstat; ! 309: static char *pat; ! 310: ! 311: if ((fd = open(pfname, O_RDONLY, 0)) < 0) ! 312: oops("can't read pattern file"); ! 313: if (fstat(fd, &patstat) != 0) ! 314: oops("can't stat pattern file"); ! 315: if (patstat.st_size > PATSIZE) { ! 316: if (fgrepflag) { /* defer to unix version */ ! 317: fgrepold++; ! 318: return "dummy"; ! 319: } else ! 320: oops("pattern file too big"); ! 321: } ! 322: if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL) ! 323: oops("out of memory to read pattern file"); ! 324: if (patstat.st_size != read(fd, pat, (int)patstat.st_size)) ! 325: oops("error reading pattern file"); ! 326: (void) close(fd); ! 327: ! 328: pat[patstat.st_size] = EOS; ! 329: if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */ ! 330: pat[patstat.st_size - 1] = EOS; ! 331: ! 332: if (nlcount(pat, &pat[patstat.st_size]) > NALT) { ! 333: if (fgrepflag) ! 334: fgrepold++; /* "what's it all about, alfie?" */ ! 335: else ! 336: egrepold++; ! 337: } ! 338: return (pat); ! 339: } ! 340: ! 341: egsecute(file) ! 342: char *file; ! 343: { ! 344: if (file == NULL) ! 345: fd = 0; ! 346: else if ((fd = open(file, O_RDONLY, 0)) <= 0) { ! 347: fprintf(stderr, "%s: can't open %s\n", progname, file); ! 348: nsuccess = 2; ! 349: return; ! 350: } ! 351: chimaera(file, patboy); ! 352: ! 353: if (!boyonly && !flushflag && file != NULL) ! 354: flushmatches(); ! 355: if (file != NULL) ! 356: close(fd); ! 357: } ! 358: ! 359: chimaera(file, pat) /* "reach out and boyer-moore search someone" */ ! 360: char *file, *pat; /* -- soon-to-be-popular bumper sticker */ ! 361: { ! 362: register char *k, *strend, *s; ! 363: register int j, count; ! 364: register int *deltazero = delta0; ! 365: int patlen = altmin; ! 366: char *t; ! 367: char *gotamatch(), *kanji(), *linesave(), *submatch(); ! 368: ! 369: nleftover = boyfound = flushflag = 0; ! 370: nline = 1L; ! 371: prevmatch = 0; ! 372: nmatch = counted = rxcount = 0L; ! 373: ! 374: while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) { ! 375: ! 376: counted += count; ! 377: strend = linesave(str, count); ! 378: ! 379: for (k = str + patlen - 1; k < strend;) { ! 380: /* ! 381: * for a large class of patterns, upwards of 80% of ! 382: * match time is spent on the next line. we beat ! 383: * existing microcode (vax 'matchc') this way. ! 384: */ ! 385: while ((k += deltazero[*(unsigned char *) k]) < strend); ! 386: if (k < (str + LARGE)) ! 387: break; ! 388: k -= LARGE; ! 389: ! 390: if (altflag) { ! 391: /* ! 392: * Parallel Boyer-Moore. Check whether each ! 393: * of the previous <altmin> chars COULD be ! 394: * from one of the alternative strings. ! 395: */ ! 396: s = k - 1; ! 397: j = altmin; ! 398: while (altset[--j][(unsigned char) ! 399: cmap[*(unsigned char *) s--]]); ! 400: /* ! 401: * quick test fails. in this life, compare ! 402: * 'em all. but, a "reverse trie" would ! 403: * attenuate worst case (linear w/delta2?). ! 404: */ ! 405: if (--j < 0) { ! 406: count = nalt - 1; ! 407: do { ! 408: s = k; ! 409: j = altlen[count]; ! 410: t = altpat[count]; ! 411: ! 412: while ! 413: (cmap[*(unsigned char *) s--] ! 414: == t[--j]); ! 415: if (j < 0) ! 416: break; ! 417: } ! 418: while (count--); ! 419: } ! 420: } else { ! 421: /* One string -- check it */ ! 422: j = patlen - 1; ! 423: s = k - 1; ! 424: while (cmap[*(unsigned char *) s--] == pat[--j]); ! 425: } ! 426: /* ! 427: * delta-less shortcut for literati. short shrift for ! 428: * genetic engineers? ! 429: */ ! 430: if (j >= 0) { ! 431: k++; /* no match; restart next char */ ! 432: continue; ! 433: } ! 434: k = submatch(file, pat, str, strend, k, count); ! 435: if (k == NULL) ! 436: return; ! 437: } ! 438: if (nflag) { ! 439: if (prevmatch) ! 440: nline = prevnline + nlcount(prevloc, k); ! 441: else ! 442: nline = nline + nlcount(str, k); ! 443: prevmatch = 0; ! 444: } ! 445: strncpy(str, linetemp, nleftover); ! 446: } ! 447: if (cflag) { ! 448: /* Bug from old grep: -c overrides -h. We fix the bug. */ ! 449: if (!hflag) ! 450: printf("%s:", file); ! 451: printf("%ld\n", nmatch); ! 452: } ! 453: } ! 454: ! 455: char * ! 456: linesave(str, count) /* accumulate partial line at end of buffer */ ! 457: char str[]; ! 458: register int count; ! 459: { ! 460: register int j; ! 461: ! 462: count += nleftover; ! 463: if (count != BUFSIZE && fd != 0) ! 464: str[count++] = NL; /* insurance for broken last line */ ! 465: str[count] = EOS; ! 466: for (j = count - 1; str[j] != NL && j >= 0;) ! 467: j--; ! 468: /* ! 469: * break up these lines: long line (> BUFSIZE), last line of file, or ! 470: * short return from read(), as from tee(1) input ! 471: */ ! 472: if (j < 0 && (count == (BUFSIZE - nleftover))) { ! 473: str[count++] = NL; ! 474: str[count] = EOS; ! 475: linetemp[0] = EOS; ! 476: nleftover = 0; ! 477: return (str + count); ! 478: } else { ! 479: nleftover = count - j - 1; ! 480: strncpy(linetemp, str + j + 1, nleftover); ! 481: return (str + j); ! 482: } ! 483: } ! 484: ! 485: /* ! 486: * Process partial match. First check for mis-aligned Kanji, then match line ! 487: * against full compiled r.e. if statistics do not warrant handing off to ! 488: * standard egrep. ! 489: */ ! 490: char * ! 491: submatch(file, pat, str, strend, k, altindex) ! 492: char file[], pat[], str[]; ! 493: register char *strend, *k; ! 494: int altindex; ! 495: { ! 496: register char *s; ! 497: char *t, c; ! 498: ! 499: t = k; ! 500: s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1); ! 501: #ifndef NOKANJI ! 502: c = ((altflag) ? altpat[altindex][0] : pat[0]); ! 503: if (c & NONASCII) ! 504: if ((s = kanji(str, s, k)) == NULL) ! 505: return (++k); /* reject false kanji */ ! 506: #endif ! 507: do; ! 508: while (*s != NL && --s >= str); ! 509: k = s + 1; /* now at line start */ ! 510: ! 511: if (boyonly) ! 512: return (gotamatch(file, k)); ! 513: ! 514: incount = counted - (strend - k); ! 515: if (boyfound++ == FIRSTFEW) ! 516: execstrategy(file); ! 517: ! 518: s = t; ! 519: do ! 520: rxcount++; ! 521: while (*s++ != NL); ! 522: *--s = EOS; ! 523: /* ! 524: * "quick henry -- the flit" (after theodor geisel) ! 525: */ ! 526: if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) { ! 527: *s = NL; ! 528: if (gotamatch(file, k) == NULL) ! 529: return (NULL); ! 530: } ! 531: *s = NL; ! 532: return (s + 1); ! 533: } ! 534: ! 535: #ifndef NOKANJI ! 536: /* ! 537: * EUC code disambiguation -- scan backwards to first 7-bit code, while ! 538: * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern. ! 539: * SS2/3 checks are for intermixed Japanase Katakana or Kanji2. ! 540: */ ! 541: char * ! 542: kanji(str, s, k) ! 543: register char *str, *s, *k; ! 544: { ! 545: register int j = 0; ! 546: ! 547: for (s--; s >= str; s--) { ! 548: if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0) ! 549: break; ! 550: j++; ! 551: } ! 552: #ifndef CHINESE ! 553: if (*s == SS2) ! 554: j -= 1; ! 555: #endif CHINESE ! 556: return ((j & 01) ? NULL : k); ! 557: } ! 558: #endif ! 559: ! 560: /* ! 561: * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c] ! 562: */ ! 563: gosper(pattern) ! 564: char *pattern; /* ... HAKMEM lives ... */ ! 565: { ! 566: register int i, j; ! 567: unsigned char c; ! 568: ! 569: /* Make one-string case look like simple alternatives case */ ! 570: if (!altflag) { ! 571: nalt = 1; ! 572: altmin = altlen[0] = strlen(pattern); ! 573: altpat[0] = pattern; ! 574: } ! 575: /* For chars that aren't in any string, skip by string length. */ ! 576: for (j = 0; j < 256; j++) { ! 577: delta0[j] = altmin; ! 578: cmap[j] = j; /* Sneak in initialization of cmap */ ! 579: } ! 580: ! 581: /* For chars in a string, skip distance from char to end of string. */ ! 582: /* (If char appears more than once, skip minimum distance.) */ ! 583: for (i = 0; i < nalt; i++) ! 584: for (j = 0; j < altlen[i] - 1; j++) { ! 585: c = altpat[i][j]; ! 586: delta0[c] = MIN(delta0[c], altlen[i] - j - 1); ! 587: if (iflag && islower((int) c)) ! 588: delta0[toupper((int) c)] = delta0[c]; ! 589: } ! 590: ! 591: /* For last char of each string, fall out of search loop. */ ! 592: for (i = 0; i < nalt; i++) { ! 593: c = altpat[i][altlen[i] - 1]; ! 594: delta0[c] = LARGE; ! 595: if (iflag && islower((int) c)) ! 596: delta0[toupper((int) c)] = LARGE; ! 597: } ! 598: if (iflag) ! 599: for (j = 'A'; j <= 'Z'; j++) ! 600: cmap[j] = tolower((int) j); ! 601: } ! 602: ! 603: /* ! 604: * Print, count, or stop on full match. Result is either the location for ! 605: * continued search, or NULL to stop. ! 606: */ ! 607: char * ! 608: gotamatch(file, s) ! 609: register char *file, *s; ! 610: { ! 611: char *savematch(); ! 612: int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */ ! 613: ! 614: nmatch++; ! 615: nsuccess = 1; ! 616: if (!boyonly && boyfound <= FIRSTFEW && file != NULL) ! 617: squirrel = 1; ! 618: ! 619: if (sflag) ! 620: return (NULL); /* -s usurps all flags (unlike some versions) */ ! 621: if (cflag) { /* -c overrides -l, we guess */ ! 622: do; ! 623: while (*s++ != NL); ! 624: } else if (lflag) { ! 625: puts(file); ! 626: return (NULL); ! 627: } else { ! 628: if (!hflag) ! 629: if (!squirrel) ! 630: printf("%s:", file); ! 631: else ! 632: (void)sprintf(preamble, "%s:", file); ! 633: if (nflag) { ! 634: if (prevmatch) ! 635: prevnline = prevnline + nlcount(prevloc, s); ! 636: else ! 637: prevnline = nline + nlcount(str, s); ! 638: prevmatch = 1; ! 639: ! 640: if (!squirrel) ! 641: printf("%ld:", prevnline); ! 642: else ! 643: (void)sprintf(preamble + strlen(preamble), ! 644: "%ld:", prevnline); ! 645: } ! 646: if (!squirrel) { ! 647: do ! 648: putchar(*s); ! 649: while (*s++ != NL); ! 650: } else ! 651: s = savematch(s); ! 652: ! 653: if (nflag) ! 654: prevloc = s - 1; ! 655: } ! 656: return ((firstflag && !cflag) ? NULL : s); ! 657: } ! 658: ! 659: char * ! 660: fold(line) ! 661: char *line; ! 662: { ! 663: static char fline[BUFSIZE]; ! 664: register char *s, *t = fline; ! 665: ! 666: for (s = line; *s != EOS; s++) ! 667: *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s); ! 668: *t = EOS; ! 669: return (fline); ! 670: } ! 671: ! 672: strindex(s, t) /* the easy way, as in K&P, p. 192 */ ! 673: char *s, *t; ! 674: { ! 675: int i, n; ! 676: ! 677: n = strlen(t); ! 678: for (i = 0; s[i] != '\0'; i++) ! 679: if (strncmp(s + i, t, n) == 0) ! 680: return (i); ! 681: return (-1); ! 682: } ! 683: ! 684: char * ! 685: grepxlat(pattern) /* grep pattern meta conversion */ ! 686: char *pattern; ! 687: { ! 688: register char *p, *s; ! 689: static char newpat[BUFSIZE]; ! 690: ! 691: for (s = newpat, p = pattern; *p != EOS;) { ! 692: if (*p == '\\') { /* skip escapes ... */ ! 693: *s++ = *p++; ! 694: if (*p) ! 695: *s++ = *p++; ! 696: } else if (*p == '[') { /* ... and char classes */ ! 697: while (*p != EOS && *p != ']') ! 698: *s++ = *p++; ! 699: } else if (strchr("+?|()", *p) != NULL) { ! 700: *s++ = '\\'; /* insert protection */ ! 701: *s++ = *p++; ! 702: } else ! 703: *s++ = *p++; ! 704: } ! 705: *s = EOS; ! 706: grepflag = ((patind) ? 0 : 1); ! 707: return (newpat); ! 708: } ! 709: ! 710: /* ! 711: * Test for simple alternation. Result is NULL if it's not so simple, or is ! 712: * a pointer to the first string if it is. Warning: sscanf size is a ! 713: * fixpoint, beyond which the speedup linearity starts to break down. In the ! 714: * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing ! 715: * altpat[] to arbitrary size is not useful. ! 716: */ ! 717: char * ! 718: alternate(regexpr) ! 719: char *regexpr; ! 720: { ! 721: register int i, j; ! 722: register char *start, *stop; ! 723: unsigned char c; ! 724: ! 725: if (fgrepflag && strchr(regexpr, '|')) ! 726: return (NULL); ! 727: ! 728: /* ! 729: * break pattern up into altpat array; delimit on newline, bar, ! 730: * or EOS. We know we won't overflow, we've already checked the ! 731: * number of patterns we're going to find against NALT. ! 732: * Also, set length of pattern and find minimum pattern length. ! 733: */ ! 734: nalt = 0; ! 735: altmin = NMUSH; ! 736: for (start = stop = regexpr;; ++stop) ! 737: if (!*stop || *stop == '|' || *stop == NL) { ! 738: altlen[nalt] = j = stop - start; ! 739: if (j < altmin) ! 740: altmin = j; ! 741: if (!(altpat[nalt] = malloc((u_int)(j + 1)))) ! 742: oops("out of memory"); ! 743: bcopy(start, altpat[nalt], j); ! 744: altpat[nalt][j] = EOS; ! 745: ++nalt; ! 746: if (!*stop) ! 747: break; ! 748: if (nalt == NALT) ! 749: return(NULL); ! 750: if (*stop == NL) ! 751: *stop = '|'; ! 752: start = stop + 1; ! 753: } ! 754: if (!fgrepflag) { ! 755: if (strchr(regexpr, '|') == NULL || regexpr[0] == '|') ! 756: return (NULL); ! 757: if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL ! 758: || strindex(regexpr, "||") >= 0) ! 759: return (NULL); ! 760: } ! 761: ! 762: if (nalt > 1) { /* build superimposed "pre-match" sets per ! 763: * char */ ! 764: altflag++; ! 765: for (j = 0; j < nalt; j++) ! 766: for (i = 0; i < altmin; i++) { ! 767: c = altpat[j][altlen[j] - altmin + i]; ! 768: altset[i + 1][c] = 1; /* offset for sentinel */ ! 769: } ! 770: } ! 771: return (altpat[0]); ! 772: } ! 773: ! 774: /* ! 775: * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to ! 776: * determine whether to use dfa-based egrep: We do FIRSTFEW matches with ! 777: * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT ! 778: * of the input, the r.e. is likely to be underspecified, so do old *grep, ! 779: * which is faster on complex patterns than regexp(). At FIRSTFEW, ! 780: * dump the saved matches collected by savematch(). They are saved ! 781: * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic, ! 782: * since it's hard to rewind. ! 783: */ ! 784: ! 785: execstrategy(file) ! 786: char *file; ! 787: { ! 788: int pctmatch; ! 789: ! 790: pctmatch = (100 * rxcount) / incount; ! 791: if (pctmatch > PUNTPERCENT && file != NULL) ! 792: kernighan(args); ! 793: if (file != NULL) ! 794: flushmatches(); ! 795: } ! 796: ! 797: nlcount(bstart, bstop) /* flail interval to totalize newlines. */ ! 798: char *bstart, *bstop; ! 799: { ! 800: register char *s = bstart; ! 801: register char *t = bstop; ! 802: register int count = 0; ! 803: ! 804: do { /* loop unroll for older architectures */ ! 805: if (*t == NL) /* ... ask ames!jaw for sample code */ ! 806: count++; ! 807: } while (t-- > s); ! 808: ! 809: return (count); ! 810: } ! 811: ! 812: char * ! 813: isolate(regexpr) /* isolate longest metacharacter-free string */ ! 814: char *regexpr; ! 815: { ! 816: char *dummyexpr; ! 817: ! 818: /* ! 819: * We add (.)* because Henry's regcomp only figures regmust if it ! 820: * sees a leading * pattern. Foo! ! 821: */ ! 822: dummyexpr = malloc((unsigned) strlen(regexpr) + 5); ! 823: (void)sprintf(dummyexpr, "(.)*%s", regexpr); ! 824: if ((rspencer = regcomp(dummyexpr)) == NULL) ! 825: kernighan(args); ! 826: return (rspencer->regmust); ! 827: } ! 828: ! 829: char *matches[FIRSTFEW]; ! 830: static int mcount = 0; ! 831: ! 832: char * ! 833: savematch(s) /* horde matches during statistics gathering */ ! 834: register char *s; ! 835: { ! 836: char *p; ! 837: char *start = s; ! 838: int msize = 0; ! 839: int psize = strlen(preamble); ! 840: ! 841: while (*s++ != NL) ! 842: msize++; ! 843: *--s = EOS; ! 844: ! 845: p = malloc((unsigned) msize + 1 + psize); ! 846: strcpy(p, preamble); ! 847: strcpy(p + psize, start); ! 848: matches[mcount++] = p; ! 849: ! 850: preamble[0] = 0; ! 851: *s = NL; ! 852: return (s); ! 853: } ! 854: ! 855: flushmatches() ! 856: { ! 857: int n; ! 858: ! 859: flushflag = 1; ! 860: for (n = 0; n < mcount; n++) ! 861: printf("%s\n", matches[n]); ! 862: mcount = 0; ! 863: } ! 864: ! 865: oops(message) ! 866: char *message; ! 867: { ! 868: fprintf(stderr, "%s: %s\n", progname, message); ! 869: exit(2); ! 870: } ! 871: ! 872: kernighan(args) /* "let others do the hard part ..." */ ! 873: char *args[]; ! 874: { ! 875: /* ! 876: * We may have already run grep on some of the files; remove them ! 877: * from the arg list we pass on. Note that we can't delete them ! 878: * totally because the number of file names affects the output ! 879: * (automatic -h). ! 880: */ ! 881: /* better would be fork/exec per punted file -- jaw */ ! 882: ! 883: while (firstfile && optind > firstfile) ! 884: args[firstfile++] = "/dev/null"; ! 885: if (patind) ! 886: args[patind] = pattern; ! 887: (void) fflush(stdout); ! 888: ! 889: if (grepflag) ! 890: execvp(GREPSTD, args), oops("can't exec old 'grep'"); ! 891: else if (fgrepflag) ! 892: execvp(FGREPSTD, args), oops("can't exec old 'fgrep'"); ! 893: else ! 894: execvp(EGREPSTD, args), oops("can't exec old 'egrep'"); ! 895: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.