|
|
1.1 root 1: #ifndef lint
2: static char sccsid[] = "@(#)egrep.c 5.10 (Berkeley) 6/30/88";
3: #endif not lint
4:
5: /*
6: Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
7: table as in original paper (CACM, October, 1977). No delta1 or delta2.
8: According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
9: minimal practical value. However, to improve for worst case input,
10: integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
11: Comput., Feb. 1986) deserves consideration.
12:
13: Method: extract longest metacharacter-free string from expression.
14: this is done using a side-effect from henry spencer's regcomp().
15: use boyer-moore to match such, then pass submatching lines
16: to either regexp() or standard 'egrep', depending on certain
17: criteria within execstrategy() below. [this tradeoff is due
18: to the general slowness of the regexp() nondeterministic
19: machine on complex expressions, as well as the startup time
20: of standard 'egrep' on short files.] alternatively, one may
21: change the vendor-supplied 'egrep' automaton to include
22: boyer-moore directly. see accompanying writeup for discussion
23: of kanji expression treatment.
24:
25: late addition: apply trickbag for fast match of simple
26: alternations (sublinear, in common low-cardinality cases).
27: trap fgrep into this lair.
28:
29: gnu additions: -f, newline as |, \< and \> [in regexec()], more
30: comments. inspire better dfa exec() strategy.
31: serious testing and help with special cases.
32:
33: Algorithm amalgam summary:
34:
35: dfa e?grep (aho/thompson)
36: ndfa regexp() (spencer/aho)
37: bmg (boyer/moore/gosper)
38: "superimposed" bmg (jaw)
39: fgrep (aho/corrasick)
40:
41: sorry, but the knuth/morris/pratt machine, horspool's
42: "frequentist" code, and the rabin/karp matcher, however cute,
43: just don't cut it for this production.
44:
45: James A. Woods Copyright (c) 1986
46: NASA Ames Research Center
47: */
48: #include <stdio.h>
49: #include <ctype.h>
50: #include <sys/types.h>
51: #include <sys/stat.h>
52: #include <sys/file.h>
53: #include <regexp.h> /* must be henry spencer's version */
54:
55: #define MIN(A, B) ((A) > (B) ? (B) : (A))
56:
57: #ifdef SLOWSYS
58: #define read xread
59: #endif
60: /*
61: * define existing [ef]?grep program locations below for use by execvp().
62: * [execlp() would be used were it not for the possibility of
63: * installation-dependent recursion.]
64: */
65: #ifndef EGREPSTD
66: #define EGREPSTD "/usr/bin/egrep"
67: #endif
68: #ifndef GREPSTD
69: #define GREPSTD "/bin/grep"
70: #endif
71: #ifndef FGREPSTD
72: #define FGREPSTD "/usr/bin/fgrep"
73: #endif
74:
75: #define BUFSIZE 8192 /* make higher for cray */
76: #define PATSIZE 6000
77: #define LARGE BUFSIZE + PATSIZE
78:
79: #define NALT 7 /* tied to scanf() size in alternate() */
80: #define NMUSH 6 /* loosely relates to expected alt length */
81:
82: #define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */
83: #define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input
84: * was processed by regexp(), exec std egrep. */
85: #define NL '\n'
86: #define EOS '\0'
87: #define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */
88: #define META "\n^$.[]()?+*|\\" /* egrep meta-characters */
89: #define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */
90: #define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */
91:
92: extern char *optarg;
93: extern int optind;
94: char *progname;
95:
96: int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */
97: int sflag, hflag; /* v7, v8, bsd */
98:
99: int firstflag; /* Stop at first match */
100: int grepflag; /* Called as "grep" */
101: int fgrepflag; /* Called as "fgrep" */
102: int altflag; /* Simple alternation in pattern */
103: int boyonly; /* No regexp needed -- all simple */
104: int flushflag;
105: int grepold, egrepold, fgrepold;
106:
107: int nalt; /* Number of alternatives */
108: int nsuccess; /* 1 for match, 2 for error */
109: int altmin; /* Minimum length of all the alternate
110: * strings */
111: int firstfile; /* argv index of first file argument */
112: int patind; /* argv index of pattern */
113: long nmatch; /* Number of matches in this file */
114: long incount, counted; /* Amount of input consumed */
115: long rxcount; /* Bytes of input processed by regexec() */
116: int boyfound; /* accumulated partial matches (tripped by
117: * FIRSTFEW) */
118: int prevmatch; /* next three lines aid fast -n */
119: long nline, prevnline;
120: char *prevloc;
121:
122: regexp *rspencer;
123: char *pattern;
124: char *patboy; /* Pattern for simple Boyer-Moore */
125: char *patfile; /* Filename containing pattern(s) */
126:
127: int delta0[256]; /* Boyer-Moore algorithm core */
128: char cmap[256]; /* Usually 0-255, but if -i, maps upper to
129: * lower case */
130: char str[BUFSIZE + 2];
131: int nleftover;
132: char linetemp[BUFSIZE];
133: char *altpat[NALT]; /* alternation component storage */
134: int altlen[NALT];
135: short altset[NMUSH + 1][256];
136: char preamble[200]; /* match prefix (filename, line no.) */
137:
138: int fd;
139: char *
140: strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc();
141: char *
142: grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
143: char **args;
144:
145: main(argc, argv)
146: int argc;
147: char *argv[];
148: {
149: int c, oflag;
150: int errflag = 0;
151:
152: args = argv;
153:
154: if ((progname = strrchr(argv[0], '/')) != 0)
155: progname++;
156: else
157: progname = argv[0];
158: if (strcmp(progname, "grep") == 0)
159: grepflag++;
160: else if (strcmp(progname, "fgrep") == 0)
161: fgrepflag++;
162:
163: oflag = 0;
164: while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) {
165: switch (c) {
166:
167: case 'f':
168: fflag++;
169: patfile = optarg;
170: continue;
171: case 'b':
172: case 'v':
173: egrepold++; /* boyer-moore of little help here */
174: continue;
175: case 'c':
176: cflag++;
177: continue;
178: case 'e':
179: eflag++;
180: pattern = optarg;
181: continue;
182: case 'h':
183: hflag++;
184: continue;
185: case 'o':
186: oflag++;
187: continue;
188: case '1': /* Stop at very first match */
189: firstflag++; /* spead freaks only */
190: continue;
191: case 'i':
192: iflag++;
193: continue;
194: case 'l':
195: lflag++;
196: continue;
197: case 'n':
198: nflag++;
199: continue;
200: case 's':
201: sflag++;
202: continue;
203: case 'w':
204: case 'y':
205: if (!grepflag)
206: errflag++;
207: grepold++;
208: continue;
209: case 'x': /* needs more work, like -b above */
210: if (!fgrepflag)
211: errflag++;
212: fgrepold++;
213: continue;
214: case '?':
215: errflag++;
216: }
217: }
218: if (errflag || ((argc <= optind) && !fflag && !eflag)) {
219: if (grepflag)
220: oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]");
221: else if (fgrepflag)
222: oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]");
223: else /* encourage SVID options, though we provide
224: * others */
225: oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]");
226: }
227: if (fflag)
228: pattern = pfile(patfile);
229: else if (!eflag) {
230: patind = optind;
231: pattern = argv[optind++];
232: }
233:
234: if (!oflag && (argc - optind) <= 1) /* Filename invisible given < 2 files */
235: hflag++;
236: if (pattern[0] == EOS)
237: kernighan(argv); /* same as it ever was */
238: /*
239: * 'grep/egrep' merger -- "old" grep is called to handle: tagged
240: * exprs \( \), word matches \< and \>, -w and -y options, char
241: * classes with '-' at end (egrep bug?), and patterns beginning with
242: * an asterisk (don't ask why). otherwise, characters meaningful to
243: * 'egrep' but not to 'grep' are escaped; the entire expr is then
244: * passed to 'egrep'.
245: */
246: if (grepflag && !grepold) {
247: if (strindex(pattern, "\\(") >= 0 ||
248: strindex(pattern, "\\<") >= 0 ||
249: strindex(pattern, "\\>") >= 0 ||
250: strindex(pattern, "-]") >= 0 ||
251: pattern[0] == '*') /* grep bug */
252: grepold++;
253: else
254: pattern = grepxlat(pattern);
255: }
256: if (grepold || egrepold || fgrepold)
257: kernighan(argv);
258:
259: if (iflag)
260: strcpy(pattern, fold(pattern));
261: /*
262: * If the pattern is a plain string, just run boyer-moore. If it
263: * consists of meta-free alternatives, run "superimposed" bmg.
264: * Otherwise, find best string, and compile pattern for regexec().
265: */
266: if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */
267: boyonly++;
268: patboy = pattern;
269: } else {
270: if ((patboy = alternate(pattern)) != NULL)
271: boyonly++;
272: else {
273: if ((patboy = isolate(pattern)) == NULL)
274: kernighan(argv); /* expr too involved */
275: #ifndef NOKANJI
276: for (c = 0; pattern[c] != EOS; c++)
277: if (pattern[c] & NONASCII) /* kanji + meta */
278: kernighan(argv);
279: #endif
280: if ((rspencer = regcomp(pattern)) == NULL)
281: oops("regcomp failure");
282: }
283: }
284: gosper(patboy); /* "pre-conditioning is wonderful"
285: * -- v. strassen */
286:
287: if ((firstfile = optind) >= argc) {
288: /* Grep standard input */
289: if (lflag) /* We don't know its name! */
290: exit(1);
291: egsecute((char *) NULL);
292: } else {
293: while (optind < argc) {
294: egsecute(argv[optind]);
295: optind++;
296: if (firstflag && (nsuccess == 1))
297: break;
298: }
299: }
300: exit((nsuccess == 2) ? 2 : (nsuccess == 0));
301: }
302:
303: char *
304: pfile(pfname) /* absorb expression from file */
305: char *pfname;
306: {
307: int fd;
308: struct stat patstat;
309: static char *pat;
310:
311: if ((fd = open(pfname, O_RDONLY, 0)) < 0)
312: oops("can't read pattern file");
313: if (fstat(fd, &patstat) != 0)
314: oops("can't stat pattern file");
315: if (patstat.st_size > PATSIZE) {
316: if (fgrepflag) { /* defer to unix version */
317: fgrepold++;
318: return "dummy";
319: } else
320: oops("pattern file too big");
321: }
322: if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL)
323: oops("out of memory to read pattern file");
324: if (patstat.st_size != read(fd, pat, (int)patstat.st_size))
325: oops("error reading pattern file");
326: (void) close(fd);
327:
328: pat[patstat.st_size] = EOS;
329: if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */
330: pat[patstat.st_size - 1] = EOS;
331:
332: if (nlcount(pat, &pat[patstat.st_size]) > NALT) {
333: if (fgrepflag)
334: fgrepold++; /* "what's it all about, alfie?" */
335: else
336: egrepold++;
337: }
338: return (pat);
339: }
340:
341: egsecute(file)
342: char *file;
343: {
344: if (file == NULL)
345: fd = 0;
346: else if ((fd = open(file, O_RDONLY, 0)) <= 0) {
347: fprintf(stderr, "%s: can't open %s\n", progname, file);
348: nsuccess = 2;
349: return;
350: }
351: chimaera(file, patboy);
352:
353: if (!boyonly && !flushflag && file != NULL)
354: flushmatches();
355: if (file != NULL)
356: close(fd);
357: }
358:
359: chimaera(file, pat) /* "reach out and boyer-moore search someone" */
360: char *file, *pat; /* -- soon-to-be-popular bumper sticker */
361: {
362: register char *k, *strend, *s;
363: register int j, count;
364: register int *deltazero = delta0;
365: int patlen = altmin;
366: char *t;
367: char *gotamatch(), *kanji(), *linesave(), *submatch();
368:
369: nleftover = boyfound = flushflag = 0;
370: nline = 1L;
371: prevmatch = 0;
372: nmatch = counted = rxcount = 0L;
373:
374: while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) {
375:
376: counted += count;
377: strend = linesave(str, count);
378:
379: for (k = str + patlen - 1; k < strend;) {
380: /*
381: * for a large class of patterns, upwards of 80% of
382: * match time is spent on the next line. we beat
383: * existing microcode (vax 'matchc') this way.
384: */
385: while ((k += deltazero[*(unsigned char *) k]) < strend);
386: if (k < (str + LARGE))
387: break;
388: k -= LARGE;
389:
390: if (altflag) {
391: /*
392: * Parallel Boyer-Moore. Check whether each
393: * of the previous <altmin> chars COULD be
394: * from one of the alternative strings.
395: */
396: s = k - 1;
397: j = altmin;
398: while (altset[--j][(unsigned char)
399: cmap[*(unsigned char *) s--]]);
400: /*
401: * quick test fails. in this life, compare
402: * 'em all. but, a "reverse trie" would
403: * attenuate worst case (linear w/delta2?).
404: */
405: if (--j < 0) {
406: count = nalt - 1;
407: do {
408: s = k;
409: j = altlen[count];
410: t = altpat[count];
411:
412: while
413: (cmap[*(unsigned char *) s--]
414: == t[--j]);
415: if (j < 0)
416: break;
417: }
418: while (count--);
419: }
420: } else {
421: /* One string -- check it */
422: j = patlen - 1;
423: s = k - 1;
424: while (cmap[*(unsigned char *) s--] == pat[--j]);
425: }
426: /*
427: * delta-less shortcut for literati. short shrift for
428: * genetic engineers?
429: */
430: if (j >= 0) {
431: k++; /* no match; restart next char */
432: continue;
433: }
434: k = submatch(file, pat, str, strend, k, count);
435: if (k == NULL)
436: return;
437: }
438: if (nflag) {
439: if (prevmatch)
440: nline = prevnline + nlcount(prevloc, k);
441: else
442: nline = nline + nlcount(str, k);
443: prevmatch = 0;
444: }
445: strncpy(str, linetemp, nleftover);
446: }
447: if (cflag) {
448: /* Bug from old grep: -c overrides -h. We fix the bug. */
449: if (!hflag)
450: printf("%s:", file);
451: printf("%ld\n", nmatch);
452: }
453: }
454:
455: char *
456: linesave(str, count) /* accumulate partial line at end of buffer */
457: char str[];
458: register int count;
459: {
460: register int j;
461:
462: count += nleftover;
463: if (count != BUFSIZE && fd != 0)
464: str[count++] = NL; /* insurance for broken last line */
465: str[count] = EOS;
466: for (j = count - 1; str[j] != NL && j >= 0;)
467: j--;
468: /*
469: * break up these lines: long line (> BUFSIZE), last line of file, or
470: * short return from read(), as from tee(1) input
471: */
472: if (j < 0 && (count == (BUFSIZE - nleftover))) {
473: str[count++] = NL;
474: str[count] = EOS;
475: linetemp[0] = EOS;
476: nleftover = 0;
477: return (str + count);
478: } else {
479: nleftover = count - j - 1;
480: strncpy(linetemp, str + j + 1, nleftover);
481: return (str + j);
482: }
483: }
484:
485: /*
486: * Process partial match. First check for mis-aligned Kanji, then match line
487: * against full compiled r.e. if statistics do not warrant handing off to
488: * standard egrep.
489: */
490: char *
491: submatch(file, pat, str, strend, k, altindex)
492: char file[], pat[], str[];
493: register char *strend, *k;
494: int altindex;
495: {
496: register char *s;
497: char *t, c;
498:
499: t = k;
500: s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1);
501: #ifndef NOKANJI
502: c = ((altflag) ? altpat[altindex][0] : pat[0]);
503: if (c & NONASCII)
504: if ((s = kanji(str, s, k)) == NULL)
505: return (++k); /* reject false kanji */
506: #endif
507: do;
508: while (*s != NL && --s >= str);
509: k = s + 1; /* now at line start */
510:
511: if (boyonly)
512: return (gotamatch(file, k));
513:
514: incount = counted - (strend - k);
515: if (boyfound++ == FIRSTFEW)
516: execstrategy(file);
517:
518: s = t;
519: do
520: rxcount++;
521: while (*s++ != NL);
522: *--s = EOS;
523: /*
524: * "quick henry -- the flit" (after theodor geisel)
525: */
526: if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) {
527: *s = NL;
528: if (gotamatch(file, k) == NULL)
529: return (NULL);
530: }
531: *s = NL;
532: return (s + 1);
533: }
534:
535: #ifndef NOKANJI
536: /*
537: * EUC code disambiguation -- scan backwards to first 7-bit code, while
538: * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern.
539: * SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
540: */
541: char *
542: kanji(str, s, k)
543: register char *str, *s, *k;
544: {
545: register int j = 0;
546:
547: for (s--; s >= str; s--) {
548: if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0)
549: break;
550: j++;
551: }
552: #ifndef CHINESE
553: if (*s == SS2)
554: j -= 1;
555: #endif CHINESE
556: return ((j & 01) ? NULL : k);
557: }
558: #endif
559:
560: /*
561: * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
562: */
563: gosper(pattern)
564: char *pattern; /* ... HAKMEM lives ... */
565: {
566: register int i, j;
567: unsigned char c;
568:
569: /* Make one-string case look like simple alternatives case */
570: if (!altflag) {
571: nalt = 1;
572: altmin = altlen[0] = strlen(pattern);
573: altpat[0] = pattern;
574: }
575: /* For chars that aren't in any string, skip by string length. */
576: for (j = 0; j < 256; j++) {
577: delta0[j] = altmin;
578: cmap[j] = j; /* Sneak in initialization of cmap */
579: }
580:
581: /* For chars in a string, skip distance from char to end of string. */
582: /* (If char appears more than once, skip minimum distance.) */
583: for (i = 0; i < nalt; i++)
584: for (j = 0; j < altlen[i] - 1; j++) {
585: c = altpat[i][j];
586: delta0[c] = MIN(delta0[c], altlen[i] - j - 1);
587: if (iflag && islower((int) c))
588: delta0[toupper((int) c)] = delta0[c];
589: }
590:
591: /* For last char of each string, fall out of search loop. */
592: for (i = 0; i < nalt; i++) {
593: c = altpat[i][altlen[i] - 1];
594: delta0[c] = LARGE;
595: if (iflag && islower((int) c))
596: delta0[toupper((int) c)] = LARGE;
597: }
598: if (iflag)
599: for (j = 'A'; j <= 'Z'; j++)
600: cmap[j] = tolower((int) j);
601: }
602:
603: /*
604: * Print, count, or stop on full match. Result is either the location for
605: * continued search, or NULL to stop.
606: */
607: char *
608: gotamatch(file, s)
609: register char *file, *s;
610: {
611: char *savematch();
612: int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */
613:
614: nmatch++;
615: nsuccess = 1;
616: if (!boyonly && boyfound <= FIRSTFEW && file != NULL)
617: squirrel = 1;
618:
619: if (sflag)
620: return (NULL); /* -s usurps all flags (unlike some versions) */
621: if (cflag) { /* -c overrides -l, we guess */
622: do;
623: while (*s++ != NL);
624: } else if (lflag) {
625: puts(file);
626: return (NULL);
627: } else {
628: if (!hflag)
629: if (!squirrel)
630: printf("%s:", file);
631: else
632: (void)sprintf(preamble, "%s:", file);
633: if (nflag) {
634: if (prevmatch)
635: prevnline = prevnline + nlcount(prevloc, s);
636: else
637: prevnline = nline + nlcount(str, s);
638: prevmatch = 1;
639:
640: if (!squirrel)
641: printf("%ld:", prevnline);
642: else
643: (void)sprintf(preamble + strlen(preamble),
644: "%ld:", prevnline);
645: }
646: if (!squirrel) {
647: do
648: putchar(*s);
649: while (*s++ != NL);
650: } else
651: s = savematch(s);
652:
653: if (nflag)
654: prevloc = s - 1;
655: }
656: return ((firstflag && !cflag) ? NULL : s);
657: }
658:
659: char *
660: fold(line)
661: char *line;
662: {
663: static char fline[BUFSIZE];
664: register char *s, *t = fline;
665:
666: for (s = line; *s != EOS; s++)
667: *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s);
668: *t = EOS;
669: return (fline);
670: }
671:
672: strindex(s, t) /* the easy way, as in K&P, p. 192 */
673: char *s, *t;
674: {
675: int i, n;
676:
677: n = strlen(t);
678: for (i = 0; s[i] != '\0'; i++)
679: if (strncmp(s + i, t, n) == 0)
680: return (i);
681: return (-1);
682: }
683:
684: char *
685: grepxlat(pattern) /* grep pattern meta conversion */
686: char *pattern;
687: {
688: register char *p, *s;
689: static char newpat[BUFSIZE];
690:
691: for (s = newpat, p = pattern; *p != EOS;) {
692: if (*p == '\\') { /* skip escapes ... */
693: *s++ = *p++;
694: if (*p)
695: *s++ = *p++;
696: } else if (*p == '[') { /* ... and char classes */
697: while (*p != EOS && *p != ']')
698: *s++ = *p++;
699: } else if (strchr("+?|()", *p) != NULL) {
700: *s++ = '\\'; /* insert protection */
701: *s++ = *p++;
702: } else
703: *s++ = *p++;
704: }
705: *s = EOS;
706: grepflag = ((patind) ? 0 : 1);
707: return (newpat);
708: }
709:
710: /*
711: * Test for simple alternation. Result is NULL if it's not so simple, or is
712: * a pointer to the first string if it is. Warning: sscanf size is a
713: * fixpoint, beyond which the speedup linearity starts to break down. In the
714: * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
715: * altpat[] to arbitrary size is not useful.
716: */
717: char *
718: alternate(regexpr)
719: char *regexpr;
720: {
721: register int i, j;
722: register char *start, *stop;
723: unsigned char c;
724:
725: if (fgrepflag && strchr(regexpr, '|'))
726: return (NULL);
727:
728: /*
729: * break pattern up into altpat array; delimit on newline, bar,
730: * or EOS. We know we won't overflow, we've already checked the
731: * number of patterns we're going to find against NALT.
732: * Also, set length of pattern and find minimum pattern length.
733: */
734: nalt = 0;
735: altmin = NMUSH;
736: for (start = stop = regexpr;; ++stop)
737: if (!*stop || *stop == '|' || *stop == NL) {
738: altlen[nalt] = j = stop - start;
739: if (j < altmin)
740: altmin = j;
741: if (!(altpat[nalt] = malloc((u_int)(j + 1))))
742: oops("out of memory");
743: bcopy(start, altpat[nalt], j);
744: altpat[nalt][j] = EOS;
745: ++nalt;
746: if (!*stop)
747: break;
748: if (nalt == NALT)
749: return(NULL);
750: if (*stop == NL)
751: *stop = '|';
752: start = stop + 1;
753: }
754: if (!fgrepflag) {
755: if (strchr(regexpr, '|') == NULL || regexpr[0] == '|')
756: return (NULL);
757: if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL
758: || strindex(regexpr, "||") >= 0)
759: return (NULL);
760: }
761:
762: if (nalt > 1) { /* build superimposed "pre-match" sets per
763: * char */
764: altflag++;
765: for (j = 0; j < nalt; j++)
766: for (i = 0; i < altmin; i++) {
767: c = altpat[j][altlen[j] - altmin + i];
768: altset[i + 1][c] = 1; /* offset for sentinel */
769: }
770: }
771: return (altpat[0]);
772: }
773:
774: /*
775: * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
776: * determine whether to use dfa-based egrep: We do FIRSTFEW matches with
777: * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT
778: * of the input, the r.e. is likely to be underspecified, so do old *grep,
779: * which is faster on complex patterns than regexp(). At FIRSTFEW,
780: * dump the saved matches collected by savematch(). They are saved
781: * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic,
782: * since it's hard to rewind.
783: */
784:
785: execstrategy(file)
786: char *file;
787: {
788: int pctmatch;
789:
790: pctmatch = (100 * rxcount) / incount;
791: if (pctmatch > PUNTPERCENT && file != NULL)
792: kernighan(args);
793: if (file != NULL)
794: flushmatches();
795: }
796:
797: nlcount(bstart, bstop) /* flail interval to totalize newlines. */
798: char *bstart, *bstop;
799: {
800: register char *s = bstart;
801: register char *t = bstop;
802: register int count = 0;
803:
804: do { /* loop unroll for older architectures */
805: if (*t == NL) /* ... ask ames!jaw for sample code */
806: count++;
807: } while (t-- > s);
808:
809: return (count);
810: }
811:
812: char *
813: isolate(regexpr) /* isolate longest metacharacter-free string */
814: char *regexpr;
815: {
816: char *dummyexpr;
817:
818: /*
819: * We add (.)* because Henry's regcomp only figures regmust if it
820: * sees a leading * pattern. Foo!
821: */
822: dummyexpr = malloc((unsigned) strlen(regexpr) + 5);
823: (void)sprintf(dummyexpr, "(.)*%s", regexpr);
824: if ((rspencer = regcomp(dummyexpr)) == NULL)
825: kernighan(args);
826: return (rspencer->regmust);
827: }
828:
829: char *matches[FIRSTFEW];
830: static int mcount = 0;
831:
832: char *
833: savematch(s) /* horde matches during statistics gathering */
834: register char *s;
835: {
836: char *p;
837: char *start = s;
838: int msize = 0;
839: int psize = strlen(preamble);
840:
841: while (*s++ != NL)
842: msize++;
843: *--s = EOS;
844:
845: p = malloc((unsigned) msize + 1 + psize);
846: strcpy(p, preamble);
847: strcpy(p + psize, start);
848: matches[mcount++] = p;
849:
850: preamble[0] = 0;
851: *s = NL;
852: return (s);
853: }
854:
855: flushmatches()
856: {
857: int n;
858:
859: flushflag = 1;
860: for (n = 0; n < mcount; n++)
861: printf("%s\n", matches[n]);
862: mcount = 0;
863: }
864:
865: oops(message)
866: char *message;
867: {
868: fprintf(stderr, "%s: %s\n", progname, message);
869: exit(2);
870: }
871:
872: kernighan(args) /* "let others do the hard part ..." */
873: char *args[];
874: {
875: /*
876: * We may have already run grep on some of the files; remove them
877: * from the arg list we pass on. Note that we can't delete them
878: * totally because the number of file names affects the output
879: * (automatic -h).
880: */
881: /* better would be fork/exec per punted file -- jaw */
882:
883: while (firstfile && optind > firstfile)
884: args[firstfile++] = "/dev/null";
885: if (patind)
886: args[patind] = pattern;
887: (void) fflush(stdout);
888:
889: if (grepflag)
890: execvp(GREPSTD, args), oops("can't exec old 'grep'");
891: else if (fgrepflag)
892: execvp(FGREPSTD, args), oops("can't exec old 'fgrep'");
893: else
894: execvp(EGREPSTD, args), oops("can't exec old 'egrep'");
895: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.