|
|
1.1 root 1: #ifndef lint
2: static char sccsid[] = "@(#)egrep.c 5.12 (Berkeley) 5/11/89";
3: #endif not lint
4:
5: /*
6: Hybrid Boyer/Moore/Gosper-assisted 'grep/egrep/fgrep' search, with delta0
7: table as in original paper (CACM, October, 1977). No delta1 or delta2.
8: According to experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of
9: minimal practical value. However, to improve for worst case input,
10: integrating the improved Galil strategies (Apostolico/Giancarlo, SIAM. J.
11: Comput., Feb. 1986) deserves consideration.
12:
13: Method: extract longest metacharacter-free string from expression.
14: this is done using a side-effect from henry spencer's regcomp().
15: use boyer-moore to match such, then pass submatching lines
16: to either regexp() or standard 'egrep', depending on certain
17: criteria within execstrategy() below. [this tradeoff is due
18: to the general slowness of the regexp() nondeterministic
19: machine on complex expressions, as well as the startup time
20: of standard 'egrep' on short files.] alternatively, one may
21: change the vendor-supplied 'egrep' automaton to include
22: boyer-moore directly. see accompanying writeup for discussion
23: of kanji expression treatment.
24:
25: late addition: apply trickbag for fast match of simple
26: alternations (sublinear, in common low-cardinality cases).
27: trap fgrep into this lair.
28:
29: gnu additions: -f, newline as |, \< and \> [in regexec()], more
30: comments. inspire better dfa exec() strategy.
31: serious testing and help with special cases.
32:
33: Algorithm amalgam summary:
34:
35: dfa e?grep (aho/thompson)
36: ndfa regexp() (spencer/aho)
37: bmg (boyer/moore/gosper)
38: "superimposed" bmg (jaw)
39: fgrep (aho/corrasick)
40:
41: sorry, but the knuth/morris/pratt machine, horspool's
42: "frequentist" code, and the rabin/karp matcher, however cute,
43: just don't cut it for this production.
44:
45: James A. Woods Copyright (c) 1986
46: NASA Ames Research Center
47: */
48: #include <sys/types.h>
49: #include <sys/stat.h>
50: #include <sys/file.h>
51: #include <regexp.h> /* must be henry spencer's version */
52: #include <stdio.h>
53: #include <ctype.h>
54: #include "pathnames.h"
55:
56: #define MIN(A, B) ((A) > (B) ? (B) : (A))
57:
58: #ifdef SLOWSYS
59: #define read xread
60: #endif
61:
62: #define BUFSIZE 8192 /* make higher for cray */
63: #define PATSIZE 6000
64: #define LARGE BUFSIZE + PATSIZE
65:
66: #define NALT 7 /* tied to scanf() size in alternate() */
67: #define NMUSH 6 /* loosely relates to expected alt length */
68:
69: #define FIRSTFEW 33 /* Always do FIRSTFEW matches with regexec() */
70: #define PUNTPERCENT 10 /* After FIRSTFEW, if PUNTPERCENT of the input
71: * was processed by regexp(), exec std egrep. */
72: #define NL '\n'
73: #define EOS '\0'
74: #define NONASCII 0200 /* Bit mask for Kanji non-ascii chars */
75: #define META "\n^$.[]()?+*|\\" /* egrep meta-characters */
76: #define SS2 '\216' /* EUC Katakana (or Chinese2) prefix */
77: #define SS3 '\217' /* EUC Kanji2 (or Chinese3) prefix */
78:
79: extern char *optarg;
80: extern int optind;
81: char *progname;
82:
83: int cflag, iflag, eflag, fflag, lflag, nflag; /* SVID flags */
84: int sflag, hflag; /* v7, v8, bsd */
85:
86: int firstflag; /* Stop at first match */
87: int grepflag; /* Called as "grep" */
88: int fgrepflag; /* Called as "fgrep" */
89: int altflag; /* Simple alternation in pattern */
90: int boyonly; /* No regexp needed -- all simple */
91: int flushflag;
92: int grepold, egrepold, fgrepold;
93:
94: int nalt; /* Number of alternatives */
95: int nsuccess; /* 1 for match, 2 for error */
96: int altmin; /* Minimum length of all the alternate
97: * strings */
98: int firstfile; /* argv index of first file argument */
99: int patind; /* argv index of pattern */
100: long nmatch; /* Number of matches in this file */
101: long incount, counted; /* Amount of input consumed */
102: long rxcount; /* Bytes of input processed by regexec() */
103: int boyfound; /* accumulated partial matches (tripped by
104: * FIRSTFEW) */
105: int prevmatch; /* next three lines aid fast -n */
106: long nline, prevnline;
107: char *prevloc;
108:
109: regexp *rspencer;
110: char *pattern;
111: char *patboy; /* Pattern for simple Boyer-Moore */
112: char *patfile; /* Filename containing pattern(s) */
113:
114: int delta0[256]; /* Boyer-Moore algorithm core */
115: char cmap[256]; /* Usually 0-255, but if -i, maps upper to
116: * lower case */
117: char str[BUFSIZE + 2];
118: int nleftover;
119: char linetemp[BUFSIZE];
120: char *altpat[NALT]; /* alternation component storage */
121: int altlen[NALT];
122: short altset[NMUSH + 1][256];
123: char preamble[200]; /* match prefix (filename, line no.) */
124:
125: int fd;
126: char *
127: strchr(), *strrchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc();
128: char *
129: grepxlat(), *fold(), *pfile(), *alternate(), *isolate();
130: char *gotamatch(), *kanji(), *linesave(), *submatch();
131: char **args;
132:
133: main(argc, argv)
134: int argc;
135: char *argv[];
136: {
137: int c, oflag;
138: int errflag = 0;
139:
140: args = argv;
141:
142: if ((progname = strrchr(argv[0], '/')) != 0)
143: progname++;
144: else
145: progname = argv[0];
146: if (strcmp(progname, "grep") == 0)
147: grepflag++;
148: else if (strcmp(progname, "fgrep") == 0)
149: fgrepflag++;
150:
151: oflag = 0;
152: while ((c = getopt(argc, argv, "bchie:f:lnosvwxy1")) != EOF) {
153: switch (c) {
154:
155: case 'f':
156: fflag++;
157: patfile = optarg;
158: continue;
159: case 'b':
160: case 'v':
161: egrepold++; /* boyer-moore of little help here */
162: continue;
163: case 'c':
164: cflag++;
165: continue;
166: case 'e':
167: eflag++;
168: pattern = optarg;
169: continue;
170: case 'h':
171: hflag++;
172: continue;
173: case 'o':
174: oflag++;
175: continue;
176: case '1': /* Stop at very first match */
177: firstflag++; /* spead freaks only */
178: continue;
179: case 'i':
180: iflag++;
181: continue;
182: case 'l':
183: lflag++;
184: continue;
185: case 'n':
186: nflag++;
187: continue;
188: case 's':
189: sflag++;
190: continue;
191: case 'w':
192: case 'y':
193: if (!grepflag)
194: errflag++;
195: grepold++;
196: continue;
197: case 'x': /* needs more work, like -b above */
198: if (!fgrepflag)
199: errflag++;
200: fgrepold++;
201: continue;
202: case '?':
203: errflag++;
204: }
205: }
206: if (errflag || ((argc <= optind) && !fflag && !eflag)) {
207: if (grepflag)
208: oops("usage: grep [-bchilnosvwy] [-e] pattern [file ...]");
209: else if (fgrepflag)
210: oops("usage: fgrep [-bchilnosvx] {-f patfile | [-e] strings} [file ...]");
211: else /* encourage SVID options, though we provide
212: * others */
213: oops("usage: egrep [-bchilnosv] {-f patfile | [-e] pattern} [file ...]");
214: }
215: if (fflag)
216: pattern = pfile(patfile);
217: else if (!eflag) {
218: patind = optind;
219: pattern = argv[optind++];
220: }
221:
222: if (!oflag && (argc - optind) <= 1) /* Filename invisible given < 2 files */
223: hflag++;
224: if (pattern[0] == EOS)
225: kernighan(argv); /* same as it ever was */
226: /*
227: * 'grep/egrep' merger -- "old" grep is called to handle: tagged
228: * exprs \( \), word matches \< and \>, -w and -y options, char
229: * classes with '-' at end (egrep bug?), and patterns beginning with
230: * an asterisk (don't ask why). otherwise, characters meaningful to
231: * 'egrep' but not to 'grep' are escaped; the entire expr is then
232: * passed to 'egrep'.
233: */
234: if (grepflag && !grepold) {
235: if (strindex(pattern, "\\(") >= 0 ||
236: strindex(pattern, "\\<") >= 0 ||
237: strindex(pattern, "\\>") >= 0 ||
238: strindex(pattern, "-]") >= 0 ||
239: pattern[0] == '*') /* grep bug */
240: grepold++;
241: else
242: pattern = grepxlat(pattern);
243: }
244: if (grepold || egrepold || fgrepold)
245: kernighan(argv);
246:
247: if (iflag)
248: strcpy(pattern, fold(pattern));
249: /*
250: * If the pattern is a plain string, just run boyer-moore. If it
251: * consists of meta-free alternatives, run "superimposed" bmg.
252: * Otherwise, find best string, and compile pattern for regexec().
253: */
254: if (strpbrk(pattern, META) == NULL) { /* do boyer-moore only */
255: boyonly++;
256: patboy = pattern;
257: } else {
258: if ((patboy = alternate(pattern)) != NULL)
259: boyonly++;
260: else {
261: if ((patboy = isolate(pattern)) == NULL)
262: kernighan(argv); /* expr too involved */
263: #ifndef NOKANJI
264: for (c = 0; pattern[c] != EOS; c++)
265: if (pattern[c] & NONASCII) /* kanji + meta */
266: kernighan(argv);
267: #endif
268: if ((rspencer = regcomp(pattern)) == NULL)
269: oops("regcomp failure");
270: }
271: }
272: gosper(patboy); /* "pre-conditioning is wonderful"
273: * -- v. strassen */
274:
275: if ((firstfile = optind) >= argc) {
276: /* Grep standard input */
277: if (lflag) /* We don't know its name! */
278: exit(1);
279: egsecute((char *) NULL);
280: } else {
281: while (optind < argc) {
282: egsecute(argv[optind]);
283: optind++;
284: if (firstflag && (nsuccess == 1))
285: break;
286: }
287: }
288: exit((nsuccess == 2) ? 2 : (nsuccess == 0));
289: }
290:
291: char *
292: pfile(pfname) /* absorb expression from file */
293: char *pfname;
294: {
295: int fd;
296: struct stat patstat;
297: static char *pat;
298:
299: if ((fd = open(pfname, O_RDONLY, 0)) < 0)
300: oops("can't read pattern file");
301: if (fstat(fd, &patstat) != 0)
302: oops("can't stat pattern file");
303: if (patstat.st_size > PATSIZE) {
304: if (fgrepflag) { /* defer to unix version */
305: fgrepold++;
306: return "dummy";
307: } else
308: oops("pattern file too big");
309: }
310: if ((pat = malloc((unsigned) patstat.st_size + 1)) == NULL)
311: oops("out of memory to read pattern file");
312: if (patstat.st_size != read(fd, pat, (int)patstat.st_size))
313: oops("error reading pattern file");
314: (void) close(fd);
315:
316: pat[patstat.st_size] = EOS;
317: if (pat[patstat.st_size - 1] == NL) /* NOP for egrep; helps grep */
318: pat[patstat.st_size - 1] = EOS;
319:
320: if (nlcount(pat, &pat[patstat.st_size]) > NALT) {
321: if (fgrepflag)
322: fgrepold++; /* "what's it all about, alfie?" */
323: else
324: egrepold++;
325: }
326: return (pat);
327: }
328:
329: egsecute(file)
330: char *file;
331: {
332: if (file == NULL)
333: fd = 0;
334: else if ((fd = open(file, O_RDONLY, 0)) <= 0) {
335: fprintf(stderr, "%s: can't open %s\n", progname, file);
336: nsuccess = 2;
337: return;
338: }
339: chimaera(file, patboy);
340:
341: if (!boyonly && !flushflag && file != NULL)
342: flushmatches();
343: if (file != NULL)
344: close(fd);
345: }
346:
347: chimaera(file, pat) /* "reach out and boyer-moore search someone" */
348: char *file, *pat; /* -- soon-to-be-popular bumper sticker */
349: {
350: register char *k, *strend, *s;
351: register int j, count;
352: register int *deltazero = delta0;
353: int patlen = altmin;
354: char *t;
355:
356: nleftover = boyfound = flushflag = 0;
357: nline = 1L;
358: prevmatch = 0;
359: nmatch = counted = rxcount = 0L;
360:
361: while ((count = read(fd, str + nleftover, BUFSIZE - nleftover)) > 0) {
362:
363: counted += count;
364: strend = linesave(str, count);
365:
366: for (k = str + patlen - 1; k < strend;) {
367: /*
368: * for a large class of patterns, upwards of 80% of
369: * match time is spent on the next line. we beat
370: * existing microcode (vax 'matchc') this way.
371: */
372: while ((k += deltazero[*(unsigned char *) k]) < strend);
373: if (k < (str + LARGE))
374: break;
375: k -= LARGE;
376:
377: if (altflag) {
378: /*
379: * Parallel Boyer-Moore. Check whether each
380: * of the previous <altmin> chars COULD be
381: * from one of the alternative strings.
382: */
383: s = k - 1;
384: j = altmin;
385: while (altset[--j][(unsigned char)
386: cmap[*(unsigned char *) s--]]);
387: /*
388: * quick test fails. in this life, compare
389: * 'em all. but, a "reverse trie" would
390: * attenuate worst case (linear w/delta2?).
391: */
392: if (--j < 0) {
393: count = nalt - 1;
394: do {
395: s = k;
396: j = altlen[count];
397: t = altpat[count];
398:
399: while
400: (cmap[*(unsigned char *) s--]
401: == t[--j]);
402: if (j < 0)
403: break;
404: }
405: while (count--);
406: }
407: } else {
408: /* One string -- check it */
409: j = patlen - 1;
410: s = k - 1;
411: while (cmap[*(unsigned char *) s--] == pat[--j]);
412: }
413: /*
414: * delta-less shortcut for literati. short shrift for
415: * genetic engineers?
416: */
417: if (j >= 0) {
418: k++; /* no match; restart next char */
419: continue;
420: }
421: k = submatch(file, pat, str, strend, k, count);
422: if (k == NULL)
423: return;
424: }
425: if (nflag) {
426: if (prevmatch)
427: nline = prevnline + nlcount(prevloc, k);
428: else
429: nline = nline + nlcount(str, k);
430: prevmatch = 0;
431: }
432: strncpy(str, linetemp, nleftover);
433: }
434: if (cflag) {
435: /* Bug from old grep: -c overrides -h. We fix the bug. */
436: if (!hflag)
437: printf("%s:", file);
438: printf("%ld\n", nmatch);
439: }
440: }
441:
442: char *
443: linesave(str, count) /* accumulate partial line at end of buffer */
444: char str[];
445: register int count;
446: {
447: register int j;
448:
449: count += nleftover;
450: if (count != BUFSIZE && fd != 0)
451: str[count++] = NL; /* insurance for broken last line */
452: str[count] = EOS;
453: for (j = count - 1; str[j] != NL && j >= 0;)
454: j--;
455: /*
456: * break up these lines: long line (> BUFSIZE), last line of file, or
457: * short return from read(), as from tee(1) input
458: */
459: if (j < 0 && (count == (BUFSIZE - nleftover))) {
460: str[count++] = NL;
461: str[count] = EOS;
462: linetemp[0] = EOS;
463: nleftover = 0;
464: return (str + count);
465: } else {
466: nleftover = count - j - 1;
467: strncpy(linetemp, str + j + 1, nleftover);
468: return (str + j);
469: }
470: }
471:
472: /*
473: * Process partial match. First check for mis-aligned Kanji, then match line
474: * against full compiled r.e. if statistics do not warrant handing off to
475: * standard egrep.
476: */
477: char *
478: submatch(file, pat, str, strend, k, altindex)
479: char file[], pat[], str[];
480: register char *strend, *k;
481: int altindex;
482: {
483: register char *s;
484: char *t, c;
485:
486: t = k;
487: s = ((altflag) ? k - altlen[altindex] + 1 : k - altmin + 1);
488: #ifndef NOKANJI
489: c = ((altflag) ? altpat[altindex][0] : pat[0]);
490: if (c & NONASCII)
491: if ((s = kanji(str, s, k)) == NULL)
492: return (++k); /* reject false kanji */
493: #endif
494: do;
495: while (*s != NL && --s >= str);
496: k = s + 1; /* now at line start */
497:
498: if (boyonly)
499: return (gotamatch(file, k));
500:
501: incount = counted - (strend - k);
502: if (boyfound++ == FIRSTFEW)
503: execstrategy(file);
504:
505: s = t;
506: do
507: rxcount++;
508: while (*s++ != NL);
509: *--s = EOS;
510: /*
511: * "quick henry -- the flit" (after theodor geisel)
512: */
513: if (regexec(rspencer, ((iflag) ? fold(k) : k)) == 1) {
514: *s = NL;
515: if (gotamatch(file, k) == NULL)
516: return (NULL);
517: }
518: *s = NL;
519: return (s + 1);
520: }
521:
522: #ifndef NOKANJI
523: /*
524: * EUC code disambiguation -- scan backwards to first 7-bit code, while
525: * counting intervening 8-bit codes. If odd, reject unaligned Kanji pattern.
526: * SS2/3 checks are for intermixed Japanase Katakana or Kanji2.
527: */
528: char *
529: kanji(str, s, k)
530: register char *str, *s, *k;
531: {
532: register int j = 0;
533:
534: for (s--; s >= str; s--) {
535: if (*s == SS2 || *s == SS3 || (*s & NONASCII) == 0)
536: break;
537: j++;
538: }
539: #ifndef CHINESE
540: if (*s == SS2)
541: j -= 1;
542: #endif CHINESE
543: return ((j & 01) ? NULL : k);
544: }
545: #endif
546:
547: /*
548: * Compute "Boyer-Moore" delta table -- put skip distance in delta0[c]
549: */
550: gosper(pattern)
551: char *pattern; /* ... HAKMEM lives ... */
552: {
553: register int i, j;
554: unsigned char c;
555:
556: /* Make one-string case look like simple alternatives case */
557: if (!altflag) {
558: nalt = 1;
559: altmin = altlen[0] = strlen(pattern);
560: altpat[0] = pattern;
561: }
562: /* For chars that aren't in any string, skip by string length. */
563: for (j = 0; j < 256; j++) {
564: delta0[j] = altmin;
565: cmap[j] = j; /* Sneak in initialization of cmap */
566: }
567:
568: /* For chars in a string, skip distance from char to end of string. */
569: /* (If char appears more than once, skip minimum distance.) */
570: for (i = 0; i < nalt; i++)
571: for (j = 0; j < altlen[i] - 1; j++) {
572: c = altpat[i][j];
573: delta0[c] = MIN(delta0[c], altlen[i] - j - 1);
574: if (iflag && islower((int) c))
575: delta0[toupper((int) c)] = delta0[c];
576: }
577:
578: /* For last char of each string, fall out of search loop. */
579: for (i = 0; i < nalt; i++) {
580: c = altpat[i][altlen[i] - 1];
581: delta0[c] = LARGE;
582: if (iflag && islower((int) c))
583: delta0[toupper((int) c)] = LARGE;
584: }
585: if (iflag)
586: for (j = 'A'; j <= 'Z'; j++)
587: cmap[j] = tolower((int) j);
588: }
589:
590: /*
591: * Print, count, or stop on full match. Result is either the location for
592: * continued search, or NULL to stop.
593: */
594: char *
595: gotamatch(file, s)
596: register char *file, *s;
597: {
598: char *savematch();
599: int squirrel = 0; /* nonzero to squirrel away FIRSTFEW matches */
600:
601: nmatch++;
602: nsuccess = 1;
603: if (!boyonly && boyfound <= FIRSTFEW && file != NULL)
604: squirrel = 1;
605:
606: if (sflag)
607: return (NULL); /* -s usurps all flags (unlike some versions) */
608: if (cflag) { /* -c overrides -l, we guess */
609: do;
610: while (*s++ != NL);
611: } else if (lflag) {
612: puts(file);
613: return (NULL);
614: } else {
615: if (!hflag)
616: if (!squirrel)
617: printf("%s:", file);
618: else
619: (void)sprintf(preamble, "%s:", file);
620: if (nflag) {
621: if (prevmatch)
622: prevnline = prevnline + nlcount(prevloc, s);
623: else
624: prevnline = nline + nlcount(str, s);
625: prevmatch = 1;
626:
627: if (!squirrel)
628: printf("%ld:", prevnline);
629: else
630: (void)sprintf(preamble + strlen(preamble),
631: "%ld:", prevnline);
632: }
633: if (!squirrel) {
634: do
635: putchar(*s);
636: while (*s++ != NL);
637: } else
638: s = savematch(s);
639:
640: if (nflag)
641: prevloc = s - 1;
642: }
643: return ((firstflag && !cflag) ? NULL : s);
644: }
645:
646: char *
647: fold(line)
648: char *line;
649: {
650: static char fline[BUFSIZE];
651: register char *s, *t = fline;
652:
653: for (s = line; *s != EOS; s++)
654: *t++ = (isupper((int) *s) ? (char) tolower((int) *s) : *s);
655: *t = EOS;
656: return (fline);
657: }
658:
659: strindex(s, t) /* the easy way, as in K&P, p. 192 */
660: char *s, *t;
661: {
662: int i, n;
663:
664: n = strlen(t);
665: for (i = 0; s[i] != '\0'; i++)
666: if (strncmp(s + i, t, n) == 0)
667: return (i);
668: return (-1);
669: }
670:
671: char *
672: grepxlat(pattern) /* grep pattern meta conversion */
673: char *pattern;
674: {
675: register char *p, *s;
676: static char newpat[BUFSIZE];
677:
678: for (s = newpat, p = pattern; *p != EOS;) {
679: if (*p == '\\') { /* skip escapes ... */
680: *s++ = *p++;
681: if (*p)
682: *s++ = *p++;
683: } else if (*p == '[') { /* ... and char classes */
684: while (*p != EOS && *p != ']')
685: *s++ = *p++;
686: } else if (strchr("+?|()", *p) != NULL) {
687: *s++ = '\\'; /* insert protection */
688: *s++ = *p++;
689: } else
690: *s++ = *p++;
691: }
692: *s = EOS;
693: grepflag = ((patind) ? 0 : 1);
694: return (newpat);
695: }
696:
697: /*
698: * Test for simple alternation. Result is NULL if it's not so simple, or is
699: * a pointer to the first string if it is. Warning: sscanf size is a
700: * fixpoint, beyond which the speedup linearity starts to break down. In the
701: * wake of the elegant aho/corrasick "trie"-based fgrep, generalizing
702: * altpat[] to arbitrary size is not useful.
703: */
704: char *
705: alternate(regexpr)
706: char *regexpr;
707: {
708: register int i, j;
709: register char *start, *stop;
710: unsigned char c;
711:
712: if (fgrepflag && strchr(regexpr, '|'))
713: return (NULL);
714:
715: /*
716: * break pattern up into altpat array; delimit on newline, bar,
717: * or EOS. We know we won't overflow, we've already checked the
718: * number of patterns we're going to find against NALT.
719: * Also, set length of pattern and find minimum pattern length.
720: */
721: nalt = 0;
722: altmin = NMUSH;
723: for (start = stop = regexpr;; ++stop)
724: if (!*stop || *stop == '|' || *stop == NL) {
725: altlen[nalt] = j = stop - start;
726: if (j < altmin)
727: altmin = j;
728: if (!(altpat[nalt] = malloc((u_int)(j + 1))))
729: oops("out of memory");
730: bcopy(start, altpat[nalt], j);
731: altpat[nalt][j] = EOS;
732: ++nalt;
733: if (!*stop)
734: break;
735: if (nalt == NALT)
736: return(NULL);
737: if (*stop == NL)
738: *stop = '|';
739: start = stop + 1;
740: }
741: if (!fgrepflag) {
742: if (strchr(regexpr, '|') == NULL || regexpr[0] == '|')
743: return (NULL);
744: if (strpbrk(regexpr, "^$.[]()?+*\\") != NULL
745: || strindex(regexpr, "||") >= 0)
746: return (NULL);
747: }
748:
749: if (nalt > 1) { /* build superimposed "pre-match" sets per
750: * char */
751: altflag++;
752: for (j = 0; j < nalt; j++)
753: for (i = 0; i < altmin; i++) {
754: c = altpat[j][altlen[j] - altmin + i];
755: altset[i + 1][c] = 1; /* offset for sentinel */
756: }
757: }
758: return (altpat[0]);
759: }
760:
761: /*
762: * Grapple with the dfa (std egrep) vs. ndfa (regexp) tradeoff. Criteria to
763: * determine whether to use dfa-based egrep: We do FIRSTFEW matches with
764: * regexec(). If Boyer-Moore up to now matched more than PUNTPERCENT
765: * of the input, the r.e. is likely to be underspecified, so do old *grep,
766: * which is faster on complex patterns than regexp(). At FIRSTFEW,
767: * dump the saved matches collected by savematch(). They are saved
768: * so that a "PUNT" can "rewind" to ignore them. Stdin is problematic,
769: * since it's hard to rewind.
770: */
771:
772: execstrategy(file)
773: char *file;
774: {
775: int pctmatch;
776:
777: pctmatch = (100 * rxcount) / incount;
778: if (pctmatch > PUNTPERCENT && file != NULL)
779: kernighan(args);
780: if (file != NULL)
781: flushmatches();
782: }
783:
784: nlcount(bstart, bstop) /* flail interval to totalize newlines. */
785: char *bstart, *bstop;
786: {
787: register char *s = bstart;
788: register char *t = bstop;
789: register int count = 0;
790:
791: do { /* loop unroll for older architectures */
792: if (*t == NL) /* ... ask ames!jaw for sample code */
793: count++;
794: } while (t-- > s);
795:
796: return (count);
797: }
798:
799: char *
800: isolate(regexpr) /* isolate longest metacharacter-free string */
801: char *regexpr;
802: {
803: char *dummyexpr;
804:
805: /*
806: * We add (.)* because Henry's regcomp only figures regmust if it
807: * sees a leading * pattern. Foo!
808: */
809: dummyexpr = malloc((unsigned) strlen(regexpr) + 5);
810: (void)sprintf(dummyexpr, "(.)*%s", regexpr);
811: if ((rspencer = regcomp(dummyexpr)) == NULL)
812: kernighan(args);
813: return (rspencer->regmust);
814: }
815:
816: char *matches[FIRSTFEW];
817: static int mcount = 0;
818:
819: char *
820: savematch(s) /* horde matches during statistics gathering */
821: register char *s;
822: {
823: char *p;
824: char *start = s;
825: int msize = 0;
826: int psize = strlen(preamble);
827:
828: while (*s++ != NL)
829: msize++;
830: *--s = EOS;
831:
832: p = malloc((unsigned) msize + 1 + psize);
833: strcpy(p, preamble);
834: strcpy(p + psize, start);
835: matches[mcount++] = p;
836:
837: preamble[0] = 0;
838: *s = NL;
839: return (s);
840: }
841:
842: flushmatches()
843: {
844: int n;
845:
846: flushflag = 1;
847: for (n = 0; n < mcount; n++)
848: printf("%s\n", matches[n]);
849: mcount = 0;
850: }
851:
852: oops(message)
853: char *message;
854: {
855: fprintf(stderr, "%s: %s\n", progname, message);
856: exit(2);
857: }
858:
859: kernighan(args) /* "let others do the hard part ..." */
860: char *args[];
861: {
862: /*
863: * We may have already run grep on some of the files; remove them
864: * from the arg list we pass on. Note that we can't delete them
865: * totally because the number of file names affects the output
866: * (automatic -h).
867: */
868: /* better would be fork/exec per punted file -- jaw */
869:
870: while (firstfile && optind > firstfile)
871: args[firstfile++] = _PATH_DEVNULL;
872: if (patind)
873: args[patind] = pattern;
874: (void) fflush(stdout);
875:
876: if (grepflag)
877: execvp(_PATH_GREPSTD, args), oops("can't exec old 'grep'");
878: else if (fgrepflag)
879: execvp(_PATH_FGREPSTD, args), oops("can't exec old 'fgrep'");
880: else
881: execvp(_PATH_EGREPSTD, args), oops("can't exec old 'egrep'");
882: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.