|
|
1.1 root 1: /*
2: * regcomp and regexec -- regsub and regerror are elsewhere
3: *
4: * Copyright (c) 1986 by University of Toronto.
5: * Written by Henry Spencer. Not derived from licensed software.
6: *
7: * Permission is granted to anyone to use this software for any
8: * purpose on any computer system, and to redistribute it freely,
9: * subject to the following restrictions:
10: *
11: * 1. The author is not responsible for the consequences of use of
12: * this software, no matter how awful, even if they arise
13: * from defects in it.
14: *
15: * 2. The origin of this software must not be misrepresented, either
16: * by explicit claim or by omission.
17: *
18: * 3. Altered versions must be plainly marked as such, and must not
19: * be misrepresented as being the original software.
20: *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21: *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22: *** to assist in implementing egrep.
23: *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24: *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25: *** as in BSD grep and ex.
26: *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27: *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28: *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29: *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30: *
31: * Beware that some of this code is subtly aware of the way operator
32: * precedence is structured in regular expressions. Serious changes in
33: * regular-expression syntax might require a total rethink.
34: */
35: #include <stdio.h>
36: #include <ctype.h>
37: #include <regexp.h>
38: #include "regmagic.h"
39:
40: /*
41: * The "internal use only" fields in regexp.h are present to pass info from
42: * compile to execute that permits the execute phase to run lots faster on
43: * simple cases. They are:
44: *
45: * regstart char that must begin a match; '\0' if none obvious
46: * reganch is the match anchored (at beginning-of-line only)?
47: * regmust string (pointer into program) that match must include, or NULL
48: * regmlen length of regmust string
49: *
50: * Regstart and reganch permit very fast decisions on suitable starting points
51: * for a match, cutting down the work a lot. Regmust permits fast rejection
52: * of lines that cannot possibly match. The regmust tests are costly enough
53: * that regcomp() supplies a regmust only if the r.e. contains something
54: * potentially expensive (at present, the only such thing detected is * or +
55: * at the start of the r.e., which can involve a lot of backup). Regmlen is
56: * supplied because the test in regexec() needs it and regcomp() is computing
57: * it anyway.
58: */
59:
60: /*
61: * Structure for regexp "program". This is essentially a linear encoding
62: * of a nondeterministic finite-state machine (aka syntax charts or
63: * "railroad normal form" in parsing technology). Each node is an opcode
64: * plus a "next" pointer, possibly plus an operand. "Next" pointers of
65: * all nodes except BRANCH implement concatenation; a "next" pointer with
66: * a BRANCH on both ends of it is connecting two alternatives. (Here we
67: * have one of the subtle syntax dependencies: an individual BRANCH (as
68: * opposed to a collection of them) is never concatenated with anything
69: * because of operator precedence.) The operand of some types of node is
70: * a literal string; for others, it is a node leading into a sub-FSM. In
71: * particular, the operand of a BRANCH node is the first node of the branch.
72: * (NB this is *not* a tree structure: the tail of the branch connects
73: * to the thing following the set of BRANCHes.) The opcodes are:
74: */
75:
76: /* definition number opnd? meaning */
77: #define END 0 /* no End of program. */
78: #define BOL 1 /* no Match "" at beginning of line. */
79: #define EOL 2 /* no Match "" at end of line. */
80: #define ANY 3 /* no Match any one character. */
81: #define ANYOF 4 /* str Match any character in this string. */
82: #define ANYBUT 5 /* str Match any character not in this string. */
83: #define BRANCH 6 /* node Match this alternative, or the next... */
84: #define BACK 7 /* no Match "", "next" ptr points backward. */
85: #define EXACTLY 8 /* str Match this string. */
86: #define NOTHING 9 /* no Match empty string. */
87: #define STAR 10 /* node Match this (simple) thing 0 or more times. */
88: #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
89: #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */
90: #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */
91: #define OPEN 20 /* no Mark this point in input as start of #n. */
92: /* OPEN+1 is number 1, etc. */
93: #define CLOSE 30 /* no Analogous to OPEN. */
94:
95: /*
96: * Opcode notes:
97: *
98: * BRANCH The set of branches constituting a single choice are hooked
99: * together with their "next" pointers, since precedence prevents
100: * anything being concatenated to any individual branch. The
101: * "next" pointer of the last BRANCH in a choice points to the
102: * thing following the whole choice. This is also where the
103: * final "next" pointer of each individual branch points; each
104: * branch starts with the operand node of a BRANCH node.
105: *
106: * BACK Normal "next" pointers all implicitly point forward; BACK
107: * exists to make loop structures possible.
108: *
109: * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
110: * BRANCH structures using BACK. Simple cases (one character
111: * per match) are implemented with STAR and PLUS for speed
112: * and to minimize recursive plunges.
113: *
114: * OPEN,CLOSE ...are numbered at compile time.
115: */
116:
117: /*
118: * A node is one char of opcode followed by two chars of "next" pointer.
119: * "Next" pointers are stored as two 8-bit pieces, high order first. The
120: * value is a positive offset from the opcode of the node containing it.
121: * An operand, if any, simply follows the node. (Note that much of the
122: * code generation knows about this implicit relationship.)
123: *
124: * Using two bytes for the "next" pointer is vast overkill for most things,
125: * but allows patterns to get big without disasters.
126: */
127: #define OP(p) (*(p))
128: #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
129: #define OPERAND(p) ((p) + 3)
130:
131: /*
132: * See regmagic.h for one further detail of program structure.
133: */
134:
135:
136: /*
137: * Utility definitions.
138: */
139: #ifndef CHARBITS
140: #define UCHARAT(p) ((int)*(unsigned char *)(p))
141: #else
142: #define UCHARAT(p) ((int)*(p)&CHARBITS)
143: #endif
144:
145: #define FAIL(m) { regerror(m); return(NULL); }
146: #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
147:
148: /*
149: * Flags to be passed up and down.
150: */
151: #define HASWIDTH 01 /* Known never to match null string. */
152: #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
153: #define SPSTART 04 /* Starts with * or +. */
154: #define WORST 0 /* Worst case. */
155:
156: /*
157: * Global work variables for regcomp().
158: */
159: static char *regparse; /* Input-scan pointer. */
160: static int regnpar; /* () count. */
161: static char regdummy;
162: static char *regcode; /* Code-emit pointer; ®dummy = don't. */
163: static long regsize; /* Code size. */
164:
165: /*
166: * Forward declarations for regcomp()'s friends.
167: */
168: #ifndef STATIC
169: #define STATIC static
170: #endif
171: STATIC char *reg();
172: STATIC char *regbranch();
173: STATIC char *regpiece();
174: STATIC char *regatom();
175: STATIC char *regnode();
176: STATIC char *regnext();
177: STATIC void regc();
178: STATIC void reginsert();
179: STATIC void regtail();
180: STATIC void regoptail();
181: #ifdef STRCSPN
182: STATIC int strcspn();
183: #endif
184:
185: /*
186: - regcomp - compile a regular expression into internal code
187: *
188: * We can't allocate space until we know how big the compiled form will be,
189: * but we can't compile it (and thus know how big it is) until we've got a
190: * place to put the code. So we cheat: we compile it twice, once with code
191: * generation turned off and size counting turned on, and once "for real".
192: * This also means that we don't allocate space until we are sure that the
193: * thing really will compile successfully, and we never have to move the
194: * code and thus invalidate pointers into it. (Note that it has to be in
195: * one piece because free() must be able to free it all.)
196: *
197: * Beware that the optimization-preparation code in here knows about some
198: * of the structure of the compiled regexp.
199: */
200: regexp *
201: regcomp(exp)
202: char *exp;
203: {
204: register regexp *r;
205: register char *scan;
206: register char *longest;
207: register int len;
208: int flags;
209: extern char *malloc();
210:
211: if (exp == NULL)
212: FAIL("NULL argument");
213:
214: /* First pass: determine size, legality. */
215: if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */
216: regparse = exp;
217: regnpar = 1;
218: regsize = 0L;
219: regcode = ®dummy;
220: regc(MAGIC);
221: if (reg(0, &flags) == NULL)
222: return(NULL);
223:
224: /* Small enough for pointer-storage convention? */
225: if (regsize >= 32767L) /* Probably could be 65535L. */
226: FAIL("regexp too big");
227:
228: /* Allocate space. */
229: r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
230: if (r == NULL)
231: FAIL("out of space");
232:
233: /* Second pass: emit code. */
234: regparse = exp;
235: regnpar = 1;
236: regcode = r->program;
237: regc(MAGIC);
238: if (reg(0, &flags) == NULL)
239: return(NULL);
240:
241: /* Dig out information for optimizations. */
242: r->regstart = '\0'; /* Worst-case defaults. */
243: r->reganch = 0;
244: r->regmust = NULL;
245: r->regmlen = 0;
246: scan = r->program+1; /* First BRANCH. */
247: if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
248: scan = OPERAND(scan);
249:
250: /* Starting-point info. */
251: if (OP(scan) == EXACTLY)
252: r->regstart = *OPERAND(scan);
253: else if (OP(scan) == BOL)
254: r->reganch++;
255:
256: /*
257: * If there's something expensive in the r.e., find the
258: * longest literal string that must appear and make it the
259: * regmust. Resolve ties in favor of later strings, since
260: * the regstart check works with the beginning of the r.e.
261: * and avoiding duplication strengthens checking. Not a
262: * strong reason, but sufficient in the absence of others.
263: */
264: if (flags&SPSTART) {
265: longest = NULL;
266: len = 0;
267: for (; scan != NULL; scan = regnext(scan))
268: if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
269: longest = OPERAND(scan);
270: len = strlen(OPERAND(scan));
271: }
272: r->regmust = longest;
273: r->regmlen = len;
274: }
275: }
276:
277: return(r);
278: }
279:
280: /*
281: - reg - regular expression, i.e. main body or parenthesized thing
282: *
283: * Caller must absorb opening parenthesis.
284: *
285: * Combining parenthesis handling with the base level of regular expression
286: * is a trifle forced, but the need to tie the tails of the branches to what
287: * follows makes it hard to avoid.
288: */
289: static char *
290: reg(paren, flagp)
291: int paren; /* Parenthesized? */
292: int *flagp;
293: {
294: register char *ret;
295: register char *br;
296: register char *ender;
297: register int parno;
298: int flags;
299:
300: *flagp = HASWIDTH; /* Tentatively. */
301:
302: /* Make an OPEN node, if parenthesized. */
303: if (paren) {
304: if (regnpar >= NSUBEXP)
305: FAIL("too many ()");
306: parno = regnpar;
307: regnpar++;
308: ret = regnode(OPEN+parno);
309: } else
310: ret = NULL;
311:
312: /* Pick up the branches, linking them together. */
313: br = regbranch(&flags);
314: if (br == NULL)
315: return(NULL);
316: if (ret != NULL)
317: regtail(ret, br); /* OPEN -> first. */
318: else
319: ret = br;
320: if (!(flags&HASWIDTH))
321: *flagp &= ~HASWIDTH;
322: *flagp |= flags&SPSTART;
323: while (*regparse == '|' || *regparse == '\n') {
324: regparse++;
325: br = regbranch(&flags);
326: if (br == NULL)
327: return(NULL);
328: regtail(ret, br); /* BRANCH -> BRANCH. */
329: if (!(flags&HASWIDTH))
330: *flagp &= ~HASWIDTH;
331: *flagp |= flags&SPSTART;
332: }
333:
334: /* Make a closing node, and hook it on the end. */
335: ender = regnode((paren) ? CLOSE+parno : END);
336: regtail(ret, ender);
337:
338: /* Hook the tails of the branches to the closing node. */
339: for (br = ret; br != NULL; br = regnext(br))
340: regoptail(br, ender);
341:
342: /* Check for proper termination. */
343: if (paren && *regparse++ != ')') {
344: FAIL("unmatched ()");
345: } else if (!paren && *regparse != '\0') {
346: if (*regparse == ')') {
347: FAIL("unmatched ()");
348: } else
349: FAIL("junk on end"); /* "Can't happen". */
350: /* NOTREACHED */
351: }
352:
353: return(ret);
354: }
355:
356: /*
357: - regbranch - one alternative of an | operator
358: *
359: * Implements the concatenation operator.
360: */
361: static char *
362: regbranch(flagp)
363: int *flagp;
364: {
365: register char *ret;
366: register char *chain;
367: register char *latest;
368: int flags;
369:
370: *flagp = WORST; /* Tentatively. */
371:
372: ret = regnode(BRANCH);
373: chain = NULL;
374: while (*regparse != '\0' && *regparse != ')' &&
375: *regparse != '\n' && *regparse != '|') {
376: latest = regpiece(&flags);
377: if (latest == NULL)
378: return(NULL);
379: *flagp |= flags&HASWIDTH;
380: if (chain == NULL) /* First piece. */
381: *flagp |= flags&SPSTART;
382: else
383: regtail(chain, latest);
384: chain = latest;
385: }
386: if (chain == NULL) /* Loop ran zero times. */
387: (void) regnode(NOTHING);
388:
389: return(ret);
390: }
391:
392: /*
393: - regpiece - something followed by possible [*+?]
394: *
395: * Note that the branching code sequences used for ? and the general cases
396: * of * and + are somewhat optimized: they use the same NOTHING node as
397: * both the endmarker for their branch list and the body of the last branch.
398: * It might seem that this node could be dispensed with entirely, but the
399: * endmarker role is not redundant.
400: */
401: static char *
402: regpiece(flagp)
403: int *flagp;
404: {
405: register char *ret;
406: register char op;
407: register char *next;
408: int flags;
409:
410: ret = regatom(&flags);
411: if (ret == NULL)
412: return(NULL);
413:
414: op = *regparse;
415: if (!ISMULT(op)) {
416: *flagp = flags;
417: return(ret);
418: }
419:
420: if (!(flags&HASWIDTH) && op != '?')
421: FAIL("*+ operand could be empty");
422: *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
423:
424: if (op == '*' && (flags&SIMPLE))
425: reginsert(STAR, ret);
426: else if (op == '*') {
427: /* Emit x* as (x&|), where & means "self". */
428: reginsert(BRANCH, ret); /* Either x */
429: regoptail(ret, regnode(BACK)); /* and loop */
430: regoptail(ret, ret); /* back */
431: regtail(ret, regnode(BRANCH)); /* or */
432: regtail(ret, regnode(NOTHING)); /* null. */
433: } else if (op == '+' && (flags&SIMPLE))
434: reginsert(PLUS, ret);
435: else if (op == '+') {
436: /* Emit x+ as x(&|), where & means "self". */
437: next = regnode(BRANCH); /* Either */
438: regtail(ret, next);
439: regtail(regnode(BACK), ret); /* loop back */
440: regtail(next, regnode(BRANCH)); /* or */
441: regtail(ret, regnode(NOTHING)); /* null. */
442: } else if (op == '?') {
443: /* Emit x? as (x|) */
444: reginsert(BRANCH, ret); /* Either x */
445: regtail(ret, regnode(BRANCH)); /* or */
446: next = regnode(NOTHING); /* null. */
447: regtail(ret, next);
448: regoptail(ret, next);
449: }
450: regparse++;
451: if (ISMULT(*regparse))
452: FAIL("nested *?+");
453:
454: return(ret);
455: }
456:
457: /*
458: - regatom - the lowest level
459: *
460: * Optimization: gobbles an entire sequence of ordinary characters so that
461: * it can turn them into a single node, which is smaller to store and
462: * faster to run. Backslashed characters are exceptions, each becoming a
463: * separate node; the code is simpler that way and it's not worth fixing.
464: */
465: static char *
466: regatom(flagp)
467: int *flagp;
468: {
469: register char *ret;
470: int flags;
471:
472: *flagp = WORST; /* Tentatively. */
473:
474: switch (*regparse++) {
475: /* FIXME: these chars only have meaning at beg/end of pat? */
476: case '^':
477: ret = regnode(BOL);
478: break;
479: case '$':
480: ret = regnode(EOL);
481: break;
482: case '.':
483: ret = regnode(ANY);
484: *flagp |= HASWIDTH|SIMPLE;
485: break;
486: case '[': {
487: register int class;
488: register int classend;
489:
490: if (*regparse == '^') { /* Complement of range. */
491: ret = regnode(ANYBUT);
492: regparse++;
493: } else
494: ret = regnode(ANYOF);
495: if (*regparse == ']' || *regparse == '-')
496: regc(*regparse++);
497: while (*regparse != '\0' && *regparse != ']') {
498: if (*regparse == '-') {
499: regparse++;
500: if (*regparse == ']' || *regparse == '\0')
501: regc('-');
502: else {
503: class = UCHARAT(regparse-2)+1;
504: classend = UCHARAT(regparse);
505: if (class > classend+1)
506: FAIL("invalid [] range");
507: for (; class <= classend; class++)
508: regc(class);
509: regparse++;
510: }
511: } else
512: regc(*regparse++);
513: }
514: regc('\0');
515: if (*regparse != ']')
516: FAIL("unmatched []");
517: regparse++;
518: *flagp |= HASWIDTH|SIMPLE;
519: }
520: break;
521: case '(':
522: ret = reg(1, &flags);
523: if (ret == NULL)
524: return(NULL);
525: *flagp |= flags&(HASWIDTH|SPSTART);
526: break;
527: case '\0':
528: case '|':
529: case '\n':
530: case ')':
531: FAIL("internal urp"); /* Supposed to be caught earlier. */
532: break;
533: case '?':
534: case '+':
535: case '*':
536: FAIL("?+* follows nothing");
537: break;
538: case '\\':
539: switch (*regparse++) {
540: case '\0':
541: FAIL("trailing \\");
542: break;
543: case '<':
544: ret = regnode(WORDA);
545: break;
546: case '>':
547: ret = regnode(WORDZ);
548: break;
549: /* FIXME: Someday handle \1, \2, ... */
550: default:
551: /* Handle general quoted chars in exact-match routine */
552: goto de_fault;
553: }
554: break;
555: de_fault:
556: default:
557: /*
558: * Encode a string of characters to be matched exactly.
559: *
560: * This is a bit tricky due to quoted chars and due to
561: * '*', '+', and '?' taking the SINGLE char previous
562: * as their operand.
563: *
564: * On entry, the char at regparse[-1] is going to go
565: * into the string, no matter what it is. (It could be
566: * following a \ if we are entered from the '\' case.)
567: *
568: * Basic idea is to pick up a good char in ch and
569: * examine the next char. If it's *+? then we twiddle.
570: * If it's \ then we frozzle. If it's other magic char
571: * we push ch and terminate the string. If none of the
572: * above, we push ch on the string and go around again.
573: *
574: * regprev is used to remember where "the current char"
575: * starts in the string, if due to a *+? we need to back
576: * up and put the current char in a separate, 1-char, string.
577: * When regprev is NULL, ch is the only char in the
578: * string; this is used in *+? handling, and in setting
579: * flags |= SIMPLE at the end.
580: */
581: {
582: char *regprev;
583: register char ch;
584:
585: regparse--; /* Look at cur char */
586: ret = regnode(EXACTLY);
587: for ( regprev = 0 ; ; ) {
588: ch = *regparse++; /* Get current char */
589: switch (*regparse) { /* look at next one */
590:
591: default:
592: regc(ch); /* Add cur to string */
593: break;
594:
595: case '.': case '[': case '(':
596: case ')': case '|': case '\n':
597: case '$': case '^':
598: case '\0':
599: /* FIXME, $ and ^ should not always be magic */
600: magic:
601: regc(ch); /* dump cur char */
602: goto done; /* and we are done */
603:
604: case '?': case '+': case '*':
605: if (!regprev) /* If just ch in str, */
606: goto magic; /* use it */
607: /* End mult-char string one early */
608: regparse = regprev; /* Back up parse */
609: goto done;
610:
611: case '\\':
612: regc(ch); /* Cur char OK */
613: switch (regparse[1]){ /* Look after \ */
614: case '\0':
615: case '<':
616: case '>':
617: /* FIXME: Someday handle \1, \2, ... */
618: goto done; /* Not quoted */
619: default:
620: /* Backup point is \, scan * point is after it. */
621: regprev = regparse;
622: regparse++;
623: continue; /* NOT break; */
624: }
625: }
626: regprev = regparse; /* Set backup point */
627: }
628: done:
629: regc('\0');
630: *flagp |= HASWIDTH;
631: if (!regprev) /* One char? */
632: *flagp |= SIMPLE;
633: }
634: break;
635: }
636:
637: return(ret);
638: }
639:
640: /*
641: - regnode - emit a node
642: */
643: static char * /* Location. */
644: regnode(op)
645: char op;
646: {
647: register char *ret;
648: register char *ptr;
649:
650: ret = regcode;
651: if (ret == ®dummy) {
652: regsize += 3;
653: return(ret);
654: }
655:
656: ptr = ret;
657: *ptr++ = op;
658: *ptr++ = '\0'; /* Null "next" pointer. */
659: *ptr++ = '\0';
660: regcode = ptr;
661:
662: return(ret);
663: }
664:
665: /*
666: - regc - emit (if appropriate) a byte of code
667: */
668: static void
669: regc(b)
670: char b;
671: {
672: if (regcode != ®dummy)
673: *regcode++ = b;
674: else
675: regsize++;
676: }
677:
678: /*
679: - reginsert - insert an operator in front of already-emitted operand
680: *
681: * Means relocating the operand.
682: */
683: static void
684: reginsert(op, opnd)
685: char op;
686: char *opnd;
687: {
688: register char *src;
689: register char *dst;
690: register char *place;
691:
692: if (regcode == ®dummy) {
693: regsize += 3;
694: return;
695: }
696:
697: src = regcode;
698: regcode += 3;
699: dst = regcode;
700: while (src > opnd)
701: *--dst = *--src;
702:
703: place = opnd; /* Op node, where operand used to be. */
704: *place++ = op;
705: *place++ = '\0';
706: *place++ = '\0';
707: }
708:
709: /*
710: - regtail - set the next-pointer at the end of a node chain
711: */
712: static void
713: regtail(p, val)
714: char *p;
715: char *val;
716: {
717: register char *scan;
718: register char *temp;
719: register int offset;
720:
721: if (p == ®dummy)
722: return;
723:
724: /* Find last node. */
725: scan = p;
726: for (;;) {
727: temp = regnext(scan);
728: if (temp == NULL)
729: break;
730: scan = temp;
731: }
732:
733: if (OP(scan) == BACK)
734: offset = scan - val;
735: else
736: offset = val - scan;
737: *(scan+1) = (offset>>8)&0377;
738: *(scan+2) = offset&0377;
739: }
740:
741: /*
742: - regoptail - regtail on operand of first argument; nop if operandless
743: */
744: static void
745: regoptail(p, val)
746: char *p;
747: char *val;
748: {
749: /* "Operandless" and "op != BRANCH" are synonymous in practice. */
750: if (p == NULL || p == ®dummy || OP(p) != BRANCH)
751: return;
752: regtail(OPERAND(p), val);
753: }
754:
755: /*
756: * regexec and friends
757: */
758:
759: /*
760: * Global work variables for regexec().
761: */
762: static char *reginput; /* String-input pointer. */
763: static char *regbol; /* Beginning of input, for ^ check. */
764: static char **regstartp; /* Pointer to startp array. */
765: static char **regendp; /* Ditto for endp. */
766:
767: /*
768: * Forwards.
769: */
770: STATIC int regtry();
771: STATIC int regmatch();
772: STATIC int regrepeat();
773:
774: #ifdef DEBUG
775: int regnarrate = 0;
776: void regdump();
777: STATIC char *regprop();
778: #endif
779:
780: /*
781: - regexec - match a regexp against a string
782: */
783: int
784: regexec(prog, string)
785: register regexp *prog;
786: register char *string;
787: {
788: register char *s;
789: extern char *strchr();
790:
791: /* Be paranoid... */
792: if (prog == NULL || string == NULL) {
793: regerror("NULL parameter");
794: return(0);
795: }
796:
797: /* Check validity of program. */
798: if (UCHARAT(prog->program) != MAGIC) {
799: regerror("corrupted program");
800: return(0);
801: }
802:
803: /* If there is a "must appear" string, look for it. */
804: if (prog->regmust != NULL) {
805: s = string;
806: while ((s = strchr(s, prog->regmust[0])) != NULL) {
807: if (strncmp(s, prog->regmust, prog->regmlen) == 0)
808: break; /* Found it. */
809: s++;
810: }
811: if (s == NULL) /* Not present. */
812: return(0);
813: }
814:
815: /* Mark beginning of line for ^ . */
816: regbol = string;
817:
818: /* Simplest case: anchored match need be tried only once. */
819: if (prog->reganch)
820: return(regtry(prog, string));
821:
822: /* Messy cases: unanchored match. */
823: s = string;
824: if (prog->regstart != '\0')
825: /* We know what char it must start with. */
826: while ((s = strchr(s, prog->regstart)) != NULL) {
827: if (regtry(prog, s))
828: return(1);
829: s++;
830: }
831: else
832: /* We don't -- general case. */
833: do {
834: if (regtry(prog, s))
835: return(1);
836: } while (*s++ != '\0');
837:
838: /* Failure. */
839: return(0);
840: }
841:
842: /*
843: - regtry - try match at specific point
844: */
845: static int /* 0 failure, 1 success */
846: regtry(prog, string)
847: regexp *prog;
848: char *string;
849: {
850: register int i;
851: register char **sp;
852: register char **ep;
853:
854: reginput = string;
855: regstartp = prog->startp;
856: regendp = prog->endp;
857:
858: sp = prog->startp;
859: ep = prog->endp;
860: for (i = NSUBEXP; i > 0; i--) {
861: *sp++ = NULL;
862: *ep++ = NULL;
863: }
864: if (regmatch(prog->program + 1)) {
865: prog->startp[0] = string;
866: prog->endp[0] = reginput;
867: return(1);
868: } else
869: return(0);
870: }
871:
872: /*
873: - regmatch - main matching routine
874: *
875: * Conceptually the strategy is simple: check to see whether the current
876: * node matches, call self recursively to see whether the rest matches,
877: * and then act accordingly. In practice we make some effort to avoid
878: * recursion, in particular by going through "ordinary" nodes (that don't
879: * need to know whether the rest of the match failed) by a loop instead of
880: * by recursion.
881: */
882: static int /* 0 failure, 1 success */
883: regmatch(prog)
884: char *prog;
885: {
886: register char *scan; /* Current node. */
887: char *next; /* Next node. */
888: extern char *strchr();
889:
890: scan = prog;
891: #ifdef DEBUG
892: if (scan != NULL && regnarrate)
893: fprintf(stderr, "%s(\n", regprop(scan));
894: #endif
895: while (scan != NULL) {
896: #ifdef DEBUG
897: if (regnarrate)
898: fprintf(stderr, "%s...\n", regprop(scan));
899: #endif
900: next = regnext(scan);
901:
902: switch (OP(scan)) {
903: case BOL:
904: if (reginput != regbol)
905: return(0);
906: break;
907: case EOL:
908: if (*reginput != '\0')
909: return(0);
910: break;
911: case WORDA:
912: /* Must be looking at a letter, digit, or _ */
913: if ((!isalnum(*reginput)) && *reginput != '_')
914: return(0);
915: /* Prev must be BOL or nonword */
916: if (reginput > regbol &&
917: (isalnum(reginput[-1]) || reginput[-1] == '_'))
918: return(0);
919: break;
920: case WORDZ:
921: /* Must be looking at non letter, digit, or _ */
922: if (isalnum(*reginput) || *reginput == '_')
923: return(0);
924: /* We don't care what the previous char was */
925: break;
926: case ANY:
927: if (*reginput == '\0')
928: return(0);
929: reginput++;
930: break;
931: case EXACTLY: {
932: register int len;
933: register char *opnd;
934:
935: opnd = OPERAND(scan);
936: /* Inline the first character, for speed. */
937: if (*opnd != *reginput)
938: return(0);
939: len = strlen(opnd);
940: if (len > 1 && strncmp(opnd, reginput, len) != 0)
941: return(0);
942: reginput += len;
943: }
944: break;
945: case ANYOF:
946: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
947: return(0);
948: reginput++;
949: break;
950: case ANYBUT:
951: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
952: return(0);
953: reginput++;
954: break;
955: case NOTHING:
956: break;
957: case BACK:
958: break;
959: case OPEN+1:
960: case OPEN+2:
961: case OPEN+3:
962: case OPEN+4:
963: case OPEN+5:
964: case OPEN+6:
965: case OPEN+7:
966: case OPEN+8:
967: case OPEN+9: {
968: register int no;
969: register char *save;
970:
971: no = OP(scan) - OPEN;
972: save = reginput;
973:
974: if (regmatch(next)) {
975: /*
976: * Don't set startp if some later
977: * invocation of the same parentheses
978: * already has.
979: */
980: if (regstartp[no] == NULL)
981: regstartp[no] = save;
982: return(1);
983: } else
984: return(0);
985: }
986: break;
987: case CLOSE+1:
988: case CLOSE+2:
989: case CLOSE+3:
990: case CLOSE+4:
991: case CLOSE+5:
992: case CLOSE+6:
993: case CLOSE+7:
994: case CLOSE+8:
995: case CLOSE+9: {
996: register int no;
997: register char *save;
998:
999: no = OP(scan) - CLOSE;
1000: save = reginput;
1001:
1002: if (regmatch(next)) {
1003: /*
1004: * Don't set endp if some later
1005: * invocation of the same parentheses
1006: * already has.
1007: */
1008: if (regendp[no] == NULL)
1009: regendp[no] = save;
1010: return(1);
1011: } else
1012: return(0);
1013: }
1014: break;
1015: case BRANCH: {
1016: register char *save;
1017:
1018: if (OP(next) != BRANCH) /* No choice. */
1019: next = OPERAND(scan); /* Avoid recursion. */
1020: else {
1021: do {
1022: save = reginput;
1023: if (regmatch(OPERAND(scan)))
1024: return(1);
1025: reginput = save;
1026: scan = regnext(scan);
1027: } while (scan != NULL && OP(scan) == BRANCH);
1028: return(0);
1029: /* NOTREACHED */
1030: }
1031: }
1032: break;
1033: case STAR:
1034: case PLUS: {
1035: register char nextch;
1036: register int no;
1037: register char *save;
1038: register int min;
1039:
1040: /*
1041: * Lookahead to avoid useless match attempts
1042: * when we know what character comes next.
1043: */
1044: nextch = '\0';
1045: if (OP(next) == EXACTLY)
1046: nextch = *OPERAND(next);
1047: min = (OP(scan) == STAR) ? 0 : 1;
1048: save = reginput;
1049: no = regrepeat(OPERAND(scan));
1050: while (no >= min) {
1051: /* If it could work, try it. */
1052: if (nextch == '\0' || *reginput == nextch)
1053: if (regmatch(next))
1054: return(1);
1055: /* Couldn't or didn't -- back up. */
1056: no--;
1057: reginput = save + no;
1058: }
1059: return(0);
1060: }
1061: break;
1062: case END:
1063: return(1); /* Success! */
1064: break;
1065: default:
1066: regerror("memory corruption");
1067: return(0);
1068: break;
1069: }
1070:
1071: scan = next;
1072: }
1073:
1074: /*
1075: * We get here only if there's trouble -- normally "case END" is
1076: * the terminating point.
1077: */
1078: regerror("corrupted pointers");
1079: return(0);
1080: }
1081:
1082: /*
1083: - regrepeat - repeatedly match something simple, report how many
1084: */
1085: static int
1086: regrepeat(p)
1087: char *p;
1088: {
1089: register int count = 0;
1090: register char *scan;
1091: register char *opnd;
1092:
1093: scan = reginput;
1094: opnd = OPERAND(p);
1095: switch (OP(p)) {
1096: case ANY:
1097: count = strlen(scan);
1098: scan += count;
1099: break;
1100: case EXACTLY:
1101: while (*opnd == *scan) {
1102: count++;
1103: scan++;
1104: }
1105: break;
1106: case ANYOF:
1107: while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1108: count++;
1109: scan++;
1110: }
1111: break;
1112: case ANYBUT:
1113: while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1114: count++;
1115: scan++;
1116: }
1117: break;
1118: default: /* Oh dear. Called inappropriately. */
1119: regerror("internal foulup");
1120: count = 0; /* Best compromise. */
1121: break;
1122: }
1123: reginput = scan;
1124:
1125: return(count);
1126: }
1127:
1128: /*
1129: - regnext - dig the "next" pointer out of a node
1130: */
1131: static char *
1132: regnext(p)
1133: register char *p;
1134: {
1135: register int offset;
1136:
1137: if (p == ®dummy)
1138: return(NULL);
1139:
1140: offset = NEXT(p);
1141: if (offset == 0)
1142: return(NULL);
1143:
1144: if (OP(p) == BACK)
1145: return(p-offset);
1146: else
1147: return(p+offset);
1148: }
1149:
1150: #ifdef DEBUG
1151:
1152: STATIC char *regprop();
1153:
1154: /*
1155: - regdump - dump a regexp onto stdout in vaguely comprehensible form
1156: */
1157: void
1158: regdump(r)
1159: regexp *r;
1160: {
1161: register char *s;
1162: register char op = EXACTLY; /* Arbitrary non-END op. */
1163: register char *next;
1164: extern char *strchr();
1165:
1166:
1167: s = r->program + 1;
1168: while (op != END) { /* While that wasn't END last time... */
1169: op = OP(s);
1170: printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1171: next = regnext(s);
1172: if (next == NULL) /* Next ptr. */
1173: printf("(0)");
1174: else
1175: printf("(%d)", (s-r->program)+(next-s));
1176: s += 3;
1177: if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1178: /* Literal string, where present. */
1179: while (*s != '\0') {
1180: putchar(*s);
1181: s++;
1182: }
1183: s++;
1184: }
1185: putchar('\n');
1186: }
1187:
1188: /* Header fields of interest. */
1189: if (r->regstart != '\0')
1190: printf("start `%c' ", r->regstart);
1191: if (r->reganch)
1192: printf("anchored ");
1193: if (r->regmust != NULL)
1194: printf("must have \"%s\"", r->regmust);
1195: printf("\n");
1196: }
1197:
1198: /*
1199: - regprop - printable representation of opcode
1200: */
1201: static char *
1202: regprop(op)
1203: char *op;
1204: {
1205: register char *p;
1206: static char buf[50];
1207:
1208: (void) strcpy(buf, ":");
1209:
1210: switch (OP(op)) {
1211: case BOL:
1212: p = "BOL";
1213: break;
1214: case EOL:
1215: p = "EOL";
1216: break;
1217: case ANY:
1218: p = "ANY";
1219: break;
1220: case ANYOF:
1221: p = "ANYOF";
1222: break;
1223: case ANYBUT:
1224: p = "ANYBUT";
1225: break;
1226: case BRANCH:
1227: p = "BRANCH";
1228: break;
1229: case EXACTLY:
1230: p = "EXACTLY";
1231: break;
1232: case NOTHING:
1233: p = "NOTHING";
1234: break;
1235: case BACK:
1236: p = "BACK";
1237: break;
1238: case END:
1239: p = "END";
1240: break;
1241: case OPEN+1:
1242: case OPEN+2:
1243: case OPEN+3:
1244: case OPEN+4:
1245: case OPEN+5:
1246: case OPEN+6:
1247: case OPEN+7:
1248: case OPEN+8:
1249: case OPEN+9:
1250: sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1251: p = NULL;
1252: break;
1253: case CLOSE+1:
1254: case CLOSE+2:
1255: case CLOSE+3:
1256: case CLOSE+4:
1257: case CLOSE+5:
1258: case CLOSE+6:
1259: case CLOSE+7:
1260: case CLOSE+8:
1261: case CLOSE+9:
1262: sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1263: p = NULL;
1264: break;
1265: case STAR:
1266: p = "STAR";
1267: break;
1268: case PLUS:
1269: p = "PLUS";
1270: break;
1271: case WORDA:
1272: p = "WORDA";
1273: break;
1274: case WORDZ:
1275: p = "WORDZ";
1276: break;
1277: default:
1278: regerror("corrupted opcode");
1279: break;
1280: }
1281: if (p != NULL)
1282: (void) strcat(buf, p);
1283: return(buf);
1284: }
1285: #endif
1286:
1287: /*
1288: * The following is provided for those people who do not have strcspn() in
1289: * their C libraries. They should get off their butts and do something
1290: * about it; at least one public-domain implementation of those (highly
1291: * useful) string routines has been published on Usenet.
1292: */
1293: #ifdef STRCSPN
1294: /*
1295: * strcspn - find length of initial segment of s1 consisting entirely
1296: * of characters not from s2
1297: */
1298:
1299: static int
1300: strcspn(s1, s2)
1301: char *s1;
1302: char *s2;
1303: {
1304: register char *scan1;
1305: register char *scan2;
1306: register int count;
1307:
1308: count = 0;
1309: for (scan1 = s1; *scan1 != '\0'; scan1++) {
1310: for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1311: if (*scan1 == *scan2++)
1312: return(count);
1313: count++;
1314: }
1315: return(count);
1316: }
1317: #endif
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.