|
|
1.1 root 1: /*
2: * regcomp and regexec -- regsub and regerror are elsewhere
3: *
4: * Copyright (c) 1986 by University of Toronto.
5: * Written by Henry Spencer. Not derived from licensed software.
6: *
7: * Permission is granted to anyone to use this software for any
8: * purpose on any computer system, and to redistribute it freely,
9: * subject to the following restrictions:
10: *
11: * 1. The author is not responsible for the consequences of use of
12: * this software, no matter how awful, even if they arise
13: * from defects in it.
14: *
15: * 2. The origin of this software must not be misrepresented, either
16: * by explicit claim or by omission.
17: *
18: * 3. Altered versions must be plainly marked as such, and must not
19: * be misrepresented as being the original software.
20: ********* This version is slightly altered to fit cleanly with Coherent
21: ********* libmisc.a. The original is freely available on mwcbbs.
22: *
23: * Beware that some of this code is subtly aware of the way operator
24: * precedence is structured in regular expressions. Serious changes in
25: * regular-expression syntax might require a total rethink.
26: */
27: #include <stdio.h>
28: #include "local_misc.h"
29:
30: /*
31: * The "internal use only" fields in regexp.h are present to pass info from
32: * compile to execute that permits the execute phase to run lots faster on
33: * simple cases. They are:
34: *
35: * regstart char that must begin a match; '\0' if none obvious
36: * reganch is the match anchored (at beginning-of-line only)?
37: * regmust string (pointer into program) that match must include, or NULL
38: * regmlen length of regmust string
39: *
40: * Regstart and reganch permit very fast decisions on suitable starting points
41: * for a match, cutting down the work a lot. Regmust permits fast rejection
42: * of lines that cannot possibly match. The regmust tests are costly enough
43: * that regcomp() supplies a regmust only if the r.e. contains something
44: * potentially expensive (at present, the only such thing detected is * or +
45: * at the start of the r.e., which can involve a lot of backup). Regmlen is
46: * supplied because the test in regexec() needs it and regcomp() is computing
47: * it anyway.
48: */
49:
50: /*
51: * Structure for regexp "program". This is essentially a linear encoding
52: * of a nondeterministic finite-state machine (aka syntax charts or
53: * "railroad normal form" in parsing technology). Each node is an opcode
54: * plus a "next" pointer, possibly plus an operand. "Next" pointers of
55: * all nodes except BRANCH implement concatenation; a "next" pointer with
56: * a BRANCH on both ends of it is connecting two alternatives. (Here we
57: * have one of the subtle syntax dependencies: an individual BRANCH (as
58: * opposed to a collection of them) is never concatenated with anything
59: * because of operator precedence.) The operand of some types of node is
60: * a literal string; for others, it is a node leading into a sub-FSM. In
61: * particular, the operand of a BRANCH node is the first node of the branch.
62: * (NB this is *not* a tree structure: the tail of the branch connects
63: * to the thing following the set of BRANCHes.) The opcodes are:
64: */
65:
66: /* definition number opnd? meaning */
67: #define END 0 /* no End of program. */
68: #define BOL 1 /* no Match "" at beginning of line. */
69: #define EOL 2 /* no Match "" at end of line. */
70: #define ANY 3 /* no Match any one character. */
71: #define ANYOF 4 /* str Match any character in this string. */
72: #define ANYBUT 5 /* str Match any character not in this string. */
73: #define BRANCH 6 /* node Match this alternative, or the next... */
74: #define BACK 7 /* no Match "", "next" ptr points backward. */
75: #define EXACTLY 8 /* str Match this string. */
76: #define NOTHING 9 /* no Match empty string. */
77: #define STAR 10 /* node Match this (simple) thing 0 or more times. */
78: #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
79: #define OPEN 20 /* no Mark this point in input as start of #n. */
80: /* OPEN+1 is number 1, etc. */
81: #define CLOSE 30 /* no Analogous to OPEN. */
82:
83: /*
84: * Opcode notes:
85: *
86: * BRANCH The set of branches constituting a single choice are hooked
87: * together with their "next" pointers, since precedence prevents
88: * anything being concatenated to any individual branch. The
89: * "next" pointer of the last BRANCH in a choice points to the
90: * thing following the whole choice. This is also where the
91: * final "next" pointer of each individual branch points; each
92: * branch starts with the operand node of a BRANCH node.
93: *
94: * BACK Normal "next" pointers all implicitly point forward; BACK
95: * exists to make loop structures possible.
96: *
97: * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
98: * BRANCH structures using BACK. Simple cases (one character
99: * per match) are implemented with STAR and PLUS for speed
100: * and to minimize recursive plunges.
101: *
102: * OPEN,CLOSE ...are numbered at compile time.
103: */
104:
105: /*
106: * A node is one char of opcode followed by two chars of "next" pointer.
107: * "Next" pointers are stored as two 8-bit pieces, high order first. The
108: * value is a positive offset from the opcode of the node containing it.
109: * An operand, if any, simply follows the node. (Note that much of the
110: * code generation knows about this implicit relationship.)
111: *
112: * Using two bytes for the "next" pointer is vast overkill for most things,
113: * but allows patterns to get big without disasters.
114: */
115: #define OP(p) (*(p))
116: #define NEXT(p) (((*((p)+1)&0377)<<8) + *((p)+2)&0377)
117: #define OPERAND(p) ((p) + 3)
118:
119: /*
120: * See regmagic.h for one further detail of program structure.
121: */
122:
123:
124: /*
125: * Utility definitions.
126: */
127: #ifndef CHARBITS
128: #define UCHARAT(p) ((int)*(unsigned char *)(p))
129: #else
130: #define UCHARAT(p) ((int)*(p)&CHARBITS)
131: #endif
132:
133: #define FAIL(m) { regerror(m); return(NULL); }
134: #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
135: #define META "^$.[()|?+*\\"
136:
137: /*
138: * Flags to be passed up and down.
139: */
140: #define HASWIDTH 01 /* Known never to match null string. */
141: #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
142: #define SPSTART 04 /* Starts with * or +. */
143: #define WORST 0 /* Worst case. */
144:
145: /*
146: * Global work variables for regcomp().
147: */
148: static char *regparse; /* Input-scan pointer. */
149: static int regnpar; /* () count. */
150: static char regdummy;
151: static char *regcode; /* Code-emit pointer; ®dummy = don't. */
152: static long regsize; /* Code size. */
153:
154: /*
155: * Forward declarations for regcomp()'s friends.
156: */
157: #ifndef STATIC
158: #define STATIC static
159: #endif
160: STATIC char *reg();
161: STATIC char *regbranch();
162: STATIC char *regpiece();
163: STATIC char *regatom();
164: STATIC char *regnode();
165: STATIC char *regnext();
166: STATIC void regc();
167: STATIC void reginsert();
168: STATIC void regtail();
169: STATIC void regoptail();
170: #ifdef STRCSPN
171: STATIC int strcspn();
172: #endif
173:
174: /*
175: - regcomp - compile a regular expression into internal code
176: *
177: * We can't allocate space until we know how big the compiled form will be,
178: * but we can't compile it (and thus know how big it is) until we've got a
179: * place to put the code. So we cheat: we compile it twice, once with code
180: * generation turned off and size counting turned on, and once "for real".
181: * This also means that we don't allocate space until we are sure that the
182: * thing really will compile successfully, and we never have to move the
183: * code and thus invalidate pointers into it. (Note that it has to be in
184: * one piece because free() must be able to free it all.)
185: *
186: * Beware that the optimization-preparation code in here knows about some
187: * of the structure of the compiled regexp.
188: */
189: regexp *
190: regcomp(exp)
191: char *exp;
192: {
193: register regexp *r;
194: register char *scan;
195: register char *longest;
196: register int len;
197: int flags;
198: extern char *alloc();
199:
200: if (exp == NULL)
201: FAIL("NULL argument");
202:
203: /* First pass: determine size, legality. */
204: regparse = exp;
205: regnpar = 1;
206: regsize = 0L;
207: regcode = ®dummy;
208: regc(REG_MAGIC);
209: if (reg(0, &flags) == NULL)
210: return(NULL);
211:
212: /* Small enough for pointer-storage convention? */
213: if (regsize >= 32767L) /* Probably could be 65535L. */
214: FAIL("regexp too big");
215:
216: /* Allocate space. */
217: r = (regexp *)alloc(sizeof(regexp) + (unsigned)regsize);
218:
219: /* Second pass: emit code. */
220: regparse = exp;
221: regnpar = 1;
222: regcode = r->program;
223: regc(REG_MAGIC);
224: if (reg(0, &flags) == NULL)
225: return(NULL);
226:
227: /* Dig out information for optimizations. */
228: r->regstart = '\0'; /* Worst-case defaults. */
229: r->reganch = 0;
230: r->regmust = NULL;
231: r->regmlen = 0;
232: scan = r->program+1; /* First BRANCH. */
233: if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
234: scan = OPERAND(scan);
235:
236: /* Starting-point info. */
237: if (OP(scan) == EXACTLY)
238: r->regstart = *OPERAND(scan);
239: else if (OP(scan) == BOL)
240: r->reganch++;
241:
242: /*
243: * If there's something expensive in the r.e., find the
244: * longest literal string that must appear and make it the
245: * regmust. Resolve ties in favor of later strings, since
246: * the regstart check works with the beginning of the r.e.
247: * and avoiding duplication strengthens checking. Not a
248: * strong reason, but sufficient in the absence of others.
249: */
250: if (flags&SPSTART) {
251: longest = NULL;
252: len = 0;
253: for (; scan != NULL; scan = regnext(scan))
254: if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
255: longest = OPERAND(scan);
256: len = strlen(OPERAND(scan));
257: }
258: r->regmust = longest;
259: r->regmlen = len;
260: }
261: }
262:
263: return(r);
264: }
265:
266: /*
267: - reg - regular expression, i.e. main body or parenthesized thing
268: *
269: * Caller must absorb opening parenthesis.
270: *
271: * Combining parenthesis handling with the base level of regular expression
272: * is a trifle forced, but the need to tie the tails of the branches to what
273: * follows makes it hard to avoid.
274: */
275: static char *
276: reg(paren, flagp)
277: int paren; /* Parenthesized? */
278: int *flagp;
279: {
280: register char *ret;
281: register char *br;
282: register char *ender;
283: register int parno;
284: int flags;
285:
286: *flagp = HASWIDTH; /* Tentatively. */
287:
288: /* Make an OPEN node, if parenthesized. */
289: if (paren) {
290: if (regnpar >= NSUBEXP)
291: FAIL("too many ()");
292: parno = regnpar;
293: regnpar++;
294: ret = regnode(OPEN+parno);
295: } else
296: ret = NULL;
297:
298: /* Pick up the branches, linking them together. */
299: br = regbranch(&flags);
300: if (br == NULL)
301: return(NULL);
302: if (ret != NULL)
303: regtail(ret, br); /* OPEN -> first. */
304: else
305: ret = br;
306: if (!(flags&HASWIDTH))
307: *flagp &= ~HASWIDTH;
308: *flagp |= flags&SPSTART;
309: while (*regparse == '|') {
310: regparse++;
311: br = regbranch(&flags);
312: if (br == NULL)
313: return(NULL);
314: regtail(ret, br); /* BRANCH -> BRANCH. */
315: if (!(flags&HASWIDTH))
316: *flagp &= ~HASWIDTH;
317: *flagp |= flags&SPSTART;
318: }
319:
320: /* Make a closing node, and hook it on the end. */
321: ender = regnode((paren) ? CLOSE+parno : END);
322: regtail(ret, ender);
323:
324: /* Hook the tails of the branches to the closing node. */
325: for (br = ret; br != NULL; br = regnext(br))
326: regoptail(br, ender);
327:
328: /* Check for proper termination. */
329: if (paren && *regparse++ != ')') {
330: FAIL("unmatched ()");
331: } else if (!paren && *regparse != '\0') {
332: if (*regparse == ')') {
333: FAIL("unmatched ()");
334: } else
335: FAIL("junk on end"); /* "Can't happen". */
336: /* NOTREACHED */
337: }
338:
339: return(ret);
340: }
341:
342: /*
343: - regbranch - one alternative of an | operator
344: *
345: * Implements the concatenation operator.
346: */
347: static char *
348: regbranch(flagp)
349: int *flagp;
350: {
351: register char *ret;
352: register char *chain;
353: register char *latest;
354: int flags;
355:
356: *flagp = WORST; /* Tentatively. */
357:
358: ret = regnode(BRANCH);
359: chain = NULL;
360: while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
361: latest = regpiece(&flags);
362: if (latest == NULL)
363: return(NULL);
364: *flagp |= flags&HASWIDTH;
365: if (chain == NULL) /* First piece. */
366: *flagp |= flags&SPSTART;
367: else
368: regtail(chain, latest);
369: chain = latest;
370: }
371: if (chain == NULL) /* Loop ran zero times. */
372: (void) regnode(NOTHING);
373:
374: return(ret);
375: }
376:
377: /*
378: - regpiece - something followed by possible [*+?]
379: *
380: * Note that the branching code sequences used for ? and the general cases
381: * of * and + are somewhat optimized: they use the same NOTHING node as
382: * both the endmarker for their branch list and the body of the last branch.
383: * It might seem that this node could be dispensed with entirely, but the
384: * endmarker role is not redundant.
385: */
386: static char *
387: regpiece(flagp)
388: int *flagp;
389: {
390: register char *ret;
391: register char op;
392: register char *next;
393: int flags;
394:
395: ret = regatom(&flags);
396: if (ret == NULL)
397: return(NULL);
398:
399: op = *regparse;
400: if (!ISMULT(op)) {
401: *flagp = flags;
402: return(ret);
403: }
404:
405: if (!(flags&HASWIDTH) && op != '?')
406: FAIL("*+ operand could be empty");
407: *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
408:
409: if (op == '*' && (flags&SIMPLE))
410: reginsert(STAR, ret);
411: else if (op == '*') {
412: /* Emit x* as (x&|), where & means "self". */
413: reginsert(BRANCH, ret); /* Either x */
414: regoptail(ret, regnode(BACK)); /* and loop */
415: regoptail(ret, ret); /* back */
416: regtail(ret, regnode(BRANCH)); /* or */
417: regtail(ret, regnode(NOTHING)); /* null. */
418: } else if (op == '+' && (flags&SIMPLE))
419: reginsert(PLUS, ret);
420: else if (op == '+') {
421: /* Emit x+ as x(&|), where & means "self". */
422: next = regnode(BRANCH); /* Either */
423: regtail(ret, next);
424: regtail(regnode(BACK), ret); /* loop back */
425: regtail(next, regnode(BRANCH)); /* or */
426: regtail(ret, regnode(NOTHING)); /* null. */
427: } else if (op == '?') {
428: /* Emit x? as (x|) */
429: reginsert(BRANCH, ret); /* Either x */
430: regtail(ret, regnode(BRANCH)); /* or */
431: next = regnode(NOTHING); /* null. */
432: regtail(ret, next);
433: regoptail(ret, next);
434: }
435: regparse++;
436: if (ISMULT(*regparse))
437: FAIL("nested *?+");
438:
439: return(ret);
440: }
441:
442: /*
443: - regatom - the lowest level
444: *
445: * Optimization: gobbles an entire sequence of ordinary characters so that
446: * it can turn them into a single node, which is smaller to store and
447: * faster to run. Backslashed characters are exceptions, each becoming a
448: * separate node; the code is simpler that way and it's not worth fixing.
449: */
450: static char *
451: regatom(flagp)
452: int *flagp;
453: {
454: register char *ret;
455: int flags;
456:
457: *flagp = WORST; /* Tentatively. */
458:
459: switch (*regparse++) {
460: case '^':
461: ret = regnode(BOL);
462: break;
463: case '$':
464: ret = regnode(EOL);
465: break;
466: case '.':
467: ret = regnode(ANY);
468: *flagp |= HASWIDTH|SIMPLE;
469: break;
470: case '[': {
471: register int class;
472: register int classend;
473:
474: if (*regparse == '^') { /* Complement of range. */
475: ret = regnode(ANYBUT);
476: regparse++;
477: } else
478: ret = regnode(ANYOF);
479: if (*regparse == ']' || *regparse == '-')
480: regc(*regparse++);
481: while (*regparse != '\0' && *regparse != ']') {
482: if (*regparse == '-') {
483: regparse++;
484: if (*regparse == ']' || *regparse == '\0')
485: regc('-');
486: else {
487: class = UCHARAT(regparse-2)+1;
488: classend = UCHARAT(regparse);
489: if (class > classend+1)
490: FAIL("invalid [] range");
491: for (; class <= classend; class++)
492: regc(class);
493: regparse++;
494: }
495: } else
496: regc(*regparse++);
497: }
498: regc('\0');
499: if (*regparse != ']')
500: FAIL("unmatched []");
501: regparse++;
502: *flagp |= HASWIDTH|SIMPLE;
503: }
504: break;
505: case '(':
506: ret = reg(1, &flags);
507: if (ret == NULL)
508: return(NULL);
509: *flagp |= flags&(HASWIDTH|SPSTART);
510: break;
511: case '\0':
512: case '|':
513: case ')':
514: FAIL("internal urp"); /* Supposed to be caught earlier. */
515: break;
516: case '?':
517: case '+':
518: case '*':
519: FAIL("?+* follows nothing");
520: break;
521: case '\\':
522: if (*regparse == '\0')
523: FAIL("trailing \\");
524: ret = regnode(EXACTLY);
525: regc(*regparse++);
526: regc('\0');
527: *flagp |= HASWIDTH|SIMPLE;
528: break;
529: default: {
530: register int len;
531: register char ender;
532:
533: regparse--;
534: len = strcspn(regparse, META);
535: if (len <= 0)
536: FAIL("internal disaster");
537: ender = *(regparse+len);
538: if (len > 1 && ISMULT(ender))
539: len--; /* Back off clear of ?+* operand. */
540: *flagp |= HASWIDTH;
541: if (len == 1)
542: *flagp |= SIMPLE;
543: ret = regnode(EXACTLY);
544: while (len > 0) {
545: regc(*regparse++);
546: len--;
547: }
548: regc('\0');
549: }
550: break;
551: }
552:
553: return(ret);
554: }
555:
556: /*
557: - regnode - emit a node
558: */
559: static char * /* Location. */
560: regnode(op)
561: char op;
562: {
563: register char *ret;
564: register char *ptr;
565:
566: ret = regcode;
567: if (ret == ®dummy) {
568: regsize += 3;
569: return(ret);
570: }
571:
572: ptr = ret;
573: *ptr++ = op;
574: *ptr++ = '\0'; /* Null "next" pointer. */
575: *ptr++ = '\0';
576: regcode = ptr;
577:
578: return(ret);
579: }
580:
581: /*
582: - regc - emit (if appropriate) a byte of code
583: */
584: static void
585: regc(b)
586: char b;
587: {
588: if (regcode != ®dummy)
589: *regcode++ = b;
590: else
591: regsize++;
592: }
593:
594: /*
595: - reginsert - insert an operator in front of already-emitted operand
596: *
597: * Means relocating the operand.
598: */
599: static void
600: reginsert(op, opnd)
601: char op;
602: char *opnd;
603: {
604: register char *src;
605: register char *dst;
606: register char *place;
607:
608: if (regcode == ®dummy) {
609: regsize += 3;
610: return;
611: }
612:
613: src = regcode;
614: regcode += 3;
615: dst = regcode;
616: while (src > opnd)
617: *--dst = *--src;
618:
619: place = opnd; /* Op node, where operand used to be. */
620: *place++ = op;
621: *place++ = '\0';
622: *place++ = '\0';
623: }
624:
625: /*
626: - regtail - set the next-pointer at the end of a node chain
627: */
628: static void
629: regtail(p, val)
630: char *p;
631: char *val;
632: {
633: register char *scan;
634: register char *temp;
635: register int offset;
636:
637: if (p == ®dummy)
638: return;
639:
640: /* Find last node. */
641: scan = p;
642: for (;;) {
643: temp = regnext(scan);
644: if (temp == NULL)
645: break;
646: scan = temp;
647: }
648:
649: if (OP(scan) == BACK)
650: offset = scan - val;
651: else
652: offset = val - scan;
653: *(scan+1) = (offset>>8)&0377;
654: *(scan+2) = offset&0377;
655: }
656:
657: /*
658: - regoptail - regtail on operand of first argument; nop if operandless
659: */
660: static void
661: regoptail(p, val)
662: char *p;
663: char *val;
664: {
665: /* "Operandless" and "op != BRANCH" are synonymous in practice. */
666: if (p == NULL || p == ®dummy || OP(p) != BRANCH)
667: return;
668: regtail(OPERAND(p), val);
669: }
670:
671: /*
672: * regexec and friends
673: */
674:
675: /*
676: * Global work variables for regexec().
677: */
678: static char *reginput; /* String-input pointer. */
679: static char *regbol; /* Beginning of input, for ^ check. */
680: static char **regstartp; /* Pointer to startp array. */
681: static char **regendp; /* Ditto for endp. */
682:
683: /*
684: * Forwards.
685: */
686: STATIC int regtry();
687: STATIC int regmatch();
688: STATIC int regrepeat();
689:
690: #ifdef DEBUG
691: int regnarrate = 0;
692: void regdump();
693: STATIC char *regprop();
694: #endif
695:
696: /*
697: - regexec - match a regexp against a string
698: */
699: int
700: regexec(prog, string, fromline)
701: register regexp *prog;
702: register char *string;
703: {
704: int result;
705: register char *s;
706: extern char *strchr();
707:
708: /* Be paranoid... */
709: if (prog == NULL || string == NULL) {
710: regerror("NULL parameter");
711: return(0);
712: }
713:
714: /* Check validity of program. */
715: if (UCHARAT(prog->program) != (unsigned char)REG_MAGIC) {
716: regerror("corrupted program");
717: return(0);
718: }
719:
720: /* If there is a "must appear" string, look for it. */
721: if (prog->regmust != NULL) {
722: s = string;
723: while ((s = strchr(s, prog->regmust[0])) != NULL) {
724: if (strncmp(s, prog->regmust, prog->regmlen) == 0)
725: break; /* Found it. */
726: s++;
727: }
728: if (s == NULL) { /* Not present. */
729: /* printf("return 0 from %d\n", fromline); */
730: return(0);
731: }
732: }
733:
734: /* Mark beginning of line for ^ . */
735: regbol = string;
736:
737: /* Simplest case: anchored match need be tried only once. */
738: if (prog->reganch) {
739: result = regtry(prog, string);
740: /* printf("return %d from %d\n", result, fromline); */
741: return result;
742: }
743:
744: /* Messy cases: unanchored match. */
745: s = string;
746: if (prog->regstart != '\0')
747: /* We know what char it must start with. */
748: while ((s = strchr(s, prog->regstart)) != NULL) {
749: if (regtry(prog, s)) {
750: /* printf("return 1 from %d\n", fromline); */
751: return(1);
752: }
753: s++;
754: }
755: else
756: /* We don't -- general case. */
757: do {
758: if (regtry(prog, s)) {
759: /* printf("return 1 from %d\n", fromline); */
760: return(1);
761: }
762: } while (*s++ != '\0');
763:
764: /* Failure. */
765: /* printf("return 0 from %d\n", fromline); */
766: return(0);
767: }
768:
769: /*
770: - regtry - try match at specific point
771: */
772: static int /* 0 failure, 1 success */
773: regtry(prog, string)
774: regexp *prog;
775: char *string;
776: {
777: register int i;
778: register char **sp;
779: register char **ep;
780:
781: reginput = string;
782: regstartp = prog->startp;
783: regendp = prog->endp;
784:
785: sp = prog->startp;
786: ep = prog->endp;
787: for (i = NSUBEXP; i > 0; i--) {
788: *sp++ = NULL;
789: *ep++ = NULL;
790: }
791: if (regmatch(prog->program + 1)) {
792: prog->startp[0] = string;
793: prog->endp[0] = reginput;
794: return(1);
795: } else
796: return(0);
797: }
798:
799: /*
800: - regmatch - main matching routine
801: *
802: * Conceptually the strategy is simple: check to see whether the current
803: * node matches, call self recursively to see whether the rest matches,
804: * and then act accordingly. In practice we make some effort to avoid
805: * recursion, in particular by going through "ordinary" nodes (that don't
806: * need to know whether the rest of the match failed) by a loop instead of
807: * by recursion.
808: */
809: static int /* 0 failure, 1 success */
810: regmatch(prog)
811: char *prog;
812: {
813: register char *scan; /* Current node. */
814: char *next; /* Next node. */
815: extern char *strchr();
816:
817: scan = prog;
818: #ifdef DEBUG
819: if (scan != NULL && regnarrate)
820: fprintf(stderr, "%s(\n", regprop(scan));
821: #endif
822: while (scan != NULL) {
823: #ifdef DEBUG
824: if (regnarrate)
825: fprintf(stderr, "%s...\n", regprop(scan));
826: #endif
827: next = regnext(scan);
828:
829: switch (OP(scan)) {
830: case BOL:
831: if (reginput != regbol)
832: return(0);
833: break;
834: case EOL:
835: if (*reginput != '\0')
836: return(0);
837: break;
838: case ANY:
839: if (*reginput == '\0')
840: return(0);
841: reginput++;
842: break;
843: case EXACTLY: {
844: register int len;
845: register char *opnd;
846:
847: opnd = OPERAND(scan);
848: /* Inline the first character, for speed. */
849: if (*opnd != *reginput)
850: return(0);
851: len = strlen(opnd);
852: if (len > 1 && strncmp(opnd, reginput, len) != 0)
853: return(0);
854: reginput += len;
855: }
856: break;
857: case ANYOF:
858: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
859: return(0);
860: reginput++;
861: break;
862: case ANYBUT:
863: if (strchr(OPERAND(scan), *reginput) != NULL)
864: if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
865: return(0);
866: reginput++;
867: break;
868: case NOTHING:
869: break;
870: case BACK:
871: break;
872: case OPEN+1:
873: case OPEN+2:
874: case OPEN+3:
875: case OPEN+4:
876: case OPEN+5:
877: case OPEN+6:
878: case OPEN+7:
879: case OPEN+8:
880: case OPEN+9: {
881: register int no;
882: register char *save;
883:
884: no = OP(scan) - OPEN;
885: save = reginput;
886:
887: if (regmatch(next)) {
888: /*
889: * Don't set startp if some later
890: * invocation of the same parentheses
891: * already has.
892: */
893: if (regstartp[no] == NULL)
894: regstartp[no] = save;
895: return(1);
896: } else
897: return(0);
898: }
899: break;
900: case CLOSE+1:
901: case CLOSE+2:
902: case CLOSE+3:
903: case CLOSE+4:
904: case CLOSE+5:
905: case CLOSE+6:
906: case CLOSE+7:
907: case CLOSE+8:
908: case CLOSE+9: {
909: register int no;
910: register char *save;
911:
912: no = OP(scan) - CLOSE;
913: save = reginput;
914:
915: if (regmatch(next)) {
916: /*
917: * Don't set endp if some later
918: * invocation of the same parentheses
919: * already has.
920: */
921: if (regendp[no] == NULL)
922: regendp[no] = save;
923: return(1);
924: } else
925: return(0);
926: }
927: break;
928: case BRANCH: {
929: register char *save;
930:
931: if (OP(next) != BRANCH) /* No choice. */
932: next = OPERAND(scan); /* Avoid recursion. */
933: else {
934: do {
935: save = reginput;
936: if (regmatch(OPERAND(scan)))
937: return(1);
938: reginput = save;
939: scan = regnext(scan);
940: } while (scan != NULL && OP(scan) == BRANCH);
941: return(0);
942: /* NOTREACHED */
943: }
944: }
945: break;
946: case STAR:
947: case PLUS: {
948: register char nextch;
949: register int no;
950: register char *save;
951: register int min;
952:
953: /*
954: * Lookahead to avoid useless match attempts
955: * when we know what character comes next.
956: */
957: nextch = '\0';
958: if (OP(next) == EXACTLY)
959: nextch = *OPERAND(next);
960: min = (OP(scan) == STAR) ? 0 : 1;
961: save = reginput;
962: no = regrepeat(OPERAND(scan));
963: while (no >= min) {
964: /* If it could work, try it. */
965: if (nextch == '\0' || *reginput == nextch)
966: if (regmatch(next))
967: return(1);
968: /* Couldn't or didn't -- back up. */
969: no--;
970: reginput = save + no;
971: }
972: return(0);
973: }
974: break;
975: case END:
976: return(1); /* Success! */
977: break;
978: default:
979: regerror("memory corruption");
980: return(0);
981: break;
982: }
983:
984: scan = next;
985: }
986:
987: /*
988: * We get here only if there's trouble -- normally "case END" is
989: * the terminating point.
990: */
991: regerror("corrupted pointers");
992: return(0);
993: }
994:
995: /*
996: - regrepeat - repeatedly match something simple, report how many
997: */
998: static int
999: regrepeat(p)
1000: char *p;
1001: {
1002: register int count = 0;
1003: register char *scan;
1004: register char *opnd;
1005:
1006: scan = reginput;
1007: opnd = OPERAND(p);
1008: switch (OP(p)) {
1009: case ANY:
1010: count = strlen(scan);
1011: scan += count;
1012: break;
1013: case EXACTLY:
1014: while (*opnd == *scan) {
1015: count++;
1016: scan++;
1017: }
1018: break;
1019: case ANYOF:
1020: while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1021: count++;
1022: scan++;
1023: }
1024: break;
1025: case ANYBUT:
1026: while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1027: count++;
1028: scan++;
1029: }
1030: break;
1031: default: /* Oh dear. Called inappropriately. */
1032: regerror("internal foulup");
1033: count = 0; /* Best compromise. */
1034: break;
1035: }
1036: reginput = scan;
1037:
1038: return(count);
1039: }
1040:
1041: /*
1042: - regnext - dig the "next" pointer out of a node
1043: */
1044: static char *
1045: regnext(p)
1046: register char *p;
1047: {
1048: register int offset;
1049:
1050: if (p == ®dummy)
1051: return(NULL);
1052:
1053: offset = NEXT(p);
1054: if (offset == 0)
1055: return(NULL);
1056:
1057: if (OP(p) == BACK)
1058: return(p-offset);
1059: else
1060: return(p+offset);
1061: }
1062:
1063: #ifdef DEBUG
1064:
1065: STATIC char *regprop();
1066:
1067: /*
1068: - regdump - dump a regexp onto stdout in vaguely comprehensible form
1069: */
1070: void
1071: regdump(r)
1072: regexp *r;
1073: {
1074: register char *s;
1075: register char op = EXACTLY; /* Arbitrary non-END op. */
1076: register char *next;
1077: extern char *strchr();
1078:
1079:
1080: s = r->program + 1;
1081: while (op != END) { /* While that wasn't END last time... */
1082: op = OP(s);
1083: printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1084: next = regnext(s);
1085: if (next == NULL) /* Next ptr. */
1086: printf("(0)");
1087: else
1088: printf("(%d)", (s-r->program)+(next-s));
1089: s += 3;
1090: if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1091: /* Literal string, where present. */
1092: while (*s != '\0') {
1093: putchar(*s);
1094: s++;
1095: }
1096: s++;
1097: }
1098: putchar('\n');
1099: }
1100:
1101: /* Header fields of interest. */
1102: if (r->regstart != '\0')
1103: printf("start `%c' ", r->regstart);
1104: if (r->reganch)
1105: printf("anchored ");
1106: if (r->regmust != NULL)
1107: printf("must have \"%s\"", r->regmust);
1108: printf("\n");
1109: }
1110:
1111: /*
1112: - regprop - printable representation of opcode
1113: */
1114: static char *
1115: regprop(op)
1116: char *op;
1117: {
1118: register char *p;
1119: static char buf[50];
1120:
1121: (void) strcpy(buf, ":");
1122:
1123: switch (OP(op)) {
1124: case BOL:
1125: p = "BOL";
1126: break;
1127: case EOL:
1128: p = "EOL";
1129: break;
1130: case ANY:
1131: p = "ANY";
1132: break;
1133: case ANYOF:
1134: p = "ANYOF";
1135: break;
1136: case ANYBUT:
1137: p = "ANYBUT";
1138: break;
1139: case BRANCH:
1140: p = "BRANCH";
1141: break;
1142: case EXACTLY:
1143: p = "EXACTLY";
1144: break;
1145: case NOTHING:
1146: p = "NOTHING";
1147: break;
1148: case BACK:
1149: p = "BACK";
1150: break;
1151: case END:
1152: p = "END";
1153: break;
1154: case OPEN+1:
1155: case OPEN+2:
1156: case OPEN+3:
1157: case OPEN+4:
1158: case OPEN+5:
1159: case OPEN+6:
1160: case OPEN+7:
1161: case OPEN+8:
1162: case OPEN+9:
1163: sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1164: p = NULL;
1165: break;
1166: case CLOSE+1:
1167: case CLOSE+2:
1168: case CLOSE+3:
1169: case CLOSE+4:
1170: case CLOSE+5:
1171: case CLOSE+6:
1172: case CLOSE+7:
1173: case CLOSE+8:
1174: case CLOSE+9:
1175: sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1176: p = NULL;
1177: break;
1178: case STAR:
1179: p = "STAR";
1180: break;
1181: case PLUS:
1182: p = "PLUS";
1183: break;
1184: default:
1185: regerror("corrupted opcode");
1186: break;
1187: }
1188: if (p != NULL)
1189: (void) strcat(buf, p);
1190: return(buf);
1191: }
1192: #endif
1193:
1194: /*
1195: * The following is provided for those people who do not have strcspn() in
1196: * their C libraries. They should get off their butts and do something
1197: * about it; at least one public-domain implementation of those (highly
1198: * useful) string routines has been published on Usenet.
1199: */
1200: #ifdef STRCSPN
1201: /*
1202: * strcspn - find length of initial segment of s1 consisting entirely
1203: * of characters not from s2
1204: */
1205:
1206: static int
1207: strcspn(s1, s2)
1208: char *s1;
1209: char *s2;
1210: {
1211: register char *scan1;
1212: register char *scan2;
1213: register int count;
1214:
1215: count = 0;
1216: for (scan1 = s1; *scan1 != '\0'; scan1++) {
1217: for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1218: if (*scan1 == *scan2++)
1219: return(count);
1220: count++;
1221: }
1222: return(count);
1223: }
1224: #endif
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.