|
|
1.1 root 1: /* @(#)regex.c 4.1 (Berkeley) 12/21/80 */
2: #
3:
4: /*
5: * routines to do regular expression matching
6: *
7: * Entry points:
8: *
9: * re_comp(s)
10: * char *s;
11: * ... returns 0 if the string s was compiled successfully,
12: * a pointer to an error message otherwise.
13: * If passed 0 or a null string returns without changing
14: * the currently compiled re (see note 11 below).
15: *
16: * re_exec(s)
17: * char *s;
18: * ... returns 1 if the string s matches the last compiled regular
19: * expression,
20: * 0 if the string s failed to match the last compiled
21: * regular expression, and
22: * -1 if the compiled regular expression was invalid
23: * (indicating an internal error).
24: *
25: * The strings passed to both re_comp and re_exec may have trailing or
26: * embedded newline characters; they are terminated by nulls.
27: *
28: * The identity of the author of these routines is lost in antiquity;
29: * this is essentially the same as the re code in the original V6 ed.
30: *
31: * The regular expressions recognized are described below. This description
32: * is essentially the same as that for ed.
33: *
34: * A regular expression specifies a set of strings of characters.
35: * A member of this set of strings is said to be matched by
36: * the regular expression. In the following specification for
37: * regular expressions the word `character' means any character but NUL.
38: *
39: * 1. Any character except a special character matches itself.
40: * Special characters are the regular expression delimiter plus
41: * \ [ . and sometimes ^ * $.
42: * 2. A . matches any character.
43: * 3. A \ followed by any character except a digit or ( )
44: * matches that character.
45: * 4. A nonempty string s bracketed [s] (or [^s]) matches any
46: * character in (or not in) s. In s, \ has no special meaning,
47: * and ] may only appear as the first letter. A substring
48: * a-b, with a and b in ascending ASCII order, stands for
49: * the inclusive range of ASCII characters.
50: * 5. A regular expression of form 1-4 followed by * matches a
51: * sequence of 0 or more matches of the regular expression.
52: * 6. A regular expression, x, of form 1-8, bracketed \(x\)
53: * matches what x matches.
54: * 7. A \ followed by a digit n matches a copy of the string that the
55: * bracketed regular expression beginning with the nth \( matched.
56: * 8. A regular expression of form 1-8, x, followed by a regular
57: * expression of form 1-7, y matches a match for x followed by
58: * a match for y, with the x match being as long as possible
59: * while still permitting a y match.
60: * 9. A regular expression of form 1-8 preceded by ^ (or followed
61: * by $), is constrained to matches that begin at the left
62: * (or end at the right) end of a line.
63: * 10. A regular expression of form 1-9 picks out the longest among
64: * the leftmost matches in a line.
65: * 11. An empty regular expression stands for a copy of the last
66: * regular expression encountered.
67: */
68:
69: /*
70: * constants for re's
71: */
72: #define CBRA 1
73: #define CCHR 2
74: #define CDOT 4
75: #define CCL 6
76: #define NCCL 8
77: #define CDOL 10
78: #define CEOF 11
79: #define CKET 12
80: #define CBACK 18
81:
82: #define CSTAR 01
83:
84: #define ESIZE 512
85: #define NBRA 9
86:
87: static char expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
88: static char circf;
89:
90: /*
91: * compile the regular expression argument into a dfa
92: */
93: char *
94: re_comp(sp)
95: register char *sp;
96: {
97: register int c;
98: register char *ep = expbuf;
99: int cclcnt, numbra = 0;
100: char *lastep = 0;
101: char bracket[NBRA];
102: char *bracketp = &bracket[0];
103: static char *retoolong = "Regular expression too long";
104:
105: #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
106:
107: if (sp == 0 || *sp == '\0') {
108: if (*ep == 0)
109: return("No previous regular expression");
110: return(0);
111: }
112: if (*sp == '^') {
113: circf = 1;
114: sp++;
115: }
116: else
117: circf = 0;
118: for (;;) {
119: if (ep >= &expbuf[ESIZE])
120: comerr(retoolong);
121: if ((c = *sp++) == '\0') {
122: if (bracketp != bracket)
123: comerr("unmatched \\(");
124: *ep++ = CEOF;
125: *ep++ = 0;
126: return(0);
127: }
128: if (c != '*')
129: lastep = ep;
130: switch (c) {
131:
132: case '.':
133: *ep++ = CDOT;
134: continue;
135:
136: case '*':
137: if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
138: goto defchar;
139: *lastep |= CSTAR;
140: continue;
141:
142: case '$':
143: if (*sp != '\0')
144: goto defchar;
145: *ep++ = CDOL;
146: continue;
147:
148: case '[':
149: *ep++ = CCL;
150: *ep++ = 0;
151: cclcnt = 1;
152: if ((c = *sp++) == '^') {
153: c = *sp++;
154: ep[-2] = NCCL;
155: }
156: do {
157: if (c == '\0')
158: comerr("missing ]");
159: if (c == '-' && ep [-1] != 0) {
160: if ((c = *sp++) == ']') {
161: *ep++ = '-';
162: cclcnt++;
163: break;
164: }
165: while (ep[-1] < c) {
166: *ep = ep[-1] + 1;
167: ep++;
168: cclcnt++;
169: if (ep >= &expbuf[ESIZE])
170: comerr(retoolong);
171: }
172: }
173: *ep++ = c;
174: cclcnt++;
175: if (ep >= &expbuf[ESIZE])
176: comerr(retoolong);
177: } while ((c = *sp++) != ']');
178: lastep[1] = cclcnt;
179: continue;
180:
181: case '\\':
182: if ((c = *sp++) == '(') {
183: if (numbra >= NBRA)
184: comerr("too many \\(\\) pairs");
185: *bracketp++ = numbra;
186: *ep++ = CBRA;
187: *ep++ = numbra++;
188: continue;
189: }
190: if (c == ')') {
191: if (bracketp <= bracket)
192: comerr("unmatched \\)");
193: *ep++ = CKET;
194: *ep++ = *--bracketp;
195: continue;
196: }
197: if (c >= '1' && c < ('1' + NBRA)) {
198: *ep++ = CBACK;
199: *ep++ = c - '1';
200: continue;
201: }
202: *ep++ = CCHR;
203: *ep++ = c;
204: continue;
205:
206: defchar:
207: default:
208: *ep++ = CCHR;
209: *ep++ = c;
210: }
211: }
212: }
213:
214: /*
215: * match the argument string against the compiled re
216: */
217: int
218: re_exec(p1)
219: register char *p1;
220: {
221: register char *p2 = expbuf;
222: register int c;
223: int rv;
224:
225: for (c = 0; c < NBRA; c++) {
226: braslist[c] = 0;
227: braelist[c] = 0;
228: }
229: if (circf)
230: return((advance(p1, p2)));
231: /*
232: * fast check for first character
233: */
234: if (*p2 == CCHR) {
235: c = p2[1];
236: do {
237: if (*p1 != c)
238: continue;
239: if (rv = advance(p1, p2))
240: return(rv);
241: } while (*p1++);
242: return(0);
243: }
244: /*
245: * regular algorithm
246: */
247: do
248: if (rv = advance(p1, p2))
249: return(rv);
250: while (*p1++);
251: return(0);
252: }
253:
254: /*
255: * try to match the next thing in the dfa
256: */
257: static int
258: advance(lp, ep)
259: register char *lp, *ep;
260: {
261: register char *curlp;
262: int ct, i;
263: int rv;
264:
265: for (;;)
266: switch (*ep++) {
267:
268: case CCHR:
269: if (*ep++ == *lp++)
270: continue;
271: return(0);
272:
273: case CDOT:
274: if (*lp++)
275: continue;
276: return(0);
277:
278: case CDOL:
279: if (*lp == '\0')
280: continue;
281: return(0);
282:
283: case CEOF:
284: return(1);
285:
286: case CCL:
287: if (cclass(ep, *lp++, 1)) {
288: ep += *ep;
289: continue;
290: }
291: return(0);
292:
293: case NCCL:
294: if (cclass(ep, *lp++, 0)) {
295: ep += *ep;
296: continue;
297: }
298: return(0);
299:
300: case CBRA:
301: braslist[*ep++] = lp;
302: continue;
303:
304: case CKET:
305: braelist[*ep++] = lp;
306: continue;
307:
308: case CBACK:
309: if (braelist[i = *ep++] == 0)
310: return(-1);
311: if (backref(i, lp)) {
312: lp += braelist[i] - braslist[i];
313: continue;
314: }
315: return(0);
316:
317: case CBACK|CSTAR:
318: if (braelist[i = *ep++] == 0)
319: return(-1);
320: curlp = lp;
321: ct = braelist[i] - braslist[i];
322: while (backref(i, lp))
323: lp += ct;
324: while (lp >= curlp) {
325: if (rv = advance(lp, ep))
326: return(rv);
327: lp -= ct;
328: }
329: continue;
330:
331: case CDOT|CSTAR:
332: curlp = lp;
333: while (*lp++)
334: ;
335: goto star;
336:
337: case CCHR|CSTAR:
338: curlp = lp;
339: while (*lp++ == *ep)
340: ;
341: ep++;
342: goto star;
343:
344: case CCL|CSTAR:
345: case NCCL|CSTAR:
346: curlp = lp;
347: while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
348: ;
349: ep += *ep;
350: goto star;
351:
352: star:
353: do {
354: lp--;
355: if (rv = advance(lp, ep))
356: return(rv);
357: } while (lp > curlp);
358: return(0);
359:
360: default:
361: return(-1);
362: }
363: }
364:
365: backref(i, lp)
366: register int i;
367: register char *lp;
368: {
369: register char *bp;
370:
371: bp = braslist[i];
372: while (*bp++ == *lp++)
373: if (bp >= braelist[i])
374: return(1);
375: return(0);
376: }
377:
378: int
379: cclass(set, c, af)
380: register char *set, c;
381: int af;
382: {
383: register int n;
384:
385: if (c == 0)
386: return(0);
387: n = *set++;
388: while (--n)
389: if (*set++ == c)
390: return(af);
391: return(! af);
392: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.