|
|
1.1 root 1: static char sccsid[] = "@(#)regexp.c 4.1 (Berkeley) 10/19/82";
2:
3: #include <ctype.h>
4:
5: typedef int boolean;
6: #define TRUE 1
7: #define FALSE 0
8: #define NIL 0
9:
10: boolean l_onecase; /* true if upper and lower equivalent */
11:
12: #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
13:
14: /* STRNCMP - like strncmp except that we convert the
15: * first string to lower case before comparing
16: * if l_onecase is set.
17: */
18:
19: STRNCMP(s1, s2, len)
20: register char *s1,*s2;
21: register int len;
22: {
23: if (l_onecase) {
24: do
25: if (*s2 - makelower(*s1))
26: return (*s2 - makelower(*s1));
27: else {
28: s2++;
29: s1++;
30: }
31: while (--len);
32: } else {
33: do
34: if (*s2 - *s1)
35: return (*s2 - *s1);
36: else {
37: s2++;
38: s1++;
39: }
40: while (--len);
41: }
42: return(0);
43: }
44:
45: /* The following routine converts an irregular expression to
46: * internal format.
47: *
48: * Either meta symbols (\a \d or \p) or character strings or
49: * operations ( alternation or perenthesizing ) can be
50: * specified. Each starts with a descriptor byte. The descriptor
51: * byte has STR set for strings, META set for meta symbols
52: * and OPER set for operations.
53: * The descriptor byte can also have the OPT bit set if the object
54: * defined is optional. Also ALT can be set to indicate an alternation.
55: *
56: * For metasymbols the byte following the descriptor byte identities
57: * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
58: * strings the byte after the descriptor is a character count for
59: * the string:
60: *
61: * meta symbols := descriptor
62: * symbol
63: *
64: * strings := descriptor
65: * character count
66: * the string
67: *
68: * operatins := descriptor
69: * symbol
70: * character count
71: */
72:
73: /*
74: * handy macros for accessing parts of match blocks
75: */
76: #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
77: #define MNEXT(A) (A+2) /* character following a metasymbol block */
78:
79: #define OSYM(A) (*(A+1)) /* symbol in an operation block */
80: #define OCNT(A) (*(A+2)) /* character count */
81: #define ONEXT(A) (A+3) /* next character after the operation */
82: #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
83:
84: #define SCNT(A) (*(A+1)) /* byte count of a string */
85: #define SSTR(A) (A+2) /* address of the string */
86: #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
87:
88: /*
89: * bit flags in the descriptor
90: */
91: #define OPT 1
92: #define STR 2
93: #define META 4
94: #define ALT 8
95: #define OPER 16
96:
97: char *ure; /* pointer current position in unconverted exp */
98: char *ccre; /* pointer to current position in converted exp*/
99: char *malloc();
100:
101: char *
102: convexp(re)
103: char *re; /* unconverted irregular expression */
104: {
105: register char *cre; /* pointer to converted regular expression */
106:
107: /* allocate room for the converted expression */
108: if (re == NIL)
109: return (NIL);
110: if (*re == '\0')
111: return (NIL);
112: cre = malloc (4 * strlen(re) + 3);
113: ccre = cre;
114: ure = re;
115:
116: /* start the conversion with a \a */
117: *cre = META | OPT;
118: MSYM(cre) = 'a';
119: ccre = MNEXT(cre);
120:
121: /* start the conversion (its recursive) */
122: expconv ();
123: *ccre = 0;
124: return (cre);
125: }
126:
127: expconv()
128: {
129: register char *cs; /* pointer to current symbol in converted exp */
130: register char c; /* character being processed */
131: register char *acs; /* pinter to last alternate */
132: register int temp;
133:
134: /* let the conversion begin */
135: acs = NIL;
136: while (*ure != NIL) {
137: switch (c = *ure++) {
138:
139: case '\\':
140: switch (c = *ure++) {
141:
142: /* escaped characters are just characters */
143: default:
144: if ((*cs & STR) == 0) {
145: cs = ccre;
146: *cs = STR;
147: SCNT(cs) = 1;
148: ccre += 2;
149: } else
150: SCNT(cs)++;
151: *ccre++ = c;
152: break;
153:
154: /* normal(?) metacharacters */
155: case 'a':
156: case 'd':
157: case 'e':
158: case 'p':
159: if (acs != NIL && acs != cs) {
160: do {
161: temp = OCNT(acs);
162: OCNT(acs) = ccre - acs;
163: acs -= temp;
164: } while (temp != 0);
165: acs = NIL;
166: }
167: cs = ccre;
168: *cs = META;
169: MSYM(cs) = c;
170: ccre = MNEXT(cs);
171: break;
172: }
173: break;
174:
175: /* just put the symbol in */
176: case '^':
177: case '$':
178: if (acs != NIL && acs != cs) {
179: do {
180: temp = OCNT(acs);
181: OCNT(acs) = ccre - acs;
182: acs -= temp;
183: } while (temp != 0);
184: acs = NIL;
185: }
186: cs = ccre;
187: *cs = META;
188: MSYM(cs) = c;
189: ccre = MNEXT(cs);
190: break;
191:
192: /* mark the last match sequence as optional */
193: case '?':
194: *cs = *cs | OPT;
195: break;
196:
197: /* recurse and define a subexpression */
198: case '(':
199: if (acs != NIL && acs != cs) {
200: do {
201: temp = OCNT(acs);
202: OCNT(acs) = ccre - acs;
203: acs -= temp;
204: } while (temp != 0);
205: acs = NIL;
206: }
207: cs = ccre;
208: *cs = OPER;
209: OSYM(cs) = '(';
210: ccre = ONEXT(cs);
211: expconv ();
212: OCNT(cs) = ccre - cs; /* offset to next symbol */
213: break;
214:
215: /* return from a recursion */
216: case ')':
217: if (acs != NIL) {
218: do {
219: temp = OCNT(acs);
220: OCNT(acs) = ccre - acs;
221: acs -= temp;
222: } while (temp != 0);
223: acs = NIL;
224: }
225: cs = ccre;
226: *cs = META;
227: MSYM(cs) = c;
228: ccre = MNEXT(cs);
229: return;
230:
231: /* mark the last match sequence as having an alternate */
232: /* the third byte will contain an offset to jump over the */
233: /* alternate match in case the first did not fail */
234: case '|':
235: if (acs != NIL && acs != cs)
236: OCNT(ccre) = ccre - acs; /* make a back pointer */
237: else
238: OCNT(ccre) = 0;
239: *cs |= ALT;
240: cs = ccre;
241: *cs = OPER;
242: OSYM(cs) = '|';
243: ccre = ONEXT(cs);
244: acs = cs; /* remember that the pointer is to be filles */
245: break;
246:
247: /* if its not a metasymbol just build a scharacter string */
248: default:
249: if ((*cs & STR) == 0) {
250: cs = ccre;
251: *cs = STR;
252: SCNT(cs) = 1;
253: ccre = SSTR(cs);
254: } else
255: SCNT(cs)++;
256: *ccre++ = c;
257: break;
258: }
259: }
260: if (acs != NIL) {
261: do {
262: temp = OCNT(acs);
263: OCNT(acs) = ccre - acs;
264: acs -= temp;
265: } while (temp != 0);
266: acs = NIL;
267: }
268: return;
269: }
270: /* end of convertre */
271:
272:
273: /*
274: * The following routine recognises an irregular expresion
275: * with the following special characters:
276: *
277: * \? - means last match was optional
278: * \a - matches any number of characters
279: * \d - matches any number of spaces and tabs
280: * \p - matches any number of alphanumeric
281: * characters. The
282: * characters matched will be copied into
283: * the area pointed to by 'name'.
284: * \| - alternation
285: * \( \) - grouping used mostly for alternation and
286: * optionality
287: *
288: * The irregular expression must be translated to internal form
289: * prior to calling this routine
290: *
291: * The value returned is the pointer to the first non \a
292: * character matched.
293: */
294:
295: boolean _escaped; /* true if we are currently _escaped */
296: char *_start; /* start of string */
297:
298: char *
299: expmatch (s, re, mstring)
300: register char *s; /* string to check for a match in */
301: register char *re; /* a converted irregular expression */
302: register char *mstring; /* where to put whatever matches a \p */
303: {
304: register char *cs; /* the current symbol */
305: register char *ptr,*s1; /* temporary pointer */
306: boolean matched; /* a temporary boolean */
307:
308: /* initial conditions */
309: if (re == NIL)
310: return (NIL);
311: cs = re;
312: matched = FALSE;
313:
314: /* loop till expression string is exhausted (or at least pretty tired) */
315: while (*cs) {
316: switch (*cs & (OPER | STR | META)) {
317:
318: /* try to match a string */
319: case STR:
320: matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
321: if (matched) {
322:
323: /* hoorah it matches */
324: s += SCNT(cs);
325: cs = SNEXT(cs);
326: } else if (*cs & ALT) {
327:
328: /* alternation, skip to next expression */
329: cs = SNEXT(cs);
330: } else if (*cs & OPT) {
331:
332: /* the match is optional */
333: cs = SNEXT(cs);
334: matched = 1; /* indicate a successful match */
335: } else {
336:
337: /* no match, error return */
338: return (NIL);
339: }
340: break;
341:
342: /* an operator, do something fancy */
343: case OPER:
344: switch (OSYM(cs)) {
345:
346: /* this is an alternation */
347: case '|':
348: if (matched)
349:
350: /* last thing in the alternation was a match, skip ahead */
351: cs = OPTR(cs);
352: else
353:
354: /* no match, keep trying */
355: cs = ONEXT(cs);
356: break;
357:
358: /* this is a grouping, recurse */
359: case '(':
360: ptr = expmatch (s, ONEXT(cs), mstring);
361: if (ptr != NIL) {
362:
363: /* the subexpression matched */
364: matched = 1;
365: s = ptr;
366: } else if (*cs & ALT) {
367:
368: /* alternation, skip to next expression */
369: matched = 0;
370: } else if (*cs & OPT) {
371:
372: /* the match is optional */
373: matched = 1; /* indicate a successful match */
374: } else {
375:
376: /* no match, error return */
377: return (NIL);
378: }
379: cs = OPTR(cs);
380: break;
381: }
382: break;
383:
384: /* try to match a metasymbol */
385: case META:
386: switch (MSYM(cs)) {
387:
388: /* try to match anything and remember what was matched */
389: case 'p':
390: /*
391: * This is really the same as trying the match the
392: * remaining parts of the expression to any subset
393: * of the string.
394: */
395: s1 = s;
396: do {
397: ptr = expmatch (s1, MNEXT(cs), mstring);
398: if (ptr != NIL && s1 != s) {
399:
400: /* we have a match, remember the match */
401: strncpy (mstring, s, s1 - s);
402: mstring[s1 - s] = '\0';
403: return (ptr);
404: } else if (ptr != NIL && (*cs & OPT)) {
405:
406: /* it was aoptional so no match is ok */
407: return (ptr);
408: } else if (ptr != NIL) {
409:
410: /* not optional and we still matched */
411: return (NIL);
412: }
413: if (!isalnum(*s1) && *s1 != '_')
414: return (NIL);
415: if (*s1 == '\\')
416: _escaped = _escaped ? FALSE : TRUE;
417: else
418: _escaped = FALSE;
419: } while (*s1++);
420: return (NIL);
421:
422: /* try to match anything */
423: case 'a':
424: /*
425: * This is really the same as trying the match the
426: * remaining parts of the expression to any subset
427: * of the string.
428: */
429: s1 = s;
430: do {
431: ptr = expmatch (s1, MNEXT(cs), mstring);
432: if (ptr != NIL && s1 != s) {
433:
434: /* we have a match */
435: return (ptr);
436: } else if (ptr != NIL && (*cs & OPT)) {
437:
438: /* it was aoptional so no match is ok */
439: return (ptr);
440: } else if (ptr != NIL) {
441:
442: /* not optional and we still matched */
443: return (NIL);
444: }
445: if (*s1 == '\\')
446: _escaped = _escaped ? FALSE : TRUE;
447: else
448: _escaped = FALSE;
449: } while (*s1++);
450: return (NIL);
451:
452: /* fail if we are currently _escaped */
453: case 'e':
454: if (_escaped)
455: return(NIL);
456: cs = MNEXT(cs);
457: break;
458:
459: /* match any number of tabs and spaces */
460: case 'd':
461: ptr = s;
462: while (*s == ' ' || *s == '\t')
463: s++;
464: if (s != ptr || s == _start) {
465:
466: /* match, be happy */
467: matched = 1;
468: cs = MNEXT(cs);
469: } else if (*s == '\n' || *s == '\0') {
470:
471: /* match, be happy */
472: matched = 1;
473: cs = MNEXT(cs);
474: } else if (*cs & ALT) {
475:
476: /* try the next part */
477: matched = 0;
478: cs = MNEXT(cs);
479: } else if (*cs & OPT) {
480:
481: /* doesn't matter */
482: matched = 1;
483: cs = MNEXT(cs);
484: } else
485:
486: /* no match, error return */
487: return (NIL);
488: break;
489:
490: /* check for end of line */
491: case '$':
492: if (*s == '\0' || *s == '\n') {
493:
494: /* match, be happy */
495: s++;
496: matched = 1;
497: cs = MNEXT(cs);
498: } else if (*cs & ALT) {
499:
500: /* try the next part */
501: matched = 0;
502: cs = MNEXT(cs);
503: } else if (*cs & OPT) {
504:
505: /* doesn't matter */
506: matched = 1;
507: cs = MNEXT(cs);
508: } else
509:
510: /* no match, error return */
511: return (NIL);
512: break;
513:
514: /* check for start of line */
515: case '^':
516: if (s == _start) {
517:
518: /* match, be happy */
519: matched = 1;
520: cs = MNEXT(cs);
521: } else if (*cs & ALT) {
522:
523: /* try the next part */
524: matched = 0;
525: cs = MNEXT(cs);
526: } else if (*cs & OPT) {
527:
528: /* doesn't matter */
529: matched = 1;
530: cs = MNEXT(cs);
531: } else
532:
533: /* no match, error return */
534: return (NIL);
535: break;
536:
537: /* end of a subexpression, return success */
538: case ')':
539: return (s);
540: }
541: break;
542: }
543: }
544: return (s);
545: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.