|
|
1.1 root 1: /*
2: * Copyright (c) 1980 The Regents of the University of California.
3: * All rights reserved.
4: *
5: * Redistribution and use in source and binary forms are permitted
6: * provided that: (1) source distributions retain this entire copyright
7: * notice and comment, and (2) distributions including binaries display
8: * the following acknowledgement: ``This product includes software
9: * developed by the University of California, Berkeley and its contributors''
10: * in the documentation or other materials provided with the distribution
11: * and in all advertising materials mentioning features or use of this
12: * software. Neither the name of the University nor the names of its
13: * contributors may be used to endorse or promote products derived
14: * from this software without specific prior written permission.
15: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
16: * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
17: * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
18: */
19:
20: #ifndef lint
21: static char sccsid[] = "@(#)regexp.c 5.3 (Berkeley) 6/1/90";
22: #endif /* not lint */
23:
24: #include <ctype.h>
25:
26: typedef int boolean;
27: #define TRUE 1
28: #define FALSE 0
29: #define NIL 0
30:
31: boolean l_onecase; /* true if upper and lower equivalent */
32:
33: #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
34:
35: /* STRNCMP - like strncmp except that we convert the
36: * first string to lower case before comparing
37: * if l_onecase is set.
38: */
39:
40: STRNCMP(s1, s2, len)
41: register char *s1,*s2;
42: register int len;
43: {
44: if (l_onecase) {
45: do
46: if (*s2 - makelower(*s1))
47: return (*s2 - makelower(*s1));
48: else {
49: s2++;
50: s1++;
51: }
52: while (--len);
53: } else {
54: do
55: if (*s2 - *s1)
56: return (*s2 - *s1);
57: else {
58: s2++;
59: s1++;
60: }
61: while (--len);
62: }
63: return(0);
64: }
65:
66: /* The following routine converts an irregular expression to
67: * internal format.
68: *
69: * Either meta symbols (\a \d or \p) or character strings or
70: * operations ( alternation or perenthesizing ) can be
71: * specified. Each starts with a descriptor byte. The descriptor
72: * byte has STR set for strings, META set for meta symbols
73: * and OPER set for operations.
74: * The descriptor byte can also have the OPT bit set if the object
75: * defined is optional. Also ALT can be set to indicate an alternation.
76: *
77: * For metasymbols the byte following the descriptor byte identities
78: * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
79: * strings the byte after the descriptor is a character count for
80: * the string:
81: *
82: * meta symbols := descriptor
83: * symbol
84: *
85: * strings := descriptor
86: * character count
87: * the string
88: *
89: * operatins := descriptor
90: * symbol
91: * character count
92: */
93:
94: /*
95: * handy macros for accessing parts of match blocks
96: */
97: #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
98: #define MNEXT(A) (A+2) /* character following a metasymbol block */
99:
100: #define OSYM(A) (*(A+1)) /* symbol in an operation block */
101: #define OCNT(A) (*(A+2)) /* character count */
102: #define ONEXT(A) (A+3) /* next character after the operation */
103: #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
104:
105: #define SCNT(A) (*(A+1)) /* byte count of a string */
106: #define SSTR(A) (A+2) /* address of the string */
107: #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
108:
109: /*
110: * bit flags in the descriptor
111: */
112: #define OPT 1
113: #define STR 2
114: #define META 4
115: #define ALT 8
116: #define OPER 16
117:
118: char *ure; /* pointer current position in unconverted exp */
119: char *ccre; /* pointer to current position in converted exp*/
120: char *malloc();
121:
122: char *
123: convexp(re)
124: char *re; /* unconverted irregular expression */
125: {
126: register char *cre; /* pointer to converted regular expression */
127:
128: /* allocate room for the converted expression */
129: if (re == NIL)
130: return (NIL);
131: if (*re == '\0')
132: return (NIL);
133: cre = malloc (4 * strlen(re) + 3);
134: ccre = cre;
135: ure = re;
136:
137: /* start the conversion with a \a */
138: *cre = META | OPT;
139: MSYM(cre) = 'a';
140: ccre = MNEXT(cre);
141:
142: /* start the conversion (its recursive) */
143: expconv ();
144: *ccre = 0;
145: return (cre);
146: }
147:
148: expconv()
149: {
150: register char *cs; /* pointer to current symbol in converted exp */
151: register char c; /* character being processed */
152: register char *acs; /* pinter to last alternate */
153: register int temp;
154:
155: /* let the conversion begin */
156: acs = NIL;
157: cs = NIL;
158: while (*ure != NIL) {
159: switch (c = *ure++) {
160:
161: case '\\':
162: switch (c = *ure++) {
163:
164: /* escaped characters are just characters */
165: default:
166: if (cs == NIL || (*cs & STR) == 0) {
167: cs = ccre;
168: *cs = STR;
169: SCNT(cs) = 1;
170: ccre += 2;
171: } else
172: SCNT(cs)++;
173: *ccre++ = c;
174: break;
175:
176: /* normal(?) metacharacters */
177: case 'a':
178: case 'd':
179: case 'e':
180: case 'p':
181: if (acs != NIL && acs != cs) {
182: do {
183: temp = OCNT(acs);
184: OCNT(acs) = ccre - acs;
185: acs -= temp;
186: } while (temp != 0);
187: acs = NIL;
188: }
189: cs = ccre;
190: *cs = META;
191: MSYM(cs) = c;
192: ccre = MNEXT(cs);
193: break;
194: }
195: break;
196:
197: /* just put the symbol in */
198: case '^':
199: case '$':
200: if (acs != NIL && acs != cs) {
201: do {
202: temp = OCNT(acs);
203: OCNT(acs) = ccre - acs;
204: acs -= temp;
205: } while (temp != 0);
206: acs = NIL;
207: }
208: cs = ccre;
209: *cs = META;
210: MSYM(cs) = c;
211: ccre = MNEXT(cs);
212: break;
213:
214: /* mark the last match sequence as optional */
215: case '?':
216: if (cs)
217: *cs = *cs | OPT;
218: break;
219:
220: /* recurse and define a subexpression */
221: case '(':
222: if (acs != NIL && acs != cs) {
223: do {
224: temp = OCNT(acs);
225: OCNT(acs) = ccre - acs;
226: acs -= temp;
227: } while (temp != 0);
228: acs = NIL;
229: }
230: cs = ccre;
231: *cs = OPER;
232: OSYM(cs) = '(';
233: ccre = ONEXT(cs);
234: expconv ();
235: OCNT(cs) = ccre - cs; /* offset to next symbol */
236: break;
237:
238: /* return from a recursion */
239: case ')':
240: if (acs != NIL) {
241: do {
242: temp = OCNT(acs);
243: OCNT(acs) = ccre - acs;
244: acs -= temp;
245: } while (temp != 0);
246: acs = NIL;
247: }
248: cs = ccre;
249: *cs = META;
250: MSYM(cs) = c;
251: ccre = MNEXT(cs);
252: return;
253:
254: /* mark the last match sequence as having an alternate */
255: /* the third byte will contain an offset to jump over the */
256: /* alternate match in case the first did not fail */
257: case '|':
258: if (acs != NIL && acs != cs)
259: OCNT(ccre) = ccre - acs; /* make a back pointer */
260: else
261: OCNT(ccre) = 0;
262: *cs |= ALT;
263: cs = ccre;
264: *cs = OPER;
265: OSYM(cs) = '|';
266: ccre = ONEXT(cs);
267: acs = cs; /* remember that the pointer is to be filles */
268: break;
269:
270: /* if its not a metasymbol just build a scharacter string */
271: default:
272: if (cs == NIL || (*cs & STR) == 0) {
273: cs = ccre;
274: *cs = STR;
275: SCNT(cs) = 1;
276: ccre = SSTR(cs);
277: } else
278: SCNT(cs)++;
279: *ccre++ = c;
280: break;
281: }
282: }
283: if (acs != NIL) {
284: do {
285: temp = OCNT(acs);
286: OCNT(acs) = ccre - acs;
287: acs -= temp;
288: } while (temp != 0);
289: acs = NIL;
290: }
291: return;
292: }
293: /* end of convertre */
294:
295:
296: /*
297: * The following routine recognises an irregular expresion
298: * with the following special characters:
299: *
300: * \? - means last match was optional
301: * \a - matches any number of characters
302: * \d - matches any number of spaces and tabs
303: * \p - matches any number of alphanumeric
304: * characters. The
305: * characters matched will be copied into
306: * the area pointed to by 'name'.
307: * \| - alternation
308: * \( \) - grouping used mostly for alternation and
309: * optionality
310: *
311: * The irregular expression must be translated to internal form
312: * prior to calling this routine
313: *
314: * The value returned is the pointer to the first non \a
315: * character matched.
316: */
317:
318: boolean _escaped; /* true if we are currently _escaped */
319: char *_start; /* start of string */
320:
321: char *
322: expmatch (s, re, mstring)
323: register char *s; /* string to check for a match in */
324: register char *re; /* a converted irregular expression */
325: register char *mstring; /* where to put whatever matches a \p */
326: {
327: register char *cs; /* the current symbol */
328: register char *ptr,*s1; /* temporary pointer */
329: boolean matched; /* a temporary boolean */
330:
331: /* initial conditions */
332: if (re == NIL)
333: return (NIL);
334: cs = re;
335: matched = FALSE;
336:
337: /* loop till expression string is exhausted (or at least pretty tired) */
338: while (*cs) {
339: switch (*cs & (OPER | STR | META)) {
340:
341: /* try to match a string */
342: case STR:
343: matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
344: if (matched) {
345:
346: /* hoorah it matches */
347: s += SCNT(cs);
348: cs = SNEXT(cs);
349: } else if (*cs & ALT) {
350:
351: /* alternation, skip to next expression */
352: cs = SNEXT(cs);
353: } else if (*cs & OPT) {
354:
355: /* the match is optional */
356: cs = SNEXT(cs);
357: matched = 1; /* indicate a successful match */
358: } else {
359:
360: /* no match, error return */
361: return (NIL);
362: }
363: break;
364:
365: /* an operator, do something fancy */
366: case OPER:
367: switch (OSYM(cs)) {
368:
369: /* this is an alternation */
370: case '|':
371: if (matched)
372:
373: /* last thing in the alternation was a match, skip ahead */
374: cs = OPTR(cs);
375: else
376:
377: /* no match, keep trying */
378: cs = ONEXT(cs);
379: break;
380:
381: /* this is a grouping, recurse */
382: case '(':
383: ptr = expmatch (s, ONEXT(cs), mstring);
384: if (ptr != NIL) {
385:
386: /* the subexpression matched */
387: matched = 1;
388: s = ptr;
389: } else if (*cs & ALT) {
390:
391: /* alternation, skip to next expression */
392: matched = 0;
393: } else if (*cs & OPT) {
394:
395: /* the match is optional */
396: matched = 1; /* indicate a successful match */
397: } else {
398:
399: /* no match, error return */
400: return (NIL);
401: }
402: cs = OPTR(cs);
403: break;
404: }
405: break;
406:
407: /* try to match a metasymbol */
408: case META:
409: switch (MSYM(cs)) {
410:
411: /* try to match anything and remember what was matched */
412: case 'p':
413: /*
414: * This is really the same as trying the match the
415: * remaining parts of the expression to any subset
416: * of the string.
417: */
418: s1 = s;
419: do {
420: ptr = expmatch (s1, MNEXT(cs), mstring);
421: if (ptr != NIL && s1 != s) {
422:
423: /* we have a match, remember the match */
424: strncpy (mstring, s, s1 - s);
425: mstring[s1 - s] = '\0';
426: return (ptr);
427: } else if (ptr != NIL && (*cs & OPT)) {
428:
429: /* it was aoptional so no match is ok */
430: return (ptr);
431: } else if (ptr != NIL) {
432:
433: /* not optional and we still matched */
434: return (NIL);
435: }
436: if (!isalnum(*s1) && *s1 != '_')
437: return (NIL);
438: if (*s1 == '\\')
439: _escaped = _escaped ? FALSE : TRUE;
440: else
441: _escaped = FALSE;
442: } while (*s1++);
443: return (NIL);
444:
445: /* try to match anything */
446: case 'a':
447: /*
448: * This is really the same as trying the match the
449: * remaining parts of the expression to any subset
450: * of the string.
451: */
452: s1 = s;
453: do {
454: ptr = expmatch (s1, MNEXT(cs), mstring);
455: if (ptr != NIL && s1 != s) {
456:
457: /* we have a match */
458: return (ptr);
459: } else if (ptr != NIL && (*cs & OPT)) {
460:
461: /* it was aoptional so no match is ok */
462: return (ptr);
463: } else if (ptr != NIL) {
464:
465: /* not optional and we still matched */
466: return (NIL);
467: }
468: if (*s1 == '\\')
469: _escaped = _escaped ? FALSE : TRUE;
470: else
471: _escaped = FALSE;
472: } while (*s1++);
473: return (NIL);
474:
475: /* fail if we are currently _escaped */
476: case 'e':
477: if (_escaped)
478: return(NIL);
479: cs = MNEXT(cs);
480: break;
481:
482: /* match any number of tabs and spaces */
483: case 'd':
484: ptr = s;
485: while (*s == ' ' || *s == '\t')
486: s++;
487: if (s != ptr || s == _start) {
488:
489: /* match, be happy */
490: matched = 1;
491: cs = MNEXT(cs);
492: } else if (*s == '\n' || *s == '\0') {
493:
494: /* match, be happy */
495: matched = 1;
496: cs = MNEXT(cs);
497: } else if (*cs & ALT) {
498:
499: /* try the next part */
500: matched = 0;
501: cs = MNEXT(cs);
502: } else if (*cs & OPT) {
503:
504: /* doesn't matter */
505: matched = 1;
506: cs = MNEXT(cs);
507: } else
508:
509: /* no match, error return */
510: return (NIL);
511: break;
512:
513: /* check for end of line */
514: case '$':
515: if (*s == '\0' || *s == '\n') {
516:
517: /* match, be happy */
518: s++;
519: matched = 1;
520: cs = MNEXT(cs);
521: } else if (*cs & ALT) {
522:
523: /* try the next part */
524: matched = 0;
525: cs = MNEXT(cs);
526: } else if (*cs & OPT) {
527:
528: /* doesn't matter */
529: matched = 1;
530: cs = MNEXT(cs);
531: } else
532:
533: /* no match, error return */
534: return (NIL);
535: break;
536:
537: /* check for start of line */
538: case '^':
539: if (s == _start) {
540:
541: /* match, be happy */
542: matched = 1;
543: cs = MNEXT(cs);
544: } else if (*cs & ALT) {
545:
546: /* try the next part */
547: matched = 0;
548: cs = MNEXT(cs);
549: } else if (*cs & OPT) {
550:
551: /* doesn't matter */
552: matched = 1;
553: cs = MNEXT(cs);
554: } else
555:
556: /* no match, error return */
557: return (NIL);
558: break;
559:
560: /* end of a subexpression, return success */
561: case ')':
562: return (s);
563: }
564: break;
565: }
566: }
567: return (s);
568: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.