|
|
1.1 root 1: /* $Header: search.c,v 4.3 85/05/01 11:50:16 lwall Exp $
2: *
3: * $Log: search.c,v $
4: * Revision 4.3 85/05/01 11:50:16 lwall
5: * Baseline for release with 4.3bsd.
6: *
7: */
8:
9: /* string search routines */
10:
11: /* Copyright (c) 1981,1980 James Gosling */
12:
13: /* Modified Aug. 12, 1981 by Tom London to include regular expressions
14: as in ed. RE stuff hacked over by jag to correct a few major problems,
15: mainly dealing with searching within the buffer rather than copying
16: each line to a separate array. Newlines can now appear in RE's */
17:
18: /* Ripped to shreds and glued back together to make a search package,
19: * July 6, 1984, by Larry Wall. (If it doesn't work, it's probably my fault.)
20: * Changes include:
21: * Buffer, window, and mlisp stuff gone.
22: * Translation tables reduced to 1 table.
23: * Expression buffer is now dynamically allocated.
24: * Character classes now implemented with a bitmap.
25: */
26:
27: #include "EXTERN.h"
28: #include "common.h"
29: #include "util.h"
30: #include "INTERN.h"
31: #include "search.h"
32:
33: #ifndef BITSPERBYTE
34: #define BITSPERBYTE 8
35: #endif
36:
37: #define BMAPSIZ (127 / BITSPERBYTE + 1)
38:
39: /* meta characters in the "compiled" form of a regular expression */
40: #define CBRA 2 /* \( -- begin bracket */
41: #define CCHR 4 /* a vanilla character */
42: #define CDOT 6 /* . -- match anything except a newline */
43: #define CCL 8 /* [...] -- character class */
44: #define NCCL 10 /* [^...] -- negated character class */
45: #define CDOL 12 /* $ -- matches the end of a line */
46: #define CEND 14 /* The end of the pattern */
47: #define CKET 16 /* \) -- close bracket */
48: #define CBACK 18 /* \N -- backreference to the Nth bracketed
49: string */
50: #define CIRC 20 /* ^ matches the beginning of a line */
51:
52: #define WORD 32 /* matches word character \w */
53: #define NWORD 34 /* matches non-word characer \W */
54: #define WBOUND 36 /* matches word boundary \b */
55: #define NWBOUND 38 /* matches non-(word boundary) \B */
56:
57: #define STAR 01 /* * -- Kleene star, repeats the previous
58: REas many times as possible; the value
59: ORs with the other operator types */
60:
61: #define ASCSIZ 0200
62: typedef char TRANSTABLE[ASCSIZ];
63:
64: static TRANSTABLE trans = {
65: 0000,0001,0002,0003,0004,0005,0006,0007,
66: 0010,0011,0012,0013,0014,0015,0016,0017,
67: 0020,0021,0022,0023,0024,0025,0026,0027,
68: 0030,0031,0032,0033,0034,0035,0036,0037,
69: 0040,0041,0042,0043,0044,0045,0046,0047,
70: 0050,0051,0052,0053,0054,0055,0056,0057,
71: 0060,0061,0062,0063,0064,0065,0066,0067,
72: 0070,0071,0072,0073,0074,0075,0076,0077,
73: 0100,0101,0102,0103,0104,0105,0106,0107,
74: 0110,0111,0112,0113,0114,0115,0116,0117,
75: 0120,0121,0122,0123,0124,0125,0126,0127,
76: 0130,0131,0132,0133,0134,0135,0136,0137,
77: 0140,0141,0142,0143,0144,0145,0146,0147,
78: 0150,0151,0152,0153,0154,0155,0156,0157,
79: 0160,0161,0162,0163,0164,0165,0166,0167,
80: 0170,0171,0172,0173,0174,0175,0176,0177,
81: };
82: static bool folding = FALSE;
83:
84: static int err;
85: static char *FirstCharacter;
86:
87: void
88: search_init()
89: {
90: #ifdef UNDEF
91: register int i;
92:
93: for (i = 0; i < ASCSIZ; i++)
94: trans[i] = i;
95: #else
96: ;
97: #endif
98: }
99:
100: void
101: init_compex(compex)
102: register COMPEX *compex;
103: {
104: /* the following must start off zeroed */
105:
106: compex->eblen = 0;
107: compex->brastr = Nullch;
108: }
109:
110: void
111: free_compex(compex)
112: register COMPEX *compex;
113: {
114: if (compex->eblen) {
115: free(compex->expbuf);
116: compex->eblen = 0;
117: }
118: if (compex->brastr) {
119: free(compex->brastr);
120: compex->brastr = Nullch;
121: }
122: }
123:
124: static char *gbr_str = Nullch;
125: static int gbr_siz = 0;
126:
127: char *
128: getbracket(compex,n)
129: register COMPEX *compex;
130: int n;
131: {
132: int length = compex->braelist[n] - compex->braslist[n];
133:
134: if (!compex->nbra || n > compex->nbra || !compex->braelist[n] || length<0)
135: return nullstr;
136: growstr(&gbr_str, &gbr_siz, length+1);
137: safecpy(gbr_str, compex->braslist[n], length+1);
138: return gbr_str;
139: }
140:
141: void
142: case_fold(which)
143: int which;
144: {
145: register int i;
146:
147: if (which != folding) {
148: if (which) {
149: for (i = 'A'; i <= 'Z'; i++)
150: trans[i] = tolower(i);
151: }
152: else {
153: for (i = 'A'; i <= 'Z'; i++)
154: trans[i] = i;
155: }
156: folding = which;
157: }
158: }
159:
160: /* Compile the given regular expression into a [secret] internal format */
161:
162: char *
163: compile (compex, strp, RE, fold)
164: register COMPEX *compex;
165: register char *strp;
166: int RE;
167: int fold;
168: {
169: register int c;
170: register char *ep;
171: char *lastep;
172: char bracket[NBRA],
173: *bracketp;
174: char **alt = compex->alternatives;
175: char *retmes = "Badly formed search string";
176:
177: case_fold(compex->do_folding = fold);
178: if (!compex->eblen) {
179: compex->expbuf = safemalloc(84);
180: compex->eblen = 80;
181: }
182: ep = compex->expbuf; /* point at expression buffer */
183: *alt++ = ep; /* first alternative starts here */
184: bracketp = bracket; /* first bracket goes here */
185: if (*strp == 0) { /* nothing to compile? */
186: if (*ep == 0) /* nothing there yet? */
187: return "Null search string";
188: return Nullch; /* just keep old expression */
189: }
190: compex->nbra = 0; /* no brackets yet */
191: lastep = 0;
192: for (;;) {
193: if (ep - compex->expbuf >= compex->eblen)
194: grow_eb(compex);
195: c = *strp++; /* fetch next char of pattern */
196: if (c == 0) { /* end of pattern? */
197: if (bracketp != bracket) { /* balanced brackets? */
198: #ifdef VERBOSE
199: retmes = "Unbalanced parens";
200: #endif
201: goto cerror;
202: }
203: *ep++ = CEND; /* terminate expression */
204: *alt++ = 0; /* terminal alternative list */
205: /*
206: compex->eblen = ep - compex->expbuf + 1;
207: compex->expbuf = saferealloc(compex->expbuf,compex->eblen+4); */
208: return Nullch; /* return success */
209: }
210: if (c != '*')
211: lastep = ep;
212: if (!RE) { /* just a normal search string? */
213: *ep++ = CCHR; /* everything is a normal char */
214: *ep++ = c;
215: }
216: else /* it is a regular expression */
217: switch (c) {
218:
219: case '\\': /* meta something */
220: switch (c = *strp++) {
221: case '(':
222: if (compex->nbra >= NBRA) {
223: #ifdef VERBOSE
224: retmes = "Too many parens";
225: #endif
226: goto cerror;
227: }
228: *bracketp++ = ++compex->nbra;
229: *ep++ = CBRA;
230: *ep++ = compex->nbra;
231: break;
232: case '|':
233: if (bracketp>bracket) {
234: #ifdef VERBOSE
235: retmes = "No \\| in parens"; /* Alas! */
236: #endif
237: goto cerror;
238: }
239: *ep++ = CEND;
240: *alt++ = ep;
241: break;
242: case ')':
243: if (bracketp <= bracket) {
244: #ifdef VERBOSE
245: retmes = "Unmatched right paren";
246: #endif
247: goto cerror;
248: }
249: *ep++ = CKET;
250: *ep++ = *--bracketp;
251: break;
252: case 'w':
253: *ep++ = WORD;
254: break;
255: case 'W':
256: *ep++ = NWORD;
257: break;
258: case 'b':
259: *ep++ = WBOUND;
260: break;
261: case 'B':
262: *ep++ = NWBOUND;
263: break;
264: case '0': case '1': case '2': case '3': case '4':
265: case '5': case '6': case '7': case '8': case '9':
266: *ep++ = CBACK;
267: *ep++ = c - '0';
268: break;
269: default:
270: *ep++ = CCHR;
271: if (c == '\0')
272: goto cerror;
273: *ep++ = c;
274: break;
275: }
276: break;
277: case '.':
278: *ep++ = CDOT;
279: continue;
280:
281: case '*':
282: if (lastep == 0 || *lastep == CBRA || *lastep == CKET
283: || *lastep == CIRC
284: || (*lastep&STAR)|| *lastep>NWORD)
285: goto defchar;
286: *lastep |= STAR;
287: continue;
288:
289: case '^':
290: if (ep != compex->expbuf && ep[-1] != CEND)
291: goto defchar;
292: *ep++ = CIRC;
293: continue;
294:
295: case '$':
296: if (*strp != 0 && (*strp != '\\' || strp[1] != '|'))
297: goto defchar;
298: *ep++ = CDOL;
299: continue;
300:
301: case '[': { /* character class */
302: register int i;
303:
304: if (ep - compex->expbuf >= compex->eblen - BMAPSIZ)
305: grow_eb(compex); /* reserve bitmap */
306: for (i = BMAPSIZ; i; --i)
307: ep[i] = 0;
308:
309: if ((c = *strp++) == '^') {
310: c = *strp++;
311: *ep++ = NCCL; /* negated */
312: }
313: else
314: *ep++ = CCL; /* normal */
315:
316: i = 0; /* remember oldchar */
317: do {
318: if (c == '\0') {
319: #ifdef VERBOSE
320: retmes = "Missing ]";
321: #endif
322: goto cerror;
323: }
324: if (*strp == '-' && *(++strp))
325: i = *strp++;
326: else
327: i = c;
328: while (c <= i) {
329: ep[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE);
330: if (fold && isalpha(c))
331: ep[(c ^ 32) / BITSPERBYTE] |=
332: 1 << ((c ^ 32) % BITSPERBYTE);
333: /* set the other bit too */
334: c++;
335: }
336: } while ((c = *strp++) != ']');
337: ep += BMAPSIZ;
338: continue;
339: }
340:
341: defchar:
342: default:
343: *ep++ = CCHR;
344: *ep++ = c;
345: }
346: }
347: cerror:
348: compex->expbuf[0] = 0;
349: compex->nbra = 0;
350: return retmes;
351: }
352:
353: void
354: grow_eb(compex)
355: register COMPEX *compex;
356: {
357: compex->eblen += 80;
358: compex->expbuf = saferealloc(compex->expbuf, (MEM_SIZE)compex->eblen + 4);
359: }
360:
361: char *
362: execute (compex, addr)
363: register COMPEX *compex;
364: char *addr;
365: {
366: register char *p1 = addr;
367: register char *trt = trans;
368: register int c;
369:
370: if (addr == Nullch)
371: return Nullch;
372: if (compex->nbra) { /* any brackets? */
373: for (c = 0; c <= compex->nbra; c++)
374: compex->braslist[c] = compex->braelist[c] = Nullch;
375: if (compex->brastr)
376: free(compex->brastr);
377: compex->brastr = savestr(p1); /* in case p1 is not static */
378: p1 = compex->brastr; /* ! */
379: }
380: case_fold(compex->do_folding); /* make sure table is correct */
381: FirstCharacter = p1; /* for ^ tests */
382: if (compex->expbuf[0] == CCHR && !compex->alternatives[1]) {
383: c = trt[compex->expbuf[1]]; /* fast check for first character */
384: do {
385: if (trt[*p1] == c && advance (compex, p1, compex->expbuf))
386: return p1;
387: p1++;
388: } while (*p1 && !err);
389: return Nullch;
390: }
391: else { /* regular algorithm */
392: do {
393: register char **alt = compex->alternatives;
394: while (*alt) {
395: if (advance (compex, p1, *alt++))
396: return p1;
397: }
398: p1++;
399: } while (*p1 && !err);
400: return Nullch;
401: }
402: }
403:
404: /* advance the match of the regular expression starting at ep along the
405: string lp, simulates an NDFSA */
406: bool
407: advance (compex, lp, ep)
408: register COMPEX *compex;
409: register char *ep;
410: register char *lp;
411: {
412: register char *curlp;
413: register char *trt = trans;
414: register int i;
415:
416: while ((*ep & STAR) || *lp || *ep == CIRC || *ep == CKET)
417: switch (*ep++) {
418:
419: case CCHR:
420: if (trt[*ep++] != trt[*lp]) return FALSE;
421: lp++;
422: continue;
423:
424: case CDOT:
425: if (*lp == '\n') return FALSE;
426: lp++;
427: continue;
428:
429: case CDOL:
430: if (!*lp || *lp == '\n')
431: continue;
432: return FALSE;
433:
434: case CIRC:
435: if (lp == FirstCharacter || lp[-1]=='\n')
436: continue;
437: return FALSE;
438:
439: case WORD:
440: if (isalnum(*lp)) {
441: lp++;
442: continue;
443: }
444: return FALSE;
445:
446: case NWORD:
447: if (!isalnum(*lp)) {
448: lp++;
449: continue;
450: }
451: return FALSE;
452:
453: case WBOUND:
454: if ((lp == FirstCharacter || !isalnum(lp[-1])) !=
455: (!*lp || !isalnum(*lp)) )
456: continue;
457: return FALSE;
458:
459: case NWBOUND:
460: if ((lp == FirstCharacter || !isalnum(lp[-1])) ==
461: (!*lp || !isalnum(*lp)))
462: continue;
463: return FALSE;
464:
465: case CEND:
466: return TRUE;
467:
468: case CCL:
469: if (cclass (ep, *lp, 1)) {
470: ep += BMAPSIZ;
471: lp++;
472: continue;
473: }
474: return FALSE;
475:
476: case NCCL:
477: if (cclass (ep, *lp, 0)) {
478: ep += BMAPSIZ;
479: lp++;
480: continue;
481: }
482: return FALSE;
483:
484: case CBRA:
485: compex->braslist[*ep++] = lp;
486: continue;
487:
488: case CKET:
489: i = *ep++;
490: compex->braelist[i] = lp;
491: compex->braelist[0] = lp;
492: compex->braslist[0] = compex->braslist[i];
493: continue;
494:
495: case CBACK:
496: if (compex->braelist[i = *ep++] == 0) {
497: fputs("bad braces\n",stdout) FLUSH;
498: err = TRUE;
499: return FALSE;
500: }
501: if (backref (compex, i, lp)) {
502: lp += compex->braelist[i] - compex->braslist[i];
503: continue;
504: }
505: return FALSE;
506:
507: case CBACK | STAR:
508: if (compex->braelist[i = *ep++] == 0) {
509: fputs("bad braces\n",stdout) FLUSH;
510: err = TRUE;
511: return FALSE;
512: }
513: curlp = lp;
514: while (backref (compex, i, lp)) {
515: lp += compex->braelist[i] - compex->braslist[i];
516: }
517: while (lp >= curlp) {
518: if (advance (compex, lp, ep))
519: return TRUE;
520: lp -= compex->braelist[i] - compex->braslist[i];
521: }
522: continue;
523:
524: case CDOT | STAR:
525: curlp = lp;
526: while (*lp++ && lp[-1] != '\n');
527: goto star;
528:
529: case WORD | STAR:
530: curlp = lp;
531: while (*lp++ && isalnum(lp[-1]));
532: goto star;
533:
534: case NWORD | STAR:
535: curlp = lp;
536: while (*lp++ && !isalnum(lp[-1]));
537: goto star;
538:
539: case CCHR | STAR:
540: curlp = lp;
541: while (*lp++ && trt[lp[-1]] == trt[*ep]);
542: ep++;
543: goto star;
544:
545: case CCL | STAR:
546: case NCCL | STAR:
547: curlp = lp;
548: while (*lp++ && cclass (ep, lp[-1], ep[-1] == (CCL | STAR)));
549: ep += BMAPSIZ;
550: goto star;
551:
552: star:
553: do {
554: lp--;
555: if (advance (compex, lp, ep))
556: return TRUE;
557: } while (lp > curlp);
558: return FALSE;
559:
560: default:
561: fputs("Badly compiled pattern\n",stdout) FLUSH;
562: err = TRUE;
563: return -1;
564: }
565: if (*ep == CEND || *ep == CDOL) {
566: return TRUE;
567: }
568: return FALSE;
569: }
570:
571: bool
572: backref (compex, i, lp)
573: register COMPEX *compex;
574: register int i;
575: register char *lp;
576: {
577: register char *bp;
578:
579: bp = compex->braslist[i];
580: while (*lp && *bp == *lp) {
581: bp++;
582: lp++;
583: if (bp >= compex->braelist[i])
584: return TRUE;
585: }
586: return FALSE;
587: }
588:
589: bool
590: cclass (set, c, af)
591: register char *set;
592: register int c;
593: {
594: c &= 0177;
595: #if BITSPERBYTE == 8
596: if (set[c >> 3] & 1 << (c & 7))
597: #else
598: if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE))
599: #endif
600: return af;
601: return !af;
602: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.