|
|
1.1 root 1: #include "re.h"
2: #include "lre.h"
3: #include "hdr.h"
4:
5: int re_debug = 0;
6:
7: #define RESET(r, ps) COPY(r, ps, begin)
8: #define SET(r, ps, n) {if(r->ps.base[n] == 0) r->ps.count++, r->ps.base[n] = r->ps.last, r->ps.last = n; }
9: #define GET(ps, n) for(n = ps.last; n > 0; n = ps.base[n])
10: #define COPY(r, to, from) memmove((char *)r->to.base, (char *)r->from.base, r->maxid*sizeof(int)), r->to.count = r->from.count, r->to.last = r->from.last
11:
12: static State *addstate(re_re *r, Positionset *ps);
13: static int first(re_re *, Expr *);
14: static int match(re_re *, Expr *, int);
15: static State *nextstate(re_re *, State *, int);
16: #ifdef DEBUG
17: static void ppr(Positionset *, char *);
18: #endif
19:
20: typedef enum { NORMAL, ACCEPT, ACCEPT_EOP, FAIL } Out;
21:
22: static void
23: eptr(register re_re *r, register Expr *e)
24: {
25: if((e->id < 0) || (e->id >= r->maxid)){
26: EPR "eptr abort: r=%ld[maxid=%d] e=%ld[id=%d]\n", r, r->maxid, e, e->id);
27: abort();
28: }
29: r->ptr[e->id] = e;
30: if((e->type != Charclass) && (e->type != Compcharclass)){
31: if(e->l)
32: eptr(r, e->l);
33: if(e->r)
34: eptr(r, e->r);
35: }
36: }
37:
38: re_re *
39: egprep(enum Parsetype parser, unsigned char *b, unsigned char *e, unsigned char *map, int dotstar)
40: {
41: register re_re *r;
42: Expr *ecomp;
43: int i;
44:
45: r = (re_re *)egmalloc(sizeof (re_re), "allocating re_re");
46: if(r == 0)
47: return 0;
48: memset((char *)r, 0, sizeof (re_re));
49: eg_lexinit((char *)b, (char *)e);
50: if(map)
51: memmove(r->mymap, (char *)map, 256);
52: else
53: for(i = 0; i < 256; i++)
54: r->mymap[i] = i;
55: eg_lex();
56: if(dotstar){
57: ecomp = eg_newexpr(Star, 0, eg_newexpr(Dot, '.', (Expr *)0, (Expr *)0), (Expr *)0);
58: ecomp = eg_newexpr(Cat, 0, ecomp, eg_eall(parser, r->mymap));
59: } else
60: ecomp = eg_eall(parser, r->mymap);
61: r->root = eg_newexpr(EOP, '#', ecomp, (Expr *)0);
62: egpost(r);
63: r->carat = 0;
64: if(r->backref || r->parens)
65: egbr(r);
66: else
67: eginit(r, dotstar);
68: return(r);
69: }
70:
71: void
72: eginit(register re_re *r, int dotstar)
73: {
74: int n;
75:
76: #ifdef DEBUG
77: if(TRACE(6))
78: PR "eginit(r=%d, dotstar=%d)\n", r, dotstar);
79: #endif
80: #ifdef DEBUG
81: if(TRACE(10)){
82: char buf[EPRINTSIZE];
83: eg_epr(r->root, buf, 0);
84: PR "eginit: r=%d expr='%s'\n", r, buf);
85: }
86: #endif
87: r->ptr = (Expr **)egmalloc(r->maxid*sizeof(Expr *), "ptr");
88: if (!r->ptr)
89: return;
90: eptr(r, r->root);
91: r->firstpos.base = (int *)egmalloc(n = r->maxid*sizeof(int), "first base");
92: if (!r->firstpos.base){
93: free((char*)r->ptr);
94: return;
95: }
96: r->begin.base = (int *)egmalloc(n, "begin base");
97: if (!r->begin.base){
98: free((char*)r->firstpos.base);
99: free((char*)r->ptr);
100: return;
101: }
102: r->tmp.base = (int *)egmalloc(n, "tmp base");
103: if (!r->tmp.base){
104: free((char*)r->begin.base);
105: free((char*)r->firstpos.base);
106: free((char*)r->ptr);
107: return;
108: }
109: memset((char *)r->begin.base, 0, n);
110: r->begin.count = 0;
111: r->begin.last = -1;
112: r->carat = dotstar;
113: RESET(r, firstpos);
114: if(first(r, r->root->l) == 0){
115: /*
116: nullable, so include me
117: */
118: SET(r, firstpos, r->root->id)
119: }
120: if(dotstar)
121: COPY(r, begin, firstpos);
122: eg_stateinit(r);
123: eg_clrstates(r);
124: eg_posinit(r);
125: (void)addstate(r, &r->firstpos);
126: eg_savestate(r, dotstar? nextstate(r, r->states, RE_CARAT):r->states);
127: eg_posset(r);
128: }
129:
130: unsigned char *
131: eg_quickmatch(register re_re *r, register unsigned char *b, register unsigned char *e, int endpts)
132: {
133: register State *s, *t;
134:
135: #ifdef DEBUG
136: if(TRACE(10)){
137: char buf[EPRINTSIZE];
138: eg_epr(r->root, buf, 0);
139: PR "qm: r=%d expr='%s' endpts=%d\n", r, buf, endpts);
140: }
141: #endif
142: s = eg_startstate(r);
143: if(endpts&RE_BEG){
144: if(!r->carat){
145: if(t = s->tab[RE_CARAT])
146: s = t;
147: else
148: s = nextstate(r, s, RE_CARAT);
149: if(s->out == FAIL)
150: s = eg_startstate(r);
151: }
152: if(s->out){
153: #ifdef DEBUG
154: if(TRACE(6))
155: PR "match at ^: '%s' out=%d\n", b, s->out);
156: #endif
157: return((s->out == FAIL)? 0:b);
158: }
159: }
160: while(b < e){
161: #ifdef DEBUG
162: if(TRACE(4))
163: PR "state %d@%d[%d pos]: char '%c'\n", s-r->states, (int)s, s->npos, *b);
164: #endif
165: if(t = s->tab[*(unsigned char *)b])
166: s = t;
167: else
168: s = nextstate(r, s, *(unsigned char *)b);
169: if(s->out){
170: #ifdef DEBUG
171: if(TRACE(6))
172: PR "match at input '%s' out=%d\n", b, s->out);
173: #endif
174: return((s->out == FAIL)? 0:b);
175: }
176: b++;
177: }
178: if(endpts&RE_END){
179: if(t = s->tab[RE_DOLLAR])
180: s = t;
181: else
182: s = nextstate(r, s, RE_DOLLAR);
183: }
184: if(s->out){
185: #ifdef DEBUG
186: if(TRACE(6))
187: PR "match at $ '%s' out=%d\n", b, s->out);
188: #endif
189: return((s->out == FAIL)? 0:b);
190: }
191: #ifdef DEBUG
192: if(TRACE(3)){
193: char buf[EPRINTSIZE];
194:
195: eg_epr(r->root, buf, 1);
196: PR "pat = %s\n", buf);
197: }
198: #endif
199: return(0);
200: }
201:
202: unsigned char *
203: eg_lquickmatch(register re_re *r, register unsigned char *b, register unsigned char *e, int endpts)
204: {
205: register State *s, *t;
206: int outedness = 0;
207:
208: #ifdef DEBUG
209: if(TRACE(10)){
210: char buf[EPRINTSIZE];
211: eg_epr(r->root, buf, 0);
212: PR "lqm: r=%d carat=%d expr='%s' endpts=%d\n", r, r->carat, buf, endpts);
213: }
214: #endif
215: s = eg_startstate(r);
216: if(endpts&RE_BEG){
217: if(!r->carat){
218: if(t = s->tab[RE_CARAT])
219: s = t;
220: else
221: s = nextstate(r, s, RE_CARAT);
222: if(s->out == FAIL)
223: s = eg_startstate(r);
224: }
225: if(s->out){
226: #ifdef DEBUG
227: if(TRACE(6))
228: PR "match at ^: '%s' out=%d\n", b, s->out);
229: #endif
230: if(s->out == ACCEPT_EOP)
231: return(b);
232: else if(s->out == FAIL)
233: return(0);
234: outedness = 1;
235: }
236: }
237: /*
238: look for first match
239: */
240: if(outedness == 0)
241: for(; b < e; b++){
242: #ifdef DEBUG
243: if(TRACE(4))
244: PR "state %d@%d[%d pos]: char '%c'\n", s-r->states, (int)s, s->npos, *b);
245: #endif
246: if(t = s->tab[*(unsigned char *)b])
247: s = t;
248: else
249: s = nextstate(r, s, *(unsigned char *)b);
250: if(s->out){
251: #ifdef DEBUG
252: if(TRACE(4))
253: PR " out=%d, state %d@%d\n", s->out,
254: s-r->states, (int)s);
255: #endif
256: b++;
257: if((s->out == ACCEPT_EOP) || (s->out == FAIL))
258: goto finish;
259: outedness = 1;
260: break;
261: }
262: }
263: /*
264: in the following loop, we have seen a match already
265: because only way outedness is zero is if b>=e
266: */
267: for(; b < e; b++){
268: #ifdef DEBUG
269: if(TRACE(4))
270: PR "statex %d@%d[%d pos]: char '%c'\n", s-r->states, (int)s, s->npos, *b);
271: #endif
272: if(t = s->tab[*(unsigned char *)b])
273: s = t;
274: else
275: s = nextstate(r, s, *(unsigned char *)b);
276: if(s->out == ACCEPT)
277: continue;
278: if((s->out == NORMAL) || (s->out == FAIL)){
279: #ifdef DEBUG
280: if(TRACE(6))
281: PR "unmatch at input '%s' out=%d\n", b, s->out);
282: #endif
283: return(b);
284: }
285: }
286: if(endpts&RE_END){
287: if(t = s->tab[RE_DOLLAR])
288: s = t;
289: else
290: s = nextstate(r, s, RE_DOLLAR);
291: }
292: finish:
293: if((s->out && (s->out != FAIL)) || outedness){
294: #ifdef DEBUG
295: if(TRACE(6))
296: PR "match at $ '%s' out=%d\n", b, s->out);
297: #endif
298: return(b);
299: }
300: #ifdef DEBUG
301: if(TRACE(3)){
302: char buf[EPRINTSIZE];
303:
304: eg_epr(r->root, buf, 1);
305: PR "lqm('%s') failed; out=%d, s->out=%d \n", buf, outedness, s->out);
306: }
307: #endif
308: return(0);
309: }
310:
311: static
312: match(register re_re *r, register Expr *e, int a)
313: {
314: switch(e->type)
315: {
316: case Dollar: return(a == RE_DOLLAR);
317: case Carat: return(a == RE_CARAT);
318: case Dot: return(a < 256);
319: case Literal: return(r->mymap[a] == r->mymap[e->lit]);
320: case Charclass: if(a >= 256) return(0);
321: return(memchr((char *)e->r, r->mymap[a], (int)e->l) != 0);
322: case Compcharclass:
323: if(a >= 256) return(0);
324: return(memchr((char *)e->r, r->mymap[a], (int)e->l) == 0);
325: }
326: return(0);
327: }
328:
329: /*
330: generates the followset for a node in firstpos
331: */
332:
333: static void
334: follow(register re_re *r, register Expr *e)
335: {
336: register Expr *p;
337:
338: if(e->type == EOP)
339: return;
340: else
341: p = e->parent;
342: switch(p->type)
343: {
344: case EOP:
345: SET(r, firstpos, p->id)
346: break;
347: case Plus:
348: case Star:
349: (void)first(r, e);
350: follow(r, p);
351: break;
352: case Quest:
353: case Alternate:
354: follow(r, p);
355: break;
356: case Cat:
357: if(e == p->l){
358: if(first(r, p->r) == 0){
359: follow(r, p);
360: break;
361: }
362: } else
363: follow(r, p);
364: break;
365: case Group:
366: follow(r, p);
367: break;
368: }
369: }
370:
371: /*
372: first returns 0 if e is nullable and in the process,
373: sets up firstpos.
374: */
375:
376: static
377: first(register re_re *r, register Expr *e)
378: {
379: int k;
380:
381: switch(e->type)
382: {
383: case Carat:
384: case Dollar:
385: case Literal:
386: case Dot:
387: case Charclass:
388: case Compcharclass:
389: SET(r, firstpos, e->id)
390: return(1);
391: case EOP:
392: case Star:
393: case Quest:
394: (void)first(r, e->l);
395: return(0);
396: case Group:
397: case Plus:
398: return(first(r, e->l));
399: case Cat:
400: return(first(r, e->l) || first(r, e->r));
401: case Alternate:
402: k = first(r, e->r);
403: return(first(r, e->l) && k);
404: default:
405: EPR "bad type %d\n", e->type);
406: abort();
407: return(0);
408: }
409: }
410:
411: static void
412: efollow(register re_re *r, register Expr *e)
413: {
414: register i, *p;
415:
416: RESET(r, firstpos);
417: follow(r, e);
418: e->flen = r->firstpos.count;
419: e->follow = (int *)malloc((unsigned)(e->flen*sizeof(int)));
420: if(e->follow == 0){
421: char buf[100];
422: SPR buf, "efollow malloc fail(%ld)\n", e->flen);
423: re_error(buf);
424: return;
425: }
426: p = e->follow;
427: GET(r->firstpos, i)
428: *p++ = i;
429: if(p != e->follow+e->flen){
430: char buf[100];
431: SPR buf, "efollow length error: %d %ld\n", p-e->follow, e->flen);
432: re_error(buf);
433: return;
434: }
435: }
436:
437: static State *
438: addstate(register re_re *r, register Positionset *ps)
439: {
440: register *p, j;
441: register State *s;
442:
443: s = r->states + eg_getstate(r);
444: memset((char *)s->tab, 0, sizeof(s->tab));
445: s->pos = eg_posalloc(r, (int)ps->count);
446: s->npos = ps->count;
447: p = r->posbase+s->pos;
448: GET((*ps), j)
449: *p++ = j;
450: if(s->out = (ps->base[r->root->id] != 0)){
451: if((ps->base[r->root->id] <= 0) && (ps->last == r->root->id))
452: s->out = ACCEPT_EOP; /* marker for only the EOP marker */
453: else
454: s->out = ACCEPT;
455: }
456: if(s->npos == 0)
457: s->out = FAIL;
458: #ifdef DEBUG
459: if(TRACE(3)){
460: char buf[2000];
461: eg_spr(s->npos, s->pos+r->posbase, buf);
462: PR "new state[%d]@%d %s,pos=%d out=%d\n", s-r->states, s, buf, s->pos, s->out);
463: }
464: #endif
465: return(s);
466: }
467:
468: static State *
469: nextstate(register re_re *r, register State *s, int a)
470: {
471: register int p, *q, *eq;
472: register long i;
473: State *news;
474: Expr *e;
475:
476: RESET(r, tmp);
477: #ifdef DEBUG
478: if(TRACE(5)){
479: char buf[2000];
480: ppr(&r->tmp, buf);
481: PR "nextstate: reset: %s\n", buf);
482: }
483: #endif
484: for(i = s->npos, p = s->pos; i > 0; i--){
485: e = r->ptr[r->posbase[p++]];
486: #ifdef DEBUG
487: if(TRACE(11)){
488: PR "i=%d e->type=%d\n", i, e->type);
489: }
490: #endif
491: if(e->type == EOP) continue;
492: if(match(r, e, a)){
493: #ifdef DEBUG
494: if(TRACE(11)){PR "matched %c\n", a);}
495: #endif
496: if(e->follow == 0)
497: efollow(r, e);
498: for(q = e->follow, eq = q+e->flen; q < eq; q++){
499: SET(r, tmp, *q)
500: }
501: }
502: }
503: #ifdef DEBUG
504: if(TRACE(5)){
505: char buf[2000];
506: ppr(&r->tmp, buf);
507: PR "nextstate(%d, '%c'): found %s\n", s-r->states, a, buf);
508: }
509: #endif
510: if(news = eg_stateof(r, &r->tmp)){
511: if(a <= RE_DOLLAR)
512: s->tab[a] = news;
513: } else
514: news = addstate(r, &r->tmp);
515: #ifdef DEBUG
516: if(TRACE(5)){
517: PR "nextstate(%d, '%c'): returning %ld %d out=%d\n", s-r->states, a,
518: (long)news, news-r->states, news->out);
519: }
520: #endif
521: return(news);
522: }
523:
524: void *
525: egmalloc(int n, char *s)
526: {
527: char *x;
528: char buf[256];
529:
530: if((x = malloc(n)) == 0){
531: SPR buf, "malloc fail(%d): %s", n, s);
532: re_error(buf);
533: return 0;
534: }
535: return((void *)x);
536: }
537:
538: void *
539: egrealloc(char *p, int n, char *s)
540: {
541: char *x;
542: char buf[256];
543:
544: if((x = realloc(p, n)) == 0){
545: SPR buf, "realloc fail(%d): %s", n, s);
546: re_error(buf);
547: return 0;
548: }
549: return((void *)x);
550: }
551:
552: #ifdef DEBUG
553: static void
554: ppr(register Positionset *ps, register char *p)
555: {
556: register n;
557:
558: if(ps->count < 1){
559: SPR p, "{}");
560: return;
561: }
562: *p++ = '{';
563: GET((*ps), n){
564: SPR p, "%d[=%d],", n, ps->base[n]);
565: p = strchr(p, 0);
566: }
567: p[-1] = '}';
568: }
569: #endif
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.