|
|
researchv10 Norman
#include "re.h"
#include "lre.h"
#include "hdr.h"
int re_debug = 0;
#define RESET(r, ps) COPY(r, ps, begin)
#define SET(r, ps, n) {if(r->ps.base[n] == 0) r->ps.count++, r->ps.base[n] = r->ps.last, r->ps.last = n; }
#define GET(ps, n) for(n = ps.last; n > 0; n = ps.base[n])
#define COPY(r, to, from) memmove((char *)r->to.base, (char *)r->from.base, r->maxid*sizeof(int)), r->to.count = r->from.count, r->to.last = r->from.last
static State *addstate(re_re *r, Positionset *ps);
static int first(re_re *, Expr *);
static int match(re_re *, Expr *, int);
static State *nextstate(re_re *, State *, int);
#ifdef DEBUG
static void ppr(Positionset *, char *);
#endif
typedef enum { NORMAL, ACCEPT, ACCEPT_EOP, FAIL } Out;
static void
eptr(register re_re *r, register Expr *e)
{
if((e->id < 0) || (e->id >= r->maxid)){
EPR "eptr abort: r=%ld[maxid=%d] e=%ld[id=%d]\n", r, r->maxid, e, e->id);
abort();
}
r->ptr[e->id] = e;
if((e->type != Charclass) && (e->type != Compcharclass)){
if(e->l)
eptr(r, e->l);
if(e->r)
eptr(r, e->r);
}
}
re_re *
egprep(enum Parsetype parser, unsigned char *b, unsigned char *e, unsigned char *map, int dotstar)
{
register re_re *r;
Expr *ecomp;
int i;
r = (re_re *)egmalloc(sizeof (re_re), "allocating re_re");
if(r == 0)
return 0;
memset((char *)r, 0, sizeof (re_re));
eg_lexinit((char *)b, (char *)e);
if(map)
memmove(r->mymap, (char *)map, 256);
else
for(i = 0; i < 256; i++)
r->mymap[i] = i;
eg_lex();
if(dotstar){
ecomp = eg_newexpr(Star, 0, eg_newexpr(Dot, '.', (Expr *)0, (Expr *)0), (Expr *)0);
ecomp = eg_newexpr(Cat, 0, ecomp, eg_eall(parser, r->mymap));
} else
ecomp = eg_eall(parser, r->mymap);
r->root = eg_newexpr(EOP, '#', ecomp, (Expr *)0);
egpost(r);
r->carat = 0;
if(r->backref || r->parens)
egbr(r);
else
eginit(r, dotstar);
return(r);
}
void
eginit(register re_re *r, int dotstar)
{
int n;
#ifdef DEBUG
if(TRACE(6))
PR "eginit(r=%d, dotstar=%d)\n", r, dotstar);
#endif
#ifdef DEBUG
if(TRACE(10)){
char buf[EPRINTSIZE];
eg_epr(r->root, buf, 0);
PR "eginit: r=%d expr='%s'\n", r, buf);
}
#endif
r->ptr = (Expr **)egmalloc(r->maxid*sizeof(Expr *), "ptr");
if (!r->ptr)
return;
eptr(r, r->root);
r->firstpos.base = (int *)egmalloc(n = r->maxid*sizeof(int), "first base");
if (!r->firstpos.base){
free((char*)r->ptr);
return;
}
r->begin.base = (int *)egmalloc(n, "begin base");
if (!r->begin.base){
free((char*)r->firstpos.base);
free((char*)r->ptr);
return;
}
r->tmp.base = (int *)egmalloc(n, "tmp base");
if (!r->tmp.base){
free((char*)r->begin.base);
free((char*)r->firstpos.base);
free((char*)r->ptr);
return;
}
memset((char *)r->begin.base, 0, n);
r->begin.count = 0;
r->begin.last = -1;
r->carat = dotstar;
RESET(r, firstpos);
if(first(r, r->root->l) == 0){
/*
nullable, so include me
*/
SET(r, firstpos, r->root->id)
}
if(dotstar)
COPY(r, begin, firstpos);
eg_stateinit(r);
eg_clrstates(r);
eg_posinit(r);
(void)addstate(r, &r->firstpos);
eg_savestate(r, dotstar? nextstate(r, r->states, RE_CARAT):r->states);
eg_posset(r);
}
unsigned char *
eg_quickmatch(register re_re *r, register unsigned char *b, register unsigned char *e, int endpts)
{
register State *s, *t;
#ifdef DEBUG
if(TRACE(10)){
char buf[EPRINTSIZE];
eg_epr(r->root, buf, 0);
PR "qm: r=%d expr='%s' endpts=%d\n", r, buf, endpts);
}
#endif
s = eg_startstate(r);
if(endpts&RE_BEG){
if(!r->carat){
if(t = s->tab[RE_CARAT])
s = t;
else
s = nextstate(r, s, RE_CARAT);
if(s->out == FAIL)
s = eg_startstate(r);
}
if(s->out){
#ifdef DEBUG
if(TRACE(6))
PR "match at ^: '%s' out=%d\n", b, s->out);
#endif
return((s->out == FAIL)? 0:b);
}
}
while(b < e){
#ifdef DEBUG
if(TRACE(4))
PR "state %d@%d[%d pos]: char '%c'\n", s-r->states, (int)s, s->npos, *b);
#endif
if(t = s->tab[*(unsigned char *)b])
s = t;
else
s = nextstate(r, s, *(unsigned char *)b);
if(s->out){
#ifdef DEBUG
if(TRACE(6))
PR "match at input '%s' out=%d\n", b, s->out);
#endif
return((s->out == FAIL)? 0:b);
}
b++;
}
if(endpts&RE_END){
if(t = s->tab[RE_DOLLAR])
s = t;
else
s = nextstate(r, s, RE_DOLLAR);
}
if(s->out){
#ifdef DEBUG
if(TRACE(6))
PR "match at $ '%s' out=%d\n", b, s->out);
#endif
return((s->out == FAIL)? 0:b);
}
#ifdef DEBUG
if(TRACE(3)){
char buf[EPRINTSIZE];
eg_epr(r->root, buf, 1);
PR "pat = %s\n", buf);
}
#endif
return(0);
}
unsigned char *
eg_lquickmatch(register re_re *r, register unsigned char *b, register unsigned char *e, int endpts)
{
register State *s, *t;
int outedness = 0;
#ifdef DEBUG
if(TRACE(10)){
char buf[EPRINTSIZE];
eg_epr(r->root, buf, 0);
PR "lqm: r=%d carat=%d expr='%s' endpts=%d\n", r, r->carat, buf, endpts);
}
#endif
s = eg_startstate(r);
if(endpts&RE_BEG){
if(!r->carat){
if(t = s->tab[RE_CARAT])
s = t;
else
s = nextstate(r, s, RE_CARAT);
if(s->out == FAIL)
s = eg_startstate(r);
}
if(s->out){
#ifdef DEBUG
if(TRACE(6))
PR "match at ^: '%s' out=%d\n", b, s->out);
#endif
if(s->out == ACCEPT_EOP)
return(b);
else if(s->out == FAIL)
return(0);
outedness = 1;
}
}
/*
look for first match
*/
if(outedness == 0)
for(; b < e; b++){
#ifdef DEBUG
if(TRACE(4))
PR "state %d@%d[%d pos]: char '%c'\n", s-r->states, (int)s, s->npos, *b);
#endif
if(t = s->tab[*(unsigned char *)b])
s = t;
else
s = nextstate(r, s, *(unsigned char *)b);
if(s->out){
#ifdef DEBUG
if(TRACE(4))
PR " out=%d, state %d@%d\n", s->out,
s-r->states, (int)s);
#endif
b++;
if((s->out == ACCEPT_EOP) || (s->out == FAIL))
goto finish;
outedness = 1;
break;
}
}
/*
in the following loop, we have seen a match already
because only way outedness is zero is if b>=e
*/
for(; b < e; b++){
#ifdef DEBUG
if(TRACE(4))
PR "statex %d@%d[%d pos]: char '%c'\n", s-r->states, (int)s, s->npos, *b);
#endif
if(t = s->tab[*(unsigned char *)b])
s = t;
else
s = nextstate(r, s, *(unsigned char *)b);
if(s->out == ACCEPT)
continue;
if((s->out == NORMAL) || (s->out == FAIL)){
#ifdef DEBUG
if(TRACE(6))
PR "unmatch at input '%s' out=%d\n", b, s->out);
#endif
return(b);
}
}
if(endpts&RE_END){
if(t = s->tab[RE_DOLLAR])
s = t;
else
s = nextstate(r, s, RE_DOLLAR);
}
finish:
if((s->out && (s->out != FAIL)) || outedness){
#ifdef DEBUG
if(TRACE(6))
PR "match at $ '%s' out=%d\n", b, s->out);
#endif
return(b);
}
#ifdef DEBUG
if(TRACE(3)){
char buf[EPRINTSIZE];
eg_epr(r->root, buf, 1);
PR "lqm('%s') failed; out=%d, s->out=%d \n", buf, outedness, s->out);
}
#endif
return(0);
}
static
match(register re_re *r, register Expr *e, int a)
{
switch(e->type)
{
case Dollar: return(a == RE_DOLLAR);
case Carat: return(a == RE_CARAT);
case Dot: return(a < 256);
case Literal: return(r->mymap[a] == r->mymap[e->lit]);
case Charclass: if(a >= 256) return(0);
return(memchr((char *)e->r, r->mymap[a], (int)e->l) != 0);
case Compcharclass:
if(a >= 256) return(0);
return(memchr((char *)e->r, r->mymap[a], (int)e->l) == 0);
}
return(0);
}
/*
generates the followset for a node in firstpos
*/
static void
follow(register re_re *r, register Expr *e)
{
register Expr *p;
if(e->type == EOP)
return;
else
p = e->parent;
switch(p->type)
{
case EOP:
SET(r, firstpos, p->id)
break;
case Plus:
case Star:
(void)first(r, e);
follow(r, p);
break;
case Quest:
case Alternate:
follow(r, p);
break;
case Cat:
if(e == p->l){
if(first(r, p->r) == 0){
follow(r, p);
break;
}
} else
follow(r, p);
break;
case Group:
follow(r, p);
break;
}
}
/*
first returns 0 if e is nullable and in the process,
sets up firstpos.
*/
static
first(register re_re *r, register Expr *e)
{
int k;
switch(e->type)
{
case Carat:
case Dollar:
case Literal:
case Dot:
case Charclass:
case Compcharclass:
SET(r, firstpos, e->id)
return(1);
case EOP:
case Star:
case Quest:
(void)first(r, e->l);
return(0);
case Group:
case Plus:
return(first(r, e->l));
case Cat:
return(first(r, e->l) || first(r, e->r));
case Alternate:
k = first(r, e->r);
return(first(r, e->l) && k);
default:
EPR "bad type %d\n", e->type);
abort();
return(0);
}
}
static void
efollow(register re_re *r, register Expr *e)
{
register i, *p;
RESET(r, firstpos);
follow(r, e);
e->flen = r->firstpos.count;
e->follow = (int *)malloc((unsigned)(e->flen*sizeof(int)));
if(e->follow == 0){
char buf[100];
SPR buf, "efollow malloc fail(%ld)\n", e->flen);
re_error(buf);
return;
}
p = e->follow;
GET(r->firstpos, i)
*p++ = i;
if(p != e->follow+e->flen){
char buf[100];
SPR buf, "efollow length error: %d %ld\n", p-e->follow, e->flen);
re_error(buf);
return;
}
}
static State *
addstate(register re_re *r, register Positionset *ps)
{
register *p, j;
register State *s;
s = r->states + eg_getstate(r);
memset((char *)s->tab, 0, sizeof(s->tab));
s->pos = eg_posalloc(r, (int)ps->count);
s->npos = ps->count;
p = r->posbase+s->pos;
GET((*ps), j)
*p++ = j;
if(s->out = (ps->base[r->root->id] != 0)){
if((ps->base[r->root->id] <= 0) && (ps->last == r->root->id))
s->out = ACCEPT_EOP; /* marker for only the EOP marker */
else
s->out = ACCEPT;
}
if(s->npos == 0)
s->out = FAIL;
#ifdef DEBUG
if(TRACE(3)){
char buf[2000];
eg_spr(s->npos, s->pos+r->posbase, buf);
PR "new state[%d]@%d %s,pos=%d out=%d\n", s-r->states, s, buf, s->pos, s->out);
}
#endif
return(s);
}
static State *
nextstate(register re_re *r, register State *s, int a)
{
register int p, *q, *eq;
register long i;
State *news;
Expr *e;
RESET(r, tmp);
#ifdef DEBUG
if(TRACE(5)){
char buf[2000];
ppr(&r->tmp, buf);
PR "nextstate: reset: %s\n", buf);
}
#endif
for(i = s->npos, p = s->pos; i > 0; i--){
e = r->ptr[r->posbase[p++]];
#ifdef DEBUG
if(TRACE(11)){
PR "i=%d e->type=%d\n", i, e->type);
}
#endif
if(e->type == EOP) continue;
if(match(r, e, a)){
#ifdef DEBUG
if(TRACE(11)){PR "matched %c\n", a);}
#endif
if(e->follow == 0)
efollow(r, e);
for(q = e->follow, eq = q+e->flen; q < eq; q++){
SET(r, tmp, *q)
}
}
}
#ifdef DEBUG
if(TRACE(5)){
char buf[2000];
ppr(&r->tmp, buf);
PR "nextstate(%d, '%c'): found %s\n", s-r->states, a, buf);
}
#endif
if(news = eg_stateof(r, &r->tmp)){
if(a <= RE_DOLLAR)
s->tab[a] = news;
} else
news = addstate(r, &r->tmp);
#ifdef DEBUG
if(TRACE(5)){
PR "nextstate(%d, '%c'): returning %ld %d out=%d\n", s-r->states, a,
(long)news, news-r->states, news->out);
}
#endif
return(news);
}
void *
egmalloc(int n, char *s)
{
char *x;
char buf[256];
if((x = malloc(n)) == 0){
SPR buf, "malloc fail(%d): %s", n, s);
re_error(buf);
return 0;
}
return((void *)x);
}
void *
egrealloc(char *p, int n, char *s)
{
char *x;
char buf[256];
if((x = realloc(p, n)) == 0){
SPR buf, "realloc fail(%d): %s", n, s);
re_error(buf);
return 0;
}
return((void *)x);
}
#ifdef DEBUG
static void
ppr(register Positionset *ps, register char *p)
{
register n;
if(ps->count < 1){
SPR p, "{}");
return;
}
*p++ = '{';
GET((*ps), n){
SPR p, "%d[=%d],", n, ps->base[n]);
p = strchr(p, 0);
}
p[-1] = '}';
}
#endif
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.