|
|
researchv10 Norman
/*
Copyright (c) 1989 AT&T
All Rights Reserved
THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF AT&T.
The copyright notice above does not evidence any
actual or intended publication of such source code.
*/
#define DEBUG
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "awk.h"
#include "y.tab.h"
#define HAT (NCHARS-1) /* matches ^ in regular expr */
/* NCHARS is 2**n */
#define MAXLIN 512
#define type(v) (v)->nobj
#define left(v) (v)->narg[0]
#define right(v) (v)->narg[1]
#define parent(v) (v)->nnext
#define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
#define UNARY case STAR: case PLUS: case QUEST:
/* encoding in tree Nodes:
leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
left is index, right contains value or pointer to value
unary (STAR, PLUS, QUEST): left is child, right is null
binary (CAT, OR): left and right are children
parent contains pointer to parent
*/
uchar chars[MAXLIN];
int setvec[MAXLIN];
int tmpset[MAXLIN];
Node *point[MAXLIN];
int rtok; /* next token in current re */
int rlxval;
uchar *rlxstr;
uchar *prestr; /* current position in current re */
uchar *lastre; /* origin of last re */
static int setcnt;
static int poscnt;
uchar *patbeg;
int patlen;
#define NFA 20 /* cache this many dynamic fa's */
fa *fatab[NFA];
int nfatab = 0; /* entries in fatab */
fa *makedfa(uchar *s, int anchor) /* returns dfa for reg expr s */
{
int i, use, nuse;
fa *pfa;
if (compile_time) /* a constant for sure */
return mkdfa(s, anchor);
for (i = 0; i < nfatab; i++) /* is it there already? */
if (fatab[i]->anchor == anchor && strcmp(fatab[i]->restr,s) == 0) {
fatab[i]->use++;
return fatab[i];
}
pfa = mkdfa(s, anchor);
if (nfatab < NFA) { /* room for another */
fatab[nfatab] = pfa;
fatab[nfatab]->use = 1;
nfatab++;
return pfa;
}
use = fatab[0]->use; /* replace least-recently used */
nuse = 0;
for (i = 1; i < nfatab; i++)
if (fatab[i]->use < use) {
use = fatab[i]->use;
nuse = i;
}
freefa(fatab[nuse]);
fatab[nuse] = pfa;
pfa->use = 1;
return pfa;
}
fa *mkdfa(uchar *s, int anchor) /* does the real work of making a dfa */
/* anchor = 1 for anchored matches, else 0 */
{
Node *p, *p1;
fa *f;
p = reparse(s);
p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
/* put ALL STAR in front of reg. exp. */
p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
/* put FINAL after reg. exp. */
poscnt = 0;
penter(p1); /* enter parent pointers and leaf indices */
if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
overflo("out of space for fa");
f->accept = poscnt-1; /* penter has computed number of positions in re */
cfoll(f, p1); /* set up follow sets */
freetr(p1);
if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
overflo("out of space in makedfa");
if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
overflo("out of space in makedfa");
*f->posns[1] = 0;
f->initstat = makeinit(f, anchor);
f->anchor = anchor;
f->restr = tostring(s);
return f;
}
int makeinit(fa *f, int anchor)
{
register int i, k;
f->curstat = 2;
f->out[2] = 0;
f->reset = 0;
k = *(f->re[0].lfollow);
xfree(f->posns[2]);
if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
overflo("out of space in makeinit");
for (i=0; i<=k; i++) {
(f->posns[2])[i] = (f->re[0].lfollow)[i];
}
if ((f->posns[2])[1] == f->accept)
f->out[2] = 1;
for (i=0; i<NCHARS; i++)
f->gototab[2][i] = 0;
f->curstat = cgoto(f, 2, HAT);
if (anchor) {
*f->posns[2] = k-1; /* leave out position 0 */
for (i=0; i<k; i++) {
(f->posns[0])[i] = (f->posns[2])[i];
}
f->out[0] = f->out[2];
if (f->curstat != 2)
--(*f->posns[f->curstat]);
}
return f->curstat;
}
void penter(Node *p) /* set up parent pointers and leaf indices */
{
switch(type(p)) {
LEAF
left(p) = (Node *) poscnt;
if (poscnt >= MAXLIN)
overflo("leaf index overflow in penter");
point[poscnt++] = p;
break;
UNARY
penter(left(p));
parent(left(p)) = p;
break;
case CAT:
case OR:
penter(left(p));
penter(right(p));
parent(left(p)) = p;
parent(right(p)) = p;
break;
default: /* can't happen */
ERROR "unknown type %d in penter", type(p) FATAL;
break;
}
}
void freetr(Node *p) /* free parse tree */
{
switch (type(p)) {
LEAF
xfree(p);
break;
UNARY
freetr(left(p));
xfree(p);
break;
case CAT:
case OR:
freetr(left(p));
freetr(right(p));
xfree(p);
break;
default: /* can't happen */
ERROR "unknown type %d in freetr", type(p) FATAL;
break;
}
}
/* in the parsing of regular expressions, metacharacters like . have */
/* to be seen literally; \056 is not a metacharacter. */
hexstr(char **pp) /* find and eval hex string at pp, return new p */
{
char *p;
int n = 0;
for (p = *pp; isxdigit(*p); p++) {
if (isdigit(*p))
n = 16 * n + *p - '0';
else if (*p >= 'a' && *p <= 'f')
n = 16 * n + *p - 'a' + 10;
else if (*p >= 'A' && *p <= 'F')
n = 16 * n + *p - 'A' + 10;
}
*pp = p;
return n;
}
#define isoctdigit(c) ((c) >= '0' && (c) <= '8') /* multiple use of arg */
quoted(char **pp) /* pick up next thing after a \\ */
/* and increment *pp */
{
char *p = *pp;
int c;
if ((c = *p++) == 't')
c = '\t';
else if (c == 'n')
c = '\n';
else if (c == 'f')
c = '\f';
else if (c == 'r')
c = '\r';
else if (c == 'b')
c = '\b';
else if (c == '\\')
c = '\\';
else if (c == 'x') { /* hexadecimal goo follows */
c = hexstr(&p);
} else if (isoctdigit(c)) { /* \d \dd \ddd */
int n = c - '0';
if (isoctdigit(*p)) {
n = 8 * n + *p++ - '0';
if (isoctdigit(*p))
n = 8 * n + *p++ - '0';
}
c = n;
} /* else */
/* c = c; */
*pp = p;
return c;
}
uchar *cclenter(uchar *p)
{
register int i, c, c2;
uchar *op;
op = p;
i = 0;
while ((c = *p++) != 0) {
if (c == '\\') {
c = quoted(&p);
} else if (c == '-' && i > 0 && chars[i-1] != 0) {
if (*p != 0) {
c = chars[i-1];
c2 = *p++;
if (c2 == '\\')
c2 = quoted(&p);
while (c < c2) {
if (i >= MAXLIN-1)
overflo("character class too big");
chars[i++] = ++c;
}
continue;
}
}
if (i >= MAXLIN-1)
overflo("character class too big");
chars[i++] = c;
}
chars[i++] = '\0';
dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, chars) );
xfree(op);
return(tostring(chars));
}
void overflo(uchar *s)
{
ERROR "regular expression too big: %s", s FATAL;
}
void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
{
register int i;
register int *p;
switch(type(v)) {
LEAF
f->re[(int) left(v)].ltype = type(v);
f->re[(int) left(v)].lval = (long) right(v); /* assumes ptr & long fit */
for (i=0; i<=f->accept; i++)
setvec[i] = 0;
setcnt = 0;
follow(v); /* computes setvec and setcnt */
if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
overflo("out of space building follow set");
f->re[(int) left(v)].lfollow = p;
*p = setcnt;
for (i = f->accept; i >= 0; i--)
if (setvec[i] == 1) *++p = i;
break;
UNARY
cfoll(f,left(v));
break;
case CAT:
case OR:
cfoll(f,left(v));
cfoll(f,right(v));
break;
default: /* can't happen */
ERROR "unknown type %d in cfoll", type(v) FATAL;
}
}
first(Node *p) /* collects initially active leaves of p into setvec */
/* returns 0 or 1 depending on whether p matches empty string */
{
register int b;
switch(type(p)) {
LEAF
if (setvec[(int) left(p)] != 1) {
setvec[(int) left(p)] = 1;
setcnt++;
}
if (type(p) == CCL && (*(uchar *) right(p)) == '\0')
return(0); /* empty CCL */
else return(1);
case PLUS:
if (first(left(p)) == 0) return(0);
return(1);
case STAR:
case QUEST:
first(left(p));
return(0);
case CAT:
if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
return(1);
case OR:
b = first(right(p));
if (first(left(p)) == 0 || b == 0) return(0);
return(1);
}
ERROR "unknown type %d in first", type(p) FATAL; /* can't happen */
return(-1);
}
void follow(Node *v) /* collects leaves that can follow v into setvec */
{
Node *p;
if (type(v) == FINAL)
return;
p = parent(v);
switch (type(p)) {
case STAR:
case PLUS:
first(v);
follow(p);
return;
case OR:
case QUEST:
follow(p);
return;
case CAT:
if (v == left(p)) { /* v is left child of p */
if (first(right(p)) == 0) {
follow(p);
return;
}
}
else /* v is right child */
follow(p);
return;
}
}
member(int c, uchar *s) /* is c in s? */
{
while (*s)
if (c == *s++)
return(1);
return(0);
}
match(fa *f, uchar *p)
{
register int s, ns;
s = f->reset?makeinit(f,0):f->initstat;
if (f->out[s])
return(1);
do {
if (ns=f->gototab[s][*p])
s=ns;
else
s=cgoto(f,s,*p);
if (f->out[s])
return(1);
} while (*p++ != 0);
return(0);
}
pmatch(fa *f, uchar *p)
{
register int s, ns;
register uchar *q;
int i, k;
s = f->reset?makeinit(f,1):f->initstat;
patbeg = p;
patlen = -1;
do {
q = p;
do {
if (f->out[s]) /* final state */
patlen = q-p;
if (ns=f->gototab[s][*q])
s=ns;
else
s=cgoto(f,s,*q);
if (s==1) /* no transition */
if (patlen >= 0) {
patbeg = p;
return(1);
}
else
goto nextin; /* no match */
} while (*q++ != 0);
if (f->out[s])
patlen = q-p-1; /* don't count $ */
if (patlen >= 0) {
patbeg = p;
return(1);
}
nextin:
s = 2;
if (f->reset) {
for (i=2; i<=f->curstat; i++)
xfree(f->posns[i]);
k = *f->posns[0];
if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
overflo("out of space in pmatch");
for (i=0; i<=k; i++)
(f->posns[2])[i] = (f->posns[0])[i];
f->initstat = f->curstat = 2;
f->out[2] = f->out[0];
for (i=0; i<NCHARS; i++)
f->gototab[2][i] = 0;
}
} while (*p++ != 0);
return (0);
}
nematch(fa *f, uchar *p)
{
register int s, ns;
register uchar *q;
int i, k;
s = f->reset?makeinit(f,1):f->initstat;
patlen = -1;
while (*p) {
q = p;
do {
if (f->out[s]) /* final state */
patlen = q-p;
if (ns=f->gototab[s][*q])
s=ns;
else
s=cgoto(f,s,*q);
if (s==1) /* no transition */
if (patlen > 0) {
patbeg = p;
return(1);
}
else
goto nnextin; /* no nonempty match */
} while (*q++ != 0);
if (f->out[s])
patlen = q-p-1; /* don't count $ */
if (patlen > 0 ) {
patbeg = p;
return(1);
}
nnextin:
s = 2;
if (f->reset) {
for (i=2; i<=f->curstat; i++)
xfree(f->posns[i]);
k = *f->posns[0];
if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
overflo("out of state space");
for (i=0; i<=k; i++)
(f->posns[2])[i] = (f->posns[0])[i];
f->initstat = f->curstat = 2;
f->out[2] = f->out[0];
for (i=0; i<NCHARS; i++)
f->gototab[2][i] = 0;
}
p++;
}
return (0);
}
Node *reparse(uchar *p)
{
/* parses regular expression pointed to by p */
/* uses relex() to scan regular expression */
Node *np;
dprintf( ("reparse <%s>\n", p) );
lastre = prestr = p; /* prestr points to string to be parsed */
rtok = relex();
if (rtok == '\0')
ERROR "empty regular expression" FATAL;
np = regexp();
if (rtok != '\0')
ERROR "syntax error in regular expression %s at %s", lastre, prestr FATAL;
return(np);
}
Node *regexp(void)
{
return (alt(concat(primary())));
}
Node *primary(void)
{
Node *np;
switch (rtok) {
case CHAR:
np = op2(CHAR, NIL, (Node *) rlxval);
rtok = relex();
return (unary(np));
case ALL:
rtok = relex();
return (unary(op2(ALL, NIL, NIL)));
case DOT:
rtok = relex();
return (unary(op2(DOT, NIL, NIL)));
case CCL:
np = op2(CCL, NIL, (Node*) cclenter(rlxstr));
rtok = relex();
return (unary(np));
case NCCL:
np = op2(NCCL, NIL, (Node *) cclenter(rlxstr));
rtok = relex();
return (unary(np));
case '^':
rtok = relex();
return (unary(op2(CHAR, NIL, (Node *) HAT)));
case '$':
rtok = relex();
return (unary(op2(CHAR, NIL, NIL)));
case '(':
rtok = relex();
if (rtok == ')') { /* special pleading for () */
rtok = relex();
return unary(op2(CCL, NIL, (Node *) tostring("")));
}
np = regexp();
if (rtok == ')') {
rtok = relex();
return (unary(np));
}
else
ERROR "syntax error in regular expression %s at %s", lastre, prestr FATAL;
default:
ERROR "illegal primary in regular expression %s at %s", lastre, prestr FATAL;
}
return 0; /*NOTREACHED*/
}
Node *concat(Node *np)
{
switch (rtok) {
case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
return (concat(op2(CAT, np, primary())));
}
return (np);
}
Node *alt(Node *np)
{
if (rtok == OR) {
rtok = relex();
return (alt(op2(OR, np, concat(primary()))));
}
return (np);
}
Node *unary(Node *np)
{
switch (rtok) {
case STAR:
rtok = relex();
return (unary(op2(STAR, np, NIL)));
case PLUS:
rtok = relex();
return (unary(op2(PLUS, np, NIL)));
case QUEST:
rtok = relex();
return (unary(op2(QUEST, np, NIL)));
default:
return (np);
}
}
relex(void) /* lexical analyzer for reparse */
{
register int c;
uchar cbuf[MAXLIN];
int clen, cflag;
switch (c = *prestr++) {
case '|': return OR;
case '*': return STAR;
case '+': return PLUS;
case '?': return QUEST;
case '.': return DOT;
case '\0': prestr--; return '\0';
case '^':
case '$':
case '(':
case ')':
return c;
case '\\':
rlxval = quoted(&prestr);
return CHAR;
default:
rlxval = c;
return CHAR;
case '[':
clen = 0;
if (*prestr == '^') {
cflag = 1;
prestr++;
}
else
cflag = 0;
for ( ; clen < MAXLIN-1; ) {
if ((c = *prestr++) == '\\') {
cbuf[clen++] = '\\';
if ((c = *prestr++) == '\0')
ERROR "nonterminated character class %.20s...", lastre FATAL;
cbuf[clen++] = c;
} else if (c == ']') {
cbuf[clen] = 0;
rlxstr = tostring(cbuf);
if (cflag == 0)
return CCL;
else
return NCCL;
} else if (c == '\n') {
ERROR "newline in character class %.20s...", lastre FATAL;
} else if (c == '\0') {
ERROR "nonterminated character class %.20s", lastre FATAL;
} else
cbuf[clen++] = c;
}
if (clen >= MAXLIN-1)
ERROR "character class %.20s... too long", cbuf FATAL;
}
/* can't happen */
return 0;
}
int cgoto(fa *f, int s, int c)
{
register int i, j, k;
register int *p, *q;
for (i=0; i<=f->accept; i++)
setvec[i] = 0;
setcnt = 0;
/* compute positions of gototab[s,c] into setvec */
p = f->posns[s];
for (i=1; i<=*p; i++) {
if ((k = f->re[p[i]].ltype) != FINAL) {
if (k == CHAR && c == f->re[p[i]].lval
|| k == DOT && c != 0 && c != HAT
|| k == ALL && c != 0
|| k == CCL && member(c, (uchar *) f->re[p[i]].lval)
|| k == NCCL && !member(c, (uchar *) f->re[p[i]].lval) && c != 0 && c != HAT) {
q = f->re[p[i]].lfollow;
for (j=1; j<=*q; j++) {
if (setvec[q[j]] == 0) {
setcnt++;
setvec[q[j]] = 1;
}
}
}
}
}
/* determine if setvec is a previous state */
tmpset[0] = setcnt;
j = 1;
for (i = f->accept; i >= 0; i--)
if (setvec[i]) {
tmpset[j++] = i;
}
/* tmpset == previous state? */
for (i=1; i<= f->curstat; i++) {
p = f->posns[i];
if ((k = tmpset[0]) != p[0])
goto different;
for (j = 1; j <= k; j++)
if (tmpset[j] != p[j])
goto different;
/* setvec is state i */
f->gototab[s][c] = i;
return i;
different:;
}
/* add tmpset to current set of states */
if (f->curstat >= NSTATES-1) {
f->curstat = 2;
f->reset = 1;
for (i=2; i<NSTATES; i++)
xfree(f->posns[i]);
}
else
++(f->curstat);
for (i=0; i<NCHARS; i++)
f->gototab[f->curstat][i] = 0;
xfree(f->posns[f->curstat]);
if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
overflo("out of space in cgoto");
f->posns[f->curstat] = p;
f->gototab[s][c] = f->curstat;
for (i = 0; i <= setcnt; i++)
p[i] = tmpset[i];
if (setvec[f->accept])
f->out[f->curstat] = 1;
else
f->out[f->curstat] = 0;
return f->curstat;
}
void freefa(fa *f)
{
register int i;
if (f == NULL)
return;
for (i=0; i<=f->curstat; i++)
xfree(f->posns[i]);
for (i=0; i<=f->accept; i++) {
xfree(f->re[i].lfollow);
if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
xfree(f->re[i].lval);
}
xfree(f->restr);
xfree(f);
}
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.