|
|
1.1 root 1: #define _IN_SUF_TREE
2: #include "suftree.h"
3:
4: /*
5: ** Construct a suffix tree for a source string
6: ** and perform string matching of various input strings.
7: ** This is based on the suffix tree algorithm of E. McCreight.
8: ** I extended the algorithm to remove the restriction that the
9: ** last element of the string has to be different from the rest
10: ** of the string. Note also that since the alphabet is large (256),
11: ** instead of being stored in an array, the children of a node
12: ** are kept in a linked list which is managed by a move-to-front
13: ** heuristic.
14: **
15: ** For details, see the paper:
16: ** "A Space-Economical Suffix Tree Construction Algorithm"
17: ** E.M. McCreight, JACM, v.23, No.2, 1976, pp.262-272.
18: **
19: ** BldSuftree returns either NULL or the pointer to the root of the
20: ** tree. In the latter case, the tree can be destroyed by free()-ing
21: ** the root.
22: **
23: ** Written by Kiem-Phong Vo, 5/20/88.
24: */
25:
26: /* Delete a suffix tree */
27: void delsuftree(root)
28: Suftree *root;
29: {
30: if(!root)
31: return;
32: root -= 1;
33: while(root)
34: {
35: register Suftree *next;
36: next = NEXT(root);
37: free(root);
38: root = next;
39: }
40: }
41:
42: /* Find a child whose label string starts with a given character */
43: static Suftree *child_find(node,c)
44: Suftree *node; /* the node whose children will be searched */
45: Element c; /* the element being looked for */
46: {
47: register Suftree *np, *last;
48:
49: last = NIL(Suftree);
50: for(np = CHILD(node); np != NIL(Suftree); np = SIBLING(np))
51: if(LENGTH(np) > 0 && *LABEL(np) == c)
52: break;
53: else last = np;
54:
55: if(np && last)
56: {
57: /* move-to-front heuristic */
58: SIBLING(last) = SIBLING(np);
59: SIBLING(np) = CHILD(node);
60: CHILD(node) = np;
61: }
62: return np;
63: }
64:
65: /* Get space for tree nodes. */
66: static Suftree *getmem(root,n)
67: Suftree *root;
68: int n;
69: {
70: Suftree *list, *malloc(), **realloc();
71:
72: if((list = malloc((n+1)*sizeof(Suftree))) == NIL(Suftree))
73: {
74: if(root)
75: delsuftree(root);
76: return NIL(Suftree);
77: }
78:
79: /* store where this list is for later deletion */
80: NEXT(list) = NIL(Suftree);
81: if(root)
82: {
83: for(root -= 1; NEXT(root) != NIL(Suftree); root = NEXT(root))
84: ;
85: NEXT(root) = list;
86: }
87:
88: return list+1;
89: }
90:
91: /* Build the tree.
92: ** Following are the meaning of a few important variables:
93: ** clocus: contracted locus, this variable defines the
94: ** tree node that points to the longest prefix
95: ** that terminates at a node in the current tree.
96: ** locus: defines a tree node to be constructed that
97: ** points to the longest prefix that can be defined.
98: ** Unless both clocus and locus equal the root,
99: ** we maintain the invariance that clocus is the
100: ** parent of locus.
101: ** link: defines the sublink of the locus of the prefix
102: ** of the previously inserted suffix.
103: */
104: Suftree *bldsuftree(src,len)
105: Element *src; /* the string to build the tree from */
106: long int len; /* the length of the string */
107: {
108: register Element *sp, *mp, *rescan, *endmatch, *endsrc;
109: register Suftree *match, *clocus, *locus, *link;
110: register long int mtlen, relen;
111: register int n;
112: Suftree *root, *list, *endlist;
113:
114: if(len == 0)
115: return NIL(Suftree);
116:
117: /* get initial space for the tree nodes */
118: root = NIL(Suftree);
119:
120: /* 2*len+1 is the maximum number of nodes that can be created */
121: n = 2*len+1;
122: if(n > ALLOCSIZE)
123: n = ALLOCSIZE;
124: if(!(list = getmem(root,n)))
125: return NIL(Suftree);
126: endlist = list + n;
127:
128: /* make root node */
129: root = list++;
130: LABEL(root) = NIL(Element);
131: CHILD(root) = NIL(Suftree);
132: LINK(root) = root;
133:
134: /* locus and contracted locus of the empty substring */
135: locus = clocus = root;
136:
137: /* the current match length */
138: mtlen = 0;
139:
140: /* the end of the source string */
141: endsrc = src+len;
142:
143: /* now build the tree */
144: for(; src < endsrc; ++src)
145: {
146: /* prepare for scanning the current suffix */
147: if(locus != root)
148: {
149: /* define the string to be rescanned */
150: rescan = LABEL(locus);
151: relen = LENGTH(locus);
152:
153: /* minus the first character of the previous prefix */
154: if(clocus == root)
155: {
156: rescan++;
157: if(relen > 0)
158: --relen;
159: }
160: }
161: else mtlen = relen = 0;
162:
163: /* the length of the known-to-be-matched part */
164: if(mtlen > 0)
165: --mtlen;
166: /**/ ASSERT(relen <= mtlen)
167:
168: /* use sublink to rescan */
169: link = LINK(clocus);
170:
171: /* rescan */
172: while(relen > 0)
173: {
174: /* find a child of link that starts with the
175: first character of rescan. We then know that
176: rescan must match a prefix of that child.
177: */
178: match = child_find(link,*rescan);
179: /**/ ASSERT(match != NULL)
180:
181: /* clocus will be the parent of the new link */
182: clocus = link;
183:
184: /* rescan contains LABEL(match) */
185: if(relen >= LENGTH(match))
186: {
187: link = match;
188: relen -= LENGTH(match);
189: rescan += LENGTH(match);
190: }
191: /* rescan is a proper prefix of LABEL(match) */
192: else
193: {
194: if(list >= endlist)
195: {
196: if(!(list = getmem(root,ALLOCSIZE)))
197: return NIL(Suftree);
198: else endlist = list+ALLOCSIZE;
199: }
200:
201: /* make an internal node labeled with rescan */
202: LABEL(list) = rescan;
203: LENGTH(list) = relen;
204: CHILD(list) = match;
205: SIBLING(list) = SIBLING(match);
206: LINK(list) = root;
207:
208: /* adjust label and pointer of old node */
209: SIBLING(match) = NIL(Suftree);
210: LABEL(match) += relen;
211: LENGTH(match) -= relen;
212:
213: CHILD(link) = list;
214: link = list++;
215: break;
216: }
217: }
218:
219: /* define sublink for the prefix of the last suffix */
220: if(locus != root)
221: LINK(locus) = link;
222:
223: /* scan to match as much as possible */
224: locus = link;
225: sp = src + mtlen;
226: while(sp < endsrc)
227: {
228: /* see if it matches some child of clocus */
229: if(!(match = child_find(locus,*sp)))
230: break;
231:
232: /* clocus will be the parent of the new locus */
233: clocus = locus;
234:
235: /* find the extend of the match */
236: mp = LABEL(match);
237: endmatch = mp + LENGTH(match);
238: for(; sp < endsrc && mp < endmatch; ++sp, ++mp)
239: if(*sp != *mp)
240: break;
241:
242: /* the whole node is matched */
243: if(mp >= endmatch)
244: {
245: locus = match;
246: mtlen += LENGTH(match);
247: }
248: /* found the extended locus of this suffix */
249: else
250: {
251: if(list >= endlist)
252: {
253: if(!(list = getmem(root,ALLOCSIZE)))
254: return NIL(Suftree);
255: else endlist = list+ALLOCSIZE;
256: }
257:
258: /* make a new internal node */
259: LABEL(list) = LABEL(match);
260: LENGTH(list) = mp - LABEL(match);
261: CHILD(list) = match;
262: SIBLING(list) = SIBLING(match);
263: LINK(list) = root;
264:
265: SIBLING(match) = NIL(Suftree);
266: LABEL(match) += LENGTH(list);
267: LENGTH(match) -= LENGTH(list);
268: mtlen += LENGTH(list);
269:
270: /* the new node is the locus for this suffix */
271: CHILD(locus) = list;
272: locus = list++;
273: break;
274: }
275: }
276:
277: if(list >= endlist)
278: {
279: if(!(list = getmem(root,ALLOCSIZE)))
280: return NIL(Suftree);
281: else endlist = list+ALLOCSIZE;
282: }
283:
284: /* make a new external node for the suffix */
285: SUFFIX(list) = src;
286: LABEL(list) = sp;
287: LENGTH(list) = endsrc-sp;
288: CHILD(list) = NIL(Suftree);
289:
290: /* hook it in as the first child of locus */
291: SIBLING(list) = CHILD(locus);
292: CHILD(locus) = list++;
293: }
294:
295: return root;
296: }
297:
298: /* Given a raw string and a string represented in a suffix tree,
299: match the string against the tree to find a longest matching
300: prefix of the string.
301: Return the length of the match and where it occurs in the
302: string represented by the tree.
303: */
304: long mtchsuftree(tree,str,len,mtchp)
305: Suftree *tree; /* the root of the suffix tree */
306: Element *str; /* the string to be matched */
307: long len; /* the length of the string */
308: Element **mtchp; /* to return the address of the match */
309: {
310: register Suftree *match;
311: register Element *sp, *mp, *endmp, *endstr;
312: register long mlen;
313:
314: mlen = 0;
315: endstr = str + len;
316: while(1)
317: {
318: if(!(match = child_find(tree,*str)))
319: break;
320:
321: /* find the extent of the match */
322: mp = LABEL(match);
323: endmp = mp + LENGTH(match);
324: for(sp = str; sp < endstr && mp < endmp; ++sp, ++mp)
325: if(*sp != *mp)
326: break;
327:
328: /* update the length of the match */
329: mlen += sp-str;
330:
331: /* prepare for next iteration */
332: tree = match;
333: str = sp;
334:
335: /* see if we have to work any more */
336: if(mp < endmp || str >= endstr)
337: break;
338: }
339:
340: if(mlen == 0) /* no match */
341: *mtchp = NIL(Element);
342: else
343: {
344: /* find where the match starts */
345: while(CHILD(tree))
346: tree = CHILD(tree);
347: *mtchp = SUFFIX(tree);
348: }
349:
350: return mlen;
351: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.