|
|
1.1 ! root 1: #define _IN_SUF_TREE ! 2: #include "suftree.h" ! 3: ! 4: /* ! 5: ** Construct a suffix tree for a source string ! 6: ** and perform string matching of various input strings. ! 7: ** This is based on the suffix tree algorithm of E. McCreight. ! 8: ** I extended the algorithm to remove the restriction that the ! 9: ** last element of the string has to be different from the rest ! 10: ** of the string. Note also that since the alphabet is large (256), ! 11: ** instead of being stored in an array, the children of a node ! 12: ** are kept in a linked list which is managed by a move-to-front ! 13: ** heuristic. ! 14: ** ! 15: ** For details, see the paper: ! 16: ** "A Space-Economical Suffix Tree Construction Algorithm" ! 17: ** E.M. McCreight, JACM, v.23, No.2, 1976, pp.262-272. ! 18: ** ! 19: ** BldSuftree returns either NULL or the pointer to the root of the ! 20: ** tree. In the latter case, the tree can be destroyed by free()-ing ! 21: ** the root. ! 22: ** ! 23: ** Written by Kiem-Phong Vo, 5/20/88. ! 24: */ ! 25: ! 26: /* Delete a suffix tree */ ! 27: void delsuftree(root) ! 28: Suftree *root; ! 29: { ! 30: if(!root) ! 31: return; ! 32: root -= 1; ! 33: while(root) ! 34: { ! 35: register Suftree *next; ! 36: next = NEXT(root); ! 37: free(root); ! 38: root = next; ! 39: } ! 40: } ! 41: ! 42: /* Find a child whose label string starts with a given character */ ! 43: static Suftree *child_find(node,c) ! 44: Suftree *node; /* the node whose children will be searched */ ! 45: Element c; /* the element being looked for */ ! 46: { ! 47: register Suftree *np, *last; ! 48: ! 49: last = NIL(Suftree); ! 50: for(np = CHILD(node); np != NIL(Suftree); np = SIBLING(np)) ! 51: if(LENGTH(np) > 0 && *LABEL(np) == c) ! 52: break; ! 53: else last = np; ! 54: ! 55: if(np && last) ! 56: { ! 57: /* move-to-front heuristic */ ! 58: SIBLING(last) = SIBLING(np); ! 59: SIBLING(np) = CHILD(node); ! 60: CHILD(node) = np; ! 61: } ! 62: return np; ! 63: } ! 64: ! 65: /* Get space for tree nodes. */ ! 66: static Suftree *getmem(root,n) ! 67: Suftree *root; ! 68: int n; ! 69: { ! 70: Suftree *list, *malloc(), **realloc(); ! 71: ! 72: if((list = malloc((n+1)*sizeof(Suftree))) == NIL(Suftree)) ! 73: { ! 74: if(root) ! 75: delsuftree(root); ! 76: return NIL(Suftree); ! 77: } ! 78: ! 79: /* store where this list is for later deletion */ ! 80: NEXT(list) = NIL(Suftree); ! 81: if(root) ! 82: { ! 83: for(root -= 1; NEXT(root) != NIL(Suftree); root = NEXT(root)) ! 84: ; ! 85: NEXT(root) = list; ! 86: } ! 87: ! 88: return list+1; ! 89: } ! 90: ! 91: /* Build the tree. ! 92: ** Following are the meaning of a few important variables: ! 93: ** clocus: contracted locus, this variable defines the ! 94: ** tree node that points to the longest prefix ! 95: ** that terminates at a node in the current tree. ! 96: ** locus: defines a tree node to be constructed that ! 97: ** points to the longest prefix that can be defined. ! 98: ** Unless both clocus and locus equal the root, ! 99: ** we maintain the invariance that clocus is the ! 100: ** parent of locus. ! 101: ** link: defines the sublink of the locus of the prefix ! 102: ** of the previously inserted suffix. ! 103: */ ! 104: Suftree *bldsuftree(src,len) ! 105: Element *src; /* the string to build the tree from */ ! 106: long int len; /* the length of the string */ ! 107: { ! 108: register Element *sp, *mp, *rescan, *endmatch, *endsrc; ! 109: register Suftree *match, *clocus, *locus, *link; ! 110: register long int mtlen, relen; ! 111: register int n; ! 112: Suftree *root, *list, *endlist; ! 113: ! 114: if(len == 0) ! 115: return NIL(Suftree); ! 116: ! 117: /* get initial space for the tree nodes */ ! 118: root = NIL(Suftree); ! 119: ! 120: /* 2*len+1 is the maximum number of nodes that can be created */ ! 121: n = 2*len+1; ! 122: if(n > ALLOCSIZE) ! 123: n = ALLOCSIZE; ! 124: if(!(list = getmem(root,n))) ! 125: return NIL(Suftree); ! 126: endlist = list + n; ! 127: ! 128: /* make root node */ ! 129: root = list++; ! 130: LABEL(root) = NIL(Element); ! 131: CHILD(root) = NIL(Suftree); ! 132: LINK(root) = root; ! 133: ! 134: /* locus and contracted locus of the empty substring */ ! 135: locus = clocus = root; ! 136: ! 137: /* the current match length */ ! 138: mtlen = 0; ! 139: ! 140: /* the end of the source string */ ! 141: endsrc = src+len; ! 142: ! 143: /* now build the tree */ ! 144: for(; src < endsrc; ++src) ! 145: { ! 146: /* prepare for scanning the current suffix */ ! 147: if(locus != root) ! 148: { ! 149: /* define the string to be rescanned */ ! 150: rescan = LABEL(locus); ! 151: relen = LENGTH(locus); ! 152: ! 153: /* minus the first character of the previous prefix */ ! 154: if(clocus == root) ! 155: { ! 156: rescan++; ! 157: if(relen > 0) ! 158: --relen; ! 159: } ! 160: } ! 161: else mtlen = relen = 0; ! 162: ! 163: /* the length of the known-to-be-matched part */ ! 164: if(mtlen > 0) ! 165: --mtlen; ! 166: /**/ ASSERT(relen <= mtlen) ! 167: ! 168: /* use sublink to rescan */ ! 169: link = LINK(clocus); ! 170: ! 171: /* rescan */ ! 172: while(relen > 0) ! 173: { ! 174: /* find a child of link that starts with the ! 175: first character of rescan. We then know that ! 176: rescan must match a prefix of that child. ! 177: */ ! 178: match = child_find(link,*rescan); ! 179: /**/ ASSERT(match != NULL) ! 180: ! 181: /* clocus will be the parent of the new link */ ! 182: clocus = link; ! 183: ! 184: /* rescan contains LABEL(match) */ ! 185: if(relen >= LENGTH(match)) ! 186: { ! 187: link = match; ! 188: relen -= LENGTH(match); ! 189: rescan += LENGTH(match); ! 190: } ! 191: /* rescan is a proper prefix of LABEL(match) */ ! 192: else ! 193: { ! 194: if(list >= endlist) ! 195: { ! 196: if(!(list = getmem(root,ALLOCSIZE))) ! 197: return NIL(Suftree); ! 198: else endlist = list+ALLOCSIZE; ! 199: } ! 200: ! 201: /* make an internal node labeled with rescan */ ! 202: LABEL(list) = rescan; ! 203: LENGTH(list) = relen; ! 204: CHILD(list) = match; ! 205: SIBLING(list) = SIBLING(match); ! 206: LINK(list) = root; ! 207: ! 208: /* adjust label and pointer of old node */ ! 209: SIBLING(match) = NIL(Suftree); ! 210: LABEL(match) += relen; ! 211: LENGTH(match) -= relen; ! 212: ! 213: CHILD(link) = list; ! 214: link = list++; ! 215: break; ! 216: } ! 217: } ! 218: ! 219: /* define sublink for the prefix of the last suffix */ ! 220: if(locus != root) ! 221: LINK(locus) = link; ! 222: ! 223: /* scan to match as much as possible */ ! 224: locus = link; ! 225: sp = src + mtlen; ! 226: while(sp < endsrc) ! 227: { ! 228: /* see if it matches some child of clocus */ ! 229: if(!(match = child_find(locus,*sp))) ! 230: break; ! 231: ! 232: /* clocus will be the parent of the new locus */ ! 233: clocus = locus; ! 234: ! 235: /* find the extend of the match */ ! 236: mp = LABEL(match); ! 237: endmatch = mp + LENGTH(match); ! 238: for(; sp < endsrc && mp < endmatch; ++sp, ++mp) ! 239: if(*sp != *mp) ! 240: break; ! 241: ! 242: /* the whole node is matched */ ! 243: if(mp >= endmatch) ! 244: { ! 245: locus = match; ! 246: mtlen += LENGTH(match); ! 247: } ! 248: /* found the extended locus of this suffix */ ! 249: else ! 250: { ! 251: if(list >= endlist) ! 252: { ! 253: if(!(list = getmem(root,ALLOCSIZE))) ! 254: return NIL(Suftree); ! 255: else endlist = list+ALLOCSIZE; ! 256: } ! 257: ! 258: /* make a new internal node */ ! 259: LABEL(list) = LABEL(match); ! 260: LENGTH(list) = mp - LABEL(match); ! 261: CHILD(list) = match; ! 262: SIBLING(list) = SIBLING(match); ! 263: LINK(list) = root; ! 264: ! 265: SIBLING(match) = NIL(Suftree); ! 266: LABEL(match) += LENGTH(list); ! 267: LENGTH(match) -= LENGTH(list); ! 268: mtlen += LENGTH(list); ! 269: ! 270: /* the new node is the locus for this suffix */ ! 271: CHILD(locus) = list; ! 272: locus = list++; ! 273: break; ! 274: } ! 275: } ! 276: ! 277: if(list >= endlist) ! 278: { ! 279: if(!(list = getmem(root,ALLOCSIZE))) ! 280: return NIL(Suftree); ! 281: else endlist = list+ALLOCSIZE; ! 282: } ! 283: ! 284: /* make a new external node for the suffix */ ! 285: SUFFIX(list) = src; ! 286: LABEL(list) = sp; ! 287: LENGTH(list) = endsrc-sp; ! 288: CHILD(list) = NIL(Suftree); ! 289: ! 290: /* hook it in as the first child of locus */ ! 291: SIBLING(list) = CHILD(locus); ! 292: CHILD(locus) = list++; ! 293: } ! 294: ! 295: return root; ! 296: } ! 297: ! 298: /* Given a raw string and a string represented in a suffix tree, ! 299: match the string against the tree to find a longest matching ! 300: prefix of the string. ! 301: Return the length of the match and where it occurs in the ! 302: string represented by the tree. ! 303: */ ! 304: long mtchsuftree(tree,str,len,mtchp) ! 305: Suftree *tree; /* the root of the suffix tree */ ! 306: Element *str; /* the string to be matched */ ! 307: long len; /* the length of the string */ ! 308: Element **mtchp; /* to return the address of the match */ ! 309: { ! 310: register Suftree *match; ! 311: register Element *sp, *mp, *endmp, *endstr; ! 312: register long mlen; ! 313: ! 314: mlen = 0; ! 315: endstr = str + len; ! 316: while(1) ! 317: { ! 318: if(!(match = child_find(tree,*str))) ! 319: break; ! 320: ! 321: /* find the extent of the match */ ! 322: mp = LABEL(match); ! 323: endmp = mp + LENGTH(match); ! 324: for(sp = str; sp < endstr && mp < endmp; ++sp, ++mp) ! 325: if(*sp != *mp) ! 326: break; ! 327: ! 328: /* update the length of the match */ ! 329: mlen += sp-str; ! 330: ! 331: /* prepare for next iteration */ ! 332: tree = match; ! 333: str = sp; ! 334: ! 335: /* see if we have to work any more */ ! 336: if(mp < endmp || str >= endstr) ! 337: break; ! 338: } ! 339: ! 340: if(mlen == 0) /* no match */ ! 341: *mtchp = NIL(Element); ! 342: else ! 343: { ! 344: /* find where the match starts */ ! 345: while(CHILD(tree)) ! 346: tree = CHILD(tree); ! 347: *mtchp = SUFFIX(tree); ! 348: } ! 349: ! 350: return mlen; ! 351: }
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.