|
|
1.1 ! root 1: ! 2: (* April 1989, Jussi Rintanen, Helsinki University of Technology *) ! 3: ! 4: (* The tree pattern matching automata builder ! 5: ! 6: This is the final version of the machine builder of ML-Twig. ! 7: The machine builder takes as input a list of rules, constructs ! 8: a finite state automaton directly from the tree patterns. In the prototype ! 9: version, a string was constructed and returned, but the complexity of ! 10: constructing large strings from small constituent strings by catenating ! 11: them is not very good, so we have a separate function for printing ! 12: the automaton. ! 13: ! 14: *) ! 15: ! 16: signature AUTOMATA = ! 17: sig ! 18: exception AutomatonError of string ! 19: structure Parser : PARSER ! 20: val build_automaton: outstream * Parser.symbol list * Parser.rule list -> unit ! 21: end; ! 22: ! 23: (* This is the second version of ML-Twig Automata Builder. ! 24: We note, that the first version was purely functional (without side-effects) ! 25: and was based on a general structure constructing Aho-Corasick automata ! 26: for string matching. However, the first version was considered too complex ! 27: and inefficient, and we decided to rewrite it from scratch. ! 28: This revised version constructs a trie, which is represented by an array. ! 29: The trie is built directly from tree patterns, and explicit construction ! 30: of path strings is avoided. ! 31: For detailed description we refer to [Aho,Corasick] and [Hoffmann,O'Donnell]. ! 32: *) ! 33: ! 34: functor MAKEautomata (structure Parser : PARSER): AUTOMATA = ! 35: struct ! 36: exception AutomatonError of string ! 37: ! 38: fun fatal s = raise AutomatonError s ! 39: ! 40: (* This structure represents an abstract trie with extensions for ! 41: string pattern matching automaton construction. We have assumed, ! 42: that the implementation has side-effects, and for efficiency ! 43: an array is used. *) ! 44: ! 45: structure Implementation : ! 46: sig ! 47: structure Parser : PARSER ! 48: datatype alpha = Sym of Parser.symbol | Child of int ! 49: type automaton ! 50: val empty_automaton : unit -> automaton ! 51: val add_arc : automaton * int * alpha -> automaton * int ! 52: val add_finals : automaton * int * (int * int * Parser.symbol) list -> automaton ! 53: val set_failure : automaton * int * int -> automaton ! 54: val get_failure : automaton * int -> int ! 55: val get_finals : automaton * int -> (int * int * Parser.symbol) list ! 56: val get_transitions : automaton * int -> (alpha * int) list ! 57: val last_state : automaton -> int ! 58: end ! 59: = ! 60: struct ! 61: structure Parser : PARSER = Parser ! 62: open Parser ! 63: datatype alpha = Sym of symbol | Child of int ! 64: type state = ((int * int * symbol) list * (alpha * int) list * int) ! 65: type automaton = state array * int * int ! 66: ! 67: fun empty_automaton () = (array (400, ([],[],0)), 400, 1) ! 68: ! 69: fun add_arc (trie as (a,b,c),i,iota) = ! 70: let val (fs,ts,f) = a sub i ! 71: val rec go = (fn nil => ~1 | ((on,to) :: t) => ! 72: if iota = on then to else go t) ! 73: val destination = go ts ! 74: in ! 75: if destination <> ~1 ! 76: then (trie,destination) ! 77: else ! 78: if b = c ! 79: then ! 80: let val newsize = b*3 div 2 ! 81: val newa = array (newsize, ([],[],0)) ! 82: val rec copya = fn 0 => update(newa,0,a sub 0) ! 83: | n => (update(newa,n,a sub n); copya (n-1)) ! 84: in ! 85: copya (b-1); ((newa,newsize,c+1),c) ! 86: end ! 87: else (update(a,i,(fs,(iota,c)::ts,f));((a,b,c+1),c)) ! 88: end ! 89: ! 90: fun set_failure (trie as (a,b,c),i,f) = ! 91: let val (fs,ts,f') = a sub i ! 92: in ! 93: (update (a,i,(fs,ts,f)); ! 94: trie) ! 95: end ! 96: ! 97: fun add_finals (trie as (a,b,c),i,f) = ! 98: let val (fs,ts,s) = a sub i ! 99: in ! 100: (update (a,i,(f@fs,ts,s)); ! 101: trie) ! 102: end ! 103: ! 104: fun get_finals ((a,b,c),i) = let val (fs,ts,s) = a sub i in fs end ! 105: fun get_failure ((a,b,c),i) = let val (fs,ts,s) = a sub i in s end ! 106: fun get_transitions ((a,b,c),i) = let val (fs,ts,s) = a sub i in ts end ! 107: fun last_state (a,b,c) = c-1 ! 108: ! 109: end ! 110: ! 111: structure Parser = Parser ! 112: ! 113: open Implementation Parser ! 114: ! 115: val int2str : int -> string = makestring ! 116: ! 117: val accum = revfold ! 118: ! 119: (* This function traverses a tree pattern and concurrently adds arcs ! 120: to the trie representation of a tree pattern matching automaton. *) ! 121: ! 122: fun add_pattern (autom, rule1, nont, Leaf n, state, len) = ! 123: let val (autom', state') = add_arc (autom, state, Sym n) ! 124: in add_finals (autom', state', [(len,rule1,nont)]) end ! 125: | add_pattern (autom, rule1, nont, Tree (n, cs), state, len) = ! 126: let val (autom', state') = add_arc (autom, state, Sym n) ! 127: val (autom'''',_) = ! 128: accum ! 129: (fn (c,(autom'', cn)) => ! 130: let val (autom''', state'') = add_arc (autom'', state', Child cn) ! 131: in ! 132: (add_pattern (autom''', rule1, nont, c, state'', len+1), cn + 1) ! 133: end) ! 134: cs (autom', 1) ! 135: in ! 136: autom'''' ! 137: end ! 138: ! 139: fun go (au, s, i) = ! 140: let ! 141: val ts = get_transitions(au, s) ! 142: val rec g = fn nil => ~1 | ((p,q)::t) => if p=i then q else g t ! 143: in g ts ! 144: end ! 145: ! 146: fun oflevel1 au = let val ts = get_transitions(au, 0) ! 147: in map (fn (p,q) => q) ts ! 148: end ! 149: ! 150: fun iterate (au, nil, nil) = au ! 151: | iterate (au, nil, next) = iterate (au,next,nil) ! 152: | iterate (au, h::t, next) = ! 153: let val f = get_failure (au, h) ! 154: val ts = get_transitions (au, h) ! 155: val au' = accum (fn ((i,s),aut) => ! 156: let val rec fail = fn state => ! 157: if go (aut,state,i) <> ~1 ! 158: then go (aut,state,i) ! 159: else if state=0 ! 160: then 0 ! 161: else fail (get_failure (aut, state)) ! 162: in ! 163: add_finals(set_failure (aut, s, fail f), ! 164: s, ! 165: get_finals (aut, fail f)) ! 166: end ! 167: ) ts au ! 168: in ! 169: iterate (au',t,(map (fn (p,q) => q) ts) @ next) ! 170: end ! 171: ! 172: fun construct_failure au = iterate (au, oflevel1 au, []) ! 173: ! 174: fun construct_automaton rules = ! 175: let val t1 = (* Trie & final state values *) ! 176: accum ! 177: (fn (Rule(n,_,r,p,_,_),a) => add_pattern (a,n,r,p,0,1)) ! 178: rules ! 179: (empty_automaton ()) ! 180: in ! 181: construct_failure t1 (* Failure & final state values *) ! 182: end ! 183: ! 184: fun symbol2str (Label s) = "__"^s ! 185: | symbol2str (Node (s,_)) = s ! 186: ! 187: fun arc2str (Sym s) = symbol2str s ! 188: | arc2str (Child n) = "(ARC "^(int2str n)^")" ! 189: ! 190: fun output_symbols (out,symbols) = ! 191: (out "datatype symbols = ARC of int"; ! 192: map ! 193: (fn s => out (" | "^(symbol2str s))) ! 194: symbols; ! 195: out "\n") ! 196: ! 197: fun output_finals' (out,au,n) = ! 198: if n <= last_state au ! 199: then ! 200: let val finals = get_finals (au,n) ! 201: fun outfinal (i,j,s) = ! 202: (out "("; ! 203: out (int2str i); ! 204: out ","; ! 205: out (int2str j); ! 206: out ","; ! 207: out (symbol2str s); ! 208: out ")") ! 209: in ! 210: out (int2str n); ! 211: out " => ["; ! 212: case finals of ! 213: nil => () ! 214: | [h] => outfinal h ! 215: | (h::t) => (outfinal h;app (fn h => (out",";outfinal h)) t); ! 216: out "]\n | "; ! 217: output_finals' (out,au,n+1) ! 218: end ! 219: else () ! 220: ! 221: fun output_finals (out,au) = ! 222: (out "fun get_finals s =\n"; ! 223: out " case s of\n"; ! 224: output_finals' (out,au,0); ! 225: out "_ => nil\n\n") ! 226: ! 227: fun output_goto' (out,au,n) = ! 228: if n <= last_state au ! 229: then ! 230: let val transitions = get_transitions (au,n) ! 231: in ! 232: out (int2str n); ! 233: out " => (case a of "; ! 234: app ! 235: (fn (i,s) => ! 236: (out (arc2str i); ! 237: out " => "; ! 238: out (int2str s); ! 239: out " | ")) ! 240: transitions; ! 241: out " _ => "; ! 242: if n = 0 then (out "0") ! 243: else (out "go ("; out (int2str (get_failure (au,n))); out ",a)"); ! 244: out ")\n | "; ! 245: output_goto' (out,au,n+1) ! 246: end ! 247: else () ! 248: ! 249: fun output_goto (out,au) = ! 250: (out "fun go (s,a) =\n"; ! 251: out " case s of\n"; ! 252: output_goto' (out,au,0); ! 253: out "_ => 0\n\n") ! 254: ! 255: fun output_automaton (outstr,au,symbols) = ! 256: let val out = output outstr ! 257: in ! 258: output_finals (out,au); ! 259: output_goto (out,au); ! 260: out "val go_f = get_finals o go\n"; ! 261: out "fun childsymbol s = ARC s\n"; ! 262: out "val initialstate = 0\n"; ! 263: out "type state = int\n" ! 264: end ! 265: ! 266: fun build_automaton (outstr,symbols,rules) = ! 267: output_automaton(outstr,construct_automaton rules,symbols) ! 268: ! 269: end;
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.