|
|
1.1 root 1:
2: (* April 1989, Jussi Rintanen, Helsinki University of Technology *)
3:
4: (* The tree pattern matching automata builder
5:
6: This is the final version of the machine builder of ML-Twig.
7: The machine builder takes as input a list of rules, constructs
8: a finite state automaton directly from the tree patterns. In the prototype
9: version, a string was constructed and returned, but the complexity of
10: constructing large strings from small constituent strings by catenating
11: them is not very good, so we have a separate function for printing
12: the automaton.
13:
14: *)
15:
16: signature AUTOMATA =
17: sig
18: exception AutomatonError of string
19: structure Parser : PARSER
20: val build_automaton: outstream * Parser.symbol list * Parser.rule list -> unit
21: end;
22:
23: (* This is the second version of ML-Twig Automata Builder.
24: We note, that the first version was purely functional (without side-effects)
25: and was based on a general structure constructing Aho-Corasick automata
26: for string matching. However, the first version was considered too complex
27: and inefficient, and we decided to rewrite it from scratch.
28: This revised version constructs a trie, which is represented by an array.
29: The trie is built directly from tree patterns, and explicit construction
30: of path strings is avoided.
31: For detailed description we refer to [Aho,Corasick] and [Hoffmann,O'Donnell].
32: *)
33:
34: functor MAKEautomata (structure Parser : PARSER): AUTOMATA =
35: struct
36: exception AutomatonError of string
37:
38: fun fatal s = raise AutomatonError s
39:
40: (* This structure represents an abstract trie with extensions for
41: string pattern matching automaton construction. We have assumed,
42: that the implementation has side-effects, and for efficiency
43: an array is used. *)
44:
45: structure Implementation :
46: sig
47: structure Parser : PARSER
48: datatype alpha = Sym of Parser.symbol | Child of int
49: type automaton
50: val empty_automaton : unit -> automaton
51: val add_arc : automaton * int * alpha -> automaton * int
52: val add_finals : automaton * int * (int * int * Parser.symbol) list -> automaton
53: val set_failure : automaton * int * int -> automaton
54: val get_failure : automaton * int -> int
55: val get_finals : automaton * int -> (int * int * Parser.symbol) list
56: val get_transitions : automaton * int -> (alpha * int) list
57: val last_state : automaton -> int
58: end
59: =
60: struct
61: structure Parser : PARSER = Parser
62: open Parser
63: datatype alpha = Sym of symbol | Child of int
64: type state = ((int * int * symbol) list * (alpha * int) list * int)
65: type automaton = state array * int * int
66:
67: fun empty_automaton () = (array (400, ([],[],0)), 400, 1)
68:
69: fun add_arc (trie as (a,b,c),i,iota) =
70: let val (fs,ts,f) = a sub i
71: val rec go = (fn nil => ~1 | ((on,to) :: t) =>
72: if iota = on then to else go t)
73: val destination = go ts
74: in
75: if destination <> ~1
76: then (trie,destination)
77: else
78: if b = c
79: then
80: let val newsize = b*3 div 2
81: val newa = array (newsize, ([],[],0))
82: val rec copya = fn 0 => update(newa,0,a sub 0)
83: | n => (update(newa,n,a sub n); copya (n-1))
84: in
85: copya (b-1); ((newa,newsize,c+1),c)
86: end
87: else (update(a,i,(fs,(iota,c)::ts,f));((a,b,c+1),c))
88: end
89:
90: fun set_failure (trie as (a,b,c),i,f) =
91: let val (fs,ts,f') = a sub i
92: in
93: (update (a,i,(fs,ts,f));
94: trie)
95: end
96:
97: fun add_finals (trie as (a,b,c),i,f) =
98: let val (fs,ts,s) = a sub i
99: in
100: (update (a,i,(f@fs,ts,s));
101: trie)
102: end
103:
104: fun get_finals ((a,b,c),i) = let val (fs,ts,s) = a sub i in fs end
105: fun get_failure ((a,b,c),i) = let val (fs,ts,s) = a sub i in s end
106: fun get_transitions ((a,b,c),i) = let val (fs,ts,s) = a sub i in ts end
107: fun last_state (a,b,c) = c-1
108:
109: end
110:
111: structure Parser = Parser
112:
113: open Implementation Parser
114:
115: val int2str : int -> string = makestring
116:
117: val accum = revfold
118:
119: (* This function traverses a tree pattern and concurrently adds arcs
120: to the trie representation of a tree pattern matching automaton. *)
121:
122: fun add_pattern (autom, rule1, nont, Leaf n, state, len) =
123: let val (autom', state') = add_arc (autom, state, Sym n)
124: in add_finals (autom', state', [(len,rule1,nont)]) end
125: | add_pattern (autom, rule1, nont, Tree (n, cs), state, len) =
126: let val (autom', state') = add_arc (autom, state, Sym n)
127: val (autom'''',_) =
128: accum
129: (fn (c,(autom'', cn)) =>
130: let val (autom''', state'') = add_arc (autom'', state', Child cn)
131: in
132: (add_pattern (autom''', rule1, nont, c, state'', len+1), cn + 1)
133: end)
134: cs (autom', 1)
135: in
136: autom''''
137: end
138:
139: fun go (au, s, i) =
140: let
141: val ts = get_transitions(au, s)
142: val rec g = fn nil => ~1 | ((p,q)::t) => if p=i then q else g t
143: in g ts
144: end
145:
146: fun oflevel1 au = let val ts = get_transitions(au, 0)
147: in map (fn (p,q) => q) ts
148: end
149:
150: fun iterate (au, nil, nil) = au
151: | iterate (au, nil, next) = iterate (au,next,nil)
152: | iterate (au, h::t, next) =
153: let val f = get_failure (au, h)
154: val ts = get_transitions (au, h)
155: val au' = accum (fn ((i,s),aut) =>
156: let val rec fail = fn state =>
157: if go (aut,state,i) <> ~1
158: then go (aut,state,i)
159: else if state=0
160: then 0
161: else fail (get_failure (aut, state))
162: in
163: add_finals(set_failure (aut, s, fail f),
164: s,
165: get_finals (aut, fail f))
166: end
167: ) ts au
168: in
169: iterate (au',t,(map (fn (p,q) => q) ts) @ next)
170: end
171:
172: fun construct_failure au = iterate (au, oflevel1 au, [])
173:
174: fun construct_automaton rules =
175: let val t1 = (* Trie & final state values *)
176: accum
177: (fn (Rule(n,_,r,p,_,_),a) => add_pattern (a,n,r,p,0,1))
178: rules
179: (empty_automaton ())
180: in
181: construct_failure t1 (* Failure & final state values *)
182: end
183:
184: fun symbol2str (Label s) = "__"^s
185: | symbol2str (Node (s,_)) = s
186:
187: fun arc2str (Sym s) = symbol2str s
188: | arc2str (Child n) = "(ARC "^(int2str n)^")"
189:
190: fun output_symbols (out,symbols) =
191: (out "datatype symbols = ARC of int";
192: map
193: (fn s => out (" | "^(symbol2str s)))
194: symbols;
195: out "\n")
196:
197: fun output_finals' (out,au,n) =
198: if n <= last_state au
199: then
200: let val finals = get_finals (au,n)
201: fun outfinal (i,j,s) =
202: (out "(";
203: out (int2str i);
204: out ",";
205: out (int2str j);
206: out ",";
207: out (symbol2str s);
208: out ")")
209: in
210: out (int2str n);
211: out " => [";
212: case finals of
213: nil => ()
214: | [h] => outfinal h
215: | (h::t) => (outfinal h;app (fn h => (out",";outfinal h)) t);
216: out "]\n | ";
217: output_finals' (out,au,n+1)
218: end
219: else ()
220:
221: fun output_finals (out,au) =
222: (out "fun get_finals s =\n";
223: out " case s of\n";
224: output_finals' (out,au,0);
225: out "_ => nil\n\n")
226:
227: fun output_goto' (out,au,n) =
228: if n <= last_state au
229: then
230: let val transitions = get_transitions (au,n)
231: in
232: out (int2str n);
233: out " => (case a of ";
234: app
235: (fn (i,s) =>
236: (out (arc2str i);
237: out " => ";
238: out (int2str s);
239: out " | "))
240: transitions;
241: out " _ => ";
242: if n = 0 then (out "0")
243: else (out "go ("; out (int2str (get_failure (au,n))); out ",a)");
244: out ")\n | ";
245: output_goto' (out,au,n+1)
246: end
247: else ()
248:
249: fun output_goto (out,au) =
250: (out "fun go (s,a) =\n";
251: out " case s of\n";
252: output_goto' (out,au,0);
253: out "_ => 0\n\n")
254:
255: fun output_automaton (outstr,au,symbols) =
256: let val out = output outstr
257: in
258: output_finals (out,au);
259: output_goto (out,au);
260: out "val go_f = get_finals o go\n";
261: out "fun childsymbol s = ARC s\n";
262: out "val initialstate = 0\n";
263: out "type state = int\n"
264: end
265:
266: fun build_automaton (outstr,symbols,rules) =
267: output_automaton(outstr,construct_automaton rules,symbols)
268:
269: end;
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.