Annotation of researchv10no/cmd/sml/lib/twig/automata.sml, revision 1.1

1.1     ! root        1: 
        !             2: (* April 1989, Jussi Rintanen, Helsinki University of Technology *)
        !             3: 
        !             4: (* The tree pattern matching automata builder
        !             5: 
        !             6:   This is the final version of the machine builder of ML-Twig.
        !             7:   The machine builder takes as input a list of rules, constructs
        !             8:   a finite state automaton directly from the tree patterns. In the prototype
        !             9:   version, a string was constructed and returned, but the complexity of
        !            10:   constructing large strings from small constituent strings by catenating
        !            11:   them is not very good, so we have a separate function for printing
        !            12:   the automaton.
        !            13: 
        !            14: *)
        !            15: 
        !            16: signature AUTOMATA =
        !            17:   sig
        !            18:     exception AutomatonError of string
        !            19:     structure Parser : PARSER
        !            20:     val build_automaton: outstream * Parser.symbol list * Parser.rule list -> unit
        !            21: end;
        !            22: 
        !            23: (* This is the second version of ML-Twig Automata Builder.
        !            24: We note, that the first version was purely functional (without side-effects)
        !            25: and was based on a general structure constructing Aho-Corasick automata
        !            26: for string matching. However, the first version was considered too complex
        !            27: and inefficient, and we decided to rewrite it from scratch.
        !            28:   This revised version constructs a trie, which is represented by an array.
        !            29: The trie is built directly from tree patterns, and explicit construction
        !            30: of path strings is avoided.
        !            31: For detailed description we refer to [Aho,Corasick] and [Hoffmann,O'Donnell].
        !            32: *)
        !            33: 
        !            34: functor MAKEautomata (structure Parser : PARSER): AUTOMATA =
        !            35:   struct
        !            36:     exception AutomatonError of string
        !            37: 
        !            38:     fun fatal s = raise AutomatonError s
        !            39:       
        !            40: (* This structure represents an abstract trie with extensions for
        !            41:    string pattern matching automaton construction. We have assumed,
        !            42:    that the implementation has side-effects, and for efficiency
        !            43:    an array is used. *)
        !            44:       
        !            45:     structure Implementation :
        !            46:       sig
        !            47:        structure Parser : PARSER
        !            48:        datatype alpha = Sym of Parser.symbol | Child of int
        !            49:        type automaton
        !            50:        val empty_automaton : unit -> automaton
        !            51:        val add_arc : automaton * int * alpha -> automaton * int
        !            52:        val add_finals : automaton * int * (int * int * Parser.symbol) list -> automaton
        !            53:        val set_failure : automaton * int * int -> automaton
        !            54:        val get_failure : automaton * int -> int
        !            55:        val get_finals : automaton * int -> (int * int * Parser.symbol) list
        !            56:        val get_transitions : automaton * int -> (alpha * int) list
        !            57:        val last_state : automaton -> int
        !            58:       end
        !            59:     =
        !            60:     struct
        !            61:       structure Parser : PARSER = Parser
        !            62:       open Parser
        !            63:       datatype alpha = Sym of symbol | Child of int
        !            64:       type state = ((int * int * symbol) list * (alpha * int) list * int)
        !            65:       type automaton = state array * int * int
        !            66: 
        !            67:       fun empty_automaton () = (array (400, ([],[],0)), 400, 1)
        !            68: 
        !            69:       fun add_arc (trie as (a,b,c),i,iota) =
        !            70:        let val (fs,ts,f) = a sub i
        !            71:          val rec go = (fn nil => ~1 | ((on,to) :: t) =>
        !            72:                        if iota = on then to else go t)
        !            73:          val destination = go ts
        !            74:        in
        !            75:          if destination <> ~1
        !            76:            then (trie,destination)
        !            77:          else
        !            78:            if b = c
        !            79:              then
        !            80:                let val newsize = b*3 div 2
        !            81:                  val newa = array (newsize, ([],[],0))
        !            82:                  val rec copya = fn 0 => update(newa,0,a sub 0)
        !            83:                                   | n => (update(newa,n,a sub n); copya (n-1))
        !            84:                in
        !            85:                  copya (b-1); ((newa,newsize,c+1),c)
        !            86:                end
        !            87:            else (update(a,i,(fs,(iota,c)::ts,f));((a,b,c+1),c))
        !            88:        end
        !            89:          
        !            90:       fun set_failure (trie as (a,b,c),i,f) =
        !            91:        let val (fs,ts,f') = a sub i
        !            92:        in
        !            93:          (update (a,i,(fs,ts,f));
        !            94:           trie)
        !            95:        end
        !            96:       
        !            97:       fun add_finals (trie as (a,b,c),i,f) =
        !            98:        let val (fs,ts,s) = a sub i
        !            99:        in
        !           100:          (update (a,i,(f@fs,ts,s));
        !           101:           trie)
        !           102:        end
        !           103:       
        !           104:       fun get_finals ((a,b,c),i) = let val (fs,ts,s) = a sub i in fs end
        !           105:       fun get_failure ((a,b,c),i) = let val (fs,ts,s) = a sub i in s end
        !           106:       fun get_transitions ((a,b,c),i) = let val (fs,ts,s) = a sub i in ts end
        !           107:       fun last_state (a,b,c) = c-1
        !           108: 
        !           109:     end
        !           110: 
        !           111:     structure Parser = Parser
        !           112: 
        !           113:     open Implementation Parser
        !           114: 
        !           115:     val int2str : int -> string = makestring
        !           116: 
        !           117:     val accum = revfold
        !           118: 
        !           119: (* This function traverses a tree pattern and concurrently adds arcs
        !           120:    to the trie representation of a tree pattern matching automaton. *)
        !           121: 
        !           122:     fun add_pattern (autom, rule1, nont, Leaf n, state, len) =
        !           123:       let val (autom', state') = add_arc (autom, state, Sym n)
        !           124:       in add_finals (autom', state', [(len,rule1,nont)]) end
        !           125:       | add_pattern (autom, rule1, nont, Tree (n, cs), state, len) =
        !           126:        let val (autom', state') = add_arc (autom, state, Sym n)
        !           127:          val (autom'''',_) =
        !           128:            accum
        !           129:            (fn (c,(autom'', cn)) =>
        !           130:               let val (autom''', state'') = add_arc (autom'', state', Child cn)
        !           131:               in
        !           132:                 (add_pattern (autom''', rule1, nont, c, state'', len+1), cn + 1)
        !           133:               end)
        !           134:               cs (autom', 1)
        !           135:        in
        !           136:          autom''''
        !           137:        end
        !           138: 
        !           139:     fun go (au, s, i) =
        !           140:       let
        !           141:        val ts = get_transitions(au, s)
        !           142:        val rec g = fn nil => ~1 | ((p,q)::t) => if p=i then q else g t
        !           143:       in g ts
        !           144:       end
        !           145: 
        !           146:     fun oflevel1 au = let val ts = get_transitions(au, 0)
        !           147:                      in map (fn (p,q) => q) ts
        !           148:                      end
        !           149:                        
        !           150:     fun iterate (au, nil, nil) = au
        !           151:       | iterate (au, nil, next) = iterate (au,next,nil)
        !           152:       | iterate (au, h::t, next) =
        !           153:        let val f = get_failure (au, h)
        !           154:          val ts = get_transitions (au, h)
        !           155:          val au' = accum (fn ((i,s),aut) =>
        !           156:                           let val rec fail = fn state =>
        !           157:                             if go (aut,state,i) <> ~1
        !           158:                               then go (aut,state,i)
        !           159:                             else if state=0
        !           160:                                    then 0
        !           161:                                  else fail (get_failure (aut, state))
        !           162:                           in
        !           163:                             add_finals(set_failure (aut, s, fail f),
        !           164:                                        s,
        !           165:                                        get_finals (aut, fail f))
        !           166:                           end
        !           167:                         ) ts au
        !           168:        in
        !           169:          iterate (au',t,(map (fn (p,q) => q) ts) @ next)
        !           170:        end
        !           171: 
        !           172:     fun construct_failure au = iterate (au, oflevel1 au, [])
        !           173:       
        !           174:     fun construct_automaton rules =
        !           175:       let val t1 = (* Trie & final state values *)
        !           176:        accum
        !           177:        (fn (Rule(n,_,r,p,_,_),a) => add_pattern (a,n,r,p,0,1))
        !           178:           rules
        !           179:           (empty_automaton ())
        !           180:       in
        !           181:        construct_failure t1 (* Failure & final state values *)
        !           182:       end
        !           183: 
        !           184:     fun symbol2str (Label s) = "__"^s
        !           185:       | symbol2str (Node (s,_)) = s
        !           186: 
        !           187:     fun arc2str (Sym s) = symbol2str s
        !           188:       | arc2str (Child n) = "(ARC "^(int2str n)^")"
        !           189: 
        !           190:     fun output_symbols (out,symbols) =
        !           191:       (out "datatype symbols = ARC of int";
        !           192:        map
        !           193:        (fn s => out (" | "^(symbol2str s)))
        !           194:          symbols;
        !           195:          out "\n")
        !           196: 
        !           197:     fun output_finals' (out,au,n) =
        !           198:       if n <= last_state au
        !           199:        then
        !           200:          let val finals = get_finals (au,n)
        !           201:            fun outfinal (i,j,s) =
        !           202:              (out "(";
        !           203:               out (int2str i);
        !           204:               out ",";
        !           205:               out (int2str j);
        !           206:               out ",";
        !           207:               out (symbol2str s);
        !           208:               out ")")
        !           209:          in
        !           210:            out (int2str n);
        !           211:            out " => [";
        !           212:              case finals of
        !           213:                nil => ()
        !           214:              | [h] => outfinal h
        !           215:              | (h::t) => (outfinal h;app (fn h => (out",";outfinal h)) t);
        !           216:            out "]\n  | ";
        !           217:            output_finals' (out,au,n+1)
        !           218:          end
        !           219:       else ()
        !           220: 
        !           221:     fun output_finals (out,au) =
        !           222:       (out "fun get_finals s =\n";
        !           223:        out "  case s of\n";
        !           224:        output_finals' (out,au,0);
        !           225:        out "_ => nil\n\n")
        !           226: 
        !           227:     fun output_goto' (out,au,n) =
        !           228:       if n <= last_state au
        !           229:        then
        !           230:          let val transitions = get_transitions (au,n)
        !           231:          in
        !           232:            out (int2str n);
        !           233:            out " => (case a of ";
        !           234:            app
        !           235:            (fn (i,s) =>
        !           236:               (out (arc2str i);
        !           237:               out " => ";
        !           238:               out (int2str s);
        !           239:               out " | "))
        !           240:               transitions;
        !           241:             out " _ => ";
        !           242:             if n = 0 then (out "0")
        !           243:             else (out "go ("; out (int2str (get_failure (au,n))); out ",a)");
        !           244:            out ")\n  | ";
        !           245:            output_goto' (out,au,n+1)
        !           246:          end
        !           247:       else ()
        !           248: 
        !           249:     fun output_goto (out,au)  =
        !           250:       (out "fun go (s,a) =\n";
        !           251:        out "  case s of\n";
        !           252:        output_goto' (out,au,0);
        !           253:        out "_ => 0\n\n")
        !           254: 
        !           255:     fun output_automaton (outstr,au,symbols) =
        !           256:        let val out = output outstr
        !           257:        in
        !           258:         output_finals (out,au);
        !           259:         output_goto (out,au);
        !           260:         out "val go_f = get_finals o go\n";
        !           261:         out "fun childsymbol s = ARC s\n";
        !           262:         out "val initialstate = 0\n";
        !           263:         out "type state = int\n"
        !           264:        end
        !           265:        
        !           266:      fun build_automaton (outstr,symbols,rules) =
        !           267:        output_automaton(outstr,construct_automaton rules,symbols)
        !           268: 
        !           269:   end;

unix.superglobalmegacorp.com

This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.