1-- ----------------------------------------------------------------------------- 2-- 3-- DFA.hs, part of Alex 4-- 5-- (c) Chris Dornan 1995-2000, Simon Marlow 2003 6-- 7-- This module generates a DFA from a scanner by first converting it 8-- to an NFA and then converting the NFA with the subset construction. 9-- 10-- See the chapter on `Finite Automata and Lexical Analysis' in the 11-- dragon book for an excellent overview of the algorithms in this 12-- module. 13-- 14-- ----------------------------------------------------------------------------} 15 16module DFA(scanner2dfa) where 17 18import AbsSyn 19import qualified Map 20import qualified Data.IntMap as IntMap 21import NFA 22import Sort ( msort, nub' ) 23import CharSet 24 25import Data.Array ( (!) ) 26import Data.Maybe ( fromJust ) 27 28{- Defined in the Scan Module 29 30-- (This section should logically belong to the DFA module but it has been 31-- placed here to make this module self-contained.) 32-- 33-- `DFA' provides an alternative to `Scanner' (described in the RExp module); 34-- it can be used directly to scan text efficiently. Additionally it has an 35-- extra place holder for holding action functions for generating 36-- application-specific tokens. When this place holder is not being used, the 37-- unit type will be used. 38-- 39-- Each state in the automaton consist of a list of `Accept' values, descending 40-- in priority, and an array mapping characters to new states. As the array 41-- may only cover a sub-range of the characters, a default state number is 42-- given in the third field. By convention, all transitions to the -1 state 43-- represent invalid transitions. 44-- 45-- A list of accept states is provided for as the original specification may 46-- have been ambiguous, in which case the highest priority token should be 47-- taken (the one appearing earliest in the specification); this can not be 48-- calculated when the DFA is generated in all cases as some of the tokens may 49-- be associated with leading or trailing context or start codes. 50-- 51-- `scan_token' (see above) can deal with unconditional accept states more 52-- efficiently than those associated with context; to save it testing each time 53-- whether the list of accept states contains an unconditional state, the flag 54-- in the first field of `St' is set to true whenever the list contains an 55-- unconditional state. 56-- 57-- The `Accept' structure contains the priority of the token being accepted 58-- (lower numbers => higher priorities), the name of the token, a place holder 59-- that can be used for storing the `action' function for constructing the 60-- token from the input text and thge scanner's state, a list of start codes 61-- (listing the start codes that the scanner must be in for the token to be 62-- accepted; empty => no restriction), the leading and trailing context (both 63-- `Nothing' if there is none). 64-- 65-- The leading context consists simply of a character predicate that will 66-- return true if the last character read is acceptable. The trailing context 67-- consists of an alternative starting state within the DFA; if this `sub-dfa' 68-- turns up any accepting state when applied to the residual input then the 69-- trailing context is acceptable (see `scan_token' above). 70 71type DFA a = Array SNum (State a) 72 73type SNum = Int 74 75data State a = St Bool [Accept a] SNum (Array Char SNum) 76 77data Accept a = Acc Int String a [StartCode] (MB(Char->Bool)) (MB SNum) 78 79type StartCode = Int 80-} 81 82 83-- Scanners are converted to DFAs by converting them to NFAs first. Converting 84-- an NFA to a DFA works by identifying the states of the DFA with subsets of 85-- the NFA. The PartDFA is used to construct the DFA; it is essentially a DFA 86-- in which the states are represented directly by state sets of the NFA. 87-- `nfa2pdfa' constructs the partial DFA from the NFA by searching for all the 88-- transitions from a given list of state sets, initially containing the start 89-- state of the partial DFA, until all possible state sets have been considered 90-- The final DFA is then constructed with a `mk_dfa'. 91 92scanner2dfa:: Encoding -> Scanner -> [StartCode] -> DFA SNum Code 93scanner2dfa enc scanner scs = nfa2dfa scs (scanner2nfa enc scanner scs) 94 95nfa2dfa:: [StartCode] -> NFA -> DFA SNum Code 96nfa2dfa scs nfa = mk_int_dfa nfa (nfa2pdfa nfa pdfa (dfa_start_states pdfa)) 97 where 98 pdfa = new_pdfa n_starts nfa 99 n_starts = length scs -- number of start states 100 101-- `nfa2pdfa' works by taking the next outstanding state set to be considered 102-- and and ignoring it if the state is already in the partial DFA, otherwise 103-- generating all possible transitions from it, adding the new state to the 104-- partial DFA and continuing the closure with the extra states. Note the way 105-- it incorporates the trailing context references into the search (by 106-- including `rctx_ss' in the search). 107 108nfa2pdfa:: NFA -> DFA StateSet Code -> [StateSet] -> DFA StateSet Code 109nfa2pdfa _ pdfa [] = pdfa 110nfa2pdfa nfa pdfa (ss:umkd) 111 | ss `in_pdfa` pdfa = nfa2pdfa nfa pdfa umkd 112 | otherwise = nfa2pdfa nfa pdfa' umkd' 113 where 114 pdfa' = add_pdfa ss (State accs (IntMap.fromList ss_outs)) pdfa 115 116 umkd' = rctx_sss ++ map snd ss_outs ++ umkd 117 118 -- for each character, the set of states that character would take 119 -- us to from the current set of states in the NFA. 120 ss_outs :: [(Int, StateSet)] 121 ss_outs = [ (fromIntegral ch, mk_ss nfa ss') 122 | ch <- byteSetElems $ setUnions [p | (p,_) <- outs], 123 let ss' = [ s' | (p,s') <- outs, byteSetElem p ch ], 124 not (null ss') 125 ] 126 127 rctx_sss = [ mk_ss nfa [s] 128 | Acc _ _ _ (RightContextRExp s) <- accs ] 129 130 outs :: [(ByteSet,SNum)] 131 outs = [ out | s <- ss, out <- nst_outs (nfa!s) ] 132 133 accs = sort_accs [acc| s<-ss, acc<-nst_accs (nfa!s)] 134 135-- `sort_accs' sorts a list of accept values into decending order of priority, 136-- eliminating any elements that follow an unconditional accept value. 137 138sort_accs:: [Accept a] -> [Accept a] 139sort_accs accs = foldr chk [] (msort le accs) 140 where 141 chk acc@(Acc _ _ Nothing NoRightContext) _ = [acc] 142 chk acc rst = acc:rst 143 144 le (Acc{accPrio = n}) (Acc{accPrio=n'}) = n<=n' 145 146 147 148{------------------------------------------------------------------------------ 149 State Sets and Partial DFAs 150------------------------------------------------------------------------------} 151 152 153 154-- A `PartDFA' is a partially constructed DFA in which the states are 155-- represented by sets of states of the original NFA. It is represented by a 156-- triple consisting of the start state of the partial DFA, the NFA from which 157-- it is derived and a map from state sets to states of the partial DFA. The 158-- state set for a given list of NFA states is calculated by taking the epsilon 159-- closure of all the states, sorting the result with duplicates eliminated. 160 161type StateSet = [SNum] 162 163new_pdfa:: Int -> NFA -> DFA StateSet a 164new_pdfa starts nfa 165 = DFA { dfa_start_states = start_ss, 166 dfa_states = Map.empty 167 } 168 where 169 start_ss = [ msort (<=) (nst_cl(nfa!n)) | n <- [0..(starts-1)]] 170 171 -- starts is the number of start states 172 173-- constructs the epsilon-closure of a set of NFA states 174mk_ss:: NFA -> [SNum] -> StateSet 175mk_ss nfa l = nub' (<=) [s'| s<-l, s'<-nst_cl(nfa!s)] 176 177add_pdfa:: StateSet -> State StateSet a -> DFA StateSet a -> DFA StateSet a 178add_pdfa ss pst (DFA st mp) = DFA st (Map.insert ss pst mp) 179 180in_pdfa:: StateSet -> DFA StateSet a -> Bool 181in_pdfa ss (DFA _ mp) = ss `Map.member` mp 182 183-- Construct a DFA with numbered states, from a DFA whose states are 184-- sets of states from the original NFA. 185 186mk_int_dfa:: NFA -> DFA StateSet a -> DFA SNum a 187mk_int_dfa nfa (DFA start_states mp) 188 = DFA [0 .. length start_states-1] 189 (Map.fromList [ (lookup' st, cnv pds) | (st, pds) <- Map.toAscList mp ]) 190 where 191 mp' = Map.fromList (zip (start_states ++ 192 (map fst . Map.toAscList) (foldr Map.delete mp start_states)) [0..]) 193 194 lookup' = fromJust . flip Map.lookup mp' 195 196 cnv :: State StateSet a -> State SNum a 197 cnv (State accs as) = State accs' as' 198 where 199 as' = IntMap.mapWithKey (\_ch s -> lookup' s) as 200 201 accs' = map cnv_acc accs 202 cnv_acc (Acc p a lctx rctx) = Acc p a lctx rctx' 203 where rctx' = 204 case rctx of 205 RightContextRExp s -> 206 RightContextRExp (lookup' (mk_ss nfa [s])) 207 other -> other 208 209{- 210 211-- `mk_st' constructs a state node from the list of accept values and a list of 212-- transitions. The transitions list all the valid transitions out of the 213-- node; all invalid transitions should be represented in the array by state 214-- -1. `mk_st' has to work out whether the accept states contain an 215-- unconditional entry, in which case the first field of `St' should be true, 216-- and which default state to use in constructing the array (the array may span 217-- a sub-range of the character set, the state number given the third argument 218-- of `St' being taken as the default if an input character lies outside the 219-- range). The default values is chosen to minimise the bounds of the array 220-- and so there are two candidates: the value that 0 maps to (in which case 221-- some initial segment of the array may be omitted) or the value that 255 maps 222-- to (in which case a final segment of the array may be omitted), hence the 223-- calculation of `(df,bds)'. 224-- 225-- Note that empty arrays are avoided as they can cause severe problems for 226-- some popular Haskell compilers. 227 228mk_st:: [Accept Code] -> [(Char,Int)] -> State Code 229mk_st accs as = 230 if null as 231 then St accs (-1) (listArray ('0','0') [-1]) 232 else St accs df (listArray bds [arr!c| c<-range bds]) 233 where 234 bds = if sz==0 then ('0','0') else bds0 235 236 (sz,df,bds0) | sz1 < sz2 = (sz1,df1,bds1) 237 | otherwise = (sz2,df2,bds2) 238 239 (sz1,df1,bds1) = mk_bds(arr!chr 0) 240 (sz2,df2,bds2) = mk_bds(arr!chr 255) 241 242 mk_bds df = (t-b, df, (chr b, chr (255-t))) 243 where 244 b = length (takeWhile id [arr!c==df| c<-['\0'..'\xff']]) 245 t = length (takeWhile id [arr!c==df| c<-['\xff','\xfe'..'\0']]) 246 247 arr = listArray ('\0','\xff') (take 256 (repeat (-1))) // as 248-} 249