1-- -----------------------------------------------------------------------------
2--
3-- DFA.hs, part of Alex
4--
5-- (c) Chris Dornan 1995-2000, Simon Marlow 2003
6--
7-- This module generates a DFA from a scanner by first converting it
8-- to an NFA and then converting the NFA with the subset construction.
9--
10-- See the chapter on `Finite Automata and Lexical Analysis' in the
11-- dragon book for an excellent overview of the algorithms in this
12-- module.
13--
14-- ----------------------------------------------------------------------------}
15
16module DFA(scanner2dfa) where
17
18import AbsSyn
19import qualified Map
20import qualified Data.IntMap as IntMap
21import NFA
22import Sort ( msort, nub' )
23import CharSet
24
25import Data.Array ( (!) )
26import Data.Maybe ( fromJust )
27
28{-                        Defined in the Scan Module
29
30-- (This section should logically belong to the DFA module but it has been
31-- placed here to make this module self-contained.)
32--
33-- `DFA' provides an alternative to `Scanner' (described in the RExp module);
34-- it can be used directly to scan text efficiently.  Additionally it has an
35-- extra place holder for holding action functions for generating
36-- application-specific tokens.  When this place holder is not being used, the
37-- unit type will be used.
38--
39-- Each state in the automaton consist of a list of `Accept' values, descending
40-- in priority, and an array mapping characters to new states.  As the array
41-- may only cover a sub-range of the characters, a default state number is
42-- given in the third field.  By convention, all transitions to the -1 state
43-- represent invalid transitions.
44--
45-- A list of accept states is provided for as the original specification may
46-- have been ambiguous, in which case the highest priority token should be
47-- taken (the one appearing earliest in the specification); this can not be
48-- calculated when the DFA is generated in all cases as some of the tokens may
49-- be associated with leading or trailing context or start codes.
50--
51-- `scan_token' (see above) can deal with unconditional accept states more
52-- efficiently than those associated with context; to save it testing each time
53-- whether the list of accept states contains an unconditional state, the flag
54-- in the first field of `St' is set to true whenever the list contains an
55-- unconditional state.
56--
57-- The `Accept' structure contains the priority of the token being accepted
58-- (lower numbers => higher priorities), the name of the token, a place holder
59-- that can be used for storing the `action' function for constructing the
60-- token from the input text and thge scanner's state, a list of start codes
61-- (listing the start codes that the scanner must be in for the token to be
62-- accepted; empty => no restriction), the leading and trailing context (both
63-- `Nothing' if there is none).
64--
65-- The leading context consists simply of a character predicate that will
66-- return true if the last character read is acceptable.  The trailing context
67-- consists of an alternative starting state within the DFA; if this `sub-dfa'
68-- turns up any accepting state when applied to the residual input then the
69-- trailing context is acceptable (see `scan_token' above).
70
71type DFA a = Array SNum (State a)
72
73type SNum = Int
74
75data State a = St Bool [Accept a] SNum (Array Char SNum)
76
77data Accept a = Acc Int String a [StartCode] (MB(Char->Bool)) (MB SNum)
78
79type StartCode = Int
80-}
81
82
83-- Scanners are converted to DFAs by converting them to NFAs first.  Converting
84-- an NFA to a DFA works by identifying the states of the DFA with subsets of
85-- the NFA.  The PartDFA is used to construct the DFA; it is essentially a DFA
86-- in which the states are represented directly by state sets of the NFA.
87-- `nfa2pdfa' constructs the partial DFA from the NFA by searching for all the
88-- transitions from a given list of state sets, initially containing the start
89-- state of the partial DFA, until all possible state sets have been considered
90-- The final DFA is then constructed with a `mk_dfa'.
91
92scanner2dfa:: Encoding -> Scanner -> [StartCode] -> DFA SNum Code
93scanner2dfa enc scanner scs = nfa2dfa scs (scanner2nfa enc scanner scs)
94
95nfa2dfa:: [StartCode] -> NFA -> DFA SNum Code
96nfa2dfa scs nfa = mk_int_dfa nfa (nfa2pdfa nfa pdfa (dfa_start_states pdfa))
97        where
98        pdfa = new_pdfa n_starts nfa
99        n_starts = length scs  -- number of start states
100
101-- `nfa2pdfa' works by taking the next outstanding state set to be considered
102-- and and ignoring it if the state is already in the partial DFA, otherwise
103-- generating all possible transitions from it, adding the new state to the
104-- partial DFA and continuing the closure with the extra states.  Note the way
105-- it incorporates the trailing context references into the search (by
106-- including `rctx_ss' in the search).
107
108nfa2pdfa:: NFA -> DFA StateSet Code -> [StateSet] -> DFA StateSet Code
109nfa2pdfa _   pdfa [] = pdfa
110nfa2pdfa nfa pdfa (ss:umkd)
111  |  ss `in_pdfa` pdfa =  nfa2pdfa nfa pdfa  umkd
112  |  otherwise         =  nfa2pdfa nfa pdfa' umkd'
113  where
114        pdfa' = add_pdfa ss (State accs (IntMap.fromList ss_outs)) pdfa
115
116        umkd' = rctx_sss ++ map snd ss_outs ++ umkd
117
118        -- for each character, the set of states that character would take
119        -- us to from the current set of states in the NFA.
120        ss_outs :: [(Int, StateSet)]
121        ss_outs = [ (fromIntegral ch, mk_ss nfa ss')
122                  | ch  <- byteSetElems $ setUnions [p | (p,_) <- outs],
123                    let ss'  = [ s' | (p,s') <- outs, byteSetElem p ch ],
124                    not (null ss')
125                  ]
126
127        rctx_sss = [ mk_ss nfa [s]
128                   | Acc _ _ _ (RightContextRExp s) <- accs ]
129
130        outs :: [(ByteSet,SNum)]
131        outs =  [ out | s <- ss, out <- nst_outs (nfa!s) ]
132
133        accs = sort_accs [acc| s<-ss, acc<-nst_accs (nfa!s)]
134
135-- `sort_accs' sorts a list of accept values into decending order of priority,
136-- eliminating any elements that follow an unconditional accept value.
137
138sort_accs:: [Accept a] -> [Accept a]
139sort_accs accs = foldr chk [] (msort le accs)
140        where
141        chk acc@(Acc _ _ Nothing NoRightContext) _   = [acc]
142        chk acc                                  rst = acc:rst
143
144        le (Acc{accPrio = n}) (Acc{accPrio=n'}) = n<=n'
145
146
147
148{------------------------------------------------------------------------------
149                          State Sets and Partial DFAs
150------------------------------------------------------------------------------}
151
152
153
154-- A `PartDFA' is a partially constructed DFA in which the states are
155-- represented by sets of states of the original NFA.  It is represented by a
156-- triple consisting of the start state of the partial DFA, the NFA from which
157-- it is derived and a map from state sets to states of the partial DFA.  The
158-- state set for a given list of NFA states is calculated by taking the epsilon
159-- closure of all the states, sorting the result with duplicates eliminated.
160
161type StateSet = [SNum]
162
163new_pdfa:: Int -> NFA -> DFA StateSet a
164new_pdfa starts nfa
165 = DFA { dfa_start_states = start_ss,
166         dfa_states = Map.empty
167       }
168 where
169        start_ss = [ msort (<=) (nst_cl(nfa!n)) | n <- [0..(starts-1)]]
170
171 -- starts is the number of start states
172
173-- constructs the epsilon-closure of a set of NFA states
174mk_ss:: NFA -> [SNum] -> StateSet
175mk_ss nfa l = nub' (<=) [s'| s<-l, s'<-nst_cl(nfa!s)]
176
177add_pdfa:: StateSet -> State StateSet a -> DFA StateSet a -> DFA StateSet a
178add_pdfa ss pst (DFA st mp) = DFA st (Map.insert ss pst mp)
179
180in_pdfa:: StateSet -> DFA StateSet a -> Bool
181in_pdfa ss (DFA _ mp) = ss `Map.member` mp
182
183-- Construct a DFA with numbered states, from a DFA whose states are
184-- sets of states from the original NFA.
185
186mk_int_dfa:: NFA -> DFA StateSet a -> DFA SNum a
187mk_int_dfa nfa (DFA start_states mp)
188  = DFA [0 .. length start_states-1]
189        (Map.fromList [ (lookup' st, cnv pds) | (st, pds) <- Map.toAscList mp ])
190  where
191        mp' = Map.fromList (zip (start_states ++
192                                 (map fst . Map.toAscList) (foldr Map.delete mp start_states)) [0..])
193
194        lookup' = fromJust . flip Map.lookup mp'
195
196        cnv :: State StateSet a -> State SNum a
197        cnv (State accs as) = State accs' as'
198                where
199                as'   = IntMap.mapWithKey (\_ch s -> lookup' s) as
200
201                accs' = map cnv_acc accs
202                cnv_acc (Acc p a lctx rctx) = Acc p a lctx rctx'
203                  where rctx' =
204                          case rctx of
205                                RightContextRExp s ->
206                                  RightContextRExp (lookup' (mk_ss nfa [s]))
207                                other -> other
208
209{-
210
211-- `mk_st' constructs a state node from the list of accept values and a list of
212-- transitions.  The transitions list all the valid transitions out of the
213-- node; all invalid transitions should be represented in the array by state
214-- -1.  `mk_st' has to work out whether the accept states contain an
215-- unconditional entry, in which case the first field of `St' should be true,
216-- and which default state to use in constructing the array (the array may span
217-- a sub-range of the character set, the state number given the third argument
218-- of `St' being taken as the default if an input character lies outside the
219-- range).  The default values is chosen to minimise the bounds of the array
220-- and so there are two candidates: the value that 0 maps to (in which case
221-- some initial segment of the array may be omitted) or the value that 255 maps
222-- to (in which case a final segment of the array may be omitted), hence the
223-- calculation of `(df,bds)'.
224--
225-- Note that empty arrays are avoided as they can cause severe problems for
226-- some popular Haskell compilers.
227
228mk_st:: [Accept Code] -> [(Char,Int)] -> State Code
229mk_st accs as =
230        if null as
231           then St accs (-1) (listArray ('0','0') [-1])
232           else St accs df (listArray bds [arr!c| c<-range bds])
233        where
234        bds = if sz==0 then ('0','0') else bds0
235
236        (sz,df,bds0) | sz1 < sz2 = (sz1,df1,bds1)
237                     | otherwise = (sz2,df2,bds2)
238
239        (sz1,df1,bds1) = mk_bds(arr!chr 0)
240        (sz2,df2,bds2) = mk_bds(arr!chr 255)
241
242        mk_bds df = (t-b, df, (chr b, chr (255-t)))
243                where
244                b = length (takeWhile id [arr!c==df| c<-['\0'..'\xff']])
245                t = length (takeWhile id [arr!c==df| c<-['\xff','\xfe'..'\0']])
246
247        arr = listArray ('\0','\xff') (take 256 (repeat (-1))) // as
248-}
249