1-- -----------------------------------------------------------------------------
2--
3-- NFA.hs, part of Alex
4--
5-- (c) Chris Dornan 1995-2000, Simon Marlow 2003
6--
7-- The `scanner2nfa' takes a `Scanner' (see the `RExp' module) and
8-- generates its equivelent nondeterministic finite automaton.  NFAs
9-- are turned into DFAs in the DFA module.
10--
11-- See the chapter on `Finite Automata and Lexical Analysis' in the
12-- dragon book for an excellent overview of the algorithms in this
13-- module.
14--
15-- ----------------------------------------------------------------------------}
16
17module NFA where
18
19import AbsSyn
20import CharSet
21import DFS ( t_close, out )
22import Map ( Map )
23import qualified Map hiding ( Map )
24import Util ( str, space )
25
26#if __GLASGOW_HASKELL__ < 710
27import Control.Applicative ( Applicative(..) )
28#endif
29import Control.Monad ( forM_, zipWithM, zipWithM_, when, liftM, ap )
30import Data.Array ( Array, (!), array, listArray, assocs, bounds )
31
32-- Each state of a nondeterministic automaton contains a list of `Accept'
33-- values, a list of epsilon transitions (an epsilon transition represents a
34-- transition to another state that can be made without reading a character)
35-- and a list of transitions qualified with a character predicate (the
36-- transition can only be made to the given state on input of a character
37-- permitted by the predicate).  Although a list of `Accept' values is provided
38-- for, in actual fact each state will have zero or one of them (the `Maybe'
39-- type is not used because the flexibility offered by the list representation
40-- is useful).
41
42type NFA = Array SNum NState
43
44data NState = NSt {
45 nst_accs :: [Accept Code],
46 nst_cl   :: [SNum],
47 nst_outs :: [(ByteSet,SNum)]
48 }
49
50-- Debug stuff
51instance Show NState where
52  showsPrec _ (NSt accs cl outs) =
53    str "NSt " . shows accs . space . shows cl . space .
54        shows [ (c, s) | (c,s) <- outs ]
55
56{-                           From the Scan Module
57
58-- The `Accept' structure contains the priority of the token being accepted
59-- (lower numbers => higher priorities), the name of the token, a place holder
60-- that can be used for storing the `action' function, a list of start codes
61-- (listing the start codes that the scanner must be in for the token to be
62-- accepted; empty => no restriction), the leading and trailing context (both
63-- `Nothing' if there is none).
64--
65-- The leading context consists simply of a character predicate that will
66-- return true if the last character read is acceptable.  The trailing context
67-- consists of an alternative starting state within the DFA; if this `sub-dfa'
68-- turns up any accepting state when applied to the residual input then the
69-- trailing context is acceptable.
70-}
71
72
73-- `scanner2nfa' takes a scanner (see the AbsSyn module) and converts it to an
74-- NFA, using the NFA creation monad (see below).
75--
76-- We generate a start state for each startcode, with the same number
77-- as that startcode, and epsilon transitions from this state to each
78-- of the sub-NFAs for each of the tokens acceptable in that startcode.
79
80scanner2nfa:: Encoding -> Scanner -> [StartCode] -> NFA
81scanner2nfa enc Scanner{scannerTokens = toks} startcodes
82   = runNFA enc $
83        do
84          -- make a start state for each start code (these will be
85          -- numbered from zero).
86          start_states <- sequence (replicate (length startcodes) newState)
87
88          -- construct the NFA for each token
89          tok_states <- zipWithM do_token toks [0..]
90
91          -- make an epsilon edge from each state state to each
92          -- token that is acceptable in that state
93          zipWithM_ (tok_transitions (zip toks tok_states))
94                startcodes start_states
95
96        where
97          do_token (RECtx _scs lctx re rctx code) prio = do
98                b <- newState
99                e <- newState
100                rexp2nfa b e re
101
102                rctx_e <- case rctx of
103                                  NoRightContext ->
104                                        return NoRightContext
105                                  RightContextCode code' ->
106                                        return (RightContextCode code')
107                                  RightContextRExp re' -> do
108                                        r_b <- newState
109                                        r_e <- newState
110                                        rexp2nfa r_b r_e re'
111                                        accept r_e rctxt_accept
112                                        return (RightContextRExp r_b)
113
114                let lctx' = case lctx of
115                                  Nothing -> Nothing
116                                  Just st -> Just st
117
118                accept e (Acc prio code lctx' rctx_e)
119                return b
120
121          tok_transitions toks_with_states start_code start_state = do
122                let states = [ s | (RECtx scs _ _ _ _, s) <- toks_with_states,
123                                   null scs || start_code `elem` map snd scs ]
124                mapM_ (epsilonEdge start_state) states
125
126-- -----------------------------------------------------------------------------
127-- NFA creation from a regular expression
128
129-- rexp2nfa B E R generates an NFA that begins in state B, recognises
130-- R, and ends in state E only if R has been recognised.
131
132rexp2nfa :: SNum -> SNum -> RExp -> NFAM ()
133rexp2nfa b e Eps    = epsilonEdge b e
134rexp2nfa b e (Ch p) = charEdge b p e
135rexp2nfa b e (re1 :%% re2) = do
136  s <- newState
137  rexp2nfa b s re1
138  rexp2nfa s e re2
139rexp2nfa b e (re1 :| re2) = do
140  rexp2nfa b e re1
141  rexp2nfa b e re2
142rexp2nfa b e (Star re) = do
143  s <- newState
144  epsilonEdge b s
145  rexp2nfa s s re
146  epsilonEdge s e
147rexp2nfa b e (Plus re) = do
148  s1 <- newState
149  s2 <- newState
150  rexp2nfa s1 s2 re
151  epsilonEdge b s1
152  epsilonEdge s2 s1
153  epsilonEdge s2 e
154rexp2nfa b e (Ques re) = do
155  rexp2nfa b e re
156  epsilonEdge b e
157
158-- -----------------------------------------------------------------------------
159-- NFA creation monad.
160
161-- Partial credit to Thomas Hallgren for this code, as I adapted it from
162-- his "Lexing Haskell in Haskell" lexer generator.
163
164type MapNFA = Map SNum NState
165
166newtype NFAM a = N {unN :: SNum -> MapNFA -> Encoding -> (SNum, MapNFA, a)}
167
168instance Functor NFAM where
169  fmap = liftM
170
171instance Applicative NFAM where
172  pure a = N $ \s n _ -> (s,n,a)
173  (<*>) = ap
174
175instance Monad NFAM where
176  return = pure
177
178  m >>= k  = N $ \s n e -> case unN m s n e of
179                                 (s', n', a) -> unN (k a) s' n' e
180
181runNFA :: Encoding -> NFAM () -> NFA
182runNFA e m = case unN m 0 Map.empty e of
183                (s, nfa_map, ()) -> -- trace ("runNfa.." ++ show (Map.toAscList nfa_map)) $
184                                    e_close (array (0,s-1) (Map.toAscList nfa_map))
185
186e_close:: Array Int NState -> NFA
187e_close ar = listArray bds
188                [NSt accs (out gr v) outs|(v,NSt accs _ outs)<-assocs ar]
189        where
190        gr = t_close (hi+1,\v->nst_cl (ar!v))
191        bds@(_,hi) = bounds ar
192
193newState :: NFAM SNum
194newState = N $ \s n _ -> (s+1,n,s)
195
196getEncoding :: NFAM Encoding
197getEncoding = N $ \s n e -> (s,n,e)
198
199anyBytes :: SNum -> Int -> SNum -> NFAM ()
200anyBytes from 0 to = epsilonEdge from to
201anyBytes from n to = do
202        s <- newState
203        byteEdge from (byteSetRange 0 0xff) s
204        anyBytes s (n-1) to
205
206bytesEdge :: SNum -> [Byte] -> [Byte] -> SNum -> NFAM ()
207bytesEdge from [] [] to = epsilonEdge from to
208bytesEdge from [x] [y] to = byteEdge from (byteSetRange x y) to -- (OPTIMISATION)
209bytesEdge from (x:xs) (y:ys) to
210    | x == y = do
211        s <- newState
212        byteEdge from (byteSetSingleton x) s
213        bytesEdge s xs ys to
214    | x < y = do
215        do s <- newState
216           byteEdge from (byteSetSingleton x) s
217           bytesEdge s xs (fmap (const 0xff) ys) to
218
219        do t <- newState
220           byteEdge from (byteSetSingleton y) t
221           bytesEdge t (fmap (const 0x00) xs) ys to
222
223        when ((x+1) <= (y-1)) $ do
224           u <- newState
225           byteEdge from (byteSetRange (x+1) (y-1)) u
226           anyBytes u (length xs) to
227bytesEdge _ _ _ _ = undefined -- hide compiler warning
228
229charEdge :: SNum -> CharSet -> SNum -> NFAM ()
230charEdge from charset to = do
231  -- trace ("charEdge: " ++ (show $ charset) ++ " => " ++ show (byteRanges charset)) $
232  e <- getEncoding
233  forM_ (byteRanges e charset) $ \(xs,ys) -> do
234    bytesEdge from xs ys to
235
236
237
238byteEdge :: SNum -> ByteSet -> SNum -> NFAM ()
239byteEdge from charset to = N $ \s n _ -> (s, addEdge n, ())
240 where
241   addEdge n =
242     case Map.lookup from n of
243       Nothing ->
244           Map.insert from (NSt [] [] [(charset,to)]) n
245       Just (NSt acc eps trans) ->
246           Map.insert from (NSt acc eps ((charset,to):trans)) n
247
248epsilonEdge :: SNum -> SNum -> NFAM ()
249epsilonEdge from to
250 | from == to = return ()
251 | otherwise  = N $ \s n _ -> let n' = addEdge n in n' `seq` (s, n', ())
252 where
253   addEdge n =
254     case Map.lookup from n of
255       Nothing                  -> Map.insert from (NSt [] [to] []) n
256       Just (NSt acc eps trans) -> Map.insert from (NSt acc (to:eps) trans) n
257
258accept :: SNum -> Accept Code -> NFAM ()
259accept state new_acc = N $ \s n _ -> (s, addAccept n, ())
260 where
261   addAccept n =
262     case Map.lookup state n of
263       Nothing ->
264           Map.insert state (NSt [new_acc] [] []) n
265       Just (NSt acc eps trans) ->
266           Map.insert state (NSt (new_acc:acc) eps trans) n
267
268
269rctxt_accept :: Accept Code
270rctxt_accept = Acc 0 Nothing Nothing NoRightContext
271