1-- ----------------------------------------------------------------------------- 2-- 3-- NFA.hs, part of Alex 4-- 5-- (c) Chris Dornan 1995-2000, Simon Marlow 2003 6-- 7-- The `scanner2nfa' takes a `Scanner' (see the `RExp' module) and 8-- generates its equivelent nondeterministic finite automaton. NFAs 9-- are turned into DFAs in the DFA module. 10-- 11-- See the chapter on `Finite Automata and Lexical Analysis' in the 12-- dragon book for an excellent overview of the algorithms in this 13-- module. 14-- 15-- ----------------------------------------------------------------------------} 16 17module NFA where 18 19import AbsSyn 20import CharSet 21import DFS ( t_close, out ) 22import Map ( Map ) 23import qualified Map hiding ( Map ) 24import Util ( str, space ) 25 26#if __GLASGOW_HASKELL__ < 710 27import Control.Applicative ( Applicative(..) ) 28#endif 29import Control.Monad ( forM_, zipWithM, zipWithM_, when, liftM, ap ) 30import Data.Array ( Array, (!), array, listArray, assocs, bounds ) 31 32-- Each state of a nondeterministic automaton contains a list of `Accept' 33-- values, a list of epsilon transitions (an epsilon transition represents a 34-- transition to another state that can be made without reading a character) 35-- and a list of transitions qualified with a character predicate (the 36-- transition can only be made to the given state on input of a character 37-- permitted by the predicate). Although a list of `Accept' values is provided 38-- for, in actual fact each state will have zero or one of them (the `Maybe' 39-- type is not used because the flexibility offered by the list representation 40-- is useful). 41 42type NFA = Array SNum NState 43 44data NState = NSt { 45 nst_accs :: [Accept Code], 46 nst_cl :: [SNum], 47 nst_outs :: [(ByteSet,SNum)] 48 } 49 50-- Debug stuff 51instance Show NState where 52 showsPrec _ (NSt accs cl outs) = 53 str "NSt " . shows accs . space . shows cl . space . 54 shows [ (c, s) | (c,s) <- outs ] 55 56{- From the Scan Module 57 58-- The `Accept' structure contains the priority of the token being accepted 59-- (lower numbers => higher priorities), the name of the token, a place holder 60-- that can be used for storing the `action' function, a list of start codes 61-- (listing the start codes that the scanner must be in for the token to be 62-- accepted; empty => no restriction), the leading and trailing context (both 63-- `Nothing' if there is none). 64-- 65-- The leading context consists simply of a character predicate that will 66-- return true if the last character read is acceptable. The trailing context 67-- consists of an alternative starting state within the DFA; if this `sub-dfa' 68-- turns up any accepting state when applied to the residual input then the 69-- trailing context is acceptable. 70-} 71 72 73-- `scanner2nfa' takes a scanner (see the AbsSyn module) and converts it to an 74-- NFA, using the NFA creation monad (see below). 75-- 76-- We generate a start state for each startcode, with the same number 77-- as that startcode, and epsilon transitions from this state to each 78-- of the sub-NFAs for each of the tokens acceptable in that startcode. 79 80scanner2nfa:: Encoding -> Scanner -> [StartCode] -> NFA 81scanner2nfa enc Scanner{scannerTokens = toks} startcodes 82 = runNFA enc $ 83 do 84 -- make a start state for each start code (these will be 85 -- numbered from zero). 86 start_states <- sequence (replicate (length startcodes) newState) 87 88 -- construct the NFA for each token 89 tok_states <- zipWithM do_token toks [0..] 90 91 -- make an epsilon edge from each state state to each 92 -- token that is acceptable in that state 93 zipWithM_ (tok_transitions (zip toks tok_states)) 94 startcodes start_states 95 96 where 97 do_token (RECtx _scs lctx re rctx code) prio = do 98 b <- newState 99 e <- newState 100 rexp2nfa b e re 101 102 rctx_e <- case rctx of 103 NoRightContext -> 104 return NoRightContext 105 RightContextCode code' -> 106 return (RightContextCode code') 107 RightContextRExp re' -> do 108 r_b <- newState 109 r_e <- newState 110 rexp2nfa r_b r_e re' 111 accept r_e rctxt_accept 112 return (RightContextRExp r_b) 113 114 let lctx' = case lctx of 115 Nothing -> Nothing 116 Just st -> Just st 117 118 accept e (Acc prio code lctx' rctx_e) 119 return b 120 121 tok_transitions toks_with_states start_code start_state = do 122 let states = [ s | (RECtx scs _ _ _ _, s) <- toks_with_states, 123 null scs || start_code `elem` map snd scs ] 124 mapM_ (epsilonEdge start_state) states 125 126-- ----------------------------------------------------------------------------- 127-- NFA creation from a regular expression 128 129-- rexp2nfa B E R generates an NFA that begins in state B, recognises 130-- R, and ends in state E only if R has been recognised. 131 132rexp2nfa :: SNum -> SNum -> RExp -> NFAM () 133rexp2nfa b e Eps = epsilonEdge b e 134rexp2nfa b e (Ch p) = charEdge b p e 135rexp2nfa b e (re1 :%% re2) = do 136 s <- newState 137 rexp2nfa b s re1 138 rexp2nfa s e re2 139rexp2nfa b e (re1 :| re2) = do 140 rexp2nfa b e re1 141 rexp2nfa b e re2 142rexp2nfa b e (Star re) = do 143 s <- newState 144 epsilonEdge b s 145 rexp2nfa s s re 146 epsilonEdge s e 147rexp2nfa b e (Plus re) = do 148 s1 <- newState 149 s2 <- newState 150 rexp2nfa s1 s2 re 151 epsilonEdge b s1 152 epsilonEdge s2 s1 153 epsilonEdge s2 e 154rexp2nfa b e (Ques re) = do 155 rexp2nfa b e re 156 epsilonEdge b e 157 158-- ----------------------------------------------------------------------------- 159-- NFA creation monad. 160 161-- Partial credit to Thomas Hallgren for this code, as I adapted it from 162-- his "Lexing Haskell in Haskell" lexer generator. 163 164type MapNFA = Map SNum NState 165 166newtype NFAM a = N {unN :: SNum -> MapNFA -> Encoding -> (SNum, MapNFA, a)} 167 168instance Functor NFAM where 169 fmap = liftM 170 171instance Applicative NFAM where 172 pure a = N $ \s n _ -> (s,n,a) 173 (<*>) = ap 174 175instance Monad NFAM where 176 return = pure 177 178 m >>= k = N $ \s n e -> case unN m s n e of 179 (s', n', a) -> unN (k a) s' n' e 180 181runNFA :: Encoding -> NFAM () -> NFA 182runNFA e m = case unN m 0 Map.empty e of 183 (s, nfa_map, ()) -> -- trace ("runNfa.." ++ show (Map.toAscList nfa_map)) $ 184 e_close (array (0,s-1) (Map.toAscList nfa_map)) 185 186e_close:: Array Int NState -> NFA 187e_close ar = listArray bds 188 [NSt accs (out gr v) outs|(v,NSt accs _ outs)<-assocs ar] 189 where 190 gr = t_close (hi+1,\v->nst_cl (ar!v)) 191 bds@(_,hi) = bounds ar 192 193newState :: NFAM SNum 194newState = N $ \s n _ -> (s+1,n,s) 195 196getEncoding :: NFAM Encoding 197getEncoding = N $ \s n e -> (s,n,e) 198 199anyBytes :: SNum -> Int -> SNum -> NFAM () 200anyBytes from 0 to = epsilonEdge from to 201anyBytes from n to = do 202 s <- newState 203 byteEdge from (byteSetRange 0 0xff) s 204 anyBytes s (n-1) to 205 206bytesEdge :: SNum -> [Byte] -> [Byte] -> SNum -> NFAM () 207bytesEdge from [] [] to = epsilonEdge from to 208bytesEdge from [x] [y] to = byteEdge from (byteSetRange x y) to -- (OPTIMISATION) 209bytesEdge from (x:xs) (y:ys) to 210 | x == y = do 211 s <- newState 212 byteEdge from (byteSetSingleton x) s 213 bytesEdge s xs ys to 214 | x < y = do 215 do s <- newState 216 byteEdge from (byteSetSingleton x) s 217 bytesEdge s xs (fmap (const 0xff) ys) to 218 219 do t <- newState 220 byteEdge from (byteSetSingleton y) t 221 bytesEdge t (fmap (const 0x00) xs) ys to 222 223 when ((x+1) <= (y-1)) $ do 224 u <- newState 225 byteEdge from (byteSetRange (x+1) (y-1)) u 226 anyBytes u (length xs) to 227bytesEdge _ _ _ _ = undefined -- hide compiler warning 228 229charEdge :: SNum -> CharSet -> SNum -> NFAM () 230charEdge from charset to = do 231 -- trace ("charEdge: " ++ (show $ charset) ++ " => " ++ show (byteRanges charset)) $ 232 e <- getEncoding 233 forM_ (byteRanges e charset) $ \(xs,ys) -> do 234 bytesEdge from xs ys to 235 236 237 238byteEdge :: SNum -> ByteSet -> SNum -> NFAM () 239byteEdge from charset to = N $ \s n _ -> (s, addEdge n, ()) 240 where 241 addEdge n = 242 case Map.lookup from n of 243 Nothing -> 244 Map.insert from (NSt [] [] [(charset,to)]) n 245 Just (NSt acc eps trans) -> 246 Map.insert from (NSt acc eps ((charset,to):trans)) n 247 248epsilonEdge :: SNum -> SNum -> NFAM () 249epsilonEdge from to 250 | from == to = return () 251 | otherwise = N $ \s n _ -> let n' = addEdge n in n' `seq` (s, n', ()) 252 where 253 addEdge n = 254 case Map.lookup from n of 255 Nothing -> Map.insert from (NSt [] [to] []) n 256 Just (NSt acc eps trans) -> Map.insert from (NSt acc (to:eps) trans) n 257 258accept :: SNum -> Accept Code -> NFAM () 259accept state new_acc = N $ \s n _ -> (s, addAccept n, ()) 260 where 261 addAccept n = 262 case Map.lookup state n of 263 Nothing -> 264 Map.insert state (NSt [new_acc] [] []) n 265 Just (NSt acc eps trans) -> 266 Map.insert state (NSt (new_acc:acc) eps trans) n 267 268 269rctxt_accept :: Accept Code 270rctxt_accept = Acc 0 Nothing Nothing NoRightContext 271