1-- ----------------------------------------------------------------------------- 2-- 3-- AbsSyn.hs, part of Alex 4-- 5-- (c) Chris Dornan 1995-2000, Simon Marlow 2003 6-- 7-- This module provides a concrete representation for regular expressions and 8-- scanners. Scanners are used for tokenising files in preparation for parsing. 9-- 10-- ----------------------------------------------------------------------------} 11 12module AbsSyn ( 13 Code, Directive(..), Scheme(..), 14 wrapperName, 15 Scanner(..), 16 RECtx(..), 17 RExp(..), 18 DFA(..), State(..), SNum, StartCode, Accept(..), 19 RightContext(..), showRCtx, strtype, 20 encodeStartCodes, extractActions, 21 Target(..), 22 UsesPreds(..), usesPreds, 23 StrType(..) 24 ) where 25 26import CharSet ( CharSet, Encoding ) 27import Map ( Map ) 28import qualified Map hiding ( Map ) 29import Data.IntMap (IntMap) 30import Sort ( nub' ) 31import Util ( str, nl ) 32 33import Data.Maybe ( fromJust ) 34 35infixl 4 :| 36infixl 5 :%% 37 38-- ----------------------------------------------------------------------------- 39-- Abstract Syntax for Alex scripts 40 41type Code = String 42 43data Directive 44 = WrapperDirective String -- use this wrapper 45 | EncodingDirective Encoding -- use this encoding 46 | ActionType String -- Type signature of actions, 47 -- with optional typeclasses 48 | TypeClass String 49 | TokenType String 50 deriving Show 51 52data StrType = Str | Lazy | Strict 53 54instance Show StrType where 55 show Str = "String" 56 show Lazy = "ByteString.ByteString" 57 show Strict = "ByteString.ByteString" 58 59data Scheme 60 = Default { defaultTypeInfo :: Maybe (Maybe String, String) } 61 | GScan { gscanTypeInfo :: Maybe (Maybe String, String) } 62 | Basic { basicStrType :: StrType, 63 basicTypeInfo :: Maybe (Maybe String, String) } 64 | Posn { posnByteString :: Bool, 65 posnTypeInfo :: Maybe (Maybe String, String) } 66 | Monad { monadByteString :: Bool, monadUserState :: Bool, 67 monadTypeInfo :: Maybe (Maybe String, String) } 68 69strtype :: Bool -> String 70strtype True = "ByteString.ByteString" 71strtype False = "String" 72 73wrapperName :: Scheme -> Maybe String 74wrapperName Default {} = Nothing 75wrapperName GScan {} = Just "gscan" 76wrapperName Basic { basicStrType = Str } = Just "basic" 77wrapperName Basic { basicStrType = Lazy } = Just "basic-bytestring" 78wrapperName Basic { basicStrType = Strict } = Just "strict-bytestring" 79wrapperName Posn { posnByteString = False } = Just "posn" 80wrapperName Posn { posnByteString = True } = Just "posn-bytestring" 81wrapperName Monad { monadByteString = False, 82 monadUserState = False } = Just "monad" 83wrapperName Monad { monadByteString = True, 84 monadUserState = False } = Just "monad-bytestring" 85wrapperName Monad { monadByteString = False, 86 monadUserState = True } = Just "monadUserState" 87wrapperName Monad { monadByteString = True, 88 monadUserState = True } = Just "monadUserState-bytestring" 89 90-- TODO: update this comment 91-- 92-- A `Scanner' consists of an association list associating token names with 93-- regular expressions with context. The context may include a list of start 94-- codes, some leading context to test the character immediately preceding the 95-- token and trailing context to test the residual input after the token. 96-- 97-- The start codes consist of the names and numbers of the start codes; 98-- initially the names only will be generated by the parser, the numbers being 99-- allocated at a later stage. Start codes become meaningful when scanners are 100-- converted to DFAs; see the DFA section of the Scan module for details. 101 102data Scanner = Scanner { scannerName :: String, 103 scannerTokens :: [RECtx] } 104 deriving Show 105 106data RECtx = RECtx { reCtxStartCodes :: [(String,StartCode)], 107 reCtxPreCtx :: Maybe CharSet, 108 reCtxRE :: RExp, 109 reCtxPostCtx :: RightContext RExp, 110 reCtxCode :: Maybe Code 111 } 112 113data RightContext r 114 = NoRightContext 115 | RightContextRExp r 116 | RightContextCode Code 117 deriving (Eq,Ord) 118 119instance Show RECtx where 120 showsPrec _ (RECtx scs _ r rctx code) = 121 showStarts scs . shows r . showRCtx rctx . showMaybeCode code 122 123showMaybeCode :: Maybe String -> String -> String 124showMaybeCode Nothing = id 125showMaybeCode (Just code) = showCode code 126 127showCode :: String -> String -> String 128showCode code = showString " { " . showString code . showString " }" 129 130showStarts :: [(String, StartCode)] -> String -> String 131showStarts [] = id 132showStarts scs = shows scs 133 134showRCtx :: Show r => RightContext r -> String -> String 135showRCtx NoRightContext = id 136showRCtx (RightContextRExp r) = ('\\':) . shows r 137showRCtx (RightContextCode code) = showString "\\ " . showCode code 138 139-- ----------------------------------------------------------------------------- 140-- DFAs 141 142data DFA s a = DFA 143 { dfa_start_states :: [s], 144 dfa_states :: Map s (State s a) 145 } 146 147data State s a = State { state_acc :: [Accept a], 148 state_out :: IntMap s -- 0..255 only 149 } 150 151type SNum = Int 152 153data Accept a 154 = Acc { accPrio :: Int, 155 accAction :: Maybe a, 156 accLeftCtx :: Maybe CharSet, -- cannot be converted to byteset at this point. 157 accRightCtx :: RightContext SNum 158 } 159 deriving (Eq,Ord) 160 161-- debug stuff 162instance Show (Accept a) where 163 showsPrec _ (Acc p _act _lctx _rctx) = shows p --TODO 164 165type StartCode = Int 166 167-- ----------------------------------------------------------------------------- 168-- Predicates / contexts 169 170-- we can generate somewhat faster code in the case that 171-- the lexer doesn't use predicates 172data UsesPreds = UsesPreds | DoesntUsePreds 173 174usesPreds :: DFA s a -> UsesPreds 175usesPreds dfa 176 | any acceptHasCtx [ acc | st <- Map.elems (dfa_states dfa) 177 , acc <- state_acc st ] 178 = UsesPreds 179 | otherwise 180 = DoesntUsePreds 181 where 182 acceptHasCtx Acc { accLeftCtx = Nothing 183 , accRightCtx = NoRightContext } = False 184 acceptHasCtx _ = True 185 186-- ----------------------------------------------------------------------------- 187-- Regular expressions 188 189-- `RExp' provides an abstract syntax for regular expressions. `Eps' will 190-- match empty strings; `Ch p' matches strings containinng a single character 191-- `c' if `p c' is true; `re1 :%% re2' matches a string if `re1' matches one of 192-- its prefixes and `re2' matches the rest; `re1 :| re2' matches a string if 193-- `re1' or `re2' matches it; `Star re', `Plus re' and `Ques re' can be 194-- expressed in terms of the other operators. See the definitions of `ARexp' 195-- for a formal definition of the semantics of these operators. 196 197data RExp 198 = Eps 199 | Ch CharSet 200 | RExp :%% RExp 201 | RExp :| RExp 202 | Star RExp 203 | Plus RExp 204 | Ques RExp 205 206instance Show RExp where 207 showsPrec _ Eps = showString "()" 208 showsPrec _ (Ch _) = showString "[..]" 209 showsPrec _ (l :%% r) = shows l . shows r 210 showsPrec _ (l :| r) = shows l . ('|':) . shows r 211 showsPrec _ (Star r) = shows r . ('*':) 212 showsPrec _ (Plus r) = shows r . ('+':) 213 showsPrec _ (Ques r) = shows r . ('?':) 214 215{------------------------------------------------------------------------------ 216 Abstract Regular Expression 217------------------------------------------------------------------------------} 218 219 220-- This section contains demonstrations; it is not part of Alex. 221 222{- 223-- This function illustrates `ARexp'. It returns true if the string in its 224-- argument is matched by the regular expression. 225 226recognise:: RExp -> String -> Bool 227recognise re inp = any (==len) (ap_ar (arexp re) inp) 228 where 229 len = length inp 230 231 232-- `ARexp' provides an regular expressions in abstract format. Here regular 233-- expressions are represented by a function that takes the string to be 234-- matched and returns the sizes of all the prefixes matched by the regular 235-- expression (the list may contain duplicates). Each of the `RExp' operators 236-- are represented by similarly named functions over ARexp. The `ap' function 237-- takes an `ARExp', a string and returns the sizes of all the prefixes 238-- matching that regular expression. `arexp' converts an `RExp' to an `ARexp'. 239 240 241arexp:: RExp -> ARexp 242arexp Eps = eps_ar 243arexp (Ch p) = ch_ar p 244arexp (re :%% re') = arexp re `seq_ar` arexp re' 245arexp (re :| re') = arexp re `bar_ar` arexp re' 246arexp (Star re) = star_ar (arexp re) 247arexp (Plus re) = plus_ar (arexp re) 248arexp (Ques re) = ques_ar (arexp re) 249 250 251star_ar:: ARexp -> ARexp 252star_ar sc = eps_ar `bar_ar` plus_ar sc 253 254plus_ar:: ARexp -> ARexp 255plus_ar sc = sc `seq_ar` star_ar sc 256 257ques_ar:: ARexp -> ARexp 258ques_ar sc = eps_ar `bar_ar` sc 259 260 261-- Hugs abstract type definition -- not for GHC. 262 263type ARexp = String -> [Int] 264-- in ap_ar, eps_ar, ch_ar, seq_ar, bar_ar 265 266ap_ar:: ARexp -> String -> [Int] 267ap_ar sc = sc 268 269eps_ar:: ARexp 270eps_ar inp = [0] 271 272ch_ar:: (Char->Bool) -> ARexp 273ch_ar p "" = [] 274ch_ar p (c:rst) = if p c then [1] else [] 275 276seq_ar:: ARexp -> ARexp -> ARexp 277seq_ar sc sc' inp = [n+m| n<-sc inp, m<-sc' (drop n inp)] 278 279bar_ar:: ARexp -> ARexp -> ARexp 280bar_ar sc sc' inp = sc inp ++ sc' inp 281-} 282 283-- ----------------------------------------------------------------------------- 284-- Utils 285 286-- Map the available start codes onto [1..] 287 288encodeStartCodes:: Scanner -> (Scanner,[StartCode],ShowS) 289encodeStartCodes scan = (scan', 0 : map snd name_code_pairs, sc_hdr) 290 where 291 scan' = scan{ scannerTokens = map mk_re_ctx (scannerTokens scan) } 292 293 mk_re_ctx (RECtx scs lc re rc code) 294 = RECtx (map mk_sc scs) lc re rc code 295 296 mk_sc (nm,_) = (nm, if nm=="0" then 0 297 else fromJust (Map.lookup nm code_map)) 298 299 sc_hdr tl = 300 case name_code_pairs of 301 [] -> tl 302 (nm,_):rst -> "\n" ++ nm ++ foldr f t rst 303 where 304 f (nm', _) t' = "," ++ nm' ++ t' 305 t = " :: Int\n" ++ foldr fmt_sc tl name_code_pairs 306 where 307 fmt_sc (nm,sc) t = nm ++ " = " ++ show sc ++ "\n" ++ t 308 309 code_map = Map.fromList name_code_pairs 310 311 name_code_pairs = zip (nub' (<=) nms) [1..] 312 313 nms = [nm | RECtx{reCtxStartCodes = scs} <- scannerTokens scan, 314 (nm,_) <- scs, nm /= "0"] 315 316 317-- Grab the code fragments for the token actions, and replace them 318-- with function names of the form alex_action_$n$. We do this 319-- because the actual action fragments might be duplicated in the 320-- generated file. 321 322extractActions :: Scheme -> Scanner -> (Scanner,ShowS) 323extractActions scheme scanner = (scanner{scannerTokens = new_tokens}, decl_str) 324 where 325 (new_tokens, decls) = unzip (zipWith f (scannerTokens scanner) act_names) 326 327 f r@(RECtx{ reCtxCode = Just code }) name 328 = (r{reCtxCode = Just name}, Just (mkDecl name code)) 329 f r@(RECtx{ reCtxCode = Nothing }) _ 330 = (r{reCtxCode = Nothing}, Nothing) 331 332 gscanActionType res = 333 str "AlexPosn -> Char -> String -> Int -> ((Int, state) -> " 334 . str res . str ") -> (Int, state) -> " . str res 335 336 mkDecl fun code = case scheme of 337 Default { defaultTypeInfo = Just (Nothing, actionty) } -> 338 str fun . str " :: " . str actionty . str "\n" 339 . str fun . str " = " . str code . nl 340 Default { defaultTypeInfo = Just (Just tyclasses, actionty) } -> 341 str fun . str " :: (" . str tyclasses . str ") => " . 342 str actionty . str "\n" . 343 str fun . str " = " . str code . nl 344 GScan { gscanTypeInfo = Just (Nothing, tokenty) } -> 345 str fun . str " :: " . gscanActionType tokenty . str "\n" 346 . str fun . str " = " . str code . nl 347 GScan { gscanTypeInfo = Just (Just tyclasses, tokenty) } -> 348 str fun . str " :: (" . str tyclasses . str ") => " . 349 gscanActionType tokenty . str "\n" . 350 str fun . str " = " . str code . nl 351 Basic { basicStrType = strty, basicTypeInfo = Just (Nothing, tokenty) } -> 352 str fun . str " :: " . str (show strty) . str " -> " 353 . str tokenty . str "\n" 354 . str fun . str " = " . str code . nl 355 Basic { basicStrType = strty, 356 basicTypeInfo = Just (Just tyclasses, tokenty) } -> 357 str fun . str " :: (" . str tyclasses . str ") => " . 358 str (show strty) . str " -> " . str tokenty . str "\n" . 359 str fun . str " = " . str code . nl 360 Posn { posnByteString = isByteString, 361 posnTypeInfo = Just (Nothing, tokenty) } -> 362 str fun . str " :: AlexPosn -> " . str (strtype isByteString) . str " -> " 363 . str tokenty . str "\n" 364 . str fun . str " = " . str code . nl 365 Posn { posnByteString = isByteString, 366 posnTypeInfo = Just (Just tyclasses, tokenty) } -> 367 str fun . str " :: (" . str tyclasses . str ") => AlexPosn -> " . 368 str (strtype isByteString) . str " -> " . str tokenty . str "\n" . 369 str fun . str " = " . str code . nl 370 Monad { monadByteString = isByteString, 371 monadTypeInfo = Just (Nothing, tokenty) } -> 372 let 373 actintty = if isByteString then "Int64" else "Int" 374 in 375 str fun . str " :: AlexInput -> " . str actintty . str " -> Alex (" 376 . str tokenty . str ")\n" 377 . str fun . str " = " . str code . nl 378 Monad { monadByteString = isByteString, 379 monadTypeInfo = Just (Just tyclasses, tokenty) } -> 380 let 381 actintty = if isByteString then "Int64" else "Int" 382 in 383 str fun . str " :: (" . str tyclasses . str ") => " 384 . str " AlexInput -> " . str actintty 385 . str " -> Alex (" . str tokenty . str ")\n" 386 . str fun . str " = " . str code . nl 387 _ -> str fun . str " = " . str code . nl 388 389 act_names = map (\n -> "alex_action_" ++ show (n::Int)) [0..] 390 391 decl_str = foldr (.) id [ decl | Just decl <- decls ] 392 393-- ----------------------------------------------------------------------------- 394-- Code generation targets 395 396data Target = GhcTarget | HaskellTarget 397