1------------------------------------------------------------------------------- 2-- ALEX SCANNER AND LITERATE PREPROCESSOR 3-- 4-- This Script defines the grammar used to generate the Alex scanner and a 5-- preprocessing scanner for dealing with literate scripts. The actions for 6-- the Alex scanner are given separately in the Alex module. 7-- 8-- See the Alex manual for a discussion of the scanners defined here. 9-- 10-- Chris Dornan, Aug-95, 4-Jun-96, 10-Jul-96, 29-Sep-97 11------------------------------------------------------------------------------- 12 13{ 14module Scan (lexer, AlexPosn(..), Token(..), Tkn(..), tokPosn) where 15 16import Data.Char 17import ParseMonad 18--import Debug.Trace 19} 20 21$digit = 0-9 22$hexdig = [0-9 A-F a-f] 23$octal = 0-7 24$lower = a-z 25$upper = A-Z 26$alpha = [$upper $lower] 27$alphanum = [$alpha $digit] 28$idchar = [$alphanum \_ \'] 29 30$special = [\.\;\,\$\|\*\+\?\#\~\-\{\}\(\)\[\]\^\/] 31$graphic = $printable # $white 32$nonspecial = $graphic # [$special \%] 33 34@id = $alpha $idchar* 35@smac = \$ @id | \$ \{ @id \} 36@rmac = \@ @id | \@ \{ @id \} 37 38@comment = "--".* 39@ws = $white+ | @comment 40 41alex :- 42 43@ws { skip } -- white space; ignore 44 45<0> \" [^\"]* \" { string } 46<0> (@id @ws?)? \:\- { bind } 47<0> \{ / (\n | [^$digit]) { code } 48<0> $special { special } -- note: matches { 49<0> \% "wrapper" { wrapper } 50<0> \% "encoding" { encoding } 51<0> \% "action" { actionty } 52<0> \% "token" { tokenty } 53<0> \% "typeclass" { typeclass } 54 55<0> \\ $digit+ { decch } 56<0> \\ x $hexdig+ { hexch } 57<0> \\ o $octal+ { octch } 58<0> \\ $printable { escape } 59<0> $nonspecial # [\<] { char } 60<0> @smac { smac } 61<0> @rmac { rmac } 62 63<0> @smac @ws? \= { smacdef } 64<0> @rmac @ws? \= { rmacdef } 65 66-- identifiers are allowed to be unquoted in startcode lists 67<0> \< { special `andBegin` startcodes } 68<startcodes> 0 { zero } 69<startcodes> @id { startcode } 70<startcodes> \, { special } 71<startcodes> \> { special `andBegin` afterstartcodes } 72 73-- After a <..> startcode sequence, we can have a {...} grouping of rules, 74-- so don't try to interpret the opening { as a code block. 75<afterstartcodes> \{ (\n | [^$digit ]) { special `andBegin` 0 } 76<afterstartcodes> () { skip `andBegin` 0 } -- note: empty pattern 77{ 78 79-- ----------------------------------------------------------------------------- 80-- Token type 81 82data Token = T AlexPosn Tkn 83 deriving Show 84 85tokPosn (T p _) = p 86 87data Tkn 88 = SpecialT Char 89 | CodeT String 90 | ZeroT 91 | IdT String 92 | StringT String 93 | BindT String 94 | CharT Char 95 | SMacT String 96 | RMacT String 97 | SMacDefT String 98 | RMacDefT String 99 | NumT Int 100 | WrapperT 101 | EncodingT 102 | ActionTypeT 103 | TokenTypeT 104 | TypeClassT 105 | EOFT 106 deriving Show 107 108-- ----------------------------------------------------------------------------- 109-- Token functions 110 111special, zero, string, bind, escape, decch, hexch, octch, char :: Action 112smac, rmac, smacdef, rmacdef, startcode, wrapper, encoding :: Action 113actionty, tokenty, typeclass :: Action 114special (p,_,str) _ = return $ T p (SpecialT (head str)) 115zero (p,_,_) _ = return $ T p ZeroT 116string (p,_,str) ln = return $ T p (StringT (extract ln str)) 117bind (p,_,str) _ = return $ T p (BindT (takeWhile isIdChar str)) 118escape (p,_,str) _ = return $ T p (CharT (esc str)) 119decch (p,_,str) ln = return $ T p (CharT (do_ech 10 ln (take (ln-1) (tail str)))) 120hexch (p,_,str) ln = return $ T p (CharT (do_ech 16 ln (take (ln-2) (drop 2 str)))) 121octch (p,_,str) ln = return $ T p (CharT (do_ech 8 ln (take (ln-2) (drop 2 str)))) 122char (p,_,str) _ = return $ T p (CharT (head str)) 123smac (p,_,str) ln = return $ T p (SMacT (mac ln str)) 124rmac (p,_,str) ln = return $ T p (RMacT (mac ln str)) 125smacdef (p,_,str) ln = return $ T p (SMacDefT (macdef ln str)) 126rmacdef (p,_,str) ln = return $ T p (RMacDefT (macdef ln str)) 127startcode (p,_,str) ln = return $ T p (IdT (take ln str)) 128wrapper (p,_,_) _ = return $ T p WrapperT 129encoding (p,_,_) _ = return $ T p EncodingT 130actionty (p,_,_) _ = return $ T p ActionTypeT 131tokenty (p,_,_) _ = return $ T p TokenTypeT 132typeclass (p,_,_) _ = return $ T p TypeClassT 133 134isIdChar :: Char -> Bool 135isIdChar c = isAlphaNum c || c `elem` "_'" 136 137extract :: Int -> String -> String 138extract ln str = take (ln-2) (tail str) 139 140do_ech :: Int -> Int -> String -> Char 141do_ech radix _ln str = chr (parseInt radix str) 142 143mac :: Int -> String -> String 144mac ln str = take (ln-1) $ tail str 145 146-- TODO : replace not . isSpace with (\c -> not (isSpace c) && c /= '=') 147macdef :: Int -> String -> String 148macdef _ln str = takeWhile (\c -> not (isSpace c) && c /= '=') $ tail str 149 150esc :: String -> Char 151esc str = 152 case head $ tail str of 153 'a' -> '\a' 154 'b' -> '\b' 155 'f' -> '\f' 156 'n' -> '\n' 157 'r' -> '\r' 158 't' -> '\t' 159 'v' -> '\v' 160 c -> c 161 162parseInt :: Int -> String -> Int 163parseInt radix ds = foldl1 (\n d -> n * radix + d) (map digitToInt ds) 164 165-- In brace-delimited code, we have to be careful to match braces 166-- within the code, but ignore braces inside strings and character 167-- literals. We do an approximate job (doing it properly requires 168-- implementing a large chunk of the Haskell lexical syntax). 169 170code :: Action 171code (p,_,_inp) _ = do 172 currentInput <- getInput 173 go currentInput 1 "" 174 where 175 go :: AlexInput -> Int -> String -> P Token 176 go inp 0 cs = do 177 setInput inp 178 return (T p (CodeT (reverse (tail cs)))) 179 go inp n cs = do 180 case alexGetChar inp of 181 Nothing -> err inp 182 Just (c,inp2) -> 183 case c of 184 '{' -> go inp2 (n+1) (c:cs) 185 '}' -> go inp2 (n-1) (c:cs) 186 '\'' -> go_char inp2 n (c:cs) 187 '\"' -> go_str inp2 n (c:cs) '\"' 188 c2 -> go inp2 n (c2:cs) 189 190 go_char :: AlexInput -> Int -> String -> P Token 191 -- try to catch multiple occurrences of ' at identifier end 192 go_char inp n cs@('\'':'\'':_) = go inp n cs 193 -- try to catch occurrences of ' within an identifier 194 go_char inp n cs@('\'':c2:_) 195 | isAlphaNum c2 = go inp n cs 196 go_char inp n cs = go_str inp n cs '\'' 197 198 go_str :: AlexInput -> Int -> String -> Char -> P Token 199 go_str inp n cs end = do 200 case alexGetChar inp of 201 Nothing -> err inp 202 Just (c,inp2) 203 | c == end -> go inp2 n (c:cs) 204 | otherwise -> 205 case c of 206 '\\' -> case alexGetChar inp2 of 207 Nothing -> err inp2 208 Just (d,inp3) -> go_str inp3 n (d:c:cs) end 209 c2 -> go_str inp2 n (c2:cs) end 210 211 err inp = do setInput inp; lexError "lexical error in code fragment" 212 213lexError :: String -> P a 214lexError s = do 215 (_,_,_,input) <- getInput 216 failP (s ++ (if (not (null input)) 217 then " at " ++ show (head input) 218 else " at end of file")) 219 220lexer :: (Token -> P a) -> P a 221lexer cont = lexToken >>= cont 222 223lexToken :: P Token 224lexToken = do 225 inp@(p,c,_,s) <- getInput 226 sc <- getStartCode 227 case alexScan inp sc of 228 AlexEOF -> return (T p EOFT) 229 AlexError _ -> lexError "lexical error" 230 AlexSkip inp1 _ -> do 231 setInput inp1 232 lexToken 233 AlexToken inp1 len t -> do 234 setInput inp1 235 t (p,c,s) len 236 237type Action = (AlexPosn,Char,String) -> Int -> P Token 238 239skip :: Action 240skip _ _ = lexToken 241 242andBegin :: Action -> StartCode -> Action 243andBegin act sc inp len = setStartCode sc >> act inp len 244} 245