-----------------------------------------------------------------------------
The lexer.

(c) 1993-2001 Andy Gill, Simon Marlow
-----------------------------------------------------------------------------

> module Lexer (
>       Token(..),
>       TokenId(..),
>       lexer ) where

> import ParseMonad

> import Data.Char ( isSpace, isAlphaNum, isDigit, digitToInt )

A token is its classification (a TokenId) together with, where
relevant, the text or number that was read.

> data Token
>       = TokenInfo String TokenId      -- identifier or quoted text
>       | TokenNum  Int    TokenId      -- integer literal
>       | TokenKW          TokenId      -- bare keyword or punctuation
>       | TokenEOF

Project a token onto its classification.  TokenEOF deliberately
carries no TokenId, so asking for one is a program error.

> tokenToId :: Token -> TokenId
> tokenToId (TokenInfo _ i) = i
> tokenToId (TokenNum  _ i) = i
> tokenToId (TokenKW     i) = i
> tokenToId TokenEOF        = error "tokenToId TokenEOF"

Tokens compare by classification alone; any attached text or number
is ignored.

> instance Eq Token where
>       t == t' = tokenToId t == tokenToId t'

> instance Ord Token where
>       t <= t' = tokenToId t <= tokenToId t'

NOTE: the constructor order below determines the derived Ord
instance, so it must not be rearranged.

> data TokenId
>       = TokId                         -- words and symbols
>       | TokSpecId_TokenType           -- %tokentype
>       | TokSpecId_Token               -- %token
>       | TokSpecId_Name                -- %name
>       | TokSpecId_Partial             -- %partial
>       | TokSpecId_ErrorHandlerType    -- %errorhandlertype
>       | TokSpecId_Lexer               -- %lexer
>       | TokSpecId_ImportedIdentity    -- %importedidentity
>       | TokSpecId_Monad               -- %monad
>       | TokSpecId_Nonassoc            -- %nonassoc
>       | TokSpecId_Left                -- %left
>       | TokSpecId_Right               -- %right
>       | TokSpecId_Prec                -- %prec
>       | TokSpecId_Shift               -- %shift
>       | TokSpecId_Expect              -- %expect
>       | TokSpecId_Error               -- %error
>       | TokSpecId_Attributetype       -- %attributetype
>       | TokSpecId_Attribute           -- %attribute
>       | TokCodeQuote                  -- stuff inside { .. }
>       | TokColon                      -- :
>       | TokSemiColon                  -- ;
>       | TokDoubleColon                -- ::
>       | TokDoublePercent              -- %%
>       | TokBar                        -- |
>       | TokNum                        -- Integer
>       | TokParenL                     -- (
>       | TokParenR                     -- )
>       | TokComma                      -- ,
>       deriving (Eq,Ord

#ifdef DEBUG

>       ,Show

#endif

>       )

ToDo: proper text instance here, for use in parser error messages.
The entry point: fetch one token from the input and hand it to the
continuation.  Line comments (--) and nested comments ({- -}) are
discarded here before any token is produced.

> lexer :: (Token -> P a) -> P a
> lexer cont = mkP lexer'
>   where
>         -- lexer' :: String -> Int -> ParseResult a
>         lexer' ""          = returnToken cont TokenEOF ""
>         lexer' ('-':'-':r) = lexer' (dropWhile (/= '\n') r)
>         lexer' ('{':'-':r) = \line -> lexNestedComment line lexer' r line
>         lexer' (c:rest)    = nextLex cont c rest

Deliver a token to a continuation, passing along the remaining input
and the current line number.

> returnToken :: (t -> P a) -> t -> String -> Int -> ParseResult a
> returnToken cont tok = runP (cont tok)

Dispatch on the first character of the next token.  Note the newline
case bumps the line count and restarts the lexer.

> nextLex :: (Token -> P a) -> Char -> String -> Int -> ParseResult a
> nextLex cont c = case c of
>       '\n'    -> \rest line -> returnToken lexer cont rest (line+1)
>       '%'     -> lexPercent cont
>       ':'     -> lexColon cont
>       ';'     -> returnToken cont (TokenKW TokSemiColon)
>       '|'     -> returnToken cont (TokenKW TokBar)
>       '\''    -> lexChar cont
>       '"'{-"-}-> lexString cont
>       '{'     -> lexCode cont
>       '('     -> returnToken cont (TokenKW TokParenL)
>       ')'     -> returnToken cont (TokenKW TokParenR)
>       ','     -> returnToken cont (TokenKW TokComma)
>       _ | isSpace c -> runP (lexer cont)
>         | c >= 'a' && c <= 'z'
>           || c >= 'A' && c <= 'Z' -> lexId cont c
>         | isDigit c -> lexNum cont c
>       _ -> lexError ("lexical error before `" ++ c : "'")

Percents come in two forms, in pairs, or
followed by a special identifier.
NOTE: the alternatives below are tried top to bottom, so longer
directives must precede their prefixes (e.g. %tokentype before
%token, %errorhandlertype before %error).

> lexPercent :: (Token -> P a) -> [Char] -> Int -> ParseResult a
> lexPercent cont s = case s of
>       '%':rest -> returnToken cont (TokenKW TokDoublePercent) rest
>       't':'o':'k':'e':'n':'t':'y':'p':'e':rest ->
>               returnToken cont (TokenKW TokSpecId_TokenType) rest
>       't':'o':'k':'e':'n':rest ->
>               returnToken cont (TokenKW TokSpecId_Token) rest
>       'n':'a':'m':'e':rest ->
>               returnToken cont (TokenKW TokSpecId_Name) rest
>       'p':'a':'r':'t':'i':'a':'l':rest ->
>               returnToken cont (TokenKW TokSpecId_Partial) rest
>       'i':'m':'p':'o':'r':'t':'e':'d':'i':'d':'e':'n':'t':'i':'t':'y':rest ->
>               returnToken cont (TokenKW TokSpecId_ImportedIdentity) rest
>       'm':'o':'n':'a':'d':rest ->
>               returnToken cont (TokenKW TokSpecId_Monad) rest
>       'l':'e':'x':'e':'r':rest ->
>               returnToken cont (TokenKW TokSpecId_Lexer) rest
>       'n':'o':'n':'a':'s':'s':'o':'c':rest ->
>               returnToken cont (TokenKW TokSpecId_Nonassoc) rest
>       'l':'e':'f':'t':rest ->
>               returnToken cont (TokenKW TokSpecId_Left) rest
>       'r':'i':'g':'h':'t':rest ->
>               returnToken cont (TokenKW TokSpecId_Right) rest
>       'p':'r':'e':'c':rest ->
>               returnToken cont (TokenKW TokSpecId_Prec) rest
>       's':'h':'i':'f':'t':rest ->
>               returnToken cont (TokenKW TokSpecId_Shift) rest
>       'e':'x':'p':'e':'c':'t':rest ->
>               returnToken cont (TokenKW TokSpecId_Expect) rest
>       'e':'r':'r':'o':'r':'h':'a':'n':'d':'l':'e':'r':'t':'y':'p':'e':rest ->
>               returnToken cont (TokenKW TokSpecId_ErrorHandlerType) rest
>       'e':'r':'r':'o':'r':rest ->
>               returnToken cont (TokenKW TokSpecId_Error) rest
>       'a':'t':'t':'r':'i':'b':'u':'t':'e':'t':'y':'p':'e':rest ->
>               returnToken cont (TokenKW TokSpecId_Attributetype) rest
>       'a':'t':'t':'r':'i':'b':'u':'t':'e':rest ->
>               returnToken cont (TokenKW TokSpecId_Attribute) rest
>       _ -> lexError ("unrecognised directive: %" ++
>                               takeWhile (not.isSpace) s) s

A single colon is TokColon; a double colon is TokDoubleColon.

> lexColon :: (Token -> P a) -> [Char] -> Int -> ParseResult a
> lexColon cont (':':rest) = returnToken cont (TokenKW TokDoubleColon) rest
> lexColon cont rest       = returnToken cont (TokenKW TokColon) rest

An identifier: the first character has already been consumed by
nextLex and is passed in as 'c'.

> lexId :: (Token -> P a) -> Char -> String -> Int -> ParseResult a
> lexId cont c rest =
>       readId rest (\ ident rest' ->
>               returnToken cont (TokenInfo (c:ident) TokId) rest')

Character and string literals keep their surrounding quotes in the
token text.

> lexChar :: (Token -> P a) -> String -> Int -> ParseResult a
> lexChar cont rest = lexReadChar rest
>       (\ ident -> returnToken cont (TokenInfo ("'" ++ ident ++ "'") TokId))

> lexString :: (Token -> P a) -> String -> Int -> ParseResult a
> lexString cont rest = lexReadString rest
>       (\ ident -> returnToken cont (TokenInfo ("\"" ++ ident ++ "\"") TokId))

A code fragment: scan up to the matching closing brace, starting at
nesting depth zero.

> lexCode :: (Token -> P a) -> String -> Int -> ParseResult a
> lexCode cont rest = lexReadCode rest (0 :: Integer) "" cont

A decimal number; 'c' is its (already consumed) first digit.

> lexNum :: (Token -> P a) -> Char -> String -> Int -> ParseResult a
> lexNum cont c rest =
>       readNum rest (\ num rest' ->
>               returnToken cont (TokenNum (stringToInt (c:num)) TokNum) rest')
>   where stringToInt = foldl step 0
>         step n d    = digitToInt d + 10*n

Strip leading and trailing whitespace from a code fragment.

> cleanupCode :: String -> String
> cleanupCode s = trim (trim s)
>   where trim = reverse . dropWhile isSpace

This has to match for @}@ that are {\em not} in strings.  The code
here is a bit tricky, but should work in most cases.
Scan a brace-delimited code fragment, tracking brace nesting depth
and skipping over string and character literals so that braces inside
them do not count.  The accumulator holds the fragment in reverse.

> lexReadCode :: (Eq a, Num a)
>             => String -> a -> String -> (Token -> P b) -> Int
>             -> ParseResult b
> lexReadCode s depth acc = case s of
>       '\n':r -> \cont l -> lexReadCode r depth ('\n':acc) cont (l+1)
>
>       '{':r  -> lexReadCode r (depth+1) ('{':acc)
>
>       '}':r
>         | depth == 0 -> \cont -> returnToken cont (TokenInfo (
>                               cleanupCode (reverse acc)) TokCodeQuote) r
>         | otherwise  -> lexReadCode r (depth-1) ('}':acc)
>
>       '"'{-"-}:r -> lexReadString r (\ str r' ->
>                       lexReadCode r' depth ('"' : reverse str ++ '"' : acc))
>
>       -- a prime on an identifier (e.g. x') is not a char literal
>       a:'\'':r | isAlphaNum a -> lexReadCode r depth ('\'':a:acc)
>
>       '\'':r -> lexReadSingleChar r (\ str r' ->
>                       lexReadCode r' depth (reverse str ++ '\'' : acc))
>
>       ch:r -> lexReadCode r depth (ch:acc)
>
>       []   -> \_cont -> lexError "No closing '}' in code segment" []

----------------------------------------------------------------------------
Utilities that read the rest of a token.

Split off the longest identifier/number prefix and pass both halves
to the continuation.

> readId :: String -> (String -> String -> a) -> a
> readId s fn = uncurry fn (span isIdPart s)

> readNum :: String -> (String -> String -> a) -> a
> readNum s fn = uncurry fn (span isDigit s)

ASCII identifier characters only (deliberately narrower than
Data.Char.isAlphaNum, which accepts all of Unicode).

> isIdPart :: Char -> Bool
> isIdPart c =
>          c >= 'a' && c <= 'z'
>       || c >= 'A' && c <= 'Z'
>       || c >= '0' && c <= '9'
>       || c == '_'

Read one (possibly escaped) character up to and including the closing
quote; anything else yields the empty string and leaves the input
untouched.

> lexReadSingleChar :: String -> (String -> String -> a) -> a
> lexReadSingleChar ('\\':c:'\'':r) fn = fn ('\\':c:"'") r
> lexReadSingleChar (c:'\'':r)      fn = fn (c:"'") r
> lexReadSingleChar r               fn = fn "" r

Read up to a closing single quote, preserving backslash escapes.
NOTE(review): an unterminated literal is silently accepted at
end-of-input rather than reported.

> lexReadChar :: String -> (String -> String -> a) -> a
> lexReadChar ('\'':r)      fn = fn "" r
> lexReadChar ('\\':'\'':r) fn = lexReadChar r (\s -> fn ('\\':'\'':s))
> lexReadChar ('\\':c:r)    fn = lexReadChar r (\s -> fn ('\\':c:s))
> lexReadChar (c:r)         fn = lexReadChar r (\s -> fn (c:s))
> lexReadChar []            fn = fn "" []

The same, for double-quoted strings.

> lexReadString :: String -> (String -> String -> a) -> a
> lexReadString ('"'{-"-}:r)  fn = fn "" r
> lexReadString ('\\':'"':r)  fn = lexReadString r (\s -> fn ('\\':'"':s))
> lexReadString ('\\':c:r)    fn = lexReadString r (\s -> fn ('\\':c:s))
> lexReadString (c:r)         fn = lexReadString r (\s -> fn (c:s))
> lexReadString []            fn = fn "" []

Fail with the current line number prefixed to the message.

> lexError :: String -> String -> Int -> ParseResult a
> lexError err = runP (do l <- lineP
>                         failP (show l ++ ": " ++ err ++ "\n"))

Skip a {- -} comment, which may itself contain nested comments.
'startLine' remembers where the outermost comment began, for the
error message on unexpected end-of-input.

> lexNestedComment :: Int -> ([Char] -> Int -> ParseResult a) -> [Char] -> Int
>                  -> ParseResult a
> lexNestedComment startLine cont inp =
>   case inp of
>       '-':'}':r  -> cont r
>       '{':'-':r  -> \line -> lexNestedComment line
>                       (lexNestedComment startLine cont) r line
>       '\n':r     -> \line -> lexNestedComment startLine cont r (line+1)
>       _:r        -> lexNestedComment startLine cont r
>       ""         -> \_ -> lexError "unterminated comment" inp startLine