-----------------------------------------------------------------------------
The lexer.

(c) 1993-2001 Andy Gill, Simon Marlow
-----------------------------------------------------------------------------

This is a Bird-track literate Haskell file: lines beginning with `>' are
code, everything else is comment.  The file is also run through cpp, so
the #ifdef below is a preprocessor directive, not Haskell.

> module Lexer (
>       Token(..),
>       TokenId(..),
>       lexer ) where

> import ParseMonad

> import Data.Char ( isSpace, isAlphaNum, isDigit, digitToInt )

A token is its classifying TokenId, optionally carrying a payload: the
source text for identifiers/strings/code (TokenInfo) or the decoded
value for numbers (TokenNum).

> data Token
>       = TokenInfo String TokenId
>       | TokenNum Int TokenId
>       | TokenKW TokenId
>       | TokenEOF

Extract the classifying id of a token.  TokenEOF deliberately has no
id, so applying tokenToId to it is a program error -- the Eq/Ord
instances below must therefore never be used on TokenEOF.

> tokenToId :: Token -> TokenId
> tokenToId (TokenInfo _ i) = i
> tokenToId (TokenNum _ i) = i
> tokenToId (TokenKW i) = i
> tokenToId TokenEOF = error "tokenToId TokenEOF"

Tokens compare by their TokenId alone: the String/Int payload is
ignored, which is what the parser's terminal matching requires.

> instance Eq Token where
>       i == i' = tokenToId i == tokenToId i'

> instance Ord Token where
>       i <= i' = tokenToId i <= tokenToId i'

> data TokenId
>       = TokId                       -- words and symbols
>       | TokSpecId_TokenType         -- %tokentype
>       | TokSpecId_Token             -- %token
>       | TokSpecId_Name              -- %name
>       | TokSpecId_Partial           -- %partial
>       | TokSpecId_ErrorHandlerType  -- %errorhandlertype
>       | TokSpecId_Lexer             -- %lexer
>       | TokSpecId_ImportedIdentity  -- %importedidentity
>       | TokSpecId_Monad             -- %monad
>       | TokSpecId_Nonassoc          -- %nonassoc
>       | TokSpecId_Left              -- %left
>       | TokSpecId_Right             -- %right
>       | TokSpecId_Prec              -- %prec
>       | TokSpecId_Expect            -- %expect
>       | TokSpecId_Error             -- %error
>       | TokSpecId_Attributetype     -- %attributetype
>       | TokSpecId_Attribute         -- %attribute
>       | TokCodeQuote                -- stuff inside { .. }
>       | TokColon                    -- :
>       | TokSemiColon                -- ;
>       | TokDoubleColon              -- ::
>       | TokDoublePercent            -- %%
>       | TokBar                      -- |
>       | TokNum                      -- Integer
>       | TokParenL                   -- (
>       | TokParenR                   -- )
>       | TokComma                    -- ,
>       deriving (Eq,Ord

Show is only derived in DEBUG builds (cpp flag).

#ifdef DEBUG

>       ,Show

#endif

>       )

ToDo: proper text instance here, for use in parser error messages.
The lexer is written in continuation-passing style: `cont' receives
each token in turn.  A P computation is (under mkP/runP) a function of
the remaining input String and the current line number Int, so the
helpers below are curried over those two hidden arguments.

> lexer :: (Token -> P a) -> P a
> lexer cont = mkP lexer'
>   where
>       -- End of input: deliver TokenEOF with no remaining text.
>       lexer' "" = returnToken cont TokenEOF ""
>       -- Line comment: skip to (but not past) the newline, so the
>       -- '\n' case in nextLex still bumps the line counter.
>       lexer' ('-':'-':r) = lexer' (dropWhile (/= '\n') r)
>       -- Nested block comment: remember the opening line for the
>       -- "unterminated comment" error message.
>       lexer' ('{':'-':r) = \line -> lexNestedComment line lexer' r line
>       lexer' (c:rest) = nextLex cont c rest

Hand `tok' to the continuation and resume parsing on the remaining
input (the String and Int arguments are supplied by the caller).

> returnToken :: (t -> P a) -> t -> String -> Int -> ParseResult a
> returnToken cont tok = runP (cont tok)

Dispatch on the first character of the next token.

> nextLex :: (Token -> P a) -> Char -> String -> Int -> ParseResult a
> nextLex cont c = case c of
>       -- Newline: restart the lexer with the line counter bumped.
>       '\n'    -> \rest line -> returnToken lexer cont rest (line+1)
>       '%'     -> lexPercent cont
>       ':'     -> lexColon cont
>       ';'     -> returnToken cont (TokenKW TokSemiColon)

>       '|'     -> returnToken cont (TokenKW TokBar)
>       '\''    -> lexChar cont
>       '"'{-"-}-> lexString cont
>       '{'     -> lexCode cont

>       '('     -> returnToken cont (TokenKW TokParenL)
>       ')'     -> returnToken cont (TokenKW TokParenR)
>       ','     -> returnToken cont (TokenKW TokComma)

>       -- Guarded wildcard: whitespace restarts the lexer; letters
>       -- start an identifier; digits start a number.  If no guard
>       -- matches, control falls through to the catch-all below.
>       _
>         | isSpace c -> runP (lexer cont)
>         | c >= 'a' && c <= 'z'
>           || c >= 'A' && c <= 'Z' -> lexId cont c
>         | isDigit c -> lexNum cont c
>       _ -> lexError ("lexical error before `" ++ c : "'")

Percents come in two forms, in pairs, or
followed by a special identifier.
Recognise a %-directive.  NOTE: the patterns are tried top to bottom,
so longer directives must precede their prefixes ("tokentype" before
"token", "errorhandlertype" before "error") -- keep that ordering when
adding new directives.

> lexPercent :: (Token -> P a) -> [Char] -> Int -> ParseResult a
> lexPercent cont s = case s of
>       '%':rest -> returnToken cont (TokenKW TokDoublePercent) rest
>       't':'o':'k':'e':'n':'t':'y':'p':'e':rest ->
>               returnToken cont (TokenKW TokSpecId_TokenType) rest
>       't':'o':'k':'e':'n':rest ->
>               returnToken cont (TokenKW TokSpecId_Token) rest
>       'n':'a':'m':'e':rest ->
>               returnToken cont (TokenKW TokSpecId_Name) rest
>       'p':'a':'r':'t':'i':'a':'l':rest ->
>               returnToken cont (TokenKW TokSpecId_Partial) rest
>       'i':'m':'p':'o':'r':'t':'e':'d':'i':'d':'e':'n':'t':'i':'t':'y':rest ->
>               returnToken cont (TokenKW TokSpecId_ImportedIdentity) rest
>       'm':'o':'n':'a':'d':rest ->
>               returnToken cont (TokenKW TokSpecId_Monad) rest
>       'l':'e':'x':'e':'r':rest ->
>               returnToken cont (TokenKW TokSpecId_Lexer) rest
>       'n':'o':'n':'a':'s':'s':'o':'c':rest ->
>               returnToken cont (TokenKW TokSpecId_Nonassoc) rest
>       'l':'e':'f':'t':rest ->
>               returnToken cont (TokenKW TokSpecId_Left) rest
>       'r':'i':'g':'h':'t':rest ->
>               returnToken cont (TokenKW TokSpecId_Right) rest
>       'p':'r':'e':'c':rest ->
>               returnToken cont (TokenKW TokSpecId_Prec) rest
>       'e':'x':'p':'e':'c':'t':rest ->
>               returnToken cont (TokenKW TokSpecId_Expect) rest
>       'e':'r':'r':'o':'r':'h':'a':'n':'d':'l':'e':'r':'t':'y':'p':'e':rest ->
>               returnToken cont (TokenKW TokSpecId_ErrorHandlerType) rest
>       'e':'r':'r':'o':'r':rest ->
>               returnToken cont (TokenKW TokSpecId_Error) rest
>       'a':'t':'t':'r':'i':'b':'u':'t':'e':'t':'y':'p':'e':rest ->
>               returnToken cont (TokenKW TokSpecId_Attributetype) rest
>       'a':'t':'t':'r':'i':'b':'u':'t':'e':rest ->
>               returnToken cont (TokenKW TokSpecId_Attribute) rest
>       -- Unknown directive: report the word following the '%'.
>       _ -> lexError ("unrecognised directive: %" ++
>               takeWhile (not.isSpace) s) s

":" or "::", decided by one character of lookahead.

> lexColon :: (Token -> P a) -> [Char] -> Int -> ParseResult a
> lexColon cont (':':rest) = returnToken cont (TokenKW TokDoubleColon) rest
> lexColon cont rest       = returnToken cont (TokenKW TokColon) rest

An identifier: `c' is its (already consumed) first character.

> lexId :: (Token -> P a) -> Char -> String -> Int -> ParseResult a
> lexId cont c rest =
>       readId rest (\ ident rest' -> returnToken cont (TokenInfo (c:ident) TokId) rest')

A 'quoted' terminal name; the quotes are kept in the token text.

> lexChar :: (Token -> P a) -> String -> Int -> ParseResult a
> lexChar cont rest = lexReadChar rest
>       (\ ident -> returnToken cont (TokenInfo ("'" ++ ident ++ "'") TokId))

A "quoted" terminal name; the quotes are kept in the token text.

> lexString :: (Token -> P a) -> String -> Int -> ParseResult a
> lexString cont rest = lexReadString rest
>       (\ ident -> returnToken cont (TokenInfo ("\"" ++ ident ++ "\"") TokId))

A { code } quote; 0 is the initial brace-nesting depth inside it.

> lexCode :: (Token -> P a) -> String -> Int -> ParseResult a
> lexCode cont rest = lexReadCode rest (0 :: Integer) "" cont

A decimal number: `c' is its first digit; the remaining digits are
accumulated and folded into an Int most-significant-first.

> lexNum :: (Token -> P a) -> Char -> String -> Int -> ParseResult a
> lexNum cont c rest =
>       readNum rest (\ num rest' ->
>               returnToken cont (TokenNum (stringToInt (c:num)) TokNum) rest')
>  where stringToInt = foldl (\n c' -> digitToInt c' + 10*n) 0

Trim leading and trailing whitespace from a code quote.

> cleanupCode :: String -> String
> cleanupCode s =
>    dropWhile isSpace (reverse (dropWhile isSpace (reverse s)))

This has to match for @}@ that are {\em not} in strings.  The code
here is a bit tricky, but should work in most cases.
Scan a { code } quote.  `n' is the current depth of nested braces,
`c' accumulates the scanned text in REVERSE order (it is reversed once
at the closing brace).  Strings and character literals inside the code
are scanned specially so that a '}' inside them does not close the
quote.

> lexReadCode :: (Eq a, Num a)
>             => String -> a -> String -> (Token -> P b) -> Int
>             -> ParseResult b
> lexReadCode s n c = case s of
>       -- Track line numbers across the (possibly multi-line) code.
>       '\n':r -> \cont l -> lexReadCode r n ('\n':c) cont (l+1)

>       '{' :r -> lexReadCode r (n+1) ('{':c)

>       -- '}' at depth 0 ends the quote; otherwise just un-nest.
>       '}' :r
>               | n == 0 -> \cont -> returnToken cont (TokenInfo (
>                               cleanupCode (reverse c)) TokCodeQuote) r
>               | otherwise -> lexReadCode r (n-1) ('}':c)

>       -- A string literal: consume it whole (braces inside ignored).
>       '"'{-"-}:r -> lexReadString r (\ str r' ->
>                     lexReadCode r' n ('"' : (reverse str) ++ '"' : c))

>       -- A prime suffix on an identifier (e.g. x') is NOT the start
>       -- of a character literal.
>       a: '\'':r | isAlphaNum a -> lexReadCode r n ('\'':a:c)

>       -- A genuine character literal: consume it whole.
>       '\'' :r -> lexReadSingleChar r (\ str r' ->
>                  lexReadCode r' n ((reverse str) ++ '\'' : c))

>       ch:r -> lexReadCode r n (ch:c)

>       [] -> \_cont -> lexError "No closing '}' in code segment" []

----------------------------------------------------------------------------
Utilities that read the rest of a token.  Each takes a continuation
`fn' that receives the scanned text and the remaining input.

> readId :: String -> (String -> String -> a) -> a
> readId (c:r) fn | isIdPart c = readId r (fn . (:) c)
> readId r fn = fn [] r

> readNum :: String -> (String -> String -> a) -> a
> readNum (c:r) fn | isDigit c = readNum r (fn . (:) c)
> readNum r fn = fn [] r

Identifier characters: ASCII letters, digits and underscore only.

> isIdPart :: Char -> Bool
> isIdPart c =
>          c >= 'a' && c <= 'z'
>       || c >= 'A' && c <= 'Z'
>       || c >= '0' && c <= '9'
>       || c == '_'

Read one character literal (opening ' already consumed); the scanned
text includes the closing quote.  Escapes are passed through verbatim.

> lexReadSingleChar :: String -> (String -> String -> a) -> a
> lexReadSingleChar ('\\':c:'\'':r) fn = fn ('\\':c:"'") r
> lexReadSingleChar (c:'\'':r) fn = fn (c:"'") r
> lexReadSingleChar r fn = fn "" r

Read up to a closing ' (which is consumed but not included); escaped
quotes and other escapes are kept verbatim in the result.

> lexReadChar :: String -> (String -> String -> a) -> a
> lexReadChar ('\'':r) fn = fn "" r
> lexReadChar ('\\':'\'':r) fn = lexReadChar r (fn . (:) '\\' . (:) '\'')
> lexReadChar ('\\':c:r) fn = lexReadChar r (fn . (:) '\\' . (:) c)
> lexReadChar (c:r) fn = lexReadChar r (fn . (:) c)
> lexReadChar [] fn = fn "" []

Same as lexReadChar but for a closing " -- used both for terminal
names and for string literals inside code quotes.

> lexReadString :: String -> (String -> String -> a) -> a
> lexReadString ('"'{-"-}:r) fn = fn "" r
> lexReadString ('\\':'"':r) fn = lexReadString r (fn . (:) '\\' . (:) '"')
> lexReadString ('\\':c:r) fn = lexReadString r (fn . (:) '\\' . (:) c)
> lexReadString (c:r) fn = lexReadString r (fn . (:) c)
> lexReadString [] fn = fn "" []

Fail with `err', prefixed by the current line number from the parse
monad.

> lexError :: String -> String -> Int -> ParseResult a
> lexError err = runP (lineP >>= \l -> failP (show l ++ ": " ++ err ++ "\n"))

Skip a {- -} comment, handling nesting.  `l' is the line on which this
comment level was opened (used for the error message); `cont' is
resumed on the text after the matching close.

> lexNestedComment :: Int -> ([Char] -> Int -> ParseResult a) -> [Char] -> Int
>                  -> ParseResult a
> lexNestedComment l cont r =
>   case r of
>       '-':'}':r' -> cont r'
>       -- A nested open: recurse with a continuation that resumes
>       -- scanning the outer comment level.
>       '{':'-':r' -> \line -> lexNestedComment line
>                       (\r'' -> lexNestedComment l cont r'') r' line
>       '\n':r' -> \line -> lexNestedComment l cont r' (line+1)
>       _:r' -> lexNestedComment l cont r'
>       "" -> \_ -> lexError "unterminated comment" r l