-----------------------------------------------------------------------------
The lexer.

(c) 1993-2001 Andy Gill, Simon Marlow
-----------------------------------------------------------------------------

> module Lexer (
>       Token(..),
>       TokenId(..),
>       lexer ) where

> import ParseMonad

> import Data.Char ( isSpace, isAlphaNum, isDigit, digitToInt )

> data Token
>       = TokenInfo String TokenId
>       | TokenNum  Int    TokenId
>       | TokenKW          TokenId
>       | TokenEOF

> tokenToId :: Token -> TokenId
> tokenToId (TokenInfo _ i) = i
> tokenToId (TokenNum _ i) = i
> tokenToId (TokenKW i) = i
> tokenToId TokenEOF = error "tokenToId TokenEOF"

> instance Eq Token where
>       i == i' = tokenToId i == tokenToId i'

> instance Ord Token where
>       i <= i' = tokenToId i <= tokenToId i'
> data TokenId
>       = TokId                 -- words and symbols
>       | TokSpecId_TokenType   -- %tokentype
>       | TokSpecId_Token       -- %token
>       | TokSpecId_Name        -- %name
>       | TokSpecId_Partial     -- %partial
>       | TokSpecId_ErrorHandlerType    -- %errorhandlertype
>       | TokSpecId_Lexer       -- %lexer
>       | TokSpecId_ImportedIdentity -- %importedidentity
>       | TokSpecId_Monad       -- %monad
>       | TokSpecId_Nonassoc    -- %nonassoc
>       | TokSpecId_Left        -- %left
>       | TokSpecId_Right       -- %right
>       | TokSpecId_Prec        -- %prec
>       | TokSpecId_Shift       -- %shift
>       | TokSpecId_Expect      -- %expect
>       | TokSpecId_Error       -- %error
>       | TokSpecId_Attributetype -- %attributetype
>       | TokSpecId_Attribute   -- %attribute
>       | TokCodeQuote          -- stuff inside { .. }
>       | TokColon              -- :
>       | TokSemiColon          -- ;
>       | TokDoubleColon        -- ::
>       | TokDoublePercent      -- %%
>       | TokBar                -- |
>       | TokNum                -- Integer
>       | TokParenL             -- (
>       | TokParenR             -- )
>       | TokComma              -- ,
>       deriving (Eq,Ord

#ifdef DEBUG

>               ,Show

#endif

>               )

ToDo: proper text instance here, for use in parser error messages.
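
One possible shape for such an instance, sketched here for illustration only
(the name @showTokenId@ is hypothetical, and the code is shown as commentary
rather than as part of the module), is to render the keyword tokens back to
their concrete syntax:

        showTokenId :: TokenId -> String
        showTokenId TokColon         = ":"
        showTokenId TokSemiColon     = ";"
        showTokenId TokDoubleColon   = "::"
        showTokenId TokDoublePercent = "%%"
        showTokenId TokBar           = "|"
        showTokenId TokParenL        = "("
        showTokenId TokParenR        = ")"
        showTokenId TokComma         = ","
        showTokenId _                = "<token>"  -- remaining constructors elided here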

> lexer :: (Token -> P a) -> P a
> lexer cont = mkP lexer'
>   where lexer' "" = returnToken cont TokenEOF ""
>         lexer' ('-':'-':r) = lexer' (dropWhile (/= '\n') r)
>         lexer' ('{':'-':r) = \line -> lexNestedComment line lexer' r line
>         lexer' (c:rest) = nextLex cont c rest

> returnToken :: (t -> P a) -> t -> String -> Int -> ParseResult a
> returnToken cont tok = runP (cont tok)

> nextLex :: (Token -> P a) -> Char -> String -> Int -> ParseResult a
> nextLex cont c = case c of
>       '\n'    -> \rest line -> returnToken lexer cont rest (line+1)
>       '%'     -> lexPercent cont
>       ':'     -> lexColon cont
>       ';'     -> returnToken cont (TokenKW TokSemiColon)

>       '|'     -> returnToken cont (TokenKW TokBar)
>       '\''    -> lexChar cont
>       '"'{-"-}-> lexString cont
>       '{'     -> lexCode cont

>       '('     -> returnToken cont (TokenKW TokParenL)
>       ')'     -> returnToken cont (TokenKW TokParenR)
>       ','     -> returnToken cont (TokenKW TokComma)

>       _
>         | isSpace c -> runP (lexer cont)
>         |  c >= 'a' && c <= 'z'
>            || c >= 'A' && c <= 'Z' -> lexId cont c
>         | isDigit c -> lexNum cont c
>       _       -> lexError ("lexical error before `" ++ c : "'")

Percents come in two forms: in pairs, or followed by a special identifier.
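
The special identifiers below are matched textually, one case at a time, so a
directive whose name is a prefix of a longer one has to be tried after the
longer one.  For illustration only (this table is not part of the module),
the pairs that rely on this ordering are:

        prefixOrderedDirectives :: [(String, String)]   -- longer name is matched first
        prefixOrderedDirectives =
              [ ("%tokentype",        "%token")
              , ("%errorhandlertype", "%error")
              , ("%attributetype",    "%attribute")
              ]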

> lexPercent :: (Token -> P a) -> [Char] -> Int -> ParseResult a
> lexPercent cont s = case s of
>       '%':rest -> returnToken cont (TokenKW TokDoublePercent) rest
>       't':'o':'k':'e':'n':'t':'y':'p':'e':rest ->
>               returnToken cont (TokenKW TokSpecId_TokenType) rest
>       't':'o':'k':'e':'n':rest ->
>               returnToken cont (TokenKW TokSpecId_Token) rest
>       'n':'a':'m':'e':rest ->
>               returnToken cont (TokenKW TokSpecId_Name) rest
>       'p':'a':'r':'t':'i':'a':'l':rest ->
>               returnToken cont (TokenKW TokSpecId_Partial) rest
>       'i':'m':'p':'o':'r':'t':'e':'d':'i':'d':'e':'n':'t':'i':'t':'y':rest ->
>               returnToken cont (TokenKW TokSpecId_ImportedIdentity) rest
>       'm':'o':'n':'a':'d':rest ->
>               returnToken cont (TokenKW TokSpecId_Monad) rest
>       'l':'e':'x':'e':'r':rest ->
>               returnToken cont (TokenKW TokSpecId_Lexer) rest
>       'n':'o':'n':'a':'s':'s':'o':'c':rest ->
>               returnToken cont (TokenKW TokSpecId_Nonassoc) rest
>       'l':'e':'f':'t':rest ->
>               returnToken cont (TokenKW TokSpecId_Left) rest
>       'r':'i':'g':'h':'t':rest ->
>               returnToken cont (TokenKW TokSpecId_Right) rest
>       'p':'r':'e':'c':rest ->
>               returnToken cont (TokenKW TokSpecId_Prec) rest
>       's':'h':'i':'f':'t':rest ->
>               returnToken cont (TokenKW TokSpecId_Shift) rest
>       'e':'x':'p':'e':'c':'t':rest ->
>               returnToken cont (TokenKW TokSpecId_Expect) rest
>       'e':'r':'r':'o':'r':'h':'a':'n':'d':'l':'e':'r':'t':'y':'p':'e':rest ->
>               returnToken cont (TokenKW TokSpecId_ErrorHandlerType) rest
>       'e':'r':'r':'o':'r':rest ->
>               returnToken cont (TokenKW TokSpecId_Error) rest
>       'a':'t':'t':'r':'i':'b':'u':'t':'e':'t':'y':'p':'e':rest ->
>               returnToken cont (TokenKW TokSpecId_Attributetype) rest
>       'a':'t':'t':'r':'i':'b':'u':'t':'e':rest ->
>               returnToken cont (TokenKW TokSpecId_Attribute) rest
>       _ -> lexError ("unrecognised directive: %" ++
>                               takeWhile (not.isSpace) s) s

> lexColon :: (Token -> P a) -> [Char] -> Int -> ParseResult a
> lexColon cont (':':rest) = returnToken cont (TokenKW TokDoubleColon) rest
> lexColon cont rest       = returnToken cont (TokenKW TokColon) rest

> lexId :: (Token -> P a) -> Char -> String -> Int -> ParseResult a
> lexId cont c rest =
>       readId rest (\ ident rest' -> returnToken cont (TokenInfo (c:ident) TokId) rest')

> lexChar :: (Token -> P a) -> String -> Int -> ParseResult a
> lexChar cont rest = lexReadChar rest
>       (\ ident -> returnToken cont (TokenInfo ("'" ++ ident ++ "'") TokId))

> lexString :: (Token -> P a) -> String -> Int -> ParseResult a
> lexString cont rest = lexReadString rest
>       (\ ident -> returnToken cont (TokenInfo ("\"" ++ ident ++ "\"") TokId))

> lexCode :: (Token -> P a) -> String -> Int -> ParseResult a
> lexCode cont rest = lexReadCode rest (0 :: Integer) "" cont

> lexNum :: (Token -> P a) -> Char -> String -> Int -> ParseResult a
> lexNum cont c rest =
>        readNum rest (\ num rest' ->
>                         returnToken cont (TokenNum (stringToInt (c:num)) TokNum) rest')
>  where stringToInt = foldl (\n c' -> digitToInt c' + 10*n) 0

> cleanupCode :: String -> String
> cleanupCode s =
>    dropWhile isSpace (reverse (dropWhile isSpace (reverse s)))

This has to match a @}@ that is {\em not} inside a string.  The code
here is a bit tricky, but should work in most cases.
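
For example (illustrative only), given the remaining input

        foo = "}" ++ show '}' } %%

lexReadCode skips the close braces inside the string and character literals,
stops at the final top-level @}@, and produces

        TokenInfo "foo = \"}\" ++ show '}'" TokCodeQuote

leaving the @%%@ to be lexed next.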

> lexReadCode :: (Eq a, Num a)
>             => String -> a -> String -> (Token -> P b) -> Int
>             -> ParseResult b
> lexReadCode s n c = case s of
>       '\n':r -> \cont l ->  lexReadCode r n ('\n':c) cont (l+1)
>
>       '{' :r -> lexReadCode r (n+1) ('{':c)
>
>       '}' :r
>               | n == 0    -> \cont -> returnToken cont (TokenInfo (
>                               cleanupCode (reverse c)) TokCodeQuote) r
>               | otherwise -> lexReadCode r (n-1) ('}':c)
>
>       '"'{-"-}:r -> lexReadString r (\ str r' ->
>                     lexReadCode r' n ('"' : (reverse str) ++ '"' : c))
>
>       a: '\'':r | isAlphaNum a -> lexReadCode r n ('\'':a:c)
>
>       '\'' :r -> lexReadSingleChar r (\ str r' ->
>                  lexReadCode r' n ((reverse str) ++ '\'' : c))
>
>       ch:r -> lexReadCode r n (ch:c)
>
>       [] -> \_cont -> lexError "No closing '}' in code segment" []

----------------------------------------------------------------------------
Utilities that read the rest of a token.
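
These utilities take the remaining input and a continuation, which receives
the characters consumed and the leftover input.  For example (illustrative
only):

        readId  "oo bar" (\ ident rest -> (ident, rest))  ==  ("oo", " bar")
        readNum "23;"    (\ num   rest -> (num,   rest))  ==  ("23", ";")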

> readId :: String -> (String -> String -> a) -> a
> readId (c:r) fn | isIdPart c = readId r (fn . (:) c)
> readId r     fn = fn [] r

> readNum :: String -> (String -> String -> a) -> a
> readNum (c:r) fn | isDigit c = readNum r (fn . (:) c)
> readNum r     fn = fn [] r

> isIdPart :: Char -> Bool
> isIdPart c =
>          c >= 'a' && c <= 'z'
>       || c >= 'A' && c <= 'Z'
>       || c >= '0' && c <= '9'
>       || c == '_'

> lexReadSingleChar :: String -> (String -> String -> a) -> a
> lexReadSingleChar ('\\':c:'\'':r) fn = fn ('\\':c:"'") r
> lexReadSingleChar (c:'\'':r)      fn = fn (c:"'") r
> lexReadSingleChar r               fn = fn "" r

> lexReadChar :: String -> (String -> String -> a) -> a
> lexReadChar ('\'':r)      fn = fn "" r
> lexReadChar ('\\':'\'':r) fn = lexReadChar r (fn . (:) '\\' . (:) '\'')
> lexReadChar ('\\':c:r)    fn = lexReadChar r (fn . (:) '\\' . (:) c)
> lexReadChar (c:r)         fn = lexReadChar r (fn . (:) c)
> lexReadChar []            fn = fn "" []

> lexReadString :: String -> (String -> String -> a) -> a
> lexReadString ('"'{-"-}:r) fn = fn "" r
> lexReadString ('\\':'"':r) fn = lexReadString r (fn . (:) '\\' . (:) '"')
> lexReadString ('\\':c:r)   fn = lexReadString r (fn . (:) '\\' . (:) c)
> lexReadString (c:r)        fn = lexReadString r (fn . (:) c)
> lexReadString []           fn = fn "" []

> lexError :: String -> String -> Int -> ParseResult a
> lexError err = runP (lineP >>= \l -> failP (show l ++ ": " ++ err ++ "\n"))

> lexNestedComment :: Int -> ([Char] -> Int -> ParseResult a) -> [Char] -> Int
>                  -> ParseResult a
> lexNestedComment l cont r =
>   case r of
>       '-':'}':r' -> cont r'
>       '{':'-':r' -> \line -> lexNestedComment line
>                       (\r'' -> lexNestedComment l cont r'') r' line
>       '\n':r'    -> \line -> lexNestedComment l cont r' (line+1)
>       _:r'       -> lexNestedComment l cont r'
>       ""         -> \_ -> lexError "unterminated comment" r l
