1-- C->Haskell Compiler: Lexer for CHS Files 2-- 3-- Author : Manuel M T Chakravarty 4-- Created: 13 August 99 5-- 6-- Version $Revision: 1.1 $ from $Date: 2004/11/21 21:05:35 $ 7-- 8-- Copyright (c) [1999..2004] Manuel M T Chakravarty 9-- 10-- This file is free software; you can redistribute it and/or modify 11-- it under the terms of the GNU General Public License as published by 12-- the Free Software Foundation; either version 2 of the License, or 13-- (at your option) any later version. 14-- 15-- This file is distributed in the hope that it will be useful, 16-- but WITHOUT ANY WARRANTY; without even the implied warranty of 17-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18-- GNU General Public License for more details. 19-- 20--- DESCRIPTION --------------------------------------------------------------- 21-- 22-- Lexer for CHS files; the tokens are only partially recognised. 23-- 24--- DOCU ---------------------------------------------------------------------- 25-- 26-- language: Haskell 98 27-- 28-- * CHS files are assumed to be Haskell 98 files that include C2HS binding 29-- hooks. 30-- 31-- * Haskell code is not tokenised, but binding hooks (delimited by `{#'and 32-- `#}') are analysed. Therefore the lexer operates in two states 33-- (realised as two lexer coupled by meta actions) depending on whether 34-- Haskell code or a binding hook is currently read. The lexer reading 35-- Haskell code is called `base lexer'; the other one, `binding-hook 36-- lexer'. In addition, there is a inline-c lexer, which, as the 37-- binding-hook lexer, can be triggered from the base lexer. 38-- 39-- * Base lexer: 40-- 41-- haskell -> (inline \\ special)* 42-- | special \\ `"' 43-- | comment 44-- | nested 45-- | hstring 46-- | '{#' 47-- | cpp 48-- special -> `(' | `{' | `-' | `"' 49-- ctrl -> `\n' | `\f' | `\r' | `\t' | `\v' 50-- 51-- inline -> any \\ ctrl 52-- any -> '\0'..'\255' 53-- 54-- Within the base lexer control codes appear as separate tokens in the 55-- token list. 56-- 57-- NOTE: It is important that `{' is an extra lexeme and not added as an 58-- optional component at the end of the first alternative for 59-- `haskell'. Otherwise, the principle of the longest match will 60-- divide `foo {#' into the tokens `foo {' and `#' instead of `foo ' 61-- and `{#'. 62-- 63-- One line comments are handled by 64-- 65-- comment -> `--' (any \\ `\n')* `\n' 66-- 67-- and nested comments by 68-- 69-- nested -> `{-' any* `-}' 70-- 71-- where `any*' may contain _balanced_ occurrences of `{-' and `-}'. 72-- 73-- hstring -> `"' inhstr* `"' 74-- inhstr -> ` '..`\127' \\ `"' 75-- | `\"' 76-- 77-- Pre-precessor directives as well as the switch to inline-C code are 78-- formed as follows: 79-- 80-- cpp -> `\n#' (inline | `\t')* `\n' 81-- | `\n#c' (' ' | '\t')* `\n' 82-- 83-- We allow whitespace between the `#' and the actual directive, but in `#c' 84-- and `#endc' the directive must immediately follow the `#'. This might 85-- be regarded as a not entirely orthogonal design, but simplifies matters 86-- especially for `#endc'. 87-- 88-- * On encountering the lexeme `{#', a meta action in the base lexer 89-- transfers control to the following binding-hook lexer: 90-- 91-- ident -> letter (letter | digit | `\'')* 92-- | `\'' letter (letter | digit)* `\'' 93-- reservedid -> `as' | `call' | `class' | `context' | `deriving' 94-- | `enum' | `foreign' | `fun' | `get' | `lib' 95-- | `newtype' | `pointer' | `prefix' | `pure' | `set' 96-- | `sizeof' | `stable' | `type' | `underscoreToCase' 97-- | `unsafe' | `with' | 'lock' | 'unlock' 98-- reservedsym -> `{#' | `#}' | `{' | `}' | `,' | `.' | `->' | `=' 99-- | `=>' | '-' | `*' | `&' | `^' 100-- string -> `"' instr* `"' 101-- verbhs -> `\`' instr* `\'' 102-- instr -> ` '..`\127' \\ `"' 103-- comment -> `--' (any \\ `\n')* `\n' 104-- 105-- Control characters, white space, and comments are discarded in the 106-- binding-hook lexer. Nested comments are not allowed in a binding hook. 107-- Identifiers can be enclosed in single quotes to avoid collision with 108-- C->Haskell keywords. 109-- 110-- * In the binding-hook lexer, the lexeme `#}' transfers control back to the 111-- base lexer. An occurence of the lexeme `{#' inside the binding-hook 112-- lexer triggers an error. The symbol `{#' is not explcitly represented 113-- in the resulting token stream. However, the occurrence of a token 114-- representing one of the reserved identifiers `call', `context', `enum', 115-- and `field' marks the start of a binding hook. Strictly speaking, `#}' 116-- need also not occur in the token stream, as the next `haskell' token 117-- marks a hook's end. It is, however, useful for producing accurate error 118-- messages (in case an hook is closed to early) to have a token 119-- representing `#}'. 120-- 121-- * The rule `ident' describes Haskell identifiers, but without 122-- distinguishing between variable and constructor identifers (ie, those 123-- starting with a lowercase and those starting with an uppercase letter). 124-- However, we use it also to scan C identifiers; although, strictly 125-- speaking, it is too general for them. In the case of C identifiers, 126-- this should not have any impact on the range of descriptions accepted by 127-- the tool, as illegal identifier will never occur in a C header file that 128-- is accepted by the C lexer. In the case of Haskell identifiers, a 129-- confusion between variable and constructor identifiers will be noted by 130-- the Haskell compiler translating the code generated by c2hs. Moreover, 131-- identifiers can be enclosed in single quotes to avoid collision with 132-- C->Haskell keywords, but those may not contain apostrophes. 133-- 134-- * Any line starting with the character `#' is regarded to be a C 135-- preprocessor directive. With the exception of `#c' and `#endc', which 136-- delimit a set of lines containing inline C code. Hence, in the base 137-- lexer, the lexeme `#c' triggers a meta action transferring control to the 138-- following inline-C lexer: 139-- 140-- c -> inline* \\ `\n#endc' 141-- 142-- We do neither treat C strings nor C comments specially. Hence, if the 143-- string "\n#endc" occurs in a comment, we will mistakenly regard it as 144-- the end of the inline C code. Note that the problem cannot happen with 145-- strings, as C does not permit strings that extend over multiple lines. 146-- At the moment, it just seems not to be worth the effort required to 147-- treat this situation more accurately. 148-- 149-- The inline-C lexer also doesn't handle pre-processor directives 150-- specially. Hence, structural pre-processor directives (namely, 151-- conditionals) may occur within inline-C code only properly nested. 152-- 153-- Shortcomings 154-- ~~~~~~~~~~~~ 155-- Some lexemes that include single and double quote characters are not lexed 156-- correctly. See the implementation comment at `haskell' for details. 157-- 158-- 159--- TODO ---------------------------------------------------------------------- 160-- 161-- * In `haskell', the case of a single `"' (without a matching second one) 162-- is caught by an eplicit error raising rule. This shouldn't be 163-- necessary, but for some strange reason, the lexer otherwise hangs when a 164-- single `"' appears in the input. 165-- 166-- * Comments in the "gap" of a string are not yet supported. 167-- 168 169module CHSLexer (CHSToken(..), lexCHS) 170where 171 172import Data.List ((\\)) 173import Data.Char (isDigit) 174import Control.Monad (liftM) 175import Numeric (readDec, readOct, readHex) 176 177import Position (Position(..), Pos(posOf), incPos, retPos, tabPos) 178import Errors (ErrorLvl(..), Error, makeError) 179import UNames (NameSupply, Name, names) 180import Idents (Ident, lexemeToIdent, identToLexeme) 181import Lexers (Regexp, Lexer, Action, epsilon, char, (+>), lexaction, 182 lexactionErr, lexmeta, (>|<), (>||<), ctrlLexer, star, plus, 183 quest, alt, string, LexerState, execLexer) 184 185import C2HSState (CST, raise, raiseError, nop, getNameSupply) 186 187 188-- token definition 189-- ---------------- 190 191-- possible tokens (EXPORTED) 192-- 193data CHSToken = CHSTokArrow Position -- `->' 194 | CHSTokDArrow Position -- `=>' 195 | CHSTokDot Position -- `.' 196 | CHSTokComma Position -- `,' 197 | CHSTokEqual Position -- `=' 198 | CHSTokMinus Position -- `-' 199 | CHSTokStar Position -- `*' 200 | CHSTokAmp Position -- `&' 201 | CHSTokHat Position -- `^' 202 | CHSTokLBrace Position -- `{' 203 | CHSTokRBrace Position -- `}' 204 | CHSTokLParen Position -- `(' 205 | CHSTokRParen Position -- `)' 206 | CHSTokEndHook Position -- `#}' 207 | CHSTokAs Position -- `as' 208 | CHSTokCall Position -- `call' 209 | CHSTokClass Position -- `class' 210 | CHSTokContext Position -- `context' 211 | CHSTokDerive Position -- `deriving' 212 | CHSTokEnum Position -- `enum' 213 | CHSTokForeign Position -- `foreign' 214 | CHSTokFun Position -- `fun' 215 | CHSTokGet Position -- `get' 216 | CHSTokImport Position -- `import' 217 | CHSTokLib Position -- `lib' 218 | CHSTokNewtype Position -- `newtype' 219 | CHSTokPointer Position -- `pointer' 220 | CHSTokPrefix Position -- `prefix' 221 | CHSTokPure Position -- `pure' 222 | CHSTokQualif Position -- `qualified' 223 | CHSTokSet Position -- `set' 224 | CHSTokSizeof Position -- `sizeof' 225 | CHSTokStable Position -- `stable' 226 | CHSTokType Position -- `type' 227 | CHSTok_2Case Position -- `underscoreToCase' 228 | CHSTokUnsafe Position -- `unsafe' 229 | CHSTokWith Position -- `with' 230 | CHSTokLock Position -- `lock' 231 | CHSTokNolock Position -- `nolock' 232 | CHSTokString Position String -- string 233 | CHSTokHSVerb Position String -- verbatim Haskell (`...') 234 | CHSTokIdent Position Ident -- identifier 235 | CHSTokHaskell Position String -- verbatim Haskell code 236 | CHSTokCPP Position String -- pre-processor directive 237 | CHSTokLine Position -- line pragma 238 | CHSTokC Position String -- verbatim C code 239 | CHSTokCtrl Position Char -- control code 240 | CHSTokPragma Position -- '{-# LANGUAGE' language pragma begin 241 | CHSTokPragEnd Position -- '#-}' language pragma end 242 243instance Pos CHSToken where 244 posOf (CHSTokArrow pos ) = pos 245 posOf (CHSTokDArrow pos ) = pos 246 posOf (CHSTokDot pos ) = pos 247 posOf (CHSTokComma pos ) = pos 248 posOf (CHSTokEqual pos ) = pos 249 posOf (CHSTokMinus pos ) = pos 250 posOf (CHSTokStar pos ) = pos 251 posOf (CHSTokAmp pos ) = pos 252 posOf (CHSTokHat pos ) = pos 253 posOf (CHSTokLBrace pos ) = pos 254 posOf (CHSTokRBrace pos ) = pos 255 posOf (CHSTokLParen pos ) = pos 256 posOf (CHSTokRParen pos ) = pos 257 posOf (CHSTokEndHook pos ) = pos 258 posOf (CHSTokAs pos ) = pos 259 posOf (CHSTokCall pos ) = pos 260 posOf (CHSTokClass pos ) = pos 261 posOf (CHSTokContext pos ) = pos 262 posOf (CHSTokDerive pos ) = pos 263 posOf (CHSTokEnum pos ) = pos 264 posOf (CHSTokForeign pos ) = pos 265 posOf (CHSTokFun pos ) = pos 266 posOf (CHSTokGet pos ) = pos 267 posOf (CHSTokImport pos ) = pos 268 posOf (CHSTokLib pos ) = pos 269 posOf (CHSTokNewtype pos ) = pos 270 posOf (CHSTokPointer pos ) = pos 271 posOf (CHSTokPrefix pos ) = pos 272 posOf (CHSTokPure pos ) = pos 273 posOf (CHSTokQualif pos ) = pos 274 posOf (CHSTokSet pos ) = pos 275 posOf (CHSTokSizeof pos ) = pos 276 posOf (CHSTokStable pos ) = pos 277 posOf (CHSTokType pos ) = pos 278 posOf (CHSTok_2Case pos ) = pos 279 posOf (CHSTokUnsafe pos ) = pos 280 posOf (CHSTokWith pos ) = pos 281 posOf (CHSTokLock pos ) = pos 282 posOf (CHSTokNolock pos ) = pos 283 posOf (CHSTokString pos _) = pos 284 posOf (CHSTokHSVerb pos _) = pos 285 posOf (CHSTokIdent pos _) = pos 286 posOf (CHSTokHaskell pos _) = pos 287 posOf (CHSTokCPP pos _) = pos 288 posOf (CHSTokC pos _) = pos 289 posOf (CHSTokCtrl pos _) = pos 290 posOf (CHSTokPragma pos ) = pos 291 posOf (CHSTokPragEnd pos ) = pos 292 293instance Eq CHSToken where 294 (CHSTokArrow _ ) == (CHSTokArrow _ ) = True 295 (CHSTokDArrow _ ) == (CHSTokDArrow _ ) = True 296 (CHSTokDot _ ) == (CHSTokDot _ ) = True 297 (CHSTokComma _ ) == (CHSTokComma _ ) = True 298 (CHSTokEqual _ ) == (CHSTokEqual _ ) = True 299 (CHSTokMinus _ ) == (CHSTokMinus _ ) = True 300 (CHSTokStar _ ) == (CHSTokStar _ ) = True 301 (CHSTokAmp _ ) == (CHSTokAmp _ ) = True 302 (CHSTokHat _ ) == (CHSTokHat _ ) = True 303 (CHSTokLBrace _ ) == (CHSTokLBrace _ ) = True 304 (CHSTokRBrace _ ) == (CHSTokRBrace _ ) = True 305 (CHSTokLParen _ ) == (CHSTokLParen _ ) = True 306 (CHSTokRParen _ ) == (CHSTokRParen _ ) = True 307 (CHSTokEndHook _ ) == (CHSTokEndHook _ ) = True 308 (CHSTokAs _ ) == (CHSTokAs _ ) = True 309 (CHSTokCall _ ) == (CHSTokCall _ ) = True 310 (CHSTokClass _ ) == (CHSTokClass _ ) = True 311 (CHSTokContext _ ) == (CHSTokContext _ ) = True 312 (CHSTokDerive _ ) == (CHSTokDerive _ ) = True 313 (CHSTokEnum _ ) == (CHSTokEnum _ ) = True 314 (CHSTokForeign _ ) == (CHSTokForeign _ ) = True 315 (CHSTokFun _ ) == (CHSTokFun _ ) = True 316 (CHSTokGet _ ) == (CHSTokGet _ ) = True 317 (CHSTokImport _ ) == (CHSTokImport _ ) = True 318 (CHSTokLib _ ) == (CHSTokLib _ ) = True 319 (CHSTokNewtype _ ) == (CHSTokNewtype _ ) = True 320 (CHSTokPointer _ ) == (CHSTokPointer _ ) = True 321 (CHSTokPrefix _ ) == (CHSTokPrefix _ ) = True 322 (CHSTokPure _ ) == (CHSTokPure _ ) = True 323 (CHSTokQualif _ ) == (CHSTokQualif _ ) = True 324 (CHSTokSet _ ) == (CHSTokSet _ ) = True 325 (CHSTokSizeof _ ) == (CHSTokSizeof _ ) = True 326 (CHSTokStable _ ) == (CHSTokStable _ ) = True 327 (CHSTokType _ ) == (CHSTokType _ ) = True 328 (CHSTok_2Case _ ) == (CHSTok_2Case _ ) = True 329 (CHSTokUnsafe _ ) == (CHSTokUnsafe _ ) = True 330 (CHSTokWith _ ) == (CHSTokWith _ ) = True 331 (CHSTokLock _ ) == (CHSTokLock _ ) = True 332 (CHSTokNolock _ ) == (CHSTokNolock _ ) = True 333 (CHSTokString _ _) == (CHSTokString _ _) = True 334 (CHSTokHSVerb _ _) == (CHSTokHSVerb _ _) = True 335 (CHSTokIdent _ _) == (CHSTokIdent _ _) = True 336 (CHSTokHaskell _ _) == (CHSTokHaskell _ _) = True 337 (CHSTokCPP _ _) == (CHSTokCPP _ _) = True 338 (CHSTokC _ _) == (CHSTokC _ _) = True 339 (CHSTokCtrl _ _) == (CHSTokCtrl _ _) = True 340 (CHSTokPragma _ ) == (CHSTokPragma _ ) = True 341 (CHSTokPragEnd _ ) == (CHSTokPragEnd _ ) = True 342 _ == _ = False 343 344instance Show CHSToken where 345 showsPrec _ (CHSTokArrow _ ) = showString "->" 346 showsPrec _ (CHSTokDArrow _ ) = showString "=>" 347 showsPrec _ (CHSTokDot _ ) = showString "." 348 showsPrec _ (CHSTokComma _ ) = showString "," 349 showsPrec _ (CHSTokEqual _ ) = showString "=" 350 showsPrec _ (CHSTokMinus _ ) = showString "-" 351 showsPrec _ (CHSTokStar _ ) = showString "*" 352 showsPrec _ (CHSTokAmp _ ) = showString "&" 353 showsPrec _ (CHSTokHat _ ) = showString "^" 354 showsPrec _ (CHSTokLBrace _ ) = showString "{" 355 showsPrec _ (CHSTokRBrace _ ) = showString "}" 356 showsPrec _ (CHSTokLParen _ ) = showString "(" 357 showsPrec _ (CHSTokRParen _ ) = showString ")" 358 showsPrec _ (CHSTokEndHook _ ) = showString "#}" 359 showsPrec _ (CHSTokAs _ ) = showString "as" 360 showsPrec _ (CHSTokCall _ ) = showString "call" 361 showsPrec _ (CHSTokClass _ ) = showString "class" 362 showsPrec _ (CHSTokContext _ ) = showString "context" 363 showsPrec _ (CHSTokDerive _ ) = showString "deriving" 364 showsPrec _ (CHSTokEnum _ ) = showString "enum" 365 showsPrec _ (CHSTokForeign _ ) = showString "foreign" 366 showsPrec _ (CHSTokFun _ ) = showString "fun" 367 showsPrec _ (CHSTokGet _ ) = showString "get" 368 showsPrec _ (CHSTokImport _ ) = showString "import" 369 showsPrec _ (CHSTokLib _ ) = showString "lib" 370 showsPrec _ (CHSTokNewtype _ ) = showString "newtype" 371 showsPrec _ (CHSTokPointer _ ) = showString "pointer" 372 showsPrec _ (CHSTokPrefix _ ) = showString "prefix" 373 showsPrec _ (CHSTokPure _ ) = showString "pure" 374 showsPrec _ (CHSTokQualif _ ) = showString "qualified" 375 showsPrec _ (CHSTokSet _ ) = showString "set" 376 showsPrec _ (CHSTokSizeof _ ) = showString "sizeof" 377 showsPrec _ (CHSTokStable _ ) = showString "stable" 378 showsPrec _ (CHSTokType _ ) = showString "type" 379 showsPrec _ (CHSTok_2Case _ ) = showString "underscoreToCase" 380 showsPrec _ (CHSTokUnsafe _ ) = showString "unsafe" 381 showsPrec _ (CHSTokWith _ ) = showString "with" 382 showsPrec _ (CHSTokLock _ ) = showString "lock" 383 showsPrec _ (CHSTokNolock _ ) = showString "nolock" 384 showsPrec _ (CHSTokString _ s) = showString ("\"" ++ s ++ "\"") 385 showsPrec _ (CHSTokHSVerb _ s) = showString ("`" ++ s ++ "'") 386 showsPrec _ (CHSTokIdent _ i) = (showString . identToLexeme) i 387 showsPrec _ (CHSTokHaskell _ s) = showString s 388 showsPrec _ (CHSTokCPP _ s) = showString s 389 showsPrec _ (CHSTokC _ s) = showString s 390 showsPrec _ (CHSTokCtrl _ c) = showChar c 391 showsPrec _ (CHSTokPragma _ ) = showString "{-# LANGUAGE" 392 showsPrec _ (CHSTokPragEnd _ ) = showString "#-}" 393 394 395-- lexer state 396-- ----------- 397 398-- state threaded through the lexer 399-- 400data CHSLexerState = CHSLS { 401 nestLvl :: Int, -- nesting depth of nested comments 402 inHook :: Bool, -- within a binding hook? 403 namesup :: [Name] -- supply of unique names 404 } 405 406-- initial state 407-- 408initialState :: CST s CHSLexerState 409initialState = do 410 namesup <- liftM names getNameSupply 411 return $ CHSLS { 412 nestLvl = 0, 413 inHook = False, 414 namesup = namesup 415 } 416 417-- raise an error if the given state is not a final state 418-- 419assertFinalState :: Position -> CHSLexerState -> CST s () 420assertFinalState pos CHSLS {nestLvl = nestLvl, inHook = inHook} 421 | nestLvl > 0 = raiseError pos ["Unexpected end of file!", 422 "Unclosed nested comment."] 423 | inHook = raiseError pos ["Unexpected end of file!", 424 "Unclosed binding hook."] 425 | otherwise = nop 426 427-- lexer and action type used throughout this specification 428-- 429type CHSLexer = Lexer CHSLexerState CHSToken 430type CHSAction = Action CHSToken 431type CHSRegexp = Regexp CHSLexerState CHSToken 432 433-- for actions that need a new unique name 434-- 435infixl 3 `lexactionName` 436lexactionName :: CHSRegexp 437 -> (String -> Position -> Name -> CHSToken) 438 -> CHSLexer 439re `lexactionName` action = re `lexmeta` action' 440 where 441 action' str pos state = let name:ns = namesup state 442 in 443 (Just $ Right (action str pos name), 444 incPos pos (length str), 445 state {namesup = ns}, 446 Nothing) 447 448 449-- lexical specification 450-- --------------------- 451 452-- the lexical definition of the tokens (the base lexer) 453-- 454-- 455chslexer :: CHSLexer 456chslexer = pragma -- LANGUAGE pragma 457 >||< haskell -- Haskell code 458 >||< nested -- nested comments 459 >||< ctrl -- control code (that has to be preserved) 460 >||< hook -- start of a binding hook 461 >||< cpp -- a pre-processor directive (or `#c') 462 463-- the LANGUAGE pragma 464pragma :: CHSLexer 465pragma = string "{-# LANGUAGE" `lexmeta` \_ pos s -> 466 (Just $ Right (CHSTokPragma pos), incPos pos 12, s, Just langLexer) 467 468langLexer :: CHSLexer 469langLexer = whitespace >||< identOrKW >||< symbol >||< 470 (string "#-}" `lexmeta` \_ pos s -> 471 (Just $ Right (CHSTokPragEnd pos), incPos pos 3, s, Just chslexer)) 472 473-- stream of Haskell code (terminated by a control character or binding hook) 474-- 475haskell :: CHSLexer 476-- 477-- NB: We need to make sure that '"' is not regarded as the beginning of a 478-- string; however, we cannot really lex character literals properly 479-- without lexing identifiers (as the latter may containing single quotes 480-- as part of their lexeme). Thus, we special case '"'. This is still a 481-- kludge, as a program fragment, such as 482-- 483-- foo'"'strange string" 484-- 485-- will not be handled correctly. 486-- 487haskell = ( anyButSpecial`star` epsilon 488 >|< specialButQuotes 489 >|< char '"' +> inhstr`star` char '"' 490 >|< string "'\"'" -- special case of " 491 >|< string "--" +> anyButNL`star` epsilon -- comment 492 ) 493 `lexaction` copyVerbatim 494 >||< char '"' -- this is a bad kludge 495 `lexactionErr` 496 \_ pos -> (Left $ makeError ErrorErr pos 497 ["Lexical error!", 498 "Unclosed string."]) 499 where 500 anyButSpecial = alt (inlineSet \\ specialSet) 501 specialButQuotes = alt (specialSet \\ ['"']) 502 anyButNL = alt (anySet \\ ['\n']) 503 inhstr = instr >|< char '\\' >|< string "\\\"" >|< gap 504 gap = char '\\' +> alt (' ':ctrlSet)`plus` char '\\' 505 506-- action copying the input verbatim to `CHSTokHaskell' tokens 507-- 508copyVerbatim :: CHSAction 509copyVerbatim cs pos = Just $ CHSTokHaskell pos cs 510 511-- nested comments 512-- 513nested :: CHSLexer 514nested = 515 string "{-" {- for Haskell emacs mode :-( -} 516 `lexmeta` enterComment 517 >||< 518 string "-}" 519 `lexmeta` leaveComment 520 where 521 enterComment cs pos s = 522 (copyVerbatim' cs pos, -- collect the lexeme 523 incPos pos 2, -- advance current position 524 s {nestLvl = nestLvl s + 1}, -- increase nesting level 525 Just $ inNestedComment) -- continue in comment lexer 526 -- 527 leaveComment cs pos s = 528 case nestLvl s of 529 0 -> (commentCloseErr pos, -- 0: -} outside comment => err 530 incPos pos 2, -- advance current position 531 s, 532 Nothing) 533 1 -> (copyVerbatim' cs pos, -- collect the lexeme 534 incPos pos 2, -- advance current position 535 s {nestLvl = nestLvl s - 1}, -- decrease nesting level 536 Just chslexer) -- 1: continue with root lexer 537 _ -> (copyVerbatim' cs pos, -- collect the lexeme 538 incPos pos 2, -- advance current position 539 s {nestLvl = nestLvl s - 1}, -- decrease nesting level 540 Nothing) -- _: cont with comment lexer 541 -- 542 copyVerbatim' cs pos = Just $ Right (CHSTokHaskell pos cs) 543 -- 544 commentCloseErr pos = 545 Just $ Left (makeError ErrorErr pos 546 ["Lexical error!", 547 "`-}' not preceded by a matching `{-'."]) 548 {- for Haskell emacs mode :-( -} 549 550 551-- lexer processing the inner of a comment 552-- 553inNestedComment :: CHSLexer 554inNestedComment = commentInterior -- inside a comment 555 >||< nested -- nested comments 556 >||< ctrl -- control code (preserved) 557 558-- standard characters in a nested comment 559-- 560commentInterior :: CHSLexer 561commentInterior = ( anyButSpecial`star` epsilon 562 >|< special 563 ) 564 `lexaction` copyVerbatim 565 where 566 anyButSpecial = alt (inlineSet \\ commentSpecialSet) 567 special = alt commentSpecialSet 568 569-- control code in the base lexer (is turned into a token) 570-- 571-- * this covers exactly the same set of characters as contained in `ctrlSet' 572-- and `Lexers.ctrlLexer' and advances positions also like the `ctrlLexer' 573-- 574ctrl :: CHSLexer 575ctrl = 576 char '\n' `lexmeta` newline 577 >||< char '\r' `lexmeta` newline 578 >||< char '\v' `lexmeta` newline 579 >||< char '\f' `lexmeta` formfeed 580 >||< char '\t' `lexmeta` tab 581 where 582 newline [c] pos = ctrlResult pos c (retPos pos) 583 formfeed [c] pos = ctrlResult pos c (incPos pos 1) 584 tab [c] pos = ctrlResult pos c (tabPos pos) 585 586 ctrlResult pos c pos' s = 587 (Just $ Right (CHSTokCtrl pos c), pos', s, Nothing) 588 589-- start of a binding hook (ie, enter the binding hook lexer) 590-- 591hook :: CHSLexer 592hook = string "{#" 593 `lexmeta` \_ pos s -> (Nothing, incPos pos 2, s, Just bhLexer) 594 595-- pre-processor directives and `#c' 596-- 597-- * we lex `#c' as a directive and special case it in the action 598-- 599-- * we lex C line number pragmas and special case it in the action 600-- 601cpp :: CHSLexer 602cpp = directive 603 where 604 directive = 605 string "\n#" +> alt ('\t':inlineSet)`star` epsilon 606 `lexmeta` 607 \(_:_:dir) pos s -> -- strip off the "\n#" 608 case dir of 609 ['c'] -> -- #c 610 (Nothing, retPos pos, s, Just cLexer) 611 -- a #c may be followed by whitespace 612 'c':sp:_ | sp `elem` " \t" -> -- #c 613 (Nothing, retPos pos, s, Just cLexer) 614 ' ':line@(n:_) | isDigit n -> -- C line pragma 615 let pos' = adjustPosByCLinePragma line pos 616 in (Just $ Right (CHSTokLine pos'), pos', s, Nothing) 617 _ -> -- CPP directive 618 (Just $ Right (CHSTokCPP pos dir), 619 retPos pos, s, Nothing) 620 621adjustPosByCLinePragma :: String -> Position -> Position 622adjustPosByCLinePragma str (Position fname _ _) = 623 (Position fname' row' 0) 624 where 625 str' = dropWhite str 626 (rowStr, str'') = span isDigit str' 627 row' = read rowStr 628 str''' = dropWhite str'' 629 fnameStr = takeWhile (/= '"') . drop 1 $ str''' 630 fname' | null str''' || head str''' /= '"' = fname 631 -- try and get more sharing of file name strings 632 | fnameStr == fname = fname 633 | otherwise = fnameStr 634 -- 635 dropWhite = dropWhile (\c -> c == ' ' || c == '\t') 636 637-- the binding hook lexer 638-- 639bhLexer :: CHSLexer 640bhLexer = identOrKW 641 >||< symbol 642 >||< strlit 643 >||< hsverb 644 >||< whitespace 645 >||< endOfHook 646 >||< string "--" +> anyButNL`star` char '\n' -- comment 647 `lexmeta` \_ pos s -> (Nothing, retPos pos, s, Nothing) 648 where 649 anyButNL = alt (anySet \\ ['\n']) 650 endOfHook = string "#}" 651 `lexmeta` 652 \_ pos s -> (Just $ Right (CHSTokEndHook pos), 653 incPos pos 2, s, Just chslexer) 654 655-- the inline-C lexer 656-- 657cLexer :: CHSLexer 658cLexer = inlineC -- inline C code 659 >||< ctrl -- control code (preserved) 660 >||< string "\n#endc" -- end of inline C code... 661 `lexmeta` -- ...preserve '\n' as control token 662 \_ pos s -> (Just $ Right (CHSTokCtrl pos '\n'), retPos pos, s, 663 Just chslexer) 664 where 665 inlineC = alt inlineSet `lexaction` copyVerbatimC 666 -- 667 copyVerbatimC :: CHSAction 668 copyVerbatimC cs pos = Just $ CHSTokC pos cs 669 670-- whitespace 671-- 672-- * horizontal and vertical tabs, newlines, and form feeds are filter out by 673-- `Lexers.ctrlLexer' 674-- 675whitespace :: CHSLexer 676whitespace = (char ' ' `lexaction` \_ _ -> Nothing) 677 >||< ctrlLexer 678 679-- identifiers and keywords 680-- 681identOrKW :: CHSLexer 682-- 683-- the strictness annotations seem to help a bit 684-- 685identOrKW = 686 -- identifier or keyword 687 (letter +> (letter >|< digit >|< char '\'')`star` epsilon 688 `lexactionName` \cs pos name -> (idkwtok $!pos) cs name) 689 >||< -- identifier in single quotes 690 (char '\'' +> letter +> (letter >|< digit)`star` char '\'' 691 `lexactionName` \cs pos name -> (mkid $!pos) cs name) 692 -- NB: quotes are removed by lexemeToIdent 693 where 694 idkwtok pos "as" _ = CHSTokAs pos 695 idkwtok pos "call" _ = CHSTokCall pos 696 idkwtok pos "class" _ = CHSTokClass pos 697 idkwtok pos "context" _ = CHSTokContext pos 698 idkwtok pos "deriving" _ = CHSTokDerive pos 699 idkwtok pos "enum" _ = CHSTokEnum pos 700 idkwtok pos "foreign" _ = CHSTokForeign pos 701 idkwtok pos "fun" _ = CHSTokFun pos 702 idkwtok pos "get" _ = CHSTokGet pos 703 idkwtok pos "import" _ = CHSTokImport pos 704 idkwtok pos "lib" _ = CHSTokLib pos 705 idkwtok pos "newtype" _ = CHSTokNewtype pos 706 idkwtok pos "pointer" _ = CHSTokPointer pos 707 idkwtok pos "prefix" _ = CHSTokPrefix pos 708 idkwtok pos "pure" _ = CHSTokPure pos 709 idkwtok pos "qualified" _ = CHSTokQualif pos 710 idkwtok pos "set" _ = CHSTokSet pos 711 idkwtok pos "sizeof" _ = CHSTokSizeof pos 712 idkwtok pos "stable" _ = CHSTokStable pos 713 idkwtok pos "type" _ = CHSTokType pos 714 idkwtok pos "underscoreToCase" _ = CHSTok_2Case pos 715 idkwtok pos "unsafe" _ = CHSTokUnsafe pos 716 idkwtok pos "with" _ = CHSTokWith pos 717 idkwtok pos "lock" _ = CHSTokLock pos 718 idkwtok pos "nolock" _ = CHSTokNolock pos 719 idkwtok pos cs name = mkid pos cs name 720 -- 721 mkid pos cs name = CHSTokIdent pos (lexemeToIdent pos cs name) 722 723-- reserved symbols 724-- 725symbol :: CHSLexer 726symbol = sym "->" CHSTokArrow 727 >||< sym "=>" CHSTokDArrow 728 >||< sym "." CHSTokDot 729 >||< sym "," CHSTokComma 730 >||< sym "=" CHSTokEqual 731 >||< sym "-" CHSTokMinus 732 >||< sym "*" CHSTokStar 733 >||< sym "&" CHSTokAmp 734 >||< sym "^" CHSTokHat 735 >||< sym "{" CHSTokLBrace 736 >||< sym "}" CHSTokRBrace 737 >||< sym "(" CHSTokLParen 738 >||< sym ")" CHSTokRParen 739 where 740 sym cs con = string cs `lexaction` \_ pos -> Just (con pos) 741 742-- string 743-- 744strlit :: CHSLexer 745strlit = char '"' +> (instr >|< char '\\')`star` char '"' 746 `lexaction` \cs pos -> Just (CHSTokString pos (init . tail $ cs)) 747 748-- verbatim code 749-- 750hsverb :: CHSLexer 751hsverb = char '`' +> inhsverb`star` char '\'' 752 `lexaction` \cs pos -> Just (CHSTokHSVerb pos (init . tail $ cs)) 753 754 755-- regular expressions 756-- 757letter, digit, instr, inchar, inhsverb :: Regexp s t 758letter = alt ['a'..'z'] >|< alt ['A'..'Z'] >|< char '_' 759digit = alt ['0'..'9'] 760instr = alt ([' '..'\127'] \\ "\"\\") 761inchar = alt ([' '..'\127'] \\ "\'") 762inhsverb = alt ([' '..'\127'] \\ "\'") 763 764-- character sets 765-- 766anySet, inlineSet, specialSet, commentSpecialSet, ctrlSet :: [Char] 767anySet = ['\0'..'\255'] 768inlineSet = anySet \\ ctrlSet 769specialSet = ['{', '-', '"', '\''] 770commentSpecialSet = ['{', '-'] 771ctrlSet = ['\n', '\f', '\r', '\t', '\v'] 772 773 774-- main lexing routine 775-- ------------------- 776 777-- generate a token sequence out of a string denoting a CHS file 778-- (EXPORTED) 779-- 780-- * the given position is attributed to the first character in the string 781-- 782-- * errors are entered into the compiler state 783-- 784lexCHS :: String -> Position -> CST s [CHSToken] 785lexCHS cs pos = 786 do 787 state <- initialState 788 let (ts, lstate, errs) = execLexer chslexer (cs, pos, state) 789 (_, pos', state') = lstate 790 mapM raise errs 791 assertFinalState pos' state' 792 return ts 793