1-- C->Haskell Compiler: Lexer for CHS Files 2-- 3-- Author : Manuel M T Chakravarty 4-- Created: 13 August 99 5-- 6-- Copyright (c) [1999..2005] Manuel M T Chakravarty 7-- 8-- This file is free software; you can redistribute it and/or modify 9-- it under the terms of the GNU General Public License as published by 10-- the Free Software Foundation; either version 2 of the License, or 11-- (at your option) any later version. 12-- 13-- This file is distributed in the hope that it will be useful, 14-- but WITHOUT ANY WARRANTY; without even the implied warranty of 15-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16-- GNU General Public License for more details. 17-- 18--- DESCRIPTION --------------------------------------------------------------- 19-- 20-- Lexer for CHS files; the tokens are only partially recognised. 21-- 22--- DOCU ---------------------------------------------------------------------- 23-- 24-- language: Haskell 98 25-- 26-- * CHS files are assumed to be Haskell 98 files that include C2HS binding 27-- hooks. 28-- 29-- * Haskell code is not tokenised, but binding hooks (delimited by `{#'and 30-- `#}') are analysed. Therefore the lexer operates in two states 31-- (realised as two lexer coupled by meta actions) depending on whether 32-- Haskell code or a binding hook is currently read. The lexer reading 33-- Haskell code is called `base lexer'; the other one, `binding-hook 34-- lexer'. In addition, there is a inline-c lexer, which, as the 35-- binding-hook lexer, can be triggered from the base lexer. 36-- 37-- * Base lexer: 38-- 39-- haskell -> (inline \\ special)* 40-- | special \\ `"' 41-- | comment 42-- | nested 43-- | hstring 44-- | '{#' 45-- | cpp 46-- special -> `(' | `{' | `-' | `"' 47-- ctrl -> `\n' | `\f' | `\r' | `\t' | `\v' 48-- 49-- inline -> any \\ ctrl 50-- any -> '\0'..'\255' 51-- 52-- Within the base lexer control codes appear as separate tokens in the 53-- token list. 54-- 55-- NOTE: It is important that `{' is an extra lexeme and not added as an 56-- optional component at the end of the first alternative for 57-- `haskell'. Otherwise, the principle of the longest match will 58-- divide `foo {#' into the tokens `foo {' and `#' instead of `foo ' 59-- and `{#'. 60-- 61-- One line comments are handled by 62-- 63-- comment -> `--' (any \\ `\n')* `\n' 64-- 65-- and nested comments by 66-- 67-- nested -> `{-' any* `-}' 68-- 69-- where `any*' may contain _balanced_ occurrences of `{-' and `-}'. 70-- 71-- hstring -> `"' inhstr* `"' 72-- inhstr -> ` '..`\127' \\ `"' 73-- | `\"' 74-- 75-- Pre-precessor directives as well as the switch to inline-C code are 76-- formed as follows: 77-- 78-- cpp -> `\n#' (inline | `\t')* `\n' 79-- | `\n#c' (' ' | '\t')* `\n' 80-- 81-- We allow whitespace between the `#' and the actual directive, but in `#c' 82-- and `#endc' the directive must immediately follow the `#'. This might 83-- be regarded as a not entirely orthogonal design, but simplifies matters 84-- especially for `#endc'. 85-- 86-- * On encountering the lexeme `{#', a meta action in the base lexer 87-- transfers control to the following binding-hook lexer: 88-- 89-- ident -> letter (letter | digit | `\'')* 90-- cidenttail -> digit (letter | digit)* 91-- reservedid -> `add' | `as' | `call' | `class' | `context' | `deriving' 92-- | `enum' | `foreign' | `fun' | `get' | `lib' 93-- | `downcaseFirstLetter' | `finalizer' | `interruptible' 94-- | `newtype' | `nocode' | `pointer' | `prefix' | `pure' 95-- | `set' | `sizeof' | `stable' | `struct' | `type' 96-- | `underscoreToCase' | `upcaseFirstLetter' | `unsafe' | 97-- | `with' | `const' | `omit' 98-- reservedsym -> `{#' | `#}' | `{' | `}' | `,' | `.' | `->' | `=' 99-- | `=>' | '-' | `*' | `&' | `^' | `+' | `%' 100-- string -> `"' instr* `"' 101-- verbhs -> `\`' inhsverb* `\'' 102-- quoths -> `\'' inhsverb* `\'' 103-- instr -> ` '..`\127' \\ `"' 104-- inhsverb -> ` '..`\127' \\ `\'' 105-- comment -> `--' (any \\ `\n')* `\n' 106-- 107-- Control characters, white space, and comments are discarded in the 108-- binding-hook lexer. Nested comments are not allowed in a binding hook. 109-- Identifiers can be enclosed in single quotes to avoid collision with 110-- C->Haskell keywords, and in this case quoted part could also be followed by 111-- what is assumed to be a valid Haskell code, which would be transferred in the 112-- output file verbatim. 113-- 114-- * In the binding-hook lexer, the lexeme `#}' transfers control back to the 115-- base lexer. An occurence of the lexeme `{#' inside the binding-hook 116-- lexer triggers an error. The symbol `{#' is not explcitly represented 117-- in the resulting token stream. However, the occurrence of a token 118-- representing one of the reserved identifiers `call', `context', `enum', 119-- and `field' marks the start of a binding hook. Strictly speaking, `#}' 120-- need also not occur in the token stream, as the next `haskell' token 121-- marks a hook's end. It is, however, useful for producing accurate error 122-- messages (in case an hook is closed to early) to have a token 123-- representing `#}'. 124-- 125-- * The rule `ident' describes Haskell identifiers, but without 126-- distinguishing between variable and constructor identifers (ie, those 127-- starting with a lowercase and those starting with an uppercase letter). 128-- However, we use it also to scan C identifiers; although, strictly 129-- speaking, it is too general for them. In the case of C identifiers, 130-- this should not have any impact on the range of descriptions accepted by 131-- the tool, as illegal identifier will never occur in a C header file that 132-- is accepted by the C lexer. In the case of Haskell identifiers, a 133-- confusion between variable and constructor identifiers will be noted by 134-- the Haskell compiler translating the code generated by c2hs. Moreover, 135-- identifiers that are inside quoted parts (see above) may not contain apostrophes. 136-- 137-- * Any line starting with the character `#' is regarded to be a C 138-- preprocessor directive. With the exception of `#c' and `#endc', which 139-- delimit a set of lines containing inline C code. Hence, in the base 140-- lexer, the lexeme `#c' triggers a meta action transferring control to the 141-- following inline-C lexer: 142-- 143-- c -> inline* \\ `\n#endc' 144-- 145-- We do neither treat C strings nor C comments specially. Hence, if the 146-- string "\n#endc" occurs in a comment, we will mistakenly regard it as 147-- the end of the inline C code. Note that the problem cannot happen with 148-- strings, as C does not permit strings that extend over multiple lines. 149-- At the moment, it just seems not to be worth the effort required to 150-- treat this situation more accurately. 151-- 152-- The inline-C lexer also doesn't handle pre-processor directives 153-- specially. Hence, structural pre-processor directives (namely, 154-- conditionals) may occur within inline-C code only properly nested. 155-- 156-- Shortcomings 157-- ~~~~~~~~~~~~ 158-- Some lexemes that include single and double quote characters are not lexed 159-- correctly. See the implementation comment at `haskell' for details. 160-- 161-- 162--- TODO ---------------------------------------------------------------------- 163-- 164-- * In `haskell', the case of a single `"' (without a matching second one) 165-- is caught by an eplicit error raising rule. This shouldn't be 166-- necessary, but for some strange reason, the lexer otherwise hangs when a 167-- single `"' appears in the input. 168-- 169-- * Comments in the "gap" of a string are not yet supported. 170-- 171 172module C2HS.CHS.Lexer (CHSToken(..), lexCHS, keywordToIdent) 173where 174 175import Data.List ((\\)) 176import Data.Char (isDigit, isSpace) 177 178import Language.C.Data.Ident 179import Language.C.Data.Name 180import Language.C.Data.Position 181 182import Data.Errors (ErrorLevel(..), makeError) 183import Text.Lexers (Regexp, Lexer, Action, epsilon, char, (+>), lexaction, 184 lexactionErr, lexmeta, (>|<), (>||<), ctrlLexer, star, plus, 185 alt, string, execLexer) 186import Control.State (getNameSupply, setNameSupply) 187import C2HS.State (CST, raise, raiseError) 188 189 190-- token definition 191-- ---------------- 192 193-- | possible tokens 194-- 195data CHSToken = CHSTokArrow Position -- `->' 196 | CHSTokDArrow Position -- `=>' 197 | CHSTokDot Position -- `.' 198 | CHSTokComma Position -- `,' 199 | CHSTokEqual Position -- `=' 200 | CHSTokMinus Position -- `-' 201 | CHSTokStar Position -- `*' 202 | CHSTokAmp Position -- `&' 203 | CHSTokHat Position -- `^' 204 | CHSTokPercent Position -- `%' 205 | CHSTokPlus Position -- `+' 206 | CHSTokPlusS Position -- `+S' 207 | CHSTokPlusNum Position Int -- `+<num>' 208 | CHSTokLBrace Position -- `{' 209 | CHSTokRBrace Position -- `}' 210 | CHSTokLParen Position -- `(' 211 | CHSTokRParen Position -- `)' 212 | CHSTokLBrack Position -- `[' 213 | CHSTokRBrack Position -- `]' 214 | CHSTokHook Position -- `{#' 215 | CHSTokEndHook Position -- `#}' 216 | CHSTokAdd Position -- `add' 217 | CHSTokAs Position -- `as' 218 | CHSTokCall Position -- `call' 219 | CHSTokClass Position -- `class' 220 | CHSTokConst Position -- `const' 221 | CHSTokContext Position -- `context' 222 | CHSTokNonGNU Position -- `nonGNU' 223 | CHSTokTypedef Position -- `typedef' 224 | CHSTokDefault Position -- `default' 225 | CHSTokDerive Position -- `deriving' 226 | CHSTokDown Position -- `downcaseFirstLetter' 227 | CHSTokEnum Position -- `enum' 228 | CHSTokFinal Position -- `finalizer' 229 | CHSTokForeign Position -- `foreign' 230 | CHSTokFun Position -- `fun' 231 | CHSTokGet Position -- `get' 232 | CHSTokImport Position -- `import' 233 | CHSTokIntr Position -- `interruptible' 234 | CHSTokLib Position -- `lib' 235 | CHSTokNewtype Position -- `newtype' 236 | CHSTokNocode Position -- `nocode' 237 | CHSTokOffsetof Position -- `offsetof' 238 | CHSTokOmit Position -- `omit' 239 | CHSTokPointer Position -- `pointer' 240 | CHSTokPrefix Position -- `prefix' 241 | CHSTokPure Position -- `pure' 242 | CHSTokQualif Position -- `qualified' 243 | CHSTokSet Position -- `set' 244 | CHSTokSizeof Position -- `sizeof' 245 | CHSTokAlignof Position -- `alignof' 246 | CHSTokStable Position -- `stable' 247 | CHSTokStruct Position -- `struct' 248 | CHSTokType Position -- `type' 249 | CHSTok_2Case Position -- `underscoreToCase' 250 | CHSTokUnsafe Position -- `unsafe' 251 | CHSTokUpper Position -- `upcaseFirstLetter' 252 | CHSTokVariadic Position -- `variadic' 253 | CHSTokWith Position Ident -- `with' 254 | CHSTokIn Position -- `in' 255 | CHSTokOut Position -- `out' 256 | CHSTokString Position String -- string 257 | CHSTokHSVerb Position String -- verbatim Haskell (`...') 258 | CHSTokHSQuot Position String -- quoted Haskell ('...') 259 | CHSTokIdent Position Ident -- identifier 260 | CHSTokHaskell Position String -- verbatim Haskell code 261 | CHSTokCPP Position String Bool -- pre-processor directive 262 | CHSTokLine Position -- line pragma 263 | CHSTokC Position String -- verbatim C code 264 | CHSTokCtrl Position Char -- control code 265 | CHSTokComment Position String -- comment 266 | CHSTokCIdentTail Position Ident -- C identifier without prefix 267 | CHSTokCArg Position String -- C type argument 268 269instance Pos CHSToken where 270 posOf (CHSTokArrow pos ) = pos 271 posOf (CHSTokDArrow pos ) = pos 272 posOf (CHSTokDot pos ) = pos 273 posOf (CHSTokComma pos ) = pos 274 posOf (CHSTokEqual pos ) = pos 275 posOf (CHSTokMinus pos ) = pos 276 posOf (CHSTokStar pos ) = pos 277 posOf (CHSTokAmp pos ) = pos 278 posOf (CHSTokHat pos ) = pos 279 posOf (CHSTokLBrace pos ) = pos 280 posOf (CHSTokRBrace pos ) = pos 281 posOf (CHSTokLParen pos ) = pos 282 posOf (CHSTokRParen pos ) = pos 283 posOf (CHSTokLBrack pos ) = pos 284 posOf (CHSTokRBrack pos ) = pos 285 posOf (CHSTokHook pos ) = pos 286 posOf (CHSTokEndHook pos ) = pos 287 posOf (CHSTokAdd pos ) = pos 288 posOf (CHSTokAs pos ) = pos 289 posOf (CHSTokCall pos ) = pos 290 posOf (CHSTokClass pos ) = pos 291 posOf (CHSTokConst pos ) = pos 292 posOf (CHSTokContext pos ) = pos 293 posOf (CHSTokNonGNU pos ) = pos 294 posOf (CHSTokDerive pos ) = pos 295 posOf (CHSTokTypedef pos ) = pos 296 posOf (CHSTokDefault pos ) = pos 297 posOf (CHSTokDown pos ) = pos 298 posOf (CHSTokEnum pos ) = pos 299 posOf (CHSTokFinal pos ) = pos 300 posOf (CHSTokForeign pos ) = pos 301 posOf (CHSTokFun pos ) = pos 302 posOf (CHSTokGet pos ) = pos 303 posOf (CHSTokImport pos ) = pos 304 posOf (CHSTokLib pos ) = pos 305 posOf (CHSTokNewtype pos ) = pos 306 posOf (CHSTokNocode pos ) = pos 307 posOf (CHSTokOffsetof pos ) = pos 308 posOf (CHSTokOmit pos ) = pos 309 posOf (CHSTokPointer pos ) = pos 310 posOf (CHSTokPrefix pos ) = pos 311 posOf (CHSTokPure pos ) = pos 312 posOf (CHSTokQualif pos ) = pos 313 posOf (CHSTokSet pos ) = pos 314 posOf (CHSTokSizeof pos ) = pos 315 posOf (CHSTokAlignof pos ) = pos 316 posOf (CHSTokStable pos ) = pos 317 posOf (CHSTokStruct pos ) = pos 318 posOf (CHSTokType pos ) = pos 319 posOf (CHSTok_2Case pos ) = pos 320 posOf (CHSTokUnsafe pos ) = pos 321 posOf (CHSTokUpper pos ) = pos 322 posOf (CHSTokVariadic pos ) = pos 323 posOf (CHSTokWith pos _) = pos 324 posOf (CHSTokIn pos ) = pos 325 posOf (CHSTokOut pos ) = pos 326 posOf (CHSTokString pos _) = pos 327 posOf (CHSTokHSVerb pos _) = pos 328 posOf (CHSTokHSQuot pos _) = pos 329 posOf (CHSTokIdent pos _) = pos 330 posOf (CHSTokHaskell pos _) = pos 331 posOf (CHSTokCPP pos _ _) = pos 332 posOf (CHSTokLine pos ) = pos 333 posOf (CHSTokC pos _) = pos 334 posOf (CHSTokCtrl pos _) = pos 335 posOf (CHSTokComment pos _) = pos 336 posOf (CHSTokCIdentTail pos _) = pos 337 posOf (CHSTokCArg pos _) = pos 338 posOf (CHSTokPercent pos) = pos 339 340instance Eq CHSToken where 341 (CHSTokArrow _ ) == (CHSTokArrow _ ) = True 342 (CHSTokDArrow _ ) == (CHSTokDArrow _ ) = True 343 (CHSTokDot _ ) == (CHSTokDot _ ) = True 344 (CHSTokComma _ ) == (CHSTokComma _ ) = True 345 (CHSTokEqual _ ) == (CHSTokEqual _ ) = True 346 (CHSTokMinus _ ) == (CHSTokMinus _ ) = True 347 (CHSTokStar _ ) == (CHSTokStar _ ) = True 348 (CHSTokAmp _ ) == (CHSTokAmp _ ) = True 349 (CHSTokHat _ ) == (CHSTokHat _ ) = True 350 (CHSTokLBrace _ ) == (CHSTokLBrace _ ) = True 351 (CHSTokRBrace _ ) == (CHSTokRBrace _ ) = True 352 (CHSTokLParen _ ) == (CHSTokLParen _ ) = True 353 (CHSTokRParen _ ) == (CHSTokRParen _ ) = True 354 (CHSTokLBrack _ ) == (CHSTokLBrack _ ) = True 355 (CHSTokRBrack _ ) == (CHSTokRBrack _ ) = True 356 (CHSTokHook _ ) == (CHSTokHook _ ) = True 357 (CHSTokEndHook _ ) == (CHSTokEndHook _ ) = True 358 (CHSTokAdd _ ) == (CHSTokAdd _ ) = True 359 (CHSTokAs _ ) == (CHSTokAs _ ) = True 360 (CHSTokCall _ ) == (CHSTokCall _ ) = True 361 (CHSTokClass _ ) == (CHSTokClass _ ) = True 362 (CHSTokConst _ ) == (CHSTokConst _ ) = True 363 (CHSTokContext _ ) == (CHSTokContext _ ) = True 364 (CHSTokNonGNU _ ) == (CHSTokNonGNU _ ) = True 365 (CHSTokTypedef _ ) == (CHSTokTypedef _ ) = True 366 (CHSTokDefault _ ) == (CHSTokDefault _ ) = True 367 (CHSTokDerive _ ) == (CHSTokDerive _ ) = True 368 (CHSTokDown _ ) == (CHSTokDown _ ) = True 369 (CHSTokEnum _ ) == (CHSTokEnum _ ) = True 370 (CHSTokFinal _ ) == (CHSTokFinal _ ) = True 371 (CHSTokForeign _ ) == (CHSTokForeign _ ) = True 372 (CHSTokFun _ ) == (CHSTokFun _ ) = True 373 (CHSTokGet _ ) == (CHSTokGet _ ) = True 374 (CHSTokImport _ ) == (CHSTokImport _ ) = True 375 (CHSTokLib _ ) == (CHSTokLib _ ) = True 376 (CHSTokNewtype _ ) == (CHSTokNewtype _ ) = True 377 (CHSTokNocode _ ) == (CHSTokNocode _ ) = True 378 (CHSTokOffsetof _ ) == (CHSTokOffsetof _ ) = True 379 (CHSTokOmit _ ) == (CHSTokOmit _ ) = True 380 (CHSTokPointer _ ) == (CHSTokPointer _ ) = True 381 (CHSTokPrefix _ ) == (CHSTokPrefix _ ) = True 382 (CHSTokPure _ ) == (CHSTokPure _ ) = True 383 (CHSTokQualif _ ) == (CHSTokQualif _ ) = True 384 (CHSTokSet _ ) == (CHSTokSet _ ) = True 385 (CHSTokSizeof _ ) == (CHSTokSizeof _ ) = True 386 (CHSTokAlignof _ ) == (CHSTokAlignof _ ) = True 387 (CHSTokStable _ ) == (CHSTokStable _ ) = True 388 (CHSTokStruct _ ) == (CHSTokStruct _ ) = True 389 (CHSTokType _ ) == (CHSTokType _ ) = True 390 (CHSTok_2Case _ ) == (CHSTok_2Case _ ) = True 391 (CHSTokUnsafe _ ) == (CHSTokUnsafe _ ) = True 392 (CHSTokUpper _ ) == (CHSTokUpper _ ) = True 393 (CHSTokVariadic _ ) == (CHSTokVariadic _ ) = True 394 (CHSTokWith _ _) == (CHSTokWith _ _) = True 395 (CHSTokIn _ ) == (CHSTokIn _ ) = True 396 (CHSTokOut _ ) == (CHSTokOut _ ) = True 397 (CHSTokString _ _) == (CHSTokString _ _) = True 398 (CHSTokHSVerb _ _) == (CHSTokHSVerb _ _) = True 399 (CHSTokHSQuot _ _) == (CHSTokHSQuot _ _) = True 400 (CHSTokIdent _ _) == (CHSTokIdent _ _) = True 401 (CHSTokHaskell _ _) == (CHSTokHaskell _ _) = True 402 (CHSTokCPP _ _ _) == (CHSTokCPP _ _ _) = True 403 (CHSTokLine _ ) == (CHSTokLine _ ) = True 404 (CHSTokC _ _) == (CHSTokC _ _) = True 405 (CHSTokCtrl _ _) == (CHSTokCtrl _ _) = True 406 (CHSTokComment _ _) == (CHSTokComment _ _) = True 407 (CHSTokCIdentTail _ _) == (CHSTokCIdentTail _ _) = True 408 (CHSTokCArg _ _) == (CHSTokCArg _ _) = True 409 _ == _ = False 410 411instance Show CHSToken where 412 showsPrec _ (CHSTokArrow _ ) = showString "->" 413 showsPrec _ (CHSTokDArrow _ ) = showString "=>" 414 showsPrec _ (CHSTokDot _ ) = showString "." 415 showsPrec _ (CHSTokComma _ ) = showString "," 416 showsPrec _ (CHSTokEqual _ ) = showString "=" 417 showsPrec _ (CHSTokMinus _ ) = showString "-" 418 showsPrec _ (CHSTokStar _ ) = showString "*" 419 showsPrec _ (CHSTokAmp _ ) = showString "&" 420 showsPrec _ (CHSTokHat _ ) = showString "^" 421 showsPrec _ (CHSTokPercent _ ) = showString "%" 422 showsPrec _ (CHSTokPlus _ ) = showString "+" 423 showsPrec _ (CHSTokPlusS _ ) = showString "+S" 424 showsPrec _ (CHSTokPlusNum _ sz) = showString ("+" ++ show sz) 425 showsPrec _ (CHSTokLBrace _ ) = showString "{" 426 showsPrec _ (CHSTokRBrace _ ) = showString "}" 427 showsPrec _ (CHSTokLParen _ ) = showString "(" 428 showsPrec _ (CHSTokRParen _ ) = showString ")" 429 showsPrec _ (CHSTokLBrack _ ) = showString "[" 430 showsPrec _ (CHSTokRBrack _ ) = showString "]" 431 showsPrec _ (CHSTokHook _ ) = showString "{#" 432 showsPrec _ (CHSTokEndHook _ ) = showString "#}" 433 showsPrec _ (CHSTokAdd _ ) = showString "add" 434 showsPrec _ (CHSTokAs _ ) = showString "as" 435 showsPrec _ (CHSTokCall _ ) = showString "call" 436 showsPrec _ (CHSTokClass _ ) = showString "class" 437 showsPrec _ (CHSTokConst _ ) = showString "const" 438 showsPrec _ (CHSTokContext _ ) = showString "context" 439 showsPrec _ (CHSTokNonGNU _ ) = showString "nonGNU" 440 showsPrec _ (CHSTokTypedef _ ) = showString "typedef" 441 showsPrec _ (CHSTokDefault _ ) = showString "default" 442 showsPrec _ (CHSTokDerive _ ) = showString "deriving" 443 showsPrec _ (CHSTokDown _ ) = showString "downcaseFirstLetter" 444 showsPrec _ (CHSTokEnum _ ) = showString "enum" 445 showsPrec _ (CHSTokFinal _ ) = showString "finalizer" 446 showsPrec _ (CHSTokForeign _ ) = showString "foreign" 447 showsPrec _ (CHSTokFun _ ) = showString "fun" 448 showsPrec _ (CHSTokGet _ ) = showString "get" 449 showsPrec _ (CHSTokImport _ ) = showString "import" 450 showsPrec _ (CHSTokIntr _ ) = showString "interruptible" 451 showsPrec _ (CHSTokLib _ ) = showString "lib" 452 showsPrec _ (CHSTokNewtype _ ) = showString "newtype" 453 showsPrec _ (CHSTokNocode _ ) = showString "nocode" 454 showsPrec _ (CHSTokOffsetof _ ) = showString "offsetof" 455 showsPrec _ (CHSTokOmit _ ) = showString "omit" 456 showsPrec _ (CHSTokPointer _ ) = showString "pointer" 457 showsPrec _ (CHSTokPrefix _ ) = showString "prefix" 458 showsPrec _ (CHSTokPure _ ) = showString "pure" 459 showsPrec _ (CHSTokQualif _ ) = showString "qualified" 460 showsPrec _ (CHSTokSet _ ) = showString "set" 461 showsPrec _ (CHSTokSizeof _ ) = showString "sizeof" 462 showsPrec _ (CHSTokAlignof _ ) = showString "alignof" 463 showsPrec _ (CHSTokStable _ ) = showString "stable" 464 showsPrec _ (CHSTokStruct _ ) = showString "struct" 465 showsPrec _ (CHSTokType _ ) = showString "type" 466 showsPrec _ (CHSTok_2Case _ ) = showString "underscoreToCase" 467 showsPrec _ (CHSTokUnsafe _ ) = showString "unsafe" 468 showsPrec _ (CHSTokUpper _ ) = showString "upcaseFirstLetter" 469 showsPrec _ (CHSTokVariadic _ ) = showString "variadic" 470 showsPrec _ (CHSTokWith _ _) = showString "with" 471 showsPrec _ (CHSTokIn _ ) = showString "in" 472 showsPrec _ (CHSTokOut _ ) = showString "out" 473 showsPrec _ (CHSTokString _ s) = showString ("\"" ++ s ++ "\"") 474 showsPrec _ (CHSTokHSVerb _ s) = showString ("`" ++ s ++ "'") 475 showsPrec _ (CHSTokHSQuot _ s) = showString ("'" ++ s ++ "'") 476 showsPrec _ (CHSTokIdent _ i) = (showString . identToString) i 477 showsPrec _ (CHSTokHaskell _ s) = showString s 478 showsPrec _ (CHSTokCPP _ s nl) = showString (if nl then "\n" else "" ++ s) 479 showsPrec _ (CHSTokLine _ ) = id --TODO show line num? 480 showsPrec _ (CHSTokC _ s) = showString s 481 showsPrec _ (CHSTokCtrl _ c) = showChar c 482 showsPrec _ (CHSTokComment _ s) = showString (if null s 483 then "" 484 else " --" ++ s ++ "\n") 485 showsPrec _ (CHSTokCIdentTail _ i) = (showString . identToString) i 486 showsPrec _ (CHSTokCArg _ s) = showString s 487 488-- lexer state 489-- ----------- 490 491-- | state threaded through the lexer 492-- 493data CHSLexerState = CHSLS { 494 nestLvl :: Int, -- nesting depth of nested comments 495 inHook :: Bool, -- within a binding hook? 496 namesup :: [Name] -- supply of unique names 497 } 498 499-- | initial state 500-- 501initialState :: [Name] -> CST s CHSLexerState 502initialState nameSupply = 503 do 504 return CHSLS { 505 nestLvl = 0, 506 inHook = False, 507 namesup = nameSupply 508 } 509 510-- | raise an error if the given state is not a final state 511-- 512assertFinalState :: Position -> CHSLexerState -> CST s () 513assertFinalState pos CHSLS {nestLvl = nestLvl', inHook = inHook'} 514 | nestLvl' > 0 = raiseError pos ["Unexpected end of file!", 515 "Unclosed nested comment."] 516 | inHook' = raiseError pos ["Unexpected end of file!", 517 "Unclosed binding hook."] 518 | otherwise = return () 519 520-- | lexer and action type used throughout this specification 521-- 522type CHSLexer = Lexer CHSLexerState CHSToken 523type CHSAction = Action CHSToken 524type CHSRegexp = Regexp CHSLexerState CHSToken 525 526-- | for actions that need a new unique name 527-- 528infixl 3 `lexactionName` 529lexactionName :: CHSRegexp 530 -> (String -> Position -> Name -> CHSToken) 531 -> CHSLexer 532re `lexactionName` action = re `lexmeta` action' 533 where 534 action' str pos state = let name:ns = namesup state 535 in 536 (Just $ Right (action str pos name), 537 incPos pos (length str), 538 state {namesup = ns}, 539 Nothing) 540 541 542-- lexical specification 543-- --------------------- 544 545-- | the lexical definition of the tokens (the base lexer) 546-- 547chslexer :: CHSLexer 548chslexer = haskell -- Haskell code 549 >||< nested -- nested comments 550 >||< ctrl -- control code (that has to be preserved) 551 >||< hook -- start of a binding hook 552 >||< cpp -- a pre-processor directive (or `#c') 553 >||< startmarker -- marks beginning of input 554 555-- | stream of Haskell code (terminated by a control character or binding hook) 556-- 557haskell :: CHSLexer 558-- 559-- NB: We need to make sure that '"' is not regarded as the beginning of a 560-- string; however, we cannot really lex character literals properly 561-- without lexing identifiers (as the latter may containing single quotes 562-- as part of their lexeme). Thus, we special case '"'. This is still a 563-- kludge, as a program fragment, such as 564-- 565-- foo'"'strange string" 566-- 567-- will not be handled correctly. 568-- 569haskell = ( anyButSpecial`star` epsilon 570 >|< specialButQuotes 571 >|< char '"' +> inhstr`star` char '"' 572 >|< string "'\"'" -- special case of " 573 >|< string "--" +> anyButNL`star` epsilon -- comment 574 ) 575 `lexaction` copyVerbatim 576 >||< char '"' -- this is a bad kludge 577 `lexactionErr` 578 \_ pos -> (Left $ makeError LevelError pos 579 ["Lexical error!", 580 "Unclosed string."]) 581 where 582 anyButSpecial = alt (inlineSet \\ specialSet) 583 specialButQuotes = alt (specialSet \\ ['"']) 584 anyButNL = alt (anySet \\ ['\n']) 585 inhstr = instr >|< char '\\' >|< string "\\\"" >|< gap 586 gap = char '\\' +> alt (' ':ctrlSet)`plus` char '\\' 587 588-- | action copying the input verbatim to `CHSTokHaskell' tokens 589-- 590copyVerbatim :: CHSAction 591copyVerbatim cs pos = Just $ CHSTokHaskell pos cs 592 593-- | nested comments 594-- 595nested :: CHSLexer 596nested = 597 string "{-" {- for Haskell emacs mode :-( -} 598 `lexmeta` enterComment 599 >||< 600 string "-}" 601 `lexmeta` leaveComment 602 where 603 enterComment cs pos s = 604 (copyVerbatim' cs pos, -- collect the lexeme 605 incPos pos 2, -- advance current position 606 s {nestLvl = nestLvl s + 1}, -- increase nesting level 607 Just $ inNestedComment) -- continue in comment lexer 608 -- 609 leaveComment cs pos s = 610 case nestLvl s of 611 0 -> (commentCloseErr pos, -- 0: -} outside comment => err 612 incPos pos 2, -- advance current position 613 s, 614 Nothing) 615 1 -> (copyVerbatim' cs pos, -- collect the lexeme 616 incPos pos 2, -- advance current position 617 s {nestLvl = nestLvl s - 1}, -- decrease nesting level 618 Just chslexer) -- 1: continue with root lexer 619 _ -> (copyVerbatim' cs pos, -- collect the lexeme 620 incPos pos 2, -- advance current position 621 s {nestLvl = nestLvl s - 1}, -- decrease nesting level 622 Nothing) -- _: cont with comment lexer 623 -- 624 copyVerbatim' cs pos = Just $ Right (CHSTokHaskell pos cs) 625 -- 626 commentCloseErr pos = 627 Just $ Left (makeError LevelError pos 628 ["Lexical error!", 629 "`-}' not preceded by a matching `{-'."]) 630 {- for Haskell emacs mode :-( -} 631 632 633-- | lexer processing the inner of a comment 634-- 635inNestedComment :: CHSLexer 636inNestedComment = commentInterior -- inside a comment 637 >||< nested -- nested comments 638 >||< ctrl -- control code (preserved) 639 640-- | standard characters in a nested comment 641-- 642commentInterior :: CHSLexer 643commentInterior = ( anyButSpecial`star` epsilon 644 >|< special 645 ) 646 `lexaction` copyVerbatim 647 where 648 anyButSpecial = alt (inlineSet \\ commentSpecialSet) 649 special = alt commentSpecialSet 650 651-- | control code in the base lexer (is turned into a token) 652-- 653-- * this covers exactly the same set of characters as contained in `ctrlSet' 654-- and `Lexers.ctrlLexer' and advances positions also like the `ctrlLexer' 655-- 656ctrl :: CHSLexer 657ctrl = 658 char '\n' `lexmeta` newline 659 >||< char '\r' `lexmeta` newline 660 >||< char '\v' `lexmeta` newline 661 >||< char '\f' `lexmeta` formfeed 662 >||< char '\t' `lexmeta` tab 663 where 664 newline [c] pos = ctrlResult pos c (retPos pos) 665 formfeed [c] pos = ctrlResult pos c (incPos pos 1) 666 tab [c] pos = ctrlResult pos c (incPos pos 8) 667 668 ctrlResult pos c pos' s = 669 (Just $ Right (CHSTokCtrl pos c), pos', s, Nothing) 670 671-- | start of a binding hook (ie, enter the binding hook lexer) 672-- 673hook :: CHSLexer 674hook = string "{#" 675 `lexmeta` \_ pos s -> (Just $ Right (CHSTokHook pos), 676 incPos pos 2, s, Just bhLexer) 677 678-- | start marker: used to identify pre-processor directive at 679-- beginning of input -- this lexer just drops the start marker if it 680-- hasn't been used to handle a pre-processor directive 681-- 682startmarker :: CHSLexer 683startmarker = char '\000' `lexmeta` 684 \_ pos s -> (Nothing, incPos pos 1, s, Just chslexer) 685 686-- | pre-processor directives and `#c' 687-- 688-- * we lex `#c' as a directive and special case it in the action 689-- 690-- * we lex C line number pragmas and special case it in the action 691-- 692cpp :: CHSLexer 693cpp = directive 694 where 695 directive = 696 --(string "\n#" >|< string "\0#") +> 697 alt "\n\0" +> alt " \t" `star` string "#" +> 698 alt ('\t':inlineSet)`star` epsilon 699 `lexmeta` 700 \t@(ld:spdir) pos s -> -- strip off the "\n" or "\0" 701 let dir = drop 1 $ dropWhile (`elem` " \t") spdir 702 in case dir of 703 ['c'] -> -- #c 704 (Nothing, incPos pos (length t), s, Just cLexer) 705 -- a #c may be followed by whitespace 706 'c':sp:_ | sp `elem` " \t" -> -- #c 707 (Nothing, incPos pos (length t), s, Just cLexer) 708 ' ':line@(n:_) | isDigit n -> -- C line pragma 709 let pos' = adjustPosByCLinePragma line pos 710 in (Just $ Right (CHSTokLine pos'), pos', s, Nothing) 711 _ -> -- CPP directive 712 (Just $ Right (CHSTokCPP pos dir (ld == '\n')), 713 if ld == '\n' then retPos pos else incPos pos (length t), 714 s, Nothing) 715 716adjustPosByCLinePragma :: String -> Position -> Position 717adjustPosByCLinePragma str pos = 718 if isSourcePos pos 719 then position (posOffset pos) fname' row' 1 (posParent pos) 720 else pos 721 where 722 fname = posFile pos 723 str' = dropWhite str 724 (rowStr, str'') = span isDigit str' 725 row' = read rowStr 726 str''' = dropWhite str'' 727 fnameStr = takeWhile (/= '"') . drop 1 $ str''' 728 fname' | null str''' || head str''' /= '"' = fname 729 -- try and get more sharing of file name strings 730 | fnameStr == fname = fname 731 | otherwise = fnameStr 732 -- 733 dropWhite = dropWhile (\c -> c == ' ' || c == '\t') 734 735-- | the binding hook lexer 736-- 737bhLexer :: CHSLexer 738bhLexer = identOrKW 739 >||< symbol 740 >||< strlit 741 >||< hsverb 742 >||< hsquot 743 >||< whitespace 744 >||< arglist 745 >||< endOfHook 746 >||< string "--" +> anyButNL`star` epsilon -- comment 747 `lexaction` \cs pos -> Just (CHSTokComment pos (drop 2 cs)) 748 where 749 anyButNL = alt (anySet \\ ['\n']) 750 endOfHook = string "#}" 751 `lexmeta` 752 \_ pos s -> (Just $ Right (CHSTokEndHook pos), 753 incPos pos 2, s, Just chslexer) 754 arglist = string "[" 755 `lexmeta` \_ pos s -> (Just $ Right (CHSTokLBrack pos), 756 incPos pos 1, s, Just alLexer) 757 758-- | lexer for C function types for variadic functions 759-- 760alLexer :: CHSLexer 761alLexer = sym "," CHSTokComma 762 >||< endOfArgList 763 >||< cArg 764 where sym cs con = string cs `lexaction` \_ pos -> Just (con pos) 765 endOfArgList = string "]" 766 `lexmeta` 767 \_ pos s -> (Just $ Right (CHSTokRBrack pos), 768 incPos pos 1, s, Just bhLexer) 769 cArg = ((alt (anySet \\ ",]")) `star` epsilon) `lexaction` 770 \cs pos -> Just (CHSTokCArg pos $ trim cs) 771 trim = reverse . dropWhile isSpace . reverse . dropWhile isSpace 772 773-- | the inline-C lexer 774-- 775cLexer :: CHSLexer 776cLexer = inlineC -- inline C code 777 >||< ctrl -- control code (preserved) 778 >||< string "\n#endc" -- end of inline C code... 779 `lexmeta` -- ...preserve '\n' as control token 780 \_ pos s -> (Just $ Right (CHSTokCtrl pos '\n'), retPos pos, s, 781 Just chslexer) 782 where 783 inlineC = alt inlineSet `lexaction` copyVerbatimC 784 -- 785 copyVerbatimC :: CHSAction 786 copyVerbatimC cs pos = Just $ CHSTokC pos cs 787 788-- | whitespace 789-- 790-- * horizontal and vertical tabs, newlines, and form feeds are filter out by 791-- `Lexers.ctrlLexer' 792-- 793whitespace :: CHSLexer 794whitespace = (char ' ' `lexaction` \_ _ -> Nothing) 795 >||< ctrlLexer 796 797-- | identifiers and keywords 798-- 799identOrKW :: CHSLexer 800-- 801-- the strictness annotations seem to help a bit 802-- 803identOrKW = 804 -- identifier or keyword 805 (letter +> (letter >|< digit >|< char '\'')`star` epsilon 806 `lexactionName` \cs pos name -> (idkwtok $!pos) cs name) 807 >||< 808 (digit +> (letter >|< digit)`star` epsilon 809 `lexactionName` \cs pos name -> 810 CHSTokCIdentTail pos (mkIdent pos cs name)) 811 where 812 idkwtok pos "add" _ = CHSTokAdd pos 813 idkwtok pos "as" _ = CHSTokAs pos 814 idkwtok pos "call" _ = CHSTokCall pos 815 idkwtok pos "class" _ = CHSTokClass pos 816 idkwtok pos "const" _ = CHSTokConst pos 817 idkwtok pos "context" _ = CHSTokContext pos 818 idkwtok pos "nonGNU" _ = CHSTokNonGNU pos 819 idkwtok pos "typedef" _ = CHSTokTypedef pos 820 idkwtok pos "default" _ = CHSTokDefault pos 821 idkwtok pos "deriving" _ = CHSTokDerive pos 822 idkwtok pos "downcaseFirstLetter" _ = CHSTokDown pos 823 idkwtok pos "enum" _ = CHSTokEnum pos 824 idkwtok pos "finalizer" _ = CHSTokFinal pos 825 idkwtok pos "foreign" _ = CHSTokForeign pos 826 idkwtok pos "fun" _ = CHSTokFun pos 827 idkwtok pos "get" _ = CHSTokGet pos 828 idkwtok pos "import" _ = CHSTokImport pos 829 idkwtok pos "interruptible" _ = CHSTokIntr pos 830 idkwtok pos "lib" _ = CHSTokLib pos 831 idkwtok pos "newtype" _ = CHSTokNewtype pos 832 idkwtok pos "nocode" _ = CHSTokNocode pos 833 idkwtok pos "offsetof" _ = CHSTokOffsetof pos 834 idkwtok pos "omit" _ = CHSTokOmit pos 835 idkwtok pos "pointer" _ = CHSTokPointer pos 836 idkwtok pos "prefix" _ = CHSTokPrefix pos 837 idkwtok pos "pure" _ = CHSTokPure pos 838 idkwtok pos "qualified" _ = CHSTokQualif pos 839 idkwtok pos "set" _ = CHSTokSet pos 840 idkwtok pos "sizeof" _ = CHSTokSizeof pos 841 idkwtok pos "alignof" _ = CHSTokAlignof pos 842 idkwtok pos "stable" _ = CHSTokStable pos 843 idkwtok pos "struct" _ = CHSTokStruct pos 844 idkwtok pos "type" _ = CHSTokType pos 845 idkwtok pos "underscoreToCase" _ = CHSTok_2Case pos 846 idkwtok pos "unsafe" _ = CHSTokUnsafe pos 847 idkwtok pos "upcaseFirstLetter"_ = CHSTokUpper pos 848 idkwtok pos "variadic" _ = CHSTokVariadic pos 849 idkwtok pos "with" name = mkwith pos name 850 idkwtok pos "in" _ = CHSTokIn pos 851 idkwtok pos "out" _ = CHSTokOut pos 852 idkwtok pos cs name = mkid pos cs name 853 -- 854 mkid pos cs name = CHSTokIdent pos (mkIdent pos cs name) 855 mkwith pos name = CHSTokWith pos (mkIdent pos "with" name) 856 857keywordToIdent :: CHSToken -> CHSToken 858keywordToIdent tok = 859 case tok of 860 CHSTokAdd pos -> mkid pos "add" 861 CHSTokAs pos -> mkid pos "as" 862 CHSTokCall pos -> mkid pos "call" 863 CHSTokClass pos -> mkid pos "class" 864 CHSTokConst pos -> mkid pos "const" 865 CHSTokContext pos -> mkid pos "context" 866 CHSTokNonGNU pos -> mkid pos "nonGNU" 867 CHSTokTypedef pos -> mkid pos "typedef" 868 CHSTokDefault pos -> mkid pos "default" 869 CHSTokDerive pos -> mkid pos "deriving" 870 CHSTokDown pos -> mkid pos "downcaseFirstLetter" 871 CHSTokEnum pos -> mkid pos "enum" 872 CHSTokFinal pos -> mkid pos "finalizer" 873 CHSTokForeign pos -> mkid pos "foreign" 874 CHSTokFun pos -> mkid pos "fun" 875 CHSTokGet pos -> mkid pos "get" 876 CHSTokImport pos -> mkid pos "import" 877 CHSTokIntr pos -> mkid pos "interruptible" 878 CHSTokLib pos -> mkid pos "lib" 879 CHSTokNewtype pos -> mkid pos "newtype" 880 CHSTokNocode pos -> mkid pos "nocode" 881 CHSTokOffsetof pos -> mkid pos "offsetof" 882 CHSTokOmit pos -> mkid pos "omit" 883 CHSTokPointer pos -> mkid pos "pointer" 884 CHSTokPrefix pos -> mkid pos "prefix" 885 CHSTokPure pos -> mkid pos "pure" 886 CHSTokQualif pos -> mkid pos "qualified" 887 CHSTokSet pos -> mkid pos "set" 888 CHSTokSizeof pos -> mkid pos "sizeof" 889 CHSTokAlignof pos -> mkid pos "alignof" 890 CHSTokStable pos -> mkid pos "stable" 891 CHSTokStruct pos -> mkid pos "struct" 892 CHSTokType pos -> mkid pos "type" 893 CHSTok_2Case pos -> mkid pos "underscoreToCase" 894 CHSTokUnsafe pos -> mkid pos "unsafe" 895 CHSTokUpper pos -> mkid pos "upcaseFirstLetter" 896 CHSTokVariadic pos -> mkid pos "variadic" 897 CHSTokWith pos ide -> CHSTokIdent pos ide 898 CHSTokIn pos -> mkid pos "in" 899 CHSTokOut pos -> mkid pos "out" 900 _ -> tok 901 where mkid pos str = CHSTokIdent pos (internalIdent str) 902 903-- | reserved symbols 904-- 905symbol :: CHSLexer 906symbol = sym "->" CHSTokArrow 907 >||< sym "=>" CHSTokDArrow 908 >||< sym "." CHSTokDot 909 >||< sym "," CHSTokComma 910 >||< sym "=" CHSTokEqual 911 >||< sym "-" CHSTokMinus 912 >||< sym "*" CHSTokStar 913 >||< sym "&" CHSTokAmp 914 >||< sym "^" CHSTokHat 915 >||< sym "%" CHSTokPercent 916 >||< sym "+" CHSTokPlus 917 >||< sym "+S" CHSTokPlusS 918 >||< sym_with_num "+" CHSTokPlusNum 919 >||< sym "{" CHSTokLBrace 920 >||< sym "}" CHSTokRBrace 921 >||< sym "(" CHSTokLParen 922 >||< sym ")" CHSTokRParen 923 where 924 sym cs con = string cs `lexaction` \_ pos -> Just (con pos) 925 sym_with_num cs con = 926 string cs +> digit +> digit`star` epsilon 927 `lexaction` \(_:ds) pos -> Just (con pos (read ds)) 928 929-- | string 930-- 931strlit :: CHSLexer 932strlit = char '"' +> (instr >|< char '\\')`star` char '"' 933 `lexaction` \cs pos -> Just (CHSTokString pos (init . tail $ cs)) 934 935-- | verbatim code 936-- 937hsverb :: CHSLexer 938hsverb = char '`' +> inhsverb`star` char '\'' 939 `lexaction` \cs pos -> Just (CHSTokHSVerb pos (init . tail $ cs)) 940 941-- | quoted code 942-- 943hsquot :: CHSLexer 944hsquot = char '\'' +> inhsverb`star` char '\'' 945 `lexaction` \cs pos -> Just (CHSTokHSQuot pos (init . tail $ cs)) 946 947 948-- | regular expressions 949-- 950letter, digit, instr, inhsverb :: Regexp s t 951letter = alt ['a'..'z'] >|< alt ['A'..'Z'] >|< char '_' 952digit = alt ['0'..'9'] 953instr = alt ([' '..'\255'] \\ "\"\\") 954inhsverb = alt ([' '..'\127'] \\ "'") 955 956-- | character sets 957-- 958anySet, inlineSet, specialSet, commentSpecialSet, ctrlSet :: [Char] 959anySet = ['\1'..'\255'] 960inlineSet = anySet \\ ctrlSet 961specialSet = ['{', '-', '"', '\''] 962commentSpecialSet = ['{', '-'] 963ctrlSet = ['\n', '\f', '\r', '\t', '\v'] 964 965 966-- main lexing routine 967-- ------------------- 968 969-- | generate a token sequence out of a string denoting a CHS file 970-- 971-- * the given position is attributed to the first character in the string 972-- 973-- * errors are entered into the compiler state 974-- 975-- * on a successfull parse, the name supply is updated 976lexCHS :: String -> Position -> CST s [CHSToken] 977lexCHS cs pos = 978 do 979 nameSupply <- getNameSupply 980 state <- initialState nameSupply 981 let (ts, lstate, errs) = execLexer chslexer ('\0':cs, pos, state) 982 (_, pos', state') = lstate 983 mapM_ raise errs 984 assertFinalState pos' state' 985 setNameSupply $ namesup state' 986 return ts 987