1 -- C -> Haskell Compiler: Lexer for C Header Files 2 -- 3 -- Author : Manuel M T Chakravarty, Duncan Coutts 4 -- Created: 24 May 2005 5 -- 6 -- Version $Revision: 1.1.2.1 $ from $Date: 2005/06/14 00:16:14 $ 7 -- 8 -- Copyright (c) [1999..2004] Manuel M T Chakravarty 9 -- Copyright (c) 2005 Duncan Coutts 10 -- 11 -- This file is free software; you can redistribute it and/or modify 12 -- it under the terms of the GNU General Public License as published by 13 -- the Free Software Foundation; either version 2 of the License, or 14 -- (at your option) any later version. 15 -- 16 -- This file is distributed in the hope that it will be useful, 17 -- but WITHOUT ANY WARRANTY; without even the implied warranty of 18 -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 -- GNU General Public License for more details. 20 -- 21 --- DESCRIPTION --------------------------------------------------------------- 22 -- 23 -- Lexer for C header files after being processed by the C preprocessor 24 -- 25 --- DOCU ---------------------------------------------------------------------- 26 -- 27 -- language: Haskell 98 28 -- 29 -- We assume that the input already went through cpp. Thus, we do not handle 30 -- comments and preprocessor directives here. The lexer recognizes all tokens 31 -- of ANCI C except those occurring only in function bodies. It supports the 32 -- C99 `restrict' extension: <http://www.lysator.liu.se/c/restrict.html> as 33 -- well as inline functions. 34 -- 35 -- Comments: 36 -- 37 -- * There is no support for the optional feature of extended characters (see 38 -- K&R A2.5.2) or the corresponding strings (A2.6). 39 -- 40 -- * We add `typedef-name' (K&R 8.9) as a token, as proposed in K&R A13. 41 -- However, as these tokens cannot be recognized lexically, but require a 42 -- context analysis, they are never produced by the lexer, but instead have 43 -- to be introduced in a later phase (by converting the corresponding 44 -- identifiers). 45 -- 46 -- * We also recognize GNU C `__attribute__', `__extension__', `__const', 47 -- `__const__', `__inline', `__inline__', `__restrict', and `__restrict__'. 48 -- 49 -- * Any line starting with `#pragma' is ignored. 50 -- 51 -- With K&R we refer to ``The C Programming Language'', second edition, Brain 52 -- W. Kernighan and Dennis M. Ritchie, Prentice Hall, 1988. 53 -- 54 --- TODO ---------------------------------------------------------------------- 55 -- 56 -- * `showsPrec' of `CTokCLit' should produce K&R-conforming escapes; 57 -- same for `CTokSLit' 58 -- 59 -- * There are more GNU C specific keywords. Add them and change `CParser' 60 -- correspondingly (in particular, most tokens within __attribute ((...)) 61 -- expressions are actually keywords, but we handle them as identifiers at 62 -- the moment). 63 -- 64 65 { 66 67 module CLexer (lexC, parseError) where 68 69 import Data.Char (isDigit, ord) 70 import Data.Word (Word8) 71 import Numeric (readDec, readOct, readHex) 72 73 import Position (Position(..), Pos(posOf)) 74 import Errors (interr) 75 import UNames (Name) 76 import Idents (Ident, lexemeToIdent, identToLexeme) 77 78 import CTokens 79 import CParserMonad 80 81 } 82 83 $space = [ \ \t ] -- horizontal white space 84 $eol = \n 85 86 $letter = [a-zA-Z_] 87 $octdigit = 0-7 88 $digit = 0-9 89 $digitNZ = 1-9 90 $hexdigit = [0-9a-fA-F] 91 92 $inchar = \0-\255 # [ \\ \' \n \r ] 93 $instr = \0-\255 # [ \\ \" \n \r ] 94 $anyButNL = \0-\255 # \n 95 $infname = \ -\255 # [ \\ \" ] 96 $visible = \ -\127 97 98 @int = $digitNZ$digit* 99 @sp = $space* 100 101 -- character escape sequence (follows K&R A2.5.2) 102 -- 103 -- * also used for strings 104 -- 105 @charesc = \\([ntvbrfae\\\?\'\"]|$octdigit{1,3}|x$hexdigit+) 106 107 -- components of float constants (follows K&R A2.5.3) 108 -- 109 @digits = $digit+ 110 @intpart = @digits 111 @fractpart = @digits 112 @mantpart = @intpart?\.@fractpart|@intpart\. 113 @exppart = [eE][\+\-]?@digits 114 @suffix = [fFlLqQwW] 115 116 117 tokens :- 118 119 -- whitespace (follows K&R A2.1) 120 -- 121 -- * horizontal and vertical tabs, newlines, and form feeds are filter out by 122 -- `Lexers.ctrlLexer' 123 -- 124 -- * comments are not handled, as we assume the input already went through cpp 125 -- 126 $white+ ; 127 128 -- #line directive (K&R A12.6) 129 -- 130 -- * allows further ints after the file name a la GCC; as the GCC CPP docu 131 -- doesn't say how many ints there can be, we allow an unbound number 132 -- 133 \#$space*@int$space*(\"($infname|@charesc)*\"$space*)?(@int$space*)*$eol 134 { \pos len str -> setPos (adjustPos (take len str) pos) >> lexToken } 135 136 -- #pragma directive (K&R A12.8) 137 -- 138 -- * we simply ignore any #pragma (but take care to update the position 139 -- information) 140 -- 141 \#$space*pragma$anyButNL*$eol ; 142 143 -- #itent directive, eg used by rcs/cvs 144 -- 145 -- * we simply ignore any #itent (but take care to update the position 146 -- information) 147 -- 148 \#$space*ident$anyButNL*$eol ; 149 150 -- identifiers and keywords (follows K&R A2.3 and A2.4) 151 -- 152 $letter($letter|$digit)* { \pos len str -> idkwtok (take len str) pos } 153 154 -- constants (follows K&R A2.5) 155 -- 156 -- * K&R explicit mentions `enumeration-constants'; however, as they are 157 -- lexically identifiers, we do not have an extra case for them 158 -- 159 160 -- integer constants (follows K&R A2.5.1) 161 -- 162 0$octdigit*[uUlL]{0,3} { token CTokILit (fst . head . readOct) } 163 $digitNZ$digit*[uUlL]{0,3} { token CTokILit (fst . head . readDec) } 164 0[xX]$hexdigit*[uUlL]{0,3} { token CTokILit (fst . head . readHex . drop 2) } 165 166 -- character constants (follows K&R A2.5.2) 167 -- 168 \'($inchar|@charesc)\' { token CTokCLit (fst . oneChar . tail) } 169 L\'($inchar|@charesc)\' { token CTokCLit (fst . oneChar . tail . tail) } 170 171 -- float constants (follows K&R A2.5.3) 172 -- 173 (@mantpart@exppart?|@intpart@exppart)@suffix? { token CTokFLit id } 174 175 -- string literal (follows K&R A2.6) 176 -- 177 \"($instr|@charesc)*\" { token CTokSLit normalizeEscapes } 178 L\"($instr|@charesc)*\" { token CTokSLit (normalizeEscapes . tail) } 179 180 181 -- operators and separators 182 -- 183 "(" { token_ CTokLParen } 184 ")" { token_ CTokRParen } 185 "[" { token_ CTokLBracket } 186 "]" { token_ CTokRBracket } 187 "->" { token_ CTokArrow } 188 "." { token_ CTokDot } 189 "!" { token_ CTokExclam } 190 "~" { token_ CTokTilde } 191 "++" { token_ CTokInc } 192 "--" { token_ CTokDec } 193 "+" { token_ CTokPlus } 194 "-" { token_ CTokMinus } 195 "*" { token_ CTokStar } 196 "/" { token_ CTokSlash } 197 "%" { token_ CTokPercent } 198 "&" { token_ CTokAmper } 199 "<<" { token_ CTokShiftL } 200 ">>" { token_ CTokShiftR } 201 "<" { token_ CTokLess } 202 "<=" { token_ CTokLessEq } 203 ">" { token_ CTokHigh } 204 ">=" { token_ CTokHighEq } 205 "==" { token_ CTokEqual } 206 "!=" { token_ CTokUnequal } 207 "^" { token_ CTokHat } 208 "|" { token_ CTokBar } 209 "&&" { token_ CTokAnd } 210 "||" { token_ CTokOr } 211 "?" { token_ CTokQuest } 212 ":" { token_ CTokColon } 213 "=" { token_ CTokAssign } 214 "+=" { token_ CTokPlusAss } 215 "-=" { token_ CTokMinusAss } 216 "*=" { token_ CTokStarAss } 217 "/=" { token_ CTokSlashAss } 218 "%=" { token_ CTokPercAss } 219 "&=" { token_ CTokAmpAss } 220 "^=" { token_ CTokHatAss } 221 "|=" { token_ CTokBarAss } 222 "<<=" { token_ CTokSLAss } 223 ">>=" { token_ CTokSRAss } 224 "," { token_ CTokComma } 225 \; { token_ CTokSemic } 226 "{" { token_ CTokLBrace } 227 "}" { token_ CTokRBrace } 228 "..." { token_ CTokEllipsis } 229 230 231 { 232 233 -- We use the odd looking list of string patterns here rather than normal 234 -- string literals since GHC converts the latter into a sequence of string 235 -- comparisons (ie a linear search) but it translates the former using its 236 -- efficient pattern matching which gives us the expected radix-style search. 237 -- This gives change makes a significant performance difference. 238 -- 239 idkwtok :: String -> Position -> P CToken 240 idkwtok ('a':'l':'i':'g':'n':'o':'f':[]) = tok CTokAlignof 241 idkwtok ('_':'_':'a':'l':'i':'g':'n':'o':'f':[]) = tok CTokAlignof 242 idkwtok ('_':'_':'a':'l':'i':'g':'n':'o':'f':'_':'_':[]) = tok CTokAlignof 243 idkwtok ('a':'s':'m':[]) = tok CTokAsm 244 idkwtok ('_':'_':'a':'s':'m':[]) = tok CTokAsm 245 idkwtok ('_':'_':'a':'s':'m':'_':'_':[]) = tok CTokAsm 246 idkwtok ('a':'u':'t':'o':[]) = tok CTokAuto 247 idkwtok ('b':'r':'e':'a':'k':[]) = tok CTokBreak 248 idkwtok ('_':'B':'o':'o':'l':[]) = tok CTokBool 249 idkwtok ('c':'a':'s':'e':[]) = tok CTokCase 250 idkwtok ('c':'h':'a':'r':[]) = tok CTokChar 251 idkwtok ('c':'o':'n':'s':'t':[]) = tok CTokConst 252 idkwtok ('_':'_':'c':'o':'n':'s':'t':[]) = tok CTokConst 253 idkwtok ('_':'_':'c':'o':'n':'s':'t':'_':'_':[]) = tok CTokConst 254 idkwtok ('c':'o':'n':'t':'i':'n':'u':'e':[]) = tok CTokContinue 255 idkwtok ('_':'C':'o':'m':'p':'l':'e':'x':[]) = tok CTokComplex 256 idkwtok ('d':'e':'f':'a':'u':'l':'t':[]) = tok CTokDefault 257 idkwtok ('d':'o':[]) = tok CTokDo 258 idkwtok ('d':'o':'u':'b':'l':'e':[]) = tok CTokDouble 259 idkwtok ('e':'l':'s':'e':[]) = tok CTokElse 260 idkwtok ('e':'n':'u':'m':[]) = tok CTokEnum 261 idkwtok ('e':'x':'t':'e':'r':'n':[]) = tok CTokExtern 262 idkwtok ('_':'_':'f':'l':'o':'a':'t':'1':'2':'8':[]) = tok CTokFloat128 263 idkwtok ('f':'l':'o':'a':'t':[]) = tok CTokFloat 264 idkwtok ('f':'o':'r':[]) = tok CTokFor 265 idkwtok ('g':'o':'t':'o':[]) = tok CTokGoto 266 idkwtok ('i':'f':[]) = tok CTokIf 267 idkwtok ('i':'n':'l':'i':'n':'e':[]) = tok CTokInline 268 idkwtok ('_':'_':'i':'n':'l':'i':'n':'e':[]) = tok CTokInline 269 idkwtok ('_':'_':'i':'n':'l':'i':'n':'e':'_':'_':[]) = tok CTokInline 270 idkwtok ('i':'n':'t':[]) = tok CTokInt 271 idkwtok ('_':'_':'u':'i':'n':'t':'1':'2':'8':'_':'t':[]) = tok CTokInt 272 idkwtok ('_':'_':'i':'n':'t':'1':'2':'8':'_':'t':[]) = tok CTokInt 273 idkwtok ('_':'_':'u':'i':'n':'t':'1':'2':'8':[]) = tok CTokInt 274 idkwtok ('_':'_':'i':'n':'t':'1':'2':'8':[]) = tok CTokInt 275 idkwtok ('l':'o':'n':'g':[]) = tok CTokLong 276 idkwtok ('r':'e':'g':'i':'s':'t':'e':'r':[]) = tok CTokRegister 277 idkwtok ('r':'e':'s':'t':'r':'i':'c':'t':[]) = tok CTokRestrict 278 idkwtok ('_':'_':'r':'e':'s':'t':'r':'i':'c':'t':[]) = tok CTokRestrict 279 idkwtok ('_':'_':'r':'e':'s':'t':'r':'i':'c':'t':'_':'_':[]) = tok CTokRestrict 280 idkwtok ('r':'e':'t':'u':'r':'n':[]) = tok CTokReturn 281 idkwtok ('s':'h':'o':'r':'t':[]) = tok CTokShort 282 idkwtok ('s':'i':'g':'n':'e':'d':[]) = tok CTokSigned 283 idkwtok ('_':'_':'s':'i':'g':'n':'e':'d':[]) = tok CTokSigned 284 idkwtok ('_':'_':'s':'i':'g':'n':'e':'d':'_':'_':[]) = tok CTokSigned 285 idkwtok ('s':'i':'z':'e':'o':'f':[]) = tok CTokSizeof 286 idkwtok ('s':'t':'a':'t':'i':'c':[]) = tok CTokStatic 287 idkwtok ('s':'t':'r':'u':'c':'t':[]) = tok CTokStruct 288 idkwtok ('s':'w':'i':'t':'c':'h':[]) = tok CTokSwitch 289 idkwtok ('t':'y':'p':'e':'d':'e':'f':[]) = tok CTokTypedef 290 idkwtok ('t':'y':'p':'e':'o':'f':[]) = tok CTokTypeof 291 idkwtok ('_':'_':'t':'y':'p':'e':'o':'f':[]) = tok CTokTypeof 292 idkwtok ('_':'_':'t':'y':'p':'e':'o':'f':'_':'_':[]) = tok CTokTypeof 293 idkwtok ('_':'_':'t':'h':'r':'e':'a':'d':[]) = tok CTokThread 294 idkwtok ('u':'n':'i':'o':'n':[]) = tok CTokUnion 295 idkwtok ('u':'n':'s':'i':'g':'n':'e':'d':[]) = tok CTokUnsigned 296 idkwtok ('v':'o':'i':'d':[]) = tok CTokVoid 297 idkwtok ('v':'o':'l':'a':'t':'i':'l':'e':[]) = tok CTokVolatile 298 idkwtok ('_':'_':'v':'o':'l':'a':'t':'i':'l':'e':[]) = tok CTokVolatile 299 idkwtok ('_':'_':'v':'o':'l':'a':'t':'i':'l':'e':'_':'_':[]) = tok CTokVolatile 300 idkwtok ('w':'h':'i':'l':'e':[]) = tok CTokWhile 301 idkwtok ('_':'_':'l':'a':'b':'e':'l':'_':'_':[]) = tok CTokLabel 302 idkwtok ('_':'_':'a':'t':'t':'r':'i':'b':'u':'t':'e':[]) = tok (CTokGnuC GnuCAttrTok) 303 -- ignoreAttribute >> lexToken 304 idkwtok ('_':'_':'a':'t':'t':'r':'i':'b':'u':'t':'e':'_':'_':[]) = tok (CTokGnuC GnuCAttrTok) 305 -- ignoreAttribute >> lexToken 306 idkwtok ('_':'_':'e':'x':'t':'e':'n':'s':'i':'o':'n':'_':'_':[]) = 307 tok (CTokGnuC GnuCExtTok) 308 idkwtok ('_':'_':'b':'u':'i':'l':'t':'i':'n':'_':rest) 309 | rest == "va_arg" = tok (CTokGnuC GnuCVaArg) 310 | rest == "offsetof" = tok (CTokGnuC GnuCOffsetof) 311 | rest == "types_compatible_p" = tok (CTokGnuC GnuCTyCompat) 312 313 idkwtok cs = \pos -> do 314 name <- getNewName 315 let ident = lexemeToIdent pos cs name 316 tyident <- isTypeIdent ident 317 if tyident 318 then return (CTokTyIdent pos ident) 319 else return (CTokIdent pos ident) 320 321 ignoreAttribute :: P () 322 ignoreAttribute = skipTokens 0 323 where skipTokens n = do 324 tok <- lexToken 325 case tok of 326 CTokRParen _ | n == 1 -> return () 327 | otherwise -> skipTokens (n-1) 328 CTokLParen _ -> skipTokens (n+1) 329 _ -> skipTokens n 330 331 tok :: (Position -> CToken) -> Position -> P CToken 332 tok tc pos = return (tc pos) 333 334 -- converts the first character denotation of a C-style string to a character 335 -- and the remaining string 336 -- 337 oneChar :: String -> (Char, String) 338 oneChar ('\\':c:cs) = case c of 339 'n' -> ('\n', cs) 340 't' -> ('\t', cs) 341 'v' -> ('\v', cs) 342 'b' -> ('\b', cs) 343 'r' -> ('\r', cs) 344 'f' -> ('\f', cs) 345 'a' -> ('\a', cs) 346 'e' -> ('\ESC', cs) --GNU C extension 347 '\\' -> ('\\', cs) 348 '?' -> ('?', cs) 349 '\'' -> ('\'', cs) 350 '"' -> ('"', cs) 351 'x' -> case head (readHex cs) of 352 (i, cs') -> (toEnum i, cs') 353 _ -> case head (readOct (c:cs)) of 354 (i, cs') -> (toEnum i, cs') 355 oneChar (c :cs) = (c, cs) 356 357 normalizeEscapes [] = [] 358 normalizeEscapes cs = case oneChar cs of 359 (c, cs') -> c : normalizeEscapes cs' 360 361 adjustPos :: String -> Position -> Position 362 adjustPos str (Position fname row _) = Position fname' row' 0 363 where 364 str' = dropWhite . drop 1 $ str 365 (rowStr, str'') = span isDigit str' 366 row' = read rowStr 367 str''' = dropWhite str'' 368 fnameStr = takeWhile (/= '"') . drop 1 $ str''' 369 fname' | null str''' || head str''' /= '"' = fname 370 -- try and get more sharing of file name strings 371 | fnameStr == fname = fname 372 | otherwise = fnameStr 373 -- 374 dropWhite = dropWhile (\c -> c == ' ' || c == '\t') 375 376 {-# INLINE token_ #-} 377 -- token that ignores the string 378 token_ :: (Position -> CToken) -> Position -> Int -> String -> P CToken 379 token_ tok pos _ _ = return (tok pos) 380 381 {-# INLINE token #-} 382 -- token that uses the string 383 token :: (Position -> a -> CToken) -> (String -> a) 384 -> Position -> Int -> String -> P CToken 385 token tok read pos len str = return (tok pos (read $ take len str)) 386 387 388 -- ----------------------------------------------------------------------------- 389 -- The input type 390 391 type AlexInput = (Position, -- current position, 392 String) -- current input string 393 394 alexInputPrevChar :: AlexInput -> Char 395 alexInputPrevChar _ = error "alexInputPrevChar not used" 396 397 -- For alex >= 3.0 398 alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) 399 alexGetByte (p,[]) = Nothing 400 alexGetByte (p,(c:s)) = let p' = alexMove p c in p' `seq` 401 Just (fromIntegral $ ord c, (p', s)) 402 403 -- For alex < 3.0 404 alexGetChar :: AlexInput -> Maybe (Char,AlexInput) 405 alexGetChar (p,[]) = Nothing 406 alexGetChar (p,(c:s)) = let p' = alexMove p c in p' `seq` 407 Just (c, (p', s)) 408 409 alexMove :: Position -> Char -> Position 410 alexMove (Position f l c) '\t' = Position f l (((c+7) `div` 8)*8+1) 411 alexMove (Position f l c) '\n' = Position f (l+1) 1 412 alexMove (Position f l c) _ = Position f l (c+1) 413 414 lexicalError :: P a 415 lexicalError = do 416 pos <- getPos 417 (c:cs) <- getInput 418 failP pos 419 ["Lexical error!", 420 "The character " ++ show c ++ " does not fit here."] 421 422 parseError :: P a 423 parseError = do 424 tok <- getLastToken 425 failP (posOf tok) 426 ["Syntax error!", 427 "The symbol `" ++ show tok ++ "' does not fit here."] 428 429 lexToken :: P CToken 430 lexToken = do 431 pos <- getPos 432 inp <- getInput 433 case alexScan (pos, inp) 0 of 434 AlexEOF -> return CTokEof 435 AlexError inp' -> lexicalError 436 AlexSkip (pos', inp') len -> do 437 setPos pos' 438 setInput inp' 439 lexToken 440 AlexToken (pos', inp') len action -> do 441 setPos pos' 442 setInput inp' 443 tok <- action pos len inp 444 setLastToken tok 445 return tok 446 447 lexC :: (CToken -> P a) -> P a 448 lexC cont = do 449 tok <- lexToken 450 cont tok 451 } 452