c2hs/c/CLexer.x

--  C -> Haskell Compiler: Lexer for C Header Files
--
--  Author : Manuel M T Chakravarty, Duncan Coutts
--  Created: 24 May 2005
--
--  Version $Revision: 1.1.2.1 $ from $Date: 2005/06/14 00:16:14 $
--
--  Copyright (c) [1999..2004] Manuel M T Chakravarty
--  Copyright (c) 2005 Duncan Coutts
--
--  This file is free software; you can redistribute it and/or modify
--  it under the terms of the GNU General Public License as published by
--  the Free Software Foundation; either version 2 of the License, or
--  (at your option) any later version.
--
--  This file is distributed in the hope that it will be useful,
--  but WITHOUT ANY WARRANTY; without even the implied warranty of
--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
--  GNU General Public License for more details.
--
--- DESCRIPTION ---------------------------------------------------------------
--
--  Lexer for C header files after being processed by the C preprocessor
--
--- DOCU ----------------------------------------------------------------------
--
--  language: Haskell 98
--
--  We assume that the input already went through cpp.  Thus, we do not handle
--  comments and preprocessor directives here.  The lexer recognizes all tokens
--  of ANCI C except those occurring only in function bodies.  It supports the
--  C99 `restrict' extension: <http://www.lysator.liu.se/c/restrict.html> as
--  well as inline functions.
--
--  Comments:
--
--  * There is no support for the optional feature of extended characters (see
--    K&R A2.5.2) or the corresponding strings (A2.6).
--
--  * We add `typedef-name' (K&R 8.9) as a token, as proposed in K&R A13.
--    However, as these tokens cannot be recognized lexically, but require a
--    context analysis, they are never produced by the lexer, but instead have
--    to be introduced in a later phase (by converting the corresponding
--    identifiers).
--
--  * We also recognize GNU C `__attribute__', `__extension__', `__const',
--    `__const__', `__inline', `__inline__', `__restrict', and `__restrict__'.
--
--  * Any line starting with `#pragma' is ignored.
--
--  With K&R we refer to ``The C Programming Language'', second edition, Brain
--  W. Kernighan and Dennis M. Ritchie, Prentice Hall, 1988.
--
--- TODO ----------------------------------------------------------------------
--
--  * `showsPrec' of `CTokCLit' should produce K&R-conforming escapes;
--    same for `CTokSLit'
--
--  * There are more GNU C specific keywords.  Add them and change `CParser'
--    correspondingly (in particular, most tokens within __attribute ((...))
--    expressions are actually keywords, but we handle them as identifiers at
--    the moment).
--

{

module CLexer (lexC, parseError) where

import Data.Char      (isDigit, ord)
import Data.Word      (Word8)
import Numeric   (readDec, readOct, readHex)

import Position  (Position(..), Pos(posOf))
import Errors    (interr)
import UNames	 (Name)
import Idents    (Ident, lexemeToIdent, identToLexeme)

import CTokens
import CParserMonad

}

$space = [ \ \t ] -- horizontal white space
$eol   = \n

$letter   = [a-zA-Z_]
$octdigit = 0-7
$digit    = 0-9
$digitNZ  = 1-9
$hexdigit = [0-9a-fA-F]

$inchar   = \0-\255 # [ \\ \' \n \r ]
$instr    = \0-\255 # [ \\ \" \n \r ]
$anyButNL = \0-\255 # \n
$infname  = \ -\255 # [ \\ \" ]
$visible  = \ -\127

@int = $digitNZ$digit*
@sp  = $space*

-- character escape sequence (follows K&R A2.5.2)
--
--  * also used for strings
--
@charesc  = \\([ntvbrfae\\\?\'\"]|$octdigit{1,3}|x$hexdigit+)

-- components of float constants (follows K&R A2.5.3)
--
@digits    = $digit+
@intpart   = @digits
@fractpart = @digits
@mantpart  = @intpart?\.@fractpart|@intpart\.
@exppart   = [eE][\+\-]?@digits
@suffix    = [fFlLqQwW]


tokens :-

-- whitespace (follows K&R A2.1)
--
--  * horizontal and vertical tabs, newlines, and form feeds are filter out by
--   `Lexers.ctrlLexer'
--
--  * comments are not handled, as we assume the input already went through cpp
--
$white+					;

-- #line directive (K&R A12.6)
--
--  * allows further ints after the file name a la GCC; as the GCC CPP docu
--   doesn't say how many ints there can be, we allow an unbound number
--
\#$space*@int$space*(\"($infname|@charesc)*\"$space*)?(@int$space*)*$eol
  { \pos len str -> setPos (adjustPos (take len str) pos) >> lexToken }

-- #pragma directive (K&R A12.8)
--
--  * we simply ignore any #pragma (but take care to update the position
--   information)
--
\#$space*pragma$anyButNL*$eol		;

-- #itent directive, eg used by rcs/cvs
--
--  * we simply ignore any #itent (but take care to update the position
--   information)
--
\#$space*ident$anyButNL*$eol		;

-- identifiers and keywords (follows K&R A2.3 and A2.4)
--
$letter($letter|$digit)*	{ \pos len str -> idkwtok (take len str) pos }

-- constants (follows K&R A2.5)
--
--  * K&R explicit mentions `enumeration-constants'; however, as they are
--   lexically identifiers, we do not have an extra case for them
--

-- integer constants (follows K&R A2.5.1)
--
0$octdigit*[uUlL]{0,3}		{ token CTokILit (fst . head . readOct) }
$digitNZ$digit*[uUlL]{0,3}	{ token CTokILit (fst . head . readDec) }
0[xX]$hexdigit*[uUlL]{0,3}	{ token CTokILit (fst . head . readHex . drop 2) }

-- character constants (follows K&R A2.5.2)
--
\'($inchar|@charesc)\'	{ token CTokCLit (fst . oneChar . tail) }
L\'($inchar|@charesc)\'	{ token CTokCLit (fst . oneChar . tail . tail) }

-- float constants (follows K&R A2.5.3)
--
(@mantpart@exppart?|@intpart@exppart)@suffix?	{ token CTokFLit id }

-- string literal (follows K&R A2.6)
--
\"($instr|@charesc)*\"			{ token CTokSLit normalizeEscapes }
L\"($instr|@charesc)*\"			{ token CTokSLit (normalizeEscapes . tail) }


-- operators and separators
--
"("	{ token_ CTokLParen }
")"	{ token_ CTokRParen  }
"["	{ token_ CTokLBracket }
"]"	{ token_ CTokRBracket }
"->"	{ token_ CTokArrow }
"."	{ token_ CTokDot }
"!"	{ token_ CTokExclam }
"~"	{ token_ CTokTilde }
"++"	{ token_ CTokInc }
"--"	{ token_ CTokDec }
"+"	{ token_ CTokPlus }
"-"	{ token_ CTokMinus }
"*"	{ token_ CTokStar }
"/"	{ token_ CTokSlash }
"%"	{ token_ CTokPercent }
"&"	{ token_ CTokAmper }
"<<"	{ token_ CTokShiftL }
">>"	{ token_ CTokShiftR }
"<"	{ token_ CTokLess }
"<="	{ token_ CTokLessEq }
">"	{ token_ CTokHigh }
">="	{ token_ CTokHighEq }
"=="	{ token_ CTokEqual }
"!="	{ token_ CTokUnequal }
"^"	{ token_ CTokHat }
"|"	{ token_ CTokBar }
"&&"	{ token_ CTokAnd }
"||"	{ token_ CTokOr }
"?"	{ token_ CTokQuest }
":"	{ token_ CTokColon }
"="	{ token_ CTokAssign }
"+="	{ token_ CTokPlusAss }
"-="	{ token_ CTokMinusAss }
"*="	{ token_ CTokStarAss }
"/="	{ token_ CTokSlashAss }
"%="	{ token_ CTokPercAss }
"&="	{ token_ CTokAmpAss }
"^="	{ token_ CTokHatAss }
"|="	{ token_ CTokBarAss }
"<<="	{ token_ CTokSLAss }
">>="	{ token_ CTokSRAss }
","	{ token_ CTokComma }
\;	{ token_ CTokSemic }
"{"	{ token_ CTokLBrace }
"}"	{ token_ CTokRBrace }
"..."	{ token_ CTokEllipsis }


{

-- We use the odd looking list of string patterns here rather than normal
-- string literals since GHC converts the latter into a sequence of string
-- comparisons (ie a linear search) but it translates the former using its
-- efficient pattern matching which gives us the expected radix-style search.
-- This gives change makes a significant performance difference.
--
idkwtok :: String -> Position -> P CToken
idkwtok ('a':'l':'i':'g':'n':'o':'f':[])		     = tok CTokAlignof
idkwtok ('_':'_':'a':'l':'i':'g':'n':'o':'f':[])	     = tok CTokAlignof
idkwtok ('_':'_':'a':'l':'i':'g':'n':'o':'f':'_':'_':[])     = tok CTokAlignof
idkwtok ('a':'s':'m':[])				     = tok CTokAsm
idkwtok ('_':'_':'a':'s':'m':[])			     = tok CTokAsm
idkwtok ('_':'_':'a':'s':'m':'_':'_':[])		     = tok CTokAsm
idkwtok ('a':'u':'t':'o':[])				     = tok CTokAuto
idkwtok ('b':'r':'e':'a':'k':[])			     = tok CTokBreak
idkwtok ('_':'B':'o':'o':'l':[])			     = tok CTokBool
idkwtok ('c':'a':'s':'e':[])				     = tok CTokCase
idkwtok ('c':'h':'a':'r':[])				     = tok CTokChar
idkwtok ('c':'o':'n':'s':'t':[])			     = tok CTokConst
idkwtok ('_':'_':'c':'o':'n':'s':'t':[])		     = tok CTokConst
idkwtok ('_':'_':'c':'o':'n':'s':'t':'_':'_':[])	     = tok CTokConst
idkwtok ('c':'o':'n':'t':'i':'n':'u':'e':[])		     = tok CTokContinue
idkwtok ('_':'C':'o':'m':'p':'l':'e':'x':[])		     = tok CTokComplex
idkwtok ('d':'e':'f':'a':'u':'l':'t':[])		     = tok CTokDefault
idkwtok ('d':'o':[])					     = tok CTokDo
idkwtok ('d':'o':'u':'b':'l':'e':[])			     = tok CTokDouble
idkwtok ('e':'l':'s':'e':[])				     = tok CTokElse
idkwtok ('e':'n':'u':'m':[])				     = tok CTokEnum
idkwtok ('e':'x':'t':'e':'r':'n':[])			     = tok CTokExtern
idkwtok ('_':'_':'f':'l':'o':'a':'t':'1':'2':'8':[])           = tok CTokFloat128
idkwtok ('f':'l':'o':'a':'t':[])			     = tok CTokFloat
idkwtok ('f':'o':'r':[])				     = tok CTokFor
idkwtok ('g':'o':'t':'o':[])				     = tok CTokGoto
idkwtok ('i':'f':[])					     = tok CTokIf
idkwtok ('i':'n':'l':'i':'n':'e':[])			     = tok CTokInline
idkwtok ('_':'_':'i':'n':'l':'i':'n':'e':[])		     = tok CTokInline
idkwtok ('_':'_':'i':'n':'l':'i':'n':'e':'_':'_':[])	     = tok CTokInline
idkwtok ('i':'n':'t':[])				     = tok CTokInt
idkwtok ('_':'_':'u':'i':'n':'t':'1':'2':'8':'_':'t':[])     = tok CTokInt
idkwtok ('_':'_':'i':'n':'t':'1':'2':'8':'_':'t':[])	     = tok CTokInt
idkwtok ('_':'_':'u':'i':'n':'t':'1':'2':'8':[])	     = tok CTokInt
idkwtok ('_':'_':'i':'n':'t':'1':'2':'8':[])		     = tok CTokInt
idkwtok ('l':'o':'n':'g':[])				     = tok CTokLong
idkwtok ('r':'e':'g':'i':'s':'t':'e':'r':[])		     = tok CTokRegister
idkwtok ('r':'e':'s':'t':'r':'i':'c':'t':[])		     = tok CTokRestrict
idkwtok ('_':'_':'r':'e':'s':'t':'r':'i':'c':'t':[])	     = tok CTokRestrict
idkwtok ('_':'_':'r':'e':'s':'t':'r':'i':'c':'t':'_':'_':[]) = tok CTokRestrict
idkwtok ('r':'e':'t':'u':'r':'n':[])			     = tok CTokReturn
idkwtok ('s':'h':'o':'r':'t':[])			     = tok CTokShort
idkwtok ('s':'i':'g':'n':'e':'d':[])			     = tok CTokSigned
idkwtok ('_':'_':'s':'i':'g':'n':'e':'d':[])		     = tok CTokSigned
idkwtok ('_':'_':'s':'i':'g':'n':'e':'d':'_':'_':[])	     = tok CTokSigned
idkwtok ('s':'i':'z':'e':'o':'f':[])			     = tok CTokSizeof
idkwtok ('s':'t':'a':'t':'i':'c':[])			     = tok CTokStatic
idkwtok ('s':'t':'r':'u':'c':'t':[])			     = tok CTokStruct
idkwtok ('s':'w':'i':'t':'c':'h':[])			     = tok CTokSwitch
idkwtok ('t':'y':'p':'e':'d':'e':'f':[])		     = tok CTokTypedef
idkwtok ('t':'y':'p':'e':'o':'f':[])			     = tok CTokTypeof
idkwtok ('_':'_':'t':'y':'p':'e':'o':'f':[])		     = tok CTokTypeof
idkwtok ('_':'_':'t':'y':'p':'e':'o':'f':'_':'_':[])	     = tok CTokTypeof
idkwtok ('_':'_':'t':'h':'r':'e':'a':'d':[])		     = tok CTokThread
idkwtok ('u':'n':'i':'o':'n':[])			     = tok CTokUnion
idkwtok ('u':'n':'s':'i':'g':'n':'e':'d':[])		     = tok CTokUnsigned
idkwtok ('v':'o':'i':'d':[])				     = tok CTokVoid
idkwtok ('v':'o':'l':'a':'t':'i':'l':'e':[])		     = tok CTokVolatile
idkwtok ('_':'_':'v':'o':'l':'a':'t':'i':'l':'e':[])	     = tok CTokVolatile
idkwtok ('_':'_':'v':'o':'l':'a':'t':'i':'l':'e':'_':'_':[]) = tok CTokVolatile
idkwtok ('w':'h':'i':'l':'e':[])			     = tok CTokWhile
idkwtok ('_':'_':'l':'a':'b':'e':'l':'_':'_':[])             = tok CTokLabel
idkwtok ('_':'_':'a':'t':'t':'r':'i':'b':'u':'t':'e':[]) = tok (CTokGnuC GnuCAttrTok)
--						ignoreAttribute >> lexToken
idkwtok ('_':'_':'a':'t':'t':'r':'i':'b':'u':'t':'e':'_':'_':[]) = tok (CTokGnuC GnuCAttrTok)
--						ignoreAttribute >> lexToken
idkwtok ('_':'_':'e':'x':'t':'e':'n':'s':'i':'o':'n':'_':'_':[]) =
						tok (CTokGnuC GnuCExtTok)
idkwtok ('_':'_':'b':'u':'i':'l':'t':'i':'n':'_':rest)
        | rest == "va_arg"             = tok (CTokGnuC GnuCVaArg)
        | rest == "offsetof"           = tok (CTokGnuC GnuCOffsetof)
        | rest == "types_compatible_p" = tok (CTokGnuC GnuCTyCompat)

idkwtok cs = \pos -> do
  name <- getNewName
  let ident = lexemeToIdent pos cs name
  tyident <- isTypeIdent ident
  if tyident
    then return (CTokTyIdent pos ident)
    else return (CTokIdent   pos ident)

ignoreAttribute :: P ()
ignoreAttribute = skipTokens 0
  where skipTokens n = do
          tok <- lexToken
          case tok of
            CTokRParen _ | n == 1    -> return ()
                         | otherwise -> skipTokens (n-1)
            CTokLParen _             -> skipTokens (n+1)
            _                        -> skipTokens n

tok :: (Position -> CToken) -> Position -> P CToken
tok tc pos = return (tc pos)

-- converts the first character denotation of a C-style string to a character
-- and the remaining string
--
oneChar             :: String -> (Char, String)
oneChar ('\\':c:cs)  = case c of
			 'n'  -> ('\n', cs)
			 't'  -> ('\t', cs)
			 'v'  -> ('\v', cs)
			 'b'  -> ('\b', cs)
			 'r'  -> ('\r', cs)
			 'f'  -> ('\f', cs)
			 'a'  -> ('\a', cs)
			 'e'  -> ('\ESC', cs)  --GNU C extension
			 '\\' -> ('\\', cs)
			 '?'  -> ('?', cs)
			 '\'' -> ('\'', cs)
			 '"'  -> ('"', cs)
			 'x'  -> case head (readHex cs) of
			           (i, cs') -> (toEnum i, cs')
			 _    -> case head (readOct (c:cs)) of
			           (i, cs') -> (toEnum i, cs')
oneChar (c   :cs)    = (c, cs)

normalizeEscapes [] = []
normalizeEscapes cs = case oneChar cs of
                        (c, cs') -> c : normalizeEscapes cs'

adjustPos :: String -> Position -> Position
adjustPos str (Position fname row _) = Position fname' row' 0
  where
    str'            = dropWhite . drop 1 $ str
    (rowStr, str'') = span isDigit str'
    row'	    = read rowStr
    str'''	    = dropWhite str''
    fnameStr	    = takeWhile (/= '"') . drop 1 $ str'''
    fname'	    | null str''' || head str''' /= '"'	= fname
		    -- try and get more sharing of file name strings
		    | fnameStr == fname			= fname
		    | otherwise				= fnameStr
    --
    dropWhite = dropWhile (\c -> c == ' ' || c == '\t')

{-# INLINE token_ #-}
-- token that ignores the string
token_ :: (Position -> CToken) -> Position -> Int -> String -> P CToken
token_ tok pos _ _ = return (tok pos)

{-# INLINE token #-}
-- token that uses the string
token :: (Position -> a -> CToken) -> (String -> a)
      -> Position -> Int -> String -> P CToken
token tok read pos len str = return (tok pos (read $ take len str))


-- -----------------------------------------------------------------------------
-- The input type

type AlexInput = (Position, 	-- current position,
		  String)	-- current input string

alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar _ = error "alexInputPrevChar not used"

-- For alex >= 3.0
alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
alexGetByte (p,[]) = Nothing
alexGetByte (p,(c:s))  = let p' = alexMove p c in p' `seq`
                           Just (fromIntegral $ ord c, (p', s))

-- For alex < 3.0
alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
alexGetChar (p,[]) = Nothing
alexGetChar (p,(c:s))  = let p' = alexMove p c in p' `seq`
                           Just (c, (p', s))

alexMove :: Position -> Char -> Position
alexMove (Position f l c) '\t' = Position f l     (((c+7) `div` 8)*8+1)
alexMove (Position f l c) '\n' = Position f (l+1) 1
alexMove (Position f l c) _    = Position f l     (c+1)

lexicalError :: P a
lexicalError = do
  pos <- getPos
  (c:cs) <- getInput
  failP pos
        ["Lexical error!",
         "The character " ++ show c ++ " does not fit here."]

parseError :: P a
parseError = do
  tok <- getLastToken
  failP (posOf tok)
        ["Syntax error!",
         "The symbol `" ++ show tok ++ "' does not fit here."]

lexToken :: P CToken
lexToken = do
  pos <- getPos
  inp <- getInput
  case alexScan (pos, inp) 0 of
    AlexEOF -> return CTokEof
    AlexError inp' -> lexicalError
    AlexSkip  (pos', inp') len -> do
        setPos pos'
        setInput inp'
	lexToken
    AlexToken (pos', inp') len action -> do
        setPos pos'
        setInput inp'
        tok <- action pos len inp
        setLastToken tok
        return tok

lexC :: (CToken -> P a) -> P a
lexC cont = do
  tok <- lexToken
  cont tok
}