1 --  C -> Haskell Compiler: Lexer for C Header Files
2 --
3 --  Author : Manuel M T Chakravarty, Duncan Coutts
4 --  Created: 24 May 2005
5 --
6 --  Version $Revision: 1.1.2.1 $ from $Date: 2005/06/14 00:16:14 $
7 --
8 --  Copyright (c) [1999..2004] Manuel M T Chakravarty
9 --  Copyright (c) 2005 Duncan Coutts
10 --
11 --  This file is free software; you can redistribute it and/or modify
12 --  it under the terms of the GNU General Public License as published by
13 --  the Free Software Foundation; either version 2 of the License, or
14 --  (at your option) any later version.
15 --
16 --  This file is distributed in the hope that it will be useful,
17 --  but WITHOUT ANY WARRANTY; without even the implied warranty of
18 --  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 --  GNU General Public License for more details.
20 --
21 --- DESCRIPTION ---------------------------------------------------------------
22 --
23 --  Lexer for C header files after being processed by the C preprocessor
24 --
25 --- DOCU ----------------------------------------------------------------------
26 --
27 --  language: Haskell 98
28 --
29 --  We assume that the input already went through cpp.  Thus, we do not handle
30 --  comments and preprocessor directives here.  The lexer recognizes all tokens
31 --  of ANCI C except those occurring only in function bodies.  It supports the
32 --  C99 `restrict' extension: <http://www.lysator.liu.se/c/restrict.html> as
33 --  well as inline functions.
34 --
35 --  Comments:
36 --
37 --  * There is no support for the optional feature of extended characters (see
38 --    K&R A2.5.2) or the corresponding strings (A2.6).
39 --
40 --  * We add `typedef-name' (K&R 8.9) as a token, as proposed in K&R A13.
41 --    However, as these tokens cannot be recognized lexically, but require a
42 --    context analysis, they are never produced by the lexer, but instead have
43 --    to be introduced in a later phase (by converting the corresponding
44 --    identifiers).
45 --
46 --  * We also recognize GNU C `__attribute__', `__extension__', `__const',
47 --    `__const__', `__inline', `__inline__', `__restrict', and `__restrict__'.
48 --
49 --  * Any line starting with `#pragma' is ignored.
50 --
51 --  With K&R we refer to ``The C Programming Language'', second edition, Brain
52 --  W. Kernighan and Dennis M. Ritchie, Prentice Hall, 1988.
53 --
54 --- TODO ----------------------------------------------------------------------
55 --
56 --  * `showsPrec' of `CTokCLit' should produce K&R-conforming escapes;
57 --    same for `CTokSLit'
58 --
59 --  * There are more GNU C specific keywords.  Add them and change `CParser'
60 --    correspondingly (in particular, most tokens within __attribute ((...))
61 --    expressions are actually keywords, but we handle them as identifiers at
62 --    the moment).
63 --
64 
65 {
66 
67 module CLexer (lexC, parseError) where
68 
69 import Data.Char      (isDigit, ord)
70 import Data.Word      (Word8)
71 import Numeric   (readDec, readOct, readHex)
72 
73 import Position  (Position(..), Pos(posOf))
74 import Errors    (interr)
75 import UNames	 (Name)
76 import Idents    (Ident, lexemeToIdent, identToLexeme)
77 
78 import CTokens
79 import CParserMonad
80 
81 }
82 
83 $space = [ \ \t ] -- horizontal white space
84 $eol   = \n
85 
86 $letter   = [a-zA-Z_]
87 $octdigit = 0-7
88 $digit    = 0-9
89 $digitNZ  = 1-9
90 $hexdigit = [0-9a-fA-F]
91 
92 $inchar   = \0-\255 # [ \\ \' \n \r ]
93 $instr    = \0-\255 # [ \\ \" \n \r ]
94 $anyButNL = \0-\255 # \n
95 $infname  = \ -\255 # [ \\ \" ]
96 $visible  = \ -\127
97 
98 @int = $digitNZ$digit*
99 @sp  = $space*
100 
101 -- character escape sequence (follows K&R A2.5.2)
102 --
103 --  * also used for strings
104 --
105 @charesc  = \\([ntvbrfae\\\?\'\"]|$octdigit{1,3}|x$hexdigit+)
106 
107 -- components of float constants (follows K&R A2.5.3)
108 --
109 @digits    = $digit+
110 @intpart   = @digits
111 @fractpart = @digits
112 @mantpart  = @intpart?\.@fractpart|@intpart\.
113 @exppart   = [eE][\+\-]?@digits
114 @suffix    = [fFlLqQwW]
115 
116 
117 tokens :-
118 
119 -- whitespace (follows K&R A2.1)
120 --
121 --  * horizontal and vertical tabs, newlines, and form feeds are filter out by
122 --   `Lexers.ctrlLexer'
123 --
124 --  * comments are not handled, as we assume the input already went through cpp
125 --
126 $white+					;
127 
128 -- #line directive (K&R A12.6)
129 --
130 --  * allows further ints after the file name a la GCC; as the GCC CPP docu
131 --   doesn't say how many ints there can be, we allow an unbound number
132 --
133 \#$space*@int$space*(\"($infname|@charesc)*\"$space*)?(@int$space*)*$eol
134   { \pos len str -> setPos (adjustPos (take len str) pos) >> lexToken }
135 
136 -- #pragma directive (K&R A12.8)
137 --
138 --  * we simply ignore any #pragma (but take care to update the position
139 --   information)
140 --
141 \#$space*pragma$anyButNL*$eol		;
142 
143 -- #itent directive, eg used by rcs/cvs
144 --
145 --  * we simply ignore any #itent (but take care to update the position
146 --   information)
147 --
148 \#$space*ident$anyButNL*$eol		;
149 
150 -- identifiers and keywords (follows K&R A2.3 and A2.4)
151 --
152 $letter($letter|$digit)*	{ \pos len str -> idkwtok (take len str) pos }
153 
154 -- constants (follows K&R A2.5)
155 --
156 --  * K&R explicit mentions `enumeration-constants'; however, as they are
157 --   lexically identifiers, we do not have an extra case for them
158 --
159 
160 -- integer constants (follows K&R A2.5.1)
161 --
162 0$octdigit*[uUlL]{0,3}		{ token CTokILit (fst . head . readOct) }
163 $digitNZ$digit*[uUlL]{0,3}	{ token CTokILit (fst . head . readDec) }
164 0[xX]$hexdigit*[uUlL]{0,3}	{ token CTokILit (fst . head . readHex . drop 2) }
165 
166 -- character constants (follows K&R A2.5.2)
167 --
168 \'($inchar|@charesc)\'	{ token CTokCLit (fst . oneChar . tail) }
169 L\'($inchar|@charesc)\'	{ token CTokCLit (fst . oneChar . tail . tail) }
170 
171 -- float constants (follows K&R A2.5.3)
172 --
173 (@mantpart@exppart?|@intpart@exppart)@suffix?	{ token CTokFLit id }
174 
175 -- string literal (follows K&R A2.6)
176 --
177 \"($instr|@charesc)*\"			{ token CTokSLit normalizeEscapes }
178 L\"($instr|@charesc)*\"			{ token CTokSLit (normalizeEscapes . tail) }
179 
180 
181 -- operators and separators
182 --
183 "("	{ token_ CTokLParen }
184 ")"	{ token_ CTokRParen  }
185 "["	{ token_ CTokLBracket }
186 "]"	{ token_ CTokRBracket }
187 "->"	{ token_ CTokArrow }
188 "."	{ token_ CTokDot }
189 "!"	{ token_ CTokExclam }
190 "~"	{ token_ CTokTilde }
191 "++"	{ token_ CTokInc }
192 "--"	{ token_ CTokDec }
193 "+"	{ token_ CTokPlus }
194 "-"	{ token_ CTokMinus }
195 "*"	{ token_ CTokStar }
196 "/"	{ token_ CTokSlash }
197 "%"	{ token_ CTokPercent }
198 "&"	{ token_ CTokAmper }
199 "<<"	{ token_ CTokShiftL }
200 ">>"	{ token_ CTokShiftR }
201 "<"	{ token_ CTokLess }
202 "<="	{ token_ CTokLessEq }
203 ">"	{ token_ CTokHigh }
204 ">="	{ token_ CTokHighEq }
205 "=="	{ token_ CTokEqual }
206 "!="	{ token_ CTokUnequal }
207 "^"	{ token_ CTokHat }
208 "|"	{ token_ CTokBar }
209 "&&"	{ token_ CTokAnd }
210 "||"	{ token_ CTokOr }
211 "?"	{ token_ CTokQuest }
212 ":"	{ token_ CTokColon }
213 "="	{ token_ CTokAssign }
214 "+="	{ token_ CTokPlusAss }
215 "-="	{ token_ CTokMinusAss }
216 "*="	{ token_ CTokStarAss }
217 "/="	{ token_ CTokSlashAss }
218 "%="	{ token_ CTokPercAss }
219 "&="	{ token_ CTokAmpAss }
220 "^="	{ token_ CTokHatAss }
221 "|="	{ token_ CTokBarAss }
222 "<<="	{ token_ CTokSLAss }
223 ">>="	{ token_ CTokSRAss }
224 ","	{ token_ CTokComma }
225 \;	{ token_ CTokSemic }
226 "{"	{ token_ CTokLBrace }
227 "}"	{ token_ CTokRBrace }
228 "..."	{ token_ CTokEllipsis }
229 
230 
231 {
232 
233 -- We use the odd looking list of string patterns here rather than normal
234 -- string literals since GHC converts the latter into a sequence of string
235 -- comparisons (ie a linear search) but it translates the former using its
236 -- efficient pattern matching which gives us the expected radix-style search.
237 -- This gives change makes a significant performance difference.
238 --
239 idkwtok :: String -> Position -> P CToken
240 idkwtok ('a':'l':'i':'g':'n':'o':'f':[])		     = tok CTokAlignof
241 idkwtok ('_':'_':'a':'l':'i':'g':'n':'o':'f':[])	     = tok CTokAlignof
242 idkwtok ('_':'_':'a':'l':'i':'g':'n':'o':'f':'_':'_':[])     = tok CTokAlignof
243 idkwtok ('a':'s':'m':[])				     = tok CTokAsm
244 idkwtok ('_':'_':'a':'s':'m':[])			     = tok CTokAsm
245 idkwtok ('_':'_':'a':'s':'m':'_':'_':[])		     = tok CTokAsm
246 idkwtok ('a':'u':'t':'o':[])				     = tok CTokAuto
247 idkwtok ('b':'r':'e':'a':'k':[])			     = tok CTokBreak
248 idkwtok ('_':'B':'o':'o':'l':[])			     = tok CTokBool
249 idkwtok ('c':'a':'s':'e':[])				     = tok CTokCase
250 idkwtok ('c':'h':'a':'r':[])				     = tok CTokChar
251 idkwtok ('c':'o':'n':'s':'t':[])			     = tok CTokConst
252 idkwtok ('_':'_':'c':'o':'n':'s':'t':[])		     = tok CTokConst
253 idkwtok ('_':'_':'c':'o':'n':'s':'t':'_':'_':[])	     = tok CTokConst
254 idkwtok ('c':'o':'n':'t':'i':'n':'u':'e':[])		     = tok CTokContinue
255 idkwtok ('_':'C':'o':'m':'p':'l':'e':'x':[])		     = tok CTokComplex
256 idkwtok ('d':'e':'f':'a':'u':'l':'t':[])		     = tok CTokDefault
257 idkwtok ('d':'o':[])					     = tok CTokDo
258 idkwtok ('d':'o':'u':'b':'l':'e':[])			     = tok CTokDouble
259 idkwtok ('e':'l':'s':'e':[])				     = tok CTokElse
260 idkwtok ('e':'n':'u':'m':[])				     = tok CTokEnum
261 idkwtok ('e':'x':'t':'e':'r':'n':[])			     = tok CTokExtern
262 idkwtok ('_':'_':'f':'l':'o':'a':'t':'1':'2':'8':[])           = tok CTokFloat128
263 idkwtok ('f':'l':'o':'a':'t':[])			     = tok CTokFloat
264 idkwtok ('f':'o':'r':[])				     = tok CTokFor
265 idkwtok ('g':'o':'t':'o':[])				     = tok CTokGoto
266 idkwtok ('i':'f':[])					     = tok CTokIf
267 idkwtok ('i':'n':'l':'i':'n':'e':[])			     = tok CTokInline
268 idkwtok ('_':'_':'i':'n':'l':'i':'n':'e':[])		     = tok CTokInline
269 idkwtok ('_':'_':'i':'n':'l':'i':'n':'e':'_':'_':[])	     = tok CTokInline
270 idkwtok ('i':'n':'t':[])				     = tok CTokInt
271 idkwtok ('_':'_':'u':'i':'n':'t':'1':'2':'8':'_':'t':[])     = tok CTokInt
272 idkwtok ('_':'_':'i':'n':'t':'1':'2':'8':'_':'t':[])	     = tok CTokInt
273 idkwtok ('_':'_':'u':'i':'n':'t':'1':'2':'8':[])	     = tok CTokInt
274 idkwtok ('_':'_':'i':'n':'t':'1':'2':'8':[])		     = tok CTokInt
275 idkwtok ('l':'o':'n':'g':[])				     = tok CTokLong
276 idkwtok ('r':'e':'g':'i':'s':'t':'e':'r':[])		     = tok CTokRegister
277 idkwtok ('r':'e':'s':'t':'r':'i':'c':'t':[])		     = tok CTokRestrict
278 idkwtok ('_':'_':'r':'e':'s':'t':'r':'i':'c':'t':[])	     = tok CTokRestrict
279 idkwtok ('_':'_':'r':'e':'s':'t':'r':'i':'c':'t':'_':'_':[]) = tok CTokRestrict
280 idkwtok ('r':'e':'t':'u':'r':'n':[])			     = tok CTokReturn
281 idkwtok ('s':'h':'o':'r':'t':[])			     = tok CTokShort
282 idkwtok ('s':'i':'g':'n':'e':'d':[])			     = tok CTokSigned
283 idkwtok ('_':'_':'s':'i':'g':'n':'e':'d':[])		     = tok CTokSigned
284 idkwtok ('_':'_':'s':'i':'g':'n':'e':'d':'_':'_':[])	     = tok CTokSigned
285 idkwtok ('s':'i':'z':'e':'o':'f':[])			     = tok CTokSizeof
286 idkwtok ('s':'t':'a':'t':'i':'c':[])			     = tok CTokStatic
287 idkwtok ('s':'t':'r':'u':'c':'t':[])			     = tok CTokStruct
288 idkwtok ('s':'w':'i':'t':'c':'h':[])			     = tok CTokSwitch
289 idkwtok ('t':'y':'p':'e':'d':'e':'f':[])		     = tok CTokTypedef
290 idkwtok ('t':'y':'p':'e':'o':'f':[])			     = tok CTokTypeof
291 idkwtok ('_':'_':'t':'y':'p':'e':'o':'f':[])		     = tok CTokTypeof
292 idkwtok ('_':'_':'t':'y':'p':'e':'o':'f':'_':'_':[])	     = tok CTokTypeof
293 idkwtok ('_':'_':'t':'h':'r':'e':'a':'d':[])		     = tok CTokThread
294 idkwtok ('u':'n':'i':'o':'n':[])			     = tok CTokUnion
295 idkwtok ('u':'n':'s':'i':'g':'n':'e':'d':[])		     = tok CTokUnsigned
296 idkwtok ('v':'o':'i':'d':[])				     = tok CTokVoid
297 idkwtok ('v':'o':'l':'a':'t':'i':'l':'e':[])		     = tok CTokVolatile
298 idkwtok ('_':'_':'v':'o':'l':'a':'t':'i':'l':'e':[])	     = tok CTokVolatile
299 idkwtok ('_':'_':'v':'o':'l':'a':'t':'i':'l':'e':'_':'_':[]) = tok CTokVolatile
300 idkwtok ('w':'h':'i':'l':'e':[])			     = tok CTokWhile
301 idkwtok ('_':'_':'l':'a':'b':'e':'l':'_':'_':[])             = tok CTokLabel
302 idkwtok ('_':'_':'a':'t':'t':'r':'i':'b':'u':'t':'e':[]) = tok (CTokGnuC GnuCAttrTok)
303 --						ignoreAttribute >> lexToken
304 idkwtok ('_':'_':'a':'t':'t':'r':'i':'b':'u':'t':'e':'_':'_':[]) = tok (CTokGnuC GnuCAttrTok)
305 --						ignoreAttribute >> lexToken
306 idkwtok ('_':'_':'e':'x':'t':'e':'n':'s':'i':'o':'n':'_':'_':[]) =
307 						tok (CTokGnuC GnuCExtTok)
308 idkwtok ('_':'_':'b':'u':'i':'l':'t':'i':'n':'_':rest)
309         | rest == "va_arg"             = tok (CTokGnuC GnuCVaArg)
310         | rest == "offsetof"           = tok (CTokGnuC GnuCOffsetof)
311         | rest == "types_compatible_p" = tok (CTokGnuC GnuCTyCompat)
312 
313 idkwtok cs = \pos -> do
314   name <- getNewName
315   let ident = lexemeToIdent pos cs name
316   tyident <- isTypeIdent ident
317   if tyident
318     then return (CTokTyIdent pos ident)
319     else return (CTokIdent   pos ident)
320 
321 ignoreAttribute :: P ()
322 ignoreAttribute = skipTokens 0
323   where skipTokens n = do
324           tok <- lexToken
325           case tok of
326             CTokRParen _ | n == 1    -> return ()
327                          | otherwise -> skipTokens (n-1)
328             CTokLParen _             -> skipTokens (n+1)
329             _                        -> skipTokens n
330 
331 tok :: (Position -> CToken) -> Position -> P CToken
332 tok tc pos = return (tc pos)
333 
334 -- converts the first character denotation of a C-style string to a character
335 -- and the remaining string
336 --
337 oneChar             :: String -> (Char, String)
338 oneChar ('\\':c:cs)  = case c of
339 			 'n'  -> ('\n', cs)
340 			 't'  -> ('\t', cs)
341 			 'v'  -> ('\v', cs)
342 			 'b'  -> ('\b', cs)
343 			 'r'  -> ('\r', cs)
344 			 'f'  -> ('\f', cs)
345 			 'a'  -> ('\a', cs)
346 			 'e'  -> ('\ESC', cs)  --GNU C extension
347 			 '\\' -> ('\\', cs)
348 			 '?'  -> ('?', cs)
349 			 '\'' -> ('\'', cs)
350 			 '"'  -> ('"', cs)
351 			 'x'  -> case head (readHex cs) of
352 			           (i, cs') -> (toEnum i, cs')
353 			 _    -> case head (readOct (c:cs)) of
354 			           (i, cs') -> (toEnum i, cs')
355 oneChar (c   :cs)    = (c, cs)
356 
357 normalizeEscapes [] = []
358 normalizeEscapes cs = case oneChar cs of
359                         (c, cs') -> c : normalizeEscapes cs'
360 
361 adjustPos :: String -> Position -> Position
362 adjustPos str (Position fname row _) = Position fname' row' 0
363   where
364     str'            = dropWhite . drop 1 $ str
365     (rowStr, str'') = span isDigit str'
366     row'	    = read rowStr
367     str'''	    = dropWhite str''
368     fnameStr	    = takeWhile (/= '"') . drop 1 $ str'''
369     fname'	    | null str''' || head str''' /= '"'	= fname
370 		    -- try and get more sharing of file name strings
371 		    | fnameStr == fname			= fname
372 		    | otherwise				= fnameStr
373     --
374     dropWhite = dropWhile (\c -> c == ' ' || c == '\t')
375 
376 {-# INLINE token_ #-}
377 -- token that ignores the string
378 token_ :: (Position -> CToken) -> Position -> Int -> String -> P CToken
379 token_ tok pos _ _ = return (tok pos)
380 
381 {-# INLINE token #-}
382 -- token that uses the string
383 token :: (Position -> a -> CToken) -> (String -> a)
384       -> Position -> Int -> String -> P CToken
385 token tok read pos len str = return (tok pos (read $ take len str))
386 
387 
388 -- -----------------------------------------------------------------------------
389 -- The input type
390 
391 type AlexInput = (Position, 	-- current position,
392 		  String)	-- current input string
393 
394 alexInputPrevChar :: AlexInput -> Char
395 alexInputPrevChar _ = error "alexInputPrevChar not used"
396 
397 -- For alex >= 3.0
398 alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
399 alexGetByte (p,[]) = Nothing
400 alexGetByte (p,(c:s))  = let p' = alexMove p c in p' `seq`
401                            Just (fromIntegral $ ord c, (p', s))
402 
403 -- For alex < 3.0
404 alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
405 alexGetChar (p,[]) = Nothing
406 alexGetChar (p,(c:s))  = let p' = alexMove p c in p' `seq`
407                            Just (c, (p', s))
408 
409 alexMove :: Position -> Char -> Position
410 alexMove (Position f l c) '\t' = Position f l     (((c+7) `div` 8)*8+1)
411 alexMove (Position f l c) '\n' = Position f (l+1) 1
412 alexMove (Position f l c) _    = Position f l     (c+1)
413 
414 lexicalError :: P a
415 lexicalError = do
416   pos <- getPos
417   (c:cs) <- getInput
418   failP pos
419         ["Lexical error!",
420          "The character " ++ show c ++ " does not fit here."]
421 
422 parseError :: P a
423 parseError = do
424   tok <- getLastToken
425   failP (posOf tok)
426         ["Syntax error!",
427          "The symbol `" ++ show tok ++ "' does not fit here."]
428 
429 lexToken :: P CToken
430 lexToken = do
431   pos <- getPos
432   inp <- getInput
433   case alexScan (pos, inp) 0 of
434     AlexEOF -> return CTokEof
435     AlexError inp' -> lexicalError
436     AlexSkip  (pos', inp') len -> do
437         setPos pos'
438         setInput inp'
439 	lexToken
440     AlexToken (pos', inp') len action -> do
441         setPos pos'
442         setInput inp'
443         tok <- action pos len inp
444         setLastToken tok
445         return tok
446 
447 lexC :: (CToken -> P a) -> P a
448 lexC cont = do
449   tok <- lexToken
450   cont tok
451 }
452