#!/usr/local/bin/python3.8

'''Preprocess a C source file using gcc and convert the result into
   a token stream

Reference is C99:
  * http://www.open-std.org/JTC1/SC22/WG14/www/docs/n1124.pdf

'''

__docformat__ = 'restructuredtext'

import os
import re
import shlex
import sys
import tokenize
import traceback

import ctypes
from . import lex
from . import yacc
from .lex import TOKEN


# Python 2/3 compatibility: on Python 3 there is no ``long``, so alias it
# to ``int`` (used below when converting hex/octal literals).
PY2 = True
if sys.version_info.major == 3:
    PY2 = False
    long = int


tokens = (
    'HEADER_NAME', 'IDENTIFIER', 'PP_NUMBER', 'CHARACTER_CONSTANT',
    'STRING_LITERAL', 'OTHER',

    'PTR_OP', 'INC_OP', 'DEC_OP', 'LEFT_OP', 'RIGHT_OP', 'LE_OP', 'GE_OP',
    'EQ_OP', 'NE_OP', 'AND_OP', 'OR_OP', 'MUL_ASSIGN', 'DIV_ASSIGN',
    'MOD_ASSIGN', 'ADD_ASSIGN', 'SUB_ASSIGN', 'LEFT_ASSIGN', 'RIGHT_ASSIGN',
    'AND_ASSIGN', 'XOR_ASSIGN', 'OR_ASSIGN', 'PERIOD', 'ELLIPSIS',

    'LPAREN', 'NEWLINE',

    'PP_DEFINE', 'PP_DEFINE_NAME', 'PP_DEFINE_MACRO_NAME', 'PP_MACRO_PARAM',
    'PP_STRINGIFY', 'PP_IDENTIFIER_PASTE', 'PP_END_DEFINE'
)

# Exclusive lexer state entered while scanning the body of a #define.
states = [('DEFINE', "exclusive")]

# Named character-class fragments, spliced into regexes via sub() below.
subs = {
    'D': '[0-9]',
    'L': '[a-zA-Z_]',
    'H': '[a-fA-F0-9]',
    'E': r'[Ee][+-]?\s*{D}+',
    'FS': '([FfLl]|d[dfl]|D[DFL]|[fFdD][0-9]+x?)',
    'IS': '[uUlL]*',
}
# Helper: substitute {foo} with subs[foo] in string (makes regexes more lexy)
sub_pattern = re.compile('{([^}]*)}')


def sub_repl_match(m):
    """re.sub callback: map a ``{name}`` placeholder to its subs[] fragment."""
    return subs[m.groups()[0]]


def sub(s):
    """Expand all ``{name}`` placeholders in *s* using the subs table."""
    return sub_pattern.sub(sub_repl_match, s)

# --------------------------------------------------------------------------
# Token value types
# --------------------------------------------------------------------------

# Numbers represented as int and float types.
# For all other tokens, type is just str representation.


class StringLiteral(str):
    """A string token value with the surrounding quotes stripped.

    Single-digit ``\\x`` escapes are zero-padded to two digits (e.g.
    ``\\x9`` becomes ``\\x09``) so later unescaping is unambiguous.
    """

    def __new__(cls, value):
        # Unescaping probably not perfect but close enough.
        try:
            value = re.sub(r'\\x([0-9a-fA-F])(?![0-9a-fA-F])',
                           r'\\x0\\1', value[1:-1])
        except ValueError as e:
            raise ValueError("invalid \\x escape in %s" % value)

        return str.__new__(cls, value)

# --------------------------------------------------------------------------
# Token declarations
# --------------------------------------------------------------------------


punctuators = {
    # value: (regex, type)
    r'...': (r'\.\.\.', 'ELLIPSIS'),
    r'>>=': (r'>>=', 'RIGHT_ASSIGN'),
    r'<<=': (r'<<=', 'LEFT_ASSIGN'),
    r'+=': (r'\+=', 'ADD_ASSIGN'),
    r'-=': (r'-=', 'SUB_ASSIGN'),
    r'*=': (r'\*=', 'MUL_ASSIGN'),
    r'/=': (r'/=', 'DIV_ASSIGN'),
    r'%=': (r'%=', 'MOD_ASSIGN'),
    r'&=': (r'&=', 'AND_ASSIGN'),
    r'^=': (r'\^=', 'XOR_ASSIGN'),
    r'|=': (r'\|=', 'OR_ASSIGN'),
    r'>>': (r'>>', 'RIGHT_OP'),
    r'<<': (r'<<', 'LEFT_OP'),
    r'++': (r'\+\+', 'INC_OP'),
    r'--': (r'--', 'DEC_OP'),
    r'->': (r'->', 'PTR_OP'),
    r'&&': (r'&&', 'AND_OP'),
    r'||': (r'\|\|', 'OR_OP'),
    r'<=': (r'<=', 'LE_OP'),
    r'>=': (r'>=', 'GE_OP'),
    r'==': (r'==', 'EQ_OP'),
    r'!=': (r'!=', 'NE_OP'),
    r'<:': (r'<:', '['),
    r':>': (r':>', ']'),
    r'<%': (r'<%', '{'),
    r'%>': (r'%>', '}'),
    r';': (r';', ';'),
    r'{': (r'{', '{'),
    r'}': (r'}', '}'),
    r',': (r',', ','),
    r':': (r':', ':'),
    r'=': (r'=', '='),
    r')': (r'\)', ')'),
    r'[': (r'\[', '['),
    r']': (r']', ']'),
    r'.': (r'\.', 'PERIOD'),
    r'&': (r'&', '&'),
    r'!': (r'!', '!'),
    r'~': (r'~', '~'),
    r'-': (r'-', '-'),
    r'+': (r'\+', '+'),
    r'*': (r'\*', '*'),
    r'/': (r'/', '/'),
    r'%': (r'%', '%'),
    r'<': (r'<', '<'),
    r'>': (r'>', '>'),
    r'^': (r'\^', '^'),
    r'|': (r'\|', '|'),
    r'?': (r'\?', '?')
}


def punctuator_regex(punctuators):
    """Build one alternation regex matching every punctuator.

    Longest regexes come first so e.g. ``>>=`` wins over ``>>`` and ``>``.
    """
    punctuator_regexes = [v[0] for v in punctuators.values()]
    if PY2:
        punctuator_regexes.sort(lambda a, b: -cmp(len(a), len(b)))
    else:
        punctuator_regexes.sort(key=lambda a: -len(a))
    return '(%s)' % '|'.join(punctuator_regexes)


# Process line-number directives from the preprocessor
# See http://docs.freebsd.org/info/cpp/cpp.info.Output.html
DIRECTIVE = r'\#\s+(\d+)\s+"([^"]+)"[ \d]*\n'


@TOKEN(DIRECTIVE)
def t_ANY_directive(t):
    # Track the original filename/line number reported by the preprocessor.
    # BUG FIX: the previous code read ``t.groups[...]``, but PLY's LexToken
    # has no ``groups`` attribute (and group numbers in t.lexer.lexmatch
    # refer to PLY's combined master regex, not this rule).  Re-matching the
    # rule's own regex against the matched text gives stable group numbers.
    m = re.match(DIRECTIVE, t.value)
    t.lexer.filename = m.group(2)
    t.lexer.lineno = int(m.group(1))
    return None


@TOKEN(punctuator_regex(punctuators))
def t_ANY_punctuator(t):
    # Map the matched punctuator text to its declared token type.
    t.type = punctuators[t.value][1]
    return t


IDENTIFIER = sub('{L}({L}|{D})*')


@TOKEN(IDENTIFIER)
def t_INITIAL_identifier(t):
    t.type = 'IDENTIFIER'
    return t


@TOKEN(IDENTIFIER)
def t_DEFINE_identifier(t):
    """Identifier inside a #define: macro name, macro parameter, or plain."""
    if t.lexer.next_is_define_name:
        # This identifier is the name of a macro
        # We need to look ahead and see if this macro takes parameters or not.
        if t.lexpos + len(t.value) < t.lexer.lexlen and \
                t.lexer.lexdata[t.lexpos + len(t.value)] == '(':

            t.type = 'PP_DEFINE_MACRO_NAME'

            # Look ahead and read macro parameter list
            lexdata = t.lexer.lexdata
            pos = t.lexpos + len(t.value) + 1
            # Bounds check guards against a truncated input that ends
            # inside an unterminated parameter list.
            while pos < t.lexer.lexlen and lexdata[pos] not in '\n)':
                pos += 1
            params = lexdata[t.lexpos + len(t.value) + 1: pos]
            paramlist = [x.strip() for x in params.split(",") if x.strip()]
            t.lexer.macro_params = paramlist

        else:
            t.type = 'PP_DEFINE_NAME'

        t.lexer.next_is_define_name = False
    elif t.value in t.lexer.macro_params:
        t.type = 'PP_MACRO_PARAM'
    else:
        t.type = 'IDENTIFIER'
    return t


FLOAT_LITERAL = sub(r"(?P<p1>{D}+)?(?P<dp>[.]?)(?P<p2>(?(p1){D}*|{D}+))"
                    r"(?P<exp>(?:[Ee][+-]?{D}+)?)(?P<suf>{FS}?)(?!\w)")


@TOKEN(FLOAT_LITERAL)
def t_ANY_float(t):
    """Number with decimal point, exponent and/or float suffix.

    The token value is prefixed with 'f', 'l' or 'i' so the parser knows
    whether to build a float, long or int.
    """
    t.type = 'PP_NUMBER'
    m = t.lexer.lexmatch

    p1 = m.group("p1")      # digits before the decimal point (may be None)
    dp = m.group("dp")      # the decimal point itself, if present
    p2 = m.group("p2")      # digits after the decimal point
    exp = m.group("exp")    # exponent part, e.g. 'e-3'
    suf = m.group("suf")    # type suffix, e.g. 'f', 'L', 'df'

    if dp or exp or (suf and suf not in ("L", "l")):
        # A true floating literal (or one forced float by its suffix).
        s = m.group(0)
        if suf:
            s = s[:-len(suf)]
        # Attach a prefix so the parser can figure out if should become an
        # integer, float, or long
        t.value = "f" + s
    elif suf and suf in ("L", "l"):
        t.value = "l" + p1
    else:
        t.value = "i" + p1

    return t


INT_LITERAL = sub(r"(?P<p1>(?:0x{H}+)|(?:{D}+))(?P<suf>{IS})")


@TOKEN(INT_LITERAL)
def t_ANY_int(t):
    """Plain integer literal, hex/octal converted to decimal.

    The token value is prefixed with 'l' or 'i' depending on the suffix.
    """
    t.type = 'PP_NUMBER'
    m = t.lexer.lexmatch

    # Use the rule's named groups: numeric indices refer to PLY's combined
    # master regex and are positional accidents.  BUG FIX: the lowercase
    # 'l' suffix was previously looked up in the digits group and so was
    # never detected (e.g. ``123l`` lexed as a plain int, not a long).
    digits = m.group("p1")
    suffix = m.group("suf") or ""

    if "L" in suffix or "l" in suffix:
        prefix = "l"
    else:
        prefix = "i"

    if digits.startswith("0x"):
        # Convert base from hexadecimal
        digits = str(long(digits[2:], 16))
    elif digits[0] == "0":
        # Convert base from octal
        digits = str(long(digits, 8))

    t.value = prefix + digits

    return t


CHARACTER_CONSTANT = sub(r"L?'(\\.|[^\\'])+'")


@TOKEN(CHARACTER_CONSTANT)
def t_ANY_character_constant(t):
    t.type = 'CHARACTER_CONSTANT'
    return t


STRING_LITERAL = sub(r'L?"(\\.|[^\\"])*"')


@TOKEN(STRING_LITERAL)
def t_ANY_string_literal(t):
    t.type = 'STRING_LITERAL'
    t.value = StringLiteral(t.value)
    return t


@TOKEN(r'\(')
def t_ANY_lparen(t):
    # A '(' not preceded by whitespace is significant to the grammar
    # (e.g. it distinguishes a function-like macro's parameter list).
    if t.lexpos == 0 or t.lexer.lexdata[t.lexpos - 1] not in (' \t\f\v\n'):
        t.type = 'LPAREN'
    else:
        t.type = '('
    return t


@TOKEN(r'\n')
def t_INITIAL_newline(t):
    t.lexer.lineno += 1
    return None


@TOKEN(r'\#define')
def t_INITIAL_pp_define(t):
    # Enter the DEFINE state; the next identifier is the macro's name.
    t.type = 'PP_DEFINE'
    t.lexer.begin("DEFINE")
    t.lexer.next_is_define_name = True
    t.lexer.macro_params = set()
    return t


@TOKEN(r'\n')
def t_DEFINE_newline(t):
    # End of the #define body: return to the INITIAL state.
    t.type = 'PP_END_DEFINE'
    t.lexer.begin("INITIAL")
    t.lexer.lineno += 1
    del t.lexer.macro_params

    # Damage control in case the token immediately after the #define failed
    # to handle this
    t.lexer.next_is_define_name = False
    return t


@TOKEN(r'(\#\#)|(\#)')
def t_DEFINE_pp_param_op(t):
    # Inside a #define body, '#' stringifies a parameter and '##' pastes
    # two tokens together.
    t.type = 'PP_STRINGIFY' if t.value == '#' else 'PP_IDENTIFIER_PASTE'
    return t


def t_INITIAL_error(t):
    # Unrecognised input outside a #define: emit it as an OTHER token.
    t.type = 'OTHER'
    return t


def t_DEFINE_error(t):
    # Unrecognised input inside a #define: emit the first offending
    # character as OTHER and advance past it so lexing can continue.
    t.type = 'OTHER'
    t.value = t.value[0]
    t.lexer.lexpos += 1  # Skip it if it's an error in a #define
    return t


# Whitespace skipped in every lexer state (newlines are handled above).
t_ANY_ignore = ' \t\v\f\r'