#!/usr/bin/env python3

'''Preprocess a C source file using gcc and convert the result into
   a token stream

Reference is C99:
  * http://www.open-std.org/JTC1/SC22/WG14/www/docs/n1124.pdf

'''

__docformat__ = 'restructuredtext'

import re
import sys

from . import lex
from .lex import TOKEN


PY2 = True
if sys.version_info.major == 3:
    PY2 = False
    long = int


tokens = (
    'HEADER_NAME', 'IDENTIFIER', 'PP_NUMBER', 'CHARACTER_CONSTANT',
    'STRING_LITERAL', 'OTHER',

    'PTR_OP', 'INC_OP', 'DEC_OP', 'LEFT_OP', 'RIGHT_OP', 'LE_OP', 'GE_OP',
    'EQ_OP', 'NE_OP', 'AND_OP', 'OR_OP', 'MUL_ASSIGN', 'DIV_ASSIGN',
    'MOD_ASSIGN', 'ADD_ASSIGN', 'SUB_ASSIGN', 'LEFT_ASSIGN', 'RIGHT_ASSIGN',
    'AND_ASSIGN', 'XOR_ASSIGN', 'OR_ASSIGN', 'PERIOD', 'ELLIPSIS',

    'LPAREN', 'NEWLINE',

    'PP_DEFINE', 'PP_DEFINE_NAME', 'PP_DEFINE_MACRO_NAME', 'PP_MACRO_PARAM',
    'PP_STRINGIFY', 'PP_IDENTIFIER_PASTE', 'PP_END_DEFINE'
)

states = [('DEFINE', 'exclusive')]

subs = {
    'D': r'[0-9]',
    'L': r'[a-zA-Z_]',
    'H': r'[a-fA-F0-9]',
    'E': r'[Ee][+-]?\s*{D}+',
    'FS': r'([FfLl]|d[dfl]|D[DFL]|[fFdD][0-9]+x?)',
    'IS': r'[uUlL]*',
}
# Helper: substitute {foo} with subs[foo] in a pattern string (makes the
# regexes below easier to read)
sub_pattern = re.compile('{([^}]*)}')


def sub_repl_match(m):
    return subs[m.groups()[0]]


def sub(s):
    return sub_pattern.sub(sub_repl_match, s)
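
# Example (illustrative): sub('{L}({L}|{D})*') expands to
# '[a-zA-Z_]([a-zA-Z_]|[0-9])*', the IDENTIFIER pattern used below.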

# --------------------------------------------------------------------------
# Token value types
# --------------------------------------------------------------------------

# Numbers are represented as int and float types; all other token values
# are plain str.

class StringLiteral(str):

    def __new__(cls, value):
        # Unescaping is probably not perfect, but close enough: strip the
        # surrounding quotes and zero-pad single-digit \x escapes.
        try:
            value = re.sub(r'\\x([0-9a-fA-F])(?![0-9a-fA-F])',
                           r'\\x0\1', value[1:-1])
        except ValueError:
            raise ValueError("invalid \\x escape in %s" % value)

        return str.__new__(cls, value)
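
# Example (illustrative): StringLiteral(r'"tab:\x9"') -> r'tab:\x09'
# (quotes stripped, single-digit \x escapes zero-padded; the text is
# otherwise left in escaped form).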

# --------------------------------------------------------------------------
# Token declarations
# --------------------------------------------------------------------------


punctuators = {
    # value: (regex, type)
    r'...': (r'\.\.\.', 'ELLIPSIS'),
    r'>>=': (r'>>=', 'RIGHT_ASSIGN'),
    r'<<=': (r'<<=', 'LEFT_ASSIGN'),
    r'+=': (r'\+=', 'ADD_ASSIGN'),
    r'-=': (r'-=', 'SUB_ASSIGN'),
    r'*=': (r'\*=', 'MUL_ASSIGN'),
    r'/=': (r'/=', 'DIV_ASSIGN'),
    r'%=': (r'%=', 'MOD_ASSIGN'),
    r'&=': (r'&=', 'AND_ASSIGN'),
    r'^=': (r'\^=', 'XOR_ASSIGN'),
    r'|=': (r'\|=', 'OR_ASSIGN'),
    r'>>': (r'>>', 'RIGHT_OP'),
    r'<<': (r'<<', 'LEFT_OP'),
    r'++': (r'\+\+', 'INC_OP'),
    r'--': (r'--', 'DEC_OP'),
    r'->': (r'->', 'PTR_OP'),
    r'&&': (r'&&', 'AND_OP'),
    r'||': (r'\|\|', 'OR_OP'),
    r'<=': (r'<=', 'LE_OP'),
    r'>=': (r'>=', 'GE_OP'),
    r'==': (r'==', 'EQ_OP'),
    r'!=': (r'!=', 'NE_OP'),
    r'<:': (r'<:', '['),
    r':>': (r':>', ']'),
    r'<%': (r'<%', '{'),
    r'%>': (r'%>', '}'),
    r';': (r';', ';'),
    r'{': (r'{', '{'),
    r'}': (r'}', '}'),
    r',': (r',', ','),
    r':': (r':', ':'),
    r'=': (r'=', '='),
    r')': (r'\)', ')'),
    r'[': (r'\[', '['),
    r']': (r']', ']'),
    r'.': (r'\.', 'PERIOD'),
    r'&': (r'&', '&'),
    r'!': (r'!', '!'),
    r'~': (r'~', '~'),
    r'-': (r'-', '-'),
    r'+': (r'\+', '+'),
    r'*': (r'\*', '*'),
    r'/': (r'/', '/'),
    r'%': (r'%', '%'),
    r'<': (r'<', '<'),
    r'>': (r'>', '>'),
    r'^': (r'\^', '^'),
    r'|': (r'\|', '|'),
    r'?': (r'\?', '?')
}


def punctuator_regex(punctuators):
    punctuator_regexes = [v[0] for v in punctuators.values()]
    if PY2:
        punctuator_regexes.sort(lambda a, b: -cmp(len(a), len(b)))
    else:
        punctuator_regexes.sort(key=lambda a: -len(a))
    return '(%s)' % '|'.join(punctuator_regexes)
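
# Sorting the alternatives longest-first makes the combined regex prefer
# maximal munch: '>>=' lexes as RIGHT_ASSIGN rather than as '>>' then '='.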


# Process line-number directives from the preprocessor
# See http://docs.freebsd.org/info/cpp/cpp.info.Output.html
DIRECTIVE = r'\#\s+(\d+)\s+"([^"]+)"[ \d]*\n'
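# For example, gcc emits linemarkers such as:
#     # 1 "/usr/include/stdio.h" 1 3 4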


@TOKEN(DIRECTIVE)
def t_ANY_directive(t):
    t.lexer.filename = t.groups[2]
    t.lexer.lineno = int(t.groups[1])
    return None


@TOKEN(punctuator_regex(punctuators))
def t_ANY_punctuator(t):
    t.type = punctuators[t.value][1]
    return t


IDENTIFIER = sub('{L}({L}|{D})*')


@TOKEN(IDENTIFIER)
def t_INITIAL_identifier(t):
    t.type = 'IDENTIFIER'
    return t


@TOKEN(IDENTIFIER)
def t_DEFINE_identifier(t):
    if t.lexer.next_is_define_name:
        # This identifier is the name of a macro
        # We need to look ahead and see if this macro takes parameters or not.
        if t.lexpos + len(t.value) < t.lexer.lexlen and \
                t.lexer.lexdata[t.lexpos + len(t.value)] == '(':

            t.type = 'PP_DEFINE_MACRO_NAME'

            # Look ahead and read macro parameter list
            lexdata = t.lexer.lexdata
            pos = t.lexpos + len(t.value) + 1
            while lexdata[pos] not in '\n)':
                pos += 1
            params = lexdata[t.lexpos + len(t.value) + 1: pos]
            paramlist = [x.strip() for x in params.split(",") if x.strip()]
            t.lexer.macro_params = paramlist

        else:
            t.type = 'PP_DEFINE_NAME'

        t.lexer.next_is_define_name = False
    elif t.value in t.lexer.macro_params:
        t.type = 'PP_MACRO_PARAM'
    else:
        t.type = 'IDENTIFIER'
    return t
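
# Example (illustrative): for '#define MAX(a, b) ((a) > (b) ? (a) : (b))',
# 'MAX' lexes as PP_DEFINE_MACRO_NAME with macro_params == ['a', 'b'], and
# each 'a' or 'b' in the body lexes as PP_MACRO_PARAM.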


FLOAT_LITERAL = sub(r"(?P<p1>{D}+)?(?P<dp>[.]?)(?P<p2>(?(p1){D}*|{D}+))"
                    r"(?P<exp>(?:[Ee][+-]?{D}+)?)(?P<suf>{FS}?)(?!\w)")


@TOKEN(FLOAT_LITERAL)
def t_ANY_float(t):
    t.type = 'PP_NUMBER'
    m = t.lexer.lexmatch

    p1 = m.group("p1")
    dp = m.group("dp")
    p2 = m.group("p2")
    exp = m.group("exp")
    suf = m.group("suf")

    if dp or exp or (suf and suf not in ("L", "l")):
        s = m.group(0)
        if suf:
            s = s[:-len(suf)]
        # Attach a prefix so the parser can figure out whether it should
        # become an integer, float, or long
        t.value = "f" + s
    elif suf and suf in ("L", "l"):
        t.value = "l" + p1
    else:
        t.value = "i" + p1

    return t
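
# Examples (illustrative): '1.5' and '1.5f' -> 'f1.5', '1e3' -> 'f1e3',
# '42L' -> 'l42', '42' -> 'i42'. Note this rule is defined before
# t_ANY_int, so it also claims plain integers.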


INT_LITERAL = sub(r"(?P<p1>(?:0x{H}+)|(?:{D}+))(?P<suf>{IS})")


@TOKEN(INT_LITERAL)
def t_ANY_int(t):
    t.type = 'PP_NUMBER'
    m = t.lexer.lexmatch

    # Use the named groups: numeric group indices are relative to the
    # combined master regex and are therefore unreliable here.
    if "L" in m.group("suf") or "l" in m.group("suf"):
        prefix = "l"
    else:
        prefix = "i"

    g1 = m.group("p1")
    if g1.startswith("0x"):
        # Convert base from hexadecimal
        g1 = str(long(g1[2:], 16))
    elif g1[0] == "0":
        # Convert base from octal
        g1 = str(long(g1, 8))

    t.value = prefix + g1

    return t
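
# Examples (illustrative): '0x1A' -> 'i26', '010' -> 'i8' (octal),
# '7UL' -> 'l7'. Values are normalized to decimal with an i/l prefix.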


CHARACTER_CONSTANT = sub(r"L?'(\\.|[^\\'])+'")


@TOKEN(CHARACTER_CONSTANT)
def t_ANY_character_constant(t):
    t.type = 'CHARACTER_CONSTANT'
    return t


STRING_LITERAL = sub(r'L?"(\\.|[^\\"])*"')


@TOKEN(STRING_LITERAL)
def t_ANY_string_literal(t):
    t.type = 'STRING_LITERAL'
    t.value = StringLiteral(t.value)
    return t


@TOKEN(r'\(')
def t_ANY_lparen(t):
    # A '(' not preceded by whitespace is tagged LPAREN so the parser can
    # tell a function-like macro definition from an object-like one.
    if t.lexpos == 0 or t.lexer.lexdata[t.lexpos - 1] not in ' \t\f\v\n':
        t.type = 'LPAREN'
    else:
        t.type = '('
    return t
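
# Whitespace before '(' is significant in a #define (C99 6.10.3):
#     #define F(a) ...    -> 'F' LPAREN ...  (function-like macro)
#     #define F (a) ...   -> 'F' '(' ...     (object-like macro)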


@TOKEN(r'\n')
def t_INITIAL_newline(t):
    t.lexer.lineno += 1
    return None


@TOKEN(r'\#define')
def t_INITIAL_pp_define(t):
    t.type = 'PP_DEFINE'
    t.lexer.begin("DEFINE")
    t.lexer.next_is_define_name = True
    t.lexer.macro_params = set()
    return t


@TOKEN(r'\n')
def t_DEFINE_newline(t):
    t.type = 'PP_END_DEFINE'
    t.lexer.begin("INITIAL")
    t.lexer.lineno += 1
    del t.lexer.macro_params

    # Damage control in case the token immediately after the #define failed
    # to handle this
    t.lexer.next_is_define_name = False
    return t


@TOKEN(r'(\#\#)|(\#)')
def t_DEFINE_pp_param_op(t):
    if t.value == '#':
        t.type = 'PP_STRINGIFY'
    else:
        t.type = 'PP_IDENTIFIER_PASTE'
    return t
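
# Example (illustrative): in '#define STR(x) #x' the '#' lexes as
# PP_STRINGIFY; in '#define CAT(a, b) a##b' the '##' lexes as
# PP_IDENTIFIER_PASTE.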


def t_INITIAL_error(t):
    t.type = 'OTHER'
    return t


def t_DEFINE_error(t):
    t.type = 'OTHER'
    t.value = t.value[0]
    t.lexer.lexpos += 1  # Skip it if it's an error in a #define
    return t


t_ANY_ignore = ' \t\v\f\r'
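
if __name__ == "__main__":
    # Minimal smoke test -- a sketch, not part of the original module. It
    # assumes the bundled lex module follows the standard PLY interface
    # (lex.lex() collects the t_* rules from the calling module) and that
    # this file is run as part of its package, since it uses relative
    # imports.
    lexer = lex.lex()
    lexer.filename = "<test>"
    lexer.input("#define ANSWER 42\n")
    for tok in iter(lexer.token, None):
        print(tok.type, repr(tok.value))
    # Expected stream (roughly): PP_DEFINE '#define',
    # PP_DEFINE_NAME 'ANSWER', PP_NUMBER 'i42', PP_END_DEFINE '\n'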