1#! /usr/bin/env python3
2
3# Copyright 2007 Google Inc.
4#
5# This program is free software; you can redistribute it and/or
6# modify it under the terms of the GNU General Public License
7# as published by the Free Software Foundation; either version 2
8# of the License, or (at your option) any later version.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13# GNU General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License
16# along with this program; if not, write to the Free Software
17# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301,
18# USA.
19
20"""A very fast directives-only parser for C and C++ source code.
21
22We parse only the following directives:
23  #include       (the standard C/C++ inclusion mechanism)
24  #include_next  (a GNU C/C++ extension)
25  #import        (an Objective-C feature, similar to #include)
26  #define        (because #defines can affect the results of '#include MACRO')
27"""
28
29__author__ = 'Nils Klarlund'
30
31import re
32import time
33
34import basics
35import cache_basics
36import statistics
37
38Debug = basics.Debug
39DEBUG_TRACE = basics.DEBUG_TRACE
40DEBUG_TRACE2 = basics.DEBUG_TRACE2
41NotCoveredError = basics.NotCoveredError
42
43# For coarse and fast scanning
44RE_INCLUDE_DEFINE = re.compile("include|define|import")
45
46# For fine-grained, but slow backtracking, parsing
47POUND_SIGN_RE = re.compile(r"""
48  ^                            # start of line
49  [ \t]*                       # space(s)
50  ([*][/])?                    # a possible ..*/ ending block comment
51  [ \t]*                       # space(s)
52  ([/][*] [^\n]* [*][/])*      # initial block comment(s) /*...*/
53  [ \t]*                       # space(s)
54  (?P<directive>               # group('directive') -- what we're after
55   [#]                         # the pound sign
56   [ \t]*                      # space(s)
57   (define|include_next|include|import)\b # the directive
58   ((?!\\\n).)*                 # the rest on this line: zero or more
59                                # characters, each not a backslash that
60                                # is followed by \n
61   (\\\n((?!\\\n).)*)*          # (backslash + \n + rest of line)*
62  )
63  """, re.VERBOSE + re.MULTILINE)
64
65
66NOT_COMMA_OR_PARENS = "([^(),])"
67
68# For parsing macro expressions of the form:
69#  symbol
70#  symbol  (something, ..., something), where something is not ',', '(', or ')'
71MACRO_EXPR = r"""
72 (?P<symbol>\w+)                   # the symbol, named 'symbol'
73  ( \s*
74    [(] \s*                        # beginning parenthesis
75    (?P<args>                      # a parenthesized expression (with no
76                                   # containing expressions -- a limitation)
77                                   # named 'args'
78    %(NOT_COMMA_OR_PARENS)s*        # the first argument (if it exists)
79     ([,]%(NOT_COMMA_OR_PARENS)s*)* # subsequent arguments
80    )
81  [)]                              # ending parenthesis
82  )?""" % {'NOT_COMMA_OR_PARENS': NOT_COMMA_OR_PARENS}
83
84MACRO_EXPR_RE = re.compile(MACRO_EXPR, re.VERBOSE)
85
86# Nice little parser of certain directive lines (after backslash-ended
87# line continuations and comments are removed)
88DIRECTIVE_RE = re.compile(r"""
89 ^[ \t]*
90  [#]
91  [ \t]*
92  (
93   ((?P<include> include_next | include | import)
94    \s*
95    ( "(?P<quote> (\w|[_/.,+-])*)"            |  # "bar/foo.h"
96      <(?P<angle> (\w|[_/.,+-])*)>            |  # <stdio.h>
97      (?P<expr>   .*?))                         # expr, match . minimally
98    )
99   |
100   (?P<define> define \s+ (?P<lhs> %s)    # insert MACRO_EXPR here
101                      \s* (?P<rhs> .*?))  # match . minimally before
102                                          # trailing white space
103  )
104  \s*                                     # trailing whitespace
105  ((/[*]|//).*)?                          # optional trailing comment start
106  $
107  """ % MACRO_EXPR,
108  re.VERBOSE)
109
110#
111INCLUDE_STRING_RE = re.compile(r"""
112  ^
113  \s*
114  ( "\s*(?P<quote> (\w|[\\_/.,+-])*)\s*"            |
115    <\s*(?P<angle> (\w|[\\_/.,+-])*)\s*>
116  )
117  \s*
118  $
119""", re.VERBOSE)
120
121# For ridding lines of backslash
122BACKSLASH_RE = re.compile(r"\\\n", re.MULTILINE)
123
124# For matching non-comment prefix of line.
125COMMENT_RE = re.compile(r"((?!/[*]|//).)*")
126
127# FOR SEARCHING AFTER /* .. */.
128PAIRED_COMMENT_RE = re.compile(r"(/[*].*?[*]/)")
129
130
131def InsertMacroDefInTable(lhs, rhs, symbol_table, callback_function):
132  """Insert the definition of a pair (lhs, rhs) into symbol table.
133
134  Arguments:
135    lhs: a string, of the form "symbol" or "symbol(param1, ..., paramN)"
136    rhs: a string
137    symbol_table: where the definition will be inserted
138    callback_function: a function called with value "symbol"
139  """
140  m_expr = MACRO_EXPR_RE.match(lhs)
141  if m_expr.end(0) != len(lhs):
142    raise NotCoveredError(
143      "Unexpected macro definition with LHS: '%s'." % lhs)
144  # Calculate the definition df, either
145  # - a pair ([arg_1, .., arg_n], rhs) where arg_i is the
146  #   i'th formal parameter (function-like macro definition), or
147  # - just a symbol (object-like macro definition)
148  if m_expr.group('args') != None:  # perhaps ''
149    # A function-like macro definition.
150    # Construct pair (list of formal parameters, rhs).
151    args = m_expr.group('args').split(',')
152    df = args, rhs
153    # lhs is adjusted to be just the 'function' name
154    lhs = m_expr.group('symbol')
155  else: # m_expr.group('args')
156    # An object-like macro definition
157    assert m_expr.group('symbol') == lhs
158    df = rhs
159  if lhs not in symbol_table:
160    symbol_table[lhs] = [df]
161  else:
162    symbol_table[lhs].append(df)
163  callback_function(lhs)
164
165
166class ParseFile(object):
167  """Parser class for syntax understood by CPP, the C and C++
168  preprocessor. An instance of this class defines the Parse method."""
169
170  def __init__(self, includepath_map):
171    """Constructor. Make a parser.
172
173    Arguments:
174      includepath_map: string-to-index map for includepaths
175    """
176    assert isinstance(includepath_map, cache_basics.MapToIndex)
177    self.includepath_map = includepath_map
178    self.define_callback = lambda x: None
179
180  def SetDefineCallback(self, callback_function):
181    """Set a callback function, which is invoked for '#define's.
182
183     The function is called as callback_function(symbol), whenever a '#define'
184     of symbol is parsed.  The callback allows an include processor to adjust
185     its notion of which expressions are still current. If we (the include
186     processor) already met
187
188             #define A B
189
190     and later meet
191
192             #define B
193
194     whether this is the first definition of B or not, then the possible
195     meanings of A have changed.  We set up a callback to identify such
196     situations."""
197
198    self.define_callback = callback_function
199
200  def _ParseFine(self, poundsign_match, includepath_map_index, file_contents,
201                 symbol_table, quote_includes, angle_includes, expr_includes,
202                 next_includes):
203    """Helper function for ParseFile."""
204    Debug(DEBUG_TRACE2, "_ParseFine %s",
205      file_contents[poundsign_match.start('directive'):
206                    poundsign_match.end('directive')])
207    m = DIRECTIVE_RE.match(  # parse the directive
208          PAIRED_COMMENT_RE.sub( # remove possible paired comments
209            "",
210            BACKSLASH_RE.sub(   # get rid of lines ending in backslash
211              "",
212              file_contents[poundsign_match.start('directive'):
213                            poundsign_match.end('directive')])))
214    if m:
215      try:
216        groupdict = m.groupdict()
217        if groupdict['include'] == 'include' or \
218           groupdict['include'] == 'import':
219          if groupdict['quote']:
220            quote_includes.append(includepath_map_index(m.group('quote')))
221          elif groupdict['angle']:
222            angle_includes.append(includepath_map_index(m.group('angle')))
223          elif groupdict['expr']:
224            expr_includes.append(m.group('expr').rstrip())
225          else:
226            assert False
227        elif groupdict['include'] == 'include_next':
228          # We do not, in fact, distinguish between the two kinds of
229          # include_next's, because we conservatively assume that they are of
230          # the quote variety.
231          if groupdict['quote']:
232            next_includes.append(includepath_map_index(m.group('quote')))
233          elif groupdict['angle']:
234            next_includes.append(includepath_map_index(m.group('angle')))
235          # The following restriction would not be too hard to remove.
236          elif groupdict['expr']:
237            NotCoveredError(
238              "For include_next: cannot deal with computed include here.")
239          else:
240            assert False
241            raise NotCoveredError("include_next not parsed")
242        elif groupdict['define']:
243          if not groupdict['lhs']:
244            raise NotCoveredError("Unexpected macro definition with no LHS.")
245          else:
246            lhs = m.group('lhs')
247            rhs = groupdict['rhs'] and groupdict['rhs'] or None
248            InsertMacroDefInTable(lhs, rhs, symbol_table, self.define_callback)
249      except NotCoveredError as inst:
250        # Decorate this exception with the filename, by recreating it
251        # appropriately.
252        if not inst.source_file:
253          raise NotCoveredError(inst.args[0],
254                                self.filepath,
255                                send_email = inst.send_email)
256        else:
257          raise
258
259  def Parse(self, filepath, symbol_table):
260    """Parse filepath for preprocessor directives and update symbol table.
261
262    Arguments:
263      filepath: a string
264      symbol_table: a dictionary, see module macro_expr
265
266    Returns:
267      (quote_includes, angle_includes, expr_includes, next_includes), where
268      all are lists of filepath indices, except for expr_includes, which is a
269      list of expressions.
270    """
271    Debug(DEBUG_TRACE, "ParseFile %s", filepath)
272
273    assert isinstance(filepath, str)
274    self.filepath = filepath
275    parse_file_start_time = time.perf_counter()
276    statistics.parse_file_counter += 1
277
278    includepath_map_index = self.includepath_map.Index
279
280    try:
281      fd = open(filepath, "r", encoding='latin-1')
282    except IOError as msg:
283      # This normally does not happen because the file should be known to
284      # exists. Still there might be, say, a permissions issue that prevents it
285      # from being read.
286      raise NotCoveredError("Parse file: '%s': %s" % (filepath, msg),
287                            send_email=False)
288
289    file_contents = fd.read()
290    fd.close()
291
292    quote_includes, angle_includes, expr_includes, next_includes = (
293      [], [], [], [])
294
295    i = 0
296    line_start_last = None
297
298    while True:
299
300      # Scan coarsely to find something of interest
301      mfast = RE_INCLUDE_DEFINE.search(file_contents, i + 1)
302      if not mfast: break
303      i = mfast.end()
304      # Identify the line of interest by scanning backwards to \n
305      line_start = file_contents.rfind("\n", 0, i) + 1 # to beginning of line
306      # Now, line_start is -1 if \n was not found.
307
308      ### TODO(klarlund) continue going back if line continuation preceeding
309
310      # Is this really a new line?
311      if line_start == line_start_last: continue
312      line_start_last = line_start
313
314      # Here we should really skip back over lines to see whether a totally
315      # pathological situation involving '\'-terminated lines like:
316      #
317      # #include <stdio.h>
318      # # Start of pathological situation involving line continuations:
319      # # \
320      #    \
321      #     \
322      #      \
323      #       include     "nidgaard.h"
324      #
325      # occurs, where the first # on each line is just Python syntax and should
326      # not be considered as part of the C/C++ example. This code defines a
327      # valid directive to include "nidgaard.h". We will not handle such
328      # situations correctly -- the include will be missed.
329
330      # Parse the line of interest according to fine-grained parser
331      poundsign_match = POUND_SIGN_RE.match(file_contents, line_start)
332
333      if not poundsign_match:
334        continue
335
336      self._ParseFine(poundsign_match, includepath_map_index, file_contents,
337                      symbol_table, quote_includes, angle_includes,
338                      expr_includes, next_includes)
339
340
341    statistics.parse_file_total_time += time.perf_counter() - parse_file_start_time
342
343    return (quote_includes, angle_includes, expr_includes, next_includes)
344