1#! /usr/bin/env python3 2 3# Copyright 2007 Google Inc. 4# 5# This program is free software; you can redistribute it and/or 6# modify it under the terms of the GNU General Public License 7# as published by the Free Software Foundation; either version 2 8# of the License, or (at your option) any later version. 9# 10# This program is distributed in the hope that it will be useful, 11# but WITHOUT ANY WARRANTY; without even the implied warranty of 12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13# GNU General Public License for more details. 14# 15# You should have received a copy of the GNU General Public License 16# along with this program; if not, write to the Free Software 17# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 18# USA. 19 20"""A very fast directives-only parser for C and C++ source code. 21 22We parse only the following directives: 23 #include (the standard C/C++ inclusion mechanism) 24 #include_next (a GNU C/C++ extension) 25 #import (an Objective-C feature, similar to #include) 26 #define (because #defines can affect the results of '#include MACRO') 27""" 28 29__author__ = 'Nils Klarlund' 30 31import re 32import time 33 34import basics 35import cache_basics 36import statistics 37 38Debug = basics.Debug 39DEBUG_TRACE = basics.DEBUG_TRACE 40DEBUG_TRACE2 = basics.DEBUG_TRACE2 41NotCoveredError = basics.NotCoveredError 42 43# For coarse and fast scanning 44RE_INCLUDE_DEFINE = re.compile("include|define|import") 45 46# For fine-grained, but slow backtracking, parsing 47POUND_SIGN_RE = re.compile(r""" 48 ^ # start of line 49 [ \t]* # space(s) 50 ([*][/])? # a possible ..*/ ending block comment 51 [ \t]* # space(s) 52 ([/][*] [^\n]* [*][/])* # initial block comment(s) /*...*/ 53 [ \t]* # space(s) 54 (?P<directive> # group('directive') -- what we're after 55 [#] # the pound sign 56 [ \t]* # space(s) 57 (define|include_next|include|import)\b # the directive 58 ((?!\\\n).)* # the rest on this line: zero or more 59 # characters, each not a backslash that 60 # is followed by \n 61 (\\\n((?!\\\n).)*)* # (backslash + \n + rest of line)* 62 ) 63 """, re.VERBOSE + re.MULTILINE) 64 65 66NOT_COMMA_OR_PARENS = "([^(),])" 67 68# For parsing macro expressions of the form: 69# symbol 70# symbol (something, ..., something), where something is not ',', '(', or ')' 71MACRO_EXPR = r""" 72 (?P<symbol>\w+) # the symbol, named 'symbol' 73 ( \s* 74 [(] \s* # beginning parenthesis 75 (?P<args> # a parenthesized expression (with no 76 # containing expressions -- a limitation) 77 # named 'args' 78 %(NOT_COMMA_OR_PARENS)s* # the first argument (if it exists) 79 ([,]%(NOT_COMMA_OR_PARENS)s*)* # subsequent arguments 80 ) 81 [)] # ending parenthesis 82 )?""" % {'NOT_COMMA_OR_PARENS': NOT_COMMA_OR_PARENS} 83 84MACRO_EXPR_RE = re.compile(MACRO_EXPR, re.VERBOSE) 85 86# Nice little parser of certain directive lines (after backslash-ended 87# line continuations and comments are removed) 88DIRECTIVE_RE = re.compile(r""" 89 ^[ \t]* 90 [#] 91 [ \t]* 92 ( 93 ((?P<include> include_next | include | import) 94 \s* 95 ( "(?P<quote> (\w|[_/.,+-])*)" | # "bar/foo.h" 96 <(?P<angle> (\w|[_/.,+-])*)> | # <stdio.h> 97 (?P<expr> .*?)) # expr, match . minimally 98 ) 99 | 100 (?P<define> define \s+ (?P<lhs> %s) # insert MACRO_EXPR here 101 \s* (?P<rhs> .*?)) # match . minimally before 102 # trailing white space 103 ) 104 \s* # trailing whitespace 105 ((/[*]|//).*)? # optional trailing comment start 106 $ 107 """ % MACRO_EXPR, 108 re.VERBOSE) 109 110# 111INCLUDE_STRING_RE = re.compile(r""" 112 ^ 113 \s* 114 ( "\s*(?P<quote> (\w|[\\_/.,+-])*)\s*" | 115 <\s*(?P<angle> (\w|[\\_/.,+-])*)\s*> 116 ) 117 \s* 118 $ 119""", re.VERBOSE) 120 121# For ridding lines of backslash 122BACKSLASH_RE = re.compile(r"\\\n", re.MULTILINE) 123 124# For matching non-comment prefix of line. 125COMMENT_RE = re.compile(r"((?!/[*]|//).)*") 126 127# FOR SEARCHING AFTER /* .. */. 128PAIRED_COMMENT_RE = re.compile(r"(/[*].*?[*]/)") 129 130 131def InsertMacroDefInTable(lhs, rhs, symbol_table, callback_function): 132 """Insert the definition of a pair (lhs, rhs) into symbol table. 133 134 Arguments: 135 lhs: a string, of the form "symbol" or "symbol(param1, ..., paramN)" 136 rhs: a string 137 symbol_table: where the definition will be inserted 138 callback_function: a function called with value "symbol" 139 """ 140 m_expr = MACRO_EXPR_RE.match(lhs) 141 if m_expr.end(0) != len(lhs): 142 raise NotCoveredError( 143 "Unexpected macro definition with LHS: '%s'." % lhs) 144 # Calculate the definition df, either 145 # - a pair ([arg_1, .., arg_n], rhs) where arg_i is the 146 # i'th formal parameter (function-like macro definition), or 147 # - just a symbol (object-like macro definition) 148 if m_expr.group('args') != None: # perhaps '' 149 # A function-like macro definition. 150 # Construct pair (list of formal parameters, rhs). 151 args = m_expr.group('args').split(',') 152 df = args, rhs 153 # lhs is adjusted to be just the 'function' name 154 lhs = m_expr.group('symbol') 155 else: # m_expr.group('args') 156 # An object-like macro definition 157 assert m_expr.group('symbol') == lhs 158 df = rhs 159 if lhs not in symbol_table: 160 symbol_table[lhs] = [df] 161 else: 162 symbol_table[lhs].append(df) 163 callback_function(lhs) 164 165 166class ParseFile(object): 167 """Parser class for syntax understood by CPP, the C and C++ 168 preprocessor. An instance of this class defines the Parse method.""" 169 170 def __init__(self, includepath_map): 171 """Constructor. Make a parser. 172 173 Arguments: 174 includepath_map: string-to-index map for includepaths 175 """ 176 assert isinstance(includepath_map, cache_basics.MapToIndex) 177 self.includepath_map = includepath_map 178 self.define_callback = lambda x: None 179 180 def SetDefineCallback(self, callback_function): 181 """Set a callback function, which is invoked for '#define's. 182 183 The function is called as callback_function(symbol), whenever a '#define' 184 of symbol is parsed. The callback allows an include processor to adjust 185 its notion of which expressions are still current. If we (the include 186 processor) already met 187 188 #define A B 189 190 and later meet 191 192 #define B 193 194 whether this is the first definition of B or not, then the possible 195 meanings of A have changed. We set up a callback to identify such 196 situations.""" 197 198 self.define_callback = callback_function 199 200 def _ParseFine(self, poundsign_match, includepath_map_index, file_contents, 201 symbol_table, quote_includes, angle_includes, expr_includes, 202 next_includes): 203 """Helper function for ParseFile.""" 204 Debug(DEBUG_TRACE2, "_ParseFine %s", 205 file_contents[poundsign_match.start('directive'): 206 poundsign_match.end('directive')]) 207 m = DIRECTIVE_RE.match( # parse the directive 208 PAIRED_COMMENT_RE.sub( # remove possible paired comments 209 "", 210 BACKSLASH_RE.sub( # get rid of lines ending in backslash 211 "", 212 file_contents[poundsign_match.start('directive'): 213 poundsign_match.end('directive')]))) 214 if m: 215 try: 216 groupdict = m.groupdict() 217 if groupdict['include'] == 'include' or \ 218 groupdict['include'] == 'import': 219 if groupdict['quote']: 220 quote_includes.append(includepath_map_index(m.group('quote'))) 221 elif groupdict['angle']: 222 angle_includes.append(includepath_map_index(m.group('angle'))) 223 elif groupdict['expr']: 224 expr_includes.append(m.group('expr').rstrip()) 225 else: 226 assert False 227 elif groupdict['include'] == 'include_next': 228 # We do not, in fact, distinguish between the two kinds of 229 # include_next's, because we conservatively assume that they are of 230 # the quote variety. 231 if groupdict['quote']: 232 next_includes.append(includepath_map_index(m.group('quote'))) 233 elif groupdict['angle']: 234 next_includes.append(includepath_map_index(m.group('angle'))) 235 # The following restriction would not be too hard to remove. 236 elif groupdict['expr']: 237 NotCoveredError( 238 "For include_next: cannot deal with computed include here.") 239 else: 240 assert False 241 raise NotCoveredError("include_next not parsed") 242 elif groupdict['define']: 243 if not groupdict['lhs']: 244 raise NotCoveredError("Unexpected macro definition with no LHS.") 245 else: 246 lhs = m.group('lhs') 247 rhs = groupdict['rhs'] and groupdict['rhs'] or None 248 InsertMacroDefInTable(lhs, rhs, symbol_table, self.define_callback) 249 except NotCoveredError as inst: 250 # Decorate this exception with the filename, by recreating it 251 # appropriately. 252 if not inst.source_file: 253 raise NotCoveredError(inst.args[0], 254 self.filepath, 255 send_email = inst.send_email) 256 else: 257 raise 258 259 def Parse(self, filepath, symbol_table): 260 """Parse filepath for preprocessor directives and update symbol table. 261 262 Arguments: 263 filepath: a string 264 symbol_table: a dictionary, see module macro_expr 265 266 Returns: 267 (quote_includes, angle_includes, expr_includes, next_includes), where 268 all are lists of filepath indices, except for expr_includes, which is a 269 list of expressions. 270 """ 271 Debug(DEBUG_TRACE, "ParseFile %s", filepath) 272 273 assert isinstance(filepath, str) 274 self.filepath = filepath 275 parse_file_start_time = time.perf_counter() 276 statistics.parse_file_counter += 1 277 278 includepath_map_index = self.includepath_map.Index 279 280 try: 281 fd = open(filepath, "r", encoding='latin-1') 282 except IOError as msg: 283 # This normally does not happen because the file should be known to 284 # exists. Still there might be, say, a permissions issue that prevents it 285 # from being read. 286 raise NotCoveredError("Parse file: '%s': %s" % (filepath, msg), 287 send_email=False) 288 289 file_contents = fd.read() 290 fd.close() 291 292 quote_includes, angle_includes, expr_includes, next_includes = ( 293 [], [], [], []) 294 295 i = 0 296 line_start_last = None 297 298 while True: 299 300 # Scan coarsely to find something of interest 301 mfast = RE_INCLUDE_DEFINE.search(file_contents, i + 1) 302 if not mfast: break 303 i = mfast.end() 304 # Identify the line of interest by scanning backwards to \n 305 line_start = file_contents.rfind("\n", 0, i) + 1 # to beginning of line 306 # Now, line_start is -1 if \n was not found. 307 308 ### TODO(klarlund) continue going back if line continuation preceeding 309 310 # Is this really a new line? 311 if line_start == line_start_last: continue 312 line_start_last = line_start 313 314 # Here we should really skip back over lines to see whether a totally 315 # pathological situation involving '\'-terminated lines like: 316 # 317 # #include <stdio.h> 318 # # Start of pathological situation involving line continuations: 319 # # \ 320 # \ 321 # \ 322 # \ 323 # include "nidgaard.h" 324 # 325 # occurs, where the first # on each line is just Python syntax and should 326 # not be considered as part of the C/C++ example. This code defines a 327 # valid directive to include "nidgaard.h". We will not handle such 328 # situations correctly -- the include will be missed. 329 330 # Parse the line of interest according to fine-grained parser 331 poundsign_match = POUND_SIGN_RE.match(file_contents, line_start) 332 333 if not poundsign_match: 334 continue 335 336 self._ParseFine(poundsign_match, includepath_map_index, file_contents, 337 symbol_table, quote_includes, angle_includes, 338 expr_includes, next_includes) 339 340 341 statistics.parse_file_total_time += time.perf_counter() - parse_file_start_time 342 343 return (quote_includes, angle_includes, expr_includes, next_includes) 344