#! /usr/bin/env python3

# Copyright 2007 Google Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.

"""Parsing of C and C++ commands and extraction of search paths."""

__author__ = "opensource@google.com (Craig Silverstein, Nils Klarlund)"

import re
import os
import sys
import glob

import basics
import cache_basics

Debug = basics.Debug
DEBUG_TRACE = basics.DEBUG_TRACE
NotCoveredError = basics.NotCoveredError

# TODO(klarlund): Make mechanism for handling -U, -undef options, along with
# default symbols.

class ParseState:
  """Accumulates what is learned while scanning a compiler command line.

  The option tables in this module mutate an instance of this class while
  the command line is being walked.  Because this happens on a hot path,
  read and write the attributes directly.  The small setter methods exist
  only so that scalar attributes can be assigned from inside a lambda
  (where an assignment statement is not allowed).
  """

  def __init__(self):
    # Scalar settings.
    self.nostdinc = False     # set by -nostdinc
    self.language = 'none'    # equivalent to commandline of '-x none'
    self.isysroot = ""        # set by -isysroot
    self.sysroot = ""         # set by --sysroot=
    self.output_file = None   # set by -o
    self.iprefix = ""         # set by -iprefix

    # Lists that the option actions append to.
    self.file_names = []          # non-option words (candidate source files)
    self.quote_dirs = []          # -iquote
    self.include_files = []       # -include, -imacros, -Wa,<file>.s
    self.i_dirs = []              # -I, -iwithprefixbefore, -F
    self.before_system_dirs = []  # -isystem
    self.after_system_dirs = []   # -idirafter, -iwithprefix
    self.Dopts = []               # -D

  def set_nostdinc(self):
    self.nostdinc = True

  def set_language(self, x):
    self.language = x

  def set_isysroot(self, x):
    self.isysroot = x

  def set_sysroot(self, x):
    self.sysroot = x

  def set_outputfile(self, x):
    self.output_file = x

  def set_iprefix(self, x):
    self.iprefix = x

  def include_sysroot(self):
    """Return isysroot when it is non-empty, otherwise sysroot."""
    return self.isysroot or self.sysroot


def _SplitMacroArg(arg):
  """Split an arg as found in -Darg

  Argument:
    arg: argument

  Returns: [symb, val] when arg contains an '=' that is not its very first
    character, where symb is what is to the left of the first '=' and val is
    everything to its right; otherwise [arg].
  """
  eq_index = arg.find("=")
  if eq_index <= 0:
    # Either no '=' at all, or a leading '=' (no symbol name to split off).
    return [arg]
  return [arg[:eq_index], arg[eq_index + 1:]]


def _RaiseNotImplemented(name, comment=''):
  """Raise NotCoveredError reporting that option `name` is not implemented."""
  message = '%s is not implemented. %s' % (name, comment)
  raise NotCoveredError(message)

# These are the cpp options that a) are more than one letter long,
# b) always take an argument, and c) may either have that argument
# as a separate word in argv, or may have the argument concatenated
# after the option-name (eg, either "-include foo" or "-includefoo").
# These are taken from
#    http://gcc.gnu.org/onlinedocs/cpp/Invocation.html#Invocation
# and, more completely, from the gnu gcc info pages.
# Each option takes as a value, the function to run on the opt's argument.
# Below, ps is a ParseState object.
# TODO(csilvers): check for arg[0] == '=' for iquote, isystem
#
# NOTE: ordering matters.  When an option arrives with its argument
# concatenated ("-MFfoo"), the argument parser walks this dict in insertion
# order and takes the first key that is a prefix of the word.  An option that
# is itself a prefix of another option must therefore be listed AFTER the
# longer one ('-iwithprefixbefore' before '-iwithprefix'); otherwise
# "-iwithprefixbeforeDIR" would be read as '-iwithprefix' with the argument
# "beforeDIR".
CPP_OPTIONS_MAYBE_TWO_WORDS = {
  '-MF': lambda ps, arg: None,
  '-MT': lambda ps, arg: None,
  '-MQ': lambda ps, arg: None,
  '-arch': lambda ps, arg: None,
  '-target': lambda ps, arg: None,
  '-include': lambda ps, arg: ps.include_files.append(arg),
  '-imacros': lambda ps, arg: ps.include_files.append(arg),
  '-idirafter': lambda ps, arg: ps.after_system_dirs.append(arg),
  '-iprefix': lambda ps, arg: ps.set_iprefix(arg),
  '-iwithprefixbefore': lambda ps, arg: ps.i_dirs.append(
      os.path.join(ps.iprefix, arg)),
  '-iwithprefix': lambda ps, arg: ps.after_system_dirs.append(
      os.path.join(ps.iprefix, arg)),
  '-isysroot': lambda ps, arg: ps.set_isysroot(arg),
  '-imultilib': lambda ps, arg: _RaiseNotImplemented('-imultilib'),
  '-isystem': lambda ps, arg: ps.before_system_dirs.append(arg),
  '-iquote': lambda ps, arg: ps.quote_dirs.append(arg),
}
# Cheap pre-filter: the character right after the leading '-' of every key
# above must appear in this tuple.
CPP_OPTIONS_MAYBE_TWO_WORDS_FIRST_LETTERS = ('M', 'i', '-', 'a', 't')
# A "compile-time" check to make sure the first-letter list is up-to-date
for key in CPP_OPTIONS_MAYBE_TWO_WORDS:
  assert key[1] in CPP_OPTIONS_MAYBE_TWO_WORDS_FIRST_LETTERS

PATH_EXPR = '[/a-zA-Z_0-9.]+'  # regular expression for a partial file path

# These are the cpp options that require regular expressions, m is Match.
# Keys are regular expressions matched against the whole option word; the
# action receives the ParseState and the resulting match object.  The patterns
# are raw strings so that '\.' and '\[' reach the regex engine without being
# treated as (invalid) string escape sequences.
CPP_OPTIONS_REGULAR_EXPRESSIONS = {
  r'-Wa,(%s\.s)' % PATH_EXPR:
      lambda ps, m: ps.include_files.append(m.group(1)),
  r'-Wa,\[(%s\.s)\]' % PATH_EXPR:
      lambda ps, m: ps.include_files.append(m.group(1)),
}

# Cheap pre-filter: every regular-expression option starts with this text.
CPP_OPTIONS_REGULAR_EXPRESSIONS_STARTS_WITH = '-Wa,'
for key in CPP_OPTIONS_REGULAR_EXPRESSIONS:
  assert key.startswith(CPP_OPTIONS_REGULAR_EXPRESSIONS_STARTS_WITH)

# Same keys as CPP_OPTIONS_REGULAR_EXPRESSIONS, mapped to compiled patterns.
CPP_OPTIONS_REGULAR_EXPRESSIONS_COMPILED = {
    pattern: re.compile(pattern)
    for pattern in CPP_OPTIONS_REGULAR_EXPRESSIONS
}

# These are the cpp options that a) are more than one letter long,
# b) always take an argument, and c) must have that argument as a
# separate word in argv.
CPP_OPTIONS_ALWAYS_TWO_WORDS = {
  '-Xpreprocessor': lambda ps, arg: _RaiseNotImplemented('-Xpreprocessor'),

  # In order to parse correctly, this data structure needs to include
  # *all* two-word arguments that gcc accepts (we don't want to see
  # "gcc -aux-info foo" and think that foo is an output filename...)
  # This list is taken from the complete list from the gcc info page:
  # "Option Summary".  These aren't preprocessor-related, so are noops.
  '-aux-info': lambda ps, arg: None,
  '--param': lambda ps, arg: None,
  '-Xassembler': lambda ps, arg: None,
  '-Xlinker': lambda ps, arg: None,
}

# For efficiency, it's helpful to be able to combine the two above
CPP_OPTIONS_TWO_WORDS = {}
CPP_OPTIONS_TWO_WORDS.update(CPP_OPTIONS_MAYBE_TWO_WORDS)
CPP_OPTIONS_TWO_WORDS.update(CPP_OPTIONS_ALWAYS_TWO_WORDS)

# These are the cpp options that a) are more than one letter long,
# b) always take an argument, and c) have that argument separated from
# the option by '='.
CPP_OPTIONS_APPEARING_AS_ASSIGNMENTS = {
  '--sysroot': lambda ps, arg: ps.set_sysroot(arg)
}

# These are the cpp options that do not take an argument.
# (Note, most cpp options do not take an argument, but do not pertain to
# preprocessing, so we can ignore them.  Those are dealt in the default
# case in our processing loop.  This is only for no-argument options
# that we actually care about for preprocessing.)
# Actions in this table are called with the ParseState as their only argument.
# NOTE: ParseCommandArgs consults CPP_OPTIONS_ONE_LETTER first, and that table
# has a 'u' entry, so "-undef" is consumed there (as "-u ndef", a no-op)
# before this table is ever looked at.
CPP_OPTIONS_ONE_WORD = {
#  '-undef': lambda ps: _RaiseNotImplemented('-undef')
  '-undef': lambda ps: None,
  '-nostdinc': lambda ps: ps.set_nostdinc(),
  # TODO(csilvers): deal with -nostdinc++ as well?
}

# These are the cpp options that are one letter long, and take an
# argument.  In all such cases, the argument may either be the next
# word, or may be appended right after the letter.
CPP_OPTIONS_ONE_LETTER = {
  # Split on the first '=' only, so that "-DX=a=b" yields ['X', 'a=b'].
  'D': lambda ps, arg: ps.Dopts.append(_SplitMacroArg(arg)),
  'I': lambda ps, arg: ps.i_dirs.append(arg),
#  'U': lambda ps, arg: _RaiseNotImplemented('-U') # affects computed includes
  'U': lambda ps, arg: None,
  'o': lambda ps, arg: ps.set_outputfile(arg),
  'x': lambda ps, arg: ps.set_language(arg),

  # In order to parse correctly, this data structure needs to include
  # *all* two-word arguments that gcc accepts (we don't want to see
  # "gcc -L foo" and think that foo is an output filename...)  Since
  # most one-letter args can go as either '-Lfoo' or '-L foo', we need
  # to include (almost) all one-letter args in our list, even when we
  # don't care about them.  This list is taken from the complete list
  # from the gcc info page: "Option Summary".  Since these aren't
  # preprocessor-related, they are all noops.
  'A': lambda ps, arg: None,
  'l': lambda ps, arg: None,
  # Every <arg>/*/Headers directory that exists is added as an -I directory.
  'F': lambda ps, arg: ps.i_dirs.extend(
      glob.glob(os.path.join(arg, '*', 'Headers'))),
  'u': lambda ps, arg: None,
  'L': lambda ps, arg: None,
  'B': lambda ps, arg: None,
  'V': lambda ps, arg: None,
  'b': lambda ps, arg: None,
}


### DREADFUL PARSER + OPTIMIZED PARSER

# This parser was written after a *much* simpler parser using regular
# expression turned out to be too slow, two orders of magnitude slower
# than str.split. The parser below is faster than the one based on
# regular expression and more complete, so that's the one we keep.

NONSPACE_RE = re.compile(r'\S')  # don't use \S|$, which introduces backtracking
SPACE_RE = re.compile(r'\s')
# Kept for backward compatibility; the parser below uses QUOTE_RE instead,
# because this pattern cannot match a quote located exactly at the position
# where a search starts (it needs a preceding character, or the real start
# of the string).
NONESC_QUOTE_RE = re.compile(r'[^\\]"|^"')  # inefficient
QUOTE_RE = re.compile(r'(?<!\\)"')  # backtracking, could also be improved
ESC_QUOTE_RE = re.compile(r'\\"')


def _NextUnescapedQuote(line, pos):
  """Return the index of the first unescaped '"' at or after pos.

  Returns sys.maxsize when there is none.
  """
  m_quote = QUOTE_RE.search(line, pos)
  return m_quote.start() if m_quote else sys.maxsize


def _Unquote(text):
  """Drop unescaped double quotes from text, then turn each \\" into "."""
  return ESC_QUOTE_RE.sub('"', QUOTE_RE.sub('', text))


def ParseCommandLineSlowly(line):
  """Parse line as if it were issued in a shell.

  Split the line into a list of string arguments indicated by spaces,
  except that doubly quoted substrings are treated atomically. Also,
  do allow backslash escaped quotes; they are turned into regular
  quotes.  This function is written for efficiency; only very simple
  regular expressions are used in main loop.

  The parser is not needed when the include server is driven by
  distcc, because the distcc client passes the argv vector. It is used
  as part of a faster parser.

  Empty quoted strings ('""') and directly adjacent quoted segments
  ('"a""b"') are accepted; the former yields an empty argument.

  Arguments:
    line: the command line, a string
  Returns:
    a list of argument strings
  Raises:
    NotCoveredError: if line contains a single quote, or a double quote
      without a matching counterpart.
  """

  if "'" in line:
    raise NotCoveredError("Single-quotes not accepted in command line.")
  args = []
  # Set position of first quote if it exists.
  unesc_q = _NextUnescapedQuote(line, 0)
  m_nonspc = NONSPACE_RE.search(line, 0)
  if not m_nonspc:
    return args
  start = m_nonspc.start()
  end = start + 1
  while True:
    # Invariant: (1) start is at the beginning of the next argument
    # (perhaps at a quote, which will later be removed). (2) end is
    # such that line[start:end] is a prefix of the argument.
    assert start <= unesc_q
    assert start < end <= len(line), (start, end, len(line))
    assert not SPACE_RE.match(line, start)
    assert unesc_q == sys.maxsize or line[unesc_q] == '"'
    m_space = SPACE_RE.search(line, end)
    end = m_space.start() if m_space else len(line)
    if end < unesc_q:
      # We're good: no quotes found, we have an argument.
      args.append(_Unquote(line[start:end]))
      # Search for beginning of next argument.
      m_nonspc = NONSPACE_RE.search(line, end)
      if not m_nonspc:
        return args
      start = m_nonspc.start()
      # We have one character so far.
      end = start + 1
      continue
    # We found a quote. Look for its counterpart.
    assert start <= unesc_q < end
    if unesc_q == len(line) - 1:
      raise NotCoveredError("""Unexpected '"' at end of line.""")
    closing_q = _NextUnescapedQuote(line, unesc_q + 1)
    if closing_q == sys.maxsize:
      raise NotCoveredError("""Missing '"', could not parse command line.""")
    assert closing_q > unesc_q
    end = closing_q + 1
    if end == len(line):
      args.append(_Unquote(line[start:end]))
      return args
    # We found the counterpart before the end of the line. The argument may
    # still not be finished. But before continuing, look for the next quote.
    unesc_q = _NextUnescapedQuote(line, end)


def ParseCommandLine(line):
  """Parse line as it were issued in a shell (optimized).

  Arguments:
    line: the command line, a string
  Returns:
    a list of argument strings
  Raises:
    NotCoveredError: propagated from ParseCommandLineSlowly.
  """
  # It turns out that str.split() for large string (size 500) is almost two
  # orders of magnitude faster than ParseCommandLineSlowly.  Usually, when
  # there is a '"' this quote is near the beginning of the line (as in dX="some
  # thing").  We use this observation to apply split() to the suffix following
  # the last quote. In that way, only the prefix up to somewhere around the last
  # quote needs to be parsed by more sophisticated means.
  quote_pos = line.rfind('"')
  if quote_pos == -1:
    return line.split()
  # Walk forward to a space; the quote could be an escaped one in
  # the middle of non-space characters.
  good_pos = line.find(' ', quote_pos)
  if good_pos == -1:  # give up
    return ParseCommandLineSlowly(line)
  return ParseCommandLineSlowly(line[:good_pos]) + line[good_pos:].split()


# Alternation of the (regex-escaped) extensions in the domain of
# TRANSLATION_UNIT_MAP.
_TRANSLATION_UNIT_SUFFIXES = '|'.join(
    [re.escape(ext) for ext in basics.TRANSLATION_UNIT_MAP.keys()])

# Make a regular expression that matches suffixes of strings ending in
# a period followed by a string in the domain of TRANSLATION_UNIT_MAP.
TRANSLATION_UNIT_FILEPATH_RE = (
  re.compile(r".*[.](?P<suffix>%s)$" % _TRANSLATION_UNIT_SUFFIXES))

# Matches only the trailing ".<ext>" itself; used to strip the extension.
_TRANSLATION_UNIT_SUFFIX_RE = re.compile(
    r"[.](%s)$" % _TRANSLATION_UNIT_SUFFIXES)

# (option, action) pairs of CPP_OPTIONS_MAYBE_TWO_WORDS, longest option first,
# so that a concatenated "-iwithprefixbeforeDIR" is not mistaken for
# "-iwithprefix" with the argument "beforeDIR".
_MAYBE_TWO_WORDS_LONGEST_FIRST = sorted(
    CPP_OPTIONS_MAYBE_TWO_WORDS.items(),
    key=lambda item: len(item[0]),
    reverse=True)


def ParseCommandArgs(args, current_dir, includepath_map, dir_map,
                     compiler_defaults, timer=None):
  """Parse arguments like -I to make include directory lists.

  Arguments:
    args: list of arguments (strings); args[0] is the compiler
    current_dir: string
    includepath_map: a MapToIndex object
    dir_map: a DirectoryMapToIndex object
    compiler_defaults: a CompilerDefaults object
    timer: a basics.IncludeAnalyzerTimer object
  Returns:
    (quote_dirs, angle_dirs, include_files, source_file, source_file_prefix,
     dopts)
    where:
      quote_dirs: a tuple of dir_map-indexed directories
      angle_dirs: a tuple of dir_map-indexed directories
      include_files: a tuple of includepath_map-indexed files
      source_file: the name of the translation unit as found in args
      source_file_prefix: current_dir joined with the output file name with a
        trailing '.o' stripped, or (without -o) with the source file name
        with its extension stripped
      dopts: a list of items as returned by _SplitMacroArg
  Raises:
    NotCoveredError: if the command line cannot be handled (too few
      arguments, empty argument, option lacking its argument, unsupported
      option, not exactly one source file, unrecognized file extension).
  Modifies:
    compiler_defaults
  """
  if __debug__: Debug(DEBUG_TRACE, "ParseCommand %s" % args)

  assert isinstance(dir_map, cache_basics.DirectoryMapToIndex)
  assert isinstance(includepath_map, cache_basics.MapToIndex)

  parse_state = ParseState()

  if len(args) < 2:
    raise NotCoveredError("Command line: too few arguments.")

  compiler = args[0]
  num_args = len(args)

  i = 1
  while i < num_args:
    word = args[i]
    if not word:
      # An empty word has no first character to inspect below.
      raise NotCoveredError("Command line: empty argument.")

    # First, deal with everything that's not a flag-option
    if word[0] != '-' or word == '-':  # - is the stdin file
      if word.startswith('"-'):
        pass  # TODO(csilvers): parse arg inside quotes?
      else:
        parse_state.file_names.append(word)  # if not a flag, it's a file
      i += 1
      continue

    # Deal with the one-letter options -- the kind most commonly seen.
    # We need to figure out whether the option-argument is glommed on to
    # the end of the option ("-Dfoo"), or is a separate word ("-D foo").
    action = CPP_OPTIONS_ONE_LETTER.get(word[1])  # letter after the -
    if action:
      arg = word[2:]
      if arg:  # the glommed-onto-end case
        action(parse_state, arg)
        i += 1
      else:  # the separate-word case
        if i + 1 >= num_args:
          raise NotCoveredError("No argument found for option '%s'" % word)
        action(parse_state, args[i + 1])
        i += 2
      continue

    # Deal with the have-arg options with the arg as the 2nd word ("-MF foo").
    action = CPP_OPTIONS_TWO_WORDS.get(word)
    if action:
      if i + 1 >= num_args:
        raise NotCoveredError("No argument found for option '%s'" % word)
      action(parse_state, args[i + 1])
      i += 2
      continue

    # Deal with the have-arg options that appear as if assignments
    # ("--sysroot=/mumble").
    if '=' in word:
      arg, value = word.split('=', 1)
      action = CPP_OPTIONS_APPEARING_AS_ASSIGNMENTS.get(arg)
      if action:
        action(parse_state, value)
        i += 1
        continue

    # Deal with the options that take no arguments ("-nostdinc").
    action = CPP_OPTIONS_ONE_WORD.get(word)
    if action:
      action(parse_state)
      i += 1
      continue

    # From here on exactly one word is consumed, whatever it turns out to be.
    handled = False

    # Deal with the have-arg options with the arg concatenated to the word.
    # ("-MFfoo").  We do this late because it's slow.
    if word[1] in CPP_OPTIONS_MAYBE_TWO_WORDS_FIRST_LETTERS:  # filter
      for option, action in _MAYBE_TWO_WORDS_LONGEST_FIRST:
        if action and word.startswith(option):
          action(parse_state, word[len(option):])
          handled = True
          break

    # Deal with the complex options requiring regular expressions last.
    if (not handled
        and word.startswith(CPP_OPTIONS_REGULAR_EXPRESSIONS_STARTS_WITH)):
      for pattern, action in CPP_OPTIONS_REGULAR_EXPRESSIONS.items():
        m = CPP_OPTIONS_REGULAR_EXPRESSIONS_COMPILED[pattern].match(word)
        if action and m is not None:
          action(parse_state, m)
          break

    # Whatever is left must be a one-word option (that is, an option
    # without an arg) that it's safe to ignore.
    i += 1
  # Done parsing arguments!

  # Sanity-checking on arguments
  # -I- is a special form of the -I command.
  if "-" in parse_state.i_dirs:
    _RaiseNotImplemented('-I-', '(Use -iquote instead.)')

  if len(parse_state.file_names) != 1:
    raise NotCoveredError(
      "Could not locate name of translation unit: %s." % parse_state.file_names,
      send_email=False)

  source_file = parse_state.file_names[0]

  if parse_state.output_file:
    # Use output_file to create prefix
    source_file_prefix = re.sub("[.]o$", "", parse_state.output_file)
  else:
    # Remove suffix from source file
    source_file_prefix = _TRANSLATION_UNIT_SUFFIX_RE.sub("", source_file)
  source_file_prefix = os.path.join(current_dir, source_file_prefix)
  if parse_state.language == 'none':  # no explicit -x flag, or -x none
    language_match = TRANSLATION_UNIT_FILEPATH_RE.match(source_file)
    if not language_match:
      raise NotCoveredError(
          "For source file '%s': unrecognized filename extension" % source_file)
    suffix = language_match.group('suffix')
    parse_state.language = basics.TRANSLATION_UNIT_MAP[suffix]
  assert parse_state.language in basics.LANGUAGES

  sysroot = parse_state.include_sysroot()
  compiler_defaults.SetSystemDirsDefaults(compiler, sysroot,
                                          parse_state.language, timer)

  def IndexDirs(dir_list):
    """Normalize directory names and index.

    Each directory in dir_list is passed through basics.SafeNormPath and
    then indexed according to dir_map.
    """
    S = basics.SafeNormPath
    I = dir_map.Index
    return [I(S(d)) for d in dir_list]

  # Now string the directory lists together according to CPP semantics.
  angle_dirs = IndexDirs(parse_state.i_dirs)
  angle_dirs.extend(IndexDirs(parse_state.before_system_dirs))
  if not parse_state.nostdinc:
    angle_dirs.extend(
      IndexDirs(compiler_defaults.system_dirs_default
                [compiler][sysroot][parse_state.language]))
  angle_dirs.extend(IndexDirs(parse_state.after_system_dirs))

  quote_dirs = IndexDirs(parse_state.quote_dirs)
  quote_dirs.extend(angle_dirs)
  angle_dirs = tuple(angle_dirs)
  quote_dirs = tuple(quote_dirs)
  # Include files are meant to be sent to the server.  They do not pose the
  # danger of absolute includes, which includepath_map is designed to avoid.
  include_files = tuple(
      [includepath_map.Index(basics.SafeNormPath(f),
                             ignore_absolute_path_warning=True)
       for f in parse_state.include_files])

  if __debug__: Debug(DEBUG_TRACE, ("ParseCommand result: %s %s %s %s %s %s" %
                                    (quote_dirs, angle_dirs, include_files,
                                     source_file, source_file_prefix,
                                     parse_state.Dopts)))
  return (quote_dirs, angle_dirs, include_files, source_file, source_file_prefix,
          parse_state.Dopts)