1#! /usr/bin/env python3
2
3# Copyright 2007 Google Inc.
4#
5# This program is free software; you can redistribute it and/or
6# modify it under the terms of the GNU General Public License
7# as published by the Free Software Foundation; either version 2
8# of the License, or (at your option) any later version.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13# GNU General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License
16# along with this program; if not, write to the Free Software
17# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301,
18# USA.
19
20"""Parsing of C and C++ commands and extraction of search paths."""
21
22__author__ = "opensource@google.com (Craig Silverstein, Nils Klarlund)"
23
24import re
25import os
26import sys
27import glob
28
29import basics
30import cache_basics
31
# Module-level aliases for names defined in the basics module, so the rest of
# this file can refer to them without the 'basics.' prefix.
Debug = basics.Debug
DEBUG_TRACE = basics.DEBUG_TRACE
NotCoveredError = basics.NotCoveredError
35
36# TODO(klarlund): Make mechanism for handling -U, -undef options, along with
37# default symbols.
38
class ParseState:
  """Accumulator for everything learned while parsing a command line.

  Parsing is performance sensitive, so callers are expected to read and
  write the attributes directly.  The set_* methods exist only so that the
  scalar attributes can be assigned from inside a lambda (where an
  assignment statement is not allowed).
  """

  def __init__(self):
    # Flag and list-valued state.
    self.nostdinc = False
    self.file_names = []
    self.quote_dirs = []
    self.include_files = []
    self.i_dirs = []
    self.before_system_dirs = []
    self.after_system_dirs = []

    # Scalar state.
    self.language = 'none'    # equivalent to commandline of '-x none'
    self.isysroot = ""
    self.sysroot = ""
    self.output_file = None
    self.iprefix = ""
    self.Dopts = []

  def set_nostdinc(self):
    self.nostdinc = True

  def set_language(self, x):
    self.language = x

  def set_isysroot(self, x):
    self.isysroot = x

  def set_sysroot(self, x):
    self.sysroot = x

  def set_outputfile(self, x):
    self.output_file = x

  def set_iprefix(self, x):
    self.iprefix = x

  def include_sysroot(self):
    """Return isysroot when it is non-empty, otherwise sysroot."""
    return self.isysroot or self.sysroot
69
70def _SplitMacroArg(arg):
71  """Split an arg as found in -Darg
72
73  Argument:
74    arg: argument
75
76  Returns: [arg] if there is no '=' in arg, otherwise [symb, val], where symb is
77    what is to the left of '=' and val is what is to the right.
78  """
79  pos = arg.find("=")
80  if pos > 0:
81    return [arg[:pos], arg[pos + 1:]]
82  else:
83    return [arg]
84
def _RaiseNotImplemented(name, comment=''):
  """Raise NotCoveredError saying that `name` is not implemented.

  Arguments:
    name: what is not implemented (for example an option name)
    comment: optional text appended to the message

  Raises:
    NotCoveredError: always
  """
  message = '%s is not implemented.  %s' % (name, comment)
  raise NotCoveredError(message)
87
88# These are the cpp options that a) are more than one letter long,
89# b) always take an argument, and c) may either have that argument
90# as a separate word in argv, or may have the argument concatenated
91# after the option-name (eg, either "-include foo" or "-includefoo").
92# These are taken from
93#    http://gcc.gnu.org/onlinedocs/cpp/Invocation.html#Invocation
94# and, more completely, from the gnu gcc info pages.
95# Each option takes as a value, the function to run on the opt's argument.
96# Below, ps is a ParseState object.
97# TODO(csilvers): check for arg[0] == '=' for iquote, isystem
# Maps each option to the action run on its argument; actions are called as
# action(ps, arg).
#
# NOTE: entry order matters.  When the argument is concatenated to the option
# ("-includefoo"), the option is found by scanning this dict in order and
# taking the first key that the word starts with.  An option that has another
# option as a prefix must therefore be listed BEFORE that shorter option
# ('-iwithprefixbefore' before '-iwithprefix'); otherwise
# "-iwithprefixbeforeDIR" would be handled as '-iwithprefix' with the
# argument "beforeDIR".
CPP_OPTIONS_MAYBE_TWO_WORDS = {
  '-MF':            lambda ps, arg: None,
  '-MT':            lambda ps, arg: None,
  '-MQ':            lambda ps, arg: None,
  '-arch':          lambda ps, arg: None,
  '-target':        lambda ps, arg: None,
  '-include':       lambda ps, arg: ps.include_files.append(arg),
  '-imacros':       lambda ps, arg: ps.include_files.append(arg),
  '-idirafter':     lambda ps, arg: ps.after_system_dirs.append(arg),
  '-iprefix':       lambda ps, arg: ps.set_iprefix(arg),
  '-iwithprefixbefore':  lambda ps, arg: ps.i_dirs.append(
                                           os.path.join(ps.iprefix, arg)),
  '-iwithprefix':   lambda ps, arg: ps.after_system_dirs.append(
                                      os.path.join(ps.iprefix, arg)),
  '-isysroot':      lambda ps, arg: ps.set_isysroot(arg),
  '-imultilib':     lambda ps, arg: _RaiseNotImplemented('-imultilib'),
  '-isystem':       lambda ps, arg: ps.before_system_dirs.append(arg),
  '-iquote':        lambda ps, arg: ps.quote_dirs.append(arg),
}
# The character following the leading '-' of every key above.  It serves as a
# cheap filter before the slower prefix scan over the keys.
CPP_OPTIONS_MAYBE_TWO_WORDS_FIRST_LETTERS = ('M', 'i', '-', 'a', 't')
# A "compile-time" check to make sure the first-letter list is up-to-date
for key in CPP_OPTIONS_MAYBE_TWO_WORDS:
  assert key[1] in CPP_OPTIONS_MAYBE_TWO_WORDS_FIRST_LETTERS
121
PATH_EXPR = '[/a-zA-Z_0-9.]+' # regular expression for a partial file path

# These are the cpp options that require regular expressions, m is Match.
# The keys are regular expression source strings; they are written as raw
# strings so that the backslashes reach the re module unchanged (a non-raw
# '\.' or '\[' is an invalid string escape sequence).  Group 1 of each
# pattern is the path of a '.s' file, which is recorded as an include file.
CPP_OPTIONS_REGULAR_EXPRESSIONS = {
  r'-Wa,(%s\.s)' % PATH_EXPR:     lambda ps, m: ps.include_files.append(m.group(1)),
  r'-Wa,\[(%s\.s)\]' % PATH_EXPR: lambda ps, m: ps.include_files.append(m.group(1)),
}

# Common prefix of all the patterns above, used as a cheap filter before any
# regular expression is tried.
CPP_OPTIONS_REGULAR_EXPRESSIONS_STARTS_WITH = '-Wa,'
for key in CPP_OPTIONS_REGULAR_EXPRESSIONS:
  assert key.startswith(CPP_OPTIONS_REGULAR_EXPRESSIONS_STARTS_WITH)

# Compiled form of each pattern, keyed by the pattern's source string.
CPP_OPTIONS_REGULAR_EXPRESSIONS_COMPILED = {
  pattern: re.compile(pattern) for pattern in CPP_OPTIONS_REGULAR_EXPRESSIONS
}
137
138# These are the cpp options that a) are more than one letter long,
139# b) always take an argument, and c) must have that argument as a
140# separate word in argv.
CPP_OPTIONS_ALWAYS_TWO_WORDS = {
  '-Xpreprocessor': lambda ps, arg: _RaiseNotImplemented('-Xpreprocessor'),
}

# In order to parse correctly, this data structure needs to include
# *all* two-word arguments that gcc accepts (we don't want to see
# "gcc -aux-info foo" and think that foo is an output filename...)
# This list is taken from the complete list from the gcc info page:
# "Option Summary".  These aren't preprocessor-related, so each one gets
# its own no-op action.
CPP_OPTIONS_ALWAYS_TWO_WORDS.update(
  (option, lambda ps, arg: None)
  for option in ('-aux-info', '--param', '-Xassembler', '-Xlinker'))
154
# A single lookup table holding every option whose argument may be the next
# word: the "maybe two words" options followed by the "always two words"
# options.  Having one table means one dict lookup per argument.
CPP_OPTIONS_TWO_WORDS = {
  **CPP_OPTIONS_MAYBE_TWO_WORDS,
  **CPP_OPTIONS_ALWAYS_TWO_WORDS,
}
159
# These are the cpp options that a) are more than one letter long,
# b) always take an argument, and c) have that argument separated from
# the option by '='.  Actions are called as action(ps, value), where value
# is the text after the first '='.
CPP_OPTIONS_APPEARING_AS_ASSIGNMENTS = {
  '--sysroot':     lambda ps, arg: ps.set_sysroot(arg)
}

# These are the cpp options that do not take an argument.
# (Note, most cpp options do not take an argument, but do not pertain to
# preprocessing, so we can ignore them.  Those are dealt in the default
# case in our processing loop.  This is only for no-argument options
# that we actually care about for preprocessing.)
# Since there is no argument, every action here takes the ParseState object
# only: the processing loop calls action(ps).
CPP_OPTIONS_ONE_WORD = {
  # -undef is accepted but ignored; it is not actually implemented.
  '-undef':         lambda ps: None,
  '-nostdinc':      lambda ps: ps.set_nostdinc(),
  # TODO(csilvers): deal with -nostdinc++ as well?
}
178
179# These are the cpp options that are one letter long, and take an
180# argument.  In all such cases, the argument may either be the next
181# word, or may be appended right after the letter.
CPP_OPTIONS_ONE_LETTER = {
  # -Dname or -Dname=definition.  Split on the first '=' only, so that a
  # definition which itself contains '=' (e.g. -DCMP=a==b) is kept whole and
  # every entry of Dopts has one or two elements.
  'D': lambda ps, arg: ps.Dopts.append(arg.split('=', 1)),
  'I': lambda ps, arg: ps.i_dirs.append(arg),
  # -U affects computed includes but is not implemented; it is ignored.
  'U': lambda ps, arg: None,
  'o': lambda ps, arg: ps.set_outputfile(arg),
  'x': lambda ps, arg: ps.set_language(arg),

  # In order to parse correctly, this data structure needs to include
  # *all* two-word arguments that gcc accepts (we don't want to see
  # "gcc -L foo" and think that foo is an output filename...)  Since
  # most one-letter args can go as either '-Lfoo' or '-L foo', we need
  # to include (almost) all one-letter args in our list, even when we
  # don't care about them.  This list is taken from the complete list
  # from the gcc info page: "Option Summary".  Since these aren't
  # preprocessor-related, they are all noops (except 'F', below).
  'A': lambda ps, arg: None,
  'l': lambda ps, arg: None,
  # -F<dir>: add every existing <dir>/*/Headers path to the -I directories.
  'F': lambda ps, arg: ps.i_dirs.extend(glob.glob(os.path.join(arg,'*', 'Headers'))),
  'u': lambda ps, arg: None,
  'L': lambda ps, arg: None,
  'B': lambda ps, arg: None,
  'V': lambda ps, arg: None,
  'b': lambda ps, arg: None,
}
207
208
209### DREADFUL PARSER +  OPTIMIZED PARSER
210
# This parser was written after a *much* simpler parser using regular
# expressions turned out to be too slow, two orders of magnitude slower
# than str.split. The parser below is faster than the one based on
# regular expressions and more complete, so that's the one we keep.
215
# Precompiled patterns used by ParseCommandLineSlowly.
# A single non-whitespace character.
NONSPACE_RE = re.compile(r'\S') # don't use \S|$, which introduces backtracking
# A single whitespace character.
SPACE_RE = re.compile(r'\s')
# A double quote that is either preceded by a character other than a
# backslash, or is at the start of the string.  In the first case the match
# includes the preceding character, so the quote itself is at m.end() - 1.
NONESC_QUOTE_RE = re.compile(r'[^\\]"|^"')  # inefficient
# A double quote not preceded by a backslash; only the quote is matched.
QUOTE_RE = re.compile(r'(?<!\\)"') # backtracking, could also be improved
# A backslash immediately followed by a double quote.
ESC_QUOTE_RE = re.compile(r'\\"')
221
def ParseCommandLineSlowly(line):
  """Parse line as if it were issued in a shell.

  Split the line into a list of string arguments separated by whitespace,
  except that double-quoted substrings are treated atomically. Also,
  backslash-escaped quotes are allowed; they are turned into regular
  quotes.  This function is written for efficiency; only very simple
  regular expressions are used in the main loop.

  The parser is not needed when the include server is driven by
  distcc, because the distcc client passes the argv vector. It is used
  as part of a faster parser.

  Arguments:
    line: the command line, a string

  Returns:
    a list of argument strings, with unescaped double quotes removed and
    each backslash-quote pair replaced by a double quote; the empty list
    if line contains no non-whitespace character

  Raises:
    NotCoveredError: if line contains a single quote, if an opening double
      quote is the last character of line, or if no closing double quote
      is found for an opening one
  """

  if "'" in line:
    raise NotCoveredError("Single-quotes not accepted in command line.")
  args = []
  # Set position of first quote if it exists.
  m_unesc_q = NONESC_QUOTE_RE.search(line, 0)
  if m_unesc_q:
    # The match may include the character before the quote, so the quote
    # itself is the last matched character.
    unesc_q = m_unesc_q.end() - 1
  else:
    # Sentinel: there is no unescaped quote left in the line.
    unesc_q = sys.maxsize
  m_nonspc = NONSPACE_RE.search(line, 0)
  if not m_nonspc:
    return args
  start = m_nonspc.start()
  end = start + 1
  while True:
    # Invariant: (1) start is at the beginning of the next argument
    # (perhaps at a quote, which will later be removed). (2) end is
    # such that line[start:end] is a prefix of the argument.
    assert start <= unesc_q
    assert start < end <= len(line), (start, end, len(line))
    assert not SPACE_RE.match(line, start)
    assert unesc_q == sys.maxsize or line[unesc_q] == '"'
    # Advance end to the next whitespace character at or after end.
    try:
      end = SPACE_RE.search(line, end).start()
    except AttributeError:
      # search() returned None: no whitespace remains, so the candidate
      # argument runs to the end of the line.
      end = len(line)
    if end < unesc_q:
      # We're good: no quotes found, we have an argument.
      # Strip the unescaped quotes first, then unescape the escaped ones.
      args.append(ESC_QUOTE_RE.sub(
          '"',
          QUOTE_RE.sub(
            '',
            line[start:end])))
      # Search for beginning of next argument.
      try:
        start = NONSPACE_RE.search(line, end).start()
      except AttributeError:
        # Only whitespace (or nothing) is left: parsing is complete.
        return args
      # We have one character so far.
      end = start + 1
      continue
    # We found a quote. Look for its counterpart.
    assert start <= unesc_q < end
    if unesc_q == len(line) - 1:
      raise NotCoveredError("""Unexpected '"' at end of line.""")
    m_unesc_q = NONESC_QUOTE_RE.search(line, unesc_q + 1)
    if not m_unesc_q:
      raise NotCoveredError("""Missing '"', could not parse command line.""")
    assert m_unesc_q.end() - 1 > unesc_q
    # end now points just past the closing quote.
    end = m_unesc_q.end()
    if end == len(line):
      # The closing quote is the last character: this is the final argument.
      args.append(ESC_QUOTE_RE.sub(
        '"',
        QUOTE_RE.sub(
        '',
        line[start:end])))
      return args
    # We found the counterpart before the end of the line. The argument may
    # still not be finished. But before continuing, look for the next quote.
    m_unesc_q = NONESC_QUOTE_RE.search(line, end)
    if m_unesc_q:
      unesc_q = m_unesc_q.end() - 1
    else:
      unesc_q = sys.maxsize
    # Fall through to the top of the loop with start unchanged: the scan for
    # the whitespace that terminates this argument resumes at end.
300
301
def ParseCommandLine(line):
  """Parse line as if it were issued in a shell (optimized).

  Produces the list of arguments of line.  A line without any double quote
  is simply split on whitespace.  Otherwise the part of the line up to the
  first space at or after the last double quote is handed to
  ParseCommandLineSlowly, and only the remainder is split on whitespace.
  """
  # str.split() on a large string (size 500) is almost two orders of
  # magnitude faster than ParseCommandLineSlowly.  When a '"' is present it
  # is usually near the beginning of the line (as in dX="some thing"), so
  # everything after the last quote can still go through the fast split().
  last_quote = line.rfind('"')
  if last_quote < 0:
    return line.split()

  # The last quote may be an escaped one in the middle of non-space
  # characters, so cut at the following space rather than at the quote.
  cut = line.find(' ', last_quote)
  if cut < 0:
    # No space after the last quote: give up on the fast path.
    return ParseCommandLineSlowly(line)

  quoted_part = ParseCommandLineSlowly(line[0:cut])
  plain_part = line[cut:].split()
  return quoted_part + plain_part
323
# Matches a whole string that ends in a period followed by one of the keys of
# basics.TRANSLATION_UNIT_MAP.  The matched key is available as the named
# group 'suffix'.  Keys are escaped so they are matched literally.
TRANSLATION_UNIT_FILEPATH_RE = re.compile(
  r".*[.](?P<suffix>%s)$" % '|'.join(
      map(re.escape, basics.TRANSLATION_UNIT_MAP.keys())))
330
331
def ParseCommandArgs(args, current_dir, includepath_map, dir_map,
                     compiler_defaults, timer=None):
  """Parse arguments like -I to make include directory lists.

  Arguments:
    args: list of arguments (strings); args[0] is the compiler
    current_dir: string
    includepath_map: a MapToIndex object
    dir_map: a DirectoryMapToIndex object
    compiler_defaults: a CompilerDefaults object
    timer: a basics.IncludeAnalyzerTimer object
  Returns:
    (quote_dirs, angle_dirs, files, source_file, source_file_prefix, dopts)
    where:
      quote_dirs: a tuple of dir_map-indexed directories
      angle_dirs: a tuple of dir_map-indexed directories
      files: a tuple of includepath_map-indexed files
      source_file: the single non-option argument, exactly as given
      source_file_prefix: current_dir joined with either the -o argument
        minus a trailing ".o" (when -o was given) or the source file name
        with its translation-unit extension stripped
      dopts: a list with one entry per -D option, as built by the 'D' action
        of CPP_OPTIONS_ONE_LETTER
  Raises:
    NotCoveredError: if there are fewer than two arguments, an option lacks
      its argument, an unimplemented option (such as -I-) is used, there is
      not exactly one non-option argument, or no language was given and the
      source file extension is not recognized
  Modifies:
    compiler_defaults
  """
  if __debug__: Debug(DEBUG_TRACE, "ParseCommand %s" % args)

  assert isinstance(dir_map, cache_basics.DirectoryMapToIndex)
  assert isinstance(includepath_map, cache_basics.MapToIndex)

  parse_state = ParseState()

  if len(args) < 2:
    raise NotCoveredError("Command line: too few arguments.")

  compiler = args[0]

  # Option names of CPP_OPTIONS_MAYBE_TWO_WORDS, longest first.  Built lazily,
  # the first time the slow prefix scan below is needed.
  glommed_options = None

  i = 1
  while i < len(args):
    word = args[i]

    # First, deal with everything that's not a flag-option.  An empty
    # argument is not a flag either; startswith() (rather than word[0])
    # keeps it from raising IndexError.
    if not word.startswith('-') or word == '-':     # - is the stdin file
      if word.startswith('"-'):
        pass     # TODO(csilvers): parse arg inside quotes?
      else:
        parse_state.file_names.append(word)  # if not a flag, it's a file
      i += 1
      continue

    # From here on, word starts with '-' and has at least two characters.

    # Deal with the one-letter options -- the kind most commonly seen.
    # We need to figure out whether the option-argument is glommed on to
    # the end of the option ("-Dfoo"), or is a separate word ("-D foo").
    action = CPP_OPTIONS_ONE_LETTER.get(word[1])   # letter after the -
    if action:
      arg = word[2:]
      if arg:                        # the glommed-onto-end case
        action(parse_state, arg)
        i += 1
      elif i + 1 < len(args):        # the separate-word case
        action(parse_state, args[i+1])
        i += 2
      else:
        raise NotCoveredError("No argument found for option '%s'" % word)
      continue

    # Deal with the have-arg options with the arg as the 2nd word ("-MF foo").
    action = CPP_OPTIONS_TWO_WORDS.get(word)
    if action:
      if i + 1 >= len(args):
        raise NotCoveredError("No argument found for option '%s'" % word)
      action(parse_state, args[i+1])
      i += 2
      continue

    # Deal with the have-arg options that appear as if assignments
    # ("--sysroot=/mumble").
    if '=' in word:
      arg, value = word.split('=', 1)
      action = CPP_OPTIONS_APPEARING_AS_ASSIGNMENTS.get(arg)
      if action:
        action(parse_state, value)
        i += 1
        continue

    # Deal with the options that take no arguments ("-nostdinc").
    action = CPP_OPTIONS_ONE_WORD.get(word)
    if action:
      action(parse_state)
      i += 1
      continue

    # Deal with the have-arg options with the arg concatenated to the word.
    # ("-MFfoo").  We do this last because it's slowest.
    if word[1] in CPP_OPTIONS_MAYBE_TWO_WORDS_FIRST_LETTERS:  # filter
      if glommed_options is None:
        # Try longer option names first, so that an option which has another
        # option as a prefix ("-iwithprefixbefore" vs. "-iwithprefix") is
        # not mistaken for the shorter one.
        glommed_options = sorted(CPP_OPTIONS_MAYBE_TWO_WORDS,
                                 key=len, reverse=True)
      found_action = False
      for option in glommed_options:
        if word.startswith(option):
          action = CPP_OPTIONS_MAYBE_TWO_WORDS[option]
          action(parse_state, word[len(option):])
          i += 1
          found_action = True
          break
      if found_action:
        continue

    # Deal with the complex options requiring regular expressions last.
    if word.startswith(CPP_OPTIONS_REGULAR_EXPRESSIONS_STARTS_WITH):
      found_action = False
      for (option, action) in CPP_OPTIONS_REGULAR_EXPRESSIONS.items():
        m = CPP_OPTIONS_REGULAR_EXPRESSIONS_COMPILED[option].match(word)
        if action and m is not None:
          action(parse_state, m)
          i += 1
          found_action = True
          break
      if found_action:
        continue

    # Whatever is left must be a one-word option (that is, an option
    # without an arg) that it's safe to ignore.
    i += 1
  # Done parsing arguments!

  # Sanity-checking on arguments
  # -I- is a special form of the -I command.
  if "-" in parse_state.i_dirs:
    _RaiseNotImplemented('-I-', '(Use -iquote instead.)')

  if len(parse_state.file_names) != 1:
    raise NotCoveredError(
      "Could not locate name of translation unit: %s." % parse_state.file_names,
      send_email=False)

  source_file = parse_state.file_names[0]

  if parse_state.output_file:
    # Use output_file to create prefix
    source_file_prefix = re.sub("[.]o$", "", parse_state.output_file)
  else:
    # Remove suffix from source file.  The extensions are escaped so that
    # any regular expression metacharacters in them are matched literally,
    # as is done for TRANSLATION_UNIT_FILEPATH_RE.
    suffixes = "|".join(re.escape(ext)
                        for ext in basics.TRANSLATION_UNIT_MAP.keys())
    source_file_prefix = re.sub("[.](%s)$" % suffixes, "", source_file)
  source_file_prefix = os.path.join(current_dir, source_file_prefix)

  if parse_state.language == 'none':    # no explicit -x flag, or -x none
    # Derive the language from the source file extension.
    language_match = TRANSLATION_UNIT_FILEPATH_RE.match(source_file)
    if not language_match:
      raise NotCoveredError(
          "For source file '%s': unrecognized filename extension" % source_file)
    suffix = language_match.group('suffix')
    parse_state.language = basics.TRANSLATION_UNIT_MAP[suffix]
  assert parse_state.language in basics.LANGUAGES

  sysroot = parse_state.include_sysroot()
  compiler_defaults.SetSystemDirsDefaults(compiler, sysroot,
                                          parse_state.language, timer)

  def IndexDirs(dir_list):
    """Normalize directory names and index.

    Each path in dir_list is passed through basics.SafeNormPath and then
    indexed with dir_map.Index; the list of indices is returned.
    """
    S = basics.SafeNormPath
    I = dir_map.Index
    return [I(S(d)) for d in dir_list]

  # Now string the directory lists together according to CPP semantics:
  # -I directories, then -isystem directories, then (unless -nostdinc) the
  # compiler's default system directories, then the "after" directories.
  angle_dirs = IndexDirs(parse_state.i_dirs)
  angle_dirs.extend(IndexDirs(parse_state.before_system_dirs))
  if not parse_state.nostdinc:
    angle_dirs.extend(
      IndexDirs(compiler_defaults.system_dirs_default
                [compiler][sysroot][parse_state.language]))
  angle_dirs.extend(IndexDirs(parse_state.after_system_dirs))

  # Quote includes search the -iquote directories first, then everything
  # that angle includes search.
  quote_dirs = IndexDirs(parse_state.quote_dirs)
  quote_dirs.extend(angle_dirs)
  angle_dirs = tuple(angle_dirs)
  quote_dirs = tuple(quote_dirs)
  # Include files are meant to be sent to the server.  They do not pose the
  # danger of absolute includes, which includepath_map is designed to avoid.
  include_files = tuple(
      [includepath_map.Index(basics.SafeNormPath(f),
                             ignore_absolute_path_warning=True)
       for f in parse_state.include_files])

  if __debug__: Debug(DEBUG_TRACE, ("ParseCommand result: %s %s %s %s %s %s" %
                                    (quote_dirs, angle_dirs, include_files,
                                     source_file, source_file_prefix,
                                     parse_state.Dopts)))
  return (quote_dirs, angle_dirs, include_files, source_file, source_file_prefix,
          parse_state.Dopts)
526