1#!/usr/bin/env python
2#===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
3#
4# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5# See https://llvm.org/LICENSE.txt for license information.
6# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7#
8#===------------------------------------------------------------------------===#
9"""
10Example of use:
11  asan_symbolize.py -c "$HOME/opt/cross/bin/arm-linux-gnueabi-" -s "$HOME/SymbolFiles" < asan.log
12
13PLUGINS
14
15This script provides a way for external plug-ins to hook into the behaviour of
16various parts of this script (see `--plugins`). This is useful for situations
17where it is necessary to handle site-specific quirks (e.g. binaries with debug
18symbols only accessible via a remote service) without having to modify the
19script itself.
20
21"""
22import argparse
23import bisect
24import errno
25import getopt
26import logging
27import os
28import re
29import subprocess
30import sys
31from distutils.spawn import find_executable
32
# Cache of ChainSymbolizer objects, keyed by binary path (filled in by
# SymbolizationLoop.symbolize_address).
symbolizers = {}
# When true, the symbolizer tools are asked to demangle symbol names.
demangle = False
# Optional prefix prepended to the `addr2line` tool name.
# NOTE(review): presumably set from the `-c` option shown in the module
# docstring -- confirm against the argument parsing code.
binutils_prefix = None
# Optional iterable of regex fragments; fix_filename() strips everything up to
# and including a match of each fragment.
fix_filename_patterns = None
# Stream the report is read from by SymbolizationLoop.process_logfile().
logfile = sys.stdin
# When false, symbolize_address() raises instead of falling back to the
# system symbolizer (addr2line/atos).
allow_system_symbolizer = True
# When true, symbolize_address() skips the Breakpad and LLVM symbolizers.
force_system_symbolizer = False
40
41# FIXME: merge the code that calls fix_filename().
def fix_filename(file_name):
  """Normalize a source location string reported by a symbolizer.

  Args:
      file_name: location text, e.g. '/path/to/file.cc:12:3'.
  Returns:
      The location with every prefix matched by the global
      `fix_filename_patterns` removed, ASan runtime sources
      (asan_*.cc / asan_*.cpp plus line number) collapsed to '_asan_rtl_',
      and 'crtstuff.c:0' locations replaced by '???:0'.
  """
  if fix_filename_patterns:
    for path_to_cut in fix_filename_patterns:
      # Each entry is a regular-expression fragment; the greedy '.*' removes
      # everything up to and including its last match.
      file_name = re.sub('.*' + path_to_cut, '', file_name)
  # The dots are escaped so that only a literal '.' before the extension
  # matches; an unescaped '.' would also rewrite names such as 'asan_xcc:5'.
  file_name = re.sub(r'.*asan_[a-z_]*\.(cc|cpp):[0-9]*', '_asan_rtl_', file_name)
  file_name = re.sub(r'.*crtstuff\.c:0', '???:0', file_name)
  return file_name
49
def is_valid_arch(s):
  """Return True iff `s` is one of the architecture names known to this script."""
  known_arches = (
      "i386", "x86_64", "x86_64h",
      "arm", "armv6", "armv7", "armv7s", "armv7k", "arm64",
      "powerpc64", "powerpc64le",
      "s390x", "s390",
      "riscv64",
  )
  return s in known_arches
54
def guess_arch(addr):
  """Guess the architecture from the textual length of an address.

  Anything longer than '0x' followed by 8 hex digits is reported as
  'x86_64'; everything else is reported as 'i386'.
  """
  longest_32bit_form = len('0x') + 8
  return 'x86_64' if len(addr) > longest_32bit_form else 'i386'
61
class Symbolizer(object):
  """Common interface of all symbolizers; subclasses override symbolize()."""

  def __init__(self):
    """The base class keeps no state."""

  def symbolize(self, addr, binary, offset):
    """Symbolize one address, given as a (binary, offset) pair.

    Overridden in subclasses; this base implementation knows nothing and
    always answers None.
    Args:
        addr: virtual address of an instruction.
        binary: path to the executable/shared object that contains the
            instruction.
        offset: offset of the instruction inside @binary.
    Returns:
        list of strings, one per (possibly inlined) frame, each describing
        the code location of the instruction (function name, file name,
        line and column numbers).
    """
    return None
80
81
class LLVMSymbolizer(Symbolizer):
  """Symbolizer that talks to a long-running llvm-symbolizer child process."""

  def __init__(self, symbolizer_path, default_arch, system, dsym_hints=None):
    """Remember the settings and start the child process.

    Args:
        symbolizer_path: path (or bare name) of the llvm-symbolizer binary.
        default_arch: value passed to the tool as `--default-arch`.
        system: OS name; .dSYM hints are only forwarded when it is 'Darwin'.
        dsym_hints: optional iterable of paths, each passed as `--dsym-hint`.
    """
    super(LLVMSymbolizer, self).__init__()
    self.symbolizer_path = symbolizer_path
    self.default_arch = default_arch
    self.system = system
    # Use a fresh list per instance rather than a shared mutable default.
    self.dsym_hints = [] if dsym_hints is None else dsym_hints
    self.pipe = self.open_llvm_symbolizer()

  def open_llvm_symbolizer(self):
    """Spawn llvm-symbolizer.

    Returns:
        the subprocess.Popen object, or None if the process could not be
        started (OSError).
    """
    cmd = [self.symbolizer_path,
           ('--demangle' if demangle else '--no-demangle'),
           '--functions=linkage',
           '--inlines',
           '--default-arch=%s' % self.default_arch]
    if self.system == 'Darwin':
      for hint in self.dsym_hints:
        cmd.append('--dsym-hint=%s' % hint)
    logging.debug(' '.join(cmd))
    try:
      result = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                bufsize=0,
                                universal_newlines=True)
    except OSError:
      result = None
    return result

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        list of '<addr> in <function> <location>' strings, or None when the
        child process is unavailable, communication fails, or only trivial
        ('??') frames were reported.
    """
    if not self.pipe:
      return None
    result = []
    try:
      symbolizer_input = '"%s" %s' % (binary, offset)
      logging.debug(symbolizer_input)
      self.pipe.stdin.write("%s\n" % symbolizer_input)
      # The answer is a sequence of (function name, file location) line pairs
      # terminated by an empty line.
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        if not function_name:
          break
        file_name = self.pipe.stdout.readline().rstrip()
        file_name = fix_filename(file_name)
        if (not function_name.startswith('??') or
            not file_name.startswith('??')):
          # Append only non-trivial frames.
          result.append('%s in %s %s' % (addr, function_name,
                                         file_name))
    except Exception as e:
      # Best effort: report "no result" so that a caller can try another
      # symbolizer, but leave a trace of what went wrong.
      logging.debug("got exception communicating with llvm-symbolizer",
                    exc_info=e)
      result = []
    if not result:
      result = None
    return result
135
136
def LLVMSymbolizerFactory(system, default_arch, dsym_hints=[]):
  """Create an LLVMSymbolizer, locating the tool through the environment.

  $LLVM_SYMBOLIZER_PATH is consulted first, then $ASAN_SYMBOLIZER_PATH.
  """
  # Assume llvm-symbolizer is in PATH when neither variable is set.
  symbolizer_path = (os.getenv('LLVM_SYMBOLIZER_PATH') or
                     os.getenv('ASAN_SYMBOLIZER_PATH') or
                     'llvm-symbolizer')
  return LLVMSymbolizer(symbolizer_path, default_arch, system, dsym_hints)
145
146
class Addr2LineSymbolizer(Symbolizer):
  """Symbolizer that talks to a long-running `addr2line` child process.

  An instance serves exactly one binary; requests for any other binary are
  declined by returning None.
  """

  def __init__(self, binary):
    super(Addr2LineSymbolizer, self).__init__()
    self.binary = binary
    self.pipe = self.open_addr2line()
    # Written to the tool after every query so that the end of the real
    # query's output can be recognized.
    self.output_terminator = -1

  def open_addr2line(self):
    """Spawn addr2line (with the optional `binutils_prefix`) for self.binary."""
    addr2line_tool = 'addr2line'
    if binutils_prefix:
      addr2line_tool = binutils_prefix + addr2line_tool
    logging.debug('addr2line binary is %s' % find_executable(addr2line_tool))
    cmd = [addr2line_tool, '-fi']
    if demangle:
      cmd += ['--demangle']
    cmd += ['-e', self.binary]
    logging.debug(' '.join(cmd))
    return subprocess.Popen(cmd,
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                            bufsize=0,
                            universal_newlines=True)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None if @binary is not the binary this instance was created for,
        otherwise a list of '<addr> in <function> <location>' strings. When
        communication with the tool fails, a '?? ??:0' frame is appended.
    """
    if self.binary != binary:
      return None
    lines = []
    try:
      self.pipe.stdin.write("%s\n" % offset)
      self.pipe.stdin.write("%s\n" % self.output_terminator)
      is_first_frame = True
      while True:
        function_name = self.pipe.stdout.readline().rstrip()
        logging.debug("read function_name='%s' from addr2line" % function_name)
        # If llvm-symbolizer is installed as addr2line, older versions of
        # llvm-symbolizer will print -1 when presented with -1 and not print
        # a second line. In that case we will block forever trying to read the
        # file name. This also happens for non-existent files, in which case GNU
        # addr2line exits immediately, but llvm-symbolizer does not (see
        # https://llvm.org/PR42754).
        if function_name == '-1':
          logging.debug("got function '-1' -> no more input")
          break
        file_name = self.pipe.stdout.readline().rstrip()
        logging.debug("read file_name='%s' from addr2line" % file_name)
        if is_first_frame:
          is_first_frame = False
        elif function_name == '??':
          assert file_name == '??:0', file_name
          logging.debug("got function '??' -> no more input")
          break
        elif not function_name:
          assert not file_name, file_name
          logging.debug("got empty function name -> no more input")
          break
        if not function_name and not file_name:
          logging.debug("got empty function and file name -> unknown function")
          function_name = '??'
          file_name = '??:0'
        lines.append((function_name, file_name))
    except IOError as e:
      # EPIPE happens if addr2line exits early (which some implementations do
      # if an invalid file is passed).
      if e.errno == errno.EPIPE:
        # poll() returns None while the child has not been reaped yet, so the
        # return code must be formatted with %s; %d would raise TypeError here.
        logging.debug("addr2line exited early (broken pipe), returncode=%s" % self.pipe.poll())
      else:
        logging.debug("unexpected I/O exception communicating with addr2line", exc_info=e)
      lines.append(('??', '??:0'))
    except Exception as e:
      logging.debug("got unknown exception communicating with addr2line", exc_info=e)
      lines.append(('??', '??:0'))
    return ['%s in %s %s' % (addr, function, fix_filename(file)) for (function, file) in lines]
219
class UnbufferedLineConverter(object):
  """
  Wrap a child process that responds to each line of input with one line of
  output.  Uses pty to trick the child into providing unbuffered output.

  The parent keeps two file objects on the pty: `self.r` for reading the
  child's output and `self.w` for writing to the child's input.
  """
  def __init__(self, args, close_stderr=False):
    """Fork a child on a new pty and exec `args` in it.

    Args:
        args: command line as a list; args[0] is the program handed to
            os.execvp().
        close_stderr: when true, the child's fd 2 is redirected to /dev/null
            before the exec.
    """
    # Local imports so that the script can start on Windows.
    import pty
    import termios
    pid, fd = pty.fork()
    if pid == 0:
      # We're the child. Transfer control to command.
      if close_stderr:
        # Replace fd 2 (stderr) with a descriptor opened on /dev/null.
        dev_null = os.open('/dev/null', 0)
        os.dup2(dev_null, 2)
      # NOTE(review): if execvp() raises (e.g. program not found) the
      # exception propagates inside the forked child -- confirm whether the
      # child should exit explicitly in that case.
      os.execvp(args[0], args)
    else:
      # Disable echoing.
      attr = termios.tcgetattr(fd)
      # Index 3 of the termios attribute list holds the local-mode flags.
      attr[3] = attr[3] & ~termios.ECHO
      termios.tcsetattr(fd, termios.TCSANOW, attr)
      # Set up a file()-like interface to the child process
      # (buffering=1 requests line buffering).
      self.r = os.fdopen(fd, "r", 1)
      self.w = os.fdopen(os.dup(fd), "w", 1)

  def convert(self, line):
    """Send `line` (a newline is appended) and return the next output line."""
    self.w.write(line + "\n")
    return self.readline()

  def readline(self):
    """Read one line of the child's output, stripped of trailing whitespace."""
    return self.r.readline().rstrip()
251
252
class DarwinSymbolizer(Symbolizer):
  """Symbolizer that drives `atos` through an UnbufferedLineConverter.

  An instance serves exactly one binary/architecture pair.
  """

  def __init__(self, addr, binary, arch):
    # `addr` is accepted for interface compatibility but is not used here.
    super(DarwinSymbolizer, self).__init__()
    self.binary = binary
    self.arch = arch
    self.open_atos()

  def open_atos(self):
    """Start `atos -o <binary> -arch <arch>` with its stderr discarded."""
    logging.debug('atos -o %s -arch %s', self.binary, self.arch)
    cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
    self.atos = UnbufferedLineConverter(cmdline, close_stderr=True)

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize.

    Returns:
        None if @binary is not the binary this instance was created for,
        otherwise a one-element list describing the frame.
    """
    if self.binary != binary:
      return None
    if not os.path.exists(binary):
      # If the binary doesn't exist atos will exit which will lead to IOError
      # exceptions being raised later on so just don't try to symbolize.
      return ['{} ({}:{}+{})'.format(addr, binary, self.arch, offset)]
    atos_line = self.atos.convert('0x%x' % int(offset, 16))
    # Skip informational lines until the actual answer shows up.
    while "got symbolicator for" in atos_line:
      atos_line = self.atos.readline()
    # A well-formed atos response looks like this:
    #   foo(type1, type2) (in object.name) (filename.cc:80)
    # NOTE:
    #   * For C functions atos omits parentheses and argument types.
    #   * For C++ functions the function name (i.e., `foo` above) may contain
    #     templates which may contain parentheses.
    # Raw string: '\(' and '\d' are regex escapes, not string escapes.
    match = re.match(r'^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
    logging.debug('atos_line: %s', atos_line)
    if match:
      function_name = match.group(1)
      file_name = fix_filename(match.group(3))
      return ['%s in %s %s' % (addr, function_name, file_name)]
    else:
      # Unrecognized format: pass the raw atos output through.
      return ['%s in %s' % (addr, atos_line)]
290
291
292# Chain several symbolizers so that if one symbolizer fails, we fall back
293# to the next symbolizer in chain.
class ChainSymbolizer(Symbolizer):
  """Asks a list of symbolizers in order and returns the first usable answer."""

  def __init__(self, symbolizer_list):
    super(ChainSymbolizer, self).__init__()
    self.symbolizer_list = symbolizer_list

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize."""
    for candidate in self.symbolizer_list:
      if not candidate:
        # Falsy entries (e.g. None) are placeholders; skip them.
        continue
      frames = candidate.symbolize(addr, binary, offset)
      if frames:
        return frames
    return None

  def append_symbolizer(self, symbolizer):
    """Add `symbolizer` at the end of the chain (lowest priority)."""
    self.symbolizer_list.append(symbolizer)
310
311
def BreakpadSymbolizerFactory(binary):
  """Return a BreakpadSymbolizer for `binary`, or None when unavailable.

  The symbol file is expected at `binary` + $BREAKPAD_SUFFIX; None is
  returned if the variable is unset/empty or that file does not exist.
  """
  suffix = os.getenv('BREAKPAD_SUFFIX')
  if not suffix:
    return None
  symbol_file = binary + suffix
  if not os.access(symbol_file, os.F_OK):
    return None
  return BreakpadSymbolizer(symbol_file)
319
320
def SystemSymbolizerFactory(system, addr, binary, arch):
  """Create the native symbolizer wrapper for `system`.

  'Darwin' gets a DarwinSymbolizer, 'Linux'/'FreeBSD'/'NetBSD'/'SunOS' get an
  Addr2LineSymbolizer; for any other value None is returned.
  """
  if system == 'Darwin':
    return DarwinSymbolizer(addr, binary, arch)
  if system in ('Linux', 'FreeBSD', 'NetBSD', 'SunOS'):
    return Addr2LineSymbolizer(binary)
  return None
326
327
class BreakpadSymbolizer(Symbolizer):
  """Symbolizer that answers queries from a Breakpad text symbol file.

  The whole file is parsed at construction time; symbolize() is then a pure
  in-memory lookup.
  """

  def __init__(self, filename):
    """Load and parse the symbol file at `filename`."""
    super(BreakpadSymbolizer, self).__init__()
    self.filename = filename
    # open() instead of the Python 2-only file() builtin; the context manager
    # also makes sure the handle is closed.
    with open(filename) as symbol_file:
      lines = symbol_file.readlines()
    # Source file names, indexed by the number given in FILE records.
    self.files = []
    # Symbol start address -> symbol name.
    self.symbols = {}
    # Sorted start addresses of all line records (for bisect lookups).
    self.address_list = []
    # Line record start address -> (symbol address, size, line, file number).
    self.addresses = {}
    # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t
    fragments = lines[0].rstrip().split()
    self.arch = fragments[2]
    self.debug_id = fragments[3]
    self.binary = ' '.join(fragments[4:])
    self.parse_lines(lines[1:])

  def parse_lines(self, lines):
    """Fill the lookup tables from the records that follow the MODULE line."""
    cur_function_addr = ''
    for line in lines:
      fragments = line.split()
      if not fragments:
        # Blank line: nothing to parse.
        continue
      if fragments[0] == 'FILE':
        assert int(fragments[1]) == len(self.files)
        self.files.append(' '.join(fragments[2:]))
      elif fragments[0] == 'PUBLIC':
        self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:])
      elif fragments[0] in ['CFI', 'STACK']:
        pass
      elif fragments[0] == 'FUNC':
        cur_function_addr = int(fragments[1], 16)
        # A PUBLIC record for the same address takes precedence.
        if cur_function_addr not in self.symbols:
          self.symbols[cur_function_addr] = ' '.join(fragments[4:])
      else:
        # Line starting with an address.
        addr = int(fragments[0], 16)
        self.address_list.append(addr)
        # Tuple of symbol address, size, line, file number.
        self.addresses[addr] = (cur_function_addr,
                                int(fragments[1], 16),
                                int(fragments[2]),
                                int(fragments[3]))
    self.address_list.sort()

  def get_sym_file_line(self, addr):
    """Look up the line record that covers `addr`.

    Returns:
        (symbol name, file name, line number), or None if no record covers
        the address.
    """
    key = None
    if addr in self.addresses:
      key = addr
    else:
      # Closest record that starts below `addr`.
      index = bisect.bisect_left(self.address_list, addr)
      if index == 0:
        return None
      else:
        key = self.address_list[index - 1]
    sym_id, size, line_no, file_no = self.addresses[key]
    symbol = self.symbols[sym_id]
    filename = self.files[file_no]
    if addr < key + size:
      return symbol, filename, line_no
    else:
      return None

  def symbolize(self, addr, binary, offset):
    """Overrides Symbolizer.symbolize."""
    if self.binary != binary:
      return None
    res = self.get_sym_file_line(int(offset, 16))
    if res:
      function_name, file_name, line_no = res
      result = ['%s in %s %s:%d' % (
          addr, function_name, file_name, line_no)]
      # Debug output goes to the log; printing it would mix the list's repr
      # into the symbolized report on stdout.
      logging.debug(result)
      return result
    else:
      return None
400
401
class SymbolizationLoop(object):
  """Reads report lines and replaces stack frame lines by symbolized frames."""

  def __init__(self, plugin_proxy=None, dsym_hint_producer=None):
    """
    Args:
        plugin_proxy: optional AsanSymbolizerPlugInProxy used to filter
            binary paths; when None no filtering takes place.
        dsym_hint_producer: optional callable mapping a binary path to an
            iterable of .dSYM hints (only consulted on Darwin).
    Raises:
        Exception: on an unsupported (non-Windows) operating system.
    """
    self.plugin_proxy = plugin_proxy
    if sys.platform == 'win32':
      # ASan on Windows uses dbghelp.dll to symbolize in-process, which works
      # even in sandboxed processes.  Nothing needs to be done here.
      self.process_line = self.process_line_echo
    else:
      # Used by clients who may want to supply a different binary name.
      # E.g. in Chrome several binaries may share a single .dSYM.
      self.dsym_hint_producer = dsym_hint_producer
      self.system = os.uname()[0]
      if self.system not in ['Linux', 'Darwin', 'FreeBSD', 'NetBSD','SunOS']:
        raise Exception('Unknown system')
      self.llvm_symbolizers = {}
      self.last_llvm_symbolizer = None
      self.dsym_hints = set()
      self.frame_no = 0
      self.process_line = self.process_line_posix
      # `plugin_proxy` is optional; without one no module map can be in use.
      self.using_module_map = (
          plugin_proxy is not None and
          plugin_proxy.has_plugin(ModuleMapPlugIn.get_name()))

  def symbolize_address(self, addr, binary, offset, arch):
    """Symbolize one frame, falling back to the system symbolizer if needed.

    Returns:
        non-empty list of symbolized frame strings.
    Raises:
        Exception: if no result was produced and `allow_system_symbolizer`
            is false.
    """
    # On non-Darwin (i.e. on platforms without .dSYM debug info) always use
    # a single symbolizer binary.
    # On Darwin, if the dsym hint producer is present:
    #  1. check whether we've seen this binary already; if so,
    #     use |llvm_symbolizers[binary]|, which has already loaded the debug
    #     info for this binary (might not be the case for
    #     |last_llvm_symbolizer|);
    #  2. otherwise check if we've seen all the hints for this binary already;
    #     if so, reuse |last_llvm_symbolizer| which has the full set of hints;
    #  3. otherwise create a new symbolizer and pass all currently known
    #     .dSYM hints to it.
    result = None
    if not force_system_symbolizer:
      if binary not in self.llvm_symbolizers:
        use_new_symbolizer = True
        if self.system == 'Darwin' and self.dsym_hint_producer:
          dsym_hints_for_binary = set(self.dsym_hint_producer(binary))
          use_new_symbolizer = bool(dsym_hints_for_binary - self.dsym_hints)
          self.dsym_hints |= dsym_hints_for_binary
        if self.last_llvm_symbolizer and not use_new_symbolizer:
          self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
        else:
          self.last_llvm_symbolizer = LLVMSymbolizerFactory(
              self.system, arch, self.dsym_hints)
          self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
      # Use the chain of symbolizers:
      # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos
      # (fall back to next symbolizer if the previous one fails).
      if binary not in symbolizers:
        symbolizers[binary] = ChainSymbolizer(
            [BreakpadSymbolizerFactory(binary), self.llvm_symbolizers[binary]])
      result = symbolizers[binary].symbolize(addr, binary, offset)
    else:
      # Keep one chain per binary so that the system symbolizer (and its
      # child process) created below is reused for later frames instead of
      # being spawned again for every address.
      if binary not in symbolizers:
        symbolizers[binary] = ChainSymbolizer([])
      result = symbolizers[binary].symbolize(addr, binary, offset)
    if result is None:
      if not allow_system_symbolizer:
        raise Exception('Failed to launch or use llvm-symbolizer.')
      # Initialize system symbolizer only if other symbolizers failed.
      symbolizers[binary].append_symbolizer(
          SystemSymbolizerFactory(self.system, addr, binary, arch))
      result = symbolizers[binary].symbolize(addr, binary, offset)
    # The system symbolizer must produce some result.
    assert result
    return result

  def get_symbolized_lines(self, symbolized_lines, inc_frame_counter=True):
    """Turn symbolizer output into the lines to print for the current line.

    Args:
        symbolized_lines: list of symbolized frames, or a falsy value when
            the current line is to be echoed unchanged.
        inc_frame_counter: whether an echoed line counts as a frame.
    Returns:
        list of output lines.
    """
    if not symbolized_lines:
      if inc_frame_counter:
        self.frame_no += 1
      return [self.current_line]
    else:
      assert inc_frame_counter
      result = []
      for symbolized_frame in symbolized_lines:
        # Inlined frames each get their own, consecutive frame number.
        result.append('    #%s %s' % (str(self.frame_no), symbolized_frame.rstrip()))
        self.frame_no += 1
      return result

  def process_logfile(self):
    """Process every line of the global `logfile` and print the result."""
    self.frame_no = 0
    for line in logfile:
      processed = self.process_line(line)
      print('\n'.join(processed))

  def process_line_echo(self, line):
    """Return the line unchanged (minus trailing whitespace)."""
    return [line.rstrip()]

  def process_line_posix(self, line):
    """Symbolize `line` if it is a stack frame line, otherwise echo it."""
    self.current_line = line.rstrip()
    # Unsymbolicated:
    # #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45)
    # Partially symbolicated:
    # #0 0x7f6e35cf2e45 in foo (foo.so+0x11fe45)
    # NOTE: We have to be very liberal with symbol
    # names in the regex because it could be an
    # Objective-C or C++ demangled name.
    stack_trace_line_format = (
        r'^( *#([0-9]+) *)(0x[0-9a-f]+) *(?:in *.+)? *\((.*)\+(0x[0-9a-f]+)\)')
    match = re.match(stack_trace_line_format, line)
    if not match:
      logging.debug('Line "{}" does not match regex'.format(line))
      # Not a frame line so don't increment the frame counter.
      return self.get_symbolized_lines(None, inc_frame_counter=False)
    logging.debug(line)
    _, frameno_str, addr, binary, offset = match.groups()

    if not self.using_module_map and not os.path.isabs(binary):
      # Do not try to symbolicate if the binary is just the module file name
      # and a module map is unavailable.
      # FIXME(dliew): This is currently necessary for reports on Darwin that are
      # partially symbolicated by `atos`.
      return self.get_symbolized_lines(None)
    arch = ""
    # Arch can be embedded in the filename, e.g.: "libabc.dylib:x86_64h"
    colon_pos = binary.rfind(":")
    if colon_pos != -1:
      maybe_arch = binary[colon_pos+1:]
      if is_valid_arch(maybe_arch):
        arch = maybe_arch
        binary = binary[0:colon_pos]
    if arch == "":
      arch = guess_arch(addr)
    if frameno_str == '0':
      # Assume that frame #0 is the first frame of new stack trace.
      self.frame_no = 0
    original_binary = binary
    if self.plugin_proxy is not None:
      binary = self.plugin_proxy.filter_binary_path(binary)
    if binary is None:
      # The binary filter has told us this binary can't be symbolized.
      logging.debug('Skipping symbolication of binary "%s"', original_binary)
      return self.get_symbolized_lines(None)
    symbolized_line = self.symbolize_address(addr, binary, offset, arch)
    if not symbolized_line:
      if original_binary != binary:
        symbolized_line = self.symbolize_address(addr, original_binary, offset, arch)
    return self.get_symbolized_lines(symbolized_line)
540
class AsanSymbolizerPlugInProxy(object):
  """
    Owns the registered plugins and forwards hook calls to them:
    - Manages the lifetime of plugins (must be used in a `with` statement so
      that every plugin's `destroy()` runs on exit).
    - Provides the interface for calling into plugins from within this script.
  """
  def __init__(self):
    self._plugin_names = set()
    self._plugins = []

  def _load_plugin_from_file_impl_py_gt_2(self, file_path, globals_space):
    # Runs the plugin file's source with `globals_space` as its globals.
    with open(file_path, 'r') as plugin_file:
      source = plugin_file.read()
      exec(source, globals_space, None)

  def load_plugin_from_file(self, file_path):
    """
      Execute the Python file at `file_path`. The file sees a copy of this
      module's globals plus a `register_plugin()` function.
    """
    logging.info('Loading plugins from "{}"'.format(file_path))

    # Function handed to the plugin file for registering plugins.
    def register_plugin(plugin):
      logging.info('Registering plugin %s', plugin.get_name())
      self.add_plugin(plugin)

    globals_space = dict(globals())
    globals_space['register_plugin'] = register_plugin
    running_python3_or_newer = not (sys.version_info.major < 3)
    if running_python3_or_newer:
      # Indirection here is to avoid a bug in older Python 2 versions:
      # `SyntaxError: unqualified exec is not allowed in function ...`
      self._load_plugin_from_file_impl_py_gt_2(file_path, globals_space)
    else:
      execfile(file_path, globals_space, None)

  def add_plugin(self, plugin):
    """
      Start managing `plugin` and hand it a reference to this proxy.
    """
    assert isinstance(plugin, AsanSymbolizerPlugIn)
    self._plugins.append(plugin)
    self._plugin_names.add(plugin.get_name())
    plugin._receive_proxy(self)

  def remove_plugin(self, plugin):
    """
      Stop managing `plugin` and call its `destroy()` hook.
    """
    assert isinstance(plugin, AsanSymbolizerPlugIn)
    self._plugins.remove(plugin)
    self._plugin_names.remove(plugin.get_name())
    logging.debug('Removing plugin %s', plugin.get_name())
    plugin.destroy()

  def has_plugin(self, name):
    """
      Returns true iff a plugin with this name is currently
      managed by this proxy.
    """
    return name in self._plugin_names

  def register_cmdline_args(self, parser):
    """
      Let every plugin add its own options to `parser`.
    """
    for plugin in list(self._plugins):
      plugin.register_cmdline_args(parser)

  def process_cmdline_args(self, pargs):
    """
      Pass the parsed arguments to every plugin; a plugin that answers
      `False` is removed.
    """
    # Iterate over a snapshot because plugins may be removed on the way.
    for plugin in list(self._plugins):
      keep = plugin.process_cmdline_args(pargs)
      assert isinstance(keep, bool)
      if keep:
        continue
      self.remove_plugin(plugin)

  def __enter__(self):
    return self

  def __exit__(self, exc_type, exc_val, exc_tb):
    for plugin in self._plugins:
      plugin.destroy()
    # Returning False lets any raised exception propagate.
    return False

  def _filter_single_value(self, function_name, input_value):
    """
      Helper for filter style plugin functions: feeds the value through the
      hook `function_name` of each plugin in turn and stops with `None` as
      soon as one plugin returns `None`.
    """
    current = input_value
    for plugin in self._plugins:
      hook = getattr(plugin, function_name)
      current = hook(current)
      if current is None:
        return None
    return current

  def filter_binary_path(self, binary_path):
    """
      Ask the plugins to turn `binary_path` into a path that is
      suitable for symbolication.

      Returns `None` if symbolication should not be attempted for this
      binary.
    """
    return self._filter_single_value('filter_binary_path', binary_path)

  def filter_module_desc(self, module_desc):
    """
      Ask the plugins for the module description that should be
      used for symbolication.

      Returns `None` if symbolication should not be attempted for this module.
    """
    assert isinstance(module_desc, ModuleDesc)
    return self._filter_single_value('filter_module_desc', module_desc)
644
class AsanSymbolizerPlugIn(object):
  """
    Base class of all plugins: the set of hooks through which the
    `asan_symbolize.py` code talks to a plugin. Every hook has a
    do-nothing default.
  """
  @classmethod
  def get_name(cls):
    """
      Returns the plugin's name, which is the name of its class.
    """
    plugin_name = cls.__name__
    return plugin_name

  def _receive_proxy(self, proxy):
    # Called by the proxy when the plugin is added to it.
    assert isinstance(proxy, AsanSymbolizerPlugInProxy)
    self.proxy = proxy

  def register_cmdline_args(self, parser):
    """
      Hook for adding command line arguments that are later
      consumed in `process_cmdline_args()`.

      `parser` - Instance of `argparse.ArgumentParser`.
    """

  def process_cmdline_args(self, pargs):
    """
      Hook for handling parsed arguments. Implementations
      should not modify `pargs`.

      `pargs` - Instance of `argparse.Namespace` containing
      parsed command line arguments.

      Return `True` if the plug-in should be used, otherwise
      return `False`.
    """
    return True

  def destroy(self):
    """
      Hook called when a plugin is about to be destroyed.
      Implementations should free any allocated resources here.
    """

  # Symbolization hooks
  def filter_binary_path(self, binary_path):
    """
      Given a binary path return a binary path suitable for symbolication.
      The default returns the path unchanged.

      Implementations should return `None` if symbolication of this binary
      should be skipped.
    """
    return binary_path

  def filter_module_desc(self, module_desc):
    """
      Given a ModuleDesc object (`module_desc`) return
      a ModuleDesc suitable for symbolication. The default returns the
      object unchanged.

      Implementations should return `None` if symbolication of this binary
      should be skipped.
    """
    return module_desc
709
class ModuleDesc(object):
  """Description of one loaded module: name, arch, address range, path, UUID."""

  def __init__(self, name, arch, start_addr, end_addr, module_path, uuid):
    self.name = name
    self.arch = arch
    self.uuid = uuid
    self.start_addr = start_addr
    self.end_addr = end_addr
    # Module path from an ASan report.
    self.module_path = module_path
    # Path used for symbolization; initially the same as `module_path`.
    self.module_path_for_symbolization = module_path
    assert self.is_valid()

  def __str__(self):
    assert self.is_valid()
    if self.module_path == self.module_path_for_symbolization:
      shown_path = self.module_path
    else:
      # Show the symbolization path first, the reported path in parentheses.
      shown_path = '{} ({})'.format(self.module_path_for_symbolization, self.module_path)
    template = "{name} {arch} {start_addr:#016x}-{end_addr:#016x} {module_path} {uuid}"
    return template.format(
      name=self.name,
      arch=self.arch,
      start_addr=self.start_addr,
      end_addr=self.end_addr,
      module_path=shown_path,
      uuid=self.uuid
    )

  def is_valid(self):
    """Return True iff every field has the expected type and value.

    Strings are required for name, arch, uuid and both paths (which must be
    absolute); the addresses must be ints with 0 <= start_addr < end_addr.
    """
    return bool(
      isinstance(self.name, str) and
      isinstance(self.arch, str) and
      isinstance(self.start_addr, int) and
      self.start_addr >= 0 and
      isinstance(self.end_addr, int) and
      self.end_addr > self.start_addr and
      isinstance(self.module_path, str) and
      os.path.isabs(self.module_path) and
      isinstance(self.module_path_for_symbolization, str) and
      os.path.isabs(self.module_path_for_symbolization) and
      isinstance(self.uuid, str)
    )
758
class GetUUIDFromBinaryException(Exception):
  """Error type carrying a message about a failed UUID lookup for a binary."""

  def __init__(self, msg):
    Exception.__init__(self, msg)
762
# Maps (path_to_binary, arch) to the UUID found by an earlier lookup.
_get_uuid_from_binary_cache = dict()

def get_uuid_from_binary(path_to_binary, arch=None):
  """Return the LC_UUID of a binary as reported by `/usr/bin/otool -l`.

  Successful lookups are cached per (path, arch) pair.

  Args:
      path_to_binary: path of the binary to inspect.
      arch: optional architecture passed to otool as `-arch`.
  Returns:
      the UUID string.
  Raises:
      GetUUIDFromBinaryException: if the binary does not exist, or the otool
          output contains no (or a malformed) LC_UUID load command.
  """
  cache_key = (path_to_binary, arch)
  cached_value = _get_uuid_from_binary_cache.get(cache_key)
  if cached_value:
    return cached_value
  if not os.path.exists(path_to_binary):
    raise GetUUIDFromBinaryException('Binary "{}" does not exist'.format(path_to_binary))
  cmd = [ '/usr/bin/otool', '-l']
  if arch:
    cmd.extend(['-arch', arch])
  cmd.append(path_to_binary)
  output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
  # Look for this output:
  # cmd LC_UUID
  # cmdsize 24
  # uuid 4CA778FE-5BF9-3C45-AE59-7DF01B2BE83F
  if isinstance(output, str):
    output_str = output
  else:
    assert isinstance(output, bytes)
    output_str = output.decode()
  assert isinstance(output_str, str)
  lines = output_str.split('\n')
  uuid = None
  for index, line in enumerate(lines):
    stripped_line = line.strip()
    if not stripped_line.startswith('cmd LC_UUID'):
      continue
    # The uuid is expected two lines below the `cmd LC_UUID` line; report
    # truncated output as malformed instead of failing with IndexError.
    if index + 2 >= len(lines):
      raise GetUUIDFromBinaryException(
          'Malformed output: "{}" is not followed by a uuid line'.format(stripped_line))
    uuid_line = lines[index+2].strip()
    split_uuid_line = uuid_line.split()
    if not uuid_line.startswith('uuid') or len(split_uuid_line) < 2:
      raise GetUUIDFromBinaryException('Malformed output: "{}"'.format(uuid_line))
    uuid = split_uuid_line[1]
    break
  if uuid is None:
    logging.error('Failed to retrieve UUID from binary {}'.format(path_to_binary))
    logging.error('otool output was:\n{}'.format(output_str))
    raise GetUUIDFromBinaryException('Failed to retrieve UUID from binary "{}"'.format(path_to_binary))
  else:
    # Update cache
    _get_uuid_from_binary_cache[cache_key] = uuid
  return uuid
807
class ModuleMap(object):
  """
    Collection of `ModuleDesc` objects indexed by module name.

    Instances are normally built by `parse_from_file()` and queried through
    `get_module_path_for_symbolication()`.
  """
  def __init__(self):
    self._module_name_to_description_map = {}

  def add_module(self, desc):
    """Store `desc` under `desc.name`; a name may only be added once."""
    assert isinstance(desc, ModuleDesc)
    known_modules = self._module_name_to_description_map
    assert desc.name not in known_modules
    known_modules[desc.name] = desc

  def find_module_by_name(self, name):
    """Return the description stored under `name`, or None if there is none."""
    return self._module_name_to_description_map.get(name)

  def __str__(self):
    # One header line followed by one line per module, ordered by start address.
    ordered = sorted(
      self._module_name_to_description_map.values(),
      key=lambda desc: desc.start_addr)
    header = '{} modules:\n'.format(self.num_modules)
    return header + ''.join(str(desc) + '\n' for desc in ordered)

  @property
  def num_modules(self):
    """Number of module descriptions held."""
    return len(self._module_name_to_description_map)

  @property
  def modules(self):
    """The held module descriptions as a set."""
    return set(self._module_name_to_description_map.values())

  def get_module_path_for_symbolication(self, module_name, proxy, validate_uuid):
    """
      Return the binary path to symbolicate `module_name` with, or None.

      None is returned when the module is unknown, when
      `proxy.filter_module_desc()` returns None, when the UUID read from the
      binary differs from the recorded one, or when the UUID cannot be read.
      The UUID is only checked when `validate_uuid` is true.
    """
    desc = self.find_module_by_name(module_name)
    if desc is not None:
      # Allow a plug-in to change the module description to make it
      # suitable for symbolication or avoid symbolication altogether.
      desc = proxy.filter_module_desc(desc)
    if desc is None:
      return None
    if not validate_uuid:
      logging.warning('Skipping validation of UUID of {}'.format(desc.module_path_for_symbolization))
      return desc.module_path_for_symbolization
    logging.debug('Validating UUID of {}'.format(desc.module_path_for_symbolization))
    try:
      binary_uuid = get_uuid_from_binary(desc.module_path_for_symbolization, arch = desc.arch)
    except GetUUIDFromBinaryException as e:
      logging.error('Failed to get binary from UUID: %s', str(e))
      return None
    if binary_uuid != desc.uuid:
      logging.warning("Detected UUID mismatch {} != {}".format(binary_uuid, desc.uuid))
      # UUIDs don't match. Tell client to not symbolize this.
      return None
    return desc.module_path_for_symbolization

  @staticmethod
  def parse_from_file(module_map_path):
    """
      Build a ModuleMap from the text file at `module_map_path`.

      Lines before 'Process module map:' are ignored and parsing stops at
      'End of module map'. Returns None when the opening line never appears.
      Raises Exception when the file does not exist, or when a line after the
      opening line does not match the expected format (this includes
      reaching the end of the file before 'End of module map').
    """
    if not os.path.exists(module_map_path):
      raise Exception('module map "{}" does not exist'.format(module_map_path))

    # E.g.
    # 0x2db4000-0x102ddc000 /path/to (arm64) <0D6BBDE0-FF90-3680-899D-8E6F9528E04C>
    def hex_group(group_name):
      return r'0x(?P<' + group_name + r'>[0-9a-f]+)'

    entry_matcher = re.compile(
      r'^{}-{}\s+{}\s+{}\s+{}'.format(
        hex_group('start_addr'),
        hex_group('end_addr'),
        r'(?P<path>.+)',
        r'\((?P<arch>.+)\)',
        r'<(?P<uuid>[0-9A-Z-]+)>'
      )
    )

    with open(module_map_path, 'r') as f:
      module_map = None
      line_num = 0
      for line_num, line in enumerate(f, 1):
        if module_map is None:
          # Still looking for the start of the module map.
          if line.startswith('Process module map:'):
            module_map = ModuleMap()
          continue
        if line.startswith('End of module map'):
          break
        entry = entry_matcher.match(line)
        if not entry:
          raise Exception('Failed to parse line {} "{}"'.format(line_num, line))
        path = entry.group('path')
        module_map.add_module(ModuleDesc(
          name=os.path.basename(path),
          arch=entry.group('arch'),
          start_addr=int(entry.group('start_addr'), base=16),
          end_addr=int(entry.group('end_addr'), base=16),
          module_path=path,
          uuid=entry.group('uuid')
        ))
      else:
        # The file ran out without an 'End of module map' line. Once the map
        # has started this is reported as a parse failure of the (empty)
        # line that follows the last one read.
        if module_map is not None:
          raise Exception('Failed to parse line {} "{}"'.format(line_num + 1, ''))
      if module_map is not None:
        logging.debug('Loaded Module map from "{}":\n{}'.format(
          f.name,
          str(module_map))
        )
      return module_map
912
class SysRootFilterPlugIn(AsanSymbolizerPlugIn):
  """
    Built-in plug-in that prepends a sysroot path to every binary path
    it is asked to filter.
  """
  def __init__(self):
    # No prefix until `-s` has been processed.
    self.sysroot_path = ""

  def register_cmdline_args(self, parser):
    """Register the `-s SYSROOT` option on `parser`."""
    parser.add_argument(
      '-s',
      dest='sys_root',
      metavar='SYSROOT',
      help='set path to sysroot for sanitized binaries')

  def process_cmdline_args(self, pargs):
    """Remember the sysroot; return False when `-s` was not supplied."""
    sys_root = pargs.sys_root
    if sys_root is not None:
      self.sysroot_path = sys_root
      return True
    # Not being used so remove ourselves.
    return False

  def filter_binary_path(self, path):
    """Return `path` with the sysroot path placed in front of it."""
    prefix = self.sysroot_path
    return prefix + path
934
class ModuleMapPlugIn(AsanSymbolizerPlugIn):
  """
    Built-in plug-in that resolves binary paths through a module map
    file (see `--module-map`).
  """
  def __init__(self):
    # Populated by process_cmdline_args() when `--module-map` is given.
    self._module_map = None
    # Cleared by `--skip-uuid-validation`.
    self._uuid_validation = True

  def register_cmdline_args(self, parser):
    """Register `--module-map` and `--skip-uuid-validation` on `parser`."""
    # The trailing space keeps the two adjacent string literals from
    # running together in the rendered help text.
    parser.add_argument('--module-map',
                        help='Path to text file containing module map '
                        'output. See print_module_map ASan option.')
    parser.add_argument('--skip-uuid-validation',
                        default=False,
                        action='store_true',
                        help='Skips validating UUID of modules using otool.')

  def process_cmdline_args(self, pargs):
    """
      Load the module map named by `pargs.module_map`.

      Returns False when no module map was requested and True once the map
      has been loaded. Raises Exception when the file contains no module map.
    """
    if not pargs.module_map:
      return False
    # Read the path from the namespace passed in; do not rely on a
    # module-level `args` happening to exist.
    self._module_map = ModuleMap.parse_from_file(pargs.module_map)
    if self._module_map is None:
      msg = 'Failed to find module map'
      logging.error(msg)
      raise Exception(msg)
    self._uuid_validation = not pargs.skip_uuid_validation
    return True

  def filter_binary_path(self, binary_path):
    """
      Look `binary_path` up in the module map and return the result of
      `ModuleMap.get_module_path_for_symbolication()` for it.
    """
    if os.path.isabs(binary_path):
      # This is a binary path so transform into
      # a module name
      module_name = os.path.basename(binary_path)
    else:
      module_name = binary_path
    # NOTE(review): `self.proxy` is not assigned in this class; presumably it
    # is provided by the base class or the plug-in proxy - confirm.
    return self._module_map.get_module_path_for_symbolication(
      module_name,
      self.proxy,
      self._uuid_validation
    )
971
def add_logging_args(parser):
  """Register the `--log-dest` and `--log-level` options on `parser`."""
  level_names = ['debug', 'info', 'warning', 'error', 'critical']
  parser.add_argument(
    '--log-dest',
    help='Destination path for script logging (default stderr).',
    default=None)
  parser.add_argument(
    '--log-level',
    help='Log level for script (default: %(default)s).',
    default='info',
    choices=level_names)
982
def setup_logging():
  """
    Configure the `logging` module from the command line.

    Only `--log-dest` and `--log-level` are interpreted here; every other
    command line argument is returned untouched for later parsing.
  """
  # Set up a parser just for parsing the logging arguments.
  # This is necessary because logging should be configured before we
  # perform the main argument parsing.
  log_parser = argparse.ArgumentParser(add_help=False)
  add_logging_args(log_parser)
  pargs, unparsed_args = log_parser.parse_known_args()

  level = getattr(logging, pargs.log_level.upper())
  # Debug output additionally records where each message was emitted.
  detailed_format = '%(levelname)s: [%(funcName)s() %(filename)s:%(lineno)d] %(message)s'
  plain_format = '%(levelname)s: %(message)s'
  basic_config = {
    'level': level,
    'format': detailed_format if level == logging.DEBUG else plain_format,
  }
  destination = pargs.log_dest
  if destination:
    basic_config['filename'] = destination
  logging.basicConfig(**basic_config)
  shown_destination = 'stderr' if destination is None else destination
  logging.debug('Logging level set to "{}" and directing output to "{}"'.format(
    pargs.log_level,
    shown_destination)
  )
  return unparsed_args
1009
def add_load_plugin_args(parser):
  """Register the `-p`/`--plugins` option (one or more values) on `parser`."""
  option_settings = {
    'nargs': '+',
    'default': [],
    'help': 'Load plug-in',
  }
  parser.add_argument('-p', '--plugins', **option_settings)
1013
def setup_plugins(plugin_proxy, args):
  """
    Load the plug-ins requested on the command line and add the built-in ones.

    `args` is the list of command line arguments to scan for `-p`/`--plugins`;
    passing None makes argparse read `sys.argv[1:]` instead. Each named
    plug-in is loaded through `plugin_proxy.load_plugin_from_file()`, then
    `ModuleMapPlugIn` and `SysRootFilterPlugIn` are added.

    Returns the arguments that were not consumed by `-p`/`--plugins`.
  """
  parser = argparse.ArgumentParser(add_help=False)
  add_load_plugin_args(parser)
  # Parse the arguments handed in by the caller instead of silently
  # re-reading sys.argv and ignoring the parameter.
  pargs, unparsed_args = parser.parse_known_args(args)
  for plugin_path in pargs.plugins:
    plugin_proxy.load_plugin_from_file(plugin_path)
  # Add built-in plugins.
  plugin_proxy.add_plugin(ModuleMapPlugIn())
  plugin_proxy.add_plugin(SysRootFilterPlugIn())
  return unparsed_args
1024
# Script entry point: configure logging, set up plug-ins, parse the command
# line and run the symbolization loop over the log file.
# Everything below runs at module scope, so the plain assignments rebind the
# module-level settings declared near the top of the file
# (fix_filename_patterns, demangle, binutils_prefix, logfile,
# force_system_symbolizer).
if __name__ == '__main__':
  # Logging is configured first; the arguments it did not consume come back.
  remaining_args = setup_logging()
  with AsanSymbolizerPlugInProxy() as plugin_proxy:
    remaining_args = setup_plugins(plugin_proxy, remaining_args)
    # The module docstring is used as the help epilog.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='ASan symbolization script',
        epilog=__doc__)
    parser.add_argument('path_to_cut', nargs='*',
                        help='pattern to be cut from the result file path ')
    parser.add_argument('-d','--demangle', action='store_true',
                        help='demangle function names')
    parser.add_argument('-c', metavar='CROSS_COMPILE',
                        help='set prefix for binutils')
    parser.add_argument('-l','--logfile', default=sys.stdin,
                        type=argparse.FileType('r'),
                        help='set log file name to parse, default is stdin')
    parser.add_argument('--force-system-symbolizer', action='store_true',
                        help='don\'t use llvm-symbolizer')
    # Add logging arguments so that `--help` shows them.
    add_logging_args(parser)
    # Add load plugin arguments so that `--help` shows them.
    add_load_plugin_args(parser)
    # Let the plug-ins add their own options before the final parse, then
    # hand them the parsed result.
    plugin_proxy.register_cmdline_args(parser)
    args = parser.parse_args(remaining_args)
    plugin_proxy.process_cmdline_args(args)
    # Copy the parsed options into the module-level settings.
    if args.path_to_cut:
      fix_filename_patterns = args.path_to_cut
    if args.demangle:
      demangle = True
    if args.c:
      binutils_prefix = args.c
    # `--logfile` already defaults to sys.stdin, so the else branch is only
    # taken if args.logfile is falsy.
    if args.logfile:
      logfile = args.logfile
    else:
      logfile = sys.stdin
    if args.force_system_symbolizer:
      force_system_symbolizer = True
    # Forcing the system symbolizer requires that it is allowed at all.
    if force_system_symbolizer:
      assert(allow_system_symbolizer)
    loop = SymbolizationLoop(plugin_proxy)
    loop.process_logfile()
1067