tools/cygprofile/symbol_extractor.py

# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Utilities to get and manipulate symbols from a binary."""

import collections
import logging
import os
import re
import subprocess
import sys

import cygprofile_utils

START_OF_TEXT_SYMBOL = 'linker_script_start_of_text'

_SRC_PATH = os.path.abspath(os.path.join(
    os.path.dirname(__file__), os.pardir, os.pardir))

sys.path.insert(0, os.path.join(_SRC_PATH, 'build', 'android'))
from pylib.constants import host_paths

_MAX_WARNINGS_TO_PRINT = 200

SymbolInfo = collections.namedtuple('SymbolInfo', ('name', 'offset', 'size',
                                                   'section'))

# Unfortunate global variable :-/
_arch = 'arm'


def SetArchitecture(arch):
  """Set the architecture for binaries to be symbolized."""
  global _arch
  _arch = arch


# Regular expression to match lines printed by 'objdump -t -w'. An example of
# such line looks like this:
# 018db2de l     F .text  00000060              .hidden _ZN8SkBitmapC2ERKS_
#
# The regex intentionally allows matching more than valid inputs. This gives
# more protection against potentially incorrectly silently ignoring unmatched
# input lines. Instead a few assertions early in _FromObjdumpLine() check the
# validity of a few parts matched as groups.
_OBJDUMP_LINE_RE = re.compile(r'''
  # The offset of the function, as hex.
  (?P<offset>^[0-9a-f]+)

  # The space character.
  [ ]

  # The 7 groups of flag characters, one character each.
  (
    (?P<assert_scope>.)           # Global, local, unique local, etc.
    (?P<assert_weak_or_strong>.)
    (?P<assert_4spaces>.{4})      # Constructor, warning, indirect ref,
                                  # debugger symbol.
    (?P<symbol_type>.)            # Function, object, file or normal.
  )

  [ ]

  # The section name should start with ".text", can be ".text.foo". With LLD,
  # and especially LTO the traces of input sections are not preserved. Support
  # ".text.foo" for a little longer time because it is easy.
  (?P<section>.text[^0-9a-f]*)

  (?P<assert_tab> \s+)

  # The size of the symbol, as hex.
  (?P<size>[0-9a-f]+)

  # Normally separated out by 14 spaces, but some bits in ELF may theoretically
  # affect this length.
  (?P<assert_14spaces>[ ]+)

  # Hidden symbols should be treated as usual.
  (.hidden [ ])?

  # The symbol name.
  (?P<name>.*)

  $
  ''', re.VERBOSE)


def _FromObjdumpLine(line):
  """Create a SymbolInfo by parsing a properly formatted objdump output line.

  Args:
    line: line from objdump

  Returns:
    An instance of SymbolInfo if the line represents a symbol, None otherwise.
  """
  m = _OBJDUMP_LINE_RE.match(line)
  if not m:
    return None

  # A symbol can be (g)lobal, (l)ocal, or neither (a space). Per objdump's
  # manpage, "A symbol can be neither local or global for a variety of reasons".
  assert m.group('assert_scope') in set(['g', 'l', ' ']), line
  assert m.group('assert_weak_or_strong') in set(['w', ' ']), line
  assert m.group('assert_tab') == '\t', line
  assert m.group('assert_4spaces') == ' ' * 4, line
  assert m.group('assert_14spaces') == ' ' * 14, line
  name = m.group('name')
  offset = int(m.group('offset'), 16)

  # Output the label that contains the earliest offset. It is needed later for
  # translating offsets from the profile dumps.
  if name == START_OF_TEXT_SYMBOL:
    return SymbolInfo(name=name, offset=offset, section='.text', size=0)

  # Check symbol type for validity and ignore some types.
  # From objdump manual page: The symbol is the name of a function (F) or a file
  # (f) or an object (O) or just a normal symbol (a space). The 'normal' symbols
  # seens so far has been function-local labels.
  symbol_type = m.group('symbol_type')
  if symbol_type == ' ':
    # Ignore local goto labels. Unfortunately, v8 builtins (like 'Builtins_.*')
    # are indistinguishable from labels of size 0 other than by name.
    return None
  # Guard against file symbols, since they are normally not seen in the
  # binaries we parse.
  assert symbol_type != 'f', line

  # Extract the size from the ELF field. This value sometimes does not reflect
  # the real size of the function. One reason for that is the '.size' directive
  # in the assembler. As a result, a few functions in .S files have the size 0.
  # They are not instrumented (yet), but maintaining their order in the
  # orderfile may be important in some cases.
  size = int(m.group('size'), 16)

  # Forbid ARM mapping symbols and other unexpected symbol names, but allow $
  # characters in a non-initial position, which can appear as a component of a
  # mangled name, e.g. Clang can mangle a lambda function to:
  # 02cd61e0 l     F .text  000000c0 _ZZL11get_globalsvENK3$_1clEv
  # The equivalent objdump line from GCC is:
  # 0325c58c l     F .text  000000d0 _ZZL11get_globalsvENKUlvE_clEv
  #
  # Also disallow .internal and .protected symbols (as well as other flags),
  # those have not appeared in the binaries we parse. Rejecting these extra
  # prefixes is done by disallowing spaces in symbol names.
  assert re.match('^[a-zA-Z0-9_.][a-zA-Z0-9_.$]*$', name), name

  return SymbolInfo(name=name, offset=offset, section=m.group('section'),
                    size=size)


def _SymbolInfosFromStream(objdump_lines):
  """Parses the output of objdump, and get all the symbols from a binary.

  Args:
    objdump_lines: An iterable of lines

  Returns:
    A list of SymbolInfo.
  """
  name_to_offsets = collections.defaultdict(list)
  symbol_infos = []
  for line in objdump_lines:
    symbol_info = _FromObjdumpLine(line.rstrip('\n'))
    if symbol_info is not None:
      # On ARM the LLD linker inserts pseudo-functions (thunks) that allow
      # jumping distances farther than 16 MiB. Such thunks are known to often
      # reside on multiple offsets, they are not instrumented and hence they do
      # not reach the orderfiles. Exclude the thunk symbols from the warning.
      if not symbol_info.name.startswith('__ThumbV7PILongThunk_'):
        name_to_offsets[symbol_info.name].append(symbol_info.offset)
      symbol_infos.append(symbol_info)

  # Outlined functions are known to be repeated often, so ignore them in the
  # repeated symbol count.
  repeated_symbols = filter(lambda s: len(name_to_offsets[s]) > 1,
                            (k for k in name_to_offsets.keys()
                             if not k.startswith('OUTLINED_FUNCTION_')))
  if repeated_symbols:
    # Log the first 5 repeated offsets of the first 10 repeated symbols.
    logging.warning('%d symbols repeated with multiple offsets:\n %s',
                    len(repeated_symbols), '\n '.join(
                        '{} {}'.format(sym, ' '.join(
                            str(offset) for offset in name_to_offsets[sym][:5]))
                        for sym in repeated_symbols[:10]))

  return symbol_infos


def SymbolInfosFromBinary(binary_filename):
  """Runs objdump to get all the symbols from a binary.

  Args:
    binary_filename: path to the binary.

  Returns:
    A list of SymbolInfo from the binary.
  """
  command = (host_paths.ToolPath('objdump', _arch), '-t', '-w', binary_filename)
  p = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE)
  try:
    result = _SymbolInfosFromStream(p.stdout)
    return result
  finally:
    p.stdout.close()
    p.wait()


_LLVM_NM_LINE_RE = re.compile(
    r'^[\-0-9a-f]{8,16}[ ](?P<symbol_type>.)[ ](?P<name>.*)$', re.VERBOSE)


def _SymbolInfosFromLlvmNm(lines):
  """Extracts all defined symbols names from llvm-nm output.

  Only defined (weak and regular) symbols are extracted.

  Args:
    lines: Iterable of lines.

  Returns:
    [str] A list of symbol names, can be empty.
  """
  symbol_names = []
  for line in lines:
    m = _LLVM_NM_LINE_RE.match(line)
    assert m is not None, line
    if m.group('symbol_type') not in ['t', 'T', 'w', 'W']:
      continue
    symbol_names.append(m.group('name'))
  return symbol_names


_NM_PATH = os.path.join(_SRC_PATH, 'third_party', 'llvm-build',
                        'Release+Asserts', 'bin', 'llvm-nm')


def CheckLlvmNmExists():
  assert os.path.exists(_NM_PATH), (
      'llvm-nm not found. Please run '
      '//tools/clang/scripts/update.py --package=objdump to install it.')


def SymbolNamesFromLlvmBitcodeFile(filename):
  """Extracts all defined symbols names from an LLVM bitcode file.

  Args:
    filename: (str) File to parse.

  Returns:
    [str] A list of symbol names, can be empty.
  """
  command = (_NM_PATH, '-defined-only', filename)
  p = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE,
                       stderr=subprocess.PIPE)
  try:
    result = _SymbolInfosFromLlvmNm(p.stdout)
    if not result:
      file_size = os.stat(filename).st_size
      logging.warning('No symbols for %s (size %d)', filename, file_size)
    return result
  finally:
    _, _ = p.communicate()
    p.stdout.close()
    assert p.wait() == 0


def GroupSymbolInfosByOffset(symbol_infos):
  """Create a dict {offset: [symbol_info1, ...], ...}.

  As several symbols can be at the same offset, this is a 1-to-many
  relationship.

  Args:
    symbol_infos: iterable of SymbolInfo instances

  Returns:
    a dict {offset: [symbol_info1, ...], ...}
  """
  offset_to_symbol_infos = collections.defaultdict(list)
  for symbol_info in symbol_infos:
    offset_to_symbol_infos[symbol_info.offset].append(symbol_info)
  return dict(offset_to_symbol_infos)


def GroupSymbolInfosByName(symbol_infos):
  """Create a dict {name: [symbol_info1, ...], ...}.

  A symbol can have several offsets, this is a 1-to-many relationship.

  Args:
    symbol_infos: iterable of SymbolInfo instances

  Returns:
    a dict {name: [symbol_info1, ...], ...}
  """
  name_to_symbol_infos = collections.defaultdict(list)
  for symbol_info in symbol_infos:
    name_to_symbol_infos[symbol_info.name].append(symbol_info)
  return dict(name_to_symbol_infos)


def CreateNameToSymbolInfo(symbol_infos):
  """Create a dict {name: symbol_info, ...}.

  Args:
    symbol_infos: iterable of SymbolInfo instances

  Returns:
    a dict {name: symbol_info, ...}
    If a symbol name corresponds to more than one symbol_info, the symbol_info
    with the lowest offset is chosen.
  """
  # TODO(lizeb,pasko): move the functionality in this method into
  # check_orderfile.
  symbol_infos_by_name = {}
  warnings = cygprofile_utils.WarningCollector(_MAX_WARNINGS_TO_PRINT)
  for infos in GroupSymbolInfosByName(symbol_infos).itervalues():
    first_symbol_info = min(infos, key=lambda x: x.offset)
    symbol_infos_by_name[first_symbol_info.name] = first_symbol_info
    if len(infos) > 1:
      warnings.Write('Symbol %s appears at %d offsets: %s' %
                     (first_symbol_info.name,
                      len(infos),
                      ','.join([hex(x.offset) for x in infos])))
  warnings.WriteEnd('symbols at multiple offsets.')
  return symbol_infos_by_name


def DemangleSymbol(mangled_symbol):
  """Return the demangled form of mangled_symbol."""
  cmd = [host_paths.ToolPath('c++filt', _arch)]
  process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
  demangled_symbol, _ = process.communicate(mangled_symbol + '\n')
  return demangled_symbol