1# Copyright 2015 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""Utilities to get and manipulate symbols from a binary."""
6
7import collections
8import logging
9import os
10import re
11import subprocess
12import sys
13
14import cygprofile_utils
15
# Name of the symbol that marks the start of the text section.
# _FromObjdumpLine() reports it with section '.text' and size 0, whatever
# objdump printed for it.
START_OF_TEXT_SYMBOL = 'linker_script_start_of_text'

# Absolute path two directory levels above the directory holding this file.
_SRC_PATH = os.path.abspath(os.path.join(
    os.path.dirname(__file__), os.pardir, os.pardir))

# Put <_SRC_PATH>/build/android first on sys.path so that pylib is importable.
sys.path.insert(0, os.path.join(_SRC_PATH, 'build', 'android'))
from pylib.constants import host_paths

# Passed to cygprofile_utils.WarningCollector by CreateNameToSymbolInfo().
_MAX_WARNINGS_TO_PRINT = 200

# One symbol parsed from a binary: its name, its offset, its size and the name
# of the section it was listed in.
SymbolInfo = collections.namedtuple('SymbolInfo', ('name', 'offset', 'size',
                                                   'section'))

# Unfortunate global variable :-/
# Architecture handed to host_paths.ToolPath() when locating objdump and
# c++filt. Starts as 'arm'; SetArchitecture() overwrites it.
_arch = 'arm'
31
32
def SetArchitecture(arch):
  """Selects the architecture used when locating the symbolization tools.

  Args:
    arch: architecture identifier. It replaces the module-level _arch value,
        which is what gets passed to host_paths.ToolPath().
  """
  global _arch
  _arch = arch
37
38
# Regular expression matching the lines printed by 'objdump -t -w'. An example
# of such a line:
# 018db2de l     F .text  00000060              .hidden _ZN8SkBitmapC2ERKS_
#
# The pattern is compiled with re.VERBOSE, so whitespace outside of character
# classes and everything after a '#' inside the pattern is ignored.
#
# The regex intentionally accepts more than just valid inputs. This protects
# against silently ignoring input lines that were expected to match: the groups
# named 'assert_*' are checked by assertions early in _FromObjdumpLine()
# instead of being constrained here.
#
# NOTE(review): the 'section' group is any character followed by 'text' and
# then characters other than 0-9 and a-f (the dot is not escaped). A section
# suffix containing a lowercase hex digit -- including the 'f' of the
# '.text.foo' example given inside the pattern -- therefore fails to match and
# the line is skipped. The dot in '.hidden' is unescaped as well. Confirm
# whether this is intended before relying on '.text.<suffix>' support.
_OBJDUMP_LINE_RE = re.compile(r'''
  # The offset of the function, as hex.
  (?P<offset>^[0-9a-f]+)

  # The space character.
  [ ]

  # The 7 groups of flag characters, one character each.
  (
    (?P<assert_scope>.)           # Global, local, unique local, etc.
    (?P<assert_weak_or_strong>.)
    (?P<assert_4spaces>.{4})      # Constructor, warning, indirect ref,
                                  # debugger symbol.
    (?P<symbol_type>.)            # Function, object, file or normal.
  )

  [ ]

  # The section name should start with ".text", can be ".text.foo". With LLD,
  # and especially LTO the traces of input sections are not preserved. Support
  # ".text.foo" for a little longer time because it is easy.
  (?P<section>.text[^0-9a-f]*)

  (?P<assert_tab> \s+)

  # The size of the symbol, as hex.
  (?P<size>[0-9a-f]+)

  # Normally separated out by 14 spaces, but some bits in ELF may theoretically
  # affect this length.
  (?P<assert_14spaces>[ ]+)

  # Hidden symbols should be treated as usual.
  (.hidden [ ])?

  # The symbol name.
  (?P<name>.*)

  $
  ''', re.VERBOSE)
87
88
def _FromObjdumpLine(line):
  """Builds a SymbolInfo from a single line of 'objdump -t -w' output.

  Args:
    line: one line of objdump output.

  Returns:
    A SymbolInfo when the line describes a symbol to keep. None when the line
    does not match _OBJDUMP_LINE_RE, or when it describes a 'normal' (typeless)
    symbol other than START_OF_TEXT_SYMBOL.

  Raises:
    AssertionError: a matched line has unexpected flag characters or spacing,
      is a file symbol, or carries an unexpected symbol name.
  """
  parsed = _OBJDUMP_LINE_RE.match(line)
  if parsed is None:
    return None
  fields = parsed.groupdict()

  # A symbol can be (g)lobal, (l)ocal, or neither (a space). Per objdump's
  # manpage, "A symbol can be neither local or global for a variety of reasons".
  assert fields['assert_scope'] in ('g', 'l', ' '), line
  assert fields['assert_weak_or_strong'] in ('w', ' '), line
  assert fields['assert_tab'] == '\t', line
  assert fields['assert_4spaces'] == ' ' * 4, line
  assert fields['assert_14spaces'] == ' ' * 14, line

  symbol_name = fields['name']
  symbol_offset = int(fields['offset'], 16)

  # The label holding the earliest offset is always emitted: it is needed later
  # to translate offsets from the profile dumps.
  if symbol_name == START_OF_TEXT_SYMBOL:
    return SymbolInfo(name=symbol_name, offset=symbol_offset, section='.text',
                      size=0)

  # Per the objdump manual page a symbol is the name of a function (F), a file
  # (f), an object (O) or just a normal symbol (a space). The 'normal' symbols
  # seen so far have been function-local labels.
  kind = fields['symbol_type']
  if kind == ' ':
    # Skip local goto labels. Unfortunately, v8 builtins (like 'Builtins_.*')
    # cannot be told apart from labels of size 0 other than by name.
    return None
  # File symbols are normally absent from the binaries parsed here, so treat
  # one as an error.
  assert kind != 'f', line

  # The ELF size field does not always reflect the real size of the function,
  # for instance because of the assembler's '.size' directive: a few functions
  # in .S files end up with size 0. They are not instrumented (yet), but
  # keeping their order in the orderfile may matter in some cases.
  symbol_size = int(fields['size'], 16)

  # Reject ARM mapping symbols and other unexpected names, while still allowing
  # '$' in a non-initial position: it can be part of a mangled name, e.g. Clang
  # can mangle a lambda to _ZZL11get_globalsvENK3$_1clEv where GCC produces
  # _ZZL11get_globalsvENKUlvE_clEv.
  #
  # .internal and .protected symbols (and other flags) have not appeared in the
  # binaries parsed here; those extra prefixes are rejected by disallowing
  # spaces in symbol names.
  assert re.match('^[a-zA-Z0-9_.][a-zA-Z0-9_.$]*$', symbol_name), symbol_name

  return SymbolInfo(name=symbol_name, offset=symbol_offset,
                    section=fields['section'], size=symbol_size)
151
152
def _SymbolInfosFromStream(objdump_lines):
  """Parses the output of objdump and collects all the symbols of a binary.

  Symbol names that appear at more than one offset are reported through a
  single logging.warning() call; the returned list is not affected by it.

  Args:
    objdump_lines: An iterable of text lines, as printed by objdump. A trailing
        newline on each line is stripped before parsing.

  Returns:
    A list of SymbolInfo, in input order. Lines for which _FromObjdumpLine()
    returns None are left out.
  """
  name_to_offsets = collections.defaultdict(list)
  symbol_infos = []
  for line in objdump_lines:
    symbol_info = _FromObjdumpLine(line.rstrip('\n'))
    if symbol_info is None:
      continue
    # On ARM the LLD linker inserts pseudo-functions (thunks) that allow
    # jumping distances farther than 16 MiB. Such thunks are known to often
    # reside on multiple offsets, they are not instrumented and hence they do
    # not reach the orderfiles. Exclude the thunk symbols from the warning.
    if not symbol_info.name.startswith('__ThumbV7PILongThunk_'):
      name_to_offsets[symbol_info.name].append(symbol_info.offset)
    symbol_infos.append(symbol_info)

  # Outlined functions are known to be repeated often, so ignore them in the
  # repeated symbol count. This has to be a real list (not a lazy filter
  # object) because it is tested for emptiness, measured and sliced below.
  repeated_symbols = [
      name for name, offsets in name_to_offsets.items()
      if len(offsets) > 1 and not name.startswith('OUTLINED_FUNCTION_')]
  if repeated_symbols:
    # Log the first 5 repeated offsets of the first 10 repeated symbols.
    logging.warning('%d symbols repeated with multiple offsets:\n %s',
                    len(repeated_symbols), '\n '.join(
                        '{} {}'.format(sym, ' '.join(
                            str(offset) for offset in name_to_offsets[sym][:5]))
                        for sym in repeated_symbols[:10]))

  return symbol_infos
189
190
def SymbolInfosFromBinary(binary_filename):
  """Runs objdump to get all the symbols from a binary.

  The objdump executable is located with host_paths.ToolPath() for the
  architecture currently stored in _arch. Its exit status is not checked.

  Args:
    binary_filename: path to the binary.

  Returns:
    A list of SymbolInfo from the binary.
  """
  command = (host_paths.ToolPath('objdump', _arch), '-t', '-w', binary_filename)
  # universal_newlines=True opens the pipe in text mode, so the parser receives
  # str lines under Python 3 as well (a binary pipe would yield bytes, on which
  # line.rstrip('\n') fails).
  p = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE,
                       universal_newlines=True)
  try:
    return _SymbolInfosFromStream(p.stdout)
  finally:
    # Always release the pipe and reap the child, even if parsing raised.
    p.stdout.close()
    p.wait()
208
209
# One line of llvm-nm output: 8 to 16 characters that are lowercase hex digits
# or dashes, a space, a one-character symbol type, a space, then the name.
_LLVM_NM_LINE_RE = re.compile(
    r'^[\-0-9a-f]{8,16}[ ](?P<symbol_type>.)[ ](?P<name>.*)$', re.VERBOSE)


def _SymbolInfosFromLlvmNm(lines):
  """Collects the names of the defined text symbols listed by llvm-nm.

  Only lines whose symbol type is one of 't', 'T', 'w' or 'W' contribute a
  name. Despite the function name, plain strings are returned, not SymbolInfo
  instances.

  Args:
    lines: Iterable of llvm-nm output lines.

  Returns:
    [str] The selected symbol names in input order, can be empty.

  Raises:
    AssertionError: a line does not match _LLVM_NM_LINE_RE.
  """
  kept_types = ('t', 'T', 'w', 'W')
  names = []
  for nm_line in lines:
    parsed = _LLVM_NM_LINE_RE.match(nm_line)
    assert parsed is not None, nm_line
    if parsed.group('symbol_type') in kept_types:
      names.append(parsed.group('name'))
  return names
233
234
# Location of the llvm-nm executable below _SRC_PATH.
_NM_PATH = os.path.join(
    _SRC_PATH, 'third_party', 'llvm-build', 'Release+Asserts', 'bin',
    'llvm-nm')


def CheckLlvmNmExists():
  """Asserts that the llvm-nm executable is present at _NM_PATH.

  Raises:
    AssertionError: nothing exists at _NM_PATH; the message explains how to
      install the tool.
  """
  nm_is_present = os.path.exists(_NM_PATH)
  assert nm_is_present, (
      'llvm-nm not found. Please run '
      '//tools/clang/scripts/update.py --package=objdump to install it.')
244
def SymbolNamesFromLlvmBitcodeFile(filename):
  """Extracts all defined symbols names from an LLVM bitcode file.

  Runs the llvm-nm executable at _NM_PATH with '-defined-only' on the file and
  parses its standard output. A warning is logged when no symbol is found.

  Args:
    filename: (str) File to parse.

  Returns:
    [str] A list of symbol names, can be empty.

  Raises:
    AssertionError: llvm-nm exited with a non-zero status, or printed a line
      that _SymbolInfosFromLlvmNm() cannot parse.
  """
  command = (_NM_PATH, '-defined-only', filename)
  # Text mode keeps the output as str under Python 3, which the str regex used
  # for parsing requires.
  p = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE,
                       stderr=subprocess.PIPE, universal_newlines=True)
  # communicate() drains stdout and stderr together. Reading stdout alone while
  # stderr is also a pipe can deadlock once the child fills the stderr pipe.
  stdout, stderr = p.communicate()
  assert p.returncode == 0, 'llvm-nm failed on %s: %s' % (filename, stderr)

  result = _SymbolInfosFromLlvmNm(stdout.splitlines())
  if not result:
    file_size = os.stat(filename).st_size
    logging.warning('No symbols for %s (size %d)', filename, file_size)
  return result
267
268
def GroupSymbolInfosByOffset(symbol_infos):
  """Buckets symbol infos by their offset.

  Several symbols can share one offset, so every offset maps to a list that
  keeps the symbols in the order they were encountered.

  Args:
    symbol_infos: iterable of SymbolInfo instances

  Returns:
    a plain dict {offset: [symbol_info1, ...], ...}
  """
  grouped = {}
  for info in symbol_infos:
    grouped.setdefault(info.offset, []).append(info)
  return grouped
285
286
def GroupSymbolInfosByName(symbol_infos):
  """Buckets symbol infos by their name.

  One name can show up at several offsets, so every name maps to a list that
  keeps the symbols in the order they were encountered.

  Args:
    symbol_infos: iterable of SymbolInfo instances

  Returns:
    a plain dict {name: [symbol_info1, ...], ...}
  """
  grouped = {}
  for info in symbol_infos:
    grouped.setdefault(info.name, []).append(info)
  return grouped
302
303
def CreateNameToSymbolInfo(symbol_infos):
  """Create a dict {name: symbol_info, ...}.

  Every name that occurs more than once is reported through a
  cygprofile_utils.WarningCollector created with _MAX_WARNINGS_TO_PRINT.

  Args:
    symbol_infos: iterable of SymbolInfo instances

  Returns:
    a dict {name: symbol_info, ...}
    If a symbol name corresponds to more than one symbol_info, the symbol_info
    with the lowest offset is chosen.
  """
  # TODO(lizeb,pasko): move the functionality in this method into
  # check_orderfile.
  symbol_infos_by_name = {}
  warnings = cygprofile_utils.WarningCollector(_MAX_WARNINGS_TO_PRINT)
  # .values() exists on both Python 2 and Python 3 dicts, whereas .itervalues()
  # is Python 2 only.
  for infos in GroupSymbolInfosByName(symbol_infos).values():
    first_symbol_info = min(infos, key=lambda x: x.offset)
    symbol_infos_by_name[first_symbol_info.name] = first_symbol_info
    if len(infos) > 1:
      warnings.Write('Symbol %s appears at %d offsets: %s' %
                     (first_symbol_info.name,
                      len(infos),
                      ','.join([hex(x.offset) for x in infos])))
  warnings.WriteEnd('symbols at multiple offsets.')
  return symbol_infos_by_name
329
330
def DemangleSymbol(mangled_symbol):
  """Return the demangled form of mangled_symbol.

  Starts the c++filt tool located by host_paths.ToolPath() for the architecture
  stored in _arch, writes the symbol followed by a newline to its standard
  input, and reads the answer from its standard output.

  Args:
    mangled_symbol: (str) the mangled symbol name.

  Returns:
    (str) Everything c++filt wrote to its standard output, unmodified (it is
    not stripped here).
  """
  cmd = [host_paths.ToolPath('c++filt', _arch)]
  # universal_newlines=True puts the pipes in text mode: under Python 3 a str
  # written to a binary pipe raises TypeError and the result would be bytes.
  process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                             universal_newlines=True)
  demangled_symbol, _ = process.communicate(mangled_symbol + '\n')
  return demangled_symbol
337