# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Utilities to get and manipulate symbols from a binary."""

import collections
import logging
import os
import re
import subprocess
import sys

import cygprofile_utils

START_OF_TEXT_SYMBOL = 'linker_script_start_of_text'

_SRC_PATH = os.path.abspath(os.path.join(
    os.path.dirname(__file__), os.pardir, os.pardir))

# pylib lives under //build/android, so it has to be put on the path before it
# can be imported.
sys.path.insert(0, os.path.join(_SRC_PATH, 'build', 'android'))
from pylib.constants import host_paths

_MAX_WARNINGS_TO_PRINT = 200

SymbolInfo = collections.namedtuple('SymbolInfo', ('name', 'offset', 'size',
                                                   'section'))

# Unfortunate global variable :-/
_arch = 'arm'


def SetArchitecture(arch):
  """Set the architecture for binaries to be symbolized.

  Args:
    arch: architecture name, forwarded to host_paths.ToolPath() when locating
        objdump and c++filt.
  """
  global _arch
  _arch = arch


# Regular expression to match lines printed by 'objdump -t -w'. An example of
# such line looks like this:
# 018db2de l     F .text  00000060              .hidden _ZN8SkBitmapC2ERKS_
#
# The regex intentionally allows matching more than valid inputs. This gives
# more protection against potentially incorrectly silently ignoring unmatched
# input lines. Instead a few assertions early in _FromObjdumpLine() check the
# validity of a few parts matched as groups.
_OBJDUMP_LINE_RE = re.compile(r'''
  # The offset of the function, as hex.
  (?P<offset>^[0-9a-f]+)

  # The space character.
  [ ]

  # The 7 groups of flag characters, one character each.
  (
    (?P<assert_scope>.) # Global, local, unique local, etc.
    (?P<assert_weak_or_strong>.)
    (?P<assert_4spaces>.{4}) # Constructor, warning, indirect ref,
                             # debugger symbol.
    (?P<symbol_type>.) # Function, object, file or normal.
  )

  [ ]

  # The section name should start with ".text", can be ".text.foo". With LLD,
  # and especially LTO the traces of input sections are not preserved. Support
  # ".text.foo" for a little longer time because it is easy.
  (?P<section>.text[^0-9a-f]*)

  (?P<assert_tab> \s+)

  # The size of the symbol, as hex.
  (?P<size>[0-9a-f]+)

  # Normally separated out by 14 spaces, but some bits in ELF may theoretically
  # affect this length.
  (?P<assert_14spaces>[ ]+)

  # Hidden symbols should be treated as usual.
  (.hidden [ ])?

  # The symbol name.
  (?P<name>.*)

  $
  ''', re.VERBOSE)

# Symbol names accepted by _FromObjdumpLine(). Compiled once here rather than
# on every parsed line.
_SYMBOL_NAME_RE = re.compile('^[a-zA-Z0-9_.][a-zA-Z0-9_.$]*$')


def _FromObjdumpLine(line):
  """Create a SymbolInfo by parsing a properly formatted objdump output line.

  Args:
    line: line from objdump, without the trailing newline.

  Returns:
    An instance of SymbolInfo if the line represents a symbol, None otherwise
    (the line does not match _OBJDUMP_LINE_RE, or it describes a 'normal'
    symbol such as a function-local label).

  Raises:
    AssertionError: the line matched the regex but one of its fields has an
        unexpected value (flags, separators, a file symbol, or a symbol name
        with forbidden characters).
  """
  m = _OBJDUMP_LINE_RE.match(line)
  if not m:
    return None

  # A symbol can be (g)lobal, (l)ocal, or neither (a space). Per objdump's
  # manpage, "A symbol can be neither local or global for a variety of reasons".
  assert m.group('assert_scope') in ('g', 'l', ' '), line
  assert m.group('assert_weak_or_strong') in ('w', ' '), line
  assert m.group('assert_tab') == '\t', line
  assert m.group('assert_4spaces') == ' ' * 4, line
  assert m.group('assert_14spaces') == ' ' * 14, line
  name = m.group('name')
  offset = int(m.group('offset'), 16)

  # Output the label that contains the earliest offset. It is needed later for
  # translating offsets from the profile dumps.
  if name == START_OF_TEXT_SYMBOL:
    return SymbolInfo(name=name, offset=offset, section='.text', size=0)

  # Check symbol type for validity and ignore some types.
  # From objdump manual page: The symbol is the name of a function (F) or a file
  # (f) or an object (O) or just a normal symbol (a space). The 'normal' symbols
  # seen so far have been function-local labels.
  symbol_type = m.group('symbol_type')
  if symbol_type == ' ':
    # Ignore local goto labels. Unfortunately, v8 builtins (like 'Builtins_.*')
    # are indistinguishable from labels of size 0 other than by name.
    return None
  # Guard against file symbols, since they are normally not seen in the
  # binaries we parse.
  assert symbol_type != 'f', line

  # Extract the size from the ELF field. This value sometimes does not reflect
  # the real size of the function. One reason for that is the '.size' directive
  # in the assembler. As a result, a few functions in .S files have the size 0.
  # They are not instrumented (yet), but maintaining their order in the
  # orderfile may be important in some cases.
  size = int(m.group('size'), 16)

  # Forbid ARM mapping symbols and other unexpected symbol names, but allow $
  # characters in a non-initial position, which can appear as a component of a
  # mangled name, e.g. Clang can mangle a lambda function to:
  # 02cd61e0 l     F .text  000000c0 _ZZL11get_globalsvENK3$_1clEv
  # The equivalent objdump line from GCC is:
  # 0325c58c l     F .text  000000d0 _ZZL11get_globalsvENKUlvE_clEv
  #
  # Also disallow .internal and .protected symbols (as well as other flags),
  # those have not appeared in the binaries we parse. Rejecting these extra
  # prefixes is done by disallowing spaces in symbol names.
  assert _SYMBOL_NAME_RE.match(name), name

  return SymbolInfo(name=name, offset=offset, section=m.group('section'),
                    size=size)


def _SymbolInfosFromStream(objdump_lines):
  """Parses the output of objdump, and get all the symbols from a binary.

  Also logs a warning listing symbols that appear at more than one offset.

  Args:
    objdump_lines: An iterable of lines (text, optionally newline-terminated).

  Returns:
    A list of SymbolInfo, in input order.
  """
  name_to_offsets = collections.defaultdict(list)
  symbol_infos = []
  for line in objdump_lines:
    symbol_info = _FromObjdumpLine(line.rstrip('\n'))
    if symbol_info is not None:
      # On ARM the LLD linker inserts pseudo-functions (thunks) that allow
      # jumping distances farther than 16 MiB. Such thunks are known to often
      # reside on multiple offsets, they are not instrumented and hence they do
      # not reach the orderfiles. Exclude the thunk symbols from the warning.
      if not symbol_info.name.startswith('__ThumbV7PILongThunk_'):
        name_to_offsets[symbol_info.name].append(symbol_info.offset)
      symbol_infos.append(symbol_info)

  # Outlined functions are known to be repeated often, so ignore them in the
  # repeated symbol count.
  # This must be a real list: it is measured with len() and sliced below, which
  # a lazy filter() object (Python 3) does not support.
  repeated_symbols = [
      name for name, offsets in name_to_offsets.items()
      if len(offsets) > 1 and not name.startswith('OUTLINED_FUNCTION_')]
  if repeated_symbols:
    # Log the first 5 repeated offsets of the first 10 repeated symbols.
    logging.warning('%d symbols repeated with multiple offsets:\n %s',
                    len(repeated_symbols), '\n '.join(
                        '{} {}'.format(sym, ' '.join(
                            str(offset) for offset in name_to_offsets[sym][:5]))
                        for sym in repeated_symbols[:10]))

  return symbol_infos


def SymbolInfosFromBinary(binary_filename):
  """Runs objdump to get all the symbols from a binary.

  Args:
    binary_filename: path to the binary.

  Returns:
    A list of SymbolInfo from the binary.
  """
  command = (host_paths.ToolPath('objdump', _arch), '-t', '-w', binary_filename)
  # universal_newlines=True makes the pipe yield text lines, which is what the
  # str-based regexes require on Python 3; it is harmless on Python 2.
  p = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE,
                       universal_newlines=True)
  try:
    return _SymbolInfosFromStream(p.stdout)
  finally:
    p.stdout.close()
    p.wait()


_LLVM_NM_LINE_RE = re.compile(
    r'^[\-0-9a-f]{8,16}[ ](?P<symbol_type>.)[ ](?P<name>.*)$', re.VERBOSE)


def _SymbolInfosFromLlvmNm(lines):
  """Extracts all defined symbols names from llvm-nm output.

  Only defined (weak and regular) symbols are extracted.

  Args:
    lines: Iterable of lines.

  Returns:
    [str] A list of symbol names, can be empty.

  Raises:
    AssertionError: a line does not look like llvm-nm output.
  """
  symbol_names = []
  for line in lines:
    m = _LLVM_NM_LINE_RE.match(line)
    assert m is not None, line
    if m.group('symbol_type') not in ('t', 'T', 'w', 'W'):
      continue
    symbol_names.append(m.group('name'))
  return symbol_names


_NM_PATH = os.path.join(_SRC_PATH, 'third_party', 'llvm-build',
                        'Release+Asserts', 'bin', 'llvm-nm')


def CheckLlvmNmExists():
  """Asserts that the llvm-nm binary is present at its expected location."""
  assert os.path.exists(_NM_PATH), (
      'llvm-nm not found. Please run '
      '//tools/clang/scripts/update.py --package=objdump to install it.')


def SymbolNamesFromLlvmBitcodeFile(filename):
  """Extracts all defined symbols names from an LLVM bitcode file.

  Args:
    filename: (str) File to parse.

  Returns:
    [str] A list of symbol names, can be empty.

  Raises:
    AssertionError: llvm-nm exited with a non-zero status, or printed a line
        that could not be parsed.
  """
  command = (_NM_PATH, '-defined-only', filename)
  p = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE,
                       stderr=subprocess.PIPE, universal_newlines=True)
  # Drain both pipes together. Reading stdout line by line while stderr is
  # piped but never read could block forever once the stderr pipe fills up.
  stdout, stderr = p.communicate()
  assert p.returncode == 0, stderr
  result = _SymbolInfosFromLlvmNm(stdout.splitlines())
  if not result:
    file_size = os.stat(filename).st_size
    logging.warning('No symbols for %s (size %d)', filename, file_size)
  return result


def GroupSymbolInfosByOffset(symbol_infos):
  """Create a dict {offset: [symbol_info1, ...], ...}.

  As several symbols can be at the same offset, this is a 1-to-many
  relationship.

  Args:
    symbol_infos: iterable of SymbolInfo instances

  Returns:
    a dict {offset: [symbol_info1, ...], ...}
  """
  offset_to_symbol_infos = collections.defaultdict(list)
  for symbol_info in symbol_infos:
    offset_to_symbol_infos[symbol_info.offset].append(symbol_info)
  # Return a plain dict so that lookups of unknown offsets raise KeyError
  # instead of silently inserting empty lists.
  return dict(offset_to_symbol_infos)


def GroupSymbolInfosByName(symbol_infos):
  """Create a dict {name: [symbol_info1, ...], ...}.

  A symbol can have several offsets, this is a 1-to-many relationship.

  Args:
    symbol_infos: iterable of SymbolInfo instances

  Returns:
    a dict {name: [symbol_info1, ...], ...}
  """
  name_to_symbol_infos = collections.defaultdict(list)
  for symbol_info in symbol_infos:
    name_to_symbol_infos[symbol_info.name].append(symbol_info)
  # Return a plain dict so that lookups of unknown names raise KeyError
  # instead of silently inserting empty lists.
  return dict(name_to_symbol_infos)


def CreateNameToSymbolInfo(symbol_infos):
  """Create a dict {name: symbol_info, ...}.

  Args:
    symbol_infos: iterable of SymbolInfo instances

  Returns:
    a dict {name: symbol_info, ...}
    If a symbol name corresponds to more than one symbol_info, the symbol_info
    with the lowest offset is chosen.
  """
  # TODO(lizeb,pasko): move the functionality in this method into
  # check_orderfile.
  symbol_infos_by_name = {}
  warnings = cygprofile_utils.WarningCollector(_MAX_WARNINGS_TO_PRINT)
  # .values() instead of the Python 2-only .itervalues().
  for infos in GroupSymbolInfosByName(symbol_infos).values():
    first_symbol_info = min(infos, key=lambda x: x.offset)
    symbol_infos_by_name[first_symbol_info.name] = first_symbol_info
    if len(infos) > 1:
      warnings.Write('Symbol %s appears at %d offsets: %s' %
                     (first_symbol_info.name,
                      len(infos),
                      ','.join([hex(x.offset) for x in infos])))
  warnings.WriteEnd('symbols at multiple offsets.')
  return symbol_infos_by_name


def DemangleSymbol(mangled_symbol):
  """Return the demangled form of mangled_symbol.

  Args:
    mangled_symbol: (str) the mangled symbol name.

  Returns:
    The output of c++filt for the symbol, exactly as printed (the trailing
    newline is not stripped).
  """
  cmd = [host_paths.ToolPath('c++filt', _arch)]
  # Text-mode pipes so that a str can be written and read back on Python 3.
  process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                             universal_newlines=True)
  demangled_symbol, _ = process.communicate(mangled_symbol + '\n')
  return demangled_symbol