#!/usr/bin/env python3
#===- lib/hwasan/scripts/hwasan_symbolize ----------------------------------===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#
#
# HWAddressSanitizer offline symbolization script.
#
#===------------------------------------------------------------------------===#

from __future__ import print_function
from __future__ import unicode_literals

import argparse
import glob
import html
import json
import mmap
import os
import re
import struct
import subprocess
import sys

if sys.version_info.major < 3:
    # Simulate Python 3.x behaviour of defaulting to UTF-8 for print. This is
    # important in case any symbols are non-ASCII.
    import codecs
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout)

# Below, a parser for a subset of ELF. It only supports 64 bit, little-endian,
# and only parses what is necessary to find the build ids. It uses a memoryview
# into an mmap to avoid copying.
# ELF64 file header: total size and the byte offsets of the fields read below.
Ehdr_size = 64
e_shnum_offset = 60
e_shoff_offset = 40

# ELF64 section header: total size and the byte offsets of the fields read
# below.
Shdr_size = 64
sh_type_offset = 4
sh_offset_offset = 24
sh_size_offset = 32
SHT_NOTE = 7

# Note header: three little-endian 32-bit words (namesz, descsz, type).
Nhdr_size = 12
NT_GNU_BUILD_ID = 3


def align_up(size, alignment):
    """Round `size` up to the next multiple of `alignment`.

    The bit-mask form is only correct when `alignment` is a power of two.
    """
    return (size + alignment - 1) & ~(alignment - 1)


def handle_Nhdr(mv, sh_size):
    """Scan the notes of one note section for a GNU build id.

    Args:
      mv: bytes-like view (memoryview) starting at the first note header.
      sh_size: size in bytes of the note section.

    Returns:
      The build id as a lowercase hex string, or None if the section holds
      no NT_GNU_BUILD_ID note whose name is "GNU".
    """
    offset = 0
    # Only unpack while a complete note header still fits in the section;
    # trailing bytes shorter than a header are treated as the end of the notes
    # instead of making struct.unpack_from raise.
    while offset + Nhdr_size <= sh_size:
        n_namesz, n_descsz, n_type = struct.unpack_from('<III', mv, offset)
        name_start = offset + Nhdr_size
        if (n_type == NT_GNU_BUILD_ID and n_namesz == 4 and
                mv[name_start: name_start + 4] == b"GNU\x00"):
            desc_start = name_start + 4
            return mv[desc_start: desc_start + n_descsz].hex()
        # Name and descriptor are each padded to a 4-byte boundary.
        offset += Nhdr_size + align_up(n_namesz, 4) + align_up(n_descsz, 4)
    return None


def handle_Shdr(mv):
    """Decode one section header.

    Args:
      mv: bytes-like view of a single section header.

    Returns:
      (sh_offset, sh_size) when the section type is SHT_NOTE, otherwise
      (None, None).
    """
    sh_type, = struct.unpack_from('<I', mv, sh_type_offset)
    if sh_type != SHT_NOTE:
        return None, None
    sh_offset, = struct.unpack_from('<Q', mv, sh_offset_offset)
    sh_size, = struct.unpack_from('<Q', mv, sh_size_offset)
    return sh_offset, sh_size


def handle_elf(mv):
    """Return the GNU build id found in an ELF image, or None.

    Args:
      mv: bytes-like view of the whole file (at least Ehdr_size bytes).

    Returns:
      The build id as a hex string, or None when the file is not a 64-bit
      little-endian ELF or no note section carries a build id.
    """
    # \x02 is ELFCLASS64, \x01 is ELFDATA2LSB. HWASan currently only works on
    # 64-bit little endian platforms (x86_64 and ARM64). If this changes, we will
    # have to extend the parsing code.
    if mv[:6] != b'\x7fELF\x02\x01':
        return None
    e_shnum, = struct.unpack_from('<H', mv, e_shnum_offset)
    e_shoff, = struct.unpack_from('<Q', mv, e_shoff_offset)
    for i in range(e_shnum):
        start = e_shoff + i * Shdr_size
        sh_offset, sh_size = handle_Shdr(mv[start: start + Shdr_size])
        if sh_offset is None:
            continue
        note_hdr = mv[sh_offset: sh_offset + sh_size]
        result = handle_Nhdr(note_hdr, sh_size)
        if result is not None:
            return result
    return None


def get_buildid(filename):
    """Return the GNU build id of `filename` as a hex string, or None.

    Files smaller than an ELF header yield None without being mapped. Errors
    from opening, mapping or parsing the file propagate to the caller.
    """
    # The content is binary and is only accessed through mmap, so open the file
    # in binary mode rather than text mode.
    with open(filename, "rb") as fd:
        if os.fstat(fd.fileno()).st_size < Ehdr_size:
            return None
        with mmap.mmap(fd.fileno(), 0, access=mmap.ACCESS_READ) as m:
            with memoryview(m) as mv:
                return handle_elf(mv)


class Symbolizer:
    """Drives a symbolizer subprocess and rewrites HWASan report lines.

    The subprocess is started lazily from `path` and is sent "CODE" and
    "FRAME" queries on stdin; each response is read up to the first empty line.
    """

    def __init__(self, path, binary_prefixes, paths_to_cut):
        """Store the configuration; the subprocess is not started here.

        Args:
          path: path of the symbolizer executable.
          binary_prefixes: directories searched for binaries with symbols.
          paths_to_cut: patterns removed from the front of source paths.
        """
        self.__pipe = None
        self.__path = path
        self.__binary_prefixes = binary_prefixes
        self.__paths_to_cut = paths_to_cut
        self.__log = False
        # Binary names already reported as missing, so each is warned once.
        self.__warnings = set()
        # Build id (hex string) -> file path, filled by build_index().
        self.__index = {}
        # (prefix, link template) pairs, filled by read_linkify().
        self.__link_prefixes = []
        self.__html = False
        self.__last_access_address = None
        self.__last_access_tag = None

    def enable_html(self, enable):
        """Turn HTML output (escaping, <br/>, links) on or off."""
        self.__html = enable

    def enable_logging(self, enable):
        """Turn logging of symbolizer traffic to stderr on or off."""
        self.__log = enable

    def maybe_escape(self, text):
        """Return `text` prepared for output: unchanged, or HTML-escaped.

        In HTML mode every leading space becomes `&nbsp;` and the remainder is
        passed through html.escape.
        """
        if not self.__html:
            return text
        # We need to manually use &nbsp; for leading spaces, html.escape does
        # not do that, and HTML ignores them.
        stripped = text.lstrip(' ')
        spaces = len(text) - len(stripped)
        return spaces * '&nbsp;' + html.escape(stripped)

    def print(self, line, escape=True):
        """Print one output line, appending <br/> in HTML mode.

        Args:
          line: text to print.
          escape: when True the line first goes through maybe_escape(); pass
            False for text that is already escaped or contains markup.
        """
        if escape:
            line = self.maybe_escape(line)
        if self.__html:
            line += '<br/>'
        print(line)

    def read_linkify(self, filename):
        """Load link templates from a JSON file.

        The file must hold a list of objects with "prefix" and "link" keys.
        """
        with open(filename, 'r') as fd:
            data = json.load(fd)
        self.__link_prefixes = [(e["prefix"], e["link"]) for e in data]

    def __open_pipe(self):
        """Start the symbolizer subprocess if it is not running yet."""
        if not self.__pipe:
            opt = {}
            if sys.version_info.major > 2:
                opt['encoding'] = 'utf-8'
            self.__pipe = subprocess.Popen(
                [self.__path, "--inlining", "--functions"],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                **opt)

    class __EOF(Exception):
        """Raised by __read() when the symbolizer returns an empty line."""
        pass

    def __write(self, s):
        """Send one line to the symbolizer and flush it."""
        print(s, file=self.__pipe.stdin)
        self.__pipe.stdin.flush()
        if self.__log:
            print("#>>  |%s|" % (s,), file=sys.stderr)

    def __read(self):
        """Read one line from the symbolizer, stripped of trailing whitespace.

        Raises:
          Symbolizer.__EOF: the stripped line is empty, which ends a response.
        """
        s = self.__pipe.stdout.readline().rstrip()
        if self.__log:
            print("# << |%s|" % (s,), file=sys.stderr)
        if s == '':
            raise Symbolizer.__EOF
        return s

    def __process_source_path(self, file_name):
        """Shorten a "file:line" string for display.

        Removes everything up to and including each configured path to cut,
        then collapses sanitizer runtime sources and crtstuff.c into fixed
        placeholders.
        """
        # NOTE(review): path_to_cut is used as a regular expression, so regex
        # metacharacters in a path are interpreted rather than matched
        # literally.
        for path_to_cut in self.__paths_to_cut:
            file_name = re.sub(".*" + path_to_cut, "", file_name)
        file_name = re.sub(".*hwasan_[a-z_]*.(cc|h):[0-9]*", "[hwasan_rtl]", file_name)
        file_name = re.sub(".*asan_[a-z_]*.(cc|h):[0-9]*", "[asan_rtl]", file_name)
        file_name = re.sub(".*crtstuff.c:0", "???:0", file_name)
        return file_name

    def __process_binary_name(self, name, buildid):
        """Map a binary name from the report to a local file with symbols.

        Lookup order: the build id index, then `name` under each binary
        prefix (also trying "apex/com.google.android." in place of
        "apex/com.android."), then the basename under each prefix.

        Returns:
          The path found, or None after warning once per name on stderr.
        """
        if name.startswith('/'):
            name = name[1:]
        if buildid is not None and buildid in self.__index:
            return self.__index[buildid]

        apex_prefix = "apex/com.android."
        for p in self.__binary_prefixes:
            full_path = os.path.join(p, name)
            if os.path.exists(full_path):
                return full_path
            if name.startswith(apex_prefix):
                full_path = os.path.join(
                    p, "apex/com.google.android." + name[len(apex_prefix):])
                if os.path.exists(full_path):
                    return full_path
        # Try stripping extra path components as the last resort.
        for p in self.__binary_prefixes:
            full_path = os.path.join(p, os.path.basename(name))
            if os.path.exists(full_path):
                return full_path
        if name not in self.__warnings:
            print("Could not find symbols for", name, file=sys.stderr)
            self.__warnings.add(name)
        return None

    def iter_locals(self, binary, addr, buildid):
        """Yield the local variables reported by a "FRAME" query.

        Yields:
          (function_name, file_line, local_name, offset, size, tag_offset);
          the last three are ints, or None where the symbolizer printed "??".
        """
        self.__open_pipe()
        binary = self.__process_binary_name(binary, buildid)
        if not binary:
            return
        self.__write("FRAME %s %s" % (binary, addr))
        # Read the whole response before yielding anything: a caller that stops
        # iterating early would otherwise leave unread lines in the pipe, and
        # the next query would read them as its own answer.
        records = []
        try:
            while True:
                function_name = self.__read()
                local_name = self.__read()
                file_line = self.__read()
                extra = self.__read().split()

                file_line = self.__process_source_path(file_line)
                offset = None if extra[0] == '??' else int(extra[0])
                size = None if extra[1] == '??' else int(extra[1])
                tag_offset = None if extra[2] == '??' else int(extra[2])
                records.append((function_name, file_line, local_name,
                                offset, size, tag_offset))
        except Symbolizer.__EOF:
            pass
        for record in records:
            yield record

    def iter_call_stack(self, binary, buildid, addr):
        """Yield (function_name, file_line) for each frame of a "CODE" query.

        More than one frame is produced when the address lies in inlined code.
        """
        self.__open_pipe()
        binary = self.__process_binary_name(binary, buildid)
        if not binary:
            return
        self.__write("CODE %s %s" % (binary, addr))
        # Drain the response up front so the pipe stays in sync even if the
        # caller abandons the generator.
        frames = []
        try:
            while True:
                function_name = self.__read()
                file_line = self.__read()
                frames.append(
                    (function_name, self.__process_source_path(file_line)))
        except Symbolizer.__EOF:
            pass
        for frame in frames:
            yield frame

    def maybe_linkify(self, file_line):
        """Return `file_line` as output text, linked when a prefix matches.

        Outside HTML mode the text is returned unchanged. In HTML mode it is
        HTML-escaped, and wrapped in an <a> element built from the link
        template of the longest matching prefix, if any. Templates may use the
        fields {file}, {line}, {file_line} and {prefix}.
        """
        if not self.__html:
            return file_line
        # Callers print the result with escape=False, so escape it here.
        escaped = html.escape(file_line)
        if not self.__link_prefixes:
            return escaped
        # partition() tolerates a file_line without any ':'.
        filename, _, line_col = file_line.partition(':')
        if not line_col:
            line = '0'  # simplify the link generation
        else:
            line = line_col.split(':')[0]
        longest_prefix = max((
            (prefix, link) for prefix, link in self.__link_prefixes
            if filename.startswith(prefix)),
            key=lambda x: len(x[0]), default=None)
        if longest_prefix is None:
            return escaped
        prefix, link = longest_prefix
        return '<a href="{}">{}</a>'.format(
            html.escape(link.format(file=filename[len(prefix):], line=line,
                                    file_line=file_line, prefix=prefix)),
            escaped)

    def build_index(self):
        """Walk every binary prefix and index the files found by build id.

        Files that vanish during the walk are skipped silently; other failures
        are reported on stderr and skipped.
        """
        for p in self.__binary_prefixes:
            for dname, _, fnames in os.walk(p):
                for fn in fnames:
                    filename = os.path.join(dname, fn)
                    try:
                        bid = get_buildid(filename)
                    except FileNotFoundError:
                        continue
                    except Exception as e:
                        print("Failed to parse {}: {}".format(filename, e),
                              file=sys.stderr)
                        continue
                    if bid is not None:
                        self.__index[bid] = filename

    def symbolize_line(self, line):
        """Print `line`, symbolized if it is a stack frame line.

        Lines that are not frames, or whose frame cannot be symbolized, are
        printed with trailing whitespace removed. Inlined frames are printed on
        extra lines marked with "->".
        """
        #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45) (BuildId: 4abce4cd41ea5c2f34753297b7e774d9)
        match = re.match(r'^(.*?)#([0-9]+)( *)(0x[0-9a-f]*) *\((.*)\+(0x[0-9a-f]+)\)'
                         r'(?:\s*\(BuildId: ([0-9a-f]+)\))?', line, re.UNICODE)
        if not match:
            self.print(line.rstrip())
            return

        binary = match.group(5)
        addr = int(match.group(6), 16)
        buildid = match.group(7)

        frames = list(self.iter_call_stack(binary, buildid, addr))
        if not frames:
            self.print(line.rstrip())
            return

        self.print(
            self.maybe_escape(
                "%s#%s%s%s in " % (match.group(1), match.group(2),
                                   match.group(3), frames[0][0])
            ) + self.maybe_linkify(frames[0][1]),
            escape=False)
        # Padding that lines the "->" marker up under the "#N" of the first
        # frame.
        space1 = ' ' * match.end(1)
        space2 = ' ' * (match.start(4) - match.end(1) - 2)
        for function_name, file_line in frames[1:]:
            self.print(
                self.maybe_escape("%s->%s%s in " % (space1, space2, function_name))
                + self.maybe_linkify(file_line), escape=False)

    def save_access_address(self, line):
        """Remember the faulting address and pointer tag if `line` has them."""
        match = re.match(r'^(.*?)HWAddressSanitizer: tag-mismatch on address (0x[0-9a-f]+) ', line, re.UNICODE)
        if match:
            self.__last_access_address = int(match.group(2), 16)
        match = re.match(r'^(.*?) of size [0-9]+ at 0x[0-9a-f]* tags: ([0-9a-f]+)/[0-9a-f]+(\([0-9a-f]+\))? \(ptr/mem\)', line, re.UNICODE)
        if match:
            self.__last_access_tag = int(match.group(2), 16)

    def process_stack_history(self, line, ignore_tags=False):
        """Handle a stack history line of the report.

        Args:
          line: one report line.
          ignore_tags: when True, a local is reported on address match alone,
            without comparing tags.

        Returns:
          True when the line was consumed (the "Previously allocated frames:"
          header, or a record for which a stack object was printed); a false
          value when the caller should process the line normally.
        """
        if self.__last_access_address is None or self.__last_access_tag is None:
            return False
        if re.match(r'Previously allocated frames:', line, re.UNICODE):
            return True
        fp_mask = (1 << 20) - 1
        # record_addr:0x1234ABCD record:0x1234ABCD (/path/to/binary+0x1234ABCD) (BuildId: 4abce4cd41ea5c2f34753297b7e774d9)
        match = re.match(r'^(.*?)record_addr:(0x[0-9a-f]+) +record:(0x[0-9a-f]+) +\((.*)\+(0x[0-9a-f]+)\)'
                         r'(?:\s*\(BuildId: ([0-9a-f]+)\))?', line, re.UNICODE)
        if not match:
            return False

        record_addr = int(match.group(2), 16)
        record = int(match.group(3), 16)
        binary = match.group(4)
        addr = int(match.group(5), 16)
        buildid = match.group(6)
        base_tag = (record_addr >> 3) & 0xFF
        # The bits of the record above bit 48 give the frame pointer in units
        # of 16 bytes.
        fp = (record >> 48) << 4

        for (function_name, file_line, local_name,
             frame_offset, size, tag_offset) in self.iter_locals(binary, addr, buildid):
            if frame_offset is None or size is None:
                continue
            obj_offset = (self.__last_access_address - fp - frame_offset) & fp_mask
            if obj_offset >= size:
                continue
            if not ignore_tags and (tag_offset is None or
                                    base_tag ^ tag_offset != self.__last_access_tag):
                continue
            self.print('')
            self.print('Potentially referenced stack object:')
            self.print('  %d bytes inside a variable "%s" in stack frame of function "%s"' % (obj_offset, local_name, function_name))
            self.print('  at %s' % (file_line,))
            return True
        return False


def extract_version(s):
    """Return the numeric version suffix of a name such as "tool-14".

    The suffix is whatever follows the last '-'. Returns 0 when there is no
    '-' or the suffix does not start with a number; a suffix with more than
    two components (e.g. "14.0.6") is reduced to its leading "major.minor".
    """
    idx = s.rfind('-')
    if idx == -1:
        return 0
    suffix = s[idx + 1:]
    try:
        return float(suffix)
    except ValueError:
        # Do not let one oddly named candidate abort the sort in main().
        m = re.match(r'[0-9]+(?:\.[0-9]+)?', suffix)
        return float(m.group(0)) if m else 0


def _find_symbolizer(symbolizer_path):
    """Locate the llvm-symbolizer binary; returns a path or None.

    Search order:
      1. --symbolizer flag (the `symbolizer_path` argument)
      2. environment variable
      3. unsuffixed binary in the current directory
      4. if inside Android platform, prebuilt binary at a known path
      5. first "llvm-symbolizer", then "llvm-symbolizer-$VER" with the
         highest available version in $PATH
    """
    if not symbolizer_path:
        if 'LLVM_SYMBOLIZER_PATH' in os.environ:
            symbolizer_path = os.environ['LLVM_SYMBOLIZER_PATH']
        elif 'HWASAN_SYMBOLIZER_PATH' in os.environ:
            symbolizer_path = os.environ['HWASAN_SYMBOLIZER_PATH']

    if not symbolizer_path:
        s = os.path.join(os.path.dirname(sys.argv[0]), 'llvm-symbolizer')
        if os.path.exists(s):
            symbolizer_path = s

    if not symbolizer_path:
        if 'ANDROID_BUILD_TOP' in os.environ:
            s = os.path.join(os.environ['ANDROID_BUILD_TOP'], 'prebuilts/clang/host/linux-x86/llvm-binutils-stable/llvm-symbolizer')
            if os.path.exists(s):
                symbolizer_path = s

    # PATH may be unset; treat that as having no directories to search.
    env_path = os.environ.get("PATH")
    path_dirs = env_path.split(os.pathsep) if env_path is not None else []

    if not symbolizer_path:
        for path in path_dirs:
            p = os.path.join(path, 'llvm-symbolizer')
            if os.path.exists(p):
                symbolizer_path = p
                break

    if not symbolizer_path:
        for path in path_dirs:
            candidates = glob.glob(os.path.join(path, 'llvm-symbolizer-*'))
            if candidates:
                candidates.sort(key=extract_version, reverse=True)
                symbolizer_path = candidates[0]
                break

    return symbolizer_path


def main():
    """Symbolize the HWASan report read from stdin and print it to stdout."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', action='store_true')
    parser.add_argument('-v', action='store_true')
    parser.add_argument('--ignore-tags', action='store_true')
    parser.add_argument('--symbols', action='append')
    parser.add_argument('--source', action='append')
    parser.add_argument('--index', action='store_true')
    parser.add_argument('--symbolizer')
    parser.add_argument('--linkify', type=str)
    parser.add_argument('--html', action='store_true')
    parser.add_argument('args', nargs=argparse.REMAINDER)
    args = parser.parse_args()

    # Unstripped binaries location.
    binary_prefixes = args.symbols or []
    if not binary_prefixes:
        if 'ANDROID_PRODUCT_OUT' in os.environ:
            product_out = os.path.join(os.environ['ANDROID_PRODUCT_OUT'], 'symbols')
            binary_prefixes.append(product_out)
        binary_prefixes.append('/')

    for p in binary_prefixes:
        if not os.path.isdir(p):
            print("Symbols path does not exist or is not a directory:", p, file=sys.stderr)
            sys.exit(1)

    # Source location.
    paths_to_cut = args.source or []
    if not paths_to_cut:
        paths_to_cut.append(os.getcwd() + '/')
        if 'ANDROID_BUILD_TOP' in os.environ:
            paths_to_cut.append(os.environ['ANDROID_BUILD_TOP'] + '/')

    # llvm-symbolizer binary.
    symbolizer_path = _find_symbolizer(args.symbolizer)

    # Nothing found at all: report it instead of passing None to os.path.exists.
    if not symbolizer_path:
        print("Could not find llvm-symbolizer; pass --symbolizer or set "
              "LLVM_SYMBOLIZER_PATH", file=sys.stderr)
        sys.exit(1)
    if not os.path.exists(symbolizer_path):
        print("Symbolizer path does not exist:", symbolizer_path, file=sys.stderr)
        sys.exit(1)

    if args.v:
        print("Looking for symbols in:")
        for s in binary_prefixes:
            print("  %s" % (s,))
        print("Stripping source path prefixes:")
        for s in paths_to_cut:
            print("  %s" % (s,))
        print("Using llvm-symbolizer binary in:\n  %s" % (symbolizer_path,))
        print()

    symbolizer = Symbolizer(symbolizer_path, binary_prefixes, paths_to_cut)
    symbolizer.enable_html(args.html)
    symbolizer.enable_logging(args.d)
    if args.index:
        symbolizer.build_index()

    if args.linkify:
        if not args.html:
            print('Need --html to --linkify', file=sys.stderr)
            sys.exit(1)
        symbolizer.read_linkify(args.linkify)

    for line in sys.stdin:
        if sys.version_info.major < 3:
            line = line.decode('utf-8')
        symbolizer.save_access_address(line)
        if symbolizer.process_stack_history(line, ignore_tags=args.ignore_tags):
            continue
        symbolizer.symbolize_line(line)


if __name__ == '__main__':
    main()