#!/usr/local/bin/python3.8
# Copyright 2018 The Emscripten Authors. All rights reserved.
# Emscripten is available under two separate licenses, the MIT license and the
# University of Illinois/NCSA Open Source License. Both these licenses can be
# found in the LICENSE file.

"""Utility tools that extracts DWARF information encoded in a wasm output
produced by the LLVM tools, and encodes it as a wasm source map. Additionally,
it can collect original sources, change files prefixes, and strip debug
sections from a wasm file.
"""

import argparse
from collections import OrderedDict
import json
import logging
from math import floor, log
import os
import re
from subprocess import Popen, PIPE
import sys

sys.path.insert(1, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

try:
  from tools.shared import asstr
except ImportError:
  # Fallback so this script also works outside the emscripten tree: asstr only
  # normalizes bytes (e.g. llvm-dwarfdump output) to str.
  def asstr(s):
    return s.decode('utf-8') if isinstance(s, bytes) else s

logger = logging.getLogger('wasm-sourcemap')


def parse_args():
  """Parse and return the command-line options for this tool."""
  parser = argparse.ArgumentParser(prog='wasm-sourcemap.py', description=__doc__)
  parser.add_argument('wasm', help='wasm file')
  parser.add_argument('-o', '--output', help='output source map')
  parser.add_argument('-p', '--prefix', nargs='*', help='replace source debug filename prefix for source map', default=[])
  parser.add_argument('-s', '--sources', action='store_true', help='read and embed source files from file system into source map')
  parser.add_argument('-l', '--load-prefix', nargs='*', help='replace source debug filename prefix for reading sources from file system (see also --sources)', default=[])
  parser.add_argument('-w', nargs='?', help='set output wasm file')
  parser.add_argument('-x', '--strip', action='store_true', help='removes debug and linking sections')
  parser.add_argument('-u', '--source-map-url', nargs='?', help='specifies sourceMappingURL section contest')
  parser.add_argument('--dwarfdump', help="path to llvm-dwarfdump executable")
  parser.add_argument('--dwarfdump-output', nargs='?', help=argparse.SUPPRESS)
  parser.add_argument('--basepath', help='base path for source files, which will be relative to this')
  return parser.parse_args()


class Prefixes:
  """Resolves file-name prefix substitutions given on the command line.

  Each entry is either "prefix=replacement" (substitute the prefix) or just
  "prefix" (strip the prefix). Resolutions are memoized in self.cache.
  """

  def __init__(self, args):
    prefixes = []
    for p in args:
      if '=' in p:
        # Split on the first '=' only, so replacements may themselves
        # contain '=' (plain split() would raise ValueError here).
        prefix, replacement = p.split('=', 1)
        prefixes.append({'prefix': prefix, 'replacement': replacement})
      else:
        prefixes.append({'prefix': p, 'replacement': None})
    self.prefixes = prefixes
    self.cache = {}

  def resolve(self, name):
    """Return name with the first matching prefix stripped or replaced.

    A name that matches no prefix is returned unchanged. (The previous code
    left `result` unbound in that case and raised UnboundLocalError.)
    """
    if name in self.cache:
      return self.cache[name]

    result = name  # fix: fall back to the unmodified name when nothing matches
    for p in self.prefixes:
      if name.startswith(p['prefix']):
        if p['replacement'] is None:
          result = name[len(p['prefix']):]
        else:
          result = p['replacement'] + name[len(p['prefix']):]
        break
    self.cache[name] = result
    return result


# SourceMapPrefixes contains resolver for file names that are:
#   - "sources" is for names that output to source maps JSON
#   - "load" is for paths that used to load source text
class SourceMapPrefixes:
  def __init__(self, sources, load):
    self.sources = sources
    self.load = load

  def provided(self):
    # True when at least one -p/--prefix or -l/--load-prefix was given.
    return bool(self.sources.prefixes or self.load.prefixes)


def encode_vlq(n):
  """Encode the integer n as a source-map base64 VLQ string."""
  VLQ_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
  # Least significant bit carries the sign; the value is shifted left one bit.
  x = (n << 1) if n >= 0 else ((-n << 1) + 1)
  result = ""
  while x > 31:
    # 5 payload bits per digit; bit 5 (32) marks a continuation.
    result = result + VLQ_CHARS[32 + (x & 31)]
    x = x >> 5
  return result + VLQ_CHARS[x]


def read_var_uint(wasm, pos):
  """Read a LEB128 unsigned integer from bytes `wasm` at offset `pos`.

  Returns (value, new_pos) where new_pos is the offset after the integer.
  """
  n = 0
  shift = 0
  b = ord(wasm[pos:pos + 1])
  pos = pos + 1
  while b >= 128:  # high bit set => more bytes follow
    n = n | ((b - 128) << shift)
    b = ord(wasm[pos:pos + 1])
    pos = pos + 1
    shift += 7
  return n + (b << shift), pos


def strip_debug_sections(wasm):
  """Return a copy of the wasm binary (bytes) without debug-related sections."""
  logger.debug('Strip debug sections')
  pos = 8  # skip the 4-byte magic number and 4-byte version
  stripped = wasm[:pos]

  while pos < len(wasm):
    section_start = pos
    section_id, pos_ = read_var_uint(wasm, pos)
    section_size, section_body = read_var_uint(wasm, pos_)
    pos = section_body + section_size
    if section_id == 0:  # custom section: identified by its name
      name_len, name_pos = read_var_uint(wasm, section_body)
      name_end = name_pos + name_len
      name = wasm[name_pos:name_end]
      # fix: `wasm` is bytes, so the section name must be compared against
      # bytes literals -- the previous str comparisons never matched under
      # Python 3, leaving all debug sections in place.
      if name == b"linking" or name == b"sourceMappingURL" or name.startswith(b"reloc..debug_") or name.startswith(b".debug_"):
        continue  # skip debug related sections
    stripped = stripped + wasm[section_start:pos]

  return stripped


def encode_uint_var(n):
  """Encode the unsigned integer n as LEB128 bytes."""
  result = bytearray()
  while n > 127:
    result.append(128 | (n & 127))  # 7 payload bits + continuation bit
    n = n >> 7
  result.append(n)
  return bytes(result)


def append_source_mapping(wasm, url):
  """Append a custom "sourceMappingURL" section referencing url (str) to wasm (bytes)."""
  logger.debug('Append sourceMappingURL section')
  # fix: the section payload must be bytes; concatenating the previous str
  # values to the bytes from encode_uint_var raised TypeError on Python 3.
  section_name = b"sourceMappingURL"
  url_bytes = url.encode('utf-8')
  section_content = encode_uint_var(len(section_name)) + section_name + encode_uint_var(len(url_bytes)) + url_bytes
  return wasm + encode_uint_var(0) + encode_uint_var(len(section_content)) + section_content


def get_code_section_offset(wasm):
  """Return the file offset of the code section body, or None if absent."""
  logger.debug('Read sections index')
  pos = 8  # skip magic number and version

  while pos < len(wasm):
    section_id, pos_ = read_var_uint(wasm, pos)
    section_size, pos = read_var_uint(wasm, pos_)
    if section_id == 10:  # 10 == Code section id
      return pos
    pos = pos + section_size


def remove_dead_entries(entries):
  """Remove in place the line-table entries belonging to dead (GC'd) functions.

  Heuristic: a function whose starting address is near 0 (i.e. not larger
  than the LEB length of its own size field) was removed by the linker.
  """
  block_start = 0
  cur_entry = 0
  while cur_entry < len(entries):
    if not entries[cur_entry]['eos']:
      cur_entry += 1
      continue
    fn_start = entries[block_start]['address']
    # Calculate the LEB encoded function size (including size field)
    fn_size_length = floor(log(entries[cur_entry]['address'] - fn_start + 1, 128)) + 1
    min_live_offset = 1 + fn_size_length  # 1 byte is for code section entries
    if fn_start < min_live_offset:
      # Remove dead code debug info block.
      del entries[block_start:cur_entry + 1]
      cur_entry = block_start
      continue
    cur_entry += 1
    block_start = cur_entry


def read_dwarf_entries(wasm, options):
  """Extract DWARF line-table entries for `wasm` using llvm-dwarfdump.

  Returns a list of {'address', 'line', 'column', 'file', 'eos'} dicts sorted
  by address. Exits the process if no dwarfdump source was configured.
  """
  if options.dwarfdump_output:
    # fix: use a context manager instead of leaking the file handle.
    with open(options.dwarfdump_output, 'r') as infile:
      output = infile.read()
  elif options.dwarfdump:
    logger.debug('Reading DWARF information from %s' % wasm)
    if not os.path.exists(options.dwarfdump):
      logger.error('llvm-dwarfdump not found: ' + options.dwarfdump)
      sys.exit(1)
    process = Popen([options.dwarfdump, '-debug-info', '-debug-line', '--recurse-depth=0', wasm], stdout=PIPE)
    output, err = process.communicate()
    exit_code = process.wait()
    if exit_code != 0:
      logger.error('Error during llvm-dwarfdump execution (%s)' % exit_code)
      sys.exit(1)
  else:
    logger.error('Please specify either --dwarfdump or --dwarfdump-output')
    sys.exit(1)

  entries = []
  # Chunks alternate: [info-preamble, stmt_list offset, line table, ...].
  debug_line_chunks = re.split(r"debug_line\[(0x[0-9a-f]*)\]", asstr(output))
  maybe_debug_info_content = debug_line_chunks[0]
  for i in range(1, len(debug_line_chunks), 2):
    stmt_list = debug_line_chunks[i]
    # Find the DW_AT_comp_dir belonging to this line table via its
    # DW_AT_stmt_list offset in the .debug_info dump.
    comp_dir_match = re.search(r"DW_AT_stmt_list\s+\(" + stmt_list + r"\)\s+" +
                               r"DW_AT_comp_dir\s+\(\"([^\"]+)", maybe_debug_info_content)
    comp_dir = comp_dir_match.group(1) if comp_dir_match is not None else ""

    line_chunk = debug_line_chunks[i + 1]

    # include_directories[  1] = "/Users/yury/Work/junk/sqlite-playground/src"
    # file_names[  1]:
    #            name: "playground.c"
    #       dir_index: 1
    #        mod_time: 0x00000000
    #          length: 0x00000000
    #
    # Address            Line   Column File   ISA Discriminator Flags
    # ------------------ ------ ------ ------ --- ------------- -------------
    # 0x0000000000000006     22      0      1   0             0  is_stmt
    # 0x0000000000000007     23     10      1   0             0  is_stmt prologue_end
    # 0x000000000000000f     23      3      1   0             0
    # 0x0000000000000010     23      3      1   0             0  end_sequence
    # 0x0000000000000011     28      0      1   0             0  is_stmt

    # Directory index 0 is implicitly the compilation directory.
    include_directories = {'0': comp_dir}
    for dir in re.finditer(r"include_directories\[\s*(\d+)\] = \"([^\"]*)", line_chunk):
      include_directories[dir.group(1)] = dir.group(2)

    files = {}
    for file in re.finditer(r"file_names\[\s*(\d+)\]:\s+name: \"([^\"]*)\"\s+dir_index: (\d+)", line_chunk):
      dir = include_directories[file.group(3)]
      # Prepend the directory unless the file name is already absolute.
      file_path = (dir + '/' if file.group(2)[0] != '/' else '') + file.group(2)
      files[file.group(1)] = file_path

    for line in re.finditer(r"\n0x([0-9a-f]+)\s+(\d+)\s+(\d+)\s+(\d+)(.*?end_sequence)?", line_chunk):
      entry = {'address': int(line.group(1), 16), 'line': int(line.group(2)), 'column': int(line.group(3)), 'file': files[line.group(4)], 'eos': line.group(5) is not None}
      if not entry['eos']:
        entries.append(entry)
      else:
        # move end of function to the last END operator
        entry['address'] -= 1
        if entries[-1]['address'] == entry['address']:
          # last entry has the same address, reusing
          entries[-1]['eos'] = True
        else:
          entries.append(entry)

  remove_dead_entries(entries)

  # return entries sorted by the address field
  return sorted(entries, key=lambda entry: entry['address'])


def normalize_path(path):
  """Normalize a path to forward slashes with no doubled separators."""
  return path.replace('\\', '/').replace('//', '/')


def build_sourcemap(entries, code_section_offset, prefixes, collect_sources, base_path):
  """Build a source map (as an OrderedDict ready for JSON dumping).

  entries             -- DWARF line entries from read_dwarf_entries()
  code_section_offset -- wasm file offset of the code section body; entry
                         addresses are relative to it
  prefixes            -- SourceMapPrefixes used for name/load resolution
  collect_sources     -- when True, embed file contents in sourcesContent
  base_path           -- emit paths relative to this when no prefixes given
  """
  sources = []
  sources_content = [] if collect_sources else None
  mappings = []
  sources_map = {}
  # Source-map VLQ fields are deltas, so track the previous values.
  last_address = 0
  last_source_id = 0
  last_line = 1
  last_column = 1
  for entry in entries:
    line = entry['line']
    column = entry['column']
    # ignore entries with line 0
    if line == 0:
      continue
    # start at least at column 1
    if column == 0:
      column = 1
    address = entry['address'] + code_section_offset
    file_name = entry['file']
    file_name = normalize_path(file_name)
    # if prefixes were provided, we use that; otherwise, we emit a relative
    # path
    if prefixes.provided():
      source_name = prefixes.sources.resolve(file_name)
    else:
      try:
        file_name = os.path.relpath(file_name, base_path)
      except ValueError:
        # relpath can fail (e.g. different drives on Windows); fall back to
        # an absolute path.
        file_name = os.path.abspath(file_name)
      file_name = normalize_path(file_name)
      source_name = file_name
    if source_name not in sources_map:
      source_id = len(sources)
      sources_map[source_name] = source_id
      sources.append(source_name)
      if collect_sources:
        load_name = prefixes.load.resolve(file_name)
        try:
          with open(load_name, 'r') as infile:
            source_content = infile.read()
          sources_content.append(source_content)
        except IOError:
          # Best effort: keep going with a null placeholder for this source.
          print('Failed to read source: %s' % load_name)
          sources_content.append(None)
    else:
      source_id = sources_map[source_name]

    address_delta = address - last_address
    source_id_delta = source_id - last_source_id
    line_delta = line - last_line
    column_delta = column - last_column
    mappings.append(encode_vlq(address_delta) + encode_vlq(source_id_delta) + encode_vlq(line_delta) + encode_vlq(column_delta))
    last_address = address
    last_source_id = source_id
    last_line = line
    last_column = column
  return OrderedDict([('version', 3),
                      ('names', []),
                      ('sources', sources),
                      ('sourcesContent', sources_content),
                      ('mappings', ','.join(mappings))])


def main():
  """Entry point: produce the source map and, optionally, a rewritten wasm."""
  options = parse_args()

  wasm_input = options.wasm
  with open(wasm_input, 'rb') as infile:
    wasm = infile.read()

  entries = read_dwarf_entries(wasm_input, options)

  code_section_offset = get_code_section_offset(wasm)

  prefixes = SourceMapPrefixes(sources=Prefixes(options.prefix), load=Prefixes(options.load_prefix))

  logger.debug('Saving to %s' % options.output)
  map = build_sourcemap(entries, code_section_offset, prefixes, options.sources, options.basepath)
  with open(options.output, 'w') as outfile:
    json.dump(map, outfile, separators=(',', ':'))

  if options.strip:
    wasm = strip_debug_sections(wasm)

  if options.source_map_url:
    wasm = append_source_mapping(wasm, options.source_map_url)

  if options.w:
    logger.debug('Saving wasm to %s' % options.w)
    with open(options.w, 'wb') as outfile:
      outfile.write(wasm)

  logger.debug('Done')
  return 0


if __name__ == '__main__':
  logging.basicConfig(level=logging.DEBUG if os.environ.get('EMCC_DEBUG') else logging.INFO)
  sys.exit(main())