1#!/usr/local/bin/python3.8
2# Copyright 2018 The Emscripten Authors.  All rights reserved.
3# Emscripten is available under two separate licenses, the MIT license and the
4# University of Illinois/NCSA Open Source License.  Both these licenses can be
5# found in the LICENSE file.
6
7"""Utility tools that extracts DWARF information encoded in a wasm output
8produced by the LLVM tools, and encodes it as a wasm source map. Additionally,
it can collect original sources, change file prefixes, and strip debug
10sections from a wasm file.
11"""
12
13import argparse
14from collections import OrderedDict
15import json
16import logging
17from math import floor, log
18import os
19import re
20from subprocess import Popen, PIPE
21import sys
22
23sys.path.insert(1, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
24
25from tools.shared import asstr
26
27logger = logging.getLogger('wasm-sourcemap')
28
29
def parse_args():
  """Parse the command line (sys.argv) and return the argparse namespace.

  The returned namespace has the attributes: wasm, output, prefix (list),
  sources (bool), load_prefix (list), w, strip (bool), source_map_url,
  dwarfdump, dwarfdump_output and basepath. Options that were not given are
  None, except the two list options which default to [] and the two flags
  which default to False.
  """
  parser = argparse.ArgumentParser(prog='wasm-sourcemap.py', description=__doc__)
  parser.add_argument('wasm', help='wasm file')
  parser.add_argument('-o', '--output', help='output source map')
  parser.add_argument('-p', '--prefix', nargs='*', help='replace source debug filename prefix for source map', default=[])
  parser.add_argument('-s', '--sources', action='store_true', help='read and embed source files from file system into source map')
  parser.add_argument('-l', '--load-prefix', nargs='*', help='replace source debug filename prefix for reading sources from file system (see also --sources)', default=[])
  parser.add_argument('-w', nargs='?', help='set output wasm file')
  parser.add_argument('-x', '--strip', action='store_true', help='removes debug and linking sections')
  parser.add_argument('-u', '--source-map-url', nargs='?', help='specifies sourceMappingURL section content')
  parser.add_argument('--dwarfdump', help="path to llvm-dwarfdump executable")
  # Hidden option: read a previously captured llvm-dwarfdump text output
  # instead of running the tool.
  parser.add_argument('--dwarfdump-output', nargs='?', help=argparse.SUPPRESS)
  parser.add_argument('--basepath', help='base path for source files, which will be relative to this')
  return parser.parse_args()
44
45
class Prefixes:
  """Rewrites the leading part of file names according to prefix rules.

  Each rule in `args` is either "prefix=replacement" (the prefix is swapped
  for the replacement) or just "prefix" (the prefix is removed). Rules are
  tried in order and the first one whose prefix matches wins.

  Attributes:
    prefixes: list of {'prefix': str, 'replacement': str or None} dicts.
    cache: dict mapping already resolved names to their results.
  """
  def __init__(self, args):
    prefixes = []
    for p in args:
      # Split on the first '=' only, so a replacement may itself contain '='.
      prefix, separator, replacement = p.partition('=')
      prefixes.append({'prefix': prefix, 'replacement': replacement if separator else None})
    self.prefixes = prefixes
    self.cache = {}

  def resolve(self, name):
    """Return `name` with the first matching prefix rule applied.

    A name that matches no rule is returned unchanged. Results are memoized
    in `self.cache`.
    """
    if name in self.cache:
      return self.cache[name]

    # Default for names no rule applies to (this also covers an empty rule
    # list).
    result = name
    for p in self.prefixes:
      if name.startswith(p['prefix']):
        remainder = name[len(p['prefix']):]
        if p['replacement'] is None:
          result = remainder
        else:
          result = p['replacement'] + remainder
        break
    self.cache[name] = result
    return result
71
72
# SourceMapPrefixes bundles the two file name resolvers used for a source map:
#  - "sources" is for the names that are output to the source map JSON
#  - "load" is for the paths that are used to load the source text
class SourceMapPrefixes:
  """Holds the `sources` and `load` prefix resolvers side by side."""
  def __init__(self, sources, load):
    self.load = load
    self.sources = sources

  def provided(self):
    """Return True when at least one of the two resolvers has prefix rules."""
    if self.sources.prefixes:
      return True
    return bool(self.load.prefixes)
83
84
def encode_vlq(n):
  """Encode the integer `n` as a base64 VLQ string.

  The sign is stored in the lowest bit of the value; the value is then
  emitted in 5-bit groups, least significant group first, and every group
  except the last one has 32 added to mark that more groups follow.
  """
  alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
  if n >= 0:
    value = n << 1
  else:
    value = (-n << 1) + 1
  digits = []
  while value > 31:
    digits.append(alphabet[32 + (value & 31)])
    value >>= 5
  digits.append(alphabet[value])
  return "".join(digits)
93
94
def read_var_uint(wasm, pos):
  """Decode a variable-length unsigned integer from `wasm` starting at `pos`.

  Each byte contributes its low 7 bits, least significant group first; a
  byte below 128 terminates the number. Returns a (value, next_pos) tuple
  where next_pos is the index just past the last byte consumed.
  """
  value = 0
  shift = 0
  while True:
    # One-element slice + ord() yields the byte as an integer.
    byte = ord(wasm[pos:pos + 1])
    pos += 1
    if byte < 128:
      return value + (byte << shift), pos
    value |= (byte - 128) << shift
    shift += 7
106
107
def strip_debug_sections(wasm):
  """Return the bytes of `wasm` without its debug related sections.

  The first 8 bytes are copied verbatim; after that the sections are walked
  one by one. A section with id 0 carries a name; it is dropped when that
  name is "linking" or "sourceMappingURL", or starts with "reloc..debug_"
  or ".debug_". Every other section is copied unchanged.

  `wasm` is expected to be a bytes-like object (main reads the file in
  binary mode).
  """
  logger.debug('Strip debug sections')
  # Section names are compared as bytes: a bytes name never equals a str
  # literal on Python 3, and bytes.startswith() rejects str arguments.
  dropped_names = (b"linking", b"sourceMappingURL")
  dropped_prefixes = (b"reloc..debug_", b".debug_")

  pos = 8
  # Collect the kept pieces and join once instead of re-copying the growing
  # result for every section.
  kept = [wasm[:pos]]

  while pos < len(wasm):
    section_start = pos
    section_id, pos_ = read_var_uint(wasm, pos)
    section_size, section_body = read_var_uint(wasm, pos_)
    pos = section_body + section_size
    if section_id == 0:
      name_len, name_pos = read_var_uint(wasm, section_body)
      name_end = name_pos + name_len
      name = wasm[name_pos:name_end]
      if name in dropped_names or name.startswith(dropped_prefixes):
        continue  # skip debug related sections
    kept.append(wasm[section_start:pos])

  return b''.join(kept)
127
128
def encode_uint_var(n):
  """Encode the non-negative integer `n` as variable-length bytes.

  The value is split into 7-bit groups, least significant first; every
  group except the last has the 128 bit set. Returns a bytes object.
  """
  groups = []
  rest = n
  while rest > 0x7f:
    groups.append(0x80 | (rest & 0x7f))
    rest >>= 7
  groups.append(rest)
  return bytes(groups)
136
137
def append_source_mapping(wasm, url):
  """Return `wasm` with a "sourceMappingURL" section appended.

  The appended section has id 0 and its content is the length-prefixed
  section name followed by the length-prefixed `url`.

  `wasm` is the module as bytes. `url` may be str (encoded as UTF-8) or
  bytes-like.
  """
  logger.debug('Append sourceMappingURL section')
  # Everything is assembled as bytes: encode_uint_var returns bytes, and
  # bytes cannot be concatenated with str on Python 3.
  section_name = b"sourceMappingURL"
  # The length prefix must count encoded bytes, not characters.
  url_bytes = url.encode('utf-8') if isinstance(url, str) else bytes(url)
  section_content = encode_uint_var(len(section_name)) + section_name + encode_uint_var(len(url_bytes)) + url_bytes
  return wasm + encode_uint_var(0) + encode_uint_var(len(section_content)) + section_content
143
144
def get_code_section_offset(wasm):
  """Return the offset in `wasm` where the body of section id 10 starts.

  Sections are scanned from offset 8; for each one the id and size are read
  and the body is skipped. Returns None when no section with id 10 is
  found.
  """
  logger.debug('Read sections index')
  cursor = 8
  end = len(wasm)

  while cursor < end:
    section_id, size_pos = read_var_uint(wasm, cursor)
    section_size, body_pos = read_var_uint(wasm, size_pos)
    if section_id == 10:
      return body_pos
    cursor = body_pos + section_size
  return None
155
156
def remove_dead_entries(entries):
  """Remove, in place, the entry blocks that belong to dead functions.

  `entries` is a list of dicts with at least the keys 'address' and 'eos'.
  A block is a run of entries that ends with an entry whose 'eos' is true.
  It is a heuristic: a block is dropped when its starting address is near
  to 0, i.e. smaller than 1 plus the length of the LEB encoded function
  size. Trailing entries that are not terminated by an 'eos' entry are
  kept. Returns None; the list object passed in is updated.
  """
  # Build the surviving entries in a single pass instead of deleting slices
  # from the middle of the list, which costs O(n) per dead block.
  live_entries = []
  block = []
  for entry in entries:
    block.append(entry)
    if not entry['eos']:
      continue
    fn_start = block[0]['address']
    # Calculate the LEB encoded function size (including size field)
    fn_size_length = floor(log(entry['address'] - fn_start + 1, 128)) + 1
    min_live_offset = 1 + fn_size_length # 1 byte is for code section entries
    if fn_start >= min_live_offset:
      live_entries.extend(block)
    # else: dead code debug info block, drop it.
    block = []
  # Entries after the last 'eos' do not form a complete block; keep them.
  live_entries.extend(block)
  entries[:] = live_entries
177
178
def read_dwarf_entries(wasm, options):
  """Collect line table entries from llvm-dwarfdump output.

  The dump text comes either from the file named by
  `options.dwarfdump_output` or from running `options.dwarfdump` on the
  `wasm` path. When neither option is set, the tool is missing, or the tool
  exits with a non-zero code, an error is logged and the process exits with
  status 1.

  Returns a list of dicts sorted by 'address', each with the keys 'address'
  (int), 'line' (int), 'column' (int), 'file' (str path) and 'eos' (bool,
  true for the entry that ends a sequence). Entries of dead functions are
  removed (see remove_dead_entries).
  """
  if options.dwarfdump_output:
    # Close the dump file deterministically instead of leaking the handle.
    with open(options.dwarfdump_output, 'r') as dump_file:
      output = dump_file.read()
  elif options.dwarfdump:
    logger.debug('Reading DWARF information from %s' % wasm)
    if not os.path.exists(options.dwarfdump):
      logger.error('llvm-dwarfdump not found: ' + options.dwarfdump)
      sys.exit(1)
    process = Popen([options.dwarfdump, '-debug-info', '-debug-line', '--recurse-depth=0', wasm], stdout=PIPE)
    # communicate() waits for the process, so returncode is set afterwards.
    output, _ = process.communicate()
    exit_code = process.returncode
    if exit_code != 0:
      logger.error('Error during llvm-dwarfdump execution (%s)' % exit_code)
      sys.exit(1)
  else:
    logger.error('Please specify either --dwarfdump or --dwarfdump-output')
    sys.exit(1)

  entries = []
  # The split keeps the captured offset, so the result is
  # [debug_info text, offset 1, chunk 1, offset 2, chunk 2, ...].
  debug_line_chunks = re.split(r"debug_line\[(0x[0-9a-f]*)\]", asstr(output))
  maybe_debug_info_content = debug_line_chunks[0]
  for i in range(1, len(debug_line_chunks), 2):
    stmt_list = debug_line_chunks[i]
    # Find the compilation directory of the unit that refers to this table.
    comp_dir_match = re.search(r"DW_AT_stmt_list\s+\(" + stmt_list + r"\)\s+" +
                               r"DW_AT_comp_dir\s+\(\"([^\"]+)", maybe_debug_info_content)
    comp_dir = comp_dir_match.group(1) if comp_dir_match is not None else ""

    line_chunk = debug_line_chunks[i + 1]

    # include_directories[  1] = "/Users/yury/Work/junk/sqlite-playground/src"
    # file_names[  1]:
    #            name: "playground.c"
    #       dir_index: 1
    #        mod_time: 0x00000000
    #          length: 0x00000000
    #
    # Address            Line   Column File   ISA Discriminator Flags
    # ------------------ ------ ------ ------ --- ------------- -------------
    # 0x0000000000000006     22      0      1   0             0  is_stmt
    # 0x0000000000000007     23     10      1   0             0  is_stmt prologue_end
    # 0x000000000000000f     23      3      1   0             0
    # 0x0000000000000010     23      3      1   0             0  end_sequence
    # 0x0000000000000011     28      0      1   0             0  is_stmt

    # Directory index 0 stands for the compilation directory.
    include_directories = {'0': comp_dir}
    for dir_match in re.finditer(r"include_directories\[\s*(\d+)\] = \"([^\"]*)", line_chunk):
      include_directories[dir_match.group(1)] = dir_match.group(2)

    files = {}
    for file_match in re.finditer(r"file_names\[\s*(\d+)\]:\s+name: \"([^\"]*)\"\s+dir_index: (\d+)", line_chunk):
      directory = include_directories[file_match.group(3)]
      file_name = file_match.group(2)
      # Names starting with '/' are used as is; others are joined to their
      # directory. startswith() also copes with an empty name.
      if file_name.startswith('/'):
        file_path = file_name
      else:
        file_path = directory + '/' + file_name
      files[file_match.group(1)] = file_path

    for row in re.finditer(r"\n0x([0-9a-f]+)\s+(\d+)\s+(\d+)\s+(\d+)(.*?end_sequence)?", line_chunk):
      entry = {'address': int(row.group(1), 16), 'line': int(row.group(2)), 'column': int(row.group(3)), 'file': files[row.group(4)], 'eos': row.group(5) is not None}
      if not entry['eos']:
        entries.append(entry)
      else:
        # move end of function to the last END operator
        entry['address'] -= 1
        # `entries` may still be empty when the very first row is an
        # end_sequence, so check before looking at the last entry.
        if entries and entries[-1]['address'] == entry['address']:
          # last entry has the same address, reusing
          entries[-1]['eos'] = True
        else:
          entries.append(entry)

  remove_dead_entries(entries)

  # return entries sorted by the address field
  return sorted(entries, key=lambda entry: entry['address'])
250
251
def normalize_path(path):
  """Return `path` with backslashes turned into '/' and '//' turned into '/'.

  The '//' replacement is a single pass, so a run of three or more slashes
  is shortened but not fully collapsed.
  """
  forward_slashed = path.replace('\\', '/')
  return forward_slashed.replace('//', '/')
254
255
def build_sourcemap(entries, code_section_offset, prefixes, collect_sources, base_path):
  """Build the source map object for the given line table entries.

  Args:
    entries: iterable of dicts with the keys 'address', 'line', 'column'
      and 'file'; entries whose line is 0 are ignored.
    code_section_offset: value added to every entry address.
    prefixes: object with provided(), and `sources` / `load` resolvers.
    collect_sources: when true, the text of every source file is read and
      stored in 'sourcesContent' (None for files that cannot be read).
    base_path: start directory for the relative source names that are
      emitted when no prefixes were provided.

  Returns an OrderedDict with the keys 'version', 'names', 'sources',
  'sourcesContent' (None unless collect_sources) and 'mappings'.
  """
  sources = []
  sources_content = [] if collect_sources else None
  source_ids = {}
  segments = []
  # Every mapping field is stored as a delta to the previous segment.
  prev_address = 0
  prev_source_id = 0
  prev_line = 1
  prev_column = 1

  for entry in entries:
    line = entry['line']
    column = entry['column']
    # ignore entries with line 0
    if line == 0:
      continue
    # start at least at column 1
    column = 1 if column == 0 else column
    address = entry['address'] + code_section_offset
    file_name = normalize_path(entry['file'])

    # if prefixes were provided, we use that; otherwise, we emit a relative
    # path
    if prefixes.provided():
      source_name = prefixes.sources.resolve(file_name)
    else:
      try:
        file_name = os.path.relpath(file_name, base_path)
      except ValueError:
        file_name = os.path.abspath(file_name)
      file_name = normalize_path(file_name)
      source_name = file_name

    source_id = source_ids.get(source_name)
    if source_id is None:
      # First time this source is seen: register it.
      source_id = len(sources)
      source_ids[source_name] = source_id
      sources.append(source_name)
      if collect_sources:
        load_name = prefixes.load.resolve(file_name)
        try:
          with open(load_name, 'r') as infile:
            source_content = infile.read()
        except IOError:
          print('Failed to read source: %s' % load_name)
          source_content = None
        sources_content.append(source_content)

    deltas = (address - prev_address,
              source_id - prev_source_id,
              line - prev_line,
              column - prev_column)
    segments.append(''.join([encode_vlq(delta) for delta in deltas]))
    prev_address = address
    prev_source_id = source_id
    prev_line = line
    prev_column = column

  source_map = OrderedDict()
  source_map['version'] = 3
  source_map['names'] = []
  source_map['sources'] = sources
  source_map['sourcesContent'] = sources_content
  source_map['mappings'] = ','.join(segments)
  return source_map
318
319
def main():
  """Run the tool: write the source map and, optionally, a new wasm file.

  Steps: parse the command line, read the wasm file, collect the DWARF line
  entries, build the source map and write it as compact JSON to the output
  path. The wasm bytes are then optionally stripped of debug sections
  (--strip), extended with a sourceMappingURL section (--source-map-url)
  and written to the -w path. Returns 0.
  """
  options = parse_args()

  with open(options.wasm, 'rb') as wasm_file:
    wasm = wasm_file.read()

  entries = read_dwarf_entries(options.wasm, options)
  code_section_offset = get_code_section_offset(wasm)
  prefixes = SourceMapPrefixes(sources=Prefixes(options.prefix), load=Prefixes(options.load_prefix))

  logger.debug('Saving to %s' % options.output)
  source_map = build_sourcemap(entries, code_section_offset, prefixes, options.sources, options.basepath)
  with open(options.output, 'w') as map_file:
    json.dump(source_map, map_file, separators=(',', ':'))

  # The wasm post-processing steps are independent and each is optional.
  if options.strip:
    wasm = strip_debug_sections(wasm)
  if options.source_map_url:
    wasm = append_source_mapping(wasm, options.source_map_url)
  if options.w:
    logger.debug('Saving wasm to %s' % options.w)
    with open(options.w, 'wb') as wasm_out:
      wasm_out.write(wasm)

  logger.debug('Done')
  return 0
351
352
if __name__ == '__main__':
  # A non-empty EMCC_DEBUG environment variable turns on debug logging.
  log_level = logging.DEBUG if os.environ.get('EMCC_DEBUG') else logging.INFO
  logging.basicConfig(level=log_level)
  sys.exit(main())
356