#!/usr/bin/env python
2# Copyright 2018 The Emscripten Authors.  All rights reserved.
3# Emscripten is available under two separate licenses, the MIT license and the
4# University of Illinois/NCSA Open Source License.  Both these licenses can be
5# found in the LICENSE file.
6
7# -*- Mode: python -*-
8
9"""
10emdump.py prints out statistics about compiled code sizes
11"""
12
13from __future__ import print_function
14import sys
15
# This tool was written for Python 2.7 only (it uses e.g. xrange and py2 str
# semantics); refuse to run under Python 3 with an explanatory message.
if sys.version_info >= (3,):
  print("emdump.py unfortunately requires Python 2.7.x (attempted to run in Python " + '.'.join(map(lambda x: str(x), sys.version_info)) + ' from ' + sys.executable + ')')
  sys.exit(1)
19
20from collections import OrderedDict
21import os, subprocess, functools, re, argparse
22
23# If true, we are printing delta information between two data sets. If false, we are just printing symbol info for a single data set
24diffing_two_data_sets = False
25
26# Global command line options
27options = None
28
29# Finds the given executable 'program' in PATH. Operates like the Unix tool 'which'.
# Finds the given executable 'program' in PATH. Operates like the Unix tool 'which'.
def which(program):
  """Return the full path to 'program', or None if it is not an executable on PATH.

  If 'program' already contains a directory component, only that exact path is
  checked. On Windows the common executable extensions are also probed when the
  name has no extension of its own.
  """
  def is_exe(fpath):
    return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

  head, tail = os.path.split(program)
  if head:
    # An explicit path was given: accept it only if it is executable as-is.
    return program if is_exe(program) else None

  for directory in os.environ["PATH"].split(os.pathsep):
    candidate = os.path.join(directory.strip('"'), program)
    if is_exe(candidate):
      return candidate

    # Windows: also probe the implicit executable extensions.
    if os.name == 'nt' and '.' not in tail:
      for ext in ('.exe', '.cmd', '.bat'):
        if is_exe(candidate + ext):
          return candidate + ext

  return None
54
55# Given a string s and an index i, counts how many times character ch is repeated looking backwards at s[i], s[i-1], s[i-2], s[i-3], ...
# Given a string s and an index i, counts how many times character ch is repeated looking backwards at s[i], s[i-1], s[i-2], s[i-3], ...
def rcount(s, ch, i):
  """Return the length of the run of character ch that ends at s[i], scanning backwards.

  Fix: the original loop condition was 'j > 0', which never examined s[0], so a
  run reaching the start of the string was undercounted by one. That made the
  escape-parity tests (rcount(...) % 2) wrong for backslash runs starting at
  index 0. Counting down to index 0 inclusive fixes this; a negative i safely
  yields 0.
  """
  j = i
  while j >= 0 and s[j] == ch:
    j -= 1
  return i - j
61
62# Finds the index where a "foo" or 'foo' string ends in the given string s. Given string s and index 'start' to a string symbol " or ', finds the matching index where the string ends.
63# This takes into account escapes in the middle, i.e. "foo\\\\\\\"bar" will be properly matched.
# Finds the index where a "foo" or 'foo' string ends in the given string s. Given string s and index 'start' to a string symbol " or ', finds the matching index where the string ends.
# This takes into account escapes in the middle, i.e. "foo\\\\\\\"bar" will be properly matched.
def find_unescaped_end(s, ch, start, end):
  """Return the index of the closing quote ch for the literal opening at s[start].

  Quotes preceded by an odd number of backslashes are treated as escaped and
  skipped. Returns -1 when no unescaped closing quote exists before 'end'.
  Raises if s[start] is not ch itself.
  """
  if s[start] != ch:
    raise Exception('Index start should point to starting occurrence of ch')
  pos = start + 1
  while pos < end:
    # An even number of preceding backslashes means this quote is not escaped.
    if s[pos] == ch and rcount(s, '\\', pos - 1) % 2 == 0:
      return pos
    pos += 1
  return -1
72
73# Transforms linear index to string to file, column pair. (for debugging use only, need to build index->file:line mapping table for batch operations)
# Transforms linear index to string to file, column pair. (for debugging use only, need to build index->file:line mapping table for batch operations)
def idx_to_line_col(s, i):
  """Return a human-readable 'line L, column C (idx I)' description of offset i in s."""
  line_number = s.count('\n', 0, i) + 1
  # rfind returns -1 when i is on the first line, which makes the column 1-based.
  column = i - s.rfind('\n', 0, i)
  return 'line %d, column %d (idx %d)' % (line_number, column, i)
78
79# Given a string, returns brace_map dictionary that maps starting parens/brackets/braces indices to their ending positions.
80# This can be brittle since we are not able to parse JS proper, but good enough for Emscripten compiled output. (some debugging code retained in body if you run into a tricky case)
# Given a string, returns brace_map dictionary that maps starting parens/brackets/braces indices to their ending positions.
# This can be brittle since we are not able to parse JS proper, but good enough for Emscripten compiled output.
def parse_parens(s):
  """Return a dict mapping each opening (/[/{ index in s to the index of its matching closer.

  Comments, string literals and '^'-prefixed characters (a heuristic for the
  '^)' sequences common in Emscripten-generated regexes) are skipped.

  Fixes over the original:
  - '/' as the last character no longer reads s[i+1] out of bounds.
  - An unterminated //, /* */ comment or string literal no longer sends the
    scan back to -1 (which caused an infinite rescan loop); the rest of the
    input is simply abandoned.
  - The /* test is now an elif, so a // comment followed by a line starting
    with '*' is no longer mis-parsed as a block comment.
  """
  brace_map = {}

  parens = [] # ()
  brackets = [] # []
  braces = [] # {}

  i = 0
  end = len(s)
  while i < end:
    ch = s[i]
    if ch == '/':
      if i + 1 < end and s[i+1] == '/':
        i = s.find('\n', i)
        if i < 0: break # Unterminated // comment: rest of input is comment.
      elif i + 1 < end and s[i+1] == '*':
        i = s.find('*/', i+2)
        if i < 0: break # Unterminated /* comment.
        i += 1 # Leave i on the '/' of '*/'; the loop increment steps past it.
    elif ch == '"' and rcount(s, '\\', i-1) % 2 == 0:
      i = find_unescaped_end(s, '"', i, end)
      if i < 0: break # Unterminated "" string literal.
    elif ch == "'" and rcount(s, '\\', i-1) % 2 == 0:
      i = find_unescaped_end(s, "'", i, end)
      if i < 0: break # Unterminated '' string literal.
    elif ch == '^':
      # Skip the character after '^' as well; '^)' occurs in generated regexes.
      i += 1
    elif ch == '(':
      if rcount(s, '\\', i-1) % 2 == 0: parens.append(i)
    elif ch == '[':
      if rcount(s, '\\', i-1) % 2 == 0: brackets.append(i)
    elif ch == '{':
      if rcount(s, '\\', i-1) % 2 == 0: braces.append(i)
    elif ch == ')':
      if rcount(s, '\\', i-1) % 2 == 0 and len(parens) > 0:
        brace_map[parens.pop()] = i
    elif ch == ']':
      if rcount(s, '\\', i-1) % 2 == 0 and len(brackets) > 0:
        brace_map[brackets.pop()] = i
    elif ch == '}':
      if rcount(s, '\\', i-1) % 2 == 0 and len(braces) > 0:
        brace_map[braces.pop()] = i
    i += 1
  return brace_map
137
138# Valid characters in Emscripten outputted JS content (in reality valid character set is much more complex, but do not need that here)
# Valid characters in Emscripten outputted JS content (in reality valid character set is much more complex, but do not need that here)
def is_javascript_symbol_char(ch):
  """Return True if ch may appear in an Emscripten-emitted JS identifier: a-z, A-Z, 0-9, $ or _."""
  code = ord(ch)
  return 97 <= code <= 122 or 65 <= code <= 90 or 48 <= code <= 57 or code == 36 or code == 95
142
def cxxfilt():
  """Return the path of an available C++ demangler: llvm-cxxfilt preferred, c++filt as fallback, None if neither is on PATH."""
  return which('llvm-cxxfilt') or which('c++filt')
148
149# Runs the given symbols list through c++filt to demangle.
# Runs the given symbols list through c++filt to demangle.
def cpp_demangle(symbols):
  """Demangle a newline-separated string of C++ symbol names.

  Returns the demangler's output with Windows line endings normalized, or an
  empty string when no demangler is available or anything goes wrong (this is
  best-effort: demangled names are a nicety, not a requirement).

  Fix: reuse the 'filt' path that was already located instead of calling
  cxxfilt() a second time (the second lookup repeated the PATH scan and could
  in principle resolve differently from the path that was just checked).
  """
  try:
    filt = cxxfilt()
    if not filt:
      print('"llvm-cxxfilt" or "c++filt" executable is not found, demangled symbol names will not be available')
      return ''
    proc = subprocess.Popen([filt, '--strip-underscore'], stdout=subprocess.PIPE, stdin=subprocess.PIPE)
    output = proc.communicate(input=symbols)
    return output[0].replace('\r\n', '\n')
  except Exception:
    # Deliberately best-effort: any failure just disables demangled names.
    return ''
161
162# Given a data set, fills in the 'demangled_data' field for each entry.
# Given a data set, fills in the 'demangled_data' field for each entry.
def find_demangled_names(data):
  """Populate entry['demangled_name'] for every entry in data (dict keyed by mangled name).

  All names are demangled in a single c++filt invocation; when the demangler
  produced no output for an entry, the mangled name itself is used.
  """
  if not data:
    return
  mangled_names = list(data.keys())
  demangled = cpp_demangle('\n'.join(mangled_names)).split('\n')
  for i, mangled in enumerate(mangled_names):
    data[mangled]['demangled_name'] = demangled[i].strip() if i < len(demangled) else mangled
170
171# Merges a new_entry with an old entry with the same name accumulating to its size (or adds new)
# Merges a new_entry with an old entry with the same name accumulating to its size (or adds new)
def merge_entry_to_existing(existing_data, new_entry, total_source_set_size):
  """Add new_entry into existing_data keyed by its unminified name.

  When an entry with the same name already exists, line/byte counts and
  occurrence counters are accumulated and the percentage is recomputed against
  total_source_set_size; otherwise new_entry is stored as-is.
  """
  name = new_entry['unminified_name']
  if name not in existing_data:
    existing_data[name] = new_entry
    return

  old = existing_data[name]
  occurrences = old.get('num_times_occurs', 1) + new_entry.get('num_times_occurs', 1)
  combined_bytes = old['bytes'] + new_entry['bytes']

  # Prefer the already-known demangled name; fall back to the incoming entry's
  # demangled name, then to its minified name.
  if 'demangled_name' in old:
    demangled = old['demangled_name']
  elif 'demangled_name' in new_entry:
    demangled = new_entry['demangled_name']
  else:
    demangled = new_entry['minified_name']

  existing_data[name] = {
    'lines': old['lines'] + new_entry['lines'],
    'bytes': combined_bytes,
    'demangled_name': demangled,
    'minified_name': old['minified_name'],
    'unminified_name': old['unminified_name'],
    'function_parameters': old['function_parameters'],
    'type': old['type'],
    'percentage': combined_bytes * 100.0 / total_source_set_size,
    'num_times_occurs': occurrences
  }
191
def merge_to_data_set(to_set, from_set, total_source_set_size):
  """Merge all entries of from_set into to_set.

  When diffing two data sets, entries with the same name are accumulated so the
  names stay comparable across sets. Otherwise colliding keys are kept as
  distinct entries under a suffixed name.

  Fix: the original duplicate-key branch referenced an undefined variable 's'
  ("key = s + '__' + key"), raising a NameError whenever a key collided. The
  intent — keeping both entries under distinct names — is preserved by
  suffixing a counter instead.
  """
  for key, value in from_set.items():
    if diffing_two_data_sets:
      merge_entry_to_existing(to_set, value, total_source_set_size)
    else:
      if key in to_set:
        base = key
        counter = 2
        while key in to_set:
          key = base + '__' + str(counter)
          counter += 1
      to_set[key] = value
199
200# Builds up a dataset of functions and variables in the given JavaScript file (JS or asm.js)
# Builds up a dataset of functions and variables in the given JavaScript file (JS or asm.js)
def analyze_javascript_file_contents(filename, file_contents, total_source_set_size, symbol_map=None):
  """Scan JavaScript source and build a per-symbol size data set.

  filename: name of the originating file (used only in log messages).
  file_contents: the JavaScript text to scan.
  total_source_set_size: total byte size of the whole source set; used to
    compute each entry's 'percentage' field.
  symbol_map: optional minified->unminified name dict for asm.js symbols.

  Returns a dict mapping unminified symbol name -> entry dict with keys
  'lines', 'bytes', 'minified_name', 'unminified_name',
  'function_parameters', 'type' and 'percentage'.
  """
  data = {}
  # Precompute the matching close position of every paren/bracket/brace.
  brace_map = parse_parens(file_contents)
  parse_pos = 0
  prev_end_pos = 0
  file_len = len(file_contents)
  func_regex = re.compile(r'function\s+([\w$]+)\s*\(([\w\s$,]*?)\)\s*{') # Search for "function foo (param1, param2, ..., paramN) {"
  var_block_regex = re.compile(r'var\s+(\w+)\s*=\s*([{\[\(])') # Search for "var foo = {"
  var_regex = re.compile(r'var\s+([\w]+)\s*=\s*[\w\s,]*?;') # Search for "var foo = .... ;"
  unaccounted_bytes = 0
  unaccounted_lines = 0

  # Locate the asm.js module scope, if present: symbols inside the brace pair
  # that encloses the 'use asm' (or 'almost asm') pragma are typed as asm.js.
  asm_start = file_contents.find('use asm')
  asm_start_brace = -1
  asm_end_brace = -1
  asm_type = 'asmjs'
  if asm_start < 0:
    asm_start = file_contents.find('almost asm')
    asm_type = '~asmjs'
  if asm_start >= 0:
    asm_start_brace = file_contents.rfind('{', 0, asm_start)
    if asm_start_brace >= 0:
      asm_end_brace = brace_map[asm_start_brace] if asm_start_brace in brace_map else file_len

  # Main scan loop: repeatedly jump to the next 'function ' or 'var '
  # occurrence (whichever comes first) and try to parse a symbol there.
  func_pos = -1
  var_pos = -1
  while parse_pos < file_len:
    if func_pos < parse_pos: func_pos = file_contents.find('function ', parse_pos)
    if func_pos < 0: func_pos = file_len
    if var_pos < parse_pos: var_pos = file_contents.find('var ', parse_pos)
    if var_pos < 0: var_pos = file_len
    if min(func_pos, var_pos) >= file_len: break
    next_pos = min(func_pos, var_pos)
    parse_pos = next_pos+1

    # Skip this occurrence of 'function' if it had a prefix as part of some other string, e.g. 'foofunction'
    if next_pos > 0 and is_javascript_symbol_char(file_contents[next_pos-1]): continue

    # Bytes between the previous symbol's end and this one are 'unaccounted'.
    if next_pos > prev_end_pos:
      unaccounted_lines += file_contents.count('\n', prev_end_pos, next_pos) + 1
      unaccounted_bytes += next_pos - prev_end_pos
      if options.dump_unaccounted_larger_than >= 0 and next_pos - prev_end_pos > options.dump_unaccounted_larger_than:
        print('--- Unaccounted ' + str(next_pos - prev_end_pos) + ' bytes in ' + filename + ':')
        print(file_contents[prev_end_pos:next_pos])
        print('===')
    prev_end_pos = next_pos

    # Verify that this position actually starts a function by testing against a regex (this is much slower than substring search,
    # which is why it's done as a second step, instead of as primary way to search)
    if next_pos == func_pos:
      func_match = func_regex.match(file_contents[func_pos:])
      if not func_match: continue

      # find starting and ending braces { } for the function
      start_brace = file_contents.find('{', func_pos)
      if start_brace < 0: break # Must be at the end of file
      if start_brace not in brace_map:
        print('Warning: ' + idx_to_line_col(file_contents, start_brace) + ' cannot parse function start brace, skipping.')
        continue
      end_brace = brace_map[start_brace]
      if end_brace < 0: break # Must be at the end of file

      num_bytes = end_brace + 1 - func_pos
      num_lines = file_contents.count('\n', func_pos, end_brace) + 1
      # Resume scanning right after this function body.
      prev_end_pos = parse_pos = end_brace + 1

      # A function wholly inside the asm.js scope gets the asm type; otherwise plain 'js'.
      function_type = asm_type if func_pos >= asm_start_brace and end_brace <= asm_end_brace else 'js'
      minified_name = func_match.group(1)
      function_parameters = func_match.group(2).strip()
      # Only asm.js symbols are unminified via the symbol map.
      if symbol_map and minified_name in symbol_map and function_type == asm_type: unminified_name = symbol_map[minified_name]
      else: unminified_name = minified_name
      data[unminified_name] = {
        'lines': num_lines,
        'bytes': num_bytes,
        'minified_name': minified_name,
        'unminified_name': unminified_name,
        'function_parameters': function_parameters,
        'type': function_type,
        'percentage': num_bytes * 100.0 / total_source_set_size
      }
    else: # This is a variable
      var_block_match = var_block_regex.match(file_contents[var_pos:])
      if var_block_match:
        # find starting and ending braces { } for the var
        start_brace = file_contents.find(var_block_match.group(2), var_pos)
        if start_brace < 0: break # Must be at the end of file
        if start_brace not in brace_map:
          print('Warning: ' + idx_to_line_col(file_contents, start_brace) + ' cannot parse variable start brace, skipping.')
          continue
        end_brace = brace_map[start_brace]
        if end_brace < 0: break # Must be at the end of file
        minified_name = var_block_match.group(1)
      else:
        # A scalar 'var foo = ...;' declaration: runs to the next semicolon.
        start_brace = var_pos
        var_match = var_regex.match(file_contents[var_pos:])
        if not var_match: continue
        end_brace = file_contents.find(';', var_pos)
        minified_name = var_match.group(1)

      # Special case ignore the 'var asm = (function(global, env, buffer) { 'use asm'; ... }; ' variable that contains all the asm.js code.
      # Ignoring this variable lets all the asm.js code be treated as functions in this parser, instead of assigning them to the asm variable.
      if file_contents[start_brace] == '(' and ("'use asm'" in file_contents[var_pos:end_brace] or '"use asm"' in file_contents[var_pos:end_brace] or "'almost asm'" in file_contents[var_pos:end_brace] or '"almost asm"' in file_contents[var_pos:end_brace]): continue

      num_bytes = end_brace + 1 - var_pos
      num_lines = file_contents.count('\n', var_pos, end_brace) + 1
      prev_end_pos = parse_pos = end_brace + 1

      # NOTE(review): this tests func_pos, not var_pos — looks like it may
      # mis-classify variables relative to the asm.js scope; confirm intent.
      var_type = 'asm_var' if func_pos >= asm_start_brace and end_brace <= asm_end_brace else 'var'

      if symbol_map and minified_name in symbol_map and var_type =='asm_var': unminified_name = symbol_map[minified_name].strip()
      else: unminified_name = minified_name
      data[unminified_name] = {
        'lines': num_lines,
        'bytes': num_bytes,
        'minified_name': minified_name,
        'unminified_name': unminified_name,
        'function_parameters': '',
        'type': var_type,
        'percentage': num_bytes * 100.0 / total_source_set_size
      }

  # Optionally record the bytes that no symbol claimed as a synthetic entry.
  if options.list_unaccounted:
    if diffing_two_data_sets: unaccounted_name = '$unaccounted_js_content' # If diffing two data sets, must make the names of the unaccounted content blocks be comparable
    else: unaccounted_name = '$unaccounted_js_content_in("' + os.path.basename(filename) + '")'
    unaccounted_entry = {
      'lines': unaccounted_lines,
      'bytes': unaccounted_bytes,
      'minified_name': unaccounted_name,
      'unminified_name': unaccounted_name,
      'function_parameters': '',
      'type': '[UNKN]',
      'percentage': unaccounted_bytes * 100.0 / total_source_set_size
    }
    merge_entry_to_existing(data, unaccounted_entry, total_source_set_size)

  return data
337
def analyze_javascript_file(filename, total_source_set_size, symbol_map=None):
  """Read and analyze a single JavaScript file; returns its per-symbol data set.

  Fix: use a with-statement so the file handle is closed (the original
  open(...).read() leaked the handle).
  """
  with open(filename, 'rb') as f:
    file_contents = f.read()
  print('Analyzing JS file ' + filename + ', ' + str(len(file_contents)) + ' bytes...')
  return analyze_javascript_file_contents(filename, file_contents, total_source_set_size, symbol_map)
342
def analyze_html_file(filename, total_source_set_size, symbol_map=None):
  """Analyze an HTML file by running the JS analyzer on each <script> block.

  Everything outside <script>...</script> bodies counts as unaccounted HTML
  content, optionally recorded as a synthetic entry.

  Fix: use a with-statement so the file handle is closed (the original
  open(...).read() leaked the handle).
  """
  with open(filename, 'rb') as f:
    file_contents = f.read()
  print('Analyzing HTML file ' + filename + ', ' + str(len(file_contents)) + ' bytes...')
  data = {}
  parse_pos = 0
  file_len = len(file_contents)
  unaccounted_bytes = 0
  unaccounted_lines = 0

  while parse_pos < file_len:
    # Find the interior of the next <script ...> ... </script> element.
    script_pos = file_contents.find('<script', parse_pos)
    if script_pos < 0: break
    script_pos = file_contents.find('>', script_pos)
    if script_pos < 0: break
    script_pos += 1
    script_end_pos = file_contents.find('</script>', script_pos)
    if script_end_pos < 0: break

    # HTML between script blocks (including the <script> tag itself) is unaccounted.
    if script_pos > parse_pos:
      unaccounted_bytes += script_pos - parse_pos
      unaccounted_lines += file_contents.count('\n', parse_pos, script_pos) + 1
    data_set = analyze_javascript_file_contents(filename, file_contents[script_pos:script_end_pos], total_source_set_size, symbol_map)
    merge_to_data_set(data, data_set, total_source_set_size)
    parse_pos = script_end_pos

  # Trailing HTML after the last script block.
  if file_len > parse_pos:
    unaccounted_bytes += file_len - parse_pos
    unaccounted_lines += file_contents.count('\n', parse_pos, file_len) + 1

  if options.list_unaccounted and unaccounted_bytes > 0:
    if diffing_two_data_sets: unaccounted_name = '$unaccounted_html_content' # If diffing two data sets, must make the names of the unaccounted content blocks be comparable
    else: unaccounted_name = '$unaccounted_html_content_in("' + os.path.basename(filename) + '")'
    unaccounted_entry = {
      'lines': unaccounted_lines,
      'bytes': unaccounted_bytes,
      'minified_name': unaccounted_name,
      'unminified_name': unaccounted_name,
      'function_parameters': '',
      'type': 'HTML',
      'percentage': unaccounted_bytes * 100.0 / total_source_set_size
    }
    merge_entry_to_existing(data, unaccounted_entry, total_source_set_size)

  return data
387
def analyze_source_file(filename, total_source_set_size, symbol_map=None):
  """Dispatch to the HTML or JavaScript analyzer based on the file extension."""
  basename = os.path.basename(filename).lower()
  analyzer = analyze_html_file if '.htm' in basename else analyze_javascript_file
  return analyzer(filename, total_source_set_size, symbol_map)
393
def common_compare(data1, data2):
  """Print the total line and byte deltas summed over functions present in both data sets."""
  common = set(data1).intersection(data2)
  line_delta = sum(data2[fn]['lines'] - data1[fn]['lines'] for fn in common)
  byte_delta = sum(data2[fn]['bytes'] - data1[fn]['bytes'] for fn in common)
  lines_word = 'more' if line_delta >= 0 else 'less'
  bytes_word = 'more' if byte_delta >= 0 else 'less'
  print('set 2 has {} lines {} than set 1 in {} common functions'.format(abs(line_delta), lines_word, len(common)))
  print('set 2 has {} bytes {} than set 1 in {} common functions'.format(str(abs(byte_delta)), bytes_word, len(common)))
409
def uniq_compare(data1, data2):
  """Print summary statistics about the functions unique to each of the two data sets."""
  fns1 = set(data1)
  fns2 = set(data2)
  only1 = fns1 - fns2
  only2 = fns2 - fns1

  lines1 = sum(data1[fn]['lines'] for fn in only1)
  bytes1 = sum(data1[fn]['bytes'] for fn in only1)
  lines2 = sum(data2[fn]['lines'] for fn in only2)
  bytes2 = sum(data2[fn]['bytes'] for fn in only2)

  count_diff = len(only2) - len(only1)
  # Sanity: the difference in unique counts must equal the overall count difference.
  assert len(fns2) - len(fns1) == count_diff
  line_diff = lines2 - lines1
  byte_diff = bytes2 - bytes1

  count_word = 'more' if count_diff >= 0 else 'less'
  lines_word = 'more' if line_diff >= 0 else 'less'
  bytes_word = 'more' if byte_diff >= 0 else 'less'
  print('set 2 has {} functions {} than set 1 overall (unique: {} vs {})'.format(abs(count_diff), count_word, len(only2), len(only1)))
  print('set 2 has {} lines {} than set 1 overall in unique functions'.format(abs(line_diff), lines_word))
  print('set 2 has {} bytes {} than set 1 overall in unique functions'.format(str(abs(byte_diff)), bytes_word))
437
438# Use a bunch of regexps to simplify the demangled name
DEM_RE = None
def simplify_cxx_name(name):
  """Shorten a demangled C++ name by collapsing verbose libc++ (std::__2) spellings.

  The substitution list is compiled lazily on first use and cached in the
  module-level DEM_RE; substitutions are applied in order, finishing with a
  plain 'std::__2::' -> 'std::' replacement.
  """
  global DEM_RE
  if DEM_RE is None:
    substitutions = [
      (re.compile(r'std::__2::basic_string<char, std::__2::char_traits<char>, std::__2::allocator<char> >'), r'std::string'),
      (re.compile(r'std::__2::vector<([^,]+), std::__2::allocator<\1\s*> >'), r'std::vector<\1>'),
      (re.compile(r'std::__2::unordered_map<([^,]+), ([^,]+), std::__2::hash<\1\s*>, std::__2::equal_to<\1\s*>, std::__2::allocator<std::__2::pair<\1 const, \2> > >'), r'std::unordered_map<\1, \2>'),
      (re.compile(r'std::__2::__sort<std::__2::__less<([^,]+), \1\s*>&, \1\*>\(\1\*, \1\*, std::__2::__less<\1, \1\s*>&\)'), r'std::sort(\1*, \1*)'),
    ]
    # Default-argument binding captures each pattern/replacement pair.
    DEM_RE = [lambda s, pat=pat, rep=rep: pat.sub(rep, s) for pat, rep in substitutions]
    DEM_RE.append(lambda s: s.replace('std::__2::', 'std::'))

  for simplify in DEM_RE:
    name = simplify(name)
  return name
457
458# 'foo(int, float)' -> 'foo'
# 'foo(int, float)' -> 'foo'
def function_args_removed(s):
  """Return s truncated at the first '(' (i.e. the function name without its argument list)."""
  paren = s.find('(')
  return s[:paren] if paren >= 0 else s
462
463# 'foo(int, float)' -> 'int, float)'
# 'foo(int, float)' -> 'int, float)'
def function_args_part(s):
  """Return everything after the first '(' in s, or '' when there is no argument list."""
  paren = s.find('(')
  return s[paren+1:] if paren >= 0 else ''
467
def sort_key_py2(key_value):
  """Sort helper for (name, entry) pairs: returns the entry column selected by --sort."""
  _name, entry = key_value
  return entry[options.sort]
470
471# Apparently for python 3, one will use the following, but currently untested
472# def sort_key_py3(key, value):
473#   return value[options.sort]
474
def print_symbol_info(data, total_source_set_size):
  """Print one row per symbol in the data set, then a total size summary.

  Sorting, filtering and name formatting are all driven by the global
  command line 'options'. When two data sets were diffed, each row also
  shows the byte/percentage delta between the sets.
  """
  data = list(data.items())
  data.sort(key=sort_key_py2, reverse=not options.sort_ascending)

  total_size = 0
  for unminified_name, e in data:
    # The in_set_1/in_set_2/prev_* fields only exist on diffed data sets.
    if options.only_unique_1 and e['in_set_2']: continue
    if options.only_unique_2 and e['in_set_1']: continue
    if options.only_common and (not e['in_set_1'] or not e['in_set_2']): continue
    prev_bytes = e['prev_bytes'] if 'prev_bytes' in e else 0
    if max(e['bytes'], prev_bytes) < options.filter_size: continue
    if e['bytes'] == prev_bytes and options.only_changes: continue

    minified_name = e['minified_name']
    demangled_name = e['demangled_name']
    if options.simplify_cxx:
      demangled_name = simplify_cxx_name(demangled_name)

    # JS symbols have no C++-style argument list; synthesize one from the parsed parameters.
    if not '(' in demangled_name and 'js' in e['type']: demangled_name_with_args = demangled_name + '(' + e['function_parameters'] + ')'
    else: demangled_name_with_args = demangled_name
    demangled_name = function_args_removed(demangled_name)

    # --filter-name matches case-insensitively against the demangled name with arguments.
    if not options.filter_name in demangled_name_with_args.lower():
      continue

    if e['function_parameters']:
      unminified_name_with_args = unminified_name + '(' + e['function_parameters'] + ')'
      minified_name_with_args = minified_name + '(' + e['function_parameters'] + ')'
    elif 'js' in e['type']:
      unminified_name_with_args = unminified_name + '()'
      minified_name_with_args = minified_name + '()'
    else:
      # Variables have no argument list at all.
      unminified_name_with_args = unminified_name
      minified_name_with_args = minified_name

    # Build up the function name to print based on the desired formatting specifiers (mangled/minified/unminified, yes/no args)
    print_name = []
    for i in options.print_format:
      if i == 'd': print_name += [demangled_name]
      elif i == 'u': print_name += [unminified_name]
      elif i == 'm': print_name += [minified_name]
      elif i == 'D': print_name += [demangled_name_with_args]
      elif i == 'U': print_name += [unminified_name_with_args]
      elif i == 'M': print_name += [minified_name_with_args]

    # Collapse names that are identical
    i = 0
    while i+1 < len(print_name):
      if print_name[i] == print_name[i+1]:
        print_name = print_name[:i] + print_name[i+1:]
        continue
      # Also collapse adjacent names that differ only by one missing argument list:
      # keep the variant that has the arguments.
      n1 = function_args_removed(print_name[i])
      n2 = function_args_removed(print_name[i+1])
      args1 = function_args_part(print_name[i])
      args2 = function_args_part(print_name[i+1])
      if n1 == n2 and (not args1 or not args2):
        if not args1: print_name = print_name[:i] + print_name[i+1:]
        else: print_name = print_name[:i+1] + print_name[i+2:]
        continue
      i += 1

    print_name = ' ; '.join(print_name)
    if 'num_times_occurs' in e:
      print_name = '[' + str(e['num_times_occurs']) + ' times] ' + print_name
    delta_string = ' %+8d (%+6.2f%%)' % (e['bytes'] - e['prev_bytes'], e['percentage'] - e['prev_percentage']) if diffing_two_data_sets else ''
    print('%6d lines %7s (%5.2f%%) %s: %8s %s' % (e['lines'], str(e['bytes']), e['percentage'], delta_string, e['type'], print_name))

    total_size += e['bytes']

  # Summary: only mention the fraction when some symbols were filtered out.
  if total_size < total_source_set_size:
    print('Total size of printed functions: ' + str(total_size) + ' bytes. (%.2f%% of all symbols)' % (total_size * 100.0 / total_source_set_size))
  else:
    print('Total size of printed functions: ' + str(total_size) + ' bytes.')
548
549# Parses Emscripten compiler generated .symbols map file for minified->unminified mappings
# Parses Emscripten compiler generated .symbols map file for minified->unminified mappings
def read_symbol_map(filename):
  """Parse a .symbols file of 'minified:unminified' lines into a dict.

  Returns None when no filename was given.

  Fixes: close the file when done (the original leaked the handle), skip blank
  lines (the original raised ValueError on them), and split on the first ':'
  only so unminified names containing ':' (e.g. C++ scoped names) survive.
  """
  if not filename:
    return None
  symbol_map = {}
  with open(filename) as f:
    for line in f:
      line = line.strip()
      if not line:
        continue
      minified, unminified = line.split(':', 1)
      symbol_map[minified.strip()] = unminified.strip()
  return symbol_map
557
558# Locates foo.js to foo.js.symbols or foo.html.symbols based on default output name rules for Emscripten compiler
# Locates foo.js to foo.js.symbols or foo.html.symbols based on default output name rules for Emscripten compiler
def guess_symbol_map_file_location(sources, symbol_map_file):
  """Return an existing .symbols file path for the given sources, or None.

  The explicitly given symbol_map_file wins when it exists; otherwise each
  source is probed for '<source>.symbols' and '<source with .js->.html>.symbols'.
  """
  if os.path.isfile(symbol_map_file):
    return symbol_map_file
  for source in sources:
    for candidate in (source + '.symbols', source.replace('.js', '.html') + '.symbols'):
      if os.path.isfile(candidate):
        return candidate
  return None
565
566# Returns total byte size of the given list of source files
# Returns total byte size of the given list of source files
def count_file_set_size(sources):
  """Return the combined on-disk size in bytes of all files in 'sources'."""
  return sum(os.path.getsize(source) for source in sources)
572
573# Merges two given data sets into one large data set with diffing information
# Merges two given data sets into one large data set with diffing information
def diff_data_sets(data1, data2):
  """Merge two data sets into one annotated with prev_* / delta / in_set_* fields.

  For each symbol, the entry reflects set 2 (current) with 'prev_*' taken from
  set 1; symbols missing from one set get zeroed values on that side.

  Fix: the original had an unreachable 'if k in data2:' branch inside the
  'k not in data2' case which referenced an undefined variable 'prev'; that
  dead code has been removed (behavior is unchanged since it never executed).
  """
  all_keys = set().union(data1.keys(), data2.keys())
  diffed_data = {}
  for k in all_keys:
    if k in data2:
      e = data2[k].copy()
      e['in_set_2'] = True
      if k in data1:
        prev = data1[k]
        e['prev_percentage'] = prev['percentage']
        e['prev_bytes'] = prev['bytes']
        e['prev_lines'] = prev['lines']
        e['in_set_1'] = True
      else:
        # New in set 2: nothing to diff against.
        e['prev_percentage'] = 0
        e['prev_bytes'] = 0
        e['prev_lines'] = 0
        e['in_set_1'] = False
    else:
      # Present only in set 1: its current (set 2) size is zero.
      e = data1[k].copy()
      e['prev_percentage'] = e['percentage']
      e['prev_lines'] = e['lines']
      e['prev_bytes'] = e['bytes']
      e['in_set_1'] = True
      e['percentage'] = 0
      e['bytes'] = 0
      e['lines'] = 0
      e['in_set_2'] = False
    e['delta'] = e['bytes'] - e['prev_bytes']
    e['delta_percentage'] = e['percentage'] - e['prev_percentage']
    e['abs_delta'] = abs(e['bytes'] - e['prev_bytes'])
    diffed_data[k] = e
  return diffed_data
614
615# Given string s and start index that contains a (, {, <, [, ", or ', finds forward the index where the token closes (taking nesting into account)
# Given string s and start index that contains a (, {, <, [, ", or ', finds forward the index where the token closes (taking nesting into account)
def find_index_of_closing_token(s, start):
  """Return the index of the token that closes the one at s[start], nesting-aware.

  Quote tokens close on the next identical quote (no nesting possible since the
  closer is checked first). If the token never closes, len(s) is returned.
  Raises Exception for an unrecognized start token.

  Fix: the original built the error message with "' start ' + start" where
  start is an int, so the intended Exception was masked by a TypeError.
  """
  pairs = {'(': ')', '{': '}', '<': '>', '[': ']', '"': '"', "'": "'"}
  start_ch = s[start]
  if start_ch not in pairs:
    raise Exception('Unknown start token ' + start_ch + ', string ' + s + ', start ' + str(start))
  end_ch = pairs[start_ch]

  i = start + 1
  nesting_count = 1
  while i < len(s):
    if s[i] == end_ch:
      nesting_count -= 1
      if nesting_count <= 0:
        return i
    elif s[i] == start_ch:
      nesting_count += 1
    i += 1
  return i
638
def compute_templates_collapsed_name(demangled_name):
  """Rewrite template argument lists in a demangled name with short generic names.

  Each distinct template argument string is assigned a single-letter name
  starting at 'T' (then 'U', 'V', ...), so e.g. foo<int> and foo<float> both
  collapse to foo<T> when seen in isolation; repeated argument strings reuse
  the same letter within one name.
  """
  i = 0
  generic_template_name = 'T'
  type_names = {}  # full template argument string -> assigned short name
  while True:
    i = demangled_name.find('<', i)
    if i < 0:
      return demangled_name

    end = find_index_of_closing_token(demangled_name, i)
    if end < 0:
      return demangled_name

    # i now points just past the '<'; the argument list is [i, end).
    i += 1
    template_type = demangled_name[i:end]
    if template_type in type_names:
      template_name = type_names[template_type]
    else:
      # First time we see this argument string: assign the next letter.
      template_name = generic_template_name
      type_names[template_type] = generic_template_name
      generic_template_name = chr(ord(generic_template_name) + 1)

    # Replace the argument list but keep the surrounding '<' and '>', then
    # continue scanning after the replacement for further template lists.
    demangled_name = demangled_name[:i] + template_name + demangled_name[end:]
662
def collapse_templates(data_set, total_source_set_size, no_function_args):
  """Return a new data set where template instantiations of the same symbol are merged.

  Each entry's demangled name has its template argument lists collapsed (and
  optionally its function arguments stripped); entries that end up with the
  same name are accumulated together. Note: the entries of data_set are
  renamed in place before merging.
  """
  collapsed = {}
  for entry in data_set.values():
    if 'demangled_name' in entry:
      collapsed_name = compute_templates_collapsed_name(entry['demangled_name'])
      if no_function_args:
        collapsed_name = function_args_removed(collapsed_name)
      entry['demangled_name'] = entry['unminified_name'] = collapsed_name
    merge_entry_to_existing(collapsed, entry, total_source_set_size)
  return collapsed
674
def print_function_args(options):
  """Return True when the --print-format string requests argument lists (any of D, U, M)."""
  return any(spec in options.print_format for spec in ('D', 'U', 'M'))
677
def main():
  """Entry point: parses command line options, analyzes one or two sets of
  compiled JavaScript build output files, and prints out per-symbol size
  statistics (or a diff between the two sets).

  Returns a process exit code (0 on success, 1 on bad command line)."""
  global options, diffing_two_data_sets
  usage_str = "emdump.py prints out statistics about compiled code sizes.\npython emdump.py --file a.js [--file2 b.js]"
  parser = argparse.ArgumentParser(usage=usage_str)

  parser.add_argument('--file', dest='file', default=[], nargs='*',
    help='Specifies the compiled JavaScript build file to analyze.')

  # --file1 is accepted as a synonym for --file; the two lists are merged
  # after parsing.
  parser.add_argument('--file1', dest='file1', default=[], nargs='*',
    help='Specifies the compiled JavaScript build file to analyze.')

  parser.add_argument('--symbol-map', dest='symbol_map', default='',
    help='Specifies a filename to the symbol map file that can be used to unminify function and variable names.')

  parser.add_argument('--file2', dest='file2', default=[], nargs='*',
    help='Specifies a second compiled JavaScript build file to analyze.')

  parser.add_argument('--symbol-map2', dest='symbol_map2', default='',
    help='Specifies a filename to a second symbol map file that will be used to unminify function and variable names of file2.')

  parser.add_argument('--list-unaccounted', dest='list_unaccounted', type=int, default=1,
    help='Pass --list-unaccounted=0 to skip listing a summary entry of unaccounted content')

  parser.add_argument('--dump-unaccounted-larger-than', dest='dump_unaccounted_larger_than', type=int, default=-1,
    help='If an integer value >= 0 is specified, all unaccounted strings of content longer than the given value will be printed out to the console.\n(Note that it is common to have several unaccounted blocks, this is provided for curiosity/debugging/optimization ideas)')

  parser.add_argument('--only-unique-1', dest='only_unique_1', action='store_true', default=False,
    help='If two data sets are specified, prints out only the symbols that are present in set 1, but not in set 2')

  parser.add_argument('--only-unique-2', dest='only_unique_2', action='store_true', default=False,
    help='If two data sets are specified, prints out only the symbols that are present in set 2, but not in set 1')

  parser.add_argument('--only-common', dest='only_common', action='store_true', default=False,
    help='If two data sets are specified, prints out only the symbols that are common to both data sets')

  parser.add_argument('--only-changes', dest='only_changes', action='store_true', default=False,
    help='If two data sets are specified, prints out only the symbols that have changed size or are added/removed')

  parser.add_argument('--only-summarize', dest='only_summarize', action='store_true', default=False,
    help='If specified, detailed information about each symbol is not printed, but only summary data is shown.')

  parser.add_argument('--filter-name', dest='filter_name', default='',
    help='Only prints out information about symbols that contain the given filter substring in their demangled names. The filtering is always performed in lower case.')

  parser.add_argument('--filter-size', dest='filter_size', type=int, default=0,
    help='Only prints out information about symbols that are (or were) larger than the given amount of bytes.')

  parser.add_argument('--sort', dest='sort', default='bytes',
    help='Specifies the data column to sort output by. Possible values are: lines, bytes, delta, abs_delta, type, minified, unminified, demangled')

  parser.add_argument('--print-format', dest='print_format', default='DM',
    help='Specifies the naming format for the symbols. Possible options are one of: m, u, d, du, dm, um, dum. Here "m" denotes minified, "u" denotes unminified, and "d" denotes demangled. Specify any combination of the characters in upper case to print out function parameters.\nDefault: DM.')

  parser.add_argument('--sort-ascending', dest='sort_ascending', action='store_true', default=False,
    help='If true, reverses the sorting order to be ascending instead of default descending.')

  parser.add_argument('--simplify-cxx', dest='simplify_cxx', action='store_true', default=False,
    help='Simplify C++ STL types as much as possible in the output')

  parser.add_argument('--group-templates', dest='group_templates', action='store_true', default=False,
    help='Group/collapse all C++ templates with Foo<asdf> and Foo<qwer> to generic Foo<T>')

  options = parser.parse_args(sys.argv[1:])
  # --file and --file1 are synonyms; merge both into one input file list.
  options.file = options.file + options.file1

  if len(options.file) == 0:
    print('Specify a set of JavaScript build output files to analyze with --file file1.js file2.js ... fileN.js.\nRun python emdump.py --help to see all options.')
    return 1

  # Name filtering is documented to be case-insensitive; normalize once here.
  options.filter_name = options.filter_name.lower()

  diffing_two_data_sets = len(options.file2) > 0
  if not diffing_two_data_sets:
    # The following options only make sense when diffing two data sets.
    # (The error messages name the actual command line flags.)
    if options.only_unique_1:
      print('Error: Must specify two data sets with --file a.js b.js c.js --file2 d.js e.js f.js to diff in order to use --only-unique-1 option!')
      sys.exit(1)

    if options.only_unique_2:
      print('Error: Must specify two data sets with --file a.js b.js c.js --file2 d.js e.js f.js to diff in order to use --only-unique-2 option!')
      sys.exit(1)

    if options.only_common:
      print('Error: Must specify two data sets with --file a.js b.js c.js --file2 d.js e.js f.js to diff in order to use --only-common option!')
      sys.exit(1)

  # Validate column sorting input:
  valid_sort_options = ['lines', 'bytes', 'delta', 'abs_delta', 'type', 'minified', 'unminified', 'demangled']
  if options.sort not in valid_sort_options:
    print('Invalid sort option ' + options.sort + ' specified! Choose one of: ' + ', '.join(valid_sort_options) + '.')
    sys.exit(1)
  # Map user-facing sort names to the internal data set field names.
  if options.sort == 'minified': options.sort = 'minified_name'
  if options.sort == 'unminified': options.sort = 'unminified_name'
  if options.sort == 'demangled': options.sort = 'demangled_name'

  # 'delta' and 'abs_delta' sorting require two data sets to compare.
  if 'delta' in options.sort and not diffing_two_data_sets:
    print('Error: Must specify two data sets with --file a.js b.js c.js --file2 d.js e.js f.js to diff in order to use --sort='+options.sort)
    sys.exit(1)

  # Autoguess .symbols file location based on default Emscripten build output, to save the need to type it out in the common case
  options.symbol_map = guess_symbol_map_file_location(options.file, options.symbol_map)
  options.symbol_map2 = guess_symbol_map_file_location(options.file2, options.symbol_map2)

  symbol_map1 = read_symbol_map(options.symbol_map)
  symbol_map2 = read_symbol_map(options.symbol_map2)

  # Analyze each input file and merge the results into one data set per side.
  set1_size = count_file_set_size(options.file)
  data1 = {}
  for s in options.file:
    data = analyze_source_file(s, set1_size, symbol_map1)
    merge_to_data_set(data1, data, set1_size)

  set2_size = count_file_set_size(options.file2)
  data2 = {}
  for s in options.file2:
    data = analyze_source_file(s, set2_size, symbol_map2)
    merge_to_data_set(data2, data, set2_size)

  find_demangled_names(data1)
  find_demangled_names(data2)

  if options.group_templates:
    data1 = collapse_templates(data1, set1_size, not print_function_args(options))
    data2 = collapse_templates(data2, set2_size, not print_function_args(options))

  if diffing_two_data_sets:
    diffed_data = diff_data_sets(data1, data2)
    if not options.only_summarize:
      print_symbol_info(diffed_data, set2_size)
      print('')
    # The percentage delta is relative to set 1, the baseline being compared
    # against (dividing by set2_size would compute it against the wrong base).
    print('set 2 is %d bytes, which is %+.2f%% %s than set 1 size (%d bytes)' % (set2_size, (set2_size - set1_size) * 100.0 / set1_size, 'more' if set2_size > set1_size else 'less', set1_size))
    uniq_compare(data1, data2)
    common_compare(data1, data2)
  else:
    if not options.only_summarize:
      print_symbol_info(data1, set1_size)
    # TODO: print some kind of summary?

  return 0
816
# Script entry point: when run directly, exit with main()'s return code.
if __name__ == '__main__':
  sys.exit(main())
819