1#!/usr/local/bin/python3.8 2# Copyright 2018 The Emscripten Authors. All rights reserved. 3# Emscripten is available under two separate licenses, the MIT license and the 4# University of Illinois/NCSA Open Source License. Both these licenses can be 5# found in the LICENSE file. 6 7# -*- Mode: python -*- 8 9""" 10emdump.py prints out statistics about compiled code sizes 11""" 12 13from __future__ import print_function 14import sys 15 16if sys.version_info >= (3,): 17 print("emdump.py unfortunately requires Python 2.7.x (attempted to run in Python " + '.'.join(map(lambda x: str(x), sys.version_info)) + ' from ' + sys.executable + ')') 18 sys.exit(1) 19 20from collections import OrderedDict 21import os, subprocess, functools, re, argparse 22 23# If true, we are printing delta information between two data sets. If false, we are just printing symbol info for a single data set 24diffing_two_data_sets = False 25 26# Global command line options 27options = None 28 29# Finds the given executable 'program' in PATH. Operates like the Unix tool 'which'. 30def which(program): 31 def is_exe(fpath): 32 return os.path.isfile(fpath) and os.access(fpath, os.X_OK) 33 34 fpath, fname = os.path.split(program) 35 if fpath: 36 if is_exe(program): 37 return program 38 else: 39 for path in os.environ["PATH"].split(os.pathsep): 40 path = path.strip('"') 41 exe_file = os.path.join(path, program) 42 if is_exe(exe_file): 43 return exe_file 44 45 if os.name == 'nt' and '.' not in fname: 46 if is_exe(exe_file + '.exe'): 47 return exe_file + '.exe' 48 if is_exe(exe_file + '.cmd'): 49 return exe_file + '.cmd' 50 if is_exe(exe_file + '.bat'): 51 return exe_file + '.bat' 52 53 return None 54 55# Given a string s and an index i, counts how many times character ch is repeated looking backwards at s[i], s[i-1], s[i-2], s[i-3], ... 56def rcount(s, ch, i): 57 j = i 58 while j > 0 and s[j] == ch: 59 j -= 1 60 return i - j 61 62# Finds the index where a "foo" or 'foo' string ends in the given string s. Given string s and index 'start' to a string symbol " or ', finds the matching index where the string ends. 63# This takes into account escapes in the middle, i.e. "foo\\\\\\\"bar" will be properly matched. 64def find_unescaped_end(s, ch, start, end): 65 if s[start] != ch: raise Exception('Index start should point to starting occurrence of ch') 66 start += 1 67 while start < end: 68 if s[start] == ch and rcount(s, '\\', start-1) % 2 == 0: 69 return start 70 start += 1 71 return -1 72 73# Transforms linear index to string to file, column pair. (for debugging use only, need to build index->file:line mapping table for batch operations) 74def idx_to_line_col(s, i): 75 line = s.count('\n', 0, i)+1 76 last_n = s.rfind('\n', 0, i) 77 return 'line ' + str(line) + ', column ' + str(i-last_n) + ' (idx ' + str(i) + ')' 78 79# Given a string, returns brace_map dictionary that maps starting parens/brackets/braces indices to their ending positions. 80# This can be brittle since we are not able to parse JS proper, but good enough for Emscripten compiled output. (some debugging code retained in body if you run into a tricky case) 81def parse_parens(s): 82 brace_map = {} 83 84 parens = [] # () 85 brackets = [] # [] 86 braces = [] # {} 87 88 i = 0 89 end = len(s) 90 while i < end: 91 ch = s[i] 92 if ch == '/': 93 if i < end and s[i+1] == '/': 94 prev = i 95 i = s.find('\n', i) 96# print(idx_to_line_col(s, prev) + ' starts // comment, skipping to ' + idx_to_line_col(s, i)) 97 if i < end and s[i+1] == '*': 98 prev = i 99 i = s.find('*/', i+2)+1 100# print(idx_to_line_col(s, prev) + ' starts /* comment, skipping to ' + idx_to_line_col(s, i)) 101 elif ch == '"' and rcount(s, '\\', i-1) % 2 == 0: 102 prev = i 103 i = find_unescaped_end(s, '"', i, end) 104# print(idx_to_line_col(s, prev) + ' is a "" string, skipping to ' + idx_to_line_col(s, i)) 105 elif ch == "'" and rcount(s, '\\', i-1) % 2 == 0: 106 prev = i 107 i = find_unescaped_end(s, "'", i, end) 108# print(idx_to_line_col(s, prev) + ' is a \'\' string, skipping to ' + idx_to_line_col(s, i)) 109 elif ch == '^': # Ignore parens/brackets/braces if the previous character was a '^'. This is a bit of a heuristic, '^)' occur commonly in Emscripten generated regexes 110 i += 1 111 elif ch == '(': 112 if rcount(s, '\\', i-1) % 2 == 0: parens.append(i) 113# print(idx_to_line_col(s, i) + ' has (') 114 elif ch == '[': 115 if rcount(s, '\\', i-1) % 2 == 0: brackets.append(i) 116# print(idx_to_line_col(s, i) + ' has [') 117 elif ch == '{': 118 if rcount(s, '\\', i-1) % 2 == 0: braces.append(i) 119# print(idx_to_line_col(s, i) + ' has {') 120 elif ch == ')': 121 if rcount(s, '\\', i-1) % 2 == 0: 122# print(idx_to_line_col(s, i) + ' has )') 123 if len(parens) > 0: brace_map[parens.pop()] = i 124# else: print('Warning: ' + idx_to_line_col(s, i) + ' has ), but could not find the opening parenthesis.') 125 elif ch == ']': 126 if rcount(s, '\\', i-1) % 2 == 0: 127# print(idx_to_line_col(s, i) + ' has ]') 128 if len(brackets) > 0: brace_map[brackets.pop()] = i 129# else: print('Warning: ' + idx_to_line_col(s, i) + ' has ], but could not find the opening bracket.') 130 elif ch == '}': 131 if rcount(s, '\\', i-1) % 2 == 0: 132# print(idx_to_line_col(s, i) + ' has }') 133 if len(braces) > 0: brace_map[braces.pop()] = i 134# else: print('Warning: ' + idx_to_line_col(s, i) + ' has }, but could not find the opening brace.') 135 i += 1 136 return brace_map 137 138# Valid characters in Emscripten outputted JS content (in reality valid character set is much more complex, but do not need that here) 139def is_javascript_symbol_char(ch): 140 i = ord(ch) 141 return (i >= 97 and i <= 122) or (i >= 65 and i <= 90) or (i >= 48 and i <= 57) or i == 36 or i == 95 # a-z, A-Z, 0-9, $, _ 142 143def cxxfilt(): 144 filt = which('llvm-cxxfilt') 145 if filt: 146 return filt 147 return which('c++filt') 148 149# Runs the given symbols list through c++filt to demangle. 150def cpp_demangle(symbols): 151 try: 152 filt = cxxfilt() 153 if not filt: 154 print('"llvm-cxxfilt" or "c++filt" executable is not found, demangled symbol names will not be available') 155 return '' 156 proc = subprocess.Popen([cxxfilt(), '--strip-underscore'], stdout=subprocess.PIPE, stdin=subprocess.PIPE) 157 output = proc.communicate(input=symbols) 158 return output[0].replace('\r\n', '\n') 159 except Exception as e: 160 return '' 161 162# Given a data set, fills in the 'demangled_data' field for each entry. 163def find_demangled_names(data): 164 if not data or len(data) == 0: return 165 data_lines = list(data.keys()) 166 demangled_names = cpp_demangle('\n'.join(data_lines)).split('\n') 167 for i in xrange(len(data)): 168 mangled = data_lines[i] 169 data[mangled]['demangled_name'] = demangled_names[i].strip() if i < len(demangled_names) else mangled 170 171# Merges a new_entry with an old entry with the same name accumulating to its size (or adds new) 172def merge_entry_to_existing(existing_data, new_entry, total_source_set_size): 173 name = new_entry['unminified_name'] 174 if name in existing_data: 175 ex = existing_data[name] 176 num_times_occurs_1 = ex['num_times_occurs'] if 'num_times_occurs' in ex else 1 177 num_times_occurs_2 = new_entry['num_times_occurs'] if 'num_times_occurs' in new_entry else 1 178 existing_data[name] = { 179 'lines': ex['lines'] + new_entry['lines'], 180 'bytes': ex['bytes'] + new_entry['bytes'], 181 'demangled_name': ex['demangled_name'] if 'demangled_name' in ex else (new_entry['demangled_name'] if 'demangled_name' in new_entry else new_entry['minified_name']), 182 'minified_name': ex['minified_name'], 183 'unminified_name': ex['unminified_name'], 184 'function_parameters': ex['function_parameters'], 185 'type': ex['type'], 186 'percentage': (ex['bytes'] + new_entry['bytes']) * 100.0 / total_source_set_size, 187 'num_times_occurs': num_times_occurs_1 + num_times_occurs_2 188 } 189 else: 190 existing_data[name] = new_entry 191 192def merge_to_data_set(to_set, from_set, total_source_set_size): 193 for key, value in from_set.items(): 194 if diffing_two_data_sets: 195 merge_entry_to_existing(to_set, value, total_source_set_size) 196 else: 197 if key in to_set: key = s + '__' + key 198 to_set[key] = value 199 200# Builds up a dataset of functions and variables in the given JavaScript file (JS or asm.js) 201def analyze_javascript_file_contents(filename, file_contents, total_source_set_size, symbol_map=None): 202 data = {} 203 brace_map = parse_parens(file_contents) 204 parse_pos = 0 205 prev_end_pos = 0 206 file_len = len(file_contents) 207 func_regex = re.compile(r'function\s+([\w$]+)\s*\(([\w\s$,]*?)\)\s*{') # Search for "function foo (param1, param2, ..., paranN) {" 208 var_block_regex = re.compile(r'var\s+(\w+)\s*=\s*([{\[\(])') # Search for "var foo = {" 209 var_regex = re.compile(r'var\s+([\w]+)\s*=\s*[\w\s,]*?;') # Search for "var foo = .... ;" 210 unaccounted_bytes = 0 211 unaccounted_lines = 0 212 213 asm_start = file_contents.find('use asm') 214 asm_start_brace = -1 215 asm_end_brace = -1 216 asm_type = 'asmjs' 217 if asm_start < 0: 218 asm_start = file_contents.find('almost asm') 219 asm_type = '~asmjs' 220 if asm_start >= 0: 221 asm_start_brace = file_contents.rfind('{', 0, asm_start) 222 if asm_start_brace >= 0: 223 asm_end_brace = brace_map[asm_start_brace] if asm_start_brace in brace_map else file_len 224 225 func_pos = -1 226 var_pos = -1 227 while parse_pos < file_len: 228 if func_pos < parse_pos: func_pos = file_contents.find('function ', parse_pos) 229 if func_pos < 0: func_pos = file_len 230 if var_pos < parse_pos: var_pos = file_contents.find('var ', parse_pos) 231 if var_pos < 0: var_pos = file_len 232 if min(func_pos, var_pos) >= file_len: break 233 next_pos = min(func_pos, var_pos) 234 parse_pos = next_pos+1 235 236 # Skip this occurrence of 'function' if it had a prefix as part of some other string, e.g. 'foofunction' 237 if next_pos > 0 and is_javascript_symbol_char(file_contents[next_pos-1]): continue 238 239 if next_pos > prev_end_pos: 240 unaccounted_lines += file_contents.count('\n', prev_end_pos, next_pos) + 1 241 unaccounted_bytes += next_pos - prev_end_pos 242 if options.dump_unaccounted_larger_than >= 0 and next_pos - prev_end_pos > options.dump_unaccounted_larger_than: 243 print('--- Unaccounted ' + str(next_pos - prev_end_pos) + ' bytes in ' + filename + ':') 244 print(file_contents[prev_end_pos:next_pos]) 245 print('===') 246 prev_end_pos = next_pos 247 248 # Verify that this position actually starts a function by testing against a regex (this is much slower than substring search, 249 # which is why it's done as a second step, instead of as primary way to search) 250 if next_pos == func_pos: 251 func_match = func_regex.match(file_contents[func_pos:]) 252 if not func_match: continue 253 254 # find starting and ending braces { } for the function 255 start_brace = file_contents.find('{', func_pos) 256 if start_brace < 0: break # Must be at the end of file 257 if start_brace not in brace_map: 258 print('Warning: ' + idx_to_line_col(file_contents, start_brace) + ' cannot parse function start brace, skipping.') 259 continue 260 end_brace = brace_map[start_brace] 261 if end_brace < 0: break # Must be at the end of file 262 263 num_bytes = end_brace + 1 - func_pos 264 num_lines = file_contents.count('\n', func_pos, end_brace) + 1 265 prev_end_pos = parse_pos = end_brace + 1 266 267 function_type = asm_type if func_pos >= asm_start_brace and end_brace <= asm_end_brace else 'js' 268 minified_name = func_match.group(1) 269 function_parameters = func_match.group(2).strip() 270 if symbol_map and minified_name in symbol_map and function_type == asm_type: unminified_name = symbol_map[minified_name] 271 else: unminified_name = minified_name 272 data[unminified_name] = { 273 'lines': num_lines, 274 'bytes': num_bytes, 275 'minified_name': minified_name, 276 'unminified_name': unminified_name, 277 'function_parameters': function_parameters, 278 'type': function_type, 279 'percentage': num_bytes * 100.0 / total_source_set_size 280 } 281 else: # This is a variable 282 var_block_match = var_block_regex.match(file_contents[var_pos:]) 283 if var_block_match: 284 # find starting and ending braces { } for the var 285 start_brace = file_contents.find(var_block_match.group(2), var_pos) 286 if start_brace < 0: break # Must be at the end of file 287 if start_brace not in brace_map: 288 print('Warning: ' + idx_to_line_col(file_contents, start_brace) + ' cannot parse variable start brace, skipping.') 289 continue 290 end_brace = brace_map[start_brace] 291 if end_brace < 0: break # Must be at the end of file 292 minified_name = var_block_match.group(1) 293 else: 294 start_brace = var_pos 295 var_match = var_regex.match(file_contents[var_pos:]) 296 if not var_match: continue 297 end_brace = file_contents.find(';', var_pos) 298 minified_name = var_match.group(1) 299 300 # Special case ignore the 'var asm = (function(global, env, buffer) { 'use asm'; ... }; ' variable that contains all the asm.js code. 301 # Ignoring this variable lets all the asm.js code be trated as functions in this parser, instead of assigning them to the asm variable. 302 if file_contents[start_brace] == '(' and ("'use asm'" in file_contents[var_pos:end_brace] or '"use asm"' in file_contents[var_pos:end_brace] or "'almost asm'" in file_contents[var_pos:end_brace] or '"almost asm"' in file_contents[var_pos:end_brace]): continue 303 304 num_bytes = end_brace + 1 - var_pos 305 num_lines = file_contents.count('\n', var_pos, end_brace) + 1 306 prev_end_pos = parse_pos = end_brace + 1 307 308 var_type = 'asm_var' if func_pos >= asm_start_brace and end_brace <= asm_end_brace else 'var' 309 310 if symbol_map and minified_name in symbol_map and var_type =='asm_var': unminified_name = symbol_map[minified_name].strip() 311 else: unminified_name = minified_name 312 data[unminified_name] = { 313 'lines': num_lines, 314 'bytes': num_bytes, 315 'minified_name': minified_name, 316 'unminified_name': unminified_name, 317 'function_parameters': '', 318 'type': var_type, 319 'percentage': num_bytes * 100.0 / total_source_set_size 320 } 321 322 if options.list_unaccounted: 323 if diffing_two_data_sets: unaccounted_name = '$unaccounted_js_content' # If diffing two data sets, must make the names of the unaccounted content blocks be comparable 324 else: unaccounted_name = '$unaccounted_js_content_in("' + os.path.basename(filename) + '")' 325 unaccounted_entry = { 326 'lines': unaccounted_lines, 327 'bytes': unaccounted_bytes, 328 'minified_name': unaccounted_name, 329 'unminified_name': unaccounted_name, 330 'function_parameters': '', 331 'type': '[UNKN]', 332 'percentage': unaccounted_bytes * 100.0 / total_source_set_size 333 } 334 merge_entry_to_existing(data, unaccounted_entry, total_source_set_size) 335 336 return data 337 338def analyze_javascript_file(filename, total_source_set_size, symbol_map=None): 339 file_contents = open(filename, 'rb').read() 340 print('Analyzing JS file ' + filename + ', ' + str(len(file_contents)) + ' bytes...') 341 return analyze_javascript_file_contents(filename, file_contents, total_source_set_size, symbol_map) 342 343def analyze_html_file(filename, total_source_set_size, symbol_map=None): 344 file_contents = open(filename, 'rb').read() 345 print('Analyzing HTML file ' + filename + ', ' + str(len(file_contents)) + ' bytes...') 346 data = {} 347 parse_pos = 0 348 file_len = len(file_contents) 349 unaccounted_bytes = 0 350 unaccounted_lines = 0 351 352 while parse_pos < file_len: 353 script_pos = file_contents.find('<script', parse_pos) 354 if script_pos < 0: break 355 script_pos = file_contents.find('>', script_pos) 356 if script_pos < 0: break 357 script_pos += 1 358 script_end_pos = file_contents.find('</script>', script_pos) 359 if script_end_pos < 0: break 360 361 if script_pos > parse_pos: 362 unaccounted_bytes += script_pos - parse_pos 363 unaccounted_lines += file_contents.count('\n', parse_pos, script_pos) + 1 364 data_set = analyze_javascript_file_contents(filename, file_contents[script_pos:script_end_pos], total_source_set_size, symbol_map) 365 merge_to_data_set(data, data_set, total_source_set_size) 366 parse_pos = script_end_pos 367 368 if file_len > parse_pos: 369 unaccounted_bytes += file_len - parse_pos 370 unaccounted_lines += file_contents.count('\n', parse_pos, file_len) + 1 371 372 if options.list_unaccounted and unaccounted_bytes > 0: 373 if diffing_two_data_sets: unaccounted_name = '$unaccounted_html_content' # If diffing two data sets, must make the names of the unaccounted content blocks be comparable 374 else: unaccounted_name = '$unaccounted_html_content_in("' + os.path.basename(filename) + '")' 375 unaccounted_entry = { 376 'lines': unaccounted_lines, 377 'bytes': unaccounted_bytes, 378 'minified_name': unaccounted_name, 379 'unminified_name': unaccounted_name, 380 'function_parameters': '', 381 'type': 'HTML', 382 'percentage': unaccounted_bytes * 100.0 / total_source_set_size 383 } 384 merge_entry_to_existing(data, unaccounted_entry, total_source_set_size) 385 386 return data 387 388def analyze_source_file(filename, total_source_set_size, symbol_map=None): 389 if '.htm' in os.path.basename(filename).lower(): 390 return analyze_html_file(filename, total_source_set_size, symbol_map) 391 else: 392 return analyze_javascript_file(filename, total_source_set_size, symbol_map) 393 394def common_compare(data1, data2): 395 fns1 = set(data1.keys()) 396 fns2 = set(data2.keys()) 397 commonfns = fns1.intersection(fns2) 398 commonlinediff = 0 399 commonbytediff = 0 400 for fn in commonfns: 401 d1 = data1[fn] 402 d2 = data2[fn] 403 commonlinediff += d2['lines'] - d1['lines'] 404 commonbytediff += d2['bytes'] - d1['bytes'] 405 linesword = 'more' if commonlinediff >= 0 else 'less' 406 bytesword = 'more' if commonbytediff >= 0 else 'less' 407 print('set 2 has {} lines {} than set 1 in {} common functions'.format(abs(commonlinediff), linesword, len(commonfns))) 408 print('set 2 has {} bytes {} than set 1 in {} common functions'.format(str(abs(commonbytediff)), bytesword, len(commonfns))) 409 410def uniq_compare(data1, data2): 411 fns1 = set(data1.keys()) 412 fns2 = set(data2.keys()) 413 uniqfns1 = fns1 - fns2 414 uniqfns2 = fns2 - fns1 415 uniqlines1 = 0 416 uniqbytes1 = 0 417 uniqlines2 = 0 418 uniqbytes2 = 0 419 for fn in uniqfns1: 420 d = data1[fn] 421 uniqlines1 += d['lines'] 422 uniqbytes1 += d['bytes'] 423 for fn in uniqfns2: 424 d = data2[fn] 425 uniqlines2 += d['lines'] 426 uniqbytes2 += d['bytes'] 427 uniqcountdiff = len(uniqfns2) - len(uniqfns1) 428 assert len(fns2) - len(fns1) == uniqcountdiff 429 uniqlinediff = uniqlines2 - uniqlines1 430 uniqbytediff = uniqbytes2 - uniqbytes1 431 countword = 'more' if uniqcountdiff >= 0 else 'less' 432 linesword = 'more' if uniqlinediff >= 0 else 'less' 433 bytesword = 'more' if uniqbytediff >= 0 else 'less' 434 print('set 2 has {} functions {} than set 1 overall (unique: {} vs {})'.format(abs(uniqcountdiff), countword, len(uniqfns2), len(uniqfns1))) 435 print('set 2 has {} lines {} than set 1 overall in unique functions'.format(abs(uniqlinediff), linesword)) 436 print('set 2 has {} bytes {} than set 1 overall in unique functions'.format(str(abs(uniqbytediff)), bytesword)) 437 438# Use a bunch of regexps to simplify the demangled name 439DEM_RE = None 440def simplify_cxx_name(name): 441 global DEM_RE 442 if DEM_RE is None: 443 DEM_RE = [] 444 string_m = re.compile(r'std::__2::basic_string<char, std::__2::char_traits<char>, std::__2::allocator<char> >') 445 DEM_RE.append(lambda s: string_m.sub(r'std::string', s)) 446 vec_m = re.compile(r'std::__2::vector<([^,]+), std::__2::allocator<\1\s*> >') 447 DEM_RE.append(lambda s: vec_m.sub(r'std::vector<\1>', s)) 448 unordered_map_m = re.compile(r'std::__2::unordered_map<([^,]+), ([^,]+), std::__2::hash<\1\s*>, std::__2::equal_to<\1\s*>, std::__2::allocator<std::__2::pair<\1 const, \2> > >') 449 DEM_RE.append(lambda s: unordered_map_m.sub(r'std::unordered_map<\1, \2>', s)) 450 sort_m = re.compile(r'std::__2::__sort<std::__2::__less<([^,]+), \1\s*>&, \1\*>\(\1\*, \1\*, std::__2::__less<\1, \1\s*>&\)') 451 DEM_RE.append(lambda s: sort_m.sub(r'std::sort(\1*, \1*)', s)) 452 DEM_RE.append(lambda s: s.replace('std::__2::', 'std::')) 453 454 for dem in DEM_RE: 455 name = dem(name) 456 return name 457 458# 'foo(int, float)' -> 'foo' 459def function_args_removed(s): 460 if '(' in s: return s[:s.find('(')] 461 else: return s 462 463# 'foo(int, float)' -> 'int, float)' 464def function_args_part(s): 465 if '(' in s: return s[s.find('(')+1:] 466 else: return '' 467 468def sort_key_py2(key_value): 469 return key_value[1][options.sort] 470 471# Apparently for python 3, one will use the following, but currently untested 472# def sort_key_py3(key, value): 473# return value[options.sort] 474 475def print_symbol_info(data, total_source_set_size): 476 data = list(data.items()) 477 data.sort(key=sort_key_py2, reverse=not options.sort_ascending) 478 479 total_size = 0 480 for unminified_name, e in data: 481 if options.only_unique_1 and e['in_set_2']: continue 482 if options.only_unique_2 and e['in_set_1']: continue 483 if options.only_common and (not e['in_set_1'] or not e['in_set_2']): continue 484 prev_bytes = e['prev_bytes'] if 'prev_bytes' in e else 0 485 if max(e['bytes'], prev_bytes) < options.filter_size: continue 486 if e['bytes'] == prev_bytes and options.only_changes: continue 487 488 minified_name = e['minified_name'] 489 demangled_name = e['demangled_name'] 490 if options.simplify_cxx: 491 demangled_name = simplify_cxx_name(demangled_name) 492 493 if not '(' in demangled_name and 'js' in e['type']: demangled_name_with_args = demangled_name + '(' + e['function_parameters'] + ')' 494 else: demangled_name_with_args = demangled_name 495 demangled_name = function_args_removed(demangled_name) 496 497 if not options.filter_name in demangled_name_with_args.lower(): 498 continue 499 500 if e['function_parameters']: 501 unminified_name_with_args = unminified_name + '(' + e['function_parameters'] + ')' 502 minified_name_with_args = minified_name + '(' + e['function_parameters'] + ')' 503 elif 'js' in e['type']: 504 unminified_name_with_args = unminified_name + '()' 505 minified_name_with_args = minified_name + '()' 506 else: 507 unminified_name_with_args = unminified_name 508 minified_name_with_args = minified_name 509 510 # Build up the function name to print based on the desired formatting specifiers (mangled/minified/unminified, yes/no args) 511 print_name = [] 512 for i in options.print_format: 513 if i == 'd': print_name += [demangled_name] 514 elif i == 'u': print_name += [unminified_name] 515 elif i == 'm': print_name += [minified_name] 516 elif i == 'D': print_name += [demangled_name_with_args] 517 elif i == 'U': print_name += [unminified_name_with_args] 518 elif i == 'M': print_name += [minified_name_with_args] 519 520 # Collapse names that are identical 521 i = 0 522 while i+1 < len(print_name): 523 if print_name[i] == print_name[i+1]: 524 print_name = print_name[:i] + print_name[i+1:] 525 continue 526 n1 = function_args_removed(print_name[i]) 527 n2 = function_args_removed(print_name[i+1]) 528 args1 = function_args_part(print_name[i]) 529 args2 = function_args_part(print_name[i+1]) 530 if n1 == n2 and (not args1 or not args2): 531 if not args1: print_name = print_name[:i] + print_name[i+1:] 532 else: print_name = print_name[:i+1] + print_name[i+2:] 533 continue 534 i += 1 535 536 print_name = ' ; '.join(print_name) 537 if 'num_times_occurs' in e: 538 print_name = '[' + str(e['num_times_occurs']) + ' times] ' + print_name 539 delta_string = ' %+8d (%+6.2f%%)' % (e['bytes'] - e['prev_bytes'], e['percentage'] - e['prev_percentage']) if diffing_two_data_sets else '' 540 print('%6d lines %7s (%5.2f%%) %s: %8s %s' % (e['lines'], str(e['bytes']), e['percentage'], delta_string, e['type'], print_name)) 541 542 total_size += e['bytes'] 543 544 if total_size < total_source_set_size: 545 print('Total size of printed functions: ' + str(total_size) + ' bytes. (%.2f%% of all symbols)' % (total_size * 100.0 / total_source_set_size)) 546 else: 547 print('Total size of printed functions: ' + str(total_size) + ' bytes.') 548 549# Parses Emscripten compiler generated .symbols map file for minified->unminified mappings 550def read_symbol_map(filename): 551 if not filename: return 552 symbol_map = {} 553 for line in open(filename): 554 minified, unminified = line.split(':') 555 symbol_map[minified.strip()] = unminified.strip() 556 return symbol_map 557 558# Locates foo.js to foo.js.symbols or foo.html.symbols based on default output name rules for Emscripten compiler 559def guess_symbol_map_file_location(sources, symbol_map_file): 560 if os.path.isfile(symbol_map_file): return symbol_map_file 561 for s in sources: 562 if os.path.isfile(s + '.symbols'): return s + '.symbols' 563 if os.path.isfile(s.replace('.js', '.html') + '.symbols'): return s.replace('.js', '.html') + '.symbols' 564 return None 565 566# Returns total byte size of the given list of source files 567def count_file_set_size(sources): 568 total_size = 0 569 for s in sources: 570 total_size += os.path.getsize(s) 571 return total_size 572 573# Merges two given data sets into one large data set with diffing information 574def diff_data_sets(data1, data2): 575 all_keys = set().union(data1.keys(), data2.keys()) 576 diffed_data = {} 577 for k in all_keys: 578 if k in data2: 579 e = data2[k].copy() 580 e['in_set_2'] = True 581 if k in data1: 582 prev = data1[k] 583 e['prev_percentage'] = prev['percentage'] 584 e['prev_bytes'] = prev['bytes'] 585 e['prev_lines'] = prev['lines'] 586 e['in_set_1'] = True 587 else: 588 e['prev_percentage'] = 0 589 e['prev_bytes'] = 0 590 e['prev_lines'] = 0 591 e['in_set_1'] = False 592 else: 593 e = data1[k].copy() 594 e['prev_percentage'] = e['percentage'] 595 e['prev_lines'] = e['lines'] 596 e['prev_bytes'] = e['bytes'] 597 e['in_set_1'] = True 598 if k in data2: 599 new = data2[k] 600 e['percentage'] = prev['percentage'] 601 e['bytes'] = prev['bytes'] 602 e['lines'] = prev['lines'] 603 e['in_set_2'] = True 604 else: 605 e['percentage'] = 0 606 e['bytes'] = 0 607 e['lines'] = 0 608 e['in_set_2'] = False 609 e['delta'] = e['bytes'] - e['prev_bytes'] 610 e['delta_percentage'] = e['percentage'] - e['prev_percentage'] 611 e['abs_delta'] = abs(e['bytes'] - e['prev_bytes']) 612 diffed_data[k] = e 613 return diffed_data 614 615# Given string s and start index that contains a (, {, <, [, ", or ', finds forward the index where the token closes (taking nesting into account) 616def find_index_of_closing_token(s, start): 617 start_ch = s[start] 618 if start_ch == '(': end_ch = ')' 619 elif start_ch == '{': end_ch = '}' 620 elif start_ch == '<': end_ch = '>' 621 elif start_ch == '[': end_ch = ']' 622 elif start_ch == '"': end_ch = '"' 623 elif start_ch == "'": end_ch = "'" 624 else: 625 raise Exception('Unknown start token ' + start_ch + ', string ' + s + ', start ' + start) 626 627 i = start + 1 628 nesting_count = 1 629 while i < len(s): 630 if s[i] == end_ch: 631 nesting_count -= 1 632 if nesting_count <= 0: 633 return i 634 elif s[i] == start_ch: 635 nesting_count += 1 636 i += 1 637 return i 638 639def compute_templates_collapsed_name(demangled_name): 640 i = 0 641 generic_template_name = 'T' 642 type_names = {} 643 while True: 644 i = demangled_name.find('<', i) 645 if i < 0: 646 return demangled_name 647 648 end = find_index_of_closing_token(demangled_name, i) 649 if end < 0: 650 return demangled_name 651 652 i += 1 653 template_type = demangled_name[i:end] 654 if template_type in type_names: 655 template_name = type_names[template_type] 656 else: 657 template_name = generic_template_name 658 type_names[template_type] = generic_template_name 659 generic_template_name = chr(ord(generic_template_name) + 1) 660 661 demangled_name = demangled_name[:i] + template_name + demangled_name[end:] 662 663def collapse_templates(data_set, total_source_set_size, no_function_args): 664 collapsed_data_set = {} 665 keys = data_set.keys() 666 for k in keys: 667 e = data_set[k] 668 if 'demangled_name' in e: 669 demangled_name = compute_templates_collapsed_name(e['demangled_name']) 670 if no_function_args: demangled_name = function_args_removed(demangled_name) 671 e['demangled_name'] = e['unminified_name'] = demangled_name 672 merge_entry_to_existing(collapsed_data_set, e, total_source_set_size) 673 return collapsed_data_set 674 675def print_function_args(options): 676 return 'D' in options.print_format or 'U' in options.print_format or 'M' in options.print_format 677 678def main(): 679 global options, diffing_two_data_sets 680 usage_str = "emdump.py prints out statistics about compiled code sizes.\npython emdump.py --file a.js [--file2 b.js]" 681 parser = argparse.ArgumentParser(usage=usage_str) 682 683 parser.add_argument('--file', dest='file', default=[], nargs='*', 684 help='Specifies the compiled JavaScript build file to analyze.') 685 686 parser.add_argument('--file1', dest='file1', default=[], nargs='*', 687 help='Specifies the compiled JavaScript build file to analyze.') 688 689 parser.add_argument('--symbol-map', dest='symbol_map', default='', 690 help='Specifies a filename to the symbol map file that can be used to unminify function and variable names.') 691 692 parser.add_argument('--file2', dest='file2', default=[], nargs='*', 693 help='Specifies a second compiled JavaScript build file to analyze.') 694 695 parser.add_argument('--symbol-map2', dest='symbol_map2', default='', 696 help='Specifies a filename to a second symbol map file that will be used to unminify function and variable names of file2.') 697 698 parser.add_argument('--list-unaccounted', dest='list_unaccounted', type=int, default=1, 699 help='Pass --list-unaccounted=0 to skip listing a summary entry of unaccounted content') 700 701 parser.add_argument('--dump-unaccounted-larger-than', dest='dump_unaccounted_larger_than', type=int, default=-1, 702 help='If an integer value >= 0 is specified, all unaccounted strings of content longer than the given value will be printed out to the console.\n(Note that it is common to have several unaccounted blocks, this is provided for curiosity/debugging/optimization ideas)') 703 704 parser.add_argument('--only-unique-1', dest='only_unique_1', action='store_true', default=False, 705 help='If two data sets are specified, prints out only the symbols that are present in set 1, but not in set 2') 706 707 parser.add_argument('--only-unique-2', dest='only_unique_2', action='store_true', default=False, 708 help='If two data sets are specified, prints out only the symbols that are present in set 2, but not in set 1') 709 710 parser.add_argument('--only-common', dest='only_common', action='store_true', default=False, 711 help='If two data sets are specified, prints out only the symbols that are common to both data sets') 712 713 parser.add_argument('--only-changes', dest='only_changes', action='store_true', default=False, 714 help='If two data sets are specified, prints out only the symbols that have changed size or are added/removed') 715 716 parser.add_argument('--only-summarize', dest='only_summarize', action='store_true', default=False, 717 help='If specified, detailed information about each symbol is not printed, but only summary data is shown.') 718 719 parser.add_argument('--filter-name', dest='filter_name', default='', 720 help='Only prints out information about symbols that contain the given filter substring in their demangled names. The filtering is always performed in lower case.') 721 722 parser.add_argument('--filter-size', dest='filter_size', type=int, default=0, 723 help='Only prints out information about symbols that are (or were) larger than the given amount of bytes.') 724 725 parser.add_argument('--sort', dest='sort', default='bytes', 726 help='Specifies the data column to sort output by. Possible values are: lines, bytes, delta, abs_delta, type, minified, unminified, demangled') 727 728 parser.add_argument('--print-format', dest='print_format', default='DM', 729 help='Specifies the naming format for the symbols. Possible options are one of: m, u, d, du, dm, um, dum. Here "m" denotes minified, "u" denotes unminified, and "d" denotes demangled. Specify any combination of the characters in upper case to print out function parameters.\nDefault: DM.') 730 731 parser.add_argument('--sort-ascending', dest='sort_ascending', action='store_true', default=False, 732 help='If true, reverses the sorting order to be ascending instead of default descending.') 733 734 parser.add_argument('--simplify-cxx', dest='simplify_cxx', action='store_true', default=False, 735 help='Simplify C++ STL types as much as possible in the output') 736 737 parser.add_argument('--group-templates', dest='group_templates', action='store_true', default=False, 738 help='Group/collapse all C++ templates with Foo<asdf> and Foo<qwer> to generic Foo<T>') 739 740 options = parser.parse_args(sys.argv[1:]) 741 options.file = options.file + options.file1 742 743 if len(options.file) == 0: 744 print('Specify a set of JavaScript build output files to analyze with --file file1.js file2.js ... fileN.js.\nRun python emdump.py --help to see all options.') 745 return 1 746 747 options.filter_name = options.filter_name.lower() 748 749 diffing_two_data_sets = len(options.file2) > 0 750 if not diffing_two_data_sets: 751 if options.only_unique_1: 752 print('Error: Must specify two data sets with --file a.js b.js c.js --file2 d.js e.js f.js to diff in order to use --only-unique-symbols-in-set-1 option!') 753 sys.exit(1) 754 755 if options.only_unique_2: 756 print('Error: Must specify two data sets with --file a.js b.js c.js --file2 d.js e.js f.js to diff in order to use --only-unique-symbols-in-set-2 option!') 757 sys.exit(1) 758 759 if options.only_common: 760 print('Error: Must specify two data sets with --file a.js b.js c.js --file2 d.js e.js f.js to diff in order to use --only-common-symbols option!') 761 sys.exit(1) 762 763 # Validate column sorting input: 764 valid_sort_options = ['lines', 'bytes', 'delta', 'abs_delta', 'type', 'minified', 'unminified', 'demangled'] 765 if options.sort not in valid_sort_options: 766 print('Invalid sort option ' + options.sort + ' specified! Choose one of: ' + ', '.join(valid_sort_options) + '.') 767 sys.exit(1) 768 if options.sort == 'minified': options.sort = 'minified_name' 769 if options.sort == 'unminified': options.sort = 'unminified_name' 770 if options.sort == 'demangled': options.sort = 'demangled_name' 771 772 if 'delta' in options.sort and not diffing_two_data_sets: 773 print('Error: Must specify two data sets with --file a.js b.js c.js --file2 d.js e.js f.js to diff in order to use --sort='+options.sort) 774 sys.exit(1) 775 776 # Autoguess .symbols file location based on default Emscripten build output, to save the need to type it out in the common case 777 options.symbol_map = guess_symbol_map_file_location(options.file, options.symbol_map) 778 options.symbol_map2 = guess_symbol_map_file_location(options.file2, options.symbol_map2) 779 780 symbol_map1 = read_symbol_map(options.symbol_map) 781 symbol_map2 = read_symbol_map(options.symbol_map2) 782 783 set1_size = count_file_set_size(options.file) 784 data1 = {} 785 for s in options.file: 786 data = analyze_source_file(s, set1_size, symbol_map1) 787 merge_to_data_set(data1, data, set1_size) 788 789 set2_size = count_file_set_size(options.file2) 790 data2 = {} 791 for s in options.file2: 792 data = analyze_source_file(s, set2_size, symbol_map2) 793 merge_to_data_set(data2, data, set2_size) 794 795 find_demangled_names(data1) 796 find_demangled_names(data2) 797 798 if options.group_templates: 799 data1 = collapse_templates(data1, set1_size, not print_function_args(options)) 800 data2 = collapse_templates(data2, set2_size, not print_function_args(options)) 801 802 if diffing_two_data_sets: 803 diffed_data = diff_data_sets(data1, data2) 804 if not options.only_summarize: 805 print_symbol_info(diffed_data, set2_size) 806 print('') 807 print('set 2 is %d bytes, which is %+.2f%% %s than set 1 size (%d bytes)' % (set2_size, (set2_size - set1_size) * 100.0 / set2_size, 'more' if set2_size > set1_size else 'less', set1_size)) 808 uniq_compare(data1, data2) 809 common_compare(data1, data2) 810 else: 811 if not options.only_summarize: 812 print_symbol_info(data1, set1_size) 813 # TODO: print some kind of summary? 814 815 return 0 816 817if __name__ == '__main__': 818 sys.exit(main()) 819