#!/usr/bin/env python

"""A tool for extracting a list of symbols to export

When exporting symbols from a dll or exe we either need to mark the symbols in
the source code as __declspec(dllexport) or supply a list of symbols to the
linker. This program automates the latter by inspecting the symbol tables of a
list of link inputs and deciding which of those symbols need to be exported.

We can't just export all the defined symbols, as there's a limit of 65535
exported symbols and in clang we go way over that, particularly in a debug
build. Therefore a large part of the work is pruning symbols which either can't
be imported, or which we think are things that have definitions in public header
files (i.e. template instantiations) and which would get defined in the thing
importing these symbols anyway.
"""

from __future__ import print_function
import sys
import re
import os
import subprocess
import multiprocessing
import argparse

# Define functions which extract a list of symbols from a library using several
# different tools. We use subprocess.Popen and yield a symbol at a time instead
# of using subprocess.check_output and returning a list as, especially on
# Windows, waiting for the entire output to be ready can take a significant
# amount of time.

# Patterns that are applied to every line of a symbol dump; compiled once so
# the per-line loops below don't have to look them up again for each line.
# dumpbin /symbols: external symbols that are defined in some section
_DUMPBIN_SYMBOL_RE = re.compile(r"^.+SECT.+External\s+\|\s+(\S+).*$")
# nm -P: external symbols that are defined in some section
_NM_SYMBOL_RE = re.compile(r"^(\S+)\s+[BDGRSTVW]\s+\S+\s+\S+$")


def _iter_tool_output(command):
    """Run `command` and yield its standard output one line at a time.

    `command` is the argument list handed to subprocess.Popen. The child's
    stdin is closed straight away so it cannot wait for input. The child's
    stdout is closed and the child is waited for both when the output is
    exhausted and when the consumer stops iterating early (closing this
    generator), so the process is always reaped.
    """
    process = subprocess.Popen(command, bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    try:
        for line in process.stdout:
            yield line
    finally:
        # Close our end of the pipe before waiting so that a child which is
        # still writing can't keep us blocked in wait().
        process.stdout.close()
        process.wait()


def dumpbin_get_symbols(lib):
    """Yield the defined external symbols of `lib` using 'dumpbin /symbols'."""
    for line in _iter_tool_output(['dumpbin', '/symbols', lib]):
        match = _DUMPBIN_SYMBOL_RE.match(line)
        if match:
            yield match.group(1)


def nm_get_symbols(lib):
    """Yield the defined external symbols of `lib` using 'nm -P'.

    On AIX (sys.platform starting with 'aix') nm is additionally passed
    '-Xany', '-C' and '-p'.
    """
    if sys.platform.startswith('aix'):
        command = ['nm', '-P', '-Xany', '-C', '-p', lib]
    else:
        command = ['nm', '-P', lib]
    for line in _iter_tool_output(command):
        match = _NM_SYMBOL_RE.match(line)
        if match:
            yield match.group(1)


def readobj_get_symbols(lib):
    """Yield the defined external symbols of `lib` using 'llvm-readobj'."""
    # When looking through the output of llvm-readobj we expect to see Name,
    # Section, then StorageClass, so record Name and Section when we see
    # them and decide if this is a defined external symbol when we see
    # StorageClass.
    name = None
    section = None
    for line in _iter_tool_output(['llvm-readobj', '-symbols', lib]):
        match = re.search(r'Name: (\S+)', line)
        if match:
            name = match.group(1)
        match = re.search(r'Section: (\S+)', line)
        if match:
            section = match.group(1)
        match = re.search(r'StorageClass: (\S+)', line)
        if match:
            storageclass = match.group(1)
            # A StorageClass seen before any Name/Section cannot be
            # attributed to a symbol, so it is skipped instead of failing
            # on variables that were never assigned.
            if name is None or section is None:
                continue
            if section != 'IMAGE_SYM_ABSOLUTE' and \
               section != 'IMAGE_SYM_UNDEFINED' and \
               storageclass == 'External':
                yield name

# Define functions which determine if the target is 32-bit Windows (as that's
# where calling convention name decoration happens).


def dumpbin_is_32bit_windows(lib):
    """Return True if 'dumpbin /headers' reports the machine of `lib` as x86.

    Returns False if the machine is something else or no 'machine' line is
    found.
    """
    # dumpbin /headers can output a huge amount of data (>100MB in a debug
    # build) so we read only up to the 'machine' line then close the output.
    lines = _iter_tool_output(['dumpbin', '/headers', lib])
    try:
        for line in lines:
            match = re.match(r'.+machine \((\S+)\)', line)
            if match:
                return match.group(1) == 'x86'
        return False
    finally:
        # Closing the generator closes the pipe and reaps dumpbin
        lines.close()


def objdump_is_32bit_windows(lib):
    """Return True if 'objdump -f' reports the file format of `lib` as pe-i386.

    Returns False if the format is something else or no 'file format' line is
    found. Raises subprocess.CalledProcessError if objdump exits with a
    non-zero status.
    """
    output = subprocess.check_output(['objdump', '-f', lib],
                                     universal_newlines=True)
    # check_output returns a single string, so it has to be split into lines
    # explicitly; iterating over the string itself would give one character
    # at a time and the pattern could never match.
    for line in output.splitlines():
        match = re.match(r'.+file format (\S+)', line)
        if match:
            return match.group(1) == 'pe-i386'
    return False


def readobj_is_32bit_windows(lib):
    """Return True if llvm-readobj reports the format of `lib` as COFF-i386.

    Returns False if the format is something else or no 'Format:' line is
    found. Raises subprocess.CalledProcessError if llvm-readobj exits with a
    non-zero status.
    """
    output = subprocess.check_output(['llvm-readobj', '-file-headers', lib],
                                     universal_newlines=True)
    # As above: split the string into lines rather than iterating characters
    for line in output.splitlines():
        match = re.match(r'Format: (\S+)', line)
        if match:
            return match.group(1) == 'COFF-i386'
    return False

# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
# identifier/type mangling we can decide which symbols could possibly be
# required and which we can discard.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a symbol using the Microsoft mangling should be exported.

    symbol is the name as reported by the symbol-dumping tool.
    calling_convention_decoration is true when unmangled names carry calling
    convention decoration that has to be removed.

    Returns the name to export (which for unmangled names may have had its
    decoration removed), or None if the symbol should be discarded.
    """
    # Keep unmangled (i.e. extern "C") names
    if '?' not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match(r'[_@]([^@]+)', symbol)
            if match:
                return match.group(1)
        return symbol
    # Function template instantiations start with ?$; keep the instantiations of
    # clang::Type::getAs, as some of them are explicit specializations that are
    # defined in clang's lib/AST/Type.cpp; discard the rest as it's assumed that
    # the definition is public
    if re.match(r'\?\?\$getAs@.+@Type@clang@@', symbol):
        return symbol
    if symbol.startswith('??$'):
        return None
    # Deleting destructors start with ?_G or ?_E and can be discarded because
    # link.exe gives you a warning telling you they can't be exported if you
    # don't
    if symbol.startswith(('??_G', '??_E')):
        return None
    # Constructors (?0) and destructors (?1) of templates (?$) are assumed to be
    # defined in headers and not required to be kept
    if symbol.startswith(('??0?$', '??1?$')):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    if re.search(r'\?A(0x\w+)?@', symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
    # bit of a mess and imprecise, but that avoids having to completely demangle
    # the symbol name. The outermost namespace is at the end of the identifier
    # mangling, and the identifier mangling is followed by the type mangling, so
    # we look for (llvm|clang)@@ followed by something that looks like a
    # function type mangling. To spot a function type we use (this is derived
    # from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    if re.search(r'(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$', symbol):
        return symbol
    return None

# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
# demangle the identifier mangling to identify symbols that can be safely
# discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether a symbol using the Itanium mangling should be exported.

    symbol is the name as reported by the symbol-dumping tool.
    calling_convention_decoration is true when every symbol is expected to
    carry a leading '_' that has to be removed.

    Returns the name to export (with that leading '_' removed when
    applicable), or None if the symbol should be discarded.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith('_'):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith(('_', '.')):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match(r'_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Constructors and destructors of templates classes are assumed to be
    # defined in headers and not required to be kept. (The length check stops
    # a nested name with a single component from indexing past the start of
    # the list.)
    if len(names) > 1 and re.match('[CD][123]', names[-1][0]) \
       and names[-2][1]:
        return None
    # Keep the instantiations of clang::Type::getAs, as some of them are
    # explicit specializations that are defined in clang's lib/AST/Type.cpp;
    # discard any other function template instantiations as it's assumed that
    # the definition is public
    if symbol.startswith('_ZNK5clang4Type5getAs'):
        return symbol
    if names[-1][1]:
        return None
    # Keep llvm:: and clang:: names
    if names[0][0] == '4llvm' or names[0][0] == '5clang':
        return symbol
    # Discard everything else
    return None

# Certain kinds of complex manglings we assume cannot be part of a public
# interface, and we handle them by raising an exception.
class TooComplexName(Exception):
    """Raised when a mangled name is too complicated to be worth parsing."""
    pass

# Parse an itanium mangled name from the start of a string and return a
# (name, rest of string) pair.
def parse_itanium_name(arg):
    """Split one component off the front of an Itanium-mangled name.

    Returns a (name, rest) pair. For a length-prefixed source name the
    returned name keeps its length prefix (e.g. '4llvm'). If no component can
    be recognised the result is (None, arg).
    """
    # Check for a normal name
    match = re.match(r'(\d+)(.+)', arg)
    if match:
        n = int(match.group(1))
        name = match.group(1) + match.group(2)[:n]
        rest = match.group(2)[n:]
        return name, rest
    # Check for constructor/destructor names
    match = re.match(r'([CD][123])(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match(r'([^E]+)(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg

# Parse an itanium mangled template argument list from the start of a string
# and throw it away, returning the rest of the string.
def skip_itanium_template(arg):
    """Skip the template argument list at the start of `arg`.

    arg must start with 'I'. Returns the text following the 'E' that closes
    the argument list, or None if the string runs out before the list is
    closed. Raises TooComplexName if the argument list contains something
    that starts like an expression ('L' or 'X').
    """
    # A template argument list starts with I
    assert arg.startswith('I'), arg
    tmp = arg[1:]
    while tmp:
        # Check for names
        match = re.match(r'(\d+)(.+)', tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = re.match(r'S[A-Z0-9]*_(.+)', tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith('N'):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith('L') or tmp.startswith('X'):
            raise TooComplexName
        # End of the template
        elif tmp.startswith('E'):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    # Ran out of input (or a nested parse failed) before the closing E
    return None

# Parse an itanium mangled nested name and transform it into a list of pairs of
# (name, is_template), returning (list, rest of string).
def parse_itanium_nested_name(arg):
    """Parse the Itanium-mangled nested name at the start of `arg`.

    arg must start with 'N'. Returns (names, rest) where names is a list of
    (name, is_template) pairs from the outermost scope inwards and rest is
    the text following the 'E' that closes the nested name. Returns
    (None, None) if the name cannot be parsed. TooComplexName raised while
    skipping template arguments is passed on to the caller.
    """
    # A nested name starts with N
    assert arg.startswith('N'), arg
    ret = []

    # Skip past the N, and possibly a substitution
    match = re.match(r'NS[A-Z0-9]*_(.+)', arg)
    if match:
        tmp = match.group(1)
    else:
        tmp = arg[1:]

    # Skip past CV-qualifiers and ref qualifiers
    match = re.match(r'[rVKRO]*(.+)', tmp)
    if match:
        tmp = match.group(1)

    # Repeatedly parse names from the string until we reach the end of the
    # nested name
    while tmp:
        # An E ends the nested name
        if tmp.startswith('E'):
            return ret, tmp[1:]
        # Parse a name
        name_part, tmp = parse_itanium_name(tmp)
        if not name_part:
            # If we failed then we don't know how to demangle this
            return None, None
        is_template = False
        # If this name is a template record that, then skip the template
        # arguments
        if tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
            is_template = True
        # Add the name to the list
        ret.append((name_part, is_template))

    # If we get here then something went wrong
    return None, None


def extract_symbols(arg):
    """Count the symbols of one library that should be kept.

    arg is a (get_symbols, should_keep_symbol, calling_convention_decoration,
    lib) tuple; everything is packed into a single argument so that this
    function can be used with multiprocessing.Pool.map_async.

    Returns a dict mapping each kept name (as returned by should_keep_symbol)
    to the number of times it was seen in lib.
    """
    get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    symbols = dict()
    for symbol in get_symbols(lib):
        symbol = should_keep_symbol(symbol, calling_convention_decoration)
        if symbol:
            symbols[symbol] = symbols.get(symbol, 0) + 1
    return symbols


# Filename suffixes of the libraries and objects we know how to handle
_LIB_SUFFIXES = ('.lib', '.a', '.obj', '.o')


def _find_tools(tool_exes):
    """Pick the functions to use from the first tools in tool_exes that exist.

    Returns (get_symbols, is_32bit_windows); either is None if none of the
    tools that could be run provides that function.
    """
    tools = { 'dumpbin' : (dumpbin_get_symbols, dumpbin_is_32bit_windows),
              'nm' : (nm_get_symbols, None),
              'objdump' : (None, objdump_is_32bit_windows),
              'llvm-readobj' : (readobj_get_symbols, readobj_is_32bit_windows) }
    get_symbols = None
    is_32bit_windows = None
    # Find a tool to use by trying each in turn until we find one that exists
    # (subprocess.Popen will throw OSError when the program does not exist)
    for exe in tool_exes:
        try:
            # Close std streams as we don't want any output and we don't
            # want the process to wait for something on stdin.
            p = subprocess.Popen([exe], stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 stdin=subprocess.PIPE,
                                 universal_newlines=True)
            p.stdout.close()
            p.stderr.close()
            p.stdin.close()
            p.wait()
        except OSError:
            continue
        # Keep going until we have a tool to use for both get_symbols and
        # is_32bit_windows
        if not get_symbols:
            get_symbols = tools[exe][0]
        if not is_32bit_windows:
            is_32bit_windows = tools[exe][1]
        if get_symbols and is_32bit_windows:
            break
    return get_symbols, is_32bit_windows


def _resolve_lib(lib):
    """Turn a command line argument into the filename of a library or object.

    An argument that already ends in a known suffix is returned unchanged.
    Otherwise the first existing file out of <lib><suffix> and
    lib<lib><suffix>, trying each suffix in turn, is returned. Returns None
    if no such file exists.
    """
    if lib.endswith(_LIB_SUFFIXES):
        return lib
    for suffix in _LIB_SUFFIXES:
        for candidate in (lib + suffix, 'lib' + lib + suffix):
            if os.path.exists(candidate):
                return candidate
    return None


def _template_member_key(symbol, mangling):
    """Return the function+class key of a member function of a template class.

    Returns None if symbol is not recognised as a member function of a
    template class.
    """
    if mangling == 'microsoft':
        # Member functions of templates start with
        # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
        # As manglings go from the innermost scope to the outermost scope
        # this means:
        #  * When we have a function member of a subclass of a template
        #    class then <fn_name> will actually contain the mangling of
        #    both the subclass and the function member. This is fine.
        #  * When we have a function member of a template subclass of a
        #    (possibly template) class then it's the innermost template
        #    subclass that becomes <class_name>. This should be OK so long
        #    as we don't have multiple classes with a template subclass of
        #    the same name.
        match = re.search(r"^\?(\??\w+\@\?\$\w+)\@", symbol)
        if match:
            return match.group(1)
        return None
    # Find member functions of templates by demangling the name and
    # checking if the second-to-last name in the list is a template.
    match = re.match(r'_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        # Manglings that are too complex should already have been
        # filtered out, but if we happen to somehow see one here
        # just leave it as-is.
        return None
    # A name with a single component has no enclosing class to look at
    if names and len(names) > 1 and names[-2][1]:
        return ''.join([x for x, _ in names])
    return None


def main():
    """Parse the command line, extract the symbols and print those to export."""
    tool_exes = ['dumpbin','nm','objdump','llvm-readobj']
    parser = argparse.ArgumentParser(
        description='Extract symbols to export from libraries')
    parser.add_argument('--mangling', choices=['itanium','microsoft'],
                        required=True, help='expected symbol mangling scheme')
    parser.add_argument('--tools', choices=tool_exes, nargs='*',
                        help='tools to use to extract symbols and determine the'
                        ' target')
    parser.add_argument('libs', metavar='lib', type=str, nargs='+',
                        help='libraries to extract symbols from')
    parser.add_argument('-o', metavar='file', type=str, help='output to file')
    args = parser.parse_args()

    # Determine the function to use to get the list of symbols from the inputs,
    # and the function to use to determine if the target is 32-bit windows.
    # If we have a tools argument then use that for the list of tools to check
    if args.tools:
        tool_exes = args.tools
    get_symbols, is_32bit_windows = _find_tools(tool_exes)
    if not get_symbols:
        print("Couldn't find a program to read symbols with", file=sys.stderr)
        sys.exit(1)
    if not is_32bit_windows:
        print("Couldn't find a program to determining the target", file=sys.stderr)
        sys.exit(1)

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == 'microsoft':
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    libs = list()
    for lib in args.libs:
        # When invoked by cmake the arguments are the cmake target names of the
        # libraries, so we need to add .lib/.a to the end and maybe lib to the
        # start to get the filename. Also allow objects.
        filename = _resolve_lib(lib)
        if filename is None:
            print("Don't know what to do with argument "+lib, file=sys.stderr)
            sys.exit(1)
        libs.append(filename)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = is_32bit_windows(libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [(get_symbols, should_keep_symbol, calling_convention_decoration, x) for x in libs]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbols = dict()
    for this_lib_symbols in libs_symbols:
        for k, v in this_lib_symbols.items():
            symbols[k] = symbols.get(k, 0) + v

    # Count instances of member functions of template classes, and map the
    # symbol name to the function+class. We do this under the assumption that if
    # a member function of a template class is instantiated many times it's
    # probably declared in a public header file.
    # The "" entry stands for "not a member function of a template class" and
    # its count always stays at zero.
    template_function_count = {"": 0}
    template_function_mapping = dict()
    for k in symbols:
        name = _template_member_key(k, args.mangling)
        if name:
            template_function_count[name] = \
                template_function_count.get(name, 0) + 1
            template_function_mapping[k] = name
        else:
            template_function_mapping[k] = ""

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Aren't instances of member functions of templates which have been
    #    instantiated 100 times or more, which are assumed to have public
    #    definitions. (100 is an arbitrary guess here.)
    if args.o:
        outfile = open(args.o,'w')
    else:
        outfile = sys.stdout
    try:
        for k, v in symbols.items():
            template_count = template_function_count[template_function_mapping[k]]
            if v == 1 and template_count < 100:
                print(k, file=outfile)
    finally:
        # Make sure everything reaches the output file; stdout is not ours to
        # close.
        if outfile is not sys.stdout:
            outfile.close()


if __name__ == '__main__':
    main()