1#!/usr/bin/env python
2
3"""A tool for extracting a list of symbols to export
4
5When exporting symbols from a dll or exe we either need to mark the symbols in
6the source code as __declspec(dllexport) or supply a list of symbols to the
7linker. This program automates the latter by inspecting the symbol tables of a
8list of link inputs and deciding which of those symbols need to be exported.
9
10We can't just export all the defined symbols, as there's a limit of 65535
11exported symbols and in clang we go way over that, particularly in a debug
build. Therefore a large part of the work is pruning symbols which either can't
be imported, or which we think have definitions in public header files (i.e.
template instantiations) and so would get defined in the thing importing these
symbols anyway.
16"""
17
18from __future__ import print_function
19import sys
20import re
21import os
22import subprocess
23import multiprocessing
24import argparse
25
26# Define functions which extract a list of symbols from a library using several
27# different tools. We use subprocess.Popen and yield a symbol at a time instead
28# of using subprocess.check_output and returning a list as, especially on
29# Windows, waiting for the entire output to be ready can take a significant
30# amount of time.
31
def dumpbin_get_symbols(lib):
    """Yield the defined external symbols printed by 'dumpbin /symbols'.

    Args:
        lib: path of the library or object file to pass to dumpbin.

    Yields:
        The name of each symbol whose output line mentions a section (SECT)
        and is marked External, in the order dumpbin prints them.

    Raises:
        OSError: if dumpbin cannot be started.
    """
    # A raw string keeps the regex escapes from being read as (invalid) string
    # escape sequences, which newer versions of Python warn about.
    pattern = re.compile(r"^.+SECT.+External\s+\|\s+(\S+).*$")
    process = subprocess.Popen(['dumpbin','/symbols',lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # We never send any input, so close stdin to stop the tool waiting on it
    process.stdin.close()
    try:
        for line in process.stdout:
            # Look for external symbols that are defined in some section
            match = pattern.match(line)
            if match:
                yield match.group(1)
    finally:
        # This also runs when the consumer stops iterating early, so the pipe
        # is closed and the child is waited for instead of being left behind.
        process.stdout.close()
        process.wait()
43
def nm_get_symbols(lib):
    """Yield the defined external symbols printed by 'nm -P'.

    Args:
        lib: path of the library or object file to pass to nm.

    Yields:
        The name of each symbol whose output line has four fields and whose
        type letter is one of B, D, G, R, S, T, V or W, in the order nm prints
        them.

    Raises:
        OSError: if nm cannot be started.
    """
    # A raw string keeps the regex escapes from being read as (invalid) string
    # escape sequences, which newer versions of Python warn about.
    pattern = re.compile(r"^(\S+)\s+[BDGRSTVW]\s+\S+\s+\S+$")
    command = ['nm','-P']
    if sys.platform.startswith('aix'):
        # AIX needs some extra options; everything else is the same
        command += ['-Xany','-C','-p']
    command.append(lib)
    process = subprocess.Popen(command, bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # We never send any input, so close stdin to stop the tool waiting on it
    process.stdin.close()
    try:
        for line in process.stdout:
            # Look for external symbols that are defined in some section
            match = pattern.match(line)
            if match:
                yield match.group(1)
    finally:
        # This also runs when the consumer stops iterating early, so the pipe
        # is closed and the child is waited for instead of being left behind.
        process.stdout.close()
        process.wait()
60
def readobj_get_symbols(lib):
    """Yield the defined external symbols printed by 'llvm-readobj -symbols'.

    Args:
        lib: path of the library or object file to pass to llvm-readobj.

    Yields:
        The name of each symbol whose StorageClass is External and whose
        Section is neither IMAGE_SYM_ABSOLUTE nor IMAGE_SYM_UNDEFINED, in the
        order llvm-readobj prints them.

    Raises:
        OSError: if llvm-readobj cannot be started.
    """
    # Raw strings keep the regex escapes from being read as (invalid) string
    # escape sequences, which newer versions of Python warn about.
    name_pattern = re.compile(r'Name: (\S+)')
    section_pattern = re.compile(r'Section: (\S+)')
    storageclass_pattern = re.compile(r'StorageClass: (\S+)')
    process = subprocess.Popen(['llvm-readobj','-symbols',lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # We never send any input, so close stdin to stop the tool waiting on it
    process.stdin.close()
    # Most recently seen Name and Section; None until the first one shows up
    name = None
    section = None
    try:
        for line in process.stdout:
            # When looking through the output of llvm-readobj we expect to see
            # Name, Section, then StorageClass, so record Name and Section when
            # we see them and decide if this is a defined external symbol when
            # we see StorageClass.
            match = name_pattern.search(line)
            if match:
                name = match.group(1)
            match = section_pattern.search(line)
            if match:
                section = match.group(1)
            match = storageclass_pattern.search(line)
            if match:
                storageclass = match.group(1)
                # If the output isn't in the expected order there is nothing
                # sensible to report for this StorageClass line, so skip it
                # rather than failing on a name that was never recorded.
                if name is None or section is None:
                    continue
                if section != 'IMAGE_SYM_ABSOLUTE' and \
                   section != 'IMAGE_SYM_UNDEFINED' and \
                   storageclass == 'External':
                    yield name
    finally:
        # This also runs when the consumer stops iterating early, so the pipe
        # is closed and the child is waited for instead of being left behind.
        process.stdout.close()
        process.wait()
85
86# Define functions which determine if the target is 32-bit Windows (as that's
87# where calling convention name decoration happens).
88
def dumpbin_is_32bit_windows(lib):
    """Return whether 'dumpbin /headers' reports the machine of lib as x86.

    Args:
        lib: path of the library or object file to pass to dumpbin.

    Returns:
        True if the first 'machine (...)' line names x86, otherwise False
        (including when no such line is printed).

    Raises:
        OSError: if dumpbin cannot be started.
    """
    # dumpbin /headers can output a huge amount of data (>100MB in a debug
    # build) so we read only up to the 'machine' line then close the output.
    process = subprocess.Popen(['dumpbin','/headers',lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    retval = False
    try:
        for line in process.stdout:
            # A raw string keeps the regex escapes from being read as
            # (invalid) string escape sequences.
            match = re.match(r'.+machine \((\S+)\)', line)
            if match:
                retval = (match.group(1) == 'x86')
                break
    finally:
        # Close the pipe and wait for the child even if reading failed
        process.stdout.close()
        process.wait()
    return retval
105
def objdump_is_32bit_windows(lib):
    """Return whether 'objdump -f' reports the file format of lib as pe-i386.

    Args:
        lib: path of the library or object file to pass to objdump.

    Returns:
        True if the first 'file format' line names pe-i386, otherwise False
        (including when no such line is printed).

    Raises:
        OSError: if objdump cannot be started.
        subprocess.CalledProcessError: if objdump exits with a non-zero status.
    """
    output = subprocess.check_output(['objdump','-f',lib],
                                     universal_newlines=True)
    # check_output returns the whole output as a single string, and iterating
    # over a string gives one character at a time, so split it into lines
    # before matching.
    for line in output.splitlines():
        match = re.match(r'.+file format (\S+)', line)
        if match:
            return (match.group(1) == 'pe-i386')
    return False
114
def readobj_is_32bit_windows(lib):
    """Return whether 'llvm-readobj -file-headers' reports lib as COFF-i386.

    Args:
        lib: path of the library or object file to pass to llvm-readobj.

    Returns:
        True if the first line starting with 'Format:' names COFF-i386,
        otherwise False (including when no such line is printed).

    Raises:
        OSError: if llvm-readobj cannot be started.
        subprocess.CalledProcessError: if llvm-readobj exits with a non-zero
            status.
    """
    output = subprocess.check_output(['llvm-readobj','-file-headers',lib],
                                     universal_newlines=True)
    # check_output returns the whole output as a single string, and iterating
    # over a string gives one character at a time, so split it into lines
    # before matching.
    for line in output.splitlines():
        match = re.match(r'Format: (\S+)', line)
        if match:
            return (match.group(1) == 'COFF-i386')
    return False
123
124# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
125# identifier/type mangling we can decide which symbols could possibly be
126# required and which we can discard.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a symbol with Microsoft mangling should be exported.

    Args:
        symbol: the symbol name as read from the symbol table.
        calling_convention_decoration: True if unmangled names carry calling
            convention decoration that has to be removed.

    Returns:
        The name to export, or None if the symbol should be discarded. The
        name is the symbol itself, except for an unmangled name when
        calling_convention_decoration is set, where the decoration is removed.
    """
    # Keep unmangled (i.e. extern "C") names
    if '?' not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match(r'[_@]([^@]+)', symbol)
            if match:
                return match.group(1)
        return symbol
    # Function template instantiations start with ?$; keep the instantiations of
    # clang::Type::getAs, as some of them are explicit specializations that are
    # defined in clang's lib/AST/Type.cpp; discard the rest as it's assumed that
    # the definition is public
    elif re.match(r'\?\?\$getAs@.+@Type@clang@@', symbol):
        return symbol
    elif symbol.startswith('??$'):
        return None
    # Deleting destructors start with ?_G or ?_E and can be discarded because
    # link.exe gives you a warning telling you they can't be exported if you
    # don't
    elif symbol.startswith('??_G') or symbol.startswith('??_E'):
        return None
    # Constructors (?0) and destructors (?1) of templates (?$) are assumed to be
    # defined in headers and not required to be kept
    elif symbol.startswith('??0?$') or symbol.startswith('??1?$'):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    elif re.search(r'\?A(0x\w+)?@', symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
    # bit of a mess and imprecise, but that avoids having to completely demangle
    # the symbol name. The outermost namespace is at the end of the identifier
    # mangling, and the identifier mangling is followed by the type mangling, so
    # we look for (llvm|clang)@@ followed by something that looks like a
    # function type mangling. To spot a function type we use (this is derived
    # from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    elif re.search(r'(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$', symbol):
        return symbol
    return None
179
180# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
181# demangle the identifier mangling to identify symbols that can be safely
182# discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether a symbol with Itanium mangling should be exported.

    Args:
        symbol: the symbol name as read from the symbol table.
        calling_convention_decoration: True if symbols carry a leading
            underscore as calling convention decoration that has to be
            removed.

    Returns:
        The name to export (with the leading decoration underscore removed
        when calling_convention_decoration is set), or None if the symbol
        should be discarded.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith('_'):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith('_') and not symbol.startswith('.'):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match('_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Constructors and destructors of template classes are assumed to be
    # defined in headers and not required to be kept. The enclosing class is
    # the second-to-last name, so only look at it when there is one; a
    # single-component name would otherwise raise IndexError.
    if len(names) >= 2 and re.match('[CD][123]', names[-1][0]) and names[-2][1]:
        return None
    # Keep the instantiations of clang::Type::getAs, as some of them are
    # explicit specializations that are defined in clang's lib/AST/Type.cpp;
    # discard any other function template instantiations as it's assumed that
    # the definition is public
    elif symbol.startswith('_ZNK5clang4Type5getAs'):
        return symbol
    elif names[-1][1]:
        return None
    # Keep llvm:: and clang:: names
    elif names[0][0] == '4llvm' or names[0][0] == '5clang':
        return symbol
    # Discard everything else
    else:
        return None
221
222# Certain kinds of complex manglings we assume cannot be part of a public
223# interface, and we handle them by raising an exception.
class TooComplexName(Exception):
    """Signals a mangling assumed too complex to be in a public interface."""
226
227# Parse an itanium mangled name from the start of a string and return a
228# (name, rest of string) pair.
def parse_itanium_name(arg):
    """Parse one itanium mangled name from the start of a string.

    Args:
        arg: the remaining mangled text to parse.

    Returns:
        A (name, rest of string) pair. For a normal name the returned name
        includes its length prefix (e.g. '4llvm'). If nothing could be parsed
        the pair is (None, arg).
    """
    # Check for a normal name: a decimal length followed by that many
    # characters. (Raw string so that \d isn't an invalid string escape.)
    match = re.match(r'(\d+)(.+)', arg)
    if match:
        n = int(match.group(1))
        name = match.group(1)+match.group(2)[:n]
        rest = match.group(2)[n:]
        return name, rest
    # Check for constructor/destructor names
    match = re.match('([CD][123])(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match('([^E]+)(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg
248
249# Parse an itanium mangled template argument list from the start of a string
250# and throw it away, returning the rest of the string.
def skip_itanium_template(arg):
    """Skip an itanium mangled template argument list at the start of a string.

    Args:
        arg: mangled text that starts with the 'I' of a template argument list.

    Returns:
        The text following the 'E' that ends the argument list, or None if the
        text ran out before the end of the list was found.

    Raises:
        TooComplexName: if the argument list contains an expression (an
            argument starting with L or X).
    """
    # A template argument list starts with I
    assert arg.startswith('I'), arg
    tmp = arg[1:]
    while tmp:
        # Check for names: a decimal length followed by that many characters.
        # (Raw string so that \d isn't an invalid string escape.)
        match = re.match(r'(\d+)(.+)', tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = re.match('S[A-Z0-9]*_(.+)', tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith('N'):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith('L') or tmp.startswith('X'):
            raise TooComplexName
        # End of the template
        elif tmp.startswith('E'):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    # Ran out of text without seeing the E that closes the argument list
    return None
282
283# Parse an itanium mangled nested name and transform it into a list of pairs of
284# (name, is_template), returning (list, rest of string).
def parse_itanium_nested_name(arg):
    """Parse an itanium mangled nested name from the start of a string.

    Args:
        arg: mangled text that starts with the 'N' of a nested name.

    Returns:
        A (list, rest of string) pair, where the list holds one
        (name, is_template) pair per component of the nested name, or
        (None, None) if the name could not be parsed.
    """
    # Every nested name begins with N
    assert arg.startswith('N'), arg

    # Drop the N, together with a leading substitution if there is one
    substitution = re.match('NS[A-Z0-9]*_(.+)', arg)
    remaining = substitution.group(1) if substitution else arg[1:]

    # Drop any CV-qualifiers and ref qualifiers
    qualifiers = re.match('[rVKRO]*(.+)', remaining)
    if qualifiers:
        remaining = qualifiers.group(1)

    # Consume one component at a time until the E that closes the nested name
    components = []
    while remaining:
        if remaining.startswith('E'):
            return components, remaining[1:]
        component, remaining = parse_itanium_name(remaining)
        if not component:
            # We don't know how to demangle whatever comes next
            return None, None
        # A component followed by I is a template; note that and step over
        # its arguments
        has_template_args = remaining.startswith('I')
        if has_template_args:
            remaining = skip_itanium_template(remaining)
        components.append((component, has_template_args))

    # The text ran out before the closing E, so something went wrong
    return None, None
324
def extract_symbols(arg):
    """Count the symbols of one library that should be kept.

    Args:
        arg: a (get_symbols, should_keep_symbol,
            calling_convention_decoration, lib) tuple. Everything is packed
            into one argument so that this function can be used with a
            mapping function that passes a single argument.

    Returns:
        A dict mapping each kept name, as returned by should_keep_symbol, to
        the number of times it was produced for lib.
    """
    get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    counts = {}
    for raw_symbol in get_symbols(lib):
        kept = should_keep_symbol(raw_symbol, calling_convention_decoration)
        if not kept:
            # Discarded symbol
            continue
        counts[kept] = counts.get(kept, 0) + 1
    return counts
333
if __name__ == '__main__':
    # Command line entry point: work out which tools are available, read the
    # symbols of every input library in parallel, then print the ones that
    # should be exported.
    tool_exes = ['dumpbin','nm','objdump','llvm-readobj']
    parser = argparse.ArgumentParser(
        description='Extract symbols to export from libraries')
    parser.add_argument('--mangling', choices=['itanium','microsoft'],
                        required=True, help='expected symbol mangling scheme')
    parser.add_argument('--tools', choices=tool_exes, nargs='*',
                        help='tools to use to extract symbols and determine the'
                        ' target')
    parser.add_argument('libs', metavar='lib', type=str, nargs='+',
                        help='libraries to extract symbols from')
    parser.add_argument('-o', metavar='file', type=str, help='output to file')
    args = parser.parse_args()

    # Determine the function to use to get the list of symbols from the inputs,
    # and the function to use to determine if the target is 32-bit windows.
    tools = { 'dumpbin' : (dumpbin_get_symbols, dumpbin_is_32bit_windows),
              'nm' : (nm_get_symbols, None),
              'objdump' : (None, objdump_is_32bit_windows),
              'llvm-readobj' : (readobj_get_symbols, readobj_is_32bit_windows) }
    get_symbols = None
    is_32bit_windows = None
    # If we have a tools argument then use that for the list of tools to check
    if args.tools:
        tool_exes = args.tools
    # Find a tool to use by trying each in turn until we find one that exists
    # (subprocess.Popen will throw OSError when the program does not exist)
    for exe in tool_exes:
        try:
            # Close std streams as we don't want any output and we don't
            # want the process to wait for something on stdin.
            p = subprocess.Popen([exe], stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 stdin=subprocess.PIPE,
                                 universal_newlines=True)
            p.stdout.close()
            p.stderr.close()
            p.stdin.close()
            p.wait()
            # Keep going until we have a tool to use for both get_symbols and
            # is_32bit_windows
            if not get_symbols:
                get_symbols = tools[exe][0]
            if not is_32bit_windows:
                is_32bit_windows = tools[exe][1]
            if get_symbols and is_32bit_windows:
                break
        except OSError:
            continue
    if not get_symbols:
        print("Couldn't find a program to read symbols with", file=sys.stderr)
        sys.exit(1)
    if not is_32bit_windows:
        print("Couldn't find a program to determining the target", file=sys.stderr)
        sys.exit(1)

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == 'microsoft':
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    suffixes = ('.lib','.a','.obj','.o')
    libs = list()
    for lib in args.libs:
        # When invoked by cmake the arguments are the cmake target names of the
        # libraries, so we need to add .lib/.a to the end and maybe lib to the
        # start to get the filename. Also allow objects.
        if not lib.endswith(suffixes):
            for s in suffixes:
                if os.path.exists(lib+s):
                    lib = lib+s
                    break
                if os.path.exists('lib'+lib+s):
                    lib = 'lib'+lib+s
                    break
        if not lib.endswith(suffixes):
            print("Don't know what to do with argument "+lib, file=sys.stderr)
            sys.exit(1)
        libs.append(lib)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = is_32bit_windows(libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [(get_symbols, should_keep_symbol, calling_convention_decoration, x) for x in libs]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbols = dict()
    for this_lib_symbols in libs_symbols:
        for k,v in this_lib_symbols.items():
            symbols[k] = v + symbols.setdefault(k,0)

    # Count instances of member functions of template classes, and map the
    # symbol name to the function+class. We do this under the assumption that if
    # a member function of a template class is instantiated many times it's
    # probably declared in a public header file.
    template_function_count = dict()
    template_function_mapping = dict()
    # Symbols that aren't template member functions all map to "", whose count
    # stays at zero so they are never filtered out by the count check below.
    template_function_count[""] = 0
    for k in symbols:
        name = None
        if args.mangling == 'microsoft':
            # Member functions of templates start with
            # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
            # As manglings go from the innermost scope to the outermost scope
            # this means:
            #  * When we have a function member of a subclass of a template
            #    class then <fn_name> will actually contain the mangling of
            #    both the subclass and the function member. This is fine.
            #  * When we have a function member of a template subclass of a
            #    (possibly template) class then it's the innermost template
            #    subclass that becomes <class_name>. This should be OK so long
            #    as we don't have multiple classes with a template subclass of
            #    the same name.
            # (Raw string so that the regex escapes aren't invalid string
            # escapes.)
            match = re.search(r"^\?(\??\w+\@\?\$\w+)\@", k)
            if match:
                name = match.group(1)
        else:
            # Find member functions of templates by demangling the name and
            # checking if the second-to-last name in the list is a template.
            match = re.match('_Z(T[VTIS])?(N.+)', k)
            if match:
                try:
                    names, _ = parse_itanium_nested_name(match.group(2))
                    # A name with a single component has no enclosing class,
                    # so there is no second-to-last name to look at.
                    if names and len(names) >= 2 and names[-2][1]:
                        name = ''.join([x for x,_ in names])
                except TooComplexName:
                    # Manglings that are too complex should already have been
                    # filtered out, but if we happen to somehow see one here
                    # just leave it as-is.
                    pass
        if name:
            old_count = template_function_count.setdefault(name,0)
            template_function_count[name] = old_count + 1
            template_function_mapping[k] = name
        else:
            template_function_mapping[k] = ""

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Aren't instances of member functions of templates which have been
    #    instantiated 100 times or more, which are assumed to have public
    #    definitions. (100 is an arbitrary guess here.)
    if args.o:
        outfile = open(args.o,'w')
    else:
        outfile = sys.stdout
    try:
        for k,v in symbols.items():
            template_count = template_function_count[template_function_mapping[k]]
            if v == 1 and template_count < 100:
                print(k, file=outfile)
    finally:
        # Only close what we opened ourselves; leave sys.stdout alone
        if args.o:
            outfile.close()
510