1#!/usr/bin/env python
2
3# Copyright (C) 2006 Aladdin Enterprises.  All rights reserved.
4
5# Analyze the module dependency structure of binary files.
6# This script requires GNU-compatible 'nm' and 'objdump' programs:
7# it has only been tested on (32- and 64-bit) x86 systems.
8
9# $Id: ocheck.py 7940 2007-05-09 21:51:57Z giles $
10
11USAGE = """\
12Usage: python ocheck.py [<cwd>] <ld-script> (--exclude [from]:[to]* |
13         --query [from]:[to]* | --cycles | --verbose)*
14from and to are comma-separated lists of patterns, which are glob-style
15expressions representing modules (if they contain a '.') or symbols (if
16not).  An empty from or to is equivalent to '*'."""
17HELP = """
18An example from Ghostscript (to check references from library files to
19PostScript interpreter files, which are not allowed):
20
21    python ocheck.py gs gs/obj/ldt.tr \\
22      --exclude :ptr_const_string_procs,ptr_string_procs,ptr_struct_procs \\
23      gconfig.o: iconfig.o: :icc.o :ijs_client.o :inflate.o gs.o:imain*.o \\
24      --query [gs]*.o:[iz]*.o
25"""
26
27__author__ = 'L. Peter Deutsch'
28
29from os.path import basename, isdir, join
30from subprocess import PIPE, Popen
31import re, sys
32
33_type2section = {'D': 'data', 'd': 'data',
34                 'R': 'rodata', 'r': 'rodata',
35                 'T': 'text', 't': 'text'}
36
37class OFile:
38
39    def __init__(self, name):
40        self.name = name
41        self.basename = basename(name)
42
43    def __str__(self):
44        return self.name
45
46    def _readsymbols(self):
47        # Return a mapping from symbol to (type, loc|None).
48        nmlist, symbols = self._exec('nm'), {}
49        for line in nmlist.split('\n'):
50            parts = line.strip().split(' ')
51            if len(parts) == 3:
52                loc, type, sym = parts
53                loc = int(loc, 16)
54            elif len(parts) == 2 and len(parts[0]) == 1:
55                loc = None
56                type, sym = parts
57            else:
58                assert not line
59                continue
60            assert sym not in symbols   # ****** WRONG FOR LOCALS? ******
61            symbols[sym] = (type, loc)
62        self.symbols = symbols
63        return symbols
64
65    def _readrelocs(self):
66        # Return a mapping from section name to a sorted list of
67        # (offset, referenced symbol).  Note that the referenced symbols
68        # may be defined in this file, i.e., not imported.
69        odlist, relocs = self._exec('objdump', '-r'), {}
70        # We don't know whether the output format of objdump is standard,
71        # but we'll find out....
72        for line in odlist.split('\n'):
73            if line.startswith('RELOCATION RECORDS FOR [.'):
74                assert line.endswith(']:')
75                section = line[25:-2]
76                assert section not in relocs
77                rlist = relocs[section] = []
78            elif line.startswith('OFFSET '):
79                pass
80            elif not line:
81                rlist = None
82            elif rlist is None:         # preamble
83                pass
84            else:                       # must be a real entry
85                line = line.split(' ')
86                loc, sym = line[0], line[-1]
87                if sym[0] != '.':           # i.e., a real reference
88                    sym = sym.split('+')[0] # might have an offset
89                    rlist.append((int(loc, 16), sym))
90        for rlist in relocs.values():
91            rlist.sort(key = lambda (loc, sym): loc)
92        self.relocs = relocs
93        return relocs
94
95    def _readreferences(self):
96        # Return a mapping from referencing symbol (procedure/data) to
97        # referenced imported symbols.  Note that the referencing symbol
98        # may be local (not exported).
99        syms = self.symbols
100        relocs = dict([(section, list(rels)) for section, rels in
101                       self.relocs.items()]) # copy rel lists
102        refs = {}
103        defs = [(sym, _type2section[type], loc) \
104                for sym, (type, loc) in syms.items() \
105                if loc is not None and type in _type2section]
106        imports = dict([(sym, 1) for sym, (type, loc) in syms.items() \
107                        if type == 'U'])
108        # Sorting in reverse order turns out to simplify things.
109        defs.sort(key = lambda (sym, sec, loc): -loc)
110        for rels in relocs.values():
111            rels.reverse()
112        for sym, section, loc in defs:
113            rels, rlist = relocs.get(section, ()), []
114            while rels and rels[0][0] >= loc:
115                ref = rels.pop(0)[1]
116                if ref in imports and ref not in rlist: # quadratic, don't care
117                    rlist.append(ref)
118            if rlist:
119                refs[sym] = rlist
120        self.references = refs
121        return refs
122
123    def _exec(self, *cmd):
124        return Popen(cmd + (self.name,), stdout = PIPE).communicate()[0]
125
126    # Read information from file lazily, with no overhead afterwards.
127    symbols = property(_readsymbols)
128    relocs = property(_readrelocs)
129    references = property(_readreferences)
130
131class DGraph:
132
133    def __init__(self):
134        self.graph = {}
135
136    def add(self, ofrom, oto):
137        g = self.graph
138        if not ofrom in g: g[ofrom] = [{}, {}]
139        if not oto in g: g[oto] = [{}, {}]
140        g[ofrom][1][oto] = 1
141        g[oto][0][ofrom] = 1
142
143    def remove(self, ofrom = None, oto = None):
144        if ofrom is None:
145            for obj in self.fromkeys():
146                self.remove(obj, oto)
147        elif oto is None:
148            for obj in self.tokeys(ofrom):
149                self.remove(ofrom, obj)
150        else:
151            g = self.graph
152            if ofrom in g and oto in g:
153                try:
154                    del g[ofrom][1][oto]
155                    del g[oto][0][ofrom]
156                except KeyError:
157                    pass
158
159    def fromkeys(self, oto = None):
160        if oto:
161            return self.graph.get(oto, [{}, {}])[0].keys()
162        return [key for key, (f, t) in self.graph.items() if t]
163
164    def tokeys(self, ofrom = None):
165        if ofrom:
166            return self.graph.get(ofrom, [{}, {}])[1].keys()
167        return [key for key, (f, t) in self.graph.items() if f]
168
169    # Thanks to http://www.bitformation.com/art/python_toposort.html
170    # for this algorithm.
171
172    def toposort(self):
173        g = self.graph
174        roots = [obj for (obj, (f, t)) in g.items() if len(f) == 0]
175        sorted = []
176        while roots:
177            root = roots.pop()
178            sorted.append(root)
179            for child in g[root][1]:
180                del g[child][0][root]
181                if len(g[child][0]) == 0:
182                    roots.append(child)
183            del g[root]
184        return sorted, g.keys()
185
186    def pick(self):
187        # Pick a non-root to convert to a root.
188        g = self.graph
189        gi = g.items()
190        gi.sort(key = lambda (o, (f, t)): (len(f), -len(t)))
191        o, (f, t) = gi[0]
192        # Work backward through single-reference parents.
193        # Invariant: (f, t) = g[o]
194        no = o
195        seen = [o]
196        while 1:
197            nf, nt = g[no]
198            if len(nf) > 1: break
199            o, f, t = no, nf, nt
200            no = f.keys()[0]
201            if no in seen: break
202            seen.append(no)
203        for fo in f:
204            del g[fo][1][o]
205        g[o][0] = {}
206        seen.reverse()
207        return o, f.keys(), t.keys(), seen
208
209# ---------------------------- main program ---------------------------- #
210
211def usage():
212    print USAGE
213
214def cvpat(str):
215    if not str: return ('', '')
216    sre, mre = '', ''
217    for p in str.split(','):
218        r = p.replace('.', r'\.').replace('?', '.').replace('*', '.*')
219        if '.' in p: mre += '|' + r
220        else:        sre += '|' + r
221    return (sre and sre[1:], mre and mre[1:])
222
223def forfts(args, defs, refs, proc, verbose):
224    # Call proc(m, s) for each (m, s) matching an item in ftlist
225    defmods, refmods = defs.fromkeys(), refs.fromkeys()
226    while args and not args[0].startswith('-'):
227        ft = args.pop(0).split(':')
228        if len(ft) != 2: break
229        if verbose:
230            print '-------- %s:%s --------' % tuple(ft)
231        f, t = ft
232        fs, fm = cvpat(f)
233        ts, tm = cvpat(t)
234        if fm:
235            fm = re.compile(fm)
236            fmods = [m for m in refmods if fm.match(m)]
237        else:
238            fmods = refmods
239        fmods = dict([(m, 1) for m in fmods])
240        if tm or not ts:                # neither tm nor ts = all
241            tm = re.compile(tm)
242            tmods = dict([(m, 1) for m in defmods if tm.match(m)])
243        else:
244            tmods = None
245        # ****** fs IS BOGUS, USES ENTIRE MODULE ******
246        if fs:
247            fs = re.compile(fs)
248            for s in defs.tokeys():
249                if fs.match(s):
250                    for m in defs.fromkeys(s):
251                        fmods[m] = 1
252        if ts:
253            ts = re.compile(ts)
254        for m in fmods:
255            if ts:
256                for s in refs.tokeys(m):
257                    if ts.match(s):
258                        proc(m, s)
259            if tm:
260                for s in refs.tokeys(m):
261                    for dm in defs.fromkeys(s):
262                        if tm.match(dm):
263                            proc(m, s)
264
265def tsort(defs, refs):
266    ts = DGraph()
267    for m1 in refs.fromkeys():
268        for s in refs.tokeys(m1):
269            for m2 in defs.fromkeys(s):
270                ts.add(m1, m2)
271    sorted, loops = ts.toposort()
272    passes = 1
273    while loops:
274        print 'Sorted:', len(sorted), sorted
275        print len(ts.graph), 'files remaining'
276        o, f, t, seen = ts.pick()
277        print 'Picking', o, ': %d references' % len(f), f
278        print '    %d children,' % len(t),\
279              [(o, len(ts.tokeys(o))) for o in t]
280        sorted, loops = ts.toposort()
281        passes += 1
282    print 'Topo sort completed, %d pass(es).' % passes
283
284def main(argv):
285    args = argv[1:]
286    if len(args) < 1:
287        print 'Use --help for usage information.'
288        return
289    if args[0] == '--help':
290        print USAGE
291        print HELP
292        return
293    if isdir(args[0]):
294        cwd = args.pop(0)
295    else:
296        cwd = ''
297    if len(args) < 1: return usage()
298    verbose = False
299    # Read the ld script and each file's symbol table.
300    ldscript = args.pop(0)
301    f = file(ldscript)
302    ld = f.read()
303    f.close()
304    tokens = filter(None, ld.replace('\\\n', '').split())
305    ofiles, ofdict = [], {}
306    del tokens[0]                       # gcc or ld
307    while tokens:
308        t = tokens.pop(0)
309        if t == '-o':
310            tokens.pop(0)
311            continue
312        elif t.startswith('-l') or t.startswith('-L'):
313            continue
314        elif t.startswith('-'):
315            print 'Unrecognized switch in ld script:', t
316            return 1
317        ofile = OFile(join(cwd, t))
318        ofiles.append(ofile)
319        ofdict[ofile.basename] = ofile
320    print len(ofiles), 'object files'
321    # Collect the symbols.
322    defs, refs = DGraph(), DGraph()
323    for ofile in ofiles:
324        fname = basename(ofile.name)
325        for sym, (type, loc) in ofile.symbols.items():
326            if '.' in sym:              # generated symbol
327                continue
328            if type == 'U':
329                refs.add(fname, sym)
330            elif type.islower():        # local symbol
331                continue
332            elif sym in defs.graph:
333                dfiles = defs.graph[sym][0].keys()
334                print '****', sym, 'defined in',\
335                      [(dfile, ofdict[dfile].symbols[sym]) for dfile in dfiles],\
336                      'and', ofile, (type, loc)
337            else:
338                defs.add(fname, sym)
339    print len(refs.graph), 'in refs,', len(defs.graph), 'in defs'
340    # Process the rest of the command line.
341    while args:
342        arg = args.pop(0)
343        if arg == '--exclude':
344            # Remove excluded relationships.
345            def remove1(m, s):
346                if verbose:
347                    print 'removing', (m, s)
348                refs.remove(m, s)
349            forfts(args, defs, refs, remove1, verbose)
350        elif arg == '--query':
351            # Process queries.
352            print 64 * '-'
353            def query1(m, s):
354                dms = defs.fromkeys(s)
355                print '>>', m, 'references', s, '(defined in',\
356                      ', '.join(dms) + ')'
357                ofile, srcs = ofdict[m], []
358                for src, syms in ofile.references.items():
359                    if s in syms:
360                        srcs.append(src)
361                if srcs:
362                    print '     from', ', '.join(srcs)
363            forfts(args, defs, refs, query1, verbose)
364        elif arg == '--cycles':
365            # Attempt to sort the modules topologically, using the criterion
366            # that M1 precedes M2 iff M1 uses a symbol defined in M2.
367            print 64 * '-'
368            tsort(defs, refs)
369        elif arg == '--verbose':
370            verbose = True
371        else:
372            return usage()
373
374if __name__ == '__main__':
375    sys.exit(main(sys.argv) or 0)
376