1#!/usr/bin/env python 2 3# Copyright (C) 2006 Aladdin Enterprises. All rights reserved. 4 5# Analyze the module dependency structure of binary files. 6# This script requires GNU-compatible 'nm' and 'objdump' programs: 7# it has only been tested on (32- and 64-bit) x86 systems. 8 9# $Id: ocheck.py 7940 2007-05-09 21:51:57Z giles $ 10 11USAGE = """\ 12Usage: python ocheck.py [<cwd>] <ld-script> (--exclude [from]:[to]* | 13 --query [from]:[to]* | --cycles | --verbose)* 14from and to are comma-separated lists of patterns, which are glob-style 15expressions representing modules (if they contain a '.') or symbols (if 16not). An empty from or to is equivalent to '*'.""" 17HELP = """ 18An example from Ghostscript (to check references from library files to 19PostScript interpreter files, which are not allowed): 20 21 python ocheck.py gs gs/obj/ldt.tr \\ 22 --exclude :ptr_const_string_procs,ptr_string_procs,ptr_struct_procs \\ 23 gconfig.o: iconfig.o: :icc.o :ijs_client.o :inflate.o gs.o:imain*.o \\ 24 --query [gs]*.o:[iz]*.o 25""" 26 27__author__ = 'L. Peter Deutsch' 28 29from os.path import basename, isdir, join 30from subprocess import PIPE, Popen 31import re, sys 32 33_type2section = {'D': 'data', 'd': 'data', 34 'R': 'rodata', 'r': 'rodata', 35 'T': 'text', 't': 'text'} 36 37class OFile: 38 39 def __init__(self, name): 40 self.name = name 41 self.basename = basename(name) 42 43 def __str__(self): 44 return self.name 45 46 def _readsymbols(self): 47 # Return a mapping from symbol to (type, loc|None). 48 nmlist, symbols = self._exec('nm'), {} 49 for line in nmlist.split('\n'): 50 parts = line.strip().split(' ') 51 if len(parts) == 3: 52 loc, type, sym = parts 53 loc = int(loc, 16) 54 elif len(parts) == 2 and len(parts[0]) == 1: 55 loc = None 56 type, sym = parts 57 else: 58 assert not line 59 continue 60 assert sym not in symbols # ****** WRONG FOR LOCALS? ****** 61 symbols[sym] = (type, loc) 62 self.symbols = symbols 63 return symbols 64 65 def _readrelocs(self): 66 # Return a mapping from section name to a sorted list of 67 # (offset, referenced symbol). Note that the referenced symbols 68 # may be defined in this file, i.e., not imported. 69 odlist, relocs = self._exec('objdump', '-r'), {} 70 # We don't know whether the output format of objdump is standard, 71 # but we'll find out.... 72 for line in odlist.split('\n'): 73 if line.startswith('RELOCATION RECORDS FOR [.'): 74 assert line.endswith(']:') 75 section = line[25:-2] 76 assert section not in relocs 77 rlist = relocs[section] = [] 78 elif line.startswith('OFFSET '): 79 pass 80 elif not line: 81 rlist = None 82 elif rlist is None: # preamble 83 pass 84 else: # must be a real entry 85 line = line.split(' ') 86 loc, sym = line[0], line[-1] 87 if sym[0] != '.': # i.e., a real reference 88 sym = sym.split('+')[0] # might have an offset 89 rlist.append((int(loc, 16), sym)) 90 for rlist in relocs.values(): 91 rlist.sort(key = lambda (loc, sym): loc) 92 self.relocs = relocs 93 return relocs 94 95 def _readreferences(self): 96 # Return a mapping from referencing symbol (procedure/data) to 97 # referenced imported symbols. Note that the referencing symbol 98 # may be local (not exported). 99 syms = self.symbols 100 relocs = dict([(section, list(rels)) for section, rels in 101 self.relocs.items()]) # copy rel lists 102 refs = {} 103 defs = [(sym, _type2section[type], loc) \ 104 for sym, (type, loc) in syms.items() \ 105 if loc is not None and type in _type2section] 106 imports = dict([(sym, 1) for sym, (type, loc) in syms.items() \ 107 if type == 'U']) 108 # Sorting in reverse order turns out to simplify things. 109 defs.sort(key = lambda (sym, sec, loc): -loc) 110 for rels in relocs.values(): 111 rels.reverse() 112 for sym, section, loc in defs: 113 rels, rlist = relocs.get(section, ()), [] 114 while rels and rels[0][0] >= loc: 115 ref = rels.pop(0)[1] 116 if ref in imports and ref not in rlist: # quadratic, don't care 117 rlist.append(ref) 118 if rlist: 119 refs[sym] = rlist 120 self.references = refs 121 return refs 122 123 def _exec(self, *cmd): 124 return Popen(cmd + (self.name,), stdout = PIPE).communicate()[0] 125 126 # Read information from file lazily, with no overhead afterwards. 127 symbols = property(_readsymbols) 128 relocs = property(_readrelocs) 129 references = property(_readreferences) 130 131class DGraph: 132 133 def __init__(self): 134 self.graph = {} 135 136 def add(self, ofrom, oto): 137 g = self.graph 138 if not ofrom in g: g[ofrom] = [{}, {}] 139 if not oto in g: g[oto] = [{}, {}] 140 g[ofrom][1][oto] = 1 141 g[oto][0][ofrom] = 1 142 143 def remove(self, ofrom = None, oto = None): 144 if ofrom is None: 145 for obj in self.fromkeys(): 146 self.remove(obj, oto) 147 elif oto is None: 148 for obj in self.tokeys(ofrom): 149 self.remove(ofrom, obj) 150 else: 151 g = self.graph 152 if ofrom in g and oto in g: 153 try: 154 del g[ofrom][1][oto] 155 del g[oto][0][ofrom] 156 except KeyError: 157 pass 158 159 def fromkeys(self, oto = None): 160 if oto: 161 return self.graph.get(oto, [{}, {}])[0].keys() 162 return [key for key, (f, t) in self.graph.items() if t] 163 164 def tokeys(self, ofrom = None): 165 if ofrom: 166 return self.graph.get(ofrom, [{}, {}])[1].keys() 167 return [key for key, (f, t) in self.graph.items() if f] 168 169 # Thanks to http://www.bitformation.com/art/python_toposort.html 170 # for this algorithm. 171 172 def toposort(self): 173 g = self.graph 174 roots = [obj for (obj, (f, t)) in g.items() if len(f) == 0] 175 sorted = [] 176 while roots: 177 root = roots.pop() 178 sorted.append(root) 179 for child in g[root][1]: 180 del g[child][0][root] 181 if len(g[child][0]) == 0: 182 roots.append(child) 183 del g[root] 184 return sorted, g.keys() 185 186 def pick(self): 187 # Pick a non-root to convert to a root. 188 g = self.graph 189 gi = g.items() 190 gi.sort(key = lambda (o, (f, t)): (len(f), -len(t))) 191 o, (f, t) = gi[0] 192 # Work backward through single-reference parents. 193 # Invariant: (f, t) = g[o] 194 no = o 195 seen = [o] 196 while 1: 197 nf, nt = g[no] 198 if len(nf) > 1: break 199 o, f, t = no, nf, nt 200 no = f.keys()[0] 201 if no in seen: break 202 seen.append(no) 203 for fo in f: 204 del g[fo][1][o] 205 g[o][0] = {} 206 seen.reverse() 207 return o, f.keys(), t.keys(), seen 208 209# ---------------------------- main program ---------------------------- # 210 211def usage(): 212 print USAGE 213 214def cvpat(str): 215 if not str: return ('', '') 216 sre, mre = '', '' 217 for p in str.split(','): 218 r = p.replace('.', r'\.').replace('?', '.').replace('*', '.*') 219 if '.' in p: mre += '|' + r 220 else: sre += '|' + r 221 return (sre and sre[1:], mre and mre[1:]) 222 223def forfts(args, defs, refs, proc, verbose): 224 # Call proc(m, s) for each (m, s) matching an item in ftlist 225 defmods, refmods = defs.fromkeys(), refs.fromkeys() 226 while args and not args[0].startswith('-'): 227 ft = args.pop(0).split(':') 228 if len(ft) != 2: break 229 if verbose: 230 print '-------- %s:%s --------' % tuple(ft) 231 f, t = ft 232 fs, fm = cvpat(f) 233 ts, tm = cvpat(t) 234 if fm: 235 fm = re.compile(fm) 236 fmods = [m for m in refmods if fm.match(m)] 237 else: 238 fmods = refmods 239 fmods = dict([(m, 1) for m in fmods]) 240 if tm or not ts: # neither tm nor ts = all 241 tm = re.compile(tm) 242 tmods = dict([(m, 1) for m in defmods if tm.match(m)]) 243 else: 244 tmods = None 245 # ****** fs IS BOGUS, USES ENTIRE MODULE ****** 246 if fs: 247 fs = re.compile(fs) 248 for s in defs.tokeys(): 249 if fs.match(s): 250 for m in defs.fromkeys(s): 251 fmods[m] = 1 252 if ts: 253 ts = re.compile(ts) 254 for m in fmods: 255 if ts: 256 for s in refs.tokeys(m): 257 if ts.match(s): 258 proc(m, s) 259 if tm: 260 for s in refs.tokeys(m): 261 for dm in defs.fromkeys(s): 262 if tm.match(dm): 263 proc(m, s) 264 265def tsort(defs, refs): 266 ts = DGraph() 267 for m1 in refs.fromkeys(): 268 for s in refs.tokeys(m1): 269 for m2 in defs.fromkeys(s): 270 ts.add(m1, m2) 271 sorted, loops = ts.toposort() 272 passes = 1 273 while loops: 274 print 'Sorted:', len(sorted), sorted 275 print len(ts.graph), 'files remaining' 276 o, f, t, seen = ts.pick() 277 print 'Picking', o, ': %d references' % len(f), f 278 print ' %d children,' % len(t),\ 279 [(o, len(ts.tokeys(o))) for o in t] 280 sorted, loops = ts.toposort() 281 passes += 1 282 print 'Topo sort completed, %d pass(es).' % passes 283 284def main(argv): 285 args = argv[1:] 286 if len(args) < 1: 287 print 'Use --help for usage information.' 288 return 289 if args[0] == '--help': 290 print USAGE 291 print HELP 292 return 293 if isdir(args[0]): 294 cwd = args.pop(0) 295 else: 296 cwd = '' 297 if len(args) < 1: return usage() 298 verbose = False 299 # Read the ld script and each file's symbol table. 300 ldscript = args.pop(0) 301 f = file(ldscript) 302 ld = f.read() 303 f.close() 304 tokens = filter(None, ld.replace('\\\n', '').split()) 305 ofiles, ofdict = [], {} 306 del tokens[0] # gcc or ld 307 while tokens: 308 t = tokens.pop(0) 309 if t == '-o': 310 tokens.pop(0) 311 continue 312 elif t.startswith('-l') or t.startswith('-L'): 313 continue 314 elif t.startswith('-'): 315 print 'Unrecognized switch in ld script:', t 316 return 1 317 ofile = OFile(join(cwd, t)) 318 ofiles.append(ofile) 319 ofdict[ofile.basename] = ofile 320 print len(ofiles), 'object files' 321 # Collect the symbols. 322 defs, refs = DGraph(), DGraph() 323 for ofile in ofiles: 324 fname = basename(ofile.name) 325 for sym, (type, loc) in ofile.symbols.items(): 326 if '.' in sym: # generated symbol 327 continue 328 if type == 'U': 329 refs.add(fname, sym) 330 elif type.islower(): # local symbol 331 continue 332 elif sym in defs.graph: 333 dfiles = defs.graph[sym][0].keys() 334 print '****', sym, 'defined in',\ 335 [(dfile, ofdict[dfile].symbols[sym]) for dfile in dfiles],\ 336 'and', ofile, (type, loc) 337 else: 338 defs.add(fname, sym) 339 print len(refs.graph), 'in refs,', len(defs.graph), 'in defs' 340 # Process the rest of the command line. 341 while args: 342 arg = args.pop(0) 343 if arg == '--exclude': 344 # Remove excluded relationships. 345 def remove1(m, s): 346 if verbose: 347 print 'removing', (m, s) 348 refs.remove(m, s) 349 forfts(args, defs, refs, remove1, verbose) 350 elif arg == '--query': 351 # Process queries. 352 print 64 * '-' 353 def query1(m, s): 354 dms = defs.fromkeys(s) 355 print '>>', m, 'references', s, '(defined in',\ 356 ', '.join(dms) + ')' 357 ofile, srcs = ofdict[m], [] 358 for src, syms in ofile.references.items(): 359 if s in syms: 360 srcs.append(src) 361 if srcs: 362 print ' from', ', '.join(srcs) 363 forfts(args, defs, refs, query1, verbose) 364 elif arg == '--cycles': 365 # Attempt to sort the modules topologically, using the criterion 366 # that M1 precedes M2 iff M1 uses a symbol defined in M2. 367 print 64 * '-' 368 tsort(defs, refs) 369 elif arg == '--verbose': 370 verbose = True 371 else: 372 return usage() 373 374if __name__ == '__main__': 375 sys.exit(main(sys.argv) or 0) 376