#!/usr/bin/env python
# Copyright 2019 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Utilities for optimistically parsing dex files.

This file is not meant to provide a generic tool for analyzing dex files.
A DexFile class that exposes access to several memory items in the dex format
is provided, but it does not include error handling or validation.
"""

from __future__ import print_function

import argparse
import collections
import errno
import os
import re
import struct
import sys
import zipfile

try:
  _unichr = unichr  # Python 2.
except NameError:
  _unichr = chr  # Python 3: chr() already returns a unicode character.

# https://source.android.com/devices/tech/dalvik/dex-format#header-item
# Each entry is (field name, struct format code), in on-disk order.
_DEX_HEADER_FMT = (
    ('magic', '8s'),
    ('checksum', 'I'),
    ('signature', '20s'),
    ('file_size', 'I'),
    ('header_size', 'I'),
    ('endian_tag', 'I'),
    ('link_size', 'I'),
    ('link_off', 'I'),
    ('map_off', 'I'),
    ('string_ids_size', 'I'),
    ('string_ids_off', 'I'),
    ('type_ids_size', 'I'),
    ('type_ids_off', 'I'),
    ('proto_ids_size', 'I'),
    ('proto_ids_off', 'I'),
    ('field_ids_size', 'I'),
    ('field_ids_off', 'I'),
    ('method_ids_size', 'I'),
    ('method_ids_off', 'I'),
    ('class_defs_size', 'I'),
    ('class_defs_off', 'I'),
    ('data_size', 'I'),
    ('data_off', 'I'),
)

DexHeader = collections.namedtuple('DexHeader',
                                   ','.join(t[0] for t in _DEX_HEADER_FMT))

# Index value the dex format uses to mean "no index", e.g. the superclass_idx
# of a class that has no superclass.
# https://source.android.com/devices/tech/dalvik/dex-format#no-index
_NO_INDEX = 0xffffffff

# Simple memory items.
_TypeIdItem = collections.namedtuple('TypeIdItem', 'descriptor_idx')
_ProtoIdItem = collections.namedtuple(
    'ProtoIdItem', 'shorty_idx,return_type_idx,parameters_off')
_MethodIdItem = collections.namedtuple('MethodIdItem',
                                       'type_idx,proto_idx,name_idx')
_TypeItem = collections.namedtuple('TypeItem', 'type_idx')
_StringDataItem = collections.namedtuple('StringItem', 'utf16_size,data')
_ClassDefItem = collections.namedtuple(
    'ClassDefItem',
    'class_idx,access_flags,superclass_idx,interfaces_off,source_file_idx,'
    'annotations_off,class_data_off,static_values_off')


class _MemoryItemList(object):
  """Base class for repeated memory items."""

  def __init__(self,
               reader,
               offset,
               size,
               factory,
               alignment=None,
               first_item_offset=None):
    """Creates the item list using the specific item factory.

    Args:
      reader: _DexReader used for decoding the memory item.
      offset: Offset from start of the file to the item list, serving as the
        key for some item types.
      size: Number of memory items in the list.
      factory: Function to extract each memory item from a _DexReader.
      alignment: Optional integer specifying the alignment for the memory
        section represented by this list.
      first_item_offset: Optional, specifies a different offset to use for
        extracting memory items (default is to use offset).
    """
    self.offset = offset
    self.size = size
    # Compare against None explicitly so that an explicit offset of 0 is
    # honored instead of being silently replaced by |offset|.
    if first_item_offset is None:
      first_item_offset = offset
    reader.Seek(first_item_offset)
    self._items = [factory(reader) for _ in range(size)]

    if alignment:
      # Leave the reader positioned at the next aligned offset so that a
      # following item can be read immediately.
      reader.AlignUpTo(alignment)

  def __iter__(self):
    return iter(self._items)

  def __getitem__(self, key):
    return self._items[key]

  def __len__(self):
    return len(self._items)

  def __repr__(self):
    item_type_part = ''
    if self._items:
      item_type = type(self._items[0])
      item_type_part = ', item type={}'.format(item_type.__name__)

    return '{}(offset={:#x}, size={}{})'.format(
        type(self).__name__, self.offset, self.size, item_type_part)


class _TypeIdItemList(_MemoryItemList):
  """List of _TypeIdItem, each decoded from a single uint."""

  def __init__(self, reader, offset, size):
    factory = lambda x: _TypeIdItem(x.ReadUInt())
    super(_TypeIdItemList, self).__init__(reader, offset, size, factory)


class _ProtoIdItemList(_MemoryItemList):
  """List of _ProtoIdItem, each decoded from three uints."""

  def __init__(self, reader, offset, size):
    factory = lambda x: _ProtoIdItem(x.ReadUInt(), x.ReadUInt(), x.ReadUInt())
    super(_ProtoIdItemList, self).__init__(reader, offset, size, factory)


class _MethodIdItemList(_MemoryItemList):
  """List of _MethodIdItem, each decoded from two ushorts and a uint."""

  def __init__(self, reader, offset, size):
    factory = (
        lambda x: _MethodIdItem(x.ReadUShort(), x.ReadUShort(), x.ReadUInt()))
    super(_MethodIdItemList, self).__init__(reader, offset, size, factory)


class _StringItemList(_MemoryItemList):
  """List of _StringDataItem resolved through the table of string offsets."""

  def __init__(self, reader, offset, size):
    # The section at |offset| holds one uint per string: the file offset of
    # that string's data. Read all of them first, because decoding a string
    # moves the reader elsewhere in the file.
    reader.Seek(offset)
    string_item_offsets = iter([reader.ReadUInt() for _ in range(size)])

    def factory(x):
      data_offset = next(string_item_offsets)
      string = x.ReadString(data_offset)
      return _StringDataItem(len(string), string)

    super(_StringItemList, self).__init__(reader, offset, size, factory)


class _TypeListItem(_MemoryItemList):
  """A single type list: a uint count followed by that many ushort indices."""

  def __init__(self, reader):
    offset = reader.Tell()
    size = reader.ReadUInt()
    factory = lambda x: _TypeItem(x.ReadUShort())
    # This is necessary because we need to extract the size of the type list
    # (in other cases the list size is provided in the header).
    first_item_offset = reader.Tell()
    super(_TypeListItem, self).__init__(
        reader,
        offset,
        size,
        factory,
        alignment=4,
        first_item_offset=first_item_offset)


class _TypeListItemList(_MemoryItemList):
  """List of consecutive _TypeListItems."""

  def __init__(self, reader, offset, size):
    super(_TypeListItemList, self).__init__(reader, offset, size, _TypeListItem)


class _ClassDefItemList(_MemoryItemList):
  """List of _ClassDefItem, each decoded from one uint per field."""

  def __init__(self, reader, offset, size):

    def factory(x):
      return _ClassDefItem(*(x.ReadUInt()
                             for _ in range(len(_ClassDefItem._fields))))

    super(_ClassDefItemList, self).__init__(reader, offset, size, factory)


class _DexMapItem(object):
  """One map list entry: the type code, item count and offset of a section."""

  def __init__(self, reader):
    self.type = reader.ReadUShort()
    reader.ReadUShort()  # Skip the ushort between type and size; it is unused.
    self.size = reader.ReadUInt()
    self.offset = reader.ReadUInt()

  def __repr__(self):
    return '_DexMapItem(type={}, size={}, offset={:#x})'.format(
        self.type, self.size, self.offset)


class _DexMapList(object):
  """Map list of a dex file, keyed by item type code."""
  # Full list of type codes:
  # https://source.android.com/devices/tech/dalvik/dex-format#type-codes
  TYPE_TYPE_LIST = 0x1001

  def __init__(self, reader, offset):
    self._map = {}
    reader.Seek(offset)
    self._size = reader.ReadUInt()
    for _ in range(self._size):
      item = _DexMapItem(reader)
      self._map[item.type] = item

  def __getitem__(self, key):
    return self._map[key]

  def __contains__(self, key):
    return key in self._map

  def __repr__(self):
    return '_DexMapList(size={}, items={})'.format(self._size, self._map)


class _DexReader(object):
  """Cursor over the raw dex data that decodes little-endian primitives."""

  # Formats are compiled once instead of being re-parsed on every read.
  _UBYTE = struct.Struct('<B')
  _USHORT = struct.Struct('<H')
  _UINT = struct.Struct('<I')
  _HEADER = struct.Struct('<' + ''.join(t[1] for t in _DEX_HEADER_FMT))

  def __init__(self, data):
    """Creates a reader positioned at offset 0.

    Args:
      data: bytearray containing the contents of a dex file.
    """
    self._data = data
    self._pos = 0

  def Seek(self, offset):
    self._pos = offset

  def Tell(self):
    return self._pos

  def ReadUByte(self):
    return self._ReadData(self._UBYTE)

  def ReadUShort(self):
    return self._ReadData(self._USHORT)

  def ReadUInt(self):
    return self._ReadData(self._UINT)

  def ReadString(self, data_offset):
    """Returns the string whose string data item starts at |data_offset|.

    The item is a ULEB128 length followed by the MUTF-8 encoded characters.
    The reader is left positioned just past the string terminator.
    """
    string_length, string_offset = self._ReadULeb128(data_offset)
    string_data_offset = string_offset + data_offset
    return self._DecodeMUtf8(string_length, string_data_offset)

  def AlignUpTo(self, align_unit):
    """Advances the position to the next multiple of |align_unit|, if needed."""
    off_by = self._pos % align_unit
    if off_by:
      self.Seek(self._pos + align_unit - off_by)

  def ReadHeader(self):
    """Returns the DexHeader at offset 0 without moving the read position."""
    return DexHeader._make(self._HEADER.unpack_from(self._data))

  def _ReadData(self, packed):
    """Decodes one value with the struct.Struct |packed| and advances past it."""
    ret = packed.unpack_from(self._data, self._pos)[0]
    self._pos += packed.size
    return ret

  def _ReadULeb128(self, data_offset):
    """Returns a tuple of (uleb128 value, number of bytes occupied).

    From DWARF3 spec: http://dwarfstd.org/doc/Dwarf3.pdf

    Does not use or modify the current read position.

    Args:
      data_offset: Location of the unsigned LEB128.
    """
    value = 0
    shift = 0
    cur_offset = data_offset
    while True:
      byte = self._data[cur_offset]
      cur_offset += 1
      # The low 7 bits are payload; the high bit flags a continuation byte.
      value |= (byte & 0b01111111) << shift
      if (byte & 0b10000000) == 0:
        break
      shift += 7

    return value, cur_offset - data_offset

  def _DecodeMUtf8(self, string_length, offset):
    """Returns the string located at the specified offset.

    See https://source.android.com/devices/tech/dalvik/dex-format#mutf-8

    Ported from the Android Java implementation:
    https://android.googlesource.com/platform/dalvik/+/fe107fb6e3f308ac5174ebdc5a794ee880c741d9/dx/src/com/android/dex/Mutf8.java#34

    Every 1-3 byte sequence is decoded to exactly one character, so the
    result has |string_length| characters.

    Args:
      string_length: The length of the decoded string.
      offset: Offset to the beginning of the string.

    Raises:
      _MUTf8DecodeError: If the data is not a well-formed, NUL-terminated
        MUTF-8 string of |string_length| characters.
    """
    self.Seek(offset)
    # Collect characters and join once, rather than growing a string.
    chars = []

    for _ in range(string_length):
      a = self.ReadUByte()
      if a == 0:
        raise _MUTf8DecodeError('Early string termination encountered',
                                string_length, offset)
      if (a & 0x80) == 0x00:
        code = a
      elif (a & 0xe0) == 0xc0:
        b = self.ReadUByte()
        if (b & 0xc0) != 0x80:
          raise _MUTf8DecodeError('Error in byte 2', string_length, offset)
        code = ((a & 0x1f) << 6) | (b & 0x3f)
      elif (a & 0xf0) == 0xe0:
        b = self.ReadUByte()
        c = self.ReadUByte()
        if (b & 0xc0) != 0x80 or (c & 0xc0) != 0x80:
          raise _MUTf8DecodeError('Error in byte 2 or 3', string_length, offset)
        code = ((a & 0x0f) << 12) | ((b & 0x3f) << 6) | (c & 0x3f)
      else:
        raise _MUTf8DecodeError('Bad byte', string_length, offset)

      chars.append(_unichr(code))

    if self.ReadUByte() != 0x00:
      raise _MUTf8DecodeError('Expected string termination', string_length,
                              offset)

    return ''.join(chars)


class _MUTf8DecodeError(Exception):
  """Raised when string data is not valid MUTF-8."""

  def __init__(self, message, length, offset):
    message += ' (decoded string length: {}, string data offset: {:#x})'.format(
        length, offset)
    super(_MUTf8DecodeError, self).__init__(message)


class DexFile(object):
  """Represents a single dex file.

  Parses and exposes access to dex file structure and contents, as described
  at https://source.android.com/devices/tech/dalvik/dex-format

  Fields:
    reader: _DexReader object used to decode dex file contents.
    header: DexHeader for this dex file.
    map_list: _DexMapList object containing list of dex file contents.
    type_item_list: _TypeIdItemList containing type_id_items.
    proto_item_list: _ProtoIdItemList containing proto_id_items.
    method_item_list: _MethodIdItemList containing method_id_items.
    string_item_list: _StringItemList containing string_data_items that are
      referenced by index in other sections.
    type_list_item_list: _TypeListItemList containing _TypeListItems.
      _TypeListItems are referenced by their offsets from other dex items.
    class_def_item_list: _ClassDefItemList containing _ClassDefItems.
  """
  _CLASS_ACCESS_FLAGS = {
      0x1: 'public',
      0x2: 'private',
      0x4: 'protected',
      0x8: 'static',
      0x10: 'final',
      0x200: 'interface',
      0x400: 'abstract',
      0x1000: 'synthetic',
      0x2000: 'annotation',
      0x4000: 'enum',
  }

  def __init__(self, data):
    """Decodes dex file memory sections.

    Args:
      data: bytearray containing the contents of a dex file.
    """
    self.reader = _DexReader(data)
    self.header = self.reader.ReadHeader()
    self.map_list = _DexMapList(self.reader, self.header.map_off)
    self.type_item_list = _TypeIdItemList(self.reader, self.header.type_ids_off,
                                          self.header.type_ids_size)
    self.proto_item_list = _ProtoIdItemList(
        self.reader, self.header.proto_ids_off, self.header.proto_ids_size)
    self.method_item_list = _MethodIdItemList(
        self.reader, self.header.method_ids_off, self.header.method_ids_size)
    self.string_item_list = _StringItemList(
        self.reader, self.header.string_ids_off, self.header.string_ids_size)
    self.class_def_item_list = _ClassDefItemList(
        self.reader, self.header.class_defs_off, self.header.class_defs_size)

    # Type lists are not described by the header; their location and count
    # only appear in the map list (and may be absent altogether).
    type_list_key = _DexMapList.TYPE_TYPE_LIST
    if type_list_key in self.map_list:
      map_list_item = self.map_list[type_list_key]
      self.type_list_item_list = _TypeListItemList(
          self.reader, map_list_item.offset, map_list_item.size)
    else:
      self.type_list_item_list = _TypeListItemList(self.reader, 0, 0)
    self._type_lists_by_offset = {
        type_list.offset: type_list
        for type_list in self.type_list_item_list
    }

  def GetString(self, string_item_idx):
    """Returns the decoded string at |string_item_idx| in the string list."""
    string_item = self.string_item_list[string_item_idx]
    return string_item.data

  def GetTypeString(self, type_item_idx):
    """Returns the descriptor string of the type at |type_item_idx|."""
    type_item = self.type_item_list[type_item_idx]
    return self.GetString(type_item.descriptor_idx)

  def GetTypeListStringsByOffset(self, offset):
    """Returns a tuple of type strings for the type list at |offset|.

    An |offset| of 0 (or None) yields an empty tuple.

    Raises:
      KeyError: If no type list starts at a non-zero |offset|.
    """
    if not offset:
      return ()
    type_list = self._type_lists_by_offset[offset]
    return tuple(self.GetTypeString(item.type_idx) for item in type_list)

  @staticmethod
  def ResolveClassAccessFlags(access_flags):
    """Returns a tuple with the name of every known flag set in |access_flags|.

    Names are ordered by ascending flag value so that the result does not
    depend on dict iteration order.
    """
    return tuple(
        flag_string
        for flag, flag_string in sorted(DexFile._CLASS_ACCESS_FLAGS.items())
        if flag & access_flags)

  def __repr__(self):
    items = [
        self.header,
        self.map_list,
        self.type_item_list,
        self.proto_item_list,
        self.method_item_list,
        self.string_item_list,
        self.type_list_item_list,
        self.class_def_item_list,
    ]
    return '\n'.join(str(item) for item in items)


def _MethodSignaturePartsFromDexFile(dexfile):
  """Yields the string components of dex methods in a dex file.

  Args:
    dexfile: The input dex file.

  Yields:
    Tuples that look like:
      (class name, return type, method name, (parameter type, ...)).
  """
  for method_item in dexfile.method_item_list:
    class_name_string = dexfile.GetTypeString(method_item.type_idx)
    method_name_string = dexfile.GetString(method_item.name_idx)
    proto_item = dexfile.proto_item_list[method_item.proto_idx]
    return_type_string = dexfile.GetTypeString(proto_item.return_type_idx)
    parameter_types = dexfile.GetTypeListStringsByOffset(
        proto_item.parameters_off)
    yield (class_name_string, return_type_string, method_name_string,
           parameter_types)


def CountUniqueDexMethods(dexfiles):
  """Returns the number of unique methods given an iterable of dex files.

  For method counts, most tools count the total number of dex methods referred
  to by a dex file. In the multi-dex case, some method items are referred to by
  multiple dex files, which means some methods are double counted. This method
  returns a count of the number of unique methods referred to across all given
  dex files.

  Args:
    dexfiles: Iterable of DexFile objects to count unique methods for.
  """
  unique_methods = set()
  for dexfile in dexfiles:
    unique_methods.update(_MethodSignaturePartsFromDexFile(dexfile))
  return len(unique_methods)


def _ToPrintableString(text):
  """Returns |text| in a form that print() emits as UTF-8 text.

  The MUTF-8 decoder in this file produces one character per encoded 1-3 byte
  sequence, so supplementary characters show up as surrogate pairs.
  """
  if sys.version_info[0] < 3:
    # Python 2: hand print() encoded bytes so that non-ASCII text does not
    # depend on the encoding of stdout.
    return text.encode('utf-8')
  # Python 3: print() needs str. Round-trip through UTF-16 so that surrogate
  # pairs are merged into real code points; unpaired surrogates, which cannot
  # be encoded for output, become U+FFFD.
  return text.encode('utf-16-le', 'surrogatepass').decode(
      'utf-16-le', 'replace')


class _DumpCommand(object):
  """Base class for commands that print part of a DexFile to stdout."""

  def __init__(self, dexfile):
    self._dexfile = dexfile

  def Run(self):
    raise NotImplementedError()


class _DumpMethods(_DumpCommand):
  """Prints one line per method_id_item."""

  def Run(self):
    for parts in _MethodSignaturePartsFromDexFile(self._dexfile):
      class_type, return_type, method_name, parameter_types = parts
      print('{} {} (return type={}, parameters={})'.format(
          class_type, method_name, return_type, parameter_types))


class _DumpStrings(_DumpCommand):
  """Prints every string in the dex file, one per line."""

  def Run(self):
    for string_item in self._dexfile.string_item_list:
      # Some strings are likely to be non-ascii (vs. methods/classes).
      print(_ToPrintableString(string_item.data))


class _DumpClasses(_DumpCommand):
  """Prints one line per class_def_item."""

  def Run(self):
    for class_item in self._dexfile.class_def_item_list:
      class_string = self._dexfile.GetTypeString(class_item.class_idx)
      # A class without a superclass stores NO_INDEX, which is not a valid
      # index into the type list.
      if class_item.superclass_idx == _NO_INDEX:
        superclass_string = None
      else:
        superclass_string = self._dexfile.GetTypeString(
            class_item.superclass_idx)
      interfaces = self._dexfile.GetTypeListStringsByOffset(
          class_item.interfaces_off)
      access_flags = DexFile.ResolveClassAccessFlags(class_item.access_flags)
      print('{} (superclass={}, interfaces={}, access_flags={})'.format(
          class_string, superclass_string, interfaces, access_flags))


class _DumpSummary(_DumpCommand):
  """Prints the repr of the DexFile (header and section summaries)."""

  def Run(self):
    print(self._dexfile)


def _DumpDexItems(dexfile_data, name, item):
  """Parses |dexfile_data| and prints the requested |item| to stdout.

  Args:
    dexfile_data: Raw bytes of a dex file.
    name: Name of the dex file, only used in the printed banner.
    item: One of 'summary', 'methods', 'strings' or 'classes'.
  """
  dexfile = DexFile(bytearray(dexfile_data))
  print('dex_parser: Dumping {} for {}'.format(item, name))
  cmds = {
      'summary': _DumpSummary,
      'methods': _DumpMethods,
      'strings': _DumpStrings,
      'classes': _DumpClasses,
  }
  try:
    cmds[item](dexfile).Run()
  except IOError as e:
    # A broken pipe most likely means the output was piped to something like
    # "less" that exited early; ignore it. Any other I/O error is a real
    # failure and must not be hidden.
    if e.errno != errno.EPIPE:
      raise


def main():
  parser = argparse.ArgumentParser(description='Dump dex contents to stdout.')
  parser.add_argument(
      'input', help='Input (.dex, .jar, .zip, .aab, .apk) file path.')
  parser.add_argument(
      'item',
      choices=('methods', 'strings', 'classes', 'summary'),
      help='Item to dump',
      nargs='?',
      default='summary')
  args = parser.parse_args()

  extension = os.path.splitext(args.input)[1].lower()
  if extension in ('.apk', '.jar', '.zip', '.aab'):
    with zipfile.ZipFile(args.input) as z:
      dex_file_paths = [
          f for f in z.namelist() if re.match(r'.*classes[0-9]*\.dex$', f)
      ]
      if not dex_file_paths:
        print('Error: {} does not contain any classes.dex files'.format(
            args.input))
        sys.exit(1)

      for path in dex_file_paths:
        _DumpDexItems(z.read(path), path, args.item)

  else:
    # Dex files are binary: text mode would translate newlines on some
    # platforms and attempt to decode the data on Python 3.
    with open(args.input, 'rb') as f:
      _DumpDexItems(f.read(), args.input, args.item)


if __name__ == '__main__':
  main()