1#!/usr/bin/env python
2# Copyright 2019 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Utilities for optimistically parsing dex files.
7
8This file is not meant to provide a generic tool for analyzing dex files.
9A DexFile class that exposes access to several memory items in the dex format
10is provided, but it does not include error handling or validation.
11"""
12
13from __future__ import print_function
14
15import argparse
16import collections
17import errno
18import os
19import re
20import struct
21import sys
22import zipfile
23
# Layout of the dex header_item as (field name, struct format code) pairs, in
# file order. _DexReader.ReadHeader joins the format codes (little-endian) to
# unpack the header in a single call.
# https://source.android.com/devices/tech/dalvik/dex-format#header-item
_DEX_HEADER_FMT = (
    ('magic', '8s'),
    ('checksum', 'I'),
    ('signature', '20s'),
    ('file_size', 'I'),
    ('header_size', 'I'),
    ('endian_tag', 'I'),
    ('link_size', 'I'),
    ('link_off', 'I'),
    ('map_off', 'I'),
    ('string_ids_size', 'I'),
    ('string_ids_off', 'I'),
    ('type_ids_size', 'I'),
    ('type_ids_off', 'I'),
    ('proto_ids_size', 'I'),
    ('proto_ids_off', 'I'),
    ('field_ids_size', 'I'),
    ('field_ids_off', 'I'),
    ('method_ids_size', 'I'),
    ('method_ids_off', 'I'),
    ('class_defs_size', 'I'),
    ('class_defs_off', 'I'),
    ('data_size', 'I'),
    ('data_off', 'I'),
)

# One attribute per header field, in the same order as _DEX_HEADER_FMT.
DexHeader = collections.namedtuple(
    'DexHeader', tuple(field_name for field_name, _ in _DEX_HEADER_FMT))

# Simple memory items: plain records whose fields are filled in by the
# matching *ItemList factory.
_TypeIdItem = collections.namedtuple('TypeIdItem', ['descriptor_idx'])
_ProtoIdItem = collections.namedtuple(
    'ProtoIdItem', ['shorty_idx', 'return_type_idx', 'parameters_off'])
_MethodIdItem = collections.namedtuple(
    'MethodIdItem', ['type_idx', 'proto_idx', 'name_idx'])
_TypeItem = collections.namedtuple('TypeItem', ['type_idx'])
_StringDataItem = collections.namedtuple('StringItem', ['utf16_size', 'data'])
_ClassDefItem = collections.namedtuple('ClassDefItem', [
    'class_idx',
    'access_flags',
    'superclass_idx',
    'interfaces_off',
    'source_file_idx',
    'annotations_off',
    'class_data_off',
    'static_values_off',
])
66
67
class _MemoryItemList(object):
  """Base class for repeated memory items.

  All items are decoded eagerly in the constructor; afterwards the list
  supports iteration, indexing and len().

  Attributes:
    offset: Offset of the list from the start of the file, as passed in.
    size: Number of items the list was asked to decode.
  """

  def __init__(self,
               reader,
               offset,
               size,
               factory,
               alignment=None,
               first_item_offset=None):
    """Creates the item list using the specific item factory.

    Args:
      reader: _DexReader used for decoding the memory item.
      offset: Offset from start of the file to the item list, serving as the
        key for some item types.
      size: Number of memory items in the list.
      factory: Function to extract each memory item from a _DexReader. It is
        called |size| times with |reader| as its only argument.
      alignment: Optional integer specifying the alignment for the memory
        section represented by this list. When set, the reader is aligned up
        to this boundary after the last item has been read.
      first_item_offset: Optional, specifies a different offset to use for
        extracting memory items (default is to use offset).
    """
    self.offset = offset
    self.size = size
    # Compare against None (rather than relying on truthiness) so that an
    # explicitly requested offset of 0 is honored.
    start = offset if first_item_offset is None else first_item_offset
    reader.Seek(start)
    # range() instead of xrange() keeps this working on Python 2 and 3.
    self._items = [factory(reader) for _ in range(size)]

    if alignment:
      reader.AlignUpTo(alignment)

  def __iter__(self):
    return iter(self._items)

  def __getitem__(self, key):
    return self._items[key]

  def __len__(self):
    return len(self._items)

  def __repr__(self):
    # The item type is only reported when there is an item to inspect.
    item_type_part = ''
    if self._items:
      item_type = type(self._items[0])
      item_type_part = ', item type={}'.format(item_type.__name__)

    return '{}(offset={:#x}, size={}{})'.format(
        type(self).__name__, self.offset, self.size, item_type_part)
116
117
class _TypeIdItemList(_MemoryItemList):
  """List of _TypeIdItems, each decoded from a single unsigned int."""

  def __init__(self, reader, offset, size):

    def read_type_id(item_reader):
      descriptor_idx = item_reader.ReadUInt()
      return _TypeIdItem(descriptor_idx)

    super(_TypeIdItemList, self).__init__(reader, offset, size, read_type_id)
123
124
class _ProtoIdItemList(_MemoryItemList):
  """List of _ProtoIdItems, each decoded from three unsigned ints."""

  def __init__(self, reader, offset, size):

    def read_proto_id(item_reader):
      # The fields are read in the order they are stored.
      shorty_idx = item_reader.ReadUInt()
      return_type_idx = item_reader.ReadUInt()
      parameters_off = item_reader.ReadUInt()
      return _ProtoIdItem(shorty_idx, return_type_idx, parameters_off)

    super(_ProtoIdItemList, self).__init__(reader, offset, size, read_proto_id)
130
131
class _MethodIdItemList(_MemoryItemList):
  """List of _MethodIdItems: two unsigned shorts followed by an unsigned int."""

  def __init__(self, reader, offset, size):

    def read_method_id(item_reader):
      # The fields are read in the order they are stored.
      type_idx = item_reader.ReadUShort()
      proto_idx = item_reader.ReadUShort()
      name_idx = item_reader.ReadUInt()
      return _MethodIdItem(type_idx, proto_idx, name_idx)

    super(_MethodIdItemList, self).__init__(reader, offset, size,
                                            read_method_id)
138
139
class _StringItemList(_MemoryItemList):
  """List of _StringDataItems, one per entry of the string offset table.

  The table at |offset| holds |size| unsigned ints, each the file offset of
  one string's data. The strings themselves are decoded with
  _DexReader.ReadString.
  """

  def __init__(self, reader, offset, size):
    # Read the whole offset table up front: decoding a string seeks the
    # reader to the string data, so the table cannot be read lazily from the
    # reader's current position.
    reader.Seek(offset)
    # range() instead of xrange() keeps this working on Python 2 and 3.
    string_item_offsets = iter([reader.ReadUInt() for _ in range(size)])

    def factory(x):
      data_offset = next(string_item_offsets)
      string = x.ReadString(data_offset)
      return _StringDataItem(len(string), string)

    super(_StringItemList, self).__init__(reader, offset, size, factory)
152
153
class _TypeListItem(_MemoryItemList):
  """A single type list: an unsigned int count followed by _TypeItems.

  The list is parsed starting at the reader's current position, and the reader
  is aligned up to 4 bytes once the last item has been read.
  """

  def __init__(self, reader):
    list_start = reader.Tell()
    # Unlike the header-described lists, the item count is stored inline in
    # front of the items, so it has to be consumed before the items begin.
    item_count = reader.ReadUInt()
    items_start = reader.Tell()

    def read_type_item(item_reader):
      return _TypeItem(item_reader.ReadUShort())

    super(_TypeListItem, self).__init__(
        reader,
        list_start,
        item_count,
        read_type_item,
        alignment=4,
        first_item_offset=items_start)
170
171
class _TypeListItemList(_MemoryItemList):
  """List of consecutive type lists, each parsed as a _TypeListItem."""

  def __init__(self, reader, offset, size):
    # _TypeListItem's constructor takes only the reader, so the class itself
    # serves as the per-item factory.
    parent = super(_TypeListItemList, self)
    parent.__init__(reader, offset, size, factory=_TypeListItem)
176
177
class _ClassDefItemList(_MemoryItemList):
  """List of _ClassDefItems, each decoded from consecutive unsigned ints."""

  def __init__(self, reader, offset, size):
    # No explicit Seek is needed here: the base constructor seeks to |offset|
    # before invoking the factory.
    field_count = len(_ClassDefItem._fields)

    def factory(x):
      # Every field of the item is one unsigned int, stored in field order.
      # range() instead of xrange() keeps this working on Python 2 and 3.
      return _ClassDefItem(*[x.ReadUInt() for _ in range(field_count)])

    super(_ClassDefItemList, self).__init__(reader, offset, size, factory)
188
189
class _DexMapItem(object):
  """One entry of the dex map list, read at the reader's current position.

  Attributes:
    type: Unsigned short type code of the entry.
    size: Unsigned int size field of the entry.
    offset: Unsigned int offset field of the entry.
  """

  def __init__(self, reader):
    item_type = reader.ReadUShort()
    # An unsigned short sits between the type and the size; it is read only
    # to advance the reader and its value is discarded.
    reader.ReadUShort()
    item_size = reader.ReadUInt()
    item_offset = reader.ReadUInt()

    self.type = item_type
    self.size = item_size
    self.offset = item_offset

  def __repr__(self):
    template = '_DexMapItem(type={}, size={}, offset={:#x})'
    return template.format(self.type, self.size, self.offset)
201
202
class _DexMapList(object):
  """The dex map list, indexed by item type code.

  Supports `key in map_list` and `map_list[key]`, where key is a type code.
  If several entries share a type code, the last one read is kept.
  """
  # Full list of type codes:
  # https://source.android.com/devices/tech/dalvik/dex-format#type-codes
  TYPE_TYPE_LIST = 0x1001

  def __init__(self, reader, offset):
    """Reads the map list located at |offset|.

    Args:
      reader: _DexReader used for decoding the map list.
      offset: Offset from the start of the file to the map list. The list
        starts with an unsigned int entry count, followed by the entries.
    """
    self._map = {}
    reader.Seek(offset)
    self._size = reader.ReadUInt()
    # range() instead of xrange() keeps this working on Python 2 and 3.
    for _ in range(self._size):
      item = _DexMapItem(reader)
      self._map[item.type] = item

  def __getitem__(self, key):
    return self._map[key]

  def __contains__(self, key):
    return key in self._map

  def __repr__(self):
    return '_DexMapList(size={}, items={})'.format(self._size, self._map)
224
225
class _DexReader(object):
  """Cursor-based little-endian reader over the raw bytes of a dex file.

  ReadUByte/ReadUShort/ReadUInt decode a value at the current position and
  advance past it. Seek/Tell/AlignUpTo manipulate the position directly.
  No bounds checking or validation is performed.
  """

  # unichr() only exists on Python 2; chr() is its Python 3 counterpart.
  try:
    _ToChar = staticmethod(unichr)  # pylint: disable=undefined-variable
  except NameError:
    _ToChar = staticmethod(chr)

  def __init__(self, data):
    """Creates a reader positioned at the start of |data|.

    Args:
      data: bytearray containing the contents of a dex file. Indexing it must
        yield integers (see _ReadULeb128).
    """
    self._data = data
    self._pos = 0

  def Seek(self, offset):
    """Moves the current position to |offset| (from the start of the data)."""
    self._pos = offset

  def Tell(self):
    """Returns the current position."""
    return self._pos

  def ReadUByte(self):
    """Reads an unsigned 8-bit value and advances past it."""
    return self._ReadData('<B')

  def ReadUShort(self):
    """Reads a little-endian unsigned 16-bit value and advances past it."""
    return self._ReadData('<H')

  def ReadUInt(self):
    """Reads a little-endian unsigned 32-bit value and advances past it."""
    return self._ReadData('<I')

  def ReadString(self, data_offset):
    """Decodes the string stored at |data_offset|.

    The string is stored as a ULEB128 length (the length of the decoded
    string) immediately followed by the MUTF-8 encoded characters. This moves
    the current position past the string's terminating zero byte.

    Args:
      data_offset: Offset from the start of the data to the ULEB128 length.

    Returns:
      The decoded string.

    Raises:
      _MUTf8DecodeError: If the string data is not valid MUTF-8.
    """
    string_length, string_offset = self._ReadULeb128(data_offset)
    string_data_offset = string_offset + data_offset
    return self._DecodeMUtf8(string_length, string_data_offset)

  def AlignUpTo(self, align_unit):
    """Advances the position to the next multiple of |align_unit|, if needed."""
    off_by = self._pos % align_unit
    if off_by:
      self.Seek(self._pos + align_unit - off_by)

  def ReadHeader(self):
    """Returns the DexHeader unpacked from the start of the data.

    The current position is neither used nor modified.
    """
    header_fmt = '<' + ''.join(t[1] for t in _DEX_HEADER_FMT)
    return DexHeader._make(struct.unpack_from(header_fmt, self._data))

  def _ReadData(self, fmt):
    """Unpacks the single value described by |fmt| and advances past it."""
    ret = struct.unpack_from(fmt, self._data, self._pos)[0]
    self._pos += struct.calcsize(fmt)
    return ret

  def _ReadULeb128(self, data_offset):
    """Returns a tuple of (uleb128 value, number of bytes occupied).

    The current position is neither used nor modified.

    From DWARF3 spec: http://dwarfstd.org/doc/Dwarf3.pdf

    Args:
      data_offset: Location of the unsigned LEB128.
    """
    value = 0
    shift = 0
    cur_offset = data_offset
    while True:
      byte = self._data[cur_offset]
      cur_offset += 1
      # The low 7 bits carry payload, least significant group first.
      value |= (byte & 0b01111111) << shift
      # A clear high bit marks the final byte of the encoding.
      if (byte & 0b10000000) == 0:
        break
      shift += 7

    return value, cur_offset - data_offset

  def _DecodeMUtf8(self, string_length, offset):
    """Returns the string located at the specified offset.

    See https://source.android.com/devices/tech/dalvik/dex-format#mutf-8

    Ported from the Android Java implementation:
    https://android.googlesource.com/platform/dalvik/+/fe107fb6e3f308ac5174ebdc5a794ee880c741d9/dx/src/com/android/dex/Mutf8.java#34

    The current position is left just past the terminating zero byte.

    Args:
      string_length: The length of the decoded string.
      offset: Offset to the beginning of the string.

    Raises:
      _MUTf8DecodeError: If a zero byte appears before |string_length|
        characters were decoded, if a byte sequence is malformed, or if the
        string is not followed by a zero byte.
    """
    self.Seek(offset)
    # Collect the characters and join once at the end instead of growing a
    # string with += on every iteration.
    chars = []

    # range() instead of xrange() keeps this working on Python 2 and 3.
    for _ in range(string_length):
      a = self.ReadUByte()
      if a == 0:
        raise _MUTf8DecodeError('Early string termination encountered',
                                string_length, offset)
      if (a & 0x80) == 0x00:
        # Single byte: 0xxxxxxx.
        code = a
      elif (a & 0xe0) == 0xc0:
        # Two bytes: 110xxxxx 10xxxxxx.
        b = self.ReadUByte()
        if (b & 0xc0) != 0x80:
          raise _MUTf8DecodeError('Error in byte 2', string_length, offset)
        code = ((a & 0x1f) << 6) | (b & 0x3f)
      elif (a & 0xf0) == 0xe0:
        # Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
        b = self.ReadUByte()
        c = self.ReadUByte()
        if (b & 0xc0) != 0x80 or (c & 0xc0) != 0x80:
          raise _MUTf8DecodeError('Error in byte 3 or 4', string_length, offset)
        code = ((a & 0x0f) << 12) | ((b & 0x3f) << 6) | (c & 0x3f)
      else:
        raise _MUTf8DecodeError('Bad byte', string_length, offset)

      chars.append(self._ToChar(code))

    if self.ReadUByte() != 0x00:
      raise _MUTf8DecodeError('Expected string termination', string_length,
                              offset)

    return ''.join(chars)
330
331
class _MUTf8DecodeError(Exception):
  """Raised when string data cannot be decoded as MUTF-8."""

  def __init__(self, message, length, offset):
    """Builds the error text from |message| plus the string's location.

    Args:
      message: Description of what went wrong.
      length: The expected length of the decoded string.
      offset: Offset to the beginning of the string data.
    """
    details = ' (decoded string length: {}, string data offset: {:#x})'.format(
        length, offset)
    super(_MUTf8DecodeError, self).__init__(message + details)
338
339
class DexFile(object):
  """Represents a single dex file.

  Parses and exposes access to dex file structure and contents, as described
  at https://source.android.com/devices/tech/dalvik/dex-format

  Fields:
    reader: _DexReader object used to decode dex file contents.
    header: DexHeader for this dex file.
    map_list: _DexMapList object containing list of dex file contents.
    type_item_list: _TypeIdItemList containing type_id_items.
    proto_item_list: _ProtoIdItemList containing proto_id_items.
    method_item_list: _MethodIdItemList containing method_id_items.
    string_item_list: _StringItemList containing string_data_items that are
      referenced by index in other sections.
    type_list_item_list: _TypeListItemList containing _TypeListItems.
      _TypeListItems are referenced by their offsets from other dex items.
    class_def_item_list: _ClassDefItemList containing _ClassDefItems.
  """
  # Maps each class access flag bit to its name.
  _CLASS_ACCESS_FLAGS = {
      0x1: 'public',
      0x2: 'private',
      0x4: 'protected',
      0x8: 'static',
      0x10: 'final',
      0x200: 'interface',
      0x400: 'abstract',
      0x1000: 'synthetic',
      0x2000: 'annotation',
      0x4000: 'enum',
  }

  def __init__(self, data):
    """Decodes dex file memory sections.

    All sections are parsed eagerly.

    Args:
      data: bytearray containing the contents of a dex file.
    """
    self.reader = _DexReader(data)
    self.header = self.reader.ReadHeader()
    self.map_list = _DexMapList(self.reader, self.header.map_off)
    self.type_item_list = _TypeIdItemList(self.reader, self.header.type_ids_off,
                                          self.header.type_ids_size)
    self.proto_item_list = _ProtoIdItemList(
        self.reader, self.header.proto_ids_off, self.header.proto_ids_size)
    self.method_item_list = _MethodIdItemList(
        self.reader, self.header.method_ids_off, self.header.method_ids_size)
    self.string_item_list = _StringItemList(
        self.reader, self.header.string_ids_off, self.header.string_ids_size)
    self.class_def_item_list = _ClassDefItemList(
        self.reader, self.header.class_defs_off, self.header.class_defs_size)

    # The header does not describe the type lists, so their location and
    # count are taken from the map list (when it has such an entry).
    type_list_key = _DexMapList.TYPE_TYPE_LIST
    if type_list_key in self.map_list:
      map_list_item = self.map_list[type_list_key]
      self.type_list_item_list = _TypeListItemList(
          self.reader, map_list_item.offset, map_list_item.size)
    else:
      self.type_list_item_list = _TypeListItemList(self.reader, 0, 0)
    # Other items refer to a type list by its file offset.
    self._type_lists_by_offset = {
        type_list.offset: type_list
        for type_list in self.type_list_item_list
    }

  def GetString(self, string_item_idx):
    """Returns the decoded string at index |string_item_idx|."""
    string_item = self.string_item_list[string_item_idx]
    return string_item.data

  def GetTypeString(self, type_item_idx):
    """Returns the descriptor string of the type at index |type_item_idx|."""
    type_item = self.type_item_list[type_item_idx]
    return self.GetString(type_item.descriptor_idx)

  def GetTypeListStringsByOffset(self, offset):
    """Returns the type strings of the type list located at |offset|.

    Args:
      offset: File offset of a type list. A falsy offset (0) yields an empty
        tuple.

    Returns:
      Tuple of type descriptor strings, in list order.

    Raises:
      KeyError: If |offset| is non-zero and no type list starts there.
    """
    if not offset:
      return ()
    type_list = self._type_lists_by_offset[offset]
    return tuple(self.GetTypeString(item.type_idx) for item in type_list)

  @staticmethod
  def ResolveClassAccessFlags(access_flags):
    """Returns the names of the class access flags set in |access_flags|.

    Args:
      access_flags: Integer bit field of class access flags.

    Returns:
      Tuple of flag names, ordered by ascending flag value. Bits without an
      entry in _CLASS_ACCESS_FLAGS are ignored.
    """
    # items() (rather than the Python 2 only iteritems()) works on Python 2
    # and 3. Sorting makes the order independent of dict iteration order.
    return tuple(
        flag_string
        for flag, flag_string in sorted(DexFile._CLASS_ACCESS_FLAGS.items())
        if flag & access_flags)

  def __repr__(self):
    """Returns one line per parsed section, summarizing the file."""
    items = [
        self.header,
        self.map_list,
        self.type_item_list,
        self.proto_item_list,
        self.method_item_list,
        self.string_item_list,
        self.type_list_item_list,
        self.class_def_item_list,
    ]
    return '\n'.join(str(item) for item in items)
437
438
def _MethodSignaturePartsFromDexFile(dexfile):
  """Generates the resolved signature pieces of every method in a dex file.

  Each entry of the file's method list is resolved to strings through the
  file's type, string, proto and type-list lookups.

  Args:
    dexfile: The input dex file.

  Yields:
    Tuples that look like:
      (class name, return type, method name, (parameter type, ...)).
  """
  for method in dexfile.method_item_list:
    owner = dexfile.GetTypeString(method.type_idx)
    name = dexfile.GetString(method.name_idx)
    # The return type and parameter list live on the method's prototype.
    proto = dexfile.proto_item_list[method.proto_idx]
    returns = dexfile.GetTypeString(proto.return_type_idx)
    params = dexfile.GetTypeListStringsByOffset(proto.parameters_off)
    yield (owner, returns, name, params)
458
459
def CountUniqueDexMethods(dexfiles):
  """Returns the number of unique methods given an iterable of dex files.

  Most tools report the total number of methods referred to by each dex file.
  With multi-dex, the same method can be referred to from several dex files and
  would then be counted more than once. Here a method is identified by its
  (class name, return type, method name, parameter types) tuple, and each
  distinct tuple is counted once across all given dex files.

  Args:
    dexfiles: Iterable of DexFile objects to count unique methods for.
  """
  distinct_signatures = {
      signature
      for dexfile in dexfiles
      for signature in _MethodSignaturePartsFromDexFile(dexfile)
  }
  return len(distinct_signatures)
476
477
class _DumpCommand(object):
  """Base class for commands that print part of a parsed dex file.

  Subclasses override Run(); the dex file is available as self._dexfile.
  """

  def __init__(self, dexfile):
    """Stores the dex file the command operates on.

    Args:
      dexfile: The dex file to dump from.
    """
    self._dexfile = dexfile

  def Run(self):
    """Performs the dump. Must be overridden by subclasses."""
    raise NotImplementedError()
485
486
class _DumpMethods(_DumpCommand):
  """Prints one line per method referred to by the dex file."""

  def Run(self):
    line_format = '{} {} (return type={}, parameters={})'
    signatures = _MethodSignaturePartsFromDexFile(self._dexfile)
    for class_type, return_type, method_name, parameter_types in signatures:
      # The printed order differs from the tuple order: the method name comes
      # right after the class, followed by the return type.
      line = line_format.format(class_type, method_name, return_type,
                                parameter_types)
      print(line)
494
495
class _DumpStrings(_DumpCommand):
  """Prints every string of the dex file, one per line, encoded as UTF-8."""

  def Run(self):
    for item in self._dexfile.string_item_list:
      # Unlike method and class names, strings are likely to contain
      # non-ascii characters, hence the explicit encoding.
      encoded = item.data.encode('utf-8')
      print(encoded)
502
503
class _DumpClasses(_DumpCommand):
  """Prints one line per class defined in the dex file."""

  # Value the dex format stores in an index field to mean "no index"; a class
  # without a superclass has it as its superclass_idx.
  _NO_INDEX = 0xffffffff

  def Run(self):
    for class_item in self._dexfile.class_def_item_list:
      class_string = self._dexfile.GetTypeString(class_item.class_idx)
      # Looking up NO_INDEX in the type list would raise an IndexError, so a
      # missing superclass is reported as None instead.
      if class_item.superclass_idx == self._NO_INDEX:
        superclass_string = None
      else:
        superclass_string = self._dexfile.GetTypeString(
            class_item.superclass_idx)
      interfaces = self._dexfile.GetTypeListStringsByOffset(
          class_item.interfaces_off)
      access_flags = DexFile.ResolveClassAccessFlags(class_item.access_flags)
      print('{} (superclass={}, interfaces={}, access_flags={})'.format(
          class_string, superclass_string, interfaces, access_flags))
515
516
class _DumpSummary(_DumpCommand):
  """Prints the string form of the dex file object itself."""

  def Run(self):
    summary = self._dexfile
    print(summary)
521
522
def _DumpDexItems(dexfile_data, name, item):
  """Parses a dex file and prints the requested kind of item to stdout.

  Args:
    dexfile_data: Raw contents of the dex file.
    name: Name of the dex file, used only in the printed banner line.
    item: What to dump: 'summary', 'methods', 'strings' or 'classes'.

  Raises:
    KeyError: If |item| is not one of the supported values.
    IOError: If writing the output fails for any reason other than a broken
      pipe.
  """
  dexfile = DexFile(bytearray(dexfile_data))
  print('dex_parser: Dumping {} for {}'.format(item, name))
  cmds = {
      'summary': _DumpSummary,
      'methods': _DumpMethods,
      'strings': _DumpStrings,
      'classes': _DumpClasses,
  }
  try:
    cmds[item](dexfile).Run()
  except IOError as e:
    # A broken pipe most likely means the output was piped to something like
    # "less" that exited early, so it is ignored. Any other I/O failure is a
    # real error and must not be silently dropped.
    if e.errno != errno.EPIPE:
      raise
538
539
def main():
  """Command-line entry point: dumps the contents of dex files to stdout.

  The input is either a zip-based archive (.apk, .jar, .zip, .aab), in which
  case every classes*.dex entry inside it is dumped, or a single dex file.
  Exits with status 1 if an archive contains no dex files.
  """
  parser = argparse.ArgumentParser(description='Dump dex contents to stdout.')
  parser.add_argument(
      'input', help='Input (.dex, .jar, .zip, .aab, .apk) file path.')
  parser.add_argument(
      'item',
      choices=('methods', 'strings', 'classes', 'summary'),
      help='Item to dump',
      nargs='?',
      default='summary')
  args = parser.parse_args()

  if os.path.splitext(args.input)[1] in ('.apk', '.jar', '.zip', '.aab'):
    with zipfile.ZipFile(args.input) as z:
      dex_file_paths = [
          f for f in z.namelist() if re.match(r'.*classes[0-9]*\.dex$', f)
      ]
      if not dex_file_paths:
        print('Error: {} does not contain any classes.dex files'.format(
            args.input))
        sys.exit(1)

      for path in dex_file_paths:
        _DumpDexItems(z.read(path), path, args.item)

  else:
    # Dex files are binary. Text mode would decode the bytes (and fail) on
    # Python 3 and translate line endings on Windows, so read the raw bytes.
    with open(args.input, 'rb') as f:
      _DumpDexItems(f.read(), args.input, args.item)


if __name__ == '__main__':
  main()
572