1#!/usr/bin/env python
2# Copyright 2018 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Extracts the unwind tables in from breakpad symbol files
7
8Runs dump_syms on the given binary file and extracts the CFI data into the
9given output file.
10The output file is a binary file containing CFI rows ordered based on function
11address. The output file only contains rows that match the most popular rule
12type in CFI table, to reduce the output size and specify data in compact format.
13See doc https://github.com/google/breakpad/blob/master/docs/symbol_files.md.
141. The CFA rules should be of postfix form "SP <val> +".
152. The RA rules should be of postfix form "CFA <val> + ^".
16Note: breakpad represents dereferencing address with '^' operator.
17
18The output file has 2 tables UNW_INDEX and UNW_DATA, inspired from ARM EHABI
19format. The first table contains function addresses and an index into the
20UNW_DATA table. The second table contains one or more rows for the function
21unwind information.
22
23The output file starts with 4 bytes counting the number of entries in UNW_INDEX.
24Then UNW_INDEX table and UNW_DATA table.
25
26UNW_INDEX contains two columns of N rows each, where N is the number of
27functions.
28  1. First column 4 byte rows of all the function start address as offset from
29     start of the binary, in sorted order.
30  2. For each function addr, the second column contains 2 byte indices in order.
31     The indices are offsets (in count of 2 bytes) of the CFI data from start of
32     UNW_DATA.
33The last entry in the table always contains CANT_UNWIND index to specify the
34end address of the last function.
35
36UNW_DATA contains data of all the functions. Each function data contains N rows.
37The data found at the address pointed from UNW_INDEX will be:
38  2 bytes: N - number of rows that belong to current function.
39  N * 4 bytes: N rows of data. 16 bits : Address offset from function start.
40                               14 bits : CFA offset / 4.
41                                2 bits : RA offset / 4.
42
43The function is not added to the unwind table in following conditions:
44C1. If length of the function code (number of instructions) is greater than
45    0xFFFF (2 byte address span). This is because we use 16 bits to refer to
46    offset of instruction from start of the address.
47C2. If the function moves the SP by more than 0xFFFF bytes. This is because we
48    use 14 bits to denote CFA offset (last 2 bits are 0).
49C3. If the Return Address is stored at an offset >= 16 from the CFA. Some
50    functions which have variable arguments can have offset upto 16.
51    TODO(ssid): We can actually store offset 16 by subtracting 1 from RA/4 since
52    we never have 0.
53C4: Some functions do not have unwind information defined in dwarf info. These
54    functions have index value CANT_UNWIND(0xFFFF) in UNW_INDEX table.
55
56
57Usage:
58  extract_unwind_tables.py --input_path [root path to unstripped chrome.so]
59      --output_path [output path] --dump_syms_path [path to dump_syms binary]
60"""
61
62import argparse
63import re
64import struct
65import subprocess
66import sys
67import tempfile
68
69
70_CFA_REG = '.cfa'
71_RA_REG = '.ra'
72
73_ADDR_ENTRY = 0
74_LENGTH_ENTRY = 1
75
76_CANT_UNWIND = 0xFFFF
77
78
79def _Write4Bytes(output_file, val):
80  """Writes a 32 bit unsigned integer to the given output file."""
81  output_file.write(struct.pack('<L', val));
82
83
84def _Write2Bytes(output_file, val):
85  """Writes a 16 bit unsigned integer to the given output file."""
86  output_file.write(struct.pack('<H', val));
87
88
89def _FindRuleForRegister(cfi_row, reg):
90  """Returns the postfix expression as string for a given register.
91
92  Breakpad CFI row format specifies rules for unwinding each register in postfix
93  expression form separated by space. Each rule starts with register name and a
94  colon. Eg: "CFI R1: <rule> R2: <rule>".
95  """
96  out = []
97  found_register = False
98  for part in cfi_row:
99    if found_register:
100      if part[-1] == ':':
101        break
102      out.append(part)
103    elif part == reg + ':':
104      found_register = True
105  return ' '.join(out)
106
107
108def _GetCfaAndRaOffset(cfi_row):
109  """Returns a tuple with 2 numbers (cfa_offset, ra_offset).
110
111  Returns right values if rule matches the predefined criteria. Returns (0, 0)
112  otherwise. The criteria for CFA rule is postfix form "SP <val> +" and RA rule
113  is postfix form "CFA -<val> + ^".
114  """
115  cfa_offset = 0
116  ra_offset = 0
117  cfa_rule = _FindRuleForRegister(cfi_row, _CFA_REG)
118  ra_rule = _FindRuleForRegister(cfi_row, _RA_REG)
119  if cfa_rule and re.match(r'sp [0-9]+ \+', cfa_rule):
120    cfa_offset = int(cfa_rule.split()[1], 10)
121  if ra_rule:
122    if not re.match(r'.cfa -[0-9]+ \+ \^', ra_rule):
123      return (0, 0)
124    ra_offset = -1 * int(ra_rule.split()[1], 10)
125  return (cfa_offset, ra_offset)
126
127
128def _GetAllCfiRows(symbol_file):
129  """Returns parsed CFI data from given symbol_file.
130
131  Each entry in the cfi data dictionary returned is a map from function start
132  address to array of function rows, starting with FUNCTION type, followed by
133  one or more CFI rows.
134  """
135  cfi_data = {}
136  current_func = []
137  for line in symbol_file:
138    if 'STACK CFI' not in line:
139      continue
140
141    parts = line.split()
142    data = {}
143    if parts[2] == 'INIT':
144      # Add the previous function to the output
145      if len(current_func) > 1:
146        cfi_data[current_func[0][_ADDR_ENTRY]] = current_func
147      current_func = []
148
149      # The function line is of format "STACK CFI INIT <addr> <length> ..."
150      data[_ADDR_ENTRY] = int(parts[3], 16)
151      data[_LENGTH_ENTRY] = int(parts[4], 16)
152
153      # Condition C1: Skip if length is large.
154      if data[_LENGTH_ENTRY] == 0 or data[_LENGTH_ENTRY] > 0xffff:
155        continue  # Skip the current function.
156    else:
157      # The current function is skipped.
158      if len(current_func) == 0:
159        continue
160
161      # The CFI row is of format "STACK CFI <addr> .cfa: <expr> .ra: <expr> ..."
162      data[_ADDR_ENTRY] = int(parts[2], 16)
163      (data[_CFA_REG], data[_RA_REG]) = _GetCfaAndRaOffset(parts)
164
165      # Condition C2 and C3: Skip based on limits on offsets.
166      if data[_CFA_REG] == 0 or data[_RA_REG] >= 16 or data[_CFA_REG] > 0xffff:
167        current_func = []
168        continue
169      assert data[_CFA_REG] % 4 == 0
170      # Since we skipped functions with code size larger than 0xffff, we should
171      # have no function offset larger than the same value.
172      assert data[_ADDR_ENTRY] - current_func[0][_ADDR_ENTRY] < 0xffff
173
174    if data[_ADDR_ENTRY] == 0:
175      # Skip current function, delete all previous entries.
176      current_func = []
177      continue
178    assert data[_ADDR_ENTRY] % 2 == 0
179    current_func.append(data)
180
181  # Condition C4: Skip function without CFI rows.
182  if len(current_func) > 1:
183    cfi_data[current_func[0][_ADDR_ENTRY]] = current_func
184  return cfi_data
185
186
187def _WriteCfiData(cfi_data, out_file):
188  """Writes the CFI data in defined format to out_file."""
189  # Stores the final data that will be written to UNW_DATA table, in order
190  # with 2 byte items.
191  unw_data = []
192
193  # Represent all the CFI data of functions as set of numbers and map them to an
194  # index in the |unw_data|. This index is later written to the UNW_INDEX table
195  # for each function. This map is used to find index of the data for functions.
196  data_to_index = {}
197  # Store mapping between the functions to the index.
198  func_addr_to_index = {}
199  previous_func_end = 0
200  for addr, function in sorted(cfi_data.iteritems()):
201    # Add an empty function entry when functions CFIs are missing between 2
202    # functions.
203    if previous_func_end != 0 and addr - previous_func_end  > 4:
204      func_addr_to_index[previous_func_end + 2] = _CANT_UNWIND
205    previous_func_end = addr + cfi_data[addr][0][_LENGTH_ENTRY]
206
207    assert len(function) > 1
208    func_data_arr = []
209    func_data = 0
210    # The first row contains the function address and length. The rest of the
211    # rows have CFI data. Create function data array as given in the format.
212    for row in function[1:]:
213      addr_offset = row[_ADDR_ENTRY] - addr
214      cfa_offset = (row[_CFA_REG]) | (row[_RA_REG] / 4)
215
216      func_data_arr.append(addr_offset)
217      func_data_arr.append(cfa_offset)
218
219    # Consider all the rows in the data as one large integer and add it as a key
220    # to the |data_to_index|.
221    for data in func_data_arr:
222      func_data = (func_data << 16) | data
223
224    row_count = len(func_data_arr) / 2
225    if func_data not in data_to_index:
226      # When data is not found, create a new index = len(unw_data), and write
227      # the data to |unw_data|.
228      index = len(unw_data)
229      data_to_index[func_data] = index
230      unw_data.append(row_count)
231      for row in func_data_arr:
232        unw_data.append(row)
233    else:
234      # If the data was found, then use the same index for the function.
235      index = data_to_index[func_data]
236      assert row_count == unw_data[index]
237    func_addr_to_index[addr] = data_to_index[func_data]
238
239  # Mark the end end of last function entry.
240  func_addr_to_index[previous_func_end + 2] = _CANT_UNWIND
241
242  # Write the size of UNW_INDEX file in bytes.
243  _Write4Bytes(out_file, len(func_addr_to_index))
244
245  # Write the UNW_INDEX table. First list of addresses and then indices.
246  sorted_unw_index = sorted(func_addr_to_index.iteritems())
247  for addr, index in sorted_unw_index:
248    _Write4Bytes(out_file, addr)
249  for addr, index in sorted_unw_index:
250    _Write2Bytes(out_file, index)
251
252  # Write the UNW_DATA table.
253  for data in unw_data:
254    _Write2Bytes(out_file, data)
255
256
257def _ParseCfiData(sym_stream, output_path):
258  cfi_data = _GetAllCfiRows(sym_stream)
259  with open(output_path, 'wb') as out_file:
260    _WriteCfiData(cfi_data, out_file)
261
262
263def main():
264  parser = argparse.ArgumentParser()
265  parser.add_argument(
266      '--input_path', required=True,
267      help='The input path of the unstripped binary')
268  parser.add_argument(
269      '--output_path', required=True,
270      help='The path of the output file')
271  parser.add_argument(
272      '--dump_syms_path', required=True,
273      help='The path of the dump_syms binary')
274
275  args = parser.parse_args()
276  cmd = ['./' + args.dump_syms_path, args.input_path]
277  proc = subprocess.Popen(cmd, bufsize=-1, stdout=subprocess.PIPE)
278  _ParseCfiData(proc.stdout, args.output_path)
279  assert proc.wait() == 0
280
281  return 0
282
283if __name__ == '__main__':
284  sys.exit(main())
285