1#!/usr/bin/env python 2# Copyright 2018 The Chromium Authors. All rights reserved. 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5 6"""Extracts the unwind tables in from breakpad symbol files 7 8Runs dump_syms on the given binary file and extracts the CFI data into the 9given output file. 10The output file is a binary file containing CFI rows ordered based on function 11address. The output file only contains rows that match the most popular rule 12type in CFI table, to reduce the output size and specify data in compact format. 13See doc https://github.com/google/breakpad/blob/master/docs/symbol_files.md. 141. The CFA rules should be of postfix form "SP <val> +". 152. The RA rules should be of postfix form "CFA <val> + ^". 16Note: breakpad represents dereferencing address with '^' operator. 17 18The output file has 2 tables UNW_INDEX and UNW_DATA, inspired from ARM EHABI 19format. The first table contains function addresses and an index into the 20UNW_DATA table. The second table contains one or more rows for the function 21unwind information. 22 23The output file starts with 4 bytes counting the number of entries in UNW_INDEX. 24Then UNW_INDEX table and UNW_DATA table. 25 26UNW_INDEX contains two columns of N rows each, where N is the number of 27functions. 28 1. First column 4 byte rows of all the function start address as offset from 29 start of the binary, in sorted order. 30 2. For each function addr, the second column contains 2 byte indices in order. 31 The indices are offsets (in count of 2 bytes) of the CFI data from start of 32 UNW_DATA. 33The last entry in the table always contains CANT_UNWIND index to specify the 34end address of the last function. 35 36UNW_DATA contains data of all the functions. Each function data contains N rows. 37The data found at the address pointed from UNW_INDEX will be: 38 2 bytes: N - number of rows that belong to current function. 39 N * 4 bytes: N rows of data. 16 bits : Address offset from function start. 40 14 bits : CFA offset / 4. 41 2 bits : RA offset / 4. 42 43The function is not added to the unwind table in following conditions: 44C1. If length of the function code (number of instructions) is greater than 45 0xFFFF (2 byte address span). This is because we use 16 bits to refer to 46 offset of instruction from start of the address. 47C2. If the function moves the SP by more than 0xFFFF bytes. This is because we 48 use 14 bits to denote CFA offset (last 2 bits are 0). 49C3. If the Return Address is stored at an offset >= 16 from the CFA. Some 50 functions which have variable arguments can have offset upto 16. 51 TODO(ssid): We can actually store offset 16 by subtracting 1 from RA/4 since 52 we never have 0. 53C4: Some functions do not have unwind information defined in dwarf info. These 54 functions have index value CANT_UNWIND(0xFFFF) in UNW_INDEX table. 55 56 57Usage: 58 extract_unwind_tables.py --input_path [root path to unstripped chrome.so] 59 --output_path [output path] --dump_syms_path [path to dump_syms binary] 60""" 61 62import argparse 63import re 64import struct 65import subprocess 66import sys 67import tempfile 68 69 70_CFA_REG = '.cfa' 71_RA_REG = '.ra' 72 73_ADDR_ENTRY = 0 74_LENGTH_ENTRY = 1 75 76_CANT_UNWIND = 0xFFFF 77 78 79def _Write4Bytes(output_file, val): 80 """Writes a 32 bit unsigned integer to the given output file.""" 81 output_file.write(struct.pack('<L', val)); 82 83 84def _Write2Bytes(output_file, val): 85 """Writes a 16 bit unsigned integer to the given output file.""" 86 output_file.write(struct.pack('<H', val)); 87 88 89def _FindRuleForRegister(cfi_row, reg): 90 """Returns the postfix expression as string for a given register. 91 92 Breakpad CFI row format specifies rules for unwinding each register in postfix 93 expression form separated by space. Each rule starts with register name and a 94 colon. Eg: "CFI R1: <rule> R2: <rule>". 95 """ 96 out = [] 97 found_register = False 98 for part in cfi_row: 99 if found_register: 100 if part[-1] == ':': 101 break 102 out.append(part) 103 elif part == reg + ':': 104 found_register = True 105 return ' '.join(out) 106 107 108def _GetCfaAndRaOffset(cfi_row): 109 """Returns a tuple with 2 numbers (cfa_offset, ra_offset). 110 111 Returns right values if rule matches the predefined criteria. Returns (0, 0) 112 otherwise. The criteria for CFA rule is postfix form "SP <val> +" and RA rule 113 is postfix form "CFA -<val> + ^". 114 """ 115 cfa_offset = 0 116 ra_offset = 0 117 cfa_rule = _FindRuleForRegister(cfi_row, _CFA_REG) 118 ra_rule = _FindRuleForRegister(cfi_row, _RA_REG) 119 if cfa_rule and re.match(r'sp [0-9]+ \+', cfa_rule): 120 cfa_offset = int(cfa_rule.split()[1], 10) 121 if ra_rule: 122 if not re.match(r'.cfa -[0-9]+ \+ \^', ra_rule): 123 return (0, 0) 124 ra_offset = -1 * int(ra_rule.split()[1], 10) 125 return (cfa_offset, ra_offset) 126 127 128def _GetAllCfiRows(symbol_file): 129 """Returns parsed CFI data from given symbol_file. 130 131 Each entry in the cfi data dictionary returned is a map from function start 132 address to array of function rows, starting with FUNCTION type, followed by 133 one or more CFI rows. 134 """ 135 cfi_data = {} 136 current_func = [] 137 for line in symbol_file: 138 if 'STACK CFI' not in line: 139 continue 140 141 parts = line.split() 142 data = {} 143 if parts[2] == 'INIT': 144 # Add the previous function to the output 145 if len(current_func) > 1: 146 cfi_data[current_func[0][_ADDR_ENTRY]] = current_func 147 current_func = [] 148 149 # The function line is of format "STACK CFI INIT <addr> <length> ..." 150 data[_ADDR_ENTRY] = int(parts[3], 16) 151 data[_LENGTH_ENTRY] = int(parts[4], 16) 152 153 # Condition C1: Skip if length is large. 154 if data[_LENGTH_ENTRY] == 0 or data[_LENGTH_ENTRY] > 0xffff: 155 continue # Skip the current function. 156 else: 157 # The current function is skipped. 158 if len(current_func) == 0: 159 continue 160 161 # The CFI row is of format "STACK CFI <addr> .cfa: <expr> .ra: <expr> ..." 162 data[_ADDR_ENTRY] = int(parts[2], 16) 163 (data[_CFA_REG], data[_RA_REG]) = _GetCfaAndRaOffset(parts) 164 165 # Condition C2 and C3: Skip based on limits on offsets. 166 if data[_CFA_REG] == 0 or data[_RA_REG] >= 16 or data[_CFA_REG] > 0xffff: 167 current_func = [] 168 continue 169 assert data[_CFA_REG] % 4 == 0 170 # Since we skipped functions with code size larger than 0xffff, we should 171 # have no function offset larger than the same value. 172 assert data[_ADDR_ENTRY] - current_func[0][_ADDR_ENTRY] < 0xffff 173 174 if data[_ADDR_ENTRY] == 0: 175 # Skip current function, delete all previous entries. 176 current_func = [] 177 continue 178 assert data[_ADDR_ENTRY] % 2 == 0 179 current_func.append(data) 180 181 # Condition C4: Skip function without CFI rows. 182 if len(current_func) > 1: 183 cfi_data[current_func[0][_ADDR_ENTRY]] = current_func 184 return cfi_data 185 186 187def _WriteCfiData(cfi_data, out_file): 188 """Writes the CFI data in defined format to out_file.""" 189 # Stores the final data that will be written to UNW_DATA table, in order 190 # with 2 byte items. 191 unw_data = [] 192 193 # Represent all the CFI data of functions as set of numbers and map them to an 194 # index in the |unw_data|. This index is later written to the UNW_INDEX table 195 # for each function. This map is used to find index of the data for functions. 196 data_to_index = {} 197 # Store mapping between the functions to the index. 198 func_addr_to_index = {} 199 previous_func_end = 0 200 for addr, function in sorted(cfi_data.iteritems()): 201 # Add an empty function entry when functions CFIs are missing between 2 202 # functions. 203 if previous_func_end != 0 and addr - previous_func_end > 4: 204 func_addr_to_index[previous_func_end + 2] = _CANT_UNWIND 205 previous_func_end = addr + cfi_data[addr][0][_LENGTH_ENTRY] 206 207 assert len(function) > 1 208 func_data_arr = [] 209 func_data = 0 210 # The first row contains the function address and length. The rest of the 211 # rows have CFI data. Create function data array as given in the format. 212 for row in function[1:]: 213 addr_offset = row[_ADDR_ENTRY] - addr 214 cfa_offset = (row[_CFA_REG]) | (row[_RA_REG] / 4) 215 216 func_data_arr.append(addr_offset) 217 func_data_arr.append(cfa_offset) 218 219 # Consider all the rows in the data as one large integer and add it as a key 220 # to the |data_to_index|. 221 for data in func_data_arr: 222 func_data = (func_data << 16) | data 223 224 row_count = len(func_data_arr) / 2 225 if func_data not in data_to_index: 226 # When data is not found, create a new index = len(unw_data), and write 227 # the data to |unw_data|. 228 index = len(unw_data) 229 data_to_index[func_data] = index 230 unw_data.append(row_count) 231 for row in func_data_arr: 232 unw_data.append(row) 233 else: 234 # If the data was found, then use the same index for the function. 235 index = data_to_index[func_data] 236 assert row_count == unw_data[index] 237 func_addr_to_index[addr] = data_to_index[func_data] 238 239 # Mark the end end of last function entry. 240 func_addr_to_index[previous_func_end + 2] = _CANT_UNWIND 241 242 # Write the size of UNW_INDEX file in bytes. 243 _Write4Bytes(out_file, len(func_addr_to_index)) 244 245 # Write the UNW_INDEX table. First list of addresses and then indices. 246 sorted_unw_index = sorted(func_addr_to_index.iteritems()) 247 for addr, index in sorted_unw_index: 248 _Write4Bytes(out_file, addr) 249 for addr, index in sorted_unw_index: 250 _Write2Bytes(out_file, index) 251 252 # Write the UNW_DATA table. 253 for data in unw_data: 254 _Write2Bytes(out_file, data) 255 256 257def _ParseCfiData(sym_stream, output_path): 258 cfi_data = _GetAllCfiRows(sym_stream) 259 with open(output_path, 'wb') as out_file: 260 _WriteCfiData(cfi_data, out_file) 261 262 263def main(): 264 parser = argparse.ArgumentParser() 265 parser.add_argument( 266 '--input_path', required=True, 267 help='The input path of the unstripped binary') 268 parser.add_argument( 269 '--output_path', required=True, 270 help='The path of the output file') 271 parser.add_argument( 272 '--dump_syms_path', required=True, 273 help='The path of the dump_syms binary') 274 275 args = parser.parse_args() 276 cmd = ['./' + args.dump_syms_path, args.input_path] 277 proc = subprocess.Popen(cmd, bufsize=-1, stdout=subprocess.PIPE) 278 _ParseCfiData(proc.stdout, args.output_path) 279 assert proc.wait() == 0 280 281 return 0 282 283if __name__ == '__main__': 284 sys.exit(main()) 285