# This file is Copyright 2019 Volatility Foundation and licensed under the Volatility Software License 1.0
# which is available at https://www.volatilityfoundation.org/license/vsl-v1.0
#
import binascii
import datetime
import json
import logging
import os
from bisect import bisect
from typing import Tuple, Dict, Any, Optional, Union, List
from urllib import request, error

from volatility.framework import contexts, interfaces, constants
from volatility.framework.layers import physical, msf, resources

vollog = logging.getLogger(__name__)
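# The two tables below map CodeView primitive type indices to ISF base types.  As described in the
# format references listed in the PdbReader docstring, type indices below 0x1000 are "primitive"
# indices: the low byte selects the basic type, while bits 8-11 select a pointer mode, which is why
# the pointer variants live in the separate `indirections` table keyed on (index & 0xf00).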
primatives = {
    0x03: ("void", {"endian": "little", "kind": "void", "signed": True, "size": 0}),
    0x08: ("HRESULT", {"endian": "little", "kind": "int", "signed": False, "size": 4}),
    0x10: ("char", {"endian": "little", "kind": "char", "signed": True, "size": 1}),
    0x20: ("unsigned char", {"endian": "little", "kind": "char", "signed": False, "size": 1}),
    0x68: ("int8", {"endian": "little", "kind": "int", "signed": True, "size": 1}),
    0x69: ("uint8", {"endian": "little", "kind": "int", "signed": False, "size": 1}),
    0x70: ("char", {"endian": "little", "kind": "char", "signed": True, "size": 1}),
    0x71: ("wchar", {"endian": "little", "kind": "int", "signed": True, "size": 2}),
    # 0x7a: ("rchar16", {}),
    # 0x7b: ("rchar32", {}),
    0x11: ("short", {"endian": "little", "kind": "int", "signed": True, "size": 2}),
    0x21: ("unsigned short", {"endian": "little", "kind": "int", "signed": False, "size": 2}),
    0x72: ("short", {"endian": "little", "kind": "int", "signed": True, "size": 2}),
    0x73: ("unsigned short", {"endian": "little", "kind": "int", "signed": False, "size": 2}),
    0x12: ("long", {"endian": "little", "kind": "int", "signed": True, "size": 4}),
    0x22: ("unsigned long", {"endian": "little", "kind": "int", "signed": False, "size": 4}),
    0x74: ("int", {"endian": "little", "kind": "int", "signed": True, "size": 4}),
    0x75: ("unsigned int", {"endian": "little", "kind": "int", "signed": False, "size": 4}),
    0x13: ("long long", {"endian": "little", "kind": "int", "signed": True, "size": 8}),
    0x23: ("unsigned long long", {"endian": "little", "kind": "int", "signed": False, "size": 8}),
    0x76: ("long long", {"endian": "little", "kind": "int", "signed": True, "size": 8}),
    0x77: ("unsigned long long", {"endian": "little", "kind": "int", "signed": False, "size": 8}),
    0x14: ("int128", {"endian": "little", "kind": "int", "signed": True, "size": 16}),
    0x24: ("uint128", {"endian": "little", "kind": "int", "signed": False, "size": 16}),
    0x78: ("int128", {"endian": "little", "kind": "int", "signed": True, "size": 16}),
    0x79: ("uint128", {"endian": "little", "kind": "int", "signed": False, "size": 16}),
    0x46: ("f16", {"endian": "little", "kind": "float", "signed": True, "size": 2}),
    0x40: ("f32", {"endian": "little", "kind": "float", "signed": True, "size": 4}),
    0x45: ("f32pp", {"endian": "little", "kind": "float", "signed": True, "size": 4}),
    0x44: ("f48", {"endian": "little", "kind": "float", "signed": True, "size": 6}),
    0x41: ("double", {"endian": "little", "kind": "float", "signed": True, "size": 8}),
    0x42: ("f80", {"endian": "little", "kind": "float", "signed": True, "size": 10}),
    0x43: ("f128", {"endian": "little", "kind": "float", "signed": True, "size": 16})
}

indirections = {
    0x100: ("pointer16", {"endian": "little", "kind": "int", "signed": False, "size": 2}),
    0x400: ("pointer32", {"endian": "little", "kind": "int", "signed": False, "size": 4}),
    0x600: ("pointer64", {"endian": "little", "kind": "int", "signed": False, "size": 8})
}


class ForwardArrayCount:

    def __init__(self, size, element_type):
        self.element_type = element_type
        self.size = size


class PdbReader:
    """Class to read Microsoft PDB files.

    This reads the various streams that make up a PDB file, following several public descriptions
    of the format:

    https://docs.rs/crate/pdb/0.5.0/source/src/
    https://github.com/moyix/pdbparse
    https://llvm.org/docs/PDB/index.html
    https://github.com/Microsoft/microsoft-pdb/

    In order to generate ISF files, we need the type stream (stream 2) and the symbol stream (whose
    stream number is variable and recorded in the DBI header).  The MultiStream Format wrapper is
    handled as a volatility layer, which constructs sublayers for each stream, so each stream can
    then be read contiguously.

    Volatility's type system works best when everything can be laid out in advance, but PDB data is
    reasonably dynamic, particularly when it comes to names, so some values can only be filled in
    after other information has been collected.  This is in contrast to something such as
    Construct/pdbparse, which can use just-parsed data to determine the dynamically sized data that
    follows.
    """

    def __init__(self,
                 context: interfaces.context.ContextInterface,
                 location: str,
                 progress_callback: constants.ProgressCallback = None) -> None:
        self._layer_name, self._context = self.load_pdb_layer(context, location)
        self._dbiheader = None  # type: Optional[interfaces.objects.ObjectInterface]
        if not progress_callback:
            progress_callback = lambda x, y: None
        self._progress_callback = progress_callback
        self.types = [
        ]  # type: List[Tuple[interfaces.objects.ObjectInterface, str, interfaces.objects.ObjectInterface]]
        self.bases = {}  # type: Dict[str, Any]
        self.user_types = {}  # type: Dict[str, Any]
        self.enumerations = {}  # type: Dict[str, Any]
        self.symbols = {}  # type: Dict[str, Any]
        self._omap_mapping = []  # type: List[Tuple[int, int]]
        self._sections = []  # type: List[interfaces.objects.ObjectInterface]
        self.metadata = {"format": "6.1.0", "windows": {}}

    @property
    def context(self):
        return self._context

    @property
    def pdb_layer_name(self):
        return self._layer_name
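    # A sketch of the layer stack that load_pdb_layer (below) builds, drawn from the code itself
    # rather than external documentation: a FileLayer wraps the raw PDB file, a PdbMultiStreamFormat
    # layer parses the MSF container on top of it, and read_streams() exposes each MSF stream as a
    # sublayer named "<msf_layer_name>_streamN" (stream 1 holds the PDB info, stream 2 the TPI types
    # and stream 3 the DBI data), which is what the read_*_stream methods below look up.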
    @classmethod
    def load_pdb_layer(cls, context: interfaces.context.ContextInterface,
                       location: str) -> Tuple[str, interfaces.context.ContextInterface]:
        """Loads a PDB file into a layer within the context, returning the name of the new MSF
        layer and the context that contains it.

        Note: this method works on a clone of the supplied context, so callers should use the
        returned context rather than the one passed in.
        """
        physical_layer_name = context.layers.free_layer_name("FileLayer")
        physical_config_path = interfaces.configuration.path_join("pdbreader", physical_layer_name)

        # Create the file layer
        # This must be specific to get us started, setup the config and run
        new_context = context.clone()
        new_context.config[interfaces.configuration.path_join(physical_config_path, "location")] = location

        physical_layer = physical.FileLayer(new_context, physical_config_path, physical_layer_name)
        new_context.add_layer(physical_layer)

        # Add on the MSF format layer
        msf_layer_name = context.layers.free_layer_name("MSFLayer")
        msf_config_path = interfaces.configuration.path_join("pdbreader", msf_layer_name)
        new_context.config[interfaces.configuration.path_join(msf_config_path, "base_layer")] = physical_layer_name
        msf_layer = msf.PdbMultiStreamFormat(new_context, msf_config_path, msf_layer_name)
        new_context.add_layer(msf_layer)

        msf_layer.read_streams()

        return msf_layer_name, new_context

    def reset(self):
        self.bases = {}
        self.user_types = {}
        self.enumerations = {}
        self.symbols = {}
        self._sections = []
        self._omap_mapping = []

    def read_necessary_streams(self):
        """Reads the streams required to populate the reader's internal state (PDB info, types and
        symbols)."""
        if not self.metadata['windows'].get('pdb', None):
            self.read_pdb_info_stream()
        if not self.user_types:
            self.read_tpi_stream()
        if not self.symbols:
            self.read_symbol_stream()

    def read_tpi_stream(self) -> None:
        """Reads the TPI (type) stream."""
        vollog.debug("Reading TPI")
        tpi_layer = self._context.layers.get(self._layer_name + "_stream2", None)
        if not tpi_layer:
            raise ValueError("No TPI stream available")
        module = self._context.module(module_name = tpi_layer.pdb_symbol_table, layer_name = tpi_layer.name, offset = 0)
        header = module.object(object_type = "TPI_HEADER", offset = 0)

        # Check the header
        if not (56 <= header.header_size < 1024):
            raise ValueError("TPI Stream Header size outside normal bounds")
        if header.index_min < 4096:
            raise ValueError("Minimum TPI index is 4096, found: {}".format(header.index_min))
        if header.index_max < header.index_min:
            raise ValueError("Maximum TPI index is smaller than minimum TPI index, found: {} < {} ".format(
                header.index_max, header.index_min))

        # Reset the state
        self.types = []
        type_references = {}  # type: Dict[str, int]

        offset = header.header_size
        # Ensure we use the same type everywhere
        length_type = "unsigned short"
        length_len = module.get_type(length_type).size
        type_index = 1
        while tpi_layer.maximum_address - offset > 0:
            self._progress_callback(offset * 100 / tpi_layer.maximum_address, "Reading TPI layer")
            length = module.object(object_type = length_type, offset = offset)
            if not isinstance(length, int):
                raise TypeError("Non-integer length provided")
            offset += length_len
            output, consumed = self.consume_type(module, offset, length)
            leaf_type, name, value = output
            for tag_type in ['unnamed', 'anonymous']:
                if name == '<{}-tag>'.format(tag_type) or name == '__{}'.format(tag_type):
                    name = '__{}_'.format(tag_type) + hex(len(self.types) + 0x1000)[2:]
            type_references[name] = len(self.types)
            self.types.append((leaf_type, name, value))
            offset += length
            type_index += 1
            # Since types can only refer to earlier types, assigning the name at this point is fine

        if tpi_layer.maximum_address - offset != 0:
            raise ValueError("Type values did not fill the TPI stream correctly")

        self.process_types(type_references)
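    # A note on the record framing handled by read_tpi_stream above (this follows the loop's own
    # behaviour and the usual CodeView layout rather than an authoritative specification): each TPI
    # record is a two-byte length followed by that many bytes of payload, and the payload itself
    # begins with a two-byte LEAF_TYPE value, e.g.
    #     36 00 | 05 15 <remaining payload...>   ->  record length 0x36, leaf type 0x1505 (LF_STRUCTURE)
    # consume_type() is handed the payload offset and length and returns the parsed record plus the
    # number of bytes it consumed.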
    def read_dbi_stream(self) -> None:
        """Reads the DBI Stream."""
        vollog.debug("Reading DBI stream")
        dbi_layer = self._context.layers.get(self._layer_name + "_stream3", None)
        if not dbi_layer:
            raise ValueError("No DBI stream available")
        module = self._context.module(module_name = dbi_layer.pdb_symbol_table, layer_name = dbi_layer.name, offset = 0)
        self._dbiheader = module.object(object_type = "DBI_HEADER", offset = 0)

        if not self._dbiheader:
            raise ValueError("DBI Header could not be read")

        # Skip past sections we don't care about to get to the DBG header
        dbg_hdr_offset = (self._dbiheader.vol.size + self._dbiheader.module_size + self._dbiheader.secconSize +
                          self._dbiheader.secmapSize + self._dbiheader.filinfSize + self._dbiheader.tsmapSize +
                          self._dbiheader.ecinfoSize)
        self._dbidbgheader = module.object(object_type = "DBI_DBG_HEADER", offset = dbg_hdr_offset)

        self._sections = []
        self._omap_mapping = []

        if self._dbidbgheader.snSectionHdrOrig != -1:
            section_orig_layer_name = self._layer_name + "_stream" + str(self._dbidbgheader.snSectionHdrOrig)
            consumed, length = 0, self.context.layers[section_orig_layer_name].maximum_address
            while consumed < length:
                section = self.context.object(dbi_layer.pdb_symbol_table + constants.BANG + "IMAGE_SECTION_HEADER",
                                              offset = consumed,
                                              layer_name = section_orig_layer_name)
                self._sections.append(section)
                consumed += section.vol.size

            if self._dbidbgheader.snOmapFromSrc != -1:
                omap_layer_name = self._layer_name + "_stream" + str(self._dbidbgheader.snOmapFromSrc)
                length = self.context.layers[omap_layer_name].maximum_address
                data = self.context.layers[omap_layer_name].read(0, length)
                # For speed we don't use the framework to read this (usually sizeable) data
                for i in range(0, length, 8):
                    self._omap_mapping.append(
                        (int.from_bytes(data[i:i + 4], byteorder = 'little'),
                         int.from_bytes(data[i + 4:i + 8], byteorder = 'little')))
        elif self._dbidbgheader.snSectionHdr != -1:
            section_layer_name = self._layer_name + "_stream" + str(self._dbidbgheader.snSectionHdr)
            consumed, length = 0, self.context.layers[section_layer_name].maximum_address
            while consumed < length:
                section = self.context.object(dbi_layer.pdb_symbol_table + constants.BANG + "IMAGE_SECTION_HEADER",
                                              offset = consumed,
                                              layer_name = section_layer_name)
                self._sections.append(section)
                consumed += section.vol.size
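    # read_symbol_stream (below) only handles public symbol records.  The two leaf values it checks
    # appear to correspond to S_PUB32 (0x110e, name stored as a C string) and the older "_ST"
    # variant (0x1009, name stored as a pascal string) in the CodeView sources referenced above.
    # Each symbol's address is rebuilt as the owning section's VirtualAddress plus the record's
    # offset, then optionally translated through the OMAP table loaded by read_dbi_stream.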
    def read_symbol_stream(self):
        """Reads in the symbol stream."""
        self.symbols = {}

        if not self._dbiheader:
            self.read_dbi_stream()

        vollog.debug("Reading Symbols")

        symrec_layer = self._context.layers.get(self._layer_name + "_stream" + str(self._dbiheader.symrecStream), None)
        if not symrec_layer:
            raise ValueError("No SymRec stream available")
        module = self._context.module(module_name = symrec_layer.pdb_symbol_table,
                                      layer_name = symrec_layer.name,
                                      offset = 0)

        offset = 0
        max_address = symrec_layer.maximum_address

        while offset < max_address:
            self._progress_callback(offset * 100 / max_address, "Reading Symbol layer")
            sym = module.object(object_type = "GLOBAL_SYMBOL", offset = offset)
            leaf_type = module.object(object_type = "unsigned short", offset = sym.leaf_type.vol.offset)
            name = None
            address = None
            if sym.segment < len(self._sections):
                if leaf_type == 0x110e:
                    # v3 symbol (c-string)
                    name = self.parse_string(sym.name, False, sym.length - sym.vol.size + 2)
                    address = self._sections[sym.segment - 1].VirtualAddress + sym.offset
                elif leaf_type == 0x1009:
                    # v2 symbol (pascal-string)
                    name = self.parse_string(sym.name, True, sym.length - sym.vol.size + 2)
                    address = self._sections[sym.segment - 1].VirtualAddress + sym.offset
                else:
                    vollog.debug("Only v2 and v3 symbols are supported")
            if name:
                if self._omap_mapping:
                    address = self.omap_lookup(address)
                stripped_name = self.name_strip(name)
                self.symbols[stripped_name] = {"address": address}
                if name != self.name_strip(name):
                    self.symbols[stripped_name]["linkage_name"] = name
            offset += sym.length + 2  # Add on the size of the length field itself

    def read_pdb_info_stream(self):
        """Reads in the pdb information stream."""
        if not self._dbiheader:
            self.read_dbi_stream()

        vollog.debug("Reading PDB Info")
        pdb_info_layer = self._context.layers.get(self._layer_name + "_stream1", None)
        if not pdb_info_layer:
            raise ValueError("No PDB Info Stream available")
        module = self._context.module(module_name = pdb_info_layer.pdb_symbol_table,
                                      layer_name = pdb_info_layer.name,
                                      offset = 0)
        pdb_info = module.object(object_type = "PDB_INFORMATION", offset = 0)

        self.metadata['windows']['pdb'] = {
            "GUID": self.convert_bytes_to_guid(pdb_info.GUID),
            "age": pdb_info.age,
            "database": "ntkrnlmp.pdb",
            "machine_type": self._dbiheader.machine
        }

    def convert_bytes_to_guid(self, original: bytes) -> str:
        """Converts the raw bytes to the correct ordering for a GUID."""
        orig_guid_list = [x for x in original]
        guid_list = []
        for i in [3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15]:
            guid_list.append(orig_guid_list[i])
        return str(binascii.hexlify(bytes(guid_list)), "latin-1").upper()
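    # Worked example for convert_bytes_to_guid (the byte values are purely illustrative): the first
    # three GUID fields are stored little-endian in the PDB, so raw bytes
    #     00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
    # are reordered to 03 02 01 00 05 04 07 06 08 09 0a 0b 0c 0d 0e 0f and rendered as
    # "030201000504070608090A0B0C0D0E0F".  The symbol-server lookups driven by PdbRetreiver expect
    # this string with the age appended (the "GUID + Age" value taken by the --guid argument below).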
    # SYMBOL HANDLING CODE

    def omap_lookup(self, address):
        """Looks up an address using the omap mapping."""
        pos = bisect(self._omap_mapping, (address, -1))
        if self._omap_mapping[pos][0] > address:
            pos -= 1

        if not self._omap_mapping[pos][1]:
            return 0
        return self._omap_mapping[pos][1] + (address - self._omap_mapping[pos][0])

    def name_strip(self, name):
        """Strips unnecessary components from the start of a symbol name."""
        new_name = name

        if new_name[:1] in ["_", "@", "\u007F"]:
            new_name = new_name[1:]

        name_array = new_name.split("@")
        if len(name_array) == 2:
            if name_array[1].isnumeric() and name_array[0][0] != "?":
                new_name = name_array[0]
            else:
                new_name = name

        return new_name

    def get_json(self):
        """Returns the intermediate format JSON data from this pdb file."""
        self.read_necessary_streams()

        # Set the time/datestamp for the output
        self.metadata["producer"] = {
            "datetime": datetime.datetime.now().isoformat(),
            "name": "volatility3",
            "version": constants.PACKAGE_VERSION
        }

        return {
            "user_types": self.user_types,
            "enums": self.enumerations,
            "base_types": self.bases,
            "symbols": self.symbols,
            "metadata": self.metadata,
        }

    def get_type_from_index(self, index: int) -> Union[List[Any], Dict[str, Any]]:
        """Takes a type index and returns the appropriate type dictionary."""
        if index < 0x1000:
            base_name, base = primatives[index & 0xff]
            self.bases[base_name] = base
            result = {"kind": "base", "name": base_name}  # type: Union[List[Dict[str, Any]], Dict[str, Any]]
            indirection = (index & 0xf00)
            if indirection:
                pointer_name, pointer_base = indirections[indirection]
                if self.bases.get('pointer', None) and self.bases['pointer'] == pointer_base:
                    result = {"kind": "pointer", "subtype": result}
                else:
                    self.bases[pointer_name] = pointer_base
                    result = {"kind": "pointer", "base": pointer_name, "subtype": result}
            return result
        else:
            leaf_type, name, value = self.types[index - 0x1000]
            result = {"kind": "struct", "name": name}
            if leaf_type in [leaf_type.LF_MODIFIER]:
                result = self.get_type_from_index(value.subtype_index)
            elif leaf_type in [leaf_type.LF_ARRAY, leaf_type.LF_ARRAY_ST, leaf_type.LF_STRIDED_ARRAY]:
                result = {
                    "count": ForwardArrayCount(value.size, value.element_type),
                    "kind": "array",
                    "subtype": self.get_type_from_index(value.element_type)
                }
            elif leaf_type in [leaf_type.LF_BITFIELD]:
                result = {
                    "kind": "bitfield",
                    "type": self.get_type_from_index(value.underlying_type),
                    "bit_length": value.length,
                    "bit_position": value.position
                }
            elif leaf_type in [leaf_type.LF_POINTER]:
                # Since bases['pointer'] sets the size for pointers, update it and check we don't get conflicts
                size = self.get_size_from_index(index)
                if self.bases.get("pointer", None) is None:
                    self.bases['pointer'] = {"endian": "little", "kind": "int", "signed": False, "size": size}
                else:
                    if size != self.bases['pointer']['size']:
                        raise ValueError("Native pointers with different sizes!")
                result = {"kind": "pointer", "subtype": self.get_type_from_index(value.subtype_index)}
            elif leaf_type in [leaf_type.LF_PROCEDURE]:
                return {"kind": "function"}
            elif leaf_type in [leaf_type.LF_UNION]:
                result = {"kind": "union", "name": name}
            elif leaf_type in [leaf_type.LF_ENUM]:
                result = {"kind": "enum", "name": name}
            elif leaf_type in [leaf_type.LF_FIELDLIST]:
                result = value
            elif not name:
                raise ValueError("No name for structure that should be named")
            return result
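    # Index handling shared by get_type_from_index (above) and get_size_from_index (below), shown
    # with illustrative values:
    #     0x0074      -> primatives[0x74]   ("int", 4 bytes)
    #     0x0674      -> pointer64 to int   (indirections[0x600] wrapping primatives[0x74])
    #     0x1000 + n  -> self.types[n]      (the n-th record read from the TPI stream)
    # The 0x1000 offset mirrors the header.index_min value checked in read_tpi_stream.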
    def get_size_from_index(self, index: int) -> int:
        """Returns the size of the structure based on the type index
        provided."""
        result = -1
        name = ''
        if index < 0x1000:
            if (index & 0xf00):
                _, base = indirections[index & 0xf00]
            else:
                _, base = primatives[index & 0xff]
            result = base['size']
        else:
            leaf_type, name, value = self.types[index - 0x1000]
            if leaf_type in [
                    leaf_type.LF_UNION, leaf_type.LF_CLASS, leaf_type.LF_CLASS_ST, leaf_type.LF_STRUCTURE,
                    leaf_type.LF_STRUCTURE_ST, leaf_type.LF_INTERFACE
            ]:
                if not value.properties.forward_reference:
                    result = value.size
            elif leaf_type in [leaf_type.LF_ARRAY, leaf_type.LF_ARRAY_ST, leaf_type.LF_STRIDED_ARRAY]:
                result = value.size
            elif leaf_type in [leaf_type.LF_MODIFIER, leaf_type.LF_ENUM, leaf_type.LF_ARGLIST]:
                result = self.get_size_from_index(value.subtype_index)
            elif leaf_type in [leaf_type.LF_MEMBER]:
                result = self.get_size_from_index(value.field_type)
            elif leaf_type in [leaf_type.LF_BITFIELD]:
                result = self.get_size_from_index(value.underlying_type)
            elif leaf_type in [leaf_type.LF_POINTER]:
                result = value.size
                if not result:
                    if value.pointer_type == 0x0a:
                        return 4
                    elif value.pointer_type == 0x0c:
                        return 8
                    else:
                        raise ValueError("Pointer size could not be determined")
            elif leaf_type in [leaf_type.LF_PROCEDURE]:
                raise ValueError("LF_PROCEDURE size could not be identified")
            else:
                raise ValueError("Unable to determine size of leaf_type {}".format(leaf_type.lookup()))
        if result <= 0:
            raise ValueError("Invalid size identified: {} ({})".format(index, name))
        return result

    ### TYPE HANDLING CODE

    def process_types(self, type_references: Dict[str, int]) -> None:
        """Processes the types read from the TPI stream to populate the reader's user_types and
        enumerations."""

        self.bases = {}
        self.user_types = {}
        self.enumerations = {}

        max_len = len(self.types)
        for index in range(max_len):
            self._progress_callback(index * 100 / max_len, "Processing types")
            leaf_type, name, value = self.types[index]
            if leaf_type in [
                    leaf_type.LF_CLASS, leaf_type.LF_CLASS_ST, leaf_type.LF_STRUCTURE, leaf_type.LF_STRUCTURE_ST,
                    leaf_type.LF_INTERFACE
            ]:
                if not value.properties.forward_reference:
                    self.user_types[name] = {
                        "kind": "struct",
                        "size": value.size,
                        "fields": self.convert_fields(value.fields - 0x1000)
                    }
            elif leaf_type in [leaf_type.LF_UNION]:
                if not value.properties.forward_reference:
                    # Deal with UNION types
                    self.user_types[name] = {
                        "kind": "union",
                        "size": value.size,
                        "fields": self.convert_fields(value.fields - 0x1000)
                    }
            elif leaf_type in [leaf_type.LF_ENUM]:
                if not value.properties.forward_reference:
                    base = self.get_type_from_index(value.subtype_index)
                    if not isinstance(base, Dict):
                        raise ValueError("Invalid base type returned for Enumeration")
                    self.enumerations[name] = {
                        'base': base['name'],
                        'size': self.get_size_from_index(value.subtype_index),
                        'constants':
                        dict([(name, enum.value) for _, name, enum in self.get_type_from_index(value.fields)])
                    }

        # Re-run through for ForwardSizeReferences
        self.user_types = self.replace_forward_references(self.user_types, type_references)
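    # For orientation, a sketch (not taken from any real PDB) of the ISF fragments process_types
    # emits: a structure becomes
    #     {"kind": "struct", "size": 40, "fields": {"Length": {"offset": 0, "type": {...}}, ...}}
    # in self.user_types, and an enumeration becomes
    #     {"base": "int", "size": 4, "constants": {"FirstValue": 0, "SecondValue": 1, ...}}
    # in self.enumerations.  The field and constant entries come from LF_FIELDLIST records parsed
    # by consume_type below.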
    def consume_type(
            self, module: interfaces.context.ModuleInterface, offset: int, length: int
    ) -> Tuple[Tuple[Optional[interfaces.objects.ObjectInterface], Optional[str], Union[
            None, List, interfaces.objects.ObjectInterface]], int]:
        """Returns a (leaf_type, name, object) Tuple for a type, and the number
        of bytes consumed."""
        result = None, None, None  # type: Tuple[Optional[interfaces.objects.ObjectInterface], Optional[str], Optional[Union[List, interfaces.objects.ObjectInterface]]]
        leaf_type = self.context.object(module.get_enumeration("LEAF_TYPE"),
                                        layer_name = module._layer_name,
                                        offset = offset)
        consumed = leaf_type.vol.base_type.size
        remaining = length - consumed

        if leaf_type in [
                leaf_type.LF_CLASS, leaf_type.LF_CLASS_ST, leaf_type.LF_STRUCTURE, leaf_type.LF_STRUCTURE_ST,
                leaf_type.LF_INTERFACE
        ]:
            structure = module.object(object_type = "LF_STRUCTURE", offset = offset + consumed)
            name_offset = structure.name.vol.offset - structure.vol.offset
            name, value, excess = self.determine_extended_value(leaf_type, structure.size, module,
                                                                remaining - name_offset)
            structure.size = value
            structure.name = name
            consumed += remaining
            result = leaf_type, name, structure
        elif leaf_type in [leaf_type.LF_MEMBER, leaf_type.LF_MEMBER_ST]:
            member = module.object(object_type = "LF_MEMBER", offset = offset + consumed)
            name_offset = member.name.vol.offset - member.vol.offset
            name, value, excess = self.determine_extended_value(leaf_type, member.offset, module,
                                                                remaining - name_offset)
            member.offset = value
            member.name = name
            result = leaf_type, name, member
            consumed += member.vol.size + len(name) + 1 + excess
        elif leaf_type in [leaf_type.LF_ARRAY, leaf_type.LF_ARRAY_ST, leaf_type.LF_STRIDED_ARRAY]:
            array = module.object(object_type = "LF_ARRAY", offset = offset + consumed)
            name_offset = array.name.vol.offset - array.vol.offset
            name, value, excess = self.determine_extended_value(leaf_type, array.size, module, remaining - name_offset)
            array.size = value
            array.name = name
            result = leaf_type, name, array
            consumed += remaining
        elif leaf_type in [leaf_type.LF_ENUMERATE]:
            enum = module.object(object_type = 'LF_ENUMERATE', offset = offset + consumed)
            name_offset = enum.name.vol.offset - enum.vol.offset
            name, value, excess = self.determine_extended_value(leaf_type, enum.value, module, remaining - name_offset)
            enum.value = value
            enum.name = name
            result = leaf_type, name, enum
            consumed += enum.vol.size + len(name) + 1 + excess
        elif leaf_type in [leaf_type.LF_ARGLIST, leaf_type.LF_ENUM]:
            enum = module.object(object_type = "LF_ENUM", offset = offset + consumed)
            name_offset = enum.name.vol.offset - enum.vol.offset
            name = self.parse_string(enum.name, leaf_type < leaf_type.LF_ST_MAX, size = remaining - name_offset)
            enum.name = name
            result = leaf_type, name, enum
            consumed += remaining
        elif leaf_type in [leaf_type.LF_UNION]:
            union = module.object(object_type = "LF_UNION", offset = offset + consumed)
            name_offset = union.name.vol.offset - union.vol.offset
            name = self.parse_string(union.name, leaf_type < leaf_type.LF_ST_MAX, size = remaining - name_offset)
            result = leaf_type, name, union
            consumed += remaining
        elif leaf_type in [leaf_type.LF_MODIFIER, leaf_type.LF_POINTER, leaf_type.LF_PROCEDURE]:
            obj = module.object(object_type = leaf_type.lookup(), offset = offset + consumed)
            result = leaf_type, None, obj
            consumed += remaining
        elif leaf_type in [leaf_type.LF_FIELDLIST]:
            sub_length = remaining
            sub_offset = offset + consumed
            fields = []
            while length > consumed:
                subfield, sub_consumed = self.consume_type(module, sub_offset, sub_length)
                sub_consumed += self.consume_padding(module.layer_name, sub_offset + sub_consumed)
                sub_length -= sub_consumed
                sub_offset += sub_consumed
                consumed += sub_consumed
                fields.append(subfield)
            result = leaf_type, None, fields
        elif leaf_type in [leaf_type.LF_BITFIELD]:
            bitfield = module.object(object_type = "LF_BITFIELD", offset = offset + consumed)
            result = leaf_type, None, bitfield
            consumed += remaining
        else:
            raise TypeError("Unhandled leaf_type: {}".format(leaf_type))

        return result, consumed

    def consume_padding(self, layer_name: str, offset: int) -> int:
        """Returns the amount of padding used between fields."""
        val = self.context.layers[layer_name].read(offset, 1)
        if not ((val[0] & 0xf0) == 0xf0):
            return 0
        return (int(val[0]) & 0x0f)
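    # consume_padding (above) relies on the CodeView convention (as far as the sources referenced
    # in the class docstring describe it) that alignment bytes inside a field list have their top
    # nibble set to 0xF, with the low nibble giving the number of bytes to skip; a 0xF3 byte, for
    # example, means that byte and the two that follow are padding.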
    def convert_fields(self, fields: int) -> Dict[Optional[str], Dict[str, Any]]:
        """Converts an LF_FIELDLIST type index into a dictionary of fields."""
        result = {}  # type: Dict[Optional[str], Dict[str, Any]]
        _, _, fields_struct = self.types[fields]
        if not isinstance(fields_struct, list):
            vollog.warning("Fields structure did not contain a list of fields")
            return result
        for field in fields_struct:
            _, name, member = field
            result[name] = {"offset": member.offset, "type": self.get_type_from_index(member.field_type)}
        return result

    def replace_forward_references(self, types, type_references):
        """Finds all ForwardArrayCounts and calculates them once
        ForwardReferences have been resolved."""
        if isinstance(types, dict):
            for k, v in types.items():
                types[k] = self.replace_forward_references(v, type_references)
        elif isinstance(types, list):
            new_types = []
            for v in types:
                new_types.append(self.replace_forward_references(v, type_references))
            types = new_types
        elif isinstance(types, ForwardArrayCount):
            element_type = types.element_type
            # If we're a forward array count, we need to do the calculation now after all the types have been processed
            loop = True
            while loop:
                loop = False
                if element_type > 0x1000:
                    _, name, toplevel_type = self.types[element_type - 0x1000]
                    # If there's no name, the original size is probably fine as long as we're not indirect (LF_MODIFIER)
                    if not name and isinstance(
                            toplevel_type,
                            interfaces.objects.ObjectInterface) and toplevel_type.vol.type_name.endswith('LF_MODIFIER'):
                        # We have to check that they don't point to a forward reference, so we go round again with the subtype
                        element_type = toplevel_type.subtype_index
                        loop = True
                    elif name:
                        # If there is a name, look it up so we're not using a reference but the real thing
                        element_type = type_references[name] + 0x1000
            return types.size // self.get_size_from_index(element_type)
        return types

    # COMMON CODE

    @staticmethod
    def parse_string(structure: interfaces.objects.ObjectInterface,
                     parse_as_pascal: bool = False,
                     size: int = 0) -> str:
        """Consumes either a c-string or a pascal string depending on the
        leaf_type."""
        if not parse_as_pascal:
            name = structure.cast("string", max_length = size, encoding = "latin-1")
        else:
            name = structure.cast("pascal_string")
            name = name.string.cast("string", max_length = name.length, encoding = "latin-1")
        return str(name)
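    # determine_extended_value (below) deals with CodeView "numeric leaves": when a record's 16-bit
    # value field is >= LF_CHAR, the stored value is not the number itself but a marker naming the
    # integer type that immediately follows (LF_CHAR, LF_SHORT, LF_USHORT, LF_LONG or LF_ULONG), so
    # the real value and the trailing name string sit a few bytes further on.  The `excess` it
    # returns is the size of that extra integer, which callers add to their consumed byte counts.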
    def determine_extended_value(self, leaf_type: interfaces.objects.ObjectInterface,
                                 value: interfaces.objects.ObjectInterface, module: interfaces.context.ModuleInterface,
                                 length: int) -> Tuple[str, interfaces.objects.ObjectInterface, int]:
        """Reads a value and potentially consumes more data to construct the
        value."""
        excess = 0
        if value >= leaf_type.LF_CHAR:
            sub_leaf_type = self.context.object(self.context.symbol_space.get_enumeration(leaf_type.vol.type_name),
                                                layer_name = leaf_type.vol.layer_name,
                                                offset = value.vol.offset)
            # Set the offset at just after the previous size type
            offset = value.vol.offset + value.vol.data_format.length
            if sub_leaf_type in [leaf_type.LF_CHAR]:
                value = module.object(object_type = 'char', offset = offset)
            elif sub_leaf_type in [leaf_type.LF_SHORT]:
                value = module.object(object_type = 'short', offset = offset)
            elif sub_leaf_type in [leaf_type.LF_USHORT]:
                value = module.object(object_type = 'unsigned short', offset = offset)
            elif sub_leaf_type in [leaf_type.LF_LONG]:
                value = module.object(object_type = 'long', offset = offset)
            elif sub_leaf_type in [leaf_type.LF_ULONG]:
                value = module.object(object_type = 'unsigned long', offset = offset)
            else:
                raise TypeError("Unexpected extended value type")
            # Track the extra bytes consumed by the extended value
            excess = value.vol.data_format.length
        # The name string follows the (possibly extended) value field
        name = module.object(object_type = "string", offset = value.vol.offset + value.vol.data_format.length)
        name_str = self.parse_string(name, leaf_type < leaf_type.LF_ST_MAX, size = length - excess)
        return name_str, value, excess


class PdbRetreiver:

    def retreive_pdb(self,
                     guid: str,
                     file_name: str,
                     progress_callback: constants.ProgressCallback = None) -> Optional[str]:
        vollog.info("Downloading PDB file...")
        file_name = ".".join(file_name.split(".")[:-1] + ['pdb'])
        for sym_url in ['http://msdl.microsoft.com/download/symbols']:
            url = sym_url + "/{}/{}/".format(file_name, guid)

            result = None
            for suffix in [file_name, file_name[:-1] + '_']:
                try:
                    vollog.debug("Attempting to retrieve {}".format(url + suffix))
                    result = resources.ResourceAccessor(progress_callback).open(url + suffix)
                except error.HTTPError as excp:
                    vollog.debug("Failed with {}".format(excp))
                if result:
                    break
            if result is None:
                # Neither suffix could be downloaded from this server, try the next one
                continue
            if progress_callback is not None:
                progress_callback(100, "Downloading {}".format(url + suffix))
            return result.name
        return None
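# PdbRetreiver builds standard symbol-server paths of the form
#     http://msdl.microsoft.com/download/symbols/<file>.pdb/<GUID+age>/<file>.pdb
# and, if that fails, retries with the final character replaced by "_", the symbol-server
# convention for compressed copies.  The GUID+age string is the caller-supplied value (see the
# --guid argument below); nothing here recomputes it.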
if __name__ == '__main__':
    import argparse

    class PrintedProgress(object):
        """A progress handler that prints the progress value and the
        description onto the command line."""

        def __init__(self):
            self._max_message_len = 0

        def __call__(self, progress: Union[int, float], description: str = None):
            """A simple function for providing text-based feedback.

            .. warning:: Only for development use.

            Args:
                progress: Percentage of progress of the current procedure
            """
            message = "\rProgress: {0: 7.2f}\t\t{1:}".format(round(progress, 2), description or '')
            message_len = len(message)
            self._max_message_len = max([self._max_message_len, message_len])
            print(message, end = (' ' * (self._max_message_len - message_len)) + '\r')

    parser = argparse.ArgumentParser(
        description = "Read PDB files and convert to Volatility 3 Intermediate Symbol Format")
    parser.add_argument("-o", "--output", metavar = "OUTPUT", help = "Filename for data output", required = True)
    file_group = parser.add_argument_group("file", description = "File-based conversion of PDB to ISF")
    file_group.add_argument("-f", "--file", metavar = "FILE", help = "PDB file to translate to ISF")
    data_group = parser.add_argument_group("data", description = "Convert based on a GUID and filename pattern")
    data_group.add_argument("-p", "--pattern", metavar = "PATTERN", help = "Filename pattern to recover PDB file")
    data_group.add_argument("-g",
                            "--guid",
                            metavar = "GUID",
                            help = "GUID + Age string for the required PDB file",
                            default = None)
    data_group.add_argument("-k",
                            "--keep",
                            action = "store_true",
                            default = False,
                            help = "Keep the downloaded PDB file")
    args = parser.parse_args()

    pg_cb = PrintedProgress()

    delfile = False
    filename = None
    if args.guid is not None and args.pattern is not None:
        filename = PdbRetreiver().retreive_pdb(guid = args.guid, file_name = args.pattern, progress_callback = pg_cb)
        delfile = True
    elif args.file:
        filename = args.file
    else:
        parser.error("No GUID/pattern or file provided")

    if not filename:
        parser.error("No suitable filename provided or retrieved")

    ctx = contexts.Context()
    if not os.path.exists(filename):
        parser.error("File {} does not exist".format(filename))
    location = "file:" + request.pathname2url(filename)

    convertor = PdbReader(ctx, location, progress_callback = pg_cb)

    with open(args.output, "w") as f:
        json.dump(convertor.get_json(), f, indent = 2, sort_keys = True)

    if args.keep:
        print("Temporary PDB file: {}".format(filename))
    elif delfile:
        os.remove(filename)
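# Example invocations (an editorial sketch: the script name and file paths are placeholders rather
# than anything defined in this file):
#     python pdbconv.py -f ntkrnlmp.pdb -o ntkrnlmp.json
#     python pdbconv.py -p ntkrnlmp.pdb -g <GUID+age string> -o ntkrnlmp.json --keep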