1# This file is Copyright 2019 Volatility Foundation and licensed under the Volatility Software License 1.0
2# which is available at https://www.volatilityfoundation.org/license/vsl-v1.0
3#
4import binascii
5import datetime
6import json
7import logging
8import os
9from bisect import bisect
10from typing import Tuple, Dict, Any, Optional, Union, List
11from urllib import request, error
12
13from volatility.framework import contexts, interfaces, constants
14from volatility.framework.layers import physical, msf, resources
15
16vollog = logging.getLogger(__name__)
17
18primatives = {
19    0x03: ("void", {
20        "endian": "little",
21        "kind": "void",
22        "signed": True,
23        "size": 0
24    }),
25    0x08: ("HRESULT", {
26        "endian": "little",
27        "kind": "int",
28        "signed": False,
29        "size": 4
30    }),
31    0x10: ("char", {
32        "endian": "little",
33        "kind": "char",
34        "signed": True,
35        "size": 1
36    }),
37    0x20: ("unsigned char", {
38        "endian": "little",
39        "kind": "char",
40        "signed": False,
41        "size": 1
42    }),
43    0x68: ("int8", {
44        "endian": "little",
45        "kind": "int",
46        "signed": True,
47        "size": 1
48    }),
49    0x69: ("uint8", {
50        "endian": "little",
51        "kind": "int",
52        "signed": False,
53        "size": 1
54    }),
55    0x70: ("char", {
56        "endian": "little",
57        "kind": "char",
58        "signed": True,
59        "size": 1
60    }),
61    0x71: ("wchar", {
62        "endian": "little",
63        "kind": "int",
64        "signed": True,
65        "size": 2
66    }),
67    # 0x7a: ("rchar16", {}),
68    # 0x7b: ("rchar32", {}),
69    0x11: ("short", {
70        "endian": "little",
71        "kind": "int",
72        "signed": True,
73        "size": 2
74    }),
75    0x21: ("unsigned short", {
76        "endian": "little",
77        "kind": "int",
78        "signed": False,
79        "size": 2
80    }),
81    0x72: ("short", {
82        "endian": "little",
83        "kind": "int",
84        "signed": True,
85        "size": 2
86    }),
87    0x73: ("unsigned short", {
88        "endian": "little",
89        "kind": "int",
90        "signed": False,
91        "size": 2
92    }),
93    0x12: ("long", {
94        "endian": "little",
95        "kind": "int",
96        "signed": True,
97        "size": 4
98    }),
99    0x22: ("unsigned long", {
100        "endian": "little",
101        "kind": "int",
102        "signed": False,
103        "size": 4
104    }),
105    0x74: ("int", {
106        "endian": "little",
107        "kind": "int",
108        "signed": True,
109        "size": 4
110    }),
111    0x75: ("unsigned int", {
112        "endian": "little",
113        "kind": "int",
114        "signed": False,
115        "size": 4
116    }),
117    0x13: ("long long", {
118        "endian": "little",
119        "kind": "int",
120        "signed": True,
121        "size": 8
122    }),
123    0x23: ("unsigned long long", {
124        "endian": "little",
125        "kind": "int",
126        "signed": False,
127        "size": 8
128    }),
129    0x76: ("long long", {
130        "endian": "little",
131        "kind": "int",
132        "signed": True,
133        "size": 8
134    }),
135    0x77: ("unsigned long long", {
136        "endian": "little",
137        "kind": "int",
138        "signed": False,
139        "size": 8
140    }),
141    0x14: ("int128", {
142        "endian": "little",
143        "kind": "int",
144        "signed": True,
145        "size": 16
146    }),
147    0x24: ("uint128", {
148        "endian": "little",
149        "kind": "int",
150        "signed": False,
151        "size": 16
152    }),
153    0x78: ("int128", {
154        "endian": "little",
155        "kind": "int",
156        "signed": True,
157        "size": 16
158    }),
159    0x79: ("uint128", {
160        "endian": "little",
161        "kind": "int",
162        "signed": False,
163        "size": 16
164    }),
165    0x46: ("f16", {
166        "endian": "little",
167        "kind": "float",
168        "signed": True,
169        "size": 2
170    }),
171    0x40: ("f32", {
172        "endian": "little",
173        "kind": "float",
174        "signed": True,
175        "size": 4
176    }),
177    0x45: ("f32pp", {
178        "endian": "little",
179        "kind": "float",
180        "signed": True,
181        "size": 4
182    }),
183    0x44: ("f48", {
184        "endian": "little",
185        "kind": "float",
186        "signed": True,
187        "size": 6
188    }),
189    0x41: ("double", {
190        "endian": "little",
191        "kind": "float",
192        "signed": True,
193        "size": 8
194    }),
195    0x42: ("f80", {
196        "endian": "little",
197        "kind": "float",
198        "signed": True,
199        "size": 10
200    }),
201    0x43: ("f128", {
202        "endian": "little",
203        "kind": "float",
204        "signed": True,
205        "size": 16
206    })
207}
208
209indirections = {
210    0x100: ("pointer16", {
211        "endian": "little",
212        "kind": "int",
213        "signed": False,
214        "size": 2
215    }),
216    0x400: ("pointer32", {
217        "endian": "little",
218        "kind": "int",
219        "signed": False,
220        "size": 4
221    }),
222    0x600: ("pointer64", {
223        "endian": "little",
224        "kind": "int",
225        "signed": False,
226        "size": 8
227    })
228}
229
230
231class ForwardArrayCount:
232
233    def __init__(self, size, element_type):
234        self.element_type = element_type
235        self.size = size
236
237
238class PdbReader:
239    """Class to read Microsoft PDB files.
240
241    This reads the various streams according to various sources as to how pdb should be read.
242    These sources include:
243
244    https://docs.rs/crate/pdb/0.5.0/source/src/
245    https://github.com/moyix/pdbparse
246    https://llvm.org/docs/PDB/index.html
247    https://github.com/Microsoft/microsoft-pdb/
248
249    In order to generate ISF files, we need the type stream (2), and the symbols stream (variable).
250    The MultiStream Format wrapper is handled as a volatility layer, which constructs sublayers for each stream.
251    The streams can then be read contiguously allowing the data to be accessed.
252
253    Volatility's type system is strong when everything must be laid out in advance, but PDB data is reasonably dynamic,
254    particularly when it comes to names.  We must therefore parse it after we've collected other information already.
255    This is in comparison to something such as Construct/pdbparse which can use just-parsed data to determine dynamically
256    sized data following.
257    """
258
259    def __init__(self,
260                 context: interfaces.context.ContextInterface,
261                 location: str,
262                 progress_callback: constants.ProgressCallback = None) -> None:
263        self._layer_name, self._context = self.load_pdb_layer(context, location)
264        self._dbiheader = None  # type: Optional[interfaces.objects.ObjectInterface]
265        if not progress_callback:
266            progress_callback = lambda x, y: None
267        self._progress_callback = progress_callback
268        self.types = [
269        ]  # type: List[Tuple[interfaces.objects.ObjectInterface, str, interfaces.objects.ObjectInterface]]
270        self.bases = {}  # type: Dict[str, Any]
271        self.user_types = {}  # type: Dict[str, Any]
272        self.enumerations = {}  # type: Dict[str, Any]
273        self.symbols = {}  # type: Dict[str, Any]
274        self._omap_mapping = []  # type: List[Tuple[int, int]]
275        self._sections = []  # type: List[interfaces.objects.ObjectInterface]
276        self.metadata = {"format": "6.1.0", "windows": {}}
277
278    @property
279    def context(self):
280        return self._context
281
282    @property
283    def pdb_layer_name(self):
284        return self._layer_name
285
286    @classmethod
287    def load_pdb_layer(cls, context: interfaces.context.ContextInterface,
288                       location: str) -> Tuple[str, interfaces.context.ContextInterface]:
289        """Loads a PDB file into a layer within the context and returns the
290        name of the new layer.
291
292        Note: the context may be changed by this method
293        """
294        physical_layer_name = context.layers.free_layer_name("FileLayer")
295        physical_config_path = interfaces.configuration.path_join("pdbreader", physical_layer_name)
296
297        # Create the file layer
298        # This must be specific to get us started, setup the config and run
299        new_context = context.clone()
300        new_context.config[interfaces.configuration.path_join(physical_config_path, "location")] = location
301
302        physical_layer = physical.FileLayer(new_context, physical_config_path, physical_layer_name)
303        new_context.add_layer(physical_layer)
304
305        # Add on the MSF format layer
306        msf_layer_name = context.layers.free_layer_name("MSFLayer")
307        msf_config_path = interfaces.configuration.path_join("pdbreader", msf_layer_name)
308        new_context.config[interfaces.configuration.path_join(msf_config_path, "base_layer")] = physical_layer_name
309        msf_layer = msf.PdbMultiStreamFormat(new_context, msf_config_path, msf_layer_name)
310        new_context.add_layer(msf_layer)
311
312        msf_layer.read_streams()
313
314        return msf_layer_name, new_context
315
316    def reset(self):
317        self.bases = {}
318        self.user_types = {}
319        self.enumerations = {}
320        self.symbols = {}
321        self._sections = []
322        self._omap_mapping = []
323
324    def read_necessary_streams(self):
325        """Read streams to populate the various internal components for a PDB
326        table."""
327        if not self.metadata['windows'].get('pdb', None):
328            self.read_pdb_info_stream()
329        if not self.user_types:
330            self.read_tpi_stream()
331        if not self.symbols:
332            self.read_symbol_stream()
333
334    def read_tpi_stream(self) -> None:
335        """Reads the TPI type steam."""
336        vollog.debug("Reading TPI")
337        tpi_layer = self._context.layers.get(self._layer_name + "_stream2", None)
338        if not tpi_layer:
339            raise ValueError("No TPI stream available")
340        module = self._context.module(module_name = tpi_layer.pdb_symbol_table, layer_name = tpi_layer.name, offset = 0)
341        header = module.object(object_type = "TPI_HEADER", offset = 0)
342
343        # Check the header
344        if not (56 <= header.header_size < 1024):
345            raise ValueError("TPI Stream Header size outside normal bounds")
346        if header.index_min < 4096:
347            raise ValueError("Minimum TPI index is 4096, found: {}".format(header.index_min))
348        if header.index_max < header.index_min:
349            raise ValueError("Maximum TPI index is smaller than minimum TPI index, found: {} < {} ".format(
350                header.index_max, header.index_min))
351
352        # Reset the state
353        self.types = []
354        type_references = {}  # type: Dict[str, int]
355
356        offset = header.header_size
357        # Ensure we use the same type everywhere
358        length_type = "unsigned short"
359        length_len = module.get_type(length_type).size
360        type_index = 1
361        while tpi_layer.maximum_address - offset > 0:
362            self._progress_callback(offset * 100 / tpi_layer.maximum_address, "Reading TPI layer")
363            length = module.object(object_type = length_type, offset = offset)
364            if not isinstance(length, int):
365                raise TypeError("Non-integer length provided")
366            offset += length_len
367            output, consumed = self.consume_type(module, offset, length)
368            leaf_type, name, value = output
369            for tag_type in ['unnamed', 'anonymous']:
370                if name == '<{}-tag>'.format(tag_type) or name == '__{}'.format(tag_type):
371                    name = '__{}_'.format(tag_type) + hex(len(self.types) + 0x1000)[2:]
372            type_references[name] = len(self.types)
373            self.types.append((leaf_type, name, value))
374            offset += length
375            type_index += 1
376            # Since types can only refer to earlier types, assigning the name at this point is fine
377
378        if tpi_layer.maximum_address - offset != 0:
379            raise ValueError("Type values did not fill the TPI stream correctly")
380
381        self.process_types(type_references)
382
383    def read_dbi_stream(self) -> None:
384        """Reads the DBI Stream."""
385        vollog.debug("Reading DBI stream")
386        dbi_layer = self._context.layers.get(self._layer_name + "_stream3", None)
387        if not dbi_layer:
388            raise ValueError("No DBI stream available")
389        module = self._context.module(module_name = dbi_layer.pdb_symbol_table, layer_name = dbi_layer.name, offset = 0)
390        self._dbiheader = module.object(object_type = "DBI_HEADER", offset = 0)
391
392        if not self._dbiheader:
393            raise ValueError("DBI Header could not be read")
394
395        # Skip past sections we don't care about to get to the DBG header
396        dbg_hdr_offset = (self._dbiheader.vol.size + self._dbiheader.module_size + self._dbiheader.secconSize +
397                          self._dbiheader.secmapSize + self._dbiheader.filinfSize + self._dbiheader.tsmapSize +
398                          self._dbiheader.ecinfoSize)
399        self._dbidbgheader = module.object(object_type = "DBI_DBG_HEADER", offset = dbg_hdr_offset)
400
401        self._sections = []
402        self._omap_mapping = []
403
404        if self._dbidbgheader.snSectionHdrOrig != -1:
405            section_orig_layer_name = self._layer_name + "_stream" + str(self._dbidbgheader.snSectionHdrOrig)
406            consumed, length = 0, self.context.layers[section_orig_layer_name].maximum_address
407            while consumed < length:
408                section = self.context.object(dbi_layer.pdb_symbol_table + constants.BANG + "IMAGE_SECTION_HEADER",
409                                              offset = consumed,
410                                              layer_name = section_orig_layer_name)
411                self._sections.append(section)
412                consumed += section.vol.size
413
414            if self._dbidbgheader.snOmapFromSrc != -1:
415                omap_layer_name = self._layer_name + "_stream" + str(self._dbidbgheader.snOmapFromSrc)
416                length = self.context.layers[omap_layer_name].maximum_address
417                data = self.context.layers[omap_layer_name].read(0, length)
418                # For speed we don't use the framework to read this (usually sizeable) data
419                for i in range(0, length, 8):
420                    self._omap_mapping.append(
421                        (int.from_bytes(data[i:i + 4],
422                                        byteorder = 'little'), int.from_bytes(data[i + 4:i + 8], byteorder = 'little')))
423        elif self._dbidbgheader.snSectionHdr != -1:
424            section_layer_name = self._layer_name + "_stream" + str(self._dbidbgheader.snSectionHdr)
425            consumed, length = 0, self.context.layers[section_layer_name].maximum_address
426            while consumed < length:
427                section = self.context.object(dbi_layer.pdb_symbol_table + constants.BANG + "IMAGE_SECTION_HEADER",
428                                              offset = consumed,
429                                              layer_name = section_layer_name)
430                self._sections.append(section)
431                consumed += section.vol.size
432
433    def read_symbol_stream(self):
434        """Reads in the symbol stream."""
435        self.symbols = {}
436
437        if not self._dbiheader:
438            self.read_dbi_stream()
439
440        vollog.debug("Reading Symbols")
441
442        symrec_layer = self._context.layers.get(self._layer_name + "_stream" + str(self._dbiheader.symrecStream), None)
443        if not symrec_layer:
444            raise ValueError("No SymRec stream available")
445        module = self._context.module(module_name = symrec_layer.pdb_symbol_table,
446                                      layer_name = symrec_layer.name,
447                                      offset = 0)
448
449        offset = 0
450        max_address = symrec_layer.maximum_address
451
452        while offset < max_address:
453            self._progress_callback(offset * 100 / max_address, "Reading Symbol layer")
454            sym = module.object(object_type = "GLOBAL_SYMBOL", offset = offset)
455            leaf_type = module.object(object_type = "unsigned short", offset = sym.leaf_type.vol.offset)
456            name = None
457            address = None
458            if sym.segment < len(self._sections):
459                if leaf_type == 0x110e:
460                    # v3 symbol (c-string)
461                    name = self.parse_string(sym.name, False, sym.length - sym.vol.size + 2)
462                    address = self._sections[sym.segment - 1].VirtualAddress + sym.offset
463                elif leaf_type == 0x1009:
464                    # v2 symbol (pascal-string)
465                    name = self.parse_string(sym.name, True, sym.length - sym.vol.size + 2)
466                    address = self._sections[sym.segment - 1].VirtualAddress + sym.offset
467                else:
468                    vollog.debug("Only v2 and v3 symbols are supported")
469            if name:
470                if self._omap_mapping:
471                    address = self.omap_lookup(address)
472                stripped_name = self.name_strip(name)
473                self.symbols[stripped_name] = {"address": address}
474                if name != self.name_strip(name):
475                    self.symbols[stripped_name]["linkage_name"] = name
476            offset += sym.length + 2  # Add on length itself
477
478    def read_pdb_info_stream(self):
479        """Reads in the pdb information stream."""
480        if not self._dbiheader:
481            self.read_dbi_stream()
482
483        vollog.debug("Reading PDB Info")
484        pdb_info_layer = self._context.layers.get(self._layer_name + "_stream1", None)
485        if not pdb_info_layer:
486            raise ValueError("No PDB Info Stream available")
487        module = self._context.module(module_name = pdb_info_layer.pdb_symbol_table,
488                                      layer_name = pdb_info_layer.name,
489                                      offset = 0)
490        pdb_info = module.object(object_type = "PDB_INFORMATION", offset = 0)
491
492        self.metadata['windows']['pdb'] = {
493            "GUID": self.convert_bytes_to_guid(pdb_info.GUID),
494            "age": pdb_info.age,
495            "database": "ntkrnlmp.pdb",
496            "machine_type": self._dbiheader.machine
497        }
498
499    def convert_bytes_to_guid(self, original: bytes) -> str:
500        """Convert the bytes to the correct ordering for a GUID."""
501        orig_guid_list = [x for x in original]
502        guid_list = []
503        for i in [3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15]:
504            guid_list.append(orig_guid_list[i])
505        return str(binascii.hexlify(bytes(guid_list)), "latin-1").upper()
506
507    # SYMBOL HANDLING CODE
508
509    def omap_lookup(self, address):
510        """Looks up an address using the omap mapping."""
511        pos = bisect(self._omap_mapping, (address, -1))
512        if self._omap_mapping[pos][0] > address:
513            pos -= 1
514
515        if not self._omap_mapping[pos][1]:
516            return 0
517        return self._omap_mapping[pos][1] + (address - self._omap_mapping[pos][0])
518
519    def name_strip(self, name):
520        """Strips unnecessary components from the start of a symbol name."""
521        new_name = name
522
523        if new_name[:1] in ["_", "@", "\u007F"]:
524            new_name = new_name[1:]
525
526        name_array = new_name.split("@")
527        if len(name_array) == 2:
528            if name_array[1].isnumeric() and name_array[0][0] != "?":
529                new_name = name_array[0]
530            else:
531                new_name = name
532
533        return new_name
534
535    def get_json(self):
536        """Returns the intermediate format JSON data from this pdb file."""
537        self.read_necessary_streams()
538
539        # Set the time/datestamp for the output
540        self.metadata["producer"] = {
541            "datetime": datetime.datetime.now().isoformat(),
542            "name": "volatility3",
543            "version": constants.PACKAGE_VERSION
544        }
545
546        return {
547            "user_types": self.user_types,
548            "enums": self.enumerations,
549            "base_types": self.bases,
550            "symbols": self.symbols,
551            "metadata": self.metadata,
552        }
553
554    def get_type_from_index(self, index: int) -> Union[List[Any], Dict[str, Any]]:
555        """Takes a type index and returns appropriate dictionary."""
556        if index < 0x1000:
557            base_name, base = primatives[index & 0xff]
558            self.bases[base_name] = base
559            result = {"kind": "base", "name": base_name}  # type: Union[List[Dict[str, Any]], Dict[str, Any]]
560            indirection = (index & 0xf00)
561            if indirection:
562                pointer_name, pointer_base = indirections[indirection]
563                if self.bases.get('pointer', None) and self.bases['pointer'] == pointer_base:
564                    result = {"kind": "pointer", "subtype": result}
565                else:
566                    self.bases[pointer_name] = pointer_base
567                    result = {"kind": "pointer", "base": pointer_name, "subtype": result}
568            return result
569        else:
570            leaf_type, name, value = self.types[index - 0x1000]
571            result = {"kind": "struct", "name": name}
572            if leaf_type in [leaf_type.LF_MODIFIER]:
573                result = self.get_type_from_index(value.subtype_index)
574            elif leaf_type in [leaf_type.LF_ARRAY, leaf_type.LF_ARRAY_ST, leaf_type.LF_STRIDED_ARRAY]:
575                result = {
576                    "count": ForwardArrayCount(value.size, value.element_type),
577                    "kind": "array",
578                    "subtype": self.get_type_from_index(value.element_type)
579                }
580            elif leaf_type in [leaf_type.LF_BITFIELD]:
581                result = {
582                    "kind": "bitfield",
583                    "type": self.get_type_from_index(value.underlying_type),
584                    "bit_length": value.length,
585                    "bit_position": value.position
586                }
587            elif leaf_type in [leaf_type.LF_POINTER]:
588                # Since we use the base['pointer'] to set the size for pointers, update it and check we don't get conflicts
589                size = self.get_size_from_index(index)
590                if self.bases.get("pointer", None) is None:
591                    self.bases['pointer'] = {"endian": "little", "kind": "int", "signed": False, "size": size}
592                else:
593                    if size != self.bases['pointer']['size']:
594                        raise ValueError("Native pointers with different sizes!")
595                result = {"kind": "pointer", "subtype": self.get_type_from_index(value.subtype_index)}
596            elif leaf_type in [leaf_type.LF_PROCEDURE]:
597                return {"kind": "function"}
598            elif leaf_type in [leaf_type.LF_UNION]:
599                result = {"kind": "union", "name": name}
600            elif leaf_type in [leaf_type.LF_ENUM]:
601                result = {"kind": "enum", "name": name}
602            elif leaf_type in [leaf_type.LF_FIELDLIST]:
603                result = value
604            elif not name:
605                raise ValueError("No name for structure that should be named")
606            return result
607
608    def get_size_from_index(self, index: int) -> int:
609        """Returns the size of the structure based on the type index
610        provided."""
611        result = -1
612        name = ''
613        if index < 0x1000:
614            if (index & 0xf00):
615                _, base = indirections[index & 0xf00]
616            else:
617                _, base = primatives[index & 0xff]
618            result = base['size']
619        else:
620            leaf_type, name, value = self.types[index - 0x1000]
621            if leaf_type in [
622                    leaf_type.LF_UNION, leaf_type.LF_CLASS, leaf_type.LF_CLASS_ST, leaf_type.LF_STRUCTURE,
623                    leaf_type.LF_STRUCTURE_ST, leaf_type.LF_INTERFACE
624            ]:
625                if not value.properties.forward_reference:
626                    result = value.size
627            elif leaf_type in [leaf_type.LF_ARRAY, leaf_type.LF_ARRAY_ST, leaf_type.LF_STRIDED_ARRAY]:
628                result = value.size
629            elif leaf_type in [leaf_type.LF_MODIFIER, leaf_type.LF_ENUM, leaf_type.LF_ARGLIST]:
630                result = self.get_size_from_index(value.subtype_index)
631            elif leaf_type in [leaf_type.LF_MEMBER]:
632                result = self.get_size_from_index(value.field_type)
633            elif leaf_type in [leaf_type.LF_BITFIELD]:
634                result = self.get_size_from_index(value.underlying_type)
635            elif leaf_type in [leaf_type.LF_POINTER]:
636                result = value.size
637                if not result:
638                    if value.pointer_type == 0x0a:
639                        return 4
640                    elif value.pointer_type == 0x0c:
641                        return 8
642                    else:
643                        raise ValueError("Pointer size could not be determined")
644            elif leaf_type in [leaf_type.LF_PROCEDURE]:
645                raise ValueError("LF_PROCEDURE size could not be identified")
646            else:
647                raise ValueError("Unable to determine size of leaf_type {}".format(leaf_type.lookup()))
648        if result <= 0:
649            raise ValueError("Invalid size identified: {} ({})".format(index, name))
650        return result
651
652    ### TYPE HANDLING CODE
653
654    def process_types(self, type_references: Dict[str, int]) -> None:
655        """Reads the TPI and symbol streams to populate the reader's
656        variables."""
657
658        self.bases = {}
659        self.user_types = {}
660        self.enumerations = {}
661
662        max_len = len(self.types)
663        for index in range(max_len):
664            self._progress_callback(index * 100 / max_len, "Processing types")
665            leaf_type, name, value = self.types[index]
666            if leaf_type in [
667                    leaf_type.LF_CLASS, leaf_type.LF_CLASS_ST, leaf_type.LF_STRUCTURE, leaf_type.LF_STRUCTURE_ST,
668                    leaf_type.LF_INTERFACE
669            ]:
670                if not value.properties.forward_reference:
671                    self.user_types[name] = {
672                        "kind": "struct",
673                        "size": value.size,
674                        "fields": self.convert_fields(value.fields - 0x1000)
675                    }
676            elif leaf_type in [leaf_type.LF_UNION]:
677                if not value.properties.forward_reference:
678                    # Deal with UNION types
679                    self.user_types[name] = {
680                        "kind": "union",
681                        "size": value.size,
682                        "fields": self.convert_fields(value.fields - 0x1000)
683                    }
684            elif leaf_type in [leaf_type.LF_ENUM]:
685                if not value.properties.forward_reference:
686                    base = self.get_type_from_index(value.subtype_index)
687                    if not isinstance(base, Dict):
688                        raise ValueError("Invalid base type returned for Enumeration")
689                    self.enumerations[name] = {
690                        'base': base['name'],
691                        'size': self.get_size_from_index(value.subtype_index),
692                        'constants':
693                        dict([(name, enum.value) for _, name, enum in self.get_type_from_index(value.fields)])
694                    }
695
696        # Re-run through for ForwardSizeReferences
697        self.user_types = self.replace_forward_references(self.user_types, type_references)
698
699    def consume_type(
700        self, module: interfaces.context.ModuleInterface, offset: int, length: int
701    ) -> Tuple[Tuple[Optional[interfaces.objects.ObjectInterface], Optional[str], Union[
702            None, List, interfaces.objects.ObjectInterface]], int]:
703        """Returns a (leaf_type, name, object) Tuple for a type, and the number
704        of bytes consumed."""
705        result = None, None, None  # type: Tuple[Optional[interfaces.objects.ObjectInterface], Optional[str], Optional[Union[List, interfaces.objects.ObjectInterface]]]
706        leaf_type = self.context.object(module.get_enumeration("LEAF_TYPE"),
707                                        layer_name = module._layer_name,
708                                        offset = offset)
709        consumed = leaf_type.vol.base_type.size
710        remaining = length - consumed
711
712        if leaf_type in [
713                leaf_type.LF_CLASS, leaf_type.LF_CLASS_ST, leaf_type.LF_STRUCTURE, leaf_type.LF_STRUCTURE_ST,
714                leaf_type.LF_INTERFACE
715        ]:
716            structure = module.object(object_type = "LF_STRUCTURE", offset = offset + consumed)
717            name_offset = structure.name.vol.offset - structure.vol.offset
718            name, value, excess = self.determine_extended_value(leaf_type, structure.size, module,
719                                                                remaining - name_offset)
720            structure.size = value
721            structure.name = name
722            consumed += remaining
723            result = leaf_type, name, structure
724        elif leaf_type in [leaf_type.LF_MEMBER, leaf_type.LF_MEMBER_ST]:
725            member = module.object(object_type = "LF_MEMBER", offset = offset + consumed)
726            name_offset = member.name.vol.offset - member.vol.offset
727            name, value, excess = self.determine_extended_value(leaf_type, member.offset, module,
728                                                                remaining - name_offset)
729            member.offset = value
730            member.name = name
731            result = leaf_type, name, member
732            consumed += member.vol.size + len(name) + 1 + excess
733        elif leaf_type in [leaf_type.LF_ARRAY, leaf_type.LF_ARRAY_ST, leaf_type.LF_STRIDED_ARRAY]:
734            array = module.object(object_type = "LF_ARRAY", offset = offset + consumed)
735            name_offset = array.name.vol.offset - array.vol.offset
736            name, value, excess = self.determine_extended_value(leaf_type, array.size, module, remaining - name_offset)
737            array.size = value
738            array.name = name
739            result = leaf_type, name, array
740            consumed += remaining
741        elif leaf_type in [leaf_type.LF_ENUMERATE]:
742            enum = module.object(object_type = 'LF_ENUMERATE', offset = offset + consumed)
743            name_offset = enum.name.vol.offset - enum.vol.offset
744            name, value, excess = self.determine_extended_value(leaf_type, enum.value, module, remaining - name_offset)
745            enum.value = value
746            enum.name = name
747            result = leaf_type, name, enum
748            consumed += enum.vol.size + len(name) + 1 + excess
749        elif leaf_type in [leaf_type.LF_ARGLIST, leaf_type.LF_ENUM]:
750            enum = module.object(object_type = "LF_ENUM", offset = offset + consumed)
751            name_offset = enum.name.vol.offset - enum.vol.offset
752            name = self.parse_string(enum.name, leaf_type < leaf_type.LF_ST_MAX, size = remaining - name_offset)
753            enum.name = name
754            result = leaf_type, name, enum
755            consumed += remaining
756        elif leaf_type in [leaf_type.LF_UNION]:
757            union = module.object(object_type = "LF_UNION", offset = offset + consumed)
758            name_offset = union.name.vol.offset - union.vol.offset
759            name = self.parse_string(union.name, leaf_type < leaf_type.LF_ST_MAX, size = remaining - name_offset)
760            result = leaf_type, name, union
761            consumed += remaining
762        elif leaf_type in [leaf_type.LF_MODIFIER, leaf_type.LF_POINTER, leaf_type.LF_PROCEDURE]:
763            obj = module.object(object_type = leaf_type.lookup(), offset = offset + consumed)
764            result = leaf_type, None, obj
765            consumed += remaining
766        elif leaf_type in [leaf_type.LF_FIELDLIST]:
767            sub_length = remaining
768            sub_offset = offset + consumed
769            fields = []
770            while length > consumed:
771                subfield, sub_consumed = self.consume_type(module, sub_offset, sub_length)
772                sub_consumed += self.consume_padding(module.layer_name, sub_offset + sub_consumed)
773                sub_length -= sub_consumed
774                sub_offset += sub_consumed
775                consumed += sub_consumed
776                fields.append(subfield)
777            result = leaf_type, None, fields
778        elif leaf_type in [leaf_type.LF_BITFIELD]:
779            bitfield = module.object(object_type = "LF_BITFIELD", offset = offset + consumed)
780            result = leaf_type, None, bitfield
781            consumed += remaining
782        else:
783            raise TypeError("Unhandled leaf_type: {}".format(leaf_type))
784
785        return result, consumed
786
787    def consume_padding(self, layer_name: str, offset: int) -> int:
788        """Returns the amount of padding used between fields."""
789        val = self.context.layers[layer_name].read(offset, 1)
790        if not ((val[0] & 0xf0) == 0xf0):
791            return 0
792        return (int(val[0]) & 0x0f)
793
794    def convert_fields(self, fields: int) -> Dict[Optional[str], Dict[str, Any]]:
795        """Converts a field list into a list of fields."""
796        result = {}  # type: Dict[Optional[str], Dict[str, Any]]
797        _, _, fields_struct = self.types[fields]
798        if not isinstance(fields_struct, list):
799            vollog.warning("Fields structure did not contain a list of fields")
800            return result
801        for field in fields_struct:
802            _, name, member = field
803            result[name] = {"offset": member.offset, "type": self.get_type_from_index(member.field_type)}
804        return result
805
806    def replace_forward_references(self, types, type_references):
807        """Finds all ForwardArrayCounts and calculates them once
808        ForwardReferences have been resolved."""
809        if isinstance(types, dict):
810            for k, v in types.items():
811                types[k] = self.replace_forward_references(v, type_references)
812        elif isinstance(types, list):
813            new_types = []
814            for v in types:
815                new_types.append(self.replace_forward_references(v, type_references))
816            types = new_types
817        elif isinstance(types, ForwardArrayCount):
818            element_type = types.element_type
819            # If we're a forward array count, we need to do the calculation now after all the types have been processed
820            loop = True
821            while loop:
822                loop = False
823                if element_type > 0x1000:
824                    _, name, toplevel_type = self.types[element_type - 0x1000]
825                    # If there's no name, the original size is probably fine as long as we're not indirect (LF_MODIFIER)
826                    if not name and isinstance(
827                            toplevel_type,
828                            interfaces.objects.ObjectInterface) and toplevel_type.vol.type_name.endswith('LF_MODIFIER'):
829                        # We have check they don't point to a forward reference, so we go round again with the subtype
830                        element_type = toplevel_type.subtype_index
831                        loop = True
832                    elif name:
833                        # If there is a name, look it up so we're not using a reference but the real thing
834                        element_type = type_references[name] + 0x1000
835            return types.size // self.get_size_from_index(element_type)
836        return types
837
838    # COMMON CODE
839
840    @staticmethod
841    def parse_string(structure: interfaces.objects.ObjectInterface,
842                     parse_as_pascal: bool = False,
843                     size: int = 0) -> str:
844        """Consumes either a c-string or a pascal string depending on the
845        leaf_type."""
846        if not parse_as_pascal:
847            name = structure.cast("string", max_length = size, encoding = "latin-1")
848        else:
849            name = structure.cast("pascal_string")
850            name = name.string.cast("string", max_length = name.length, encoding = "latin-1")
851        return str(name)
852
853    def determine_extended_value(self, leaf_type: interfaces.objects.ObjectInterface,
854                                 value: interfaces.objects.ObjectInterface, module: interfaces.context.ModuleInterface,
855                                 length: int) -> Tuple[str, interfaces.objects.ObjectInterface, int]:
856        """Reads a value and potentially consumes more data to construct the
857        value."""
858        excess = 0
859        if value >= leaf_type.LF_CHAR:
860            sub_leaf_type = self.context.object(self.context.symbol_space.get_enumeration(leaf_type.vol.type_name),
861                                                layer_name = leaf_type.vol.layer_name,
862                                                offset = value.vol.offset)
863            # Set the offset at just after the previous size type
864            offset = value.vol.offset + value.vol.data_format.length
865            if sub_leaf_type in [leaf_type.LF_CHAR]:
866                value = module.object(object_type = 'char', offset = offset)
867            elif sub_leaf_type in [leaf_type.LF_SHORT]:
868                value = module.object(object_type = 'short', offset = offset)
869            elif sub_leaf_type in [leaf_type.LF_USHORT]:
870                value = module.object(object_type = 'unsigned short', offset = offset)
871            elif sub_leaf_type in [leaf_type.LF_LONG]:
872                value = module.object(object_type = 'long', offset = offset)
873            elif sub_leaf_type in [leaf_type.LF_ULONG]:
874                value = module.object(object_type = 'unsigned long', offset = offset)
875            else:
876                raise TypeError("Unexpected extended value type")
877            excess = value.vol.data_format.length
878            # Updated the consume/offset counters
879        name = module.object(object_type = "string", offset = value.vol.offset + value.vol.data_format.length)
880        name_str = self.parse_string(name, leaf_type < leaf_type.LF_ST_MAX, size = length - excess)
881        return name_str, value, excess
882
883
884class PdbRetreiver:
885
886    def retreive_pdb(self,
887                     guid: str,
888                     file_name: str,
889                     progress_callback: constants.ProgressCallback = None) -> Optional[str]:
890        vollog.info("Download PDB file...")
891        file_name = ".".join(file_name.split(".")[:-1] + ['pdb'])
892        for sym_url in ['http://msdl.microsoft.com/download/symbols']:
893            url = sym_url + "/{}/{}/".format(file_name, guid)
894
895            result = None
896            for suffix in [file_name, file_name[:-1] + '_']:
897                try:
898                    vollog.debug("Attempting to retrieve {}".format(url + suffix))
899                    result = resources.ResourceAccessor(progress_callback).open(url + suffix)
900                except error.HTTPError as excp:
901                    vollog.debug("Failed with {}".format(excp))
902            if result:
903                break
904        if progress_callback is not None:
905            progress_callback(100, "Downloading {}".format(url + suffix))
906        return result.name
907
908
909if __name__ == '__main__':
910    import argparse
911
912    class PrintedProgress(object):
913        """A progress handler that prints the progress value and the
914        description onto the command line."""
915
916        def __init__(self):
917            self._max_message_len = 0
918
919        def __call__(self, progress: Union[int, float], description: str = None):
920            """A simple function for providing text-based feedback.
921
922            .. warning:: Only for development use.
923
924            Args:
925                progress: Percentage of progress of the current procedure
926            """
927            message = "\rProgress: {0: 7.2f}\t\t{1:}".format(round(progress, 2), description or '')
928            message_len = len(message)
929            self._max_message_len = max([self._max_message_len, message_len])
930            print(message, end = (' ' * (self._max_message_len - message_len)) + '\r')
931
932    parser = argparse.ArgumentParser(
933        description = "Read PDB files and convert to Volatility 3 Intermediate Symbol Format")
934    parser.add_argument("-o", "--output", metavar = "OUTPUT", help = "Filename for data output", required = True)
935    file_group = parser.add_argument_group("file", description = "File-based conversion of PDB to ISF")
936    file_group.add_argument("-f", "--file", metavar = "FILE", help = "PDB file to translate to ISF")
937    data_group = parser.add_argument_group("data", description = "Convert based on a GUID and filename pattern")
938    data_group.add_argument("-p", "--pattern", metavar = "PATTERN", help = "Filename pattern to recover PDB file")
939    data_group.add_argument("-g",
940                            "--guid",
941                            metavar = "GUID",
942                            help = "GUID + Age string for the required PDB file",
943                            default = None)
944    data_group.add_argument("-k",
945                            "--keep",
946                            action = "store_true",
947                            default = False,
948                            help = "Keep the downloaded PDB file")
949    args = parser.parse_args()
950
951    pg_cb = PrintedProgress()
952
953    delfile = False
954    filename = None
955    if args.guid is not None and args.pattern is not None:
956        filename = PdbRetreiver().retreive_pdb(guid = args.guid, file_name = args.pattern, progress_callback = pg_cb)
957        delfile = True
958    elif args.file:
959        filename = args.file
960    else:
961        parser.error("No GUID/pattern or file provided")
962
963    if not filename:
964        parser.error("No suitable filename provided or retrieved")
965
966    ctx = contexts.Context()
967    if not os.path.exists(filename):
968        parser.error("File {} does not exists".format(filename))
969    location = "file:" + request.pathname2url(filename)
970
971    convertor = PdbReader(ctx, location, progress_callback = pg_cb)
972
973    with open(args.output, "w") as f:
974        json.dump(convertor.get_json(), f, indent = 2, sort_keys = True)
975
976    if args.keep:
977        print("Temporary PDB file: {}".format(filename))
978    elif delfile:
979        os.remove(filename)
980