1import os
2import logging
3import hashlib
4from io import BufferedReader
5from typing import List, Optional  # pylint:disable=unused-import
6
7import sortedcontainers
8
9import archinfo
10from .region import Region, Segment, Section
11from .regions import Regions
12from .symbol import Symbol, SymbolType
13from ..address_translator import AT
14from ..memory import Clemory
15from ..errors import CLEOperationError, CLEError
16
17l = logging.getLogger(name=__name__)
18
19
class FunctionHintSource:
    """
    Integer constants that identify the source of a function hint.

    These are plain class attributes rather than members of an ``enum.Enum``.
    """
    EH_FRAME, EXTERNAL_EH_FRAME = 0, 1
26
27
class FunctionHint:
    """
    A hint that a function is located at a given address.

    :ivar int addr:     Address of the function.
    :ivar int size:     Size of the function, in bytes.
    :ivar source:       Where this hint came from.
    :vartype source:    int
    """

    __slots__ = ('addr', 'size', 'source')

    def __init__(self, addr, size, source):
        self.addr, self.size, self.source = addr, size, source

    def __repr__(self):
        shown = (self.addr, self.size)
        return "<FuncHint@%#x, %d bytes>" % shown
47
48
class ExceptionHandling:
    """
    One exception-handling record: a guarded address range and, optionally, its handler.

    Exception handlers are usually language-specific. In C++, it is usually implemented as try {} catch {} blocks.

    :ivar int start_addr:               The beginning of the try block.
    :ivar int size:                     Size of the try block.
    :ivar Optional[int] handler_addr:   Address of the exception handler code, or None if there is no handler.
    :ivar type:                         Type of the exception handler. Optional.
    :ivar Optional[int] func_addr:      Address of the function. Optional.
    """

    __slots__ = ('start_addr', 'size', 'handler_addr', 'type', 'func_addr',)

    def __init__(self, start_addr, size, handler_addr=None, type_=None, func_addr=None):
        # The guarded range
        self.start_addr, self.size = start_addr, size
        # Optional details; ``type_`` is stored under the attribute name ``type``
        self.handler_addr, self.type, self.func_addr = handler_addr, type_, func_addr

    def __repr__(self):
        end_addr = self.start_addr + self.size
        if self.handler_addr is None:
            return "<ExceptionHandling@%#x-%#x: no handler>" % (self.start_addr, end_addr)
        return "<ExceptionHandling@%#x-%#x: handler@%#x>" % (self.start_addr, end_addr, self.handler_addr)
80
81
82class Backend:
83    """
84    Main base class for CLE binary objects.
85
86    An alternate interface to this constructor exists as the static method :meth:`cle.loader.Loader.load_object`
87
88    :ivar binary:           The path to the file this object is loaded from
89    :ivar binary_basename:  The basename of the filepath, or a short representation of the stream it was loaded from
90    :ivar is_main_bin:      Whether this binary is loaded as the main executable
91    :ivar segments:         A listing of all the loaded segments in this file
92    :ivar sections:         A listing of all the demarked sections in the file
93    :ivar sections_map:     A dict mapping from section name to section
94    :ivar imports:          A mapping from symbol name to import relocation
95    :ivar resolved_imports: A list of all the import symbols that are successfully resolved
96    :ivar relocs:           A list of all the relocations in this binary
97    :ivar irelatives:       A list of tuples representing all the irelative relocations that need to be performed. The
98                            first item in the tuple is the address of the resolver function, and the second item is the
99                            address of where to write the result. The destination address is an RVA.
100    :ivar jmprel:           A mapping from symbol name to the address of its jump slot relocation, i.e. its GOT entry.
101    :ivar arch:             The architecture of this binary
102    :vartype arch:          archinfo.arch.Arch
103    :ivar str os:           The operating system this binary is meant to run under
104    :ivar int mapped_base:  The base address of this object in virtual memory
105    :ivar deps:             A list of names of shared libraries this binary depends on
106    :ivar linking:          'dynamic' or 'static'
107    :ivar linked_base:      The base address this object requests to be loaded at
108    :ivar bool pic:         Whether this object is position-independent
109    :ivar bool execstack:   Whether this executable has an executable stack
    :ivar str provides:     The name of the shared library dependency that this object resolves
111    :ivar list symbols:     A list of symbols provided by this object, sorted by address
112    :ivar has_memory:       Whether this backend is backed by a Clemory or not. As it stands now, a backend should still
113                            define `min_addr` and `max_addr` even if `has_memory` is False.
114    """
115    is_default = False
116
117    def __init__(self,
118            binary,
119            binary_stream,
120            loader=None,
121            is_main_bin=False,
122            entry_point=None,
123            arch=None,
124            base_addr=None,
125            force_rebase=False,
126            has_memory=True,
127            **kwargs):
128        """
129        :param binary:          The path to the binary to load
130        :param binary_stream:   The open stream to this binary. The reference to this will be held until you call close.
131        :param is_main_bin:     Whether this binary should be loaded as the main executable
132        """
133        self.binary = binary
134        self._binary_stream: BufferedReader = binary_stream
135        if self.binary is not None:
136            self.binary_basename = os.path.basename(self.binary)
137        elif hasattr(self._binary_stream, "name"):
138            self.binary_basename = os.path.basename(self._binary_stream.name)
139        else:
140            self.binary_basename = str(self._binary_stream)
141
142        for k in list(kwargs.keys()):
143            if k == 'custom_entry_point':
144                entry_point = kwargs.pop(k)
145            elif k == 'custom_arch':
146                arch = kwargs.pop(k)
147            elif k == 'custom_base_addr':
148                base_addr = kwargs.pop(k)
149            else:
150                continue
151            l.critical("Deprecation warning: the %s parameter has been renamed to %s", k, k[7:])
152
153        if kwargs != {}:
154            l.warning("Unused kwargs for loading binary %s: %s", self.binary, ', '.join(kwargs.keys()))
155
156        self.is_main_bin = is_main_bin
157        self.has_memory = has_memory
158        self.loader = loader
159        self._entry = 0
160        self._segments = Regions() # List of segments
161        self._sections = Regions() # List of sections
162        self.sections_map = {}  # Mapping from section name to section
163        self.symbols: 'sortedcontainers.SortedKeyList[Symbol]' = sortedcontainers.SortedKeyList(key=self._get_symbol_relative_addr)
164        self.imports = {}
165        self.resolved_imports = []
166        self.relocs = []
167        self.irelatives = []    # list of tuples (resolver, destination), dest w/o rebase
168        self.jmprel = {}
169        self.arch = None
170        self.os = None  # Let other stuff override this
171        self.compiler = None, None  # compiler name, version
172        self._symbol_cache = {}
173        # a list of directories to search for libraries specified by the object
174        self.extra_load_path = []
175        # attributes to enable SimProcedure guessing
176        self.guess_simprocs = False
177        self.guess_simprocs_hint = None
178
179        # checksums
180        self.md5 = None
181        self.sha256 = None
182
183        self.mapped_base_symbolic = 0
184        # These are set by cle, and should not be overriden manually
185        self.mapped_base = self.linked_base = 0 # not to be set manually - used by CLE
186
187        self.deps = []           # Needed shared objects (libraries dependencies)
188        self.child_objects = []  # any objects loaded directly out of this
189        self.parent_object = None
190        self.linking = None # Dynamic or static linking
191        self.pic = force_rebase
192        self.execstack = False
193
194        # tls info set by backend to communicate with thread manager
195        self.tls_used = False
196        self.tls_block_size = None
197        self.tls_data_size = None
198        self.tls_data_start = None
199        # tls info set by thread manager
200        self.tls_module_id = None
201        #self.tls_block_offset = None  # this is an ELF-only attribute
202
203        # exception handling
204        # they should be rebased when .rebase() is called
205        self.exception_handlings = []  # type: List[ExceptionHandling]
206
207        # Hints
208        # they should be rebased when .rebase() is called
209        self.function_hints = []  # type: List[FunctionHint]
210
211        # Custom options
212        self._custom_entry_point = entry_point
213        self._custom_base_addr = base_addr
214        self.provides = os.path.basename(self.binary) if self.binary is not None else None
215
216        self.memory = None  # type: Clemory
217
218        # should be set inside `cle.Loader.add_object`
219        self._is_mapped = False
220        # cached max_addr
221        self._max_addr = None
222        # cached last section
223        self._last_section = None
224        # cached last segment
225        self._last_segment = None
226
227        if arch is None:
228            self.arch = None
229        elif isinstance(arch, str):
230            self.set_arch(archinfo.arch_from_id(arch))
231        elif isinstance(arch, archinfo.Arch):
232            self.set_arch(arch)
233        elif isinstance(arch, type) and issubclass(arch, archinfo.Arch):
234            self.set_arch(arch())
235        else:
236            raise CLEError("Bad parameter: arch=%s" % arch)
237
238        self._checksum()
239
240    def close(self):
241        del self._binary_stream
242
243    def __repr__(self):
244        return '<%s Object %s, maps [%#x:%#x]>' % \
245               (self.__class__.__name__, self.binary_basename, self.min_addr, self.max_addr)
246
247    def set_arch(self, arch):
248        self.arch = arch
249        self.memory = Clemory(arch) # Private virtual address space, without relocations
250
251    @property
252    def image_base_delta(self):
253        return self.mapped_base - self.linked_base
254
255    @property
256    def entry(self):
257        if self._custom_entry_point is not None:
258            return AT.from_lva(self._custom_entry_point, self).to_mva()
259        return AT.from_lva(self._entry, self).to_mva()
260
261    @property
262    def segments(self):
263        return self._segments
264
265    @segments.setter
266    def segments(self, v):
267        if isinstance(v, list):
268            self._segments = Regions(lst=v)
269        elif isinstance(v, Regions):
270            self._segments = v
271        else:
272            raise ValueError('Unsupported type %s set as sections.' % type(v))
273
274    @property
275    def sections(self):
276        return self._sections
277
278    @sections.setter
279    def sections(self, v):
280        if isinstance(v, list):
281            self._sections = Regions(lst=v)
282        elif isinstance(v, Regions):
283            self._sections = v
284        else:
285            raise ValueError('Unsupported type %s set as sections.' % type(v))
286
287    @property
288    def symbols_by_addr(self):
289        l.critical("Deprecation warning: symbols_by_addr is deprecated - use loader.find_symbol() for lookup and .symbols for enumeration")
290        return {s.rebased_addr: s for s in self.symbols}
291
292    def rebase(self, new_base):
293        """
294        Rebase backend's regions to the new base where they were mapped by the loader
295        """
296        if self._is_mapped:
297            # we could rebase an object twice if we really wanted... no need though, right?
298            raise CLEOperationError("Image already rebased from %#x to %#x" % (self.linked_base, self.mapped_base))
299
300        self.mapped_base = new_base
301
302        if self.sections:
303            self.sections._rebase(self.image_base_delta)
304        if self.segments and self.sections is not self.segments:
305            self.segments._rebase(self.image_base_delta)
306
307        for handling in self.exception_handlings:
308            if handling.func_addr is not None:
309                handling.func_addr += self.image_base_delta
310            if handling.handler_addr is not None:
311                handling.handler_addr += self.image_base_delta
312            handling.start_addr += self.image_base_delta
313
314        for hint in self.function_hints:
315            hint.addr = hint.addr + self.image_base_delta
316
317    def relocate(self):
318        """
319        Apply all resolved relocations to memory.
320
321        The meaning of "resolved relocations" is somewhat subtle - there is a linking step which attempts to resolve
322        each relocation, currently only present in the main internal loading function since the calculation of which
323        objects should be available
324        """
325        for reloc in self.relocs:
326            if reloc.resolved:
327                reloc.relocate()
328
329    def contains_addr(self, addr):
330        """
331        Is `addr` in one of the binary's segments/sections we have loaded? (i.e. is it mapped into memory ?)
332        """
333        return self.find_loadable_containing(addr) is not None
334
335    def find_loadable_containing(self, addr):
336        lookup = self.find_segment_containing if self.segments else self.find_section_containing
337        return lookup(addr)
338
339    def find_segment_containing(self, addr):
340        """
341        Returns the segment that contains `addr`, or ``None``.
342        """
343        if self._last_segment is not None and self._last_segment.contains_addr(addr):
344            return self._last_segment
345
346        r = self.segments.find_region_containing(addr)
347        if r is not None:
348            self._last_segment = r
349        return r
350
351    def find_section_containing(self, addr):
352        """
353        Returns the section that contains `addr` or ``None``.
354        """
355        if self._last_section is not None and self._last_section.contains_addr(addr):
356            return self._last_section
357
358        r = self.sections.find_region_containing(addr)
359        if r is not None:
360            self._last_section = r
361        return r
362
363    def addr_to_offset(self, addr):
364        loadable = self.find_loadable_containing(addr)
365        if loadable is not None:
366            return loadable.addr_to_offset(addr)
367        else:
368            return None
369
370    def offset_to_addr(self, offset):
371        if self.segments:
372            for s in self.segments:
373                if s.contains_offset(offset):
374                    return s.offset_to_addr(offset)
375        else:
376            for s in self.sections:
377                if s.contains_offset(offset):
378                    return s.offset_to_addr(offset)
379        return None
380
381    @property
382    def min_addr(self):
383        """
384        This returns the lowest virtual address contained in any loaded segment of the binary.
385        """
386        # Loader maps the object at chosen mapped base anyway and independently of the internal structure
387        return self.mapped_base
388
389    @property
390    def max_addr(self):
391        """
392        This returns the highest virtual address contained in any loaded segment of the binary.
393        """
394
395        if self._max_addr is None:
396            out = self.mapped_base
397            if self.segments or self.sections:
398                out = max(map(lambda x: x.max_addr, self.segments or self.sections))
399            self._max_addr = out - self.mapped_base
400        return self._max_addr + self.mapped_base
401
402    @property
403    def initializers(self): # pylint: disable=no-self-use
404        """
405        Stub function. Should be overridden by backends that can provide initializer functions that ought to be run
406        before execution reaches the entry point. Addresses should be rebased.
407        """
408        return []
409
410    @property
411    def finalizers(self): # pylint: disable=no-self-use
412        """
413        Stub function. Like initializers, but with finalizers.
414        """
415        return []
416
417    @property
418    def threads(self):  # pylint: disable=no-self-use
419        """
420        If this backend represents a dump of a running program, it may contain one or more thread contexts, i.e.
421        register files. This property should contain a list of names for these threads, which should be unique.
422        """
423        return []
424
425    def thread_registers(self, thread=None):  # pylint: disable=no-self-use,unused-argument
426        """
427        If this backend represents a dump of a running program, it may contain one or more thread contexts, i.e.
428        register files. This method should return the register file for a given thread (as named in ``Backend.threads``)
429        as a dict mapping register names (as seen in archinfo) to numbers. If the thread is not specified, it should
430        return the context for a "default" thread. If there are no threads, it should return an empty dict.
431        """
432        return {}
433
434    def initial_register_values(self):
435        """
436        Deprecated
437        """
438        l.critical("Deprecation warning: initial_register_values is deprecated - use backend.thread_registers() instead")
439        return self.thread_registers().items()
440
441    def get_symbol(self, name): # pylint: disable=no-self-use,unused-argument
442        """
443        Stub function. Implement to find the symbol with name `name`.
444        """
445        if name in self._symbol_cache:
446            return self._symbol_cache[name]
447        return None
448
449    @staticmethod
450    def extract_soname(path): # pylint: disable=unused-argument
451        """
452        Extracts the shared object identifier from the path, or returns None if it cannot.
453        """
454        return None
455
456    @classmethod
457    def is_compatible(cls, stream):  # pylint:disable=unused-argument
458        """
459        Determine quickly whether this backend can load an object from this stream
460        """
461        return False
462
463    @classmethod
464    def check_compatibility(cls, spec, obj): # pylint: disable=unused-argument
465        """
466        Performs a minimal static load of ``spec`` and returns whether it's compatible with other_obj
467        """
468        return False
469
470    @classmethod
471    def check_magic_compatibility(cls, stream): # pylint: disable=unused-argument
472        """
473        Check if a stream of bytes contains the same magic number as the main object
474        """
475        return False
476
477    @staticmethod
478    def _get_symbol_relative_addr(symbol):
479        return symbol.relative_addr
480
481    def _checksum(self):
482        """
483        Calculate MD5 and SHA256 checksum for the binary.
484        """
485
486        if self._binary_stream is not None:
487            data = self._binary_stream.read()
488            self._binary_stream.seek(0)
489            self.md5 = hashlib.md5(data).digest()
490            self.sha256 = hashlib.sha256(data).digest()
491
492    def __getstate__(self):
493        return self.__dict__
494
495    def __setstate__(self, state):
496        self.__dict__.update(state)
497        for sym in self.symbols:
498            sym.owner = self
499
# Registry of backend classes, keyed by the name they were registered under
ALL_BACKENDS = {}


def register_backend(name, cls):
    """
    Register `cls` in ``ALL_BACKENDS`` under `name`, replacing any backend already registered with that name.
    """
    ALL_BACKENDS[name] = cls
505
506
507from .elf import ELF, ELFCore, MetaELF
508from .pe import PE
509#from .idabin import IDABin
510from .blob import Blob
511from .cgc import CGC, BackedCGC
512from .ihex import Hex
513from .minidump import Minidump
514from .macho import MachO
515from .named_region import NamedRegion
516from .java.jar import Jar
517from .java.apk import Apk
518from .java.soot import Soot
519from .xbe import XBE
520from .static_archive import StaticArchive
521
522try:
523    from .binja import BinjaBin
524except Exception:  # pylint:disable=broad-except
525    l.warning("Binary Ninja is installed in the environment but the BinjaBin backend fails to initialize. Your Binary "
526              "Ninja might be too old.",
527              exc_info=True)
528