lib/fileformat/binary.py

#!/usr/local/bin/python3.8
#
# PLASMA : Generate an indented asm code (pseudo-C) with colored syntax.
# Copyright (C) 2015    Joel
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.    If not, see <http://www.gnu.org/licenses/>.
#

import bisect
from time import time
import subprocess

from plasma.lib.utils import debug__, print_no_end, get_char, BYTES_PRINTABLE_SET
from plasma.lib.colors import color_section

T_BIN_ELF = 0
T_BIN_PE  = 1
T_BIN_RAW = 2
T_BIN_UNK = 3


class SectionAbs():
    # virt_size: size of the mapped section in memory
    def __init__(self, name, start, virt_size, real_size, is_exec, is_data, is_bss, data):
        self.name = name
        self.start = start
        self.virt_size = virt_size
        self.real_size = real_size
        self.end = start + virt_size - 1
        self.real_end = start + real_size - 1
        self.is_exec = is_exec
        self.is_data = is_data
        self.is_bss = is_bss
        self.data = data
        self.big_endian = False # set in lib.disassembler

    def print_header(self):
        print_no_end(color_section(self.name.ljust(20)))
        print_no_end(" [ ")
        print_no_end(hex(self.start))
        print_no_end(" - ")
        print_no_end(hex(self.end))
        print_no_end(" - %d - %d" % (self.virt_size, self.real_size))
        print(" ]")

    def read(self, ad, size):
        if ad > self.real_end:
            return b""
        off = ad - self.start
        return self.data[off:off + size]

    def read_int(self, ad, size):
        if size == 1:
            return self.read_byte(ad)
        if size == 2:
            return self.read_word(ad)
        if size == 4:
            return self.read_dword(ad)
        if size == 8:
            return self.read_qword(ad)
        return None

    def read_byte(self, ad):
        if ad > self.real_end:
            return None
        off = ad - self.start
        return self.data[off]

    def read_word(self, ad):
        if ad > self.real_end:
            return None
        off = ad - self.start
        w = self.data[off:off+2]
        if len(w) != 2:
            return None
        if self.big_endian:
            return (w[0] << 8) + w[1]
        return (w[1] << 8) + w[0]

    def read_dword(self, ad):
        if ad > self.real_end:
            return None
        off = ad - self.start
        w = self.data[off:off+4]
        if len(w) != 4:
            return None
        if self.big_endian:
            return (w[0] << 24) + (w[1] << 16) + (w[2] << 8) + w[3]
        return (w[3] << 24) + (w[2] << 16) + (w[1] << 8) + w[0]

    def read_qword(self, ad):
        if ad > self.real_end:
            return None
        off = ad - self.start
        w = self.data[off:off+8]
        if len(w) != 8:
            return None
        if self.big_endian:
            return (w[0] << 56) + (w[1] << 48) + (w[2] << 40) + (w[3] << 32) + \
                   (w[4] << 24) + (w[5] << 16) + (w[6] << 8) + w[7]
        return (w[7] << 56) + (w[6] << 48) + (w[5] << 40) + (w[4] << 32) + \
               (w[3] << 24) + (w[2] << 16) + (w[1] << 8) + w[0]


class SegmentAbs(SectionAbs):
    def __init__(self, name, start, virt_size, real_size, is_exec, is_data,
                 data, file_offset, big_endian):
        self.name = name
        self.start = start
        self.virt_size = virt_size
        self.real_size = real_size
        self.end = start + virt_size - 1
        self.real_end = start + real_size - 1
        self.is_exec = is_exec
        self.is_data = is_data
        self.file_offset = file_offset
        self.data = data
        self.big_endian = big_endian
        self.is_bss = False


class Binary(object):
    def __init__(self):
        self.reverse_symbols = {} # ad -> name
        self.symbols = {} # name -> ad
        self.section_names = {}
        self.demangled = {} # name -> ad
        self.reverse_demangled = {} # ad -> name
        self.imports = {} # ad -> True (the bool is just for msgpack to save the database)
        self._abs_sections = {} # start section -> SectionAbs
        self._sorted_sections = [] # bisect list, contains section start address

        # for elf
        self._abs_segments = {}
        self._sorted_segments = []

        # To be compatible with CLE, used only in ELF
        self.rebase_addr = 0

        # It will be set in Disassembler !
        self.wordsize = 0
        self.type = -1

        # It will be set in Console !
        self.api = None


    def get_section(self, ad):
        i = bisect.bisect_right(self._sorted_sections, ad)
        if not i:
            return None
        start = self._sorted_sections[i - 1]
        s = self._abs_sections[start]
        if ad <= s.end:
            return s
        return None


    def add_section(self, start_address, name, virt_size, real_size,
                    is_exec, is_data, is_bss, data):
        if is_exec or is_data:
            bisect.insort_left(self._sorted_sections, start_address)
        self._abs_sections[start_address] = SectionAbs(
                name,
                start_address,
                virt_size,
                real_size,
                is_exec,
                is_data,
                is_bss,
                data)


    # for elf
    def get_segment(self, ad):
        i = bisect.bisect_right(self._sorted_segments, ad)
        if not i:
            return None
        start = self._sorted_segments[i - 1]
        s = self._abs_segments[start]
        if ad <= s.end:
            return s
        return None


    def is_address(self, ad):
        s = self.get_section(ad)
        return s is not None and s.start != 0


    def get_next_section(self, ad):
        i = bisect.bisect_right(self._sorted_sections, ad)
        if i >= len(self._sorted_sections):
            return None
        start = self._sorted_sections[i]
        s = self._abs_sections[start]
        if ad <= s.end:
            return s
        return None


    def get_first_addr(self):
        return self._sorted_sections[0]


    def get_last_addr(self):
        ad = self._sorted_sections[-1]
        return self._abs_sections[ad].end


    def read(self, ad, size):
        s = self.get_section(ad)
        if s is None:
            return b""
        return s.read(ad, size)


    def read_byte(self, ad):
        s = self.get_section(ad)
        if ad > s.real_end:
            return None
        return s.read_byte(ad)


    def rename_sym(self, name):
        count = 0
        n = "%s_%d" % (name, count)
        while n in self.symbols:
            n = "%s_%d" % (name, count)
            count += 1
        return n


    # not optimized
    def get_section_by_name(self, name):
        for s in self._abs_sections.values():
            if s.name == name:
                return s
        return None


    def get_prev_section(self, ad):
        s = self.get_section(ad)
        i = bisect.bisect_right(self._sorted_sections, s.start - 1)
        if i == 0:
            return None
        start = self._sorted_sections[i - 1]
        return self._abs_sections[start]


    def iter_sections(self):
        for ad in self._sorted_sections:
            yield self._abs_sections[ad]


    # TODO : move in SectionAbs
    def get_string(self, addr, max_data_size=-1, s=None):
        if s is None:
            s = self.get_section(addr)
            if s is None:
                return None

        data = s.data
        off = addr - s.start
        txt = []

        c = 0
        i = 0
        while (i < max_data_size or max_data_size == -1) and off < len(data):
            c = data[off]
            if c == 0:
                break
            if c not in BYTES_PRINTABLE_SET:
                break
            txt.append(get_char(c))
            off += 1
            i += 1

        if i == max_data_size:
            if c != 0:
                txt.append("...")
        elif c != 0 or i == 0:
            return None

        return ''.join(txt)


    # Returns the size of the string or 0 if it's not an ascii string
    def is_string(self, addr, min_bytes=3, s=None):
        if s is None:
            s = self.get_section(addr)
            if s is None:
                return 0

        data = s.data
        off = addr - s.start
        n = 0
        c = 0
        while off < len(data):
            c = data[off]
            if c == 0:
                n += 1
                break
            if c in BYTES_PRINTABLE_SET:
                n += 1
            else:
                break
            off += 1

        # consider this is a string when there are more than 2 chars
        # with a null byte
        if c == 0 and n >= min_bytes:
            return n
        return 0


    def load_section_names(self):
        # Used for the auto-completion
        for ad, sec in self._abs_sections.items():
            self.section_names[sec.name] = ad


    def demangle_symbols(self):
        addr = []
        lookup_names = []
        for n, ad in self.symbols.items():
            if n.startswith("_Z") or n.startswith("__Z"):
                addr.append(ad)
                lookup_names.append(n.split("@@")[0])

        if not addr:
            return

        # http://stackoverflow.com/questions/6526500/c-name-mangling-library-for-python
        args = ["c++filt", "-p"]
        pipe = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        stdout, _ = pipe.communicate('\n'.join(lookup_names).encode('utf-8'))
        demangled = stdout.split(b"\n")[:-1]

        self.reverse_demangled = dict(zip(addr, demangled))

        for ad, n in self.reverse_demangled.items():
            n = n.decode()
            self.reverse_demangled[ad] = n
            self.demangled[n] = ad


    def load_static_sym(self):
        return


    def load_dyn_sym(self):
        return


    def is_big_endian(self):
        raise NotImplementedError


    def get_entry_point(self):
        raise NotImplementedError