ebooks/mobi/utils.py

#!/usr/local/bin/python3.8
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai


__license__   = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import struct, string, zlib, os
from collections import OrderedDict
from io import BytesIO

from calibre.utils.img import save_cover_data_to, scale_image, image_to_data, image_from_data, resize_image, png_data_to_gif_data
from calibre.utils.imghdr import what
from calibre.ebooks import normalize
from polyglot.builtins import as_bytes
from tinycss.color3 import parse_color_string

# Default value of rescale_image()'s maxsizeb parameter: the target maximum
# size, in bytes, of a rescaled image (10 MiB)
IMAGE_MAX_SIZE = 10 * 1024 * 1024
# Number of bytes create_text_record() reads from the text stream for a
# single record
RECORD_SIZE = 0x1000  # 4096 (Text record size (uncompressed))


class PolyglotDict(dict):

    '''
    A dict whose item access treats str keys as their UTF-8 encoded bytes.

    Only ``d[key] = val``, ``d[key]`` and ``key in d`` perform the
    conversion. Every other dict method (the constructor, ``get``, ``pop``,
    ``update``, ...) is inherited unchanged and uses the key as given.
    '''

    @staticmethod
    def _stored_key(key):
        # str keys are stored and looked up as UTF-8 bytes, any other key
        # type is used untouched
        return key.encode('utf-8') if isinstance(key, str) else key

    def __setitem__(self, key, val):
        dict.__setitem__(self, self._stored_key(key), val)

    def __getitem__(self, key):
        return dict.__getitem__(self, self._stored_key(key))

    def __contains__(self, key):
        return dict.__contains__(self, self._stored_key(key))


def decode_string(raw, codec='utf-8', ordt_map=None):
    '''
    Decode a length prefixed string from the start of raw.

    The first byte of raw is the number of payload bytes that follow it.

    :param raw: Raw binary data as a bytestring
    :param codec: Codec used to decode the payload when ordt_map is not used
    :param ordt_map: If truthy, a mapping from byte value to text. The
    payload is then translated byte by byte through it instead of being
    decoded with codec.

    :return: The decoded text and the number of bytes consumed. The consumed
    count is always the declared length plus one, even when raw holds fewer
    bytes than declared.
    '''
    (size,) = struct.unpack(b'>B', raw[:1])
    payload = raw[1:size + 1]
    consumed = size + 1
    if not ordt_map:
        return payload.decode(codec), consumed
    text = ''.join(ordt_map[code] for code in bytearray(payload))
    return text, consumed


def decode_hex_number(raw, codec='utf-8'):
    '''
    Read a variable length number stored as hexadecimal text.

    The first byte of raw gives the number of bytes that follow, and those
    bytes are the hexadecimal digits of the number (read with
    decode_string()).

    :param raw: Raw binary data as a bytestring
    :param codec: Codec used to decode the digit bytes

    :return: The number and the number of bytes from raw that the number
    occupies.
    '''
    digits, consumed = decode_string(raw, codec=codec)
    number = int(digits, 16)
    return number, consumed


def encode_string(raw):
    '''
    Return raw (passed through as_bytes()) prefixed with a single byte giving
    its length in bytes. This is the layout decode_string() reads.

    Raises ValueError if the length does not fit in one byte.
    '''
    payload = bytearray(as_bytes(raw))
    prefixed = bytearray((len(payload),))
    prefixed.extend(payload)
    return bytes(prefixed)


def encode_number_as_hex(num):
    '''
    Encode num as a length prefixed, upper case hexadecimal number and return
    the resulting bytestring.

    The digits are left padded with a ``0`` to an even count and then passed
    to encode_string(), so the first byte of the result is the number of
    digit bytes that follow.
    '''
    digits = hex(num)[2:].upper().encode('ascii')
    if len(digits) % 2:
        # Odd number of digits, pad to a whole number of digit pairs
        digits = b'0' + digits
    return encode_string(digits)


def encint(value, forward=True):
    '''
    Encode the non-negative integer ``value`` as a variable width integer
    (vwi) and return the bytestring.

    The value is split into 7 bit groups stored big-endian, one group in the
    low 7 bits of each byte. Bit 8 is set on exactly one byte to mark where
    the number ends:

      * forward=True: bit 8 is set on the last byte, so the number can be
        read from the start of a buffer (see decint(forward=True)).
      * forward=False: bit 8 is set on the first byte, so the number can be
        read backwards from the end of a buffer (see decint(forward=False)).

    For example, the number 0x11111 = 0b10001000100010001 is encoded with
    forward=True as:

        0x04 0x22 0x91 = 0b100 0b100010 0b10010001

    and with forward=False as:

        0x84 0x22 0x11 = 0b10000100 0b100010 0b10001

    Raises ValueError if value is negative.
    '''
    if value < 0:
        raise ValueError('Cannot encode negative numbers as vwi')
    # Collect the 7 bit groups, least significant group first
    groups = bytearray((value & 0b01111111,))
    rest = value >> 7
    while rest:
        groups.append(rest & 0b01111111)
        rest >>= 7
    # groups is still little-endian here: index 0 becomes the last output
    # byte and the final index becomes the first output byte
    marked = 0 if forward else len(groups) - 1
    groups[marked] |= 0b10000000
    return bytes(reversed(groups))


def decint(raw, forward=True):
    '''
    Decode a variable width integer from the bytestring or bytearray raw.

    With forward=True bytes are consumed from the start of raw, otherwise
    from the end of raw. Reading stops after the first byte that has bit 8
    set. If no byte has it set, all of raw is consumed.

    This is the inverse of encint(), see its docs for the byte layout.

    :return: The integer and the number of bytes read
    '''
    octets = bytearray(raw)
    if not forward:
        octets.reverse()
    # Position just past the terminating byte, or the whole input when there
    # is no terminating byte
    end = next(
        (pos + 1 for pos, octet in enumerate(octets) if octet & 0b10000000),
        len(octets))
    groups = octets[:end]
    if not forward:
        # Restore big-endian order of the 7 bit groups
        groups.reverse()
    number = 0
    for group in groups:
        number = (number << 7) | (group & 0b01111111)
    return number, end


def test_decint(num):
    '''
    Check that num survives an encint()/decint() round trip in both the
    forward and the backward encoding. Raises ValueError on a mismatch.
    '''
    for direction in (True, False):
        encoded = encint(num, forward=direction)
        expected = (num, len(encoded))
        actual = decint(encoded, forward=direction)
        if expected != actual:
            raise ValueError('Failed for num %d, forward=%r: %r != %r' % (
                num, direction, expected, actual))


def rescale_image(data, maxsizeb=IMAGE_MAX_SIZE, dimen=None):
    '''
    Re-encode the image in data, trying to keep the result at or below
    maxsizeb bytes.

    If dimen is None the image is passed through save_cover_data_to()
    (described by the original author as replacing transparent pixels with
    white and converting to JPEG). Otherwise a thumbnail is generated with
    scale_image(), using width=dimen, height=dimen or width, height = dimen
    (depending on the type of dimen).

    If the result is still too large it is re-encoded at progressively lower
    quality and, failing that, progressively shrunk. The last attempt is
    returned even if it is still larger than maxsizeb.

    Returns the image as a bytestring
    '''
    if dimen is None:
        # Replace transparent pixels with white pixels and convert to JPEG
        data = save_cover_data_to(data)
    else:
        if hasattr(dimen, '__len__'):
            width, height = dimen
        else:
            width = height = dimen
        data = scale_image(data, width=width, height=height, compression_quality=90)[-1]
    if len(data) <= maxsizeb:
        return data

    # First attempt: re-encode the same source image at decreasing quality
    source = data
    quality = 90
    while len(data) > maxsizeb and quality >= 5:
        data = image_to_data(image_from_data(source), compression_quality=quality)
        quality -= 5
    if len(data) <= maxsizeb:
        return data

    # Second attempt: shrink the dimensions. Every pass resizes the output of
    # the previous pass, using whatever quality the loop above ended with.
    scale = 0.9
    while len(data) > maxsizeb and scale >= 0.05:
        img = image_from_data(data)
        new_width = int(scale * img.width())
        new_height = int(scale * img.height())
        img = resize_image(img, new_width, new_height)
        data = image_to_data(img, compression_quality=quality)
        scale -= 0.05
    return data


def get_trailing_data(record, extra_data_flags):
    '''
    Split the trailing data entries off the end of a text record.

    Every set bit of extra_data_flags above bit 0 marks one trailing entry,
    processed from bit 1 upwards. Each entry ends with a backward encoded
    vwi holding the size of the whole entry (payload plus the vwi itself).
    If bit 0 is set, the record (after the other entries are removed) ends
    with a single size byte whose low two bits plus one give the size of the
    trailing multibyte entry, again including the size byte.

    :param record: The text record as a bytestring
    :param extra_data_flags: The extra data flags from the MOBI header

    :return: Trailing data, record - trailing data. The trailing data is an
    ordered dictionary mapping bit number to payload bytes; entries with an
    empty payload are not included.
    '''
    trailing = OrderedDict()
    # Bit 0 is handled separately at the end
    pending = extra_data_flags >> 1

    bit = 0
    while pending:
        bit += 1
        if pending & 0b1:
            size, width = decint(record, forward=False)
            if size > width:
                trailing[bit] = record[-size:-width]
            record = record[:-size]
        pending >>= 1

    if extra_data_flags & 0b1:
        # Read multibyte chars if any. Only the first two bits are used for
        # the size since there can never be more than 3 trailing multibyte
        # chars
        size = (ord(record[-1:]) & 0b11) + 1
        if size > 1:
            trailing[0] = record[-size:-1]
        record = record[:-size]
    return trailing, record


def encode_trailing_data(raw):
    '''
    Given some data in the bytestring raw, return a bytestring of the form

        <data><size>

    where size is a backwards encoded vwi whose value is the length of the
    entire returned bytestring (data plus the size bytes themselves).

    This is the encoding used for trailing data entries at the end of text
    records. See get_trailing_data() for details.
    '''
    # The size counts its own bytes, so widen the guess until the encoded
    # size is exactly as long as assumed
    width = 1
    suffix = encint(len(raw) + width, forward=False)
    while len(suffix) != width:
        width += 1
        suffix = encint(len(raw) + width, forward=False)
    return raw + suffix


def encode_fvwi(val, flags, flag_size=4):
    '''
    Pack val together with the lowest flag_size bits of flags into a single
    number (val in the high bits, the flag bits in the low bits) and return
    it as a forward encoded vwi. This encoding is used in the trailing byte
    sequences for indexing.
    '''
    shifted = val << flag_size
    low_mask = (1 << flag_size) - 1
    return encint(shifted | (flags & low_mask))


def decode_fvwi(byts, flag_size=4):
    '''
    Decode an fvwi from the start of byts: a forward encoded vwi whose
    lowest flag_size bits are flags and whose remaining bits are the number.

    Returns number, flags, consumed
    '''
    packed, consumed = decint(bytes(byts))
    low_mask = (1 << flag_size) - 1
    return packed >> flag_size, packed & low_mask, consumed


def decode_tbs(byts, flag_size=4):
    '''
    Decode one entry of a trailing byte sequence used for indexing.

    The entry starts with an fvwi (see decode_fvwi()). Its flag bits then
    decide which additional values follow, always read in this order:

      * 0b0010: a forward encoded vwi
      * 0b0100: a single byte
      * 0b0001: a forward encoded vwi

    The flag 0b1000 carries no extra bytes, it is only recorded as True (and
    only when flag_size is greater than 3).

    Returns the fvwi number, a dictionary mapping flag bits to the
    associated data and the number of bytes consumed.
    '''
    byts = bytes(byts)
    val, flags, pos = decode_fvwi(byts, flag_size=flag_size)
    extra = {}
    if flag_size > 3 and flags & 0b1000:
        extra[0b1000] = True
    if flags & 0b0010:
        number, width = decint(byts[pos:])
        extra[0b0010] = number
        pos += width
    if flags & 0b0100:
        extra[0b0100] = ord(byts[pos:pos + 1])
        pos += 1
    if flags & 0b0001:
        number, width = decint(byts[pos:])
        extra[0b0001] = number
        pos += width
    return val, extra, pos


def encode_tbs(val, extra, flag_size=4):
    '''
    Encode the number val and the data in the extra dict as one trailing
    byte sequence entry. The keys of extra are OR-ed together to form the
    flags of the leading fvwi and the values for the keys 0b0010, 0b0100 and
    0b0001 are appended in that order. See decode_tbs() for the layout.
    '''
    combined = 0
    for bit in extra:
        combined |= bit
    parts = [encode_fvwi(val, combined, flag_size=flag_size)]

    if 0b0010 in extra:
        parts.append(encint(extra[0b0010]))
    if 0b0100 in extra:
        # Stored as a single raw byte, not as a vwi
        parts.append(bytes(bytearray((extra[0b0100],))))
    if 0b0001 in extra:
        parts.append(encint(extra[0b0001]))
    return b''.join(parts)


def utf8_text(text):
    '''
    Convert a possibly null or blank string to normalized utf-8 bytes.

    Non blank input is stripped, decoded as utf-8 (with replacement of
    undecodable bytes) if it is not already a str, passed through
    normalize() and encoded as utf-8. For null, empty or whitespace only
    input the utf-8 encoding of the translated string 'Unknown' is returned
    instead. The translation function ``_`` is not defined in this file and
    is expected to be available as a builtin.
    '''
    if not (text and text.strip()):
        return _('Unknown').encode('utf-8')
    stripped = text.strip()
    if not isinstance(stripped, str):
        stripped = stripped.decode('utf-8', 'replace')
    return normalize(stripped).encode('utf-8')


def align_block(raw, multiple=4, pad=b'\0'):
    '''
    Return raw with enough copies of pad appended to make its length a
    multiple of ``multiple`` (4 by default). raw is returned unchanged when
    it is already aligned.
    '''
    remainder = len(raw) % multiple
    if remainder:
        raw = raw + pad * (multiple - remainder)
    return raw


def detect_periodical(toc, log=None):
    '''
    Detect if the TOC object toc contains a periodical that conforms to the
    structure required by kindlegen to generate a periodical.

    The first entry of toc must have class "periodical". Among its
    descendants, nodes at depth 1 must have class "article", nodes at depth
    2 class "section", nodes at depth 3 class "periodical" and no node may
    be deeper than 3.

    :param log: Optional logger. If given, the reason for rejecting the TOC
    is reported through log.debug().
    '''
    if toc.count() < 1 or toc[0].klass != 'periodical':
        return False

    def reject(reason):
        if log is not None:
            log.debug(reason)
        return False

    # depth -> (required class, message logged when the class differs)
    required = {
        1: ('article',
            'Not a periodical: Deepest node does not have class="article"'),
        2: ('section',
            'Not a periodical: Second deepest node does not have class="section"'),
        3: ('periodical',
            'Not a periodical: Third deepest node does not have class="periodical"'),
    }
    for node in toc.iterdescendants():
        depth = node.depth()
        if depth > 3:
            return reject('Not a periodical: Has nodes of depth > 3')
        rule = required.get(depth)
        if rule is not None and node.klass != rule[0]:
            return reject(rule[1])
    return True


def count_set_bits(num):
    '''
    Return the number of bits set to 1 in the binary representation of the
    absolute value of the integer num.
    '''
    return bin(abs(num)).count('1')


def to_base(num, base=32, min_num_digits=None):
    '''
    Return the integer num written in the given base, using the digits 0-9
    followed by the upper case letters A-Z.

    If min_num_digits is not None the digits are left padded with zeros to
    at least that many digits. Negative numbers get a leading ``-`` which is
    not counted as a digit.
    '''
    alphabet = string.digits + string.ascii_uppercase
    if num == 0:
        return '0' if min_num_digits is None else '0' * min_num_digits
    negative = not num >= 0
    magnitude = -num if negative else num
    symbols = []
    while magnitude:
        magnitude, remainder = divmod(magnitude, base)
        symbols.append(alphabet[remainder])
    # symbols were produced least significant first
    text = ''.join(reversed(symbols))
    if min_num_digits is not None:
        text = text.rjust(min_num_digits, '0')
    return '-' + text if negative else text


def mobify_image(data):
    '''
    Return data converted to GIF if it is detected as a PNG image, as the
    Kindle cannot display some PNG images. Data in any other format is
    returned unchanged.
    '''
    if what(None, data) != 'png':
        return data
    return png_data_to_gif_data(data)

# Font records {{{


def read_font_record(data, extent=1040):
    '''
    Return the font encoded in the MOBI FONT record represented by data.
    The return value is a dict with fields raw_data, font_data, err, ext,
    headers and encrypted.

    :param data: The complete FONT record as a bytestring
    :param extent: The number of obfuscated bytes. So far I have only
    encountered files with 1040 obfuscated bytes. If you encounter an
    obfuscated record for which this function fails, try different extent
    values (easily automated). None means the whole font data is obfuscated.

    raw_data is the raw data in the font record
    font_data is the decoded font_data or None if an error occurred
    err is not None if some error occurred
    ext is the font type (ttf for TrueType, otf for OpenType, dat for
    unknown and failed if an error occurred)
    headers is the dict of decoded header fields from the font record or
    None if decoding the header failed
    encrypted is True if the font data was XOR obfuscated

    Malformed records are reported through err, they do not raise.
    '''
    # Format:
    # bytes  0 -  3:  'FONT'
    # bytes  4 -  7:  Uncompressed size
    # bytes  8 - 11:  flags
    #                   bit 1 - zlib compression
    #                   bit 2 - XOR obfuscated
    # bytes 12 - 15:  offset to start of compressed data
    # bytes 16 - 19:  length of XOR string
    # bytes 20 - 23:  offset to start of XOR data
    # The zlib compressed data begins with 2 bytes of header and
    # has 4 bytes of checksum at the end
    ans = {'raw_data':data, 'font_data':None, 'err':None, 'ext':'failed',
            'headers':None, 'encrypted':False}

    try:
        usize, flags, dstart, xor_len, xor_start = struct.unpack_from(
                b'>LLLLL', data, 4)
    except (struct.error, TypeError):
        # Record too short to hold the header, or not binary data at all
        ans['err'] = 'Failed to read font record header fields'
        return ans
    font_data = data[dstart:]
    ans['headers'] = {'usize':usize, 'flags':bin(flags), 'xor_len':xor_len,
            'xor_start':xor_start, 'dstart':dstart}

    if flags & 0b10:
        # De-obfuscate the data
        key = bytearray(data[xor_start:xor_start+xor_len])
        buf = bytearray(font_data)
        extent = len(font_data) if extent is None else extent
        extent = min(extent, len(font_data))

        if extent > 0 and (xor_len == 0 or len(key) < xor_len):
            # A missing or truncated key cannot be applied: indexing it
            # below would fail with ZeroDivisionError or IndexError
            ans['err'] = 'Invalid XOR key in obfuscated font record'
            return ans

        for n in range(extent):
            buf[n] ^= key[n%xor_len]  # XOR of buf and key

        font_data = bytes(buf)
        ans['encrypted'] = True

    if flags & 0b1:
        # ZLIB compressed data
        try:
            font_data = zlib.decompress(font_data)
        except Exception as e:
            ans['err'] = 'Failed to zlib decompress font data (%s)'%e
            return ans

        if len(font_data) != usize:
            ans['err'] = 'Uncompressed font size mismatch'
            return ans

    ans['font_data'] = font_data
    # Guess the font type from the first four bytes of the decoded data
    sig = font_data[:4]
    ans['ext'] = ('ttf' if sig in {b'\0\1\0\0', b'true', b'ttcf'}
                    else 'otf' if sig == b'OTTO' else 'dat')

    return ans


def write_font_record(data, obfuscate=True, compress=True):
    '''
    Write the ttf/otf font represented by data into a font record and return
    the record as a bytestring. See read_font_record() for details on the
    format of the record.

    :param obfuscate: If True and the (possibly compressed) font data is at
    least 1040 bytes long, its first 1040 bytes are XOR-ed with a random 20
    byte key that is stored in the record.
    :param compress: If True the font data is zlib compressed
    '''
    uncompressed_size = len(data)
    flags = 0
    payload = data
    if compress:
        flags |= 0b1
        payload = zlib.compress(payload, 9)

    xor_key = b''
    if obfuscate and len(payload) >= 1040:
        flags |= 0b10
        key_len = 20
        xor_key = os.urandom(key_len)
        scrambled = bytearray(payload)
        # Only the leading 1040 bytes are obfuscated
        scrambled[:1040] = bytes(
            byte ^ xor_key[pos % key_len]
            for pos, byte in enumerate(scrambled[:1040]))
        payload = bytes(scrambled)

    # Layout: 'FONT', five big-endian 32 bit fields, the key, the font data
    fields = struct.Struct(b'>5L')
    key_start = fields.size + 4
    data_start = key_start + len(xor_key)
    header = b'FONT' + fields.pack(
        uncompressed_size, flags, data_start, len(xor_key), key_start)

    return header + xor_key + payload

# }}}


def create_text_record(text):
    '''
    Return a Palmdoc record of at most RECORD_SIZE bytes read from the
    current position of the text file object.
    In case the record ends in the middle of a multibyte character return
    the overlap as well.

    :param text: A seekable binary file like object (tell(), seek() and
    read() are used) holding utf-8 encoded text

    Returns data, overlap: where both are byte strings. overlap is the
    extra bytes needed to complete the truncated multibyte character.

    On return the file object is positioned at the start of the next record,
    that is RECORD_SIZE bytes after the starting position or at the end of
    the file, whichever comes first.
    '''
    # Remember where this record starts, then seek to the end of the file
    # (whence=2) to learn the total size
    opos = text.tell()
    text.seek(0, 2)
    # npos is the position of the next record
    npos = min((opos + RECORD_SIZE, text.tell()))
    # Number of bytes from the next record needed to complete the last
    # character in this record
    extra = 0

    # Grow a window that ends at npos, one byte at a time, until it contains
    # at least one decodable character
    # NOTE(review): for an empty stream npos is 0, so this seeks to a
    # negative offset -- presumably callers never pass an empty stream, confirm
    last = b''
    while not last.decode('utf-8', 'ignore'):
        # last contains no valid utf-8 characters
        size = len(last) + 1
        text.seek(npos - size)
        last = text.read(size)

    # last now has one valid utf-8 char and possibly some bytes that belong
    # to a truncated char

    try:
        last.decode('utf-8', 'strict')
    except UnicodeDecodeError:
        # There are some truncated bytes in last
        prev = len(last)
        # Re-read from the same start, one byte further past npos each time,
        # until the window decodes cleanly
        # NOTE(review): if the stream ends before the character is complete
        # the read stops growing and this loop would not terminate -- assumes
        # the stream holds valid utf-8, confirm against callers
        while True:
            text.seek(npos - prev)
            last = text.read(len(last) + 1)
            try:
                last.decode('utf-8')
            except UnicodeDecodeError:
                pass
            else:
                break
        # Bytes that had to be borrowed from beyond npos
        extra = len(last) - prev

    # Read the record itself, then the borrowed bytes that follow it
    text.seek(opos)
    data = text.read(RECORD_SIZE)
    overlap = text.read(extra)
    # Leave the stream at the start of the next record, the overlap bytes
    # are not skipped
    text.seek(npos)

    return data, overlap


class CNCX:  # {{{

    '''
    Create the CNCX records. These are records containing all the strings from
    an index. Each string is stored as: <vwi string size><utf-8 encoded
    string>

    After construction, ``records`` is the list of record bytestrings (each
    padded with align_block()) and ``strings`` maps every string to its
    offset. The offset is the index of the record containing the string
    multiplied by 0x10000 plus the position of the string in that record.
    '''

    MAX_STRING_LENGTH = 500

    def __init__(self, strings=()):
        self.strings = OrderedDict((s, 0) for s in strings)
        self.records = []

        # kindlegen appears to use 1024, PDB limit is 0x10000
        record_limit = 0x10000 - 1024
        pending = BytesIO()
        offset = 0
        for key in self.strings:
            # Strings longer than MAX_STRING_LENGTH characters are truncated
            encoded = utf8_text(key[:self.MAX_STRING_LENGTH])
            entry = encint(len(encoded)) + encoded
            if pending.tell() + len(entry) > record_limit:
                # The current record is full: flush it and start a new one
                self.records.append(align_block(pending.getvalue()))
                pending = BytesIO()
                offset = len(self.records) * 0x10000
            pending.write(entry)
            self.strings[key] = offset
            offset += len(entry)

        leftover = pending.getvalue()
        if leftover:
            self.records.append(align_block(leftover))

    def __getitem__(self, string):
        # Offset of the given string, KeyError if it was not passed in
        return self.strings[string]

    def __bool__(self):
        # True if at least one record was created
        return bool(self.records)
    __nonzero__ = __bool__

    def __len__(self):
        # Number of records
        return len(self.records)

# }}}


def is_guide_ref_start(ref):
    '''
    Return a truthy value if the guide reference ref marks the start of the
    book: either its title is "start" or its type is one of "start",
    "other.start" or "text" (all compared case insensitively).

    When the title does not match and ref.type is falsy, ref.type itself is
    returned.
    '''
    if ref.title.lower() == 'start':
        return True
    kind = ref.type
    return kind and kind.lower() in {'start', 'other.start', 'text'}


def convert_color_for_font_tag(val):
    '''
    Convert the CSS color value val into a ``#rrggbb`` string.

    val is parsed with parse_color_string(). If it cannot be parsed, or is
    ``currentColor``, str(val) is returned unchanged. Otherwise the first
    three channels of the parsed color are clamped to the range 0..1 and
    written as two hexadecimal digits each; a fourth (alpha) channel is
    ignored.
    '''
    rgba = parse_color_string(str(val or ''))
    if rgba is None or rgba == 'currentColor':
        return str(val)

    def clamp(channel):
        # Channels outside 0..1 must be clipped, otherwise negative values
        # would be formatted with a minus sign and give an invalid color
        return max(0, min(channel, 1))

    return '#' + ''.join(
        '%02x' % int(clamp(channel) * 255) for channel in rgba[:3])