conversion/plugins/epub_input.py

__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os, re, posixpath
from itertools import cycle

from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation

ADOBE_OBFUSCATION =  'http://ns.adobe.com/pdf/enc#RC'
IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'


def decrypt_font_data(key, data, algorithm):
    is_adobe = algorithm == ADOBE_OBFUSCATION
    crypt_len = 1024 if is_adobe else 1040
    crypt = bytearray(data[:crypt_len])
    key = cycle(iter(bytearray(key)))
    decrypt = bytes(bytearray(x^next(key) for x in crypt))
    return decrypt + data[crypt_len:]


def decrypt_font(key, path, algorithm):
    with lopen(path, 'r+b') as f:
        data = decrypt_font_data(key, f.read(), algorithm)
        f.seek(0), f.truncate(), f.write(data)


class EPUBInput(InputFormatPlugin):

    name        = 'EPUB Input'
    author      = 'Kovid Goyal'
    description = _('Convert EPUB files (.epub) to HTML')
    file_types  = {'epub'}
    output_encoding = None
    commit_name = 'epub_input'

    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}

    def process_encryption(self, encfile, opf, log):
        from lxml import etree
        import uuid, hashlib
        idpf_key = opf.raw_unique_identifier
        if idpf_key:
            idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
            idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
        key = None
        for item in opf.identifier_iter():
            scheme = None
            for xkey in item.attrib.keys():
                if xkey.endswith('scheme'):
                    scheme = item.get(xkey)
            if (scheme and scheme.lower() == 'uuid') or \
                    (item.text and item.text.startswith('urn:uuid:')):
                try:
                    key = item.text.rpartition(':')[-1]
                    key = uuid.UUID(key).bytes
                except:
                    import traceback
                    traceback.print_exc()
                    key = None

        try:
            root = etree.parse(encfile)
            for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
                algorithm = em.get('Algorithm', '')
                if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
                    return False
                cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
                uri = cr.get('URI')
                path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
                tkey = (key if algorithm == ADOBE_OBFUSCATION else idpf_key)
                if (tkey and os.path.exists(path)):
                    self._encrypted_font_uris.append(uri)
                    decrypt_font(tkey, path, algorithm)
            return True
        except:
            import traceback
            traceback.print_exc()
        return False

    def set_guide_type(self, opf, gtype, href=None, title=''):
        # Set the specified guide entry
        for elem in list(opf.iterguide()):
            if elem.get('type', '').lower() == gtype:
                elem.getparent().remove(elem)

        if href is not None:
            t = opf.create_guide_item(gtype, title, href)
            for guide in opf.root.xpath('./*[local-name()="guide"]'):
                guide.append(t)
                return
            guide = opf.create_guide_element()
            opf.root.append(guide)
            guide.append(t)
            return t

    def rationalize_cover3(self, opf, log):
        ''' If there is a reference to the cover/titlepage via manifest properties, convert to
        entries in the <guide> so that the rest of the pipeline picks it up. '''
        from calibre.ebooks.metadata.opf3 import items_with_property
        removed = guide_titlepage_href = guide_titlepage_id = None

        # Look for titlepages incorrectly marked in the <guide> as covers
        guide_cover, guide_elem = None, None
        for guide_elem in opf.iterguide():
            if guide_elem.get('type', '').lower() == 'cover':
                guide_cover = guide_elem.get('href', '').partition('#')[0]
                break
        if guide_cover:
            spine = list(opf.iterspine())
            if spine:
                idref = spine[0].get('idref', '')
                for x in opf.itermanifest():
                    if x.get('id') == idref and x.get('href') == guide_cover:
                        guide_titlepage_href = guide_cover
                        guide_titlepage_id = idref
                        break

        raster_cover_href = opf.epub3_raster_cover or opf.raster_cover
        if raster_cover_href:
            self.set_guide_type(opf, 'cover', raster_cover_href, 'Cover Image')
        titlepage_id = titlepage_href = None
        for item in items_with_property(opf.root, 'calibre:title-page'):
            tid, href = item.get('id'), item.get('href')
            if href and tid:
                titlepage_id, titlepage_href = tid, href.partition('#')[0]
                break
        if titlepage_href is None:
            titlepage_href, titlepage_id = guide_titlepage_href, guide_titlepage_id
        if titlepage_href is not None:
            self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title page')
            spine = list(opf.iterspine())
            if len(spine) > 1:
                for item in spine:
                    if item.get('idref') == titlepage_id:
                        log('Found HTML cover', titlepage_href)
                        if self.for_viewer:
                            item.attrib.pop('linear', None)
                        else:
                            item.getparent().remove(item)
                            removed = titlepage_href
                        return removed

    def rationalize_cover2(self, opf, log):
        ''' Ensure that the cover information in the guide is correct. That
        means, at most one entry with type="cover" that points to a raster
        cover and at most one entry with type="titlepage" that points to an
        HTML titlepage. '''
        from calibre.ebooks.oeb.base import OPF
        removed = None
        from lxml import etree
        guide_cover, guide_elem = None, None
        for guide_elem in opf.iterguide():
            if guide_elem.get('type', '').lower() == 'cover':
                guide_cover = guide_elem.get('href', '').partition('#')[0]
                break
        if not guide_cover:
            raster_cover = opf.raster_cover
            if raster_cover:
                if guide_elem is None:
                    g = opf.root.makeelement(OPF('guide'))
                    opf.root.append(g)
                else:
                    g = guide_elem.getparent()
                guide_cover = raster_cover
                guide_elem = g.makeelement(OPF('reference'), attrib={'href':raster_cover, 'type':'cover'})
                g.append(guide_elem)
            return
        spine = list(opf.iterspine())
        if not spine:
            return
        # Check if the cover specified in the guide is also
        # the first element in spine
        idref = spine[0].get('idref', '')
        manifest = list(opf.itermanifest())
        if not manifest:
            return
        elem = [x for x in manifest if x.get('id', '') == idref]
        if not elem or elem[0].get('href', None) != guide_cover:
            return
        log('Found HTML cover', guide_cover)

        # Remove from spine as covers must be treated
        # specially
        if not self.for_viewer:
            if len(spine) == 1:
                log.warn('There is only a single spine item and it is marked as the cover. Removing cover marking.')
                for guide_elem in tuple(opf.iterguide()):
                    if guide_elem.get('type', '').lower() == 'cover':
                        guide_elem.getparent().remove(guide_elem)
                return
            else:
                spine[0].getparent().remove(spine[0])
                removed = guide_cover
        else:
            # Ensure the cover is displayed as the first item in the book, some
            # epub files have it set with linear='no' which causes the cover to
            # display in the end
            spine[0].attrib.pop('linear', None)
            opf.spine[0].is_linear = True
        # Ensure that the guide has a cover entry pointing to a raster cover
        # and a titlepage entry pointing to the html titlepage. The titlepage
        # entry will be used by the epub output plugin, the raster cover entry
        # by other output plugins.

        # Search for a raster cover identified in the OPF
        raster_cover = opf.raster_cover

        # Set the cover guide entry
        if raster_cover is not None:
            guide_elem.set('href', raster_cover)
        else:
            # Render the titlepage to create a raster cover
            from calibre.ebooks import render_html_svg_workaround
            guide_elem.set('href', 'calibre_raster_cover.jpg')
            t = etree.SubElement(
                elem[0].getparent(), OPF('item'), href=guide_elem.get('href'), id='calibre_raster_cover')
            t.set('media-type', 'image/jpeg')
            if os.path.exists(guide_cover):
                renderer = render_html_svg_workaround(guide_cover, log)
                if renderer is not None:
                    with lopen('calibre_raster_cover.jpg', 'wb') as f:
                        f.write(renderer)

        # Set the titlepage guide entry
        self.set_guide_type(opf, 'titlepage', guide_cover, 'Title page')
        return removed

    def find_opf(self):
        from calibre.utils.xml_parse import safe_xml_fromstring

        def attr(n, attr):
            for k, v in n.attrib.items():
                if k.endswith(attr):
                    return v
        try:
            with lopen('META-INF/container.xml', 'rb') as f:
                root = safe_xml_fromstring(f.read())
                for r in root.xpath('//*[local-name()="rootfile"]'):
                    if attr(r, 'media-type') != "application/oebps-package+xml":
                        continue
                    path = attr(r, 'full-path')
                    if not path:
                        continue
                    path = os.path.join(os.getcwd(), *path.split('/'))
                    if os.path.exists(path):
                        return path
        except Exception:
            import traceback
            traceback.print_exc()

    def convert(self, stream, options, file_ext, log, accelerators):
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError
        from calibre.ebooks.metadata.opf2 import OPF
        try:
            zf = ZipFile(stream)
            zf.extractall(os.getcwd())
        except:
            log.exception('EPUB appears to be invalid ZIP file, trying a'
                    ' more forgiving ZIP parser')
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream)
        encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
        opf = self.find_opf()
        if opf is None:
            for f in walk('.'):
                if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                        not os.path.basename(f).startswith('.'):
                    opf = os.path.abspath(f)
                    break
        path = getattr(stream, 'name', 'stream')

        if opf is None:
            raise ValueError('%s is not a valid EPUB file (could not find opf)'%path)

        opf = os.path.relpath(opf, os.getcwd())
        parts = os.path.split(opf)
        opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

        self._encrypted_font_uris = []
        if os.path.exists(encfile):
            if not self.process_encryption(encfile, opf, log):
                raise DRMError(os.path.basename(path))
        self.encrypted_fonts = self._encrypted_font_uris

        if len(parts) > 1 and parts[0]:
            delta = '/'.join(parts[:-1])+'/'

            def normpath(x):
                return posixpath.normpath(delta + elem.get('href'))

            for elem in opf.itermanifest():
                elem.set('href', normpath(elem.get('href')))
            for elem in opf.iterguide():
                elem.set('href', normpath(elem.get('href')))

        f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
        self.removed_cover = f(opf, log)
        if self.removed_cover:
            self.removed_items_to_ignore = (self.removed_cover,)
        epub3_nav = opf.epub3_nav
        if epub3_nav is not None:
            self.convert_epub3_nav(epub3_nav, opf, log, options)

        for x in opf.itermanifest():
            if x.get('media-type', '') == 'application/x-dtbook+xml':
                raise ValueError(
                    'EPUB files with DTBook markup are not supported')

        not_for_spine = set()
        for y in opf.itermanifest():
            id_ = y.get('id', None)
            if id_:
                mt = y.get('media-type', None)
                if mt in {
                        'application/vnd.adobe-page-template+xml',
                        'application/vnd.adobe.page-template+xml',
                        'application/adobe-page-template+xml',
                        'application/adobe.page-template+xml',
                        'application/text'
                }:
                    not_for_spine.add(id_)
                ext = y.get('href', '').rpartition('.')[-1].lower()
                if mt == 'text/plain' and ext in {'otf', 'ttf'}:
                    # some epub authoring software sets font mime types to
                    # text/plain
                    not_for_spine.add(id_)
                    y.set('media-type', 'application/font')

        seen = set()
        for x in list(opf.iterspine()):
            ref = x.get('idref', None)
            if not ref or ref in not_for_spine or ref in seen:
                x.getparent().remove(x)
                continue
            seen.add(ref)

        if len(list(opf.iterspine())) == 0:
            raise ValueError('No valid entries in the spine of this EPUB')

        with lopen('content.opf', 'wb') as nopf:
            nopf.write(opf.render())

        return os.path.abspath('content.opf')

    def convert_epub3_nav(self, nav_path, opf, log, opts):
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.oeb.polish.parsing import parse
        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
        from calibre.ebooks.oeb.polish.toc import first_child
        from calibre.utils.xml_parse import safe_xml_fromstring
        from tempfile import NamedTemporaryFile
        with lopen(nav_path, 'rb') as f:
            raw = f.read()
        raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
        root = parse(raw, log=log)
        ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
        navmap = ncx[0]
        et = '{%s}type' % EPUB_NS
        bn = os.path.basename(nav_path)

        def add_from_li(li, parent):
            href = text = None
            for x in li.iterchildren(XHTML('a'), XHTML('span')):
                text = etree.tostring(
                    x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(
                            x.xpath('descendant-or-self::*/@title')).strip()
                href = x.get('href')
                if href:
                    if href.startswith('#'):
                        href = bn + href
                break
            np = parent.makeelement(NCX('navPoint'))
            parent.append(np)
            np.append(np.makeelement(NCX('navLabel')))
            np[0].append(np.makeelement(NCX('text')))
            np[0][0].text = text
            if href:
                np.append(np.makeelement(NCX('content'), attrib={'src':href}))
            return np

        def process_nav_node(node, toc_parent):
            for li in node.iterchildren(XHTML('li')):
                child = add_from_li(li, toc_parent)
                ol = first_child(li, XHTML('ol'))
                if child is not None and ol is not None:
                    process_nav_node(ol, child)

        for nav in root.iterdescendants(XHTML('nav')):
            if nav.get(et) == 'toc':
                ol = first_child(nav, XHTML('ol'))
                if ol is not None:
                    process_nav_node(ol, navmap)
                    break
        else:
            return

        with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
            f.write(etree.tostring(ncx, encoding='utf-8'))
        ncx_href = os.path.relpath(f.name, os.getcwd()).replace(os.sep, '/')
        ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
        for spine in opf.root.xpath('//*[local-name()="spine"]'):
            spine.set('toc', ncx_id)
        opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
        opts.epub3_nav_parsed = root
        if getattr(self, 'removed_cover', None):
            changed = False
            base_path = os.path.dirname(nav_path)
            for elem in root.xpath('//*[@href]'):
                href, frag = elem.get('href').partition('#')[::2]
                link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path)
                abs_href = urlnormalize(link_path)
                if abs_href == self.removed_cover:
                    changed = True
                    elem.set('data-calibre-removed-titlepage', '1')
            if changed:
                with lopen(nav_path, 'wb') as f:
                    f.write(serialize(root, 'application/xhtml+xml'))

    def postprocess_book(self, oeb, opts, log):
        rc = getattr(self, 'removed_cover', None)
        if rc:
            cover_toc_item = None
            for item in oeb.toc.iterdescendants():
                if item.href and item.href.partition('#')[0] == rc:
                    cover_toc_item = item
                    break
            spine = {x.href for x in oeb.spine}
            if (cover_toc_item is not None and cover_toc_item not in spine):
                oeb.toc.item_that_refers_to_cover = cover_toc_item