1__license__ = 'GPL 3'
2__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
3__docformat__ = 'restructuredtext en'
4
5import os, re, posixpath
6from itertools import cycle
7
8from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
9
# Value of the Algorithm attribute that process_encryption() treats as Adobe
# font obfuscation (1024 byte prefix, keyed on a UUID identifier).
ADOBE_OBFUSCATION =  'http://ns.adobe.com/pdf/enc#RC'
# Value of the Algorithm attribute that process_encryption() treats as IDPF
# font obfuscation (1040 byte prefix, keyed on the SHA-1 of the unique
# identifier).
IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'
12
13
def decrypt_font_data(key, data, algorithm):
    '''
    Return ``data`` with its obfuscated prefix XORed against ``key``.

    Only the leading bytes are touched: 1024 of them when ``algorithm`` is
    ADOBE_OBFUSCATION and 1040 for anything else. The key bytes are repeated
    as often as needed and everything after the prefix is passed through
    unchanged.
    '''
    if algorithm == ADOBE_OBFUSCATION:
        prefix_len = 1024
    else:
        prefix_len = 1040
    obfuscated = bytearray(data[:prefix_len])
    key_stream = cycle(bytearray(key))
    # next() is called once per prefix byte, so the key simply wraps around
    plain = bytearray(byte ^ next(key_stream) for byte in obfuscated)
    return bytes(plain) + data[prefix_len:]
21
22
def decrypt_font(key, path, algorithm):
    '''
    De-obfuscate the file at ``path`` in place, using decrypt_font_data().
    '''
    # lopen is not defined in this file; presumably a builtin provided by
    # calibre that behaves like open() -- TODO confirm
    with lopen(path, 'r+b') as font_file:
        obfuscated = font_file.read()
        plain = decrypt_font_data(key, obfuscated, algorithm)
        # Rewind and empty the file before writing the result back
        font_file.seek(0)
        font_file.truncate()
        font_file.write(plain)
27
28
class EPUBInput(InputFormatPlugin):
    '''
    Input plugin for EPUB files. convert() unpacks the archive into the
    current working directory, prepares its OPF (font de-obfuscation,
    cover/titlepage guide entries, spine clean-up) and writes the result out
    as ``content.opf``.
    '''

    name        = 'EPUB Input'
    author      = 'Kovid Goyal'
    # _ is not defined in this file; presumably the translation function that
    # calibre installs as a builtin -- TODO confirm
    description = _('Convert EPUB files (.epub) to HTML')
    file_types  = {'epub'}
    output_encoding = None
    commit_name = 'epub_input'

    # A single (option name, value, OptionRecommendation level) entry
    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}
39
40    def process_encryption(self, encfile, opf, log):
41        from lxml import etree
42        import uuid, hashlib
43        idpf_key = opf.raw_unique_identifier
44        if idpf_key:
45            idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
46            idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
47        key = None
48        for item in opf.identifier_iter():
49            scheme = None
50            for xkey in item.attrib.keys():
51                if xkey.endswith('scheme'):
52                    scheme = item.get(xkey)
53            if (scheme and scheme.lower() == 'uuid') or \
54                    (item.text and item.text.startswith('urn:uuid:')):
55                try:
56                    key = item.text.rpartition(':')[-1]
57                    key = uuid.UUID(key).bytes
58                except:
59                    import traceback
60                    traceback.print_exc()
61                    key = None
62
63        try:
64            root = etree.parse(encfile)
65            for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
66                algorithm = em.get('Algorithm', '')
67                if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
68                    return False
69                cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
70                uri = cr.get('URI')
71                path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
72                tkey = (key if algorithm == ADOBE_OBFUSCATION else idpf_key)
73                if (tkey and os.path.exists(path)):
74                    self._encrypted_font_uris.append(uri)
75                    decrypt_font(tkey, path, algorithm)
76            return True
77        except:
78            import traceback
79            traceback.print_exc()
80        return False
81
82    def set_guide_type(self, opf, gtype, href=None, title=''):
83        # Set the specified guide entry
84        for elem in list(opf.iterguide()):
85            if elem.get('type', '').lower() == gtype:
86                elem.getparent().remove(elem)
87
88        if href is not None:
89            t = opf.create_guide_item(gtype, title, href)
90            for guide in opf.root.xpath('./*[local-name()="guide"]'):
91                guide.append(t)
92                return
93            guide = opf.create_guide_element()
94            opf.root.append(guide)
95            guide.append(t)
96            return t
97
98    def rationalize_cover3(self, opf, log):
99        ''' If there is a reference to the cover/titlepage via manifest properties, convert to
100        entries in the <guide> so that the rest of the pipeline picks it up. '''
101        from calibre.ebooks.metadata.opf3 import items_with_property
102        removed = guide_titlepage_href = guide_titlepage_id = None
103
104        # Look for titlepages incorrectly marked in the <guide> as covers
105        guide_cover, guide_elem = None, None
106        for guide_elem in opf.iterguide():
107            if guide_elem.get('type', '').lower() == 'cover':
108                guide_cover = guide_elem.get('href', '').partition('#')[0]
109                break
110        if guide_cover:
111            spine = list(opf.iterspine())
112            if spine:
113                idref = spine[0].get('idref', '')
114                for x in opf.itermanifest():
115                    if x.get('id') == idref and x.get('href') == guide_cover:
116                        guide_titlepage_href = guide_cover
117                        guide_titlepage_id = idref
118                        break
119
120        raster_cover_href = opf.epub3_raster_cover or opf.raster_cover
121        if raster_cover_href:
122            self.set_guide_type(opf, 'cover', raster_cover_href, 'Cover Image')
123        titlepage_id = titlepage_href = None
124        for item in items_with_property(opf.root, 'calibre:title-page'):
125            tid, href = item.get('id'), item.get('href')
126            if href and tid:
127                titlepage_id, titlepage_href = tid, href.partition('#')[0]
128                break
129        if titlepage_href is None:
130            titlepage_href, titlepage_id = guide_titlepage_href, guide_titlepage_id
131        if titlepage_href is not None:
132            self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title page')
133            spine = list(opf.iterspine())
134            if len(spine) > 1:
135                for item in spine:
136                    if item.get('idref') == titlepage_id:
137                        log('Found HTML cover', titlepage_href)
138                        if self.for_viewer:
139                            item.attrib.pop('linear', None)
140                        else:
141                            item.getparent().remove(item)
142                            removed = titlepage_href
143                        return removed
144
145    def rationalize_cover2(self, opf, log):
146        ''' Ensure that the cover information in the guide is correct. That
147        means, at most one entry with type="cover" that points to a raster
148        cover and at most one entry with type="titlepage" that points to an
149        HTML titlepage. '''
150        from calibre.ebooks.oeb.base import OPF
151        removed = None
152        from lxml import etree
153        guide_cover, guide_elem = None, None
154        for guide_elem in opf.iterguide():
155            if guide_elem.get('type', '').lower() == 'cover':
156                guide_cover = guide_elem.get('href', '').partition('#')[0]
157                break
158        if not guide_cover:
159            raster_cover = opf.raster_cover
160            if raster_cover:
161                if guide_elem is None:
162                    g = opf.root.makeelement(OPF('guide'))
163                    opf.root.append(g)
164                else:
165                    g = guide_elem.getparent()
166                guide_cover = raster_cover
167                guide_elem = g.makeelement(OPF('reference'), attrib={'href':raster_cover, 'type':'cover'})
168                g.append(guide_elem)
169            return
170        spine = list(opf.iterspine())
171        if not spine:
172            return
173        # Check if the cover specified in the guide is also
174        # the first element in spine
175        idref = spine[0].get('idref', '')
176        manifest = list(opf.itermanifest())
177        if not manifest:
178            return
179        elem = [x for x in manifest if x.get('id', '') == idref]
180        if not elem or elem[0].get('href', None) != guide_cover:
181            return
182        log('Found HTML cover', guide_cover)
183
184        # Remove from spine as covers must be treated
185        # specially
186        if not self.for_viewer:
187            if len(spine) == 1:
188                log.warn('There is only a single spine item and it is marked as the cover. Removing cover marking.')
189                for guide_elem in tuple(opf.iterguide()):
190                    if guide_elem.get('type', '').lower() == 'cover':
191                        guide_elem.getparent().remove(guide_elem)
192                return
193            else:
194                spine[0].getparent().remove(spine[0])
195                removed = guide_cover
196        else:
197            # Ensure the cover is displayed as the first item in the book, some
198            # epub files have it set with linear='no' which causes the cover to
199            # display in the end
200            spine[0].attrib.pop('linear', None)
201            opf.spine[0].is_linear = True
202        # Ensure that the guide has a cover entry pointing to a raster cover
203        # and a titlepage entry pointing to the html titlepage. The titlepage
204        # entry will be used by the epub output plugin, the raster cover entry
205        # by other output plugins.
206
207        # Search for a raster cover identified in the OPF
208        raster_cover = opf.raster_cover
209
210        # Set the cover guide entry
211        if raster_cover is not None:
212            guide_elem.set('href', raster_cover)
213        else:
214            # Render the titlepage to create a raster cover
215            from calibre.ebooks import render_html_svg_workaround
216            guide_elem.set('href', 'calibre_raster_cover.jpg')
217            t = etree.SubElement(
218                elem[0].getparent(), OPF('item'), href=guide_elem.get('href'), id='calibre_raster_cover')
219            t.set('media-type', 'image/jpeg')
220            if os.path.exists(guide_cover):
221                renderer = render_html_svg_workaround(guide_cover, log)
222                if renderer is not None:
223                    with lopen('calibre_raster_cover.jpg', 'wb') as f:
224                        f.write(renderer)
225
226        # Set the titlepage guide entry
227        self.set_guide_type(opf, 'titlepage', guide_cover, 'Title page')
228        return removed
229
230    def find_opf(self):
231        from calibre.utils.xml_parse import safe_xml_fromstring
232
233        def attr(n, attr):
234            for k, v in n.attrib.items():
235                if k.endswith(attr):
236                    return v
237        try:
238            with lopen('META-INF/container.xml', 'rb') as f:
239                root = safe_xml_fromstring(f.read())
240                for r in root.xpath('//*[local-name()="rootfile"]'):
241                    if attr(r, 'media-type') != "application/oebps-package+xml":
242                        continue
243                    path = attr(r, 'full-path')
244                    if not path:
245                        continue
246                    path = os.path.join(os.getcwd(), *path.split('/'))
247                    if os.path.exists(path):
248                        return path
249        except Exception:
250            import traceback
251            traceback.print_exc()
252
253    def convert(self, stream, options, file_ext, log, accelerators):
254        from calibre.utils.zipfile import ZipFile
255        from calibre import walk
256        from calibre.ebooks import DRMError
257        from calibre.ebooks.metadata.opf2 import OPF
258        try:
259            zf = ZipFile(stream)
260            zf.extractall(os.getcwd())
261        except:
262            log.exception('EPUB appears to be invalid ZIP file, trying a'
263                    ' more forgiving ZIP parser')
264            from calibre.utils.localunzip import extractall
265            stream.seek(0)
266            extractall(stream)
267        encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
268        opf = self.find_opf()
269        if opf is None:
270            for f in walk('.'):
271                if f.lower().endswith('.opf') and '__MACOSX' not in f and \
272                        not os.path.basename(f).startswith('.'):
273                    opf = os.path.abspath(f)
274                    break
275        path = getattr(stream, 'name', 'stream')
276
277        if opf is None:
278            raise ValueError('%s is not a valid EPUB file (could not find opf)'%path)
279
280        opf = os.path.relpath(opf, os.getcwd())
281        parts = os.path.split(opf)
282        opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))
283
284        self._encrypted_font_uris = []
285        if os.path.exists(encfile):
286            if not self.process_encryption(encfile, opf, log):
287                raise DRMError(os.path.basename(path))
288        self.encrypted_fonts = self._encrypted_font_uris
289
290        if len(parts) > 1 and parts[0]:
291            delta = '/'.join(parts[:-1])+'/'
292
293            def normpath(x):
294                return posixpath.normpath(delta + elem.get('href'))
295
296            for elem in opf.itermanifest():
297                elem.set('href', normpath(elem.get('href')))
298            for elem in opf.iterguide():
299                elem.set('href', normpath(elem.get('href')))
300
301        f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
302        self.removed_cover = f(opf, log)
303        if self.removed_cover:
304            self.removed_items_to_ignore = (self.removed_cover,)
305        epub3_nav = opf.epub3_nav
306        if epub3_nav is not None:
307            self.convert_epub3_nav(epub3_nav, opf, log, options)
308
309        for x in opf.itermanifest():
310            if x.get('media-type', '') == 'application/x-dtbook+xml':
311                raise ValueError(
312                    'EPUB files with DTBook markup are not supported')
313
314        not_for_spine = set()
315        for y in opf.itermanifest():
316            id_ = y.get('id', None)
317            if id_:
318                mt = y.get('media-type', None)
319                if mt in {
320                        'application/vnd.adobe-page-template+xml',
321                        'application/vnd.adobe.page-template+xml',
322                        'application/adobe-page-template+xml',
323                        'application/adobe.page-template+xml',
324                        'application/text'
325                }:
326                    not_for_spine.add(id_)
327                ext = y.get('href', '').rpartition('.')[-1].lower()
328                if mt == 'text/plain' and ext in {'otf', 'ttf'}:
329                    # some epub authoring software sets font mime types to
330                    # text/plain
331                    not_for_spine.add(id_)
332                    y.set('media-type', 'application/font')
333
334        seen = set()
335        for x in list(opf.iterspine()):
336            ref = x.get('idref', None)
337            if not ref or ref in not_for_spine or ref in seen:
338                x.getparent().remove(x)
339                continue
340            seen.add(ref)
341
342        if len(list(opf.iterspine())) == 0:
343            raise ValueError('No valid entries in the spine of this EPUB')
344
345        with lopen('content.opf', 'wb') as nopf:
346            nopf.write(opf.render())
347
348        return os.path.abspath('content.opf')
349
350    def convert_epub3_nav(self, nav_path, opf, log, opts):
351        from lxml import etree
352        from calibre.ebooks.chardet import xml_to_unicode
353        from calibre.ebooks.oeb.polish.parsing import parse
354        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
355        from calibre.ebooks.oeb.polish.toc import first_child
356        from calibre.utils.xml_parse import safe_xml_fromstring
357        from tempfile import NamedTemporaryFile
358        with lopen(nav_path, 'rb') as f:
359            raw = f.read()
360        raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
361        root = parse(raw, log=log)
362        ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
363        navmap = ncx[0]
364        et = '{%s}type' % EPUB_NS
365        bn = os.path.basename(nav_path)
366
367        def add_from_li(li, parent):
368            href = text = None
369            for x in li.iterchildren(XHTML('a'), XHTML('span')):
370                text = etree.tostring(
371                    x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(
372                            x.xpath('descendant-or-self::*/@title')).strip()
373                href = x.get('href')
374                if href:
375                    if href.startswith('#'):
376                        href = bn + href
377                break
378            np = parent.makeelement(NCX('navPoint'))
379            parent.append(np)
380            np.append(np.makeelement(NCX('navLabel')))
381            np[0].append(np.makeelement(NCX('text')))
382            np[0][0].text = text
383            if href:
384                np.append(np.makeelement(NCX('content'), attrib={'src':href}))
385            return np
386
387        def process_nav_node(node, toc_parent):
388            for li in node.iterchildren(XHTML('li')):
389                child = add_from_li(li, toc_parent)
390                ol = first_child(li, XHTML('ol'))
391                if child is not None and ol is not None:
392                    process_nav_node(ol, child)
393
394        for nav in root.iterdescendants(XHTML('nav')):
395            if nav.get(et) == 'toc':
396                ol = first_child(nav, XHTML('ol'))
397                if ol is not None:
398                    process_nav_node(ol, navmap)
399                    break
400        else:
401            return
402
403        with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
404            f.write(etree.tostring(ncx, encoding='utf-8'))
405        ncx_href = os.path.relpath(f.name, os.getcwd()).replace(os.sep, '/')
406        ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
407        for spine in opf.root.xpath('//*[local-name()="spine"]'):
408            spine.set('toc', ncx_id)
409        opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
410        opts.epub3_nav_parsed = root
411        if getattr(self, 'removed_cover', None):
412            changed = False
413            base_path = os.path.dirname(nav_path)
414            for elem in root.xpath('//*[@href]'):
415                href, frag = elem.get('href').partition('#')[::2]
416                link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path)
417                abs_href = urlnormalize(link_path)
418                if abs_href == self.removed_cover:
419                    changed = True
420                    elem.set('data-calibre-removed-titlepage', '1')
421            if changed:
422                with lopen(nav_path, 'wb') as f:
423                    f.write(serialize(root, 'application/xhtml+xml'))
424
425    def postprocess_book(self, oeb, opts, log):
426        rc = getattr(self, 'removed_cover', None)
427        if rc:
428            cover_toc_item = None
429            for item in oeb.toc.iterdescendants():
430                if item.href and item.href.partition('#')[0] == rc:
431                    cover_toc_item = item
432                    break
433            spine = {x.href for x in oeb.spine}
434            if (cover_toc_item is not None and cover_toc_item not in spine):
435                oeb.toc.item_that_refers_to_cover = cover_toc_item
436