__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os, re, posixpath
from itertools import cycle

from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation

ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'


def decrypt_font_data(key, data, algorithm):
    '''
    Return ``data`` with its leading bytes XORed against the repeating ``key``.

    Only the first 1024 bytes are transformed when ``algorithm`` is
    ADOBE_OBFUSCATION, otherwise the first 1040 bytes. The rest of ``data``
    is returned unchanged. ``key`` must be non-empty: an empty key cannot be
    cycled and raises as soon as there is a byte to transform.
    '''
    is_adobe = algorithm == ADOBE_OBFUSCATION
    crypt_len = 1024 if is_adobe else 1040
    crypt = bytearray(data[:crypt_len])
    key = cycle(iter(bytearray(key)))
    decrypt = bytes(bytearray(x ^ next(key) for x in crypt))
    return decrypt + data[crypt_len:]


def decrypt_font(key, path, algorithm):
    ''' De-obfuscate the font file at ``path`` in place, see
    :func:`decrypt_font_data`. '''
    with lopen(path, 'r+b') as f:
        data = decrypt_font_data(key, f.read(), algorithm)
        f.seek(0)
        f.truncate()
        f.write(data)


class EPUBInput(InputFormatPlugin):

    name = 'EPUB Input'
    author = 'Kovid Goyal'
    description = _('Convert EPUB files (.epub) to HTML')
    file_types = {'epub'}
    output_encoding = None
    commit_name = 'epub_input'

    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}

    def process_encryption(self, encfile, opf, log):
        '''
        De-obfuscate the fonts listed in ``encfile`` (META-INF/encryption.xml).

        Two keys are derived from the OPF: the IDPF key is the SHA-1 digest of
        the whitespace-stripped unique identifier, the Adobe key is the raw
        bytes of a UUID identifier. Every referenced file that exists inside
        the extracted book and for which a key is available is decrypted in
        place and its URI is appended to ``self._encrypted_font_uris``.

        :return: True if every EncryptionMethod uses one of the two supported
                 obfuscation algorithms and processing succeeded. False if an
                 unsupported algorithm is found or any error occurs (the
                 traceback is printed).
        '''
        from lxml import etree
        import uuid, hashlib
        import traceback
        idpf_key = opf.raw_unique_identifier
        if idpf_key:
            idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
            idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
        key = None
        for item in opf.identifier_iter():
            scheme = None
            for xkey in item.attrib.keys():
                if xkey.endswith('scheme'):
                    scheme = item.get(xkey)
            if (scheme and scheme.lower() == 'uuid') or \
                    (item.text and item.text.startswith('urn:uuid:')):
                try:
                    key = item.text.rpartition(':')[-1]
                    key = uuid.UUID(key).bytes
                except Exception:
                    traceback.print_exc()
                    key = None

        # Directory the book was extracted into (the parent of META-INF), with
        # a trailing separator so that it can be used for a prefix check.
        book_root = os.path.join(
            os.path.abspath(os.path.join(os.path.dirname(encfile), '..')), '')

        try:
            root = etree.parse(encfile)
            for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
                algorithm = em.get('Algorithm', '')
                if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
                    return False
                cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
                uri = cr.get('URI')
                path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
                if not path.startswith(book_root):
                    # The URI comes from the (untrusted) book. Never rewrite a
                    # file that lives outside the extracted book.
                    log.warn('Ignoring obfuscated font reference that points outside the book: %s' % uri)
                    continue
                tkey = (key if algorithm == ADOBE_OBFUSCATION else idpf_key)
                if (tkey and os.path.exists(path)):
                    self._encrypted_font_uris.append(uri)
                    decrypt_font(tkey, path, algorithm)
            return True
        except Exception:
            traceback.print_exc()
        return False

    def set_guide_type(self, opf, gtype, href=None, title=''):
        '''
        Set the specified guide entry.

        All existing guide references of type ``gtype`` are removed. If
        ``href`` is not None a new reference is created and appended to the
        first <guide> child of the OPF root, creating the <guide> element if
        there is none.

        :return: The newly created guide item, or None if ``href`` is None.
        '''
        for elem in list(opf.iterguide()):
            if elem.get('type', '').lower() == gtype:
                elem.getparent().remove(elem)

        if href is None:
            return None
        t = opf.create_guide_item(gtype, title, href)
        guides = opf.root.xpath('./*[local-name()="guide"]')
        if guides:
            guide = guides[0]
        else:
            guide = opf.create_guide_element()
            opf.root.append(guide)
        guide.append(t)
        return t

    def rationalize_cover3(self, opf, log):
        ''' If there is a reference to the cover/titlepage via manifest properties, convert to
        entries in the <guide> so that the rest of the pipeline picks it up.

        :return: The href of the titlepage if it was removed from the spine,
                 otherwise None.
        '''
        from calibre.ebooks.metadata.opf3 import items_with_property
        removed = guide_titlepage_href = guide_titlepage_id = None

        # Look for titlepages incorrectly marked in the <guide> as covers
        guide_cover, guide_elem = None, None
        for guide_elem in opf.iterguide():
            if guide_elem.get('type', '').lower() == 'cover':
                guide_cover = guide_elem.get('href', '').partition('#')[0]
                break
        if guide_cover:
            spine = list(opf.iterspine())
            if spine:
                idref = spine[0].get('idref', '')
                for x in opf.itermanifest():
                    if x.get('id') == idref and x.get('href') == guide_cover:
                        guide_titlepage_href = guide_cover
                        guide_titlepage_id = idref
                        break

        raster_cover_href = opf.epub3_raster_cover or opf.raster_cover
        if raster_cover_href:
            self.set_guide_type(opf, 'cover', raster_cover_href, 'Cover Image')
        titlepage_id = titlepage_href = None
        for item in items_with_property(opf.root, 'calibre:title-page'):
            tid, href = item.get('id'), item.get('href')
            if href and tid:
                titlepage_id, titlepage_href = tid, href.partition('#')[0]
                break
        if titlepage_href is None:
            # Fall back to the titlepage detected via the <guide>
            titlepage_href, titlepage_id = guide_titlepage_href, guide_titlepage_id
        if titlepage_href is not None:
            self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title page')
            spine = list(opf.iterspine())
            # Never remove the only item in the spine
            if len(spine) > 1:
                for item in spine:
                    if item.get('idref') == titlepage_id:
                        log('Found HTML cover', titlepage_href)
                        if self.for_viewer:
                            item.attrib.pop('linear', None)
                        else:
                            item.getparent().remove(item)
                            removed = titlepage_href
        return removed

    def rationalize_cover2(self, opf, log):
        ''' Ensure that the cover information in the guide is correct. That
        means, at most one entry with type="cover" that points to a raster
        cover and at most one entry with type="titlepage" that points to an
        HTML titlepage.

        :return: The href of the HTML cover if it was removed from the spine,
                 otherwise None.
        '''
        from calibre.ebooks.oeb.base import OPF
        removed = None
        from lxml import etree
        guide_cover, guide_elem = None, None
        for guide_elem in opf.iterguide():
            if guide_elem.get('type', '').lower() == 'cover':
                guide_cover = guide_elem.get('href', '').partition('#')[0]
                break
        if not guide_cover:
            # No usable cover entry in the guide, add one for the raster
            # cover (if any) and stop.
            raster_cover = opf.raster_cover
            if raster_cover:
                if guide_elem is None:
                    g = opf.root.makeelement(OPF('guide'))
                    opf.root.append(g)
                else:
                    g = guide_elem.getparent()
                guide_cover = raster_cover
                guide_elem = g.makeelement(OPF('reference'), attrib={'href':raster_cover, 'type':'cover'})
                g.append(guide_elem)
            return
        spine = list(opf.iterspine())
        if not spine:
            return
        # Check if the cover specified in the guide is also
        # the first element in spine
        idref = spine[0].get('idref', '')
        manifest = list(opf.itermanifest())
        if not manifest:
            return
        elem = [x for x in manifest if x.get('id', '') == idref]
        if not elem or elem[0].get('href', None) != guide_cover:
            return
        log('Found HTML cover', guide_cover)

        # Remove from spine as covers must be treated
        # specially
        if not self.for_viewer:
            if len(spine) == 1:
                log.warn('There is only a single spine item and it is marked as the cover. Removing cover marking.')
                for guide_elem in tuple(opf.iterguide()):
                    if guide_elem.get('type', '').lower() == 'cover':
                        guide_elem.getparent().remove(guide_elem)
                return
            else:
                spine[0].getparent().remove(spine[0])
                removed = guide_cover
        else:
            # Ensure the cover is displayed as the first item in the book, some
            # epub files have it set with linear='no' which causes the cover to
            # display in the end
            spine[0].attrib.pop('linear', None)
            opf.spine[0].is_linear = True
        # Ensure that the guide has a cover entry pointing to a raster cover
        # and a titlepage entry pointing to the html titlepage. The titlepage
        # entry will be used by the epub output plugin, the raster cover entry
        # by other output plugins.

        # Search for a raster cover identified in the OPF
        raster_cover = opf.raster_cover

        # Set the cover guide entry
        if raster_cover is not None:
            guide_elem.set('href', raster_cover)
        else:
            # Render the titlepage to create a raster cover
            from calibre.ebooks import render_html_svg_workaround
            guide_elem.set('href', 'calibre_raster_cover.jpg')
            t = etree.SubElement(
                elem[0].getparent(), OPF('item'), href=guide_elem.get('href'), id='calibre_raster_cover')
            t.set('media-type', 'image/jpeg')
            if os.path.exists(guide_cover):
                renderer = render_html_svg_workaround(guide_cover, log)
                if renderer is not None:
                    with lopen('calibre_raster_cover.jpg', 'wb') as f:
                        f.write(renderer)

        # Set the titlepage guide entry
        self.set_guide_type(opf, 'titlepage', guide_cover, 'Title page')
        return removed

    def find_opf(self):
        '''
        Locate the OPF file via META-INF/container.xml in the current
        directory.

        :return: The path of the first rootfile with the OPF media type that
                 exists on disk, or None if there is none or container.xml
                 cannot be read/parsed (the traceback is printed).
        '''
        from calibre.utils.xml_parse import safe_xml_fromstring

        def attr(n, attr):
            # Match on the attribute name suffix so that namespaced
            # attributes are found as well
            for k, v in n.attrib.items():
                if k.endswith(attr):
                    return v
        try:
            with lopen('META-INF/container.xml', 'rb') as f:
                root = safe_xml_fromstring(f.read())
                for r in root.xpath('//*[local-name()="rootfile"]'):
                    if attr(r, 'media-type') != "application/oebps-package+xml":
                        continue
                    path = attr(r, 'full-path')
                    if not path:
                        continue
                    path = os.path.join(os.getcwd(), *path.split('/'))
                    if os.path.exists(path):
                        return path
        except Exception:
            import traceback
            traceback.print_exc()
        return None

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Extract the EPUB in ``stream`` into the current directory and
        normalize its OPF.

        Obfuscated fonts are decrypted, manifest and guide hrefs are made
        relative to the current directory, cover information is rationalized,
        an EPUB 3 nav document (if any) is converted to an NCX and spine
        entries that are empty, duplicated or refer to non-content items are
        dropped. The result is written to ``content.opf``.

        :return: Absolute path to the written ``content.opf``.
        :raises ValueError: If no OPF can be found, the book uses DTBook
                            markup or the spine has no valid entries.
        :raises DRMError: If META-INF/encryption.xml exists and
                          :meth:`process_encryption` fails.
        '''
        from calibre.utils.zipfile import ZipFile
        from calibre import walk
        from calibre.ebooks import DRMError
        from calibre.ebooks.metadata.opf2 import OPF
        try:
            ZipFile(stream).extractall(os.getcwd())
        except Exception:
            log.exception('EPUB appears to be invalid ZIP file, trying a'
                    ' more forgiving ZIP parser')
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream)
        encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
        opf = self.find_opf()
        if opf is None:
            # container.xml was of no help, use the first plausible .opf file
            for f in walk('.'):
                if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                        not os.path.basename(f).startswith('.'):
                    opf = os.path.abspath(f)
                    break
        path = getattr(stream, 'name', 'stream')

        if opf is None:
            raise ValueError('%s is not a valid EPUB file (could not find opf)'%path)

        opf = os.path.relpath(opf, os.getcwd())
        parts = os.path.split(opf)
        opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

        self._encrypted_font_uris = []
        if os.path.exists(encfile):
            if not self.process_encryption(encfile, opf, log):
                raise DRMError(os.path.basename(path))
        self.encrypted_fonts = self._encrypted_font_uris

        if parts[0]:
            # The OPF is in a sub-directory, make all hrefs relative to the
            # current directory as content.opf is written there. hrefs always
            # use / as the separator.
            delta = parts[0].replace(os.sep, '/') + '/'

            def normpath(x):
                return posixpath.normpath(delta + x)

            for elem in opf.itermanifest():
                href = elem.get('href')
                if href is not None:
                    elem.set('href', normpath(href))
            for elem in opf.iterguide():
                href = elem.get('href')
                if href is not None:
                    elem.set('href', normpath(href))

        f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
        self.removed_cover = f(opf, log)
        if self.removed_cover:
            self.removed_items_to_ignore = (self.removed_cover,)
        epub3_nav = opf.epub3_nav
        if epub3_nav is not None:
            self.convert_epub3_nav(epub3_nav, opf, log, options)

        for x in opf.itermanifest():
            if x.get('media-type', '') == 'application/x-dtbook+xml':
                raise ValueError(
                    'EPUB files with DTBook markup are not supported')

        not_for_spine = set()
        for y in opf.itermanifest():
            id_ = y.get('id', None)
            if id_:
                mt = y.get('media-type', None)
                if mt in {
                        'application/vnd.adobe-page-template+xml',
                        'application/vnd.adobe.page-template+xml',
                        'application/adobe-page-template+xml',
                        'application/adobe.page-template+xml',
                        'application/text'
                }:
                    not_for_spine.add(id_)
                ext = y.get('href', '').rpartition('.')[-1].lower()
                if mt == 'text/plain' and ext in {'otf', 'ttf'}:
                    # some epub authoring software sets font mime types to
                    # text/plain
                    not_for_spine.add(id_)
                    y.set('media-type', 'application/font')

        # Drop spine entries without an idref, that refer to non-content
        # items or that repeat an earlier entry
        seen = set()
        for x in list(opf.iterspine()):
            ref = x.get('idref', None)
            if not ref or ref in not_for_spine or ref in seen:
                x.getparent().remove(x)
                continue
            seen.add(ref)

        if not list(opf.iterspine()):
            raise ValueError('No valid entries in the spine of this EPUB')

        with lopen('content.opf', 'wb') as nopf:
            nopf.write(opf.render())

        return os.path.abspath('content.opf')

    def convert_epub3_nav(self, nav_path, opf, log, opts):
        '''
        Build an NCX from the toc <nav> of the EPUB 3 navigation document at
        ``nav_path``.

        The NCX is written to a new ``.ncx`` file next to the nav document,
        added to the manifest and set as the ``toc`` of every <spine>. The
        nav href and the parsed nav document are stored on ``opts`` as
        ``epub3_nav_href`` and ``epub3_nav_parsed``. If a cover was removed
        from the spine, links to it in the nav document are marked with
        ``data-calibre-removed-titlepage`` and the document is re-written.

        Does nothing if no toc <nav> containing an <ol> is found.
        '''
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.oeb.polish.parsing import parse
        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
        from calibre.ebooks.oeb.polish.toc import first_child
        from calibre.utils.xml_parse import safe_xml_fromstring
        from tempfile import NamedTemporaryFile
        with lopen(nav_path, 'rb') as f:
            raw = f.read()
        raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
        root = parse(raw, log=log)
        ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
        navmap = ncx[0]
        et = '{%s}type' % EPUB_NS
        bn = os.path.basename(nav_path)

        def add_from_li(li, parent):
            # Create a navPoint under parent from the first <a>/<span> child
            # of li (if any)
            href = text = None
            for x in li.iterchildren(XHTML('a'), XHTML('span')):
                text = etree.tostring(
                    x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(
                            x.xpath('descendant-or-self::*/@title')).strip()
                href = x.get('href')
                if href:
                    if href.startswith('#'):
                        # Fragment only links point into the nav document
                        href = bn + href
                break
            np = parent.makeelement(NCX('navPoint'))
            parent.append(np)
            np.append(np.makeelement(NCX('navLabel')))
            np[0].append(np.makeelement(NCX('text')))
            np[0][0].text = text
            if href:
                np.append(np.makeelement(NCX('content'), attrib={'src':href}))
            return np

        def process_nav_node(node, toc_parent):
            # Recursively mirror the nested <ol>/<li> structure as navPoints
            for li in node.iterchildren(XHTML('li')):
                child = add_from_li(li, toc_parent)
                ol = first_child(li, XHTML('ol'))
                if child is not None and ol is not None:
                    process_nav_node(ol, child)

        for nav in root.iterdescendants(XHTML('nav')):
            if nav.get(et) == 'toc':
                ol = first_child(nav, XHTML('ol'))
                if ol is not None:
                    process_nav_node(ol, navmap)
                    break
        else:
            return

        with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
            f.write(etree.tostring(ncx, encoding='utf-8'))
        ncx_href = os.path.relpath(f.name, os.getcwd()).replace(os.sep, '/')
        ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
        for spine in opf.root.xpath('//*[local-name()="spine"]'):
            spine.set('toc', ncx_id)
        opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
        opts.epub3_nav_parsed = root
        if getattr(self, 'removed_cover', None):
            changed = False
            base_path = os.path.dirname(nav_path)
            for elem in root.xpath('//*[@href]'):
                href = elem.get('href').partition('#')[0]
                # NOTE(review): link_path is relative to the directory of the
                # nav document, confirm that removed_cover uses the same base
                # when the nav document is not in the current directory.
                link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path)
                abs_href = urlnormalize(link_path)
                if abs_href == self.removed_cover:
                    changed = True
                    elem.set('data-calibre-removed-titlepage', '1')
            if changed:
                with lopen(nav_path, 'wb') as f:
                    f.write(serialize(root, 'application/xhtml+xml'))

    def postprocess_book(self, oeb, opts, log):
        '''
        If a cover was removed from the spine during conversion, record the
        first ToC entry that points to it as
        ``oeb.toc.item_that_refers_to_cover``, provided the cover file is not
        (still) in the spine.
        '''
        rc = getattr(self, 'removed_cover', None)
        if rc:
            cover_toc_item = None
            for item in oeb.toc.iterdescendants():
                if item.href and item.href.partition('#')[0] == rc:
                    cover_toc_item = item
                    break
            spine = {x.href for x in oeb.spine}
            # spine holds hrefs, so compare the href of the ToC entry (which
            # is rc), not the ToC entry itself
            if cover_toc_item is not None and rc not in spine:
                oeb.toc.item_that_refers_to_cover = cover_toc_item