1''' 2Basic support for manipulating OEB 1.x/2.0 content and metadata. 3''' 4 5__license__ = 'GPL v3' 6__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>' 7__docformat__ = 'restructuredtext en' 8 9import os, re, logging, sys, numbers 10from collections import defaultdict 11from itertools import count 12from operator import attrgetter 13 14from lxml import etree, html 15from calibre import force_unicode 16from calibre.constants import filesystem_encoding, __version__ 17from calibre.translations.dynamic import translate 18from calibre.utils.xml_parse import safe_xml_fromstring 19from calibre.ebooks.chardet import xml_to_unicode 20from calibre.ebooks.conversion.preprocess import CSSPreProcessor 21from calibre import (isbytestring, as_unicode, get_types_map) 22from calibre.ebooks.oeb.parse_utils import barename, XHTML_NS, namespace, XHTML, parse_html, NotHTML 23from calibre.utils.cleantext import clean_xml_chars 24from calibre.utils.short_uuid import uuid4 25from polyglot.builtins import iteritems, string_or_bytes, itervalues, codepoint_to_chr 26from polyglot.urllib import unquote as urlunquote, urldefrag, urljoin, urlparse, urlunparse 27from calibre.utils.icu import numeric_sort_key 28 29XML_NS = 'http://www.w3.org/XML/1998/namespace' 30OEB_DOC_NS = 'http://openebook.org/namespaces/oeb-document/1.0/' 31OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/' 32OPF2_NS = 'http://www.idpf.org/2007/opf' 33OPF_NSES = {OPF1_NS, OPF2_NS} 34DC09_NS = 'http://purl.org/metadata/dublin_core' 35DC10_NS = 'http://purl.org/dc/elements/1.0/' 36DC11_NS = 'http://purl.org/dc/elements/1.1/' 37DC_NSES = {DC09_NS, DC10_NS, DC11_NS} 38XSI_NS = 'http://www.w3.org/2001/XMLSchema-instance' 39DCTERMS_NS = 'http://purl.org/dc/terms/' 40NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/' 41SVG_NS = 'http://www.w3.org/2000/svg' 42XLINK_NS = 'http://www.w3.org/1999/xlink' 43CALIBRE_NS = 'http://calibre.kovidgoyal.net/2009/metadata' 44RE_NS = 'http://exslt.org/regular-expressions' 45MBP_NS = 'http://www.mobipocket.com' 46EPUB_NS = 'http://www.idpf.org/2007/ops' 47MATHML_NS = 'http://www.w3.org/1998/Math/MathML' 48 49XPNSMAP = { 50 'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS, 'd09': DC09_NS, 51 'd10': DC10_NS, 'd11': DC11_NS, 'xsi': XSI_NS, 'dt': DCTERMS_NS, 52 'ncx': NCX_NS, 'svg': SVG_NS, 'xl': XLINK_NS, 're': RE_NS, 53 'mathml': MATHML_NS, 'mbp': MBP_NS, 'calibre': CALIBRE_NS, 54 'epub':EPUB_NS 55} 56 57OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS} 58OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS, 59 'xsi': XSI_NS, 'calibre': CALIBRE_NS} 60 61 62def XML(name): 63 return '{%s}%s' % (XML_NS, name) 64 65 66def OPF(name): 67 return '{%s}%s' % (OPF2_NS, name) 68 69 70def DC(name): 71 return '{%s}%s' % (DC11_NS, name) 72 73 74def XSI(name): 75 return '{%s}%s' % (XSI_NS, name) 76 77 78def DCTERMS(name): 79 return '{%s}%s' % (DCTERMS_NS, name) 80 81 82def NCX(name): 83 return '{%s}%s' % (NCX_NS, name) 84 85 86def SVG(name): 87 return '{%s}%s' % (SVG_NS, name) 88 89 90def XLINK(name): 91 return '{%s}%s' % (XLINK_NS, name) 92 93 94def CALIBRE(name): 95 return '{%s}%s' % (CALIBRE_NS, name) 96 97 98_css_url_re = re.compile(r'url\s*\([\'"]{0,1}(.*?)[\'"]{0,1}\)', re.I) 99_css_import_re = re.compile(r'@import "(.*?)"') 100_archive_re = re.compile(r'[^ ]+') 101 102# Tags that should not be self closed in epub output 103self_closing_bad_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b', 104'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details', 105'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'figcaption', 'figure', 'footer', 106'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'iframe', 'ins', 'kbd', 107'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p', 108'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small', 109'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var', 110'video', 'title', 'script', 'style'} 111 112 113def css_text(x): 114 ans = x.cssText 115 if isinstance(ans, bytes): 116 ans = ans.decode('utf-8', 'replace') 117 return ans 118 119 120def as_string_type(pat, for_unicode): 121 if for_unicode: 122 if isinstance(pat, bytes): 123 pat = pat.decode('utf-8') 124 else: 125 if isinstance(pat, str): 126 pat = pat.encode('utf-8') 127 return pat 128 129 130def self_closing_pat(for_unicode): 131 attr = 'unicode_ans' if for_unicode else 'bytes_ans' 132 ans = getattr(self_closing_pat, attr, None) 133 if ans is None: 134 sub = '|'.join(self_closing_bad_tags) 135 template = r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>' 136 pat = template % sub 137 pat = as_string_type(pat, for_unicode) 138 ans = re.compile(pat, flags=re.IGNORECASE) 139 setattr(self_closing_pat, attr, ans) 140 return ans 141 142 143def close_self_closing_tags(raw): 144 for_unicode = isinstance(raw, str) 145 repl = as_string_type(r'<\g<tag>\g<arg>></\g<tag>>', for_unicode) 146 pat = self_closing_pat(for_unicode) 147 return pat.sub(repl, raw) 148 149 150def uuid_id(): 151 return 'u' + uuid4() 152 153 154def itercsslinks(raw): 155 for match in _css_url_re.finditer(raw): 156 yield match.group(1), match.start(1) 157 for match in _css_import_re.finditer(raw): 158 yield match.group(1), match.start(1) 159 160 161_link_attrs = set(html.defs.link_attrs) | {XLINK('href'), 'poster'} 162 163 164def iterlinks(root, find_links_in_css=True): 165 ''' 166 Iterate over all links in a OEB Document. 167 168 :param root: A valid lxml.etree element. 169 ''' 170 assert etree.iselement(root) 171 172 for el in root.iter('*'): 173 try: 174 tag = barename(el.tag).lower() 175 except Exception: 176 continue 177 attribs = el.attrib 178 179 if tag == 'object': 180 codebase = None 181 # <object> tags have attributes that are relative to 182 # codebase 183 if 'codebase' in attribs: 184 codebase = el.get('codebase') 185 yield (el, 'codebase', codebase, 0) 186 for attrib in 'classid', 'data': 187 if attrib in attribs: 188 value = el.get(attrib) 189 if codebase is not None: 190 value = urljoin(codebase, value) 191 yield (el, attrib, value, 0) 192 if 'archive' in attribs: 193 for match in _archive_re.finditer(el.get('archive')): 194 value = match.group(0) 195 if codebase is not None: 196 value = urljoin(codebase, value) 197 yield (el, 'archive', value, match.start()) 198 else: 199 for attr in attribs: 200 if attr in _link_attrs: 201 yield (el, attr, attribs[attr], 0) 202 203 if not find_links_in_css: 204 continue 205 if tag == 'style' and el.text: 206 for match in _css_url_re.finditer(el.text): 207 yield (el, None, match.group(1), match.start(1)) 208 for match in _css_import_re.finditer(el.text): 209 yield (el, None, match.group(1), match.start(1)) 210 if 'style' in attribs: 211 for match in _css_url_re.finditer(attribs['style']): 212 yield (el, 'style', match.group(1), match.start(1)) 213 214 215def make_links_absolute(root, base_url): 216 ''' 217 Make all links in the document absolute, given the 218 ``base_url`` for the document (the full URL where the document 219 came from) 220 ''' 221 def link_repl(href): 222 return urljoin(base_url, href) 223 rewrite_links(root, link_repl) 224 225 226def resolve_base_href(root): 227 base_href = None 228 basetags = root.xpath('//base[@href]|//h:base[@href]', 229 namespaces=XPNSMAP) 230 for b in basetags: 231 base_href = b.get('href') 232 b.drop_tree() 233 if not base_href: 234 return 235 make_links_absolute(root, base_href, resolve_base_href=False) 236 237 238def rewrite_links(root, link_repl_func, resolve_base_href=False): 239 ''' 240 Rewrite all the links in the document. For each link 241 ``link_repl_func(link)`` will be called, and the return value 242 will replace the old link. 243 244 Note that links may not be absolute (unless you first called 245 ``make_links_absolute()``), and may be internal (e.g., 246 ``'#anchor'``). They can also be values like 247 ``'mailto:email'`` or ``'javascript:expr'``. 248 249 If the ``link_repl_func`` returns None, the attribute or 250 tag text will be removed completely. 251 ''' 252 from css_parser import replaceUrls, log, CSSParser 253 log.setLevel(logging.WARN) 254 log.raiseExceptions = False 255 256 if resolve_base_href: 257 resolve_base_href(root) 258 for el, attrib, link, pos in iterlinks(root, find_links_in_css=False): 259 new_link = link_repl_func(link.strip()) 260 if new_link == link: 261 continue 262 if new_link is None: 263 # Remove the attribute or element content 264 if attrib is None: 265 el.text = '' 266 else: 267 del el.attrib[attrib] 268 continue 269 if attrib is None: 270 new = el.text[:pos] + new_link + el.text[pos+len(link):] 271 el.text = new 272 else: 273 cur = el.attrib[attrib] 274 if not pos and len(cur) == len(link): 275 # Most common case 276 el.attrib[attrib] = new_link 277 else: 278 new = cur[:pos] + new_link + cur[pos+len(link):] 279 el.attrib[attrib] = new 280 281 parser = CSSParser(raiseExceptions=False, log=_css_logger, 282 fetcher=lambda x:(None, '')) 283 for el in root.iter(etree.Element): 284 try: 285 tag = el.tag 286 except UnicodeDecodeError: 287 continue 288 289 if tag == XHTML('style') and el.text and \ 290 (_css_url_re.search(el.text) is not None or '@import' in 291 el.text): 292 stylesheet = parser.parseString(el.text, validate=False) 293 replaceUrls(stylesheet, link_repl_func) 294 repl = css_text(stylesheet) 295 el.text = '\n'+ clean_xml_chars(repl) + '\n' 296 297 text = el.get('style') 298 if text and _css_url_re.search(text) is not None: 299 try: 300 stext = parser.parseStyle(text, validate=False) 301 except Exception: 302 # Parsing errors are raised by css_parser 303 continue 304 replaceUrls(stext, link_repl_func) 305 repl = css_text(stext).replace('\n', ' ').replace('\r', 306 ' ') 307 el.set('style', repl) 308 309 310types_map = get_types_map() 311EPUB_MIME = types_map['.epub'] 312XHTML_MIME = types_map['.xhtml'] 313CSS_MIME = types_map['.css'] 314NCX_MIME = types_map['.ncx'] 315OPF_MIME = types_map['.opf'] 316PAGE_MAP_MIME = 'application/oebps-page-map+xml' 317OEB_DOC_MIME = 'text/x-oeb1-document' 318OEB_CSS_MIME = 'text/x-oeb1-css' 319OPENTYPE_MIME = types_map['.otf'] 320GIF_MIME = types_map['.gif'] 321JPEG_MIME = types_map['.jpeg'] 322PNG_MIME = types_map['.png'] 323SVG_MIME = types_map['.svg'] 324WEBP_MIME = types_map['.webp'] 325BINARY_MIME = 'application/octet-stream' 326 327XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS 328 329OEB_STYLES = {CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css', 'xhtml/css'} 330OEB_DOCS = {XHTML_MIME, 'text/html', OEB_DOC_MIME, 331 'text/x-oeb-document'} 332OEB_RASTER_IMAGES = {GIF_MIME, JPEG_MIME, PNG_MIME, WEBP_MIME} 333OEB_IMAGES = {GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME} 334 335MS_COVER_TYPE = 'other.ms-coverimage-standard' 336 337ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);') 338COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') 339QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$') 340PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+') 341XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') 342CSSURL_RE = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''') 343 344 345def element(parent, *args, **kwargs): 346 if parent is not None: 347 return etree.SubElement(parent, *args, **kwargs) 348 return etree.Element(*args, **kwargs) 349 350 351def prefixname(name, nsrmap): 352 if not isqname(name): 353 return name 354 ns = namespace(name) 355 if ns not in nsrmap: 356 return name 357 prefix = nsrmap[ns] 358 if not prefix: 359 return barename(name) 360 return ':'.join((prefix, barename(name))) 361 362 363def isprefixname(name): 364 return name and PREFIXNAME_RE.match(name) is not None 365 366 367def qname(name, nsmap): 368 if not isprefixname(name): 369 return name 370 prefix, local = name.split(':', 1) 371 if prefix not in nsmap: 372 return name 373 return '{%s}%s' % (nsmap[prefix], local) 374 375 376def isqname(name): 377 return name and QNAME_RE.match(name) is not None 378 379 380def XPath(expr): 381 return etree.XPath(expr, namespaces=XPNSMAP) 382 383 384def xpath(elem, expr): 385 return elem.xpath(expr, namespaces=XPNSMAP) 386 387 388def xml2str(root, pretty_print=False, strip_comments=False, with_tail=True): 389 if not strip_comments: 390 # -- in comments trips up adobe digital editions 391 for x in root.iterdescendants(etree.Comment): 392 if x.text and '--' in x.text: 393 x.text = x.text.replace('--', '__') 394 ans = etree.tostring(root, encoding='utf-8', xml_declaration=True, 395 pretty_print=pretty_print, with_tail=with_tail) 396 397 if strip_comments: 398 ans = re.compile(br'<!--.*?-->', re.DOTALL).sub(b'', ans) 399 400 return ans 401 402 403def xml2text(elem, pretty_print=False, method='text'): 404 return etree.tostring(elem, method=method, encoding='unicode', with_tail=False, pretty_print=pretty_print) 405 406 407def escape_cdata(root): 408 pat = re.compile(r'[<>&]') 409 for elem in root.iterdescendants('{%s}style' % XHTML_NS, '{%s}script' % XHTML_NS): 410 if elem.text and pat.search(elem.text) is not None: 411 elem.text = etree.CDATA(elem.text.replace(']]>', r'\]\]\>')) 412 413 414def serialize(data, media_type, pretty_print=False): 415 if isinstance(data, etree._Element): 416 is_oeb_doc = media_type in OEB_DOCS 417 if is_oeb_doc: 418 escape_cdata(data) 419 ans = xml2str(data, pretty_print=pretty_print) 420 if is_oeb_doc: 421 # Convert self closing div|span|a|video|audio|iframe|etc tags 422 # to normally closed ones, as they are interpreted 423 # incorrectly by some browser based renderers 424 ans = close_self_closing_tags(ans) 425 return ans 426 if isinstance(data, str): 427 return data.encode('utf-8') 428 if hasattr(data, 'cssText'): 429 data = data.cssText 430 if isinstance(data, str): 431 data = data.encode('utf-8') 432 return data + b'\n' 433 return bytes(data) 434 435 436ASCII_CHARS = frozenset(codepoint_to_chr(x) for x in range(128)) 437UNIBYTE_CHARS = frozenset(x.encode('ascii') for x in ASCII_CHARS) 438USAFE = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' 439 'abcdefghijklmnopqrstuvwxyz' 440 '0123456789' '_.-/~') 441URL_SAFE = frozenset(USAFE) 442URL_SAFE_BYTES = frozenset(USAFE.encode('ascii')) 443URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE_BYTES] 444del USAFE 445 446 447def urlquote(href): 448 """ Quote URL-unsafe characters, allowing IRI-safe characters. 449 That is, this function returns valid IRIs not valid URIs. In particular, 450 IRIs can contain non-ascii characters. """ 451 result = [] 452 isbytes = isinstance(href, bytes) 453 unsafe = URL_UNSAFE[int(isbytes)] 454 esc, join = "%%%02x", '' 455 if isbytes: 456 esc, join = esc.encode('ascii'), b'' 457 for char in href: 458 if char in unsafe: 459 char = esc % ord(char) 460 result.append(char) 461 return join.join(result) 462 463 464def urlnormalize(href): 465 """Convert a URL into normalized form, with all and only URL-unsafe 466 characters URL quoted. 467 """ 468 try: 469 parts = urlparse(href) 470 except ValueError as e: 471 raise ValueError('Failed to parse the URL: %r with underlying error: %s' % (href, as_unicode(e))) 472 if not parts.scheme or parts.scheme == 'file': 473 path, frag = urldefrag(href) 474 parts = ('', '', path, '', '', frag) 475 parts = (part.replace('\\', '/') for part in parts) 476 parts = (urlunquote(part) for part in parts) 477 parts = (urlquote(part) for part in parts) 478 return urlunparse(parts) 479 480 481def extract(elem): 482 """ 483 Removes this element from the tree, including its children and 484 text. The tail text is joined to the previous element or 485 parent. 486 """ 487 parent = elem.getparent() 488 if parent is not None: 489 if elem.tail: 490 previous = elem.getprevious() 491 if previous is None: 492 parent.text = (parent.text or '') + elem.tail 493 else: 494 previous.tail = (previous.tail or '') + elem.tail 495 parent.remove(elem) 496 497 498class DummyHandler(logging.Handler): 499 500 def __init__(self): 501 logging.Handler.__init__(self, logging.WARNING) 502 self.setFormatter(logging.Formatter('%(message)s')) 503 self.log = None 504 505 def emit(self, record): 506 if self.log is not None: 507 msg = self.format(record) 508 f = self.log.error if record.levelno >= logging.ERROR \ 509 else self.log.warn 510 f(msg) 511 512 513_css_logger = logging.getLogger('calibre.css') 514_css_logger.setLevel(logging.WARNING) 515_css_log_handler = DummyHandler() 516_css_logger.addHandler(_css_log_handler) 517 518 519class OEBError(Exception): 520 """Generic OEB-processing error.""" 521 pass 522 523 524class NullContainer: 525 """An empty container. 526 527 For use with book formats which do not support container-like access. 528 """ 529 530 def __init__(self, log): 531 self.log = log 532 533 def read(self, path): 534 raise OEBError('Attempt to read from NullContainer') 535 536 def write(self, path): 537 raise OEBError('Attempt to write to NullContainer') 538 539 def exists(self, path): 540 return False 541 542 def namelist(self): 543 return [] 544 545 546class DirContainer: 547 """Filesystem directory container.""" 548 549 def __init__(self, path, log, ignore_opf=False): 550 self.log = log 551 if isbytestring(path): 552 path = path.decode(filesystem_encoding) 553 self.opfname = None 554 ext = os.path.splitext(path)[1].lower() 555 if ext == '.opf': 556 self.opfname = os.path.basename(path) 557 self.rootdir = os.path.dirname(path) 558 return 559 self.rootdir = path 560 if not ignore_opf: 561 for path in self.namelist(): 562 ext = os.path.splitext(path)[1].lower() 563 if ext == '.opf': 564 self.opfname = path 565 return 566 567 def _unquote(self, path): 568 # unquote must run on a bytestring and will return a bytestring 569 # If it runs on a unicode object, it returns a double encoded unicode 570 # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8') 571 # and the latter is correct 572 if isinstance(path, str): 573 path = path.encode('utf-8') 574 return urlunquote(path).decode('utf-8') 575 576 def read(self, path): 577 if path is None: 578 path = self.opfname 579 path = os.path.join(self.rootdir, self._unquote(path)) 580 with lopen(path, 'rb') as f: 581 return f.read() 582 583 def write(self, path, data): 584 path = os.path.join(self.rootdir, self._unquote(path)) 585 dir = os.path.dirname(path) 586 if not os.path.isdir(dir): 587 os.makedirs(dir) 588 with lopen(path, 'wb') as f: 589 return f.write(data) 590 591 def exists(self, path): 592 if not path: 593 return False 594 try: 595 path = os.path.join(self.rootdir, self._unquote(path)) 596 except ValueError: # Happens if path contains quoted special chars 597 return False 598 try: 599 return os.path.isfile(path) 600 except UnicodeEncodeError: 601 # On linux, if LANG is unset, the os.stat call tries to encode the 602 # unicode path using ASCII 603 # To replicate try: 604 # LANG=en_US.ASCII python -c "import os; os.stat(u'Espa\xf1a')" 605 return os.path.isfile(path.encode(filesystem_encoding)) 606 607 def namelist(self): 608 names = [] 609 base = self.rootdir 610 for root, dirs, files in os.walk(base): 611 for fname in files: 612 fname = os.path.join(root, fname) 613 if isinstance(fname, bytes): 614 try: 615 fname = fname.decode(filesystem_encoding) 616 except Exception: 617 try: 618 fname = fname.decode('utf-8') 619 except Exception: 620 continue 621 fname = fname.replace('\\', '/') 622 names.append(fname) 623 return names 624 625 626class Metadata: 627 """A collection of OEB data model metadata. 628 629 Provides access to the list of items associated with a particular metadata 630 term via the term's local name using either Python container or attribute 631 syntax. Return an empty list for any terms with no currently associated 632 metadata items. 633 """ 634 635 DC_TERMS = {'contributor', 'coverage', 'creator', 'date', 636 'description', 'format', 'identifier', 'language', 637 'publisher', 'relation', 'rights', 'source', 638 'subject', 'title', 'type'} 639 CALIBRE_TERMS = {'series', 'series_index', 'rating', 'timestamp', 640 'publication_type', 'title_sort'} 641 OPF_ATTRS = {'role': OPF('role'), 'file-as': OPF('file-as'), 642 'scheme': OPF('scheme'), 'event': OPF('event'), 643 'type': XSI('type'), 'lang': XML('lang'), 'id': 'id'} 644 OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS} 645 OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS, 646 'xsi': XSI_NS, 'calibre': CALIBRE_NS} 647 648 class Item: 649 """An item of OEB data model metadata. 650 651 The metadata term or name may be accessed via the :attr:`term` or 652 :attr:`name` attributes. The metadata value or content may be accessed 653 via the :attr:`value` or :attr:`content` attributes, or via Unicode or 654 string representations of the object. 655 656 OEB data model metadata attributes may be accessed either via their 657 fully-qualified names using the Python container access syntax, or via 658 their local names using Python attribute syntax. Only attributes 659 allowed by the OPF 2.0 specification are supported. 660 """ 661 class Attribute: 662 """Smart accessor for allowed OEB metadata item attributes.""" 663 664 def __init__(self, attr, allowed=None): 665 if not callable(attr): 666 attr_, attr = attr, lambda term: attr_ 667 self.attr = attr 668 self.allowed = allowed 669 670 def term_attr(self, obj): 671 term = obj.term 672 if namespace(term) != DC11_NS: 673 term = OPF('meta') 674 allowed = self.allowed 675 if allowed is not None and term not in allowed: 676 raise AttributeError( 677 'attribute %r not valid for metadata term %r' % ( 678 self.attr(term), barename(obj.term))) 679 return self.attr(term) 680 681 def __get__(self, obj, cls): 682 if obj is None: 683 return None 684 return obj.attrib.get(self.term_attr(obj), '') 685 686 def __set__(self, obj, value): 687 obj.attrib[self.term_attr(obj)] = value 688 689 def __init__(self, term, value, attrib={}, nsmap={}, **kwargs): 690 self.attrib = attrib = dict(attrib) 691 self.nsmap = nsmap = dict(nsmap) 692 attrib.update(kwargs) 693 if namespace(term) == OPF2_NS: 694 term = barename(term) 695 ns = namespace(term) 696 local = barename(term).lower() 697 if local in Metadata.DC_TERMS and (not ns or ns in DC_NSES): 698 # Anything looking like Dublin Core is coerced 699 term = DC(local) 700 elif local in Metadata.CALIBRE_TERMS and ns in (CALIBRE_NS, ''): 701 # Ditto for Calibre-specific metadata 702 term = CALIBRE(local) 703 self.term = term 704 self.value = value 705 for attr, value in tuple(iteritems(attrib)): 706 if isprefixname(value): 707 attrib[attr] = qname(value, nsmap) 708 nsattr = Metadata.OPF_ATTRS.get(attr, attr) 709 if nsattr == OPF('scheme') and namespace(term) != DC11_NS: 710 # The opf:meta element takes @scheme, not @opf:scheme 711 nsattr = 'scheme' 712 if attr != nsattr: 713 attrib[nsattr] = attrib.pop(attr) 714 715 @property 716 def name(self): 717 return self.term 718 719 @property 720 def content(self): 721 return self.value 722 723 @content.setter 724 def content(self, value): 725 self.value = value 726 727 scheme = Attribute(lambda term: 'scheme' if 728 term == OPF('meta') else OPF('scheme'), 729 [DC('identifier'), OPF('meta')]) 730 file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor'), 731 DC('title')]) 732 role = Attribute(OPF('role'), [DC('creator'), DC('contributor')]) 733 event = Attribute(OPF('event'), [DC('date')]) 734 id = Attribute('id') 735 type = Attribute(XSI('type'), [DC('date'), DC('format'), 736 DC('type')]) 737 lang = Attribute(XML('lang'), [DC('contributor'), DC('coverage'), 738 DC('creator'), DC('publisher'), 739 DC('relation'), DC('rights'), 740 DC('source'), DC('subject'), 741 OPF('meta')]) 742 743 def __getitem__(self, key): 744 return self.attrib[key] 745 746 def __setitem__(self, key, value): 747 self.attrib[key] = value 748 749 def __contains__(self, key): 750 return key in self.attrib 751 752 def get(self, key, default=None): 753 return self.attrib.get(key, default) 754 755 def __repr__(self): 756 return 'Item(term=%r, value=%r, attrib=%r)' \ 757 % (barename(self.term), self.value, self.attrib) 758 759 def __str__(self): 760 return as_unicode(self.value) 761 762 def to_opf1(self, dcmeta=None, xmeta=None, nsrmap={}): 763 attrib = {} 764 for key, value in self.attrib.items(): 765 if namespace(key) == OPF2_NS: 766 key = barename(key) 767 attrib[key] = prefixname(value, nsrmap) 768 if namespace(self.term) == DC11_NS: 769 name = DC(icu_title(barename(self.term))) 770 elem = element(dcmeta, name, attrib=attrib) 771 elem.text = self.value 772 else: 773 elem = element(xmeta, 'meta', attrib=attrib) 774 elem.attrib['name'] = prefixname(self.term, nsrmap) 775 elem.attrib['content'] = prefixname(self.value, nsrmap) 776 return elem 777 778 def to_opf2(self, parent=None, nsrmap={}): 779 attrib = {} 780 for key, value in self.attrib.items(): 781 attrib[key] = prefixname(value, nsrmap) 782 if namespace(self.term) == DC11_NS: 783 elem = element(parent, self.term, attrib=attrib) 784 try: 785 elem.text = self.value 786 except: 787 elem.text = repr(self.value) 788 else: 789 elem = element(parent, OPF('meta'), attrib=attrib) 790 elem.attrib['name'] = prefixname(self.term, nsrmap) 791 elem.attrib['content'] = prefixname(self.value, nsrmap) 792 return elem 793 794 def __init__(self, oeb): 795 self.oeb = oeb 796 self.items = defaultdict(list) 797 self.primary_writing_mode = None 798 799 def add(self, term, value, attrib={}, nsmap={}, **kwargs): 800 """Add a new metadata item.""" 801 item = self.Item(term, value, attrib, nsmap, **kwargs) 802 items = self.items[barename(item.term)] 803 items.append(item) 804 return item 805 806 def iterkeys(self): 807 yield from self.items 808 __iter__ = iterkeys 809 810 def clear(self, key): 811 l = self.items[key] 812 for x in list(l): 813 l.remove(x) 814 815 def filter(self, key, predicate): 816 l = self.items[key] 817 for x in list(l): 818 if predicate(x): 819 l.remove(x) 820 821 def __getitem__(self, key): 822 return self.items[key] 823 824 def __contains__(self, key): 825 return key in self.items 826 827 def __getattr__(self, term): 828 return self.items[term] 829 830 @property 831 def _nsmap(self): 832 nsmap = {} 833 for term in self.items: 834 for item in self.items[term]: 835 nsmap.update(item.nsmap) 836 return nsmap 837 838 @property 839 def _opf1_nsmap(self): 840 nsmap = self._nsmap 841 for key, value in nsmap.items(): 842 if value in OPF_NSES or value in DC_NSES: 843 del nsmap[key] 844 return nsmap 845 846 @property 847 def _opf2_nsmap(self): 848 nsmap = self._nsmap 849 nsmap.update(OPF2_NSMAP) 850 return nsmap 851 852 def to_opf1(self, parent=None): 853 nsmap = self._opf1_nsmap 854 nsrmap = {value: key for key, value in iteritems(nsmap)} 855 elem = element(parent, 'metadata', nsmap=nsmap) 856 dcmeta = element(elem, 'dc-metadata', nsmap=OPF1_NSMAP) 857 xmeta = element(elem, 'x-metadata') 858 for term in self.items: 859 for item in self.items[term]: 860 item.to_opf1(dcmeta, xmeta, nsrmap=nsrmap) 861 if 'ms-chaptertour' not in self.items: 862 chaptertour = self.Item('ms-chaptertour', 'chaptertour') 863 chaptertour.to_opf1(dcmeta, xmeta, nsrmap=nsrmap) 864 return elem 865 866 def to_opf2(self, parent=None): 867 nsmap = self._opf2_nsmap 868 nsrmap = {value: key for key, value in iteritems(nsmap)} 869 elem = element(parent, OPF('metadata'), nsmap=nsmap) 870 for term in self.items: 871 for item in self.items[term]: 872 item.to_opf2(elem, nsrmap=nsrmap) 873 if self.primary_writing_mode: 874 elem.append(elem.makeelement(OPF('meta'), attrib={'name':'primary-writing-mode', 'content':self.primary_writing_mode})) 875 return elem 876 877 878class Manifest: 879 """Collection of files composing an OEB data model book. 880 881 Provides access to the content of the files composing the book and 882 attributes associated with those files, including their internal paths, 883 unique identifiers, and MIME types. 884 885 Itself acts as a :class:`set` of manifest items, and provides the following 886 instance data member for dictionary-like access: 887 888 :attr:`ids`: A dictionary in which the keys are the unique identifiers of 889 the manifest items and the values are the items themselves. 890 :attr:`hrefs`: A dictionary in which the keys are the internal paths of the 891 manifest items and the values are the items themselves. 892 """ 893 894 class Item: 895 """An OEB data model book content file. 896 897 Provides the following data members for accessing the file content and 898 metadata associated with this particular file. 899 900 :attr:`id`: Unique identifier. 901 :attr:`href`: Book-internal path. 902 :attr:`media_type`: MIME type of the file content. 903 :attr:`fallback`: Unique id of any fallback manifest item associated 904 with this manifest item. 905 :attr:`spine_position`: Display/reading order index for book textual 906 content. `None` for manifest items which are not part of the 907 book's textual content. 908 :attr:`linear`: `True` for textual content items which are part of the 909 primary linear reading order and `False` for textual content items 910 which are not (such as footnotes). Meaningless for items which 911 have a :attr:`spine_position` of `None`. 912 """ 913 914 def __init__(self, oeb, id, href, media_type, 915 fallback=None, loader=str, data=None): 916 if href: 917 href = str(href) 918 self.oeb = oeb 919 self.id = id 920 self.href = self.path = urlnormalize(href) 921 self.media_type = media_type 922 self.fallback = fallback 923 self.override_css_fetch = None 924 self.resolve_css_imports = True 925 self.spine_position = None 926 self.linear = True 927 if loader is None and data is None: 928 loader = oeb.container.read 929 self._loader = loader 930 self._data = data 931 932 def __repr__(self): 933 return 'Item(id=%r, href=%r, media_type=%r)' \ 934 % (self.id, self.href, self.media_type) 935 936 # Parsing {{{ 937 def _parse_xml(self, data): 938 if not data: 939 return 940 data = xml_to_unicode(data, strip_encoding_pats=True, 941 assume_utf8=True, resolve_entities=True)[0] 942 return safe_xml_fromstring(data) 943 944 def _parse_xhtml(self, data): 945 orig_data = data 946 fname = urlunquote(self.href) 947 self.oeb.log.debug('Parsing', fname, '...') 948 self.oeb.html_preprocessor.current_href = self.href 949 try: 950 data = parse_html(data, log=self.oeb.log, 951 decoder=self.oeb.decode, 952 preprocessor=self.oeb.html_preprocessor, 953 filename=fname, non_html_file_tags={'ncx'}) 954 except NotHTML: 955 return self._parse_xml(orig_data) 956 return data 957 958 def _parse_txt(self, data): 959 has_html = '<html>' 960 if isinstance(data, bytes): 961 has_html = has_html.encode('ascii') 962 if has_html in data: 963 return self._parse_xhtml(data) 964 965 self.oeb.log.debug('Converting', self.href, '...') 966 967 from calibre.ebooks.txt.processor import convert_markdown 968 969 title = self.oeb.metadata.title 970 if title: 971 title = str(title[0]) 972 else: 973 title = _('Unknown') 974 975 return self._parse_xhtml(convert_markdown(data, title=title)) 976 977 def _parse_css(self, data): 978 from css_parser import CSSParser, log, resolveImports 979 from css_parser.css import CSSRule 980 log.setLevel(logging.WARN) 981 log.raiseExceptions = False 982 self.oeb.log.debug('Parsing', self.href, '...') 983 data = self.oeb.decode(data) 984 data = self.oeb.css_preprocessor(data, add_namespace=False) 985 parser = CSSParser(loglevel=logging.WARNING, 986 fetcher=self.override_css_fetch or self._fetch_css, 987 log=_css_logger) 988 data = parser.parseString(data, href=self.href, validate=False) 989 if self.resolve_css_imports: 990 data = resolveImports(data) 991 for rule in tuple(data.cssRules.rulesOfType(CSSRule.PAGE_RULE)): 992 data.cssRules.remove(rule) 993 return data 994 995 def _fetch_css(self, path): 996 hrefs = self.oeb.manifest.hrefs 997 if path not in hrefs: 998 self.oeb.logger.warn('CSS import of missing file %r' % path) 999 return (None, None) 1000 item = hrefs[path] 1001 if item.media_type not in OEB_STYLES: 1002 self.oeb.logger.warn('CSS import of non-CSS file %r' % path) 1003 return (None, None) 1004 data = item.data.cssText 1005 enc = None if isinstance(data, str) else 'utf-8' 1006 return (enc, data) 1007 1008 # }}} 1009 1010 @property 1011 def data(self): 1012 """Provides MIME type sensitive access to the manifest 1013 entry's associated content. 1014 1015 - XHTML, HTML, and variant content is parsed as necessary to 1016 convert and return as an lxml.etree element in the XHTML 1017 namespace. 1018 - XML content is parsed and returned as an lxml.etree element. 1019 - CSS and CSS-variant content is parsed and returned as a css_parser 1020 CSS DOM stylesheet. 1021 - All other content is returned as a :class:`str` or :class:`bytes` 1022 object with no special parsing. 1023 """ 1024 data = self._data 1025 if data is None: 1026 if self._loader is None: 1027 return None 1028 data = self._loader(getattr(self, 'html_input_href', 1029 self.href)) 1030 try: 1031 mt = self.media_type.lower() 1032 except Exception: 1033 mt = 'application/octet-stream' 1034 if not isinstance(data, string_or_bytes): 1035 pass # already parsed 1036 elif mt in OEB_DOCS: 1037 data = self._parse_xhtml(data) 1038 elif mt[-4:] in ('+xml', '/xml'): 1039 data = self._parse_xml(data) 1040 elif mt in OEB_STYLES: 1041 data = self._parse_css(data) 1042 elif mt == 'text/plain': 1043 self.oeb.log.warn('%s contains data in TXT format'%self.href, 1044 'converting to HTML') 1045 data = self._parse_txt(data) 1046 self.media_type = XHTML_MIME 1047 self._data = data 1048 return data 1049 1050 @data.setter 1051 def data(self, value): 1052 self._data = value 1053 1054 @data.deleter 1055 def data(self): 1056 self._data = None 1057 1058 def reparse_css(self): 1059 self._data = self._parse_css(str(self)) 1060 1061 def unload_data_from_memory(self, memory=None): 1062 if isinstance(self._data, bytes): 1063 if memory is None: 1064 from calibre.ptempfile import PersistentTemporaryFile 1065 pt = PersistentTemporaryFile(suffix='_oeb_base_mem_unloader.img') 1066 with pt: 1067 pt.write(self._data) 1068 self.oeb._temp_files.append(pt.name) 1069 1070 def loader(*args): 1071 with open(pt.name, 'rb') as f: 1072 ans = f.read() 1073 os.remove(pt.name) 1074 return ans 1075 self._loader = loader 1076 else: 1077 def loader2(*args): 1078 with open(memory, 'rb') as f: 1079 ans = f.read() 1080 return ans 1081 self._loader = loader2 1082 self._data = None 1083 1084 @property 1085 def unicode_representation(self): 1086 data = self.data 1087 if isinstance(data, etree._Element): 1088 return xml2text(data, pretty_print=self.oeb.pretty_print) 1089 if isinstance(data, str): 1090 return data 1091 if hasattr(data, 'cssText'): 1092 return css_text(data) 1093 return str(data) 1094 1095 @property 1096 def bytes_representation(self): 1097 return serialize(self.data, self.media_type, pretty_print=self.oeb.pretty_print) 1098 1099 def __str__(self): 1100 return self.unicode_representation 1101 1102 def __eq__(self, other): 1103 return self is other 1104 1105 def __ne__(self, other): 1106 return self is not other 1107 1108 def __hash__(self): 1109 return id(self) 1110 1111 @property 1112 def sort_key(self): 1113 href = self.href 1114 if isinstance(href, bytes): 1115 href = force_unicode(href) 1116 sp = self.spine_position if isinstance(self.spine_position, numbers.Number) else sys.maxsize 1117 return sp, (self.media_type or '').lower(), numeric_sort_key(href), self.id 1118 1119 def relhref(self, href): 1120 """Convert the URL provided in :param:`href` from a book-absolute 1121 reference to a reference relative to this manifest item. 1122 """ 1123 return rel_href(self.href, href) 1124 1125 def abshref(self, href): 1126 """Convert the URL provided in :param:`href` from a reference 1127 relative to this manifest item to a book-absolute reference. 1128 """ 1129 try: 1130 purl = urlparse(href) 1131 except ValueError: 1132 return href 1133 scheme = purl.scheme 1134 if scheme and scheme != 'file': 1135 return href 1136 purl = list(purl) 1137 purl[0] = '' 1138 href = urlunparse(purl) 1139 path, frag = urldefrag(href) 1140 if not path: 1141 if frag: 1142 return '#'.join((self.href, frag)) 1143 else: 1144 return self.href 1145 if '/' not in self.href: 1146 return href 1147 dirname = os.path.dirname(self.href) 1148 href = os.path.join(dirname, href) 1149 href = os.path.normpath(href).replace('\\', '/') 1150 return href 1151 1152 def __init__(self, oeb): 1153 self.oeb = oeb 1154 self.items = set() 1155 self.ids = {} 1156 self.hrefs = {} 1157 1158 def add(self, id, href, media_type, fallback=None, loader=None, data=None): 1159 """Add a new item to the book manifest. 1160 1161 The item's :param:`id`, :param:`href`, and :param:`media_type` are all 1162 required. A :param:`fallback` item-id is required for any items with a 1163 MIME type which is not one of the OPS core media types. Either the 1164 item's data itself may be provided with :param:`data`, or a loader 1165 function for the data may be provided with :param:`loader`, or the 1166 item's data may later be set manually via the :attr:`data` attribute. 1167 """ 1168 item = self.Item( 1169 self.oeb, id, href, media_type, fallback, loader, data) 1170 self.items.add(item) 1171 self.ids[item.id] = item 1172 self.hrefs[item.href] = item 1173 return item 1174 1175 def remove(self, item): 1176 """Removes :param:`item` from the manifest.""" 1177 if item in self.ids: 1178 item = self.ids[item] 1179 del self.ids[item.id] 1180 if item.href in self.hrefs: 1181 del self.hrefs[item.href] 1182 self.items.remove(item) 1183 if item in self.oeb.spine: 1184 self.oeb.spine.remove(item) 1185 1186 def remove_duplicate_item(self, item): 1187 if item in self.ids: 1188 item = self.ids[item] 1189 del self.ids[item.id] 1190 self.items.remove(item) 1191 1192 def generate(self, id=None, href=None): 1193 """Generate a new unique identifier and/or internal path for use in 1194 creating a new manifest item, using the provided :param:`id` and/or 1195 :param:`href` as bases. 1196 1197 Returns an two-tuple of the new id and path. If either :param:`id` or 1198 :param:`href` are `None` then the corresponding item in the return 1199 tuple will also be `None`. 1200 """ 1201 if id is not None: 1202 base = id 1203 index = 1 1204 while id in self.ids: 1205 id = base + str(index) 1206 index += 1 1207 if href is not None: 1208 href = urlnormalize(href) 1209 base, ext = os.path.splitext(href) 1210 index = 1 1211 lhrefs = {x.lower() for x in self.hrefs} 1212 while href.lower() in lhrefs: 1213 href = base + str(index) + ext 1214 index += 1 1215 return id, str(href) 1216 1217 def __iter__(self): 1218 yield from self.items 1219 1220 def __len__(self): 1221 return len(self.items) 1222 1223 def values(self): 1224 return list(self.items) 1225 1226 def __contains__(self, item): 1227 return item in self.items 1228 1229 def to_opf1(self, parent=None): 1230 elem = element(parent, 'manifest') 1231 for item in self.items: 1232 media_type = item.media_type 1233 if media_type in OEB_DOCS: 1234 media_type = OEB_DOC_MIME 1235 elif media_type in OEB_STYLES: 1236 media_type = OEB_CSS_MIME 1237 attrib = {'id': item.id, 'href': urlunquote(item.href), 1238 'media-type': media_type} 1239 if item.fallback: 1240 attrib['fallback'] = item.fallback 1241 element(elem, 'item', attrib=attrib) 1242 return elem 1243 1244 def to_opf2(self, parent=None): 1245 elem = element(parent, OPF('manifest')) 1246 for item in sorted(self.items, key=attrgetter('sort_key')): 1247 media_type = item.media_type 1248 if media_type in OEB_DOCS: 1249 media_type = XHTML_MIME 1250 elif media_type in OEB_STYLES: 1251 media_type = CSS_MIME 1252 attrib = {'id': item.id, 'href': urlunquote(item.href), 1253 'media-type': media_type} 1254 if item.fallback: 1255 attrib['fallback'] = item.fallback 1256 element(elem, OPF('item'), attrib=attrib) 1257 return elem 1258 1259 @property 1260 def main_stylesheet(self): 1261 ans = getattr(self, '_main_stylesheet', None) 1262 if ans is None: 1263 for item in self: 1264 if item.media_type.lower() in OEB_STYLES: 1265 ans = item 1266 break 1267 return ans 1268 1269 @main_stylesheet.setter 1270 def main_stylesheet(self, item): 1271 self._main_stylesheet = item 1272 1273 1274class Spine: 1275 """Collection of manifest items composing an OEB data model book's main 1276 textual content. 1277 1278 The spine manages which manifest items compose the book's main textual 1279 content and the sequence in which they appear. Provides Python container 1280 access as a list-like object. 1281 """ 1282 1283 def __init__(self, oeb): 1284 self.oeb = oeb 1285 self.items = [] 1286 self.page_progression_direction = None 1287 1288 def _linear(self, linear): 1289 if isinstance(linear, string_or_bytes): 1290 linear = linear.lower() 1291 if linear is None or linear in ('yes', 'true'): 1292 linear = True 1293 elif linear in ('no', 'false'): 1294 linear = False 1295 return linear 1296 1297 def add(self, item, linear=None): 1298 """Append :param:`item` to the end of the `Spine`.""" 1299 item.linear = self._linear(linear) 1300 item.spine_position = len(self.items) 1301 self.items.append(item) 1302 return item 1303 1304 def insert(self, index, item, linear): 1305 """Insert :param:`item` at position :param:`index` in the `Spine`.""" 1306 item.linear = self._linear(linear) 1307 item.spine_position = index 1308 self.items.insert(index, item) 1309 for i in range(index, len(self.items)): 1310 self.items[i].spine_position = i 1311 return item 1312 1313 def remove(self, item): 1314 """Remove :param:`item` from the `Spine`.""" 1315 index = item.spine_position 1316 self.items.pop(index) 1317 for i in range(index, len(self.items)): 1318 self.items[i].spine_position = i 1319 item.spine_position = None 1320 1321 def index(self, item): 1322 for i, x in enumerate(self): 1323 if item == x: 1324 return i 1325 return -1 1326 1327 def __iter__(self): 1328 yield from self.items 1329 1330 def __getitem__(self, index): 1331 return self.items[index] 1332 1333 def __len__(self): 1334 return len(self.items) 1335 1336 def __contains__(self, item): 1337 return (item in self.items) 1338 1339 def to_opf1(self, parent=None): 1340 elem = element(parent, 'spine') 1341 for item in self.items: 1342 if item.linear: 1343 element(elem, 'itemref', attrib={'idref': item.id}) 1344 return elem 1345 1346 def to_opf2(self, parent=None): 1347 elem = element(parent, OPF('spine')) 1348 for item in self.items: 1349 attrib = {'idref': item.id} 1350 if not item.linear: 1351 attrib['linear'] = 'no' 1352 element(elem, OPF('itemref'), attrib=attrib) 1353 return elem 1354 1355 1356class Guide: 1357 """Collection of references to standard frequently-occurring sections 1358 within an OEB data model book. 1359 1360 Provides dictionary-like access, in which the keys are the OEB reference 1361 type identifiers and the values are `Reference` objects. 1362 """ 1363 1364 class Reference: 1365 """Reference to a standard book section. 1366 1367 Provides the following instance data members: 1368 1369 :attr:`type`: Reference type identifier, as chosen from the list 1370 allowed in the OPF 2.0 specification. 1371 :attr:`title`: Human-readable section title. 1372 :attr:`href`: Book-internal URL of the referenced section. May include 1373 a fragment identifier. 1374 """ 1375 _TYPES_TITLES = [('cover', __('Cover')), 1376 ('title-page', __('Title page')), 1377 ('toc', __('Table of Contents')), 1378 ('index', __('Index')), 1379 ('glossary', __('Glossary')), 1380 ('acknowledgements', __('Acknowledgements')), 1381 ('bibliography', __('Bibliography')), 1382 ('colophon', __('Colophon')), 1383 ('copyright-page', __('Copyright')), 1384 ('dedication', __('Dedication')), 1385 ('epigraph', __('Epigraph')), 1386 ('foreword', __('Foreword')), 1387 ('loi', __('List of illustrations')), 1388 ('lot', __('List of tables')), 1389 ('notes', __('Notes')), 1390 ('preface', __('Preface')), 1391 ('text', __('Main text'))] 1392 TITLES = dict(_TYPES_TITLES) 1393 TYPES = frozenset(TITLES) 1394 ORDER = {t: i for i, (t, _) in enumerate(_TYPES_TITLES)} 1395 1396 def __init__(self, oeb, type, title, href): 1397 self.oeb = oeb 1398 if type.lower() in self.TYPES: 1399 type = type.lower() 1400 elif type not in self.TYPES and \ 1401 not type.startswith('other.'): 1402 type = 'other.' + type 1403 if not title and type in self.TITLES: 1404 title = oeb.translate(self.TITLES[type]) 1405 self.type = type 1406 self.title = title 1407 self.href = urlnormalize(href) 1408 1409 def __repr__(self): 1410 return 'Reference(type=%r, title=%r, href=%r)' \ 1411 % (self.type, self.title, self.href) 1412 1413 @property 1414 def item(self): 1415 """The manifest item associated with this reference.""" 1416 path = urldefrag(self.href)[0] 1417 hrefs = self.oeb.manifest.hrefs 1418 return hrefs.get(path, None) 1419 1420 def __init__(self, oeb): 1421 self.oeb = oeb 1422 self.refs = {} 1423 1424 def add(self, type, title, href): 1425 """Add a new reference to the `Guide`.""" 1426 if href: 1427 href = str(href) 1428 ref = self.Reference(self.oeb, type, title, href) 1429 self.refs[type] = ref 1430 return ref 1431 1432 def remove(self, type): 1433 return self.refs.pop(type, None) 1434 1435 def remove_by_href(self, href): 1436 remove = [r for r, i in iteritems(self.refs) if i.href == href] 1437 for r in remove: 1438 self.remove(r) 1439 1440 def iterkeys(self): 1441 yield from self.refs 1442 __iter__ = iterkeys 1443 1444 def values(self): 1445 return sorted(itervalues(self.refs), key=lambda ref: ref.ORDER.get(ref.type, 10000)) 1446 1447 def items(self): 1448 yield from self.refs.items() 1449 1450 def __getitem__(self, key): 1451 return self.refs[key] 1452 1453 def get(self, key): 1454 return self.refs.get(key) 1455 1456 def __delitem__(self, key): 1457 del self.refs[key] 1458 1459 def __contains__(self, key): 1460 return key in self.refs 1461 1462 def __len__(self): 1463 return len(self.refs) 1464 1465 def to_opf1(self, parent=None): 1466 elem = element(parent, 'guide') 1467 for ref in self.refs.values(): 1468 attrib = {'type': ref.type, 'href': urlunquote(ref.href)} 1469 if ref.title: 1470 attrib['title'] = ref.title 1471 element(elem, 'reference', attrib=attrib) 1472 return elem 1473 1474 def to_opf2(self, parent=None): 1475 if not len(self): 1476 return 1477 elem = element(parent, OPF('guide')) 1478 for ref in self.refs.values(): 1479 attrib = {'type': ref.type, 'href': urlunquote(ref.href)} 1480 if ref.title: 1481 attrib['title'] = ref.title 1482 element(elem, OPF('reference'), attrib=attrib) 1483 return elem 1484 1485 1486class TOC: 1487 """Represents a hierarchical table of contents or navigation tree for 1488 accessing arbitrary semantic sections within an OEB data model book. 1489 1490 Acts as a node within the navigation tree. Provides list-like access to 1491 sub-nodes. Provides the follow node instance data attributes: 1492 1493 :attr:`title`: The title of this navigation node. 1494 :attr:`href`: Book-internal URL referenced by this node. 1495 :attr:`klass`: Optional semantic class referenced by this node. 1496 :attr:`id`: Option unique identifier for this node. 1497 :attr:`author`: Optional author attribution for periodicals <mbp:> 1498 :attr:`description`: Optional description attribute for periodicals <mbp:> 1499 :attr:`toc_thumbnail`: Optional toc thumbnail image 1500 """ 1501 1502 def __init__(self, title=None, href=None, klass=None, id=None, 1503 play_order=None, author=None, description=None, toc_thumbnail=None): 1504 self.title = title 1505 self.href = urlnormalize(href) if href else href 1506 self.klass = klass 1507 self.id = id 1508 self.nodes = [] 1509 self.play_order = 0 1510 if play_order is None: 1511 play_order = self.next_play_order() 1512 self.play_order = play_order 1513 self.author = author 1514 self.description = description 1515 self.toc_thumbnail = toc_thumbnail 1516 1517 def add(self, title, href, klass=None, id=None, play_order=0, author=None, description=None, toc_thumbnail=None): 1518 """Create and return a new sub-node of this node.""" 1519 node = TOC(title, href, klass, id, play_order, author, description, toc_thumbnail) 1520 self.nodes.append(node) 1521 return node 1522 1523 def remove(self, node): 1524 for child in self.nodes: 1525 if child is node: 1526 self.nodes.remove(child) 1527 return True 1528 else: 1529 if child.remove(node): 1530 return True 1531 return False 1532 1533 def iter(self): 1534 """Iterate over this node and all descendants in depth-first order.""" 1535 yield self 1536 for child in self.nodes: 1537 yield from child.iter() 1538 1539 def count(self): 1540 return len(list(self.iter())) - 1 1541 1542 def next_play_order(self): 1543 entries = [x.play_order for x in self.iter()] 1544 base = max(entries) if entries else 0 1545 return base+1 1546 1547 def has_href(self, href): 1548 for x in self.iter(): 1549 if x.href == href: 1550 return True 1551 return False 1552 1553 def has_text(self, text): 1554 for x in self.iter(): 1555 if x.title and x.title.lower() == text.lower(): 1556 return True 1557 return False 1558 1559 def iterdescendants(self, breadth_first=False): 1560 """Iterate over all descendant nodes in depth-first order.""" 1561 if breadth_first: 1562 for child in self.nodes: 1563 yield child 1564 for child in self.nodes: 1565 yield from child.iterdescendants(breadth_first=True) 1566 else: 1567 for child in self.nodes: 1568 yield from child.iter() 1569 1570 def __iter__(self): 1571 """Iterate over all immediate child nodes.""" 1572 yield from self.nodes 1573 1574 def __getitem__(self, index): 1575 return self.nodes[index] 1576 1577 def autolayer(self): 1578 """Make sequences of children pointing to the same content file into 1579 children of the first node referencing that file. 1580 """ 1581 prev = None 1582 for node in list(self.nodes): 1583 if prev and urldefrag(prev.href)[0] == urldefrag(node.href)[0]: 1584 self.nodes.remove(node) 1585 prev.nodes.append(node) 1586 else: 1587 prev = node 1588 1589 def depth(self): 1590 """The maximum depth of the navigation tree rooted at this node.""" 1591 try: 1592 return max(node.depth() for node in self.nodes) + 1 1593 except ValueError: 1594 return 1 1595 1596 def get_lines(self, lvl=0): 1597 ans = [('\t'*lvl) + 'TOC: %s --> %s'%(self.title, self.href)] 1598 for child in self: 1599 ans.extend(child.get_lines(lvl+1)) 1600 return ans 1601 1602 def __str__(self): 1603 return '\n'.join(self.get_lines()) 1604 1605 def to_opf1(self, tour): 1606 for node in self.nodes: 1607 element(tour, 'site', attrib={ 1608 'title': node.title, 'href': urlunquote(node.href)}) 1609 node.to_opf1(tour) 1610 return tour 1611 1612 def to_ncx(self, parent=None): 1613 if parent is None: 1614 parent = etree.Element(NCX('navMap')) 1615 for node in self.nodes: 1616 id = node.id or uuid_id() 1617 po = node.play_order 1618 if po == 0: 1619 po = 1 1620 attrib = {'id': id, 'playOrder': str(po)} 1621 if node.klass: 1622 attrib['class'] = node.klass 1623 point = element(parent, NCX('navPoint'), attrib=attrib) 1624 label = etree.SubElement(point, NCX('navLabel')) 1625 title = node.title 1626 if title: 1627 title = re.sub(r'\s+', ' ', title) 1628 element(label, NCX('text')).text = title 1629 # Do not unescape this URL as ADE requires it to be escaped to 1630 # handle semi colons and other special characters in the file names 1631 element(point, NCX('content'), src=node.href) 1632 node.to_ncx(point) 1633 return parent 1634 1635 def rationalize_play_orders(self): 1636 ''' 1637 Ensure that all nodes with the same play_order have the same href and 1638 with different play_orders have different hrefs. 1639 ''' 1640 def po_node(n): 1641 for x in self.iter(): 1642 if x is n: 1643 return 1644 if x.play_order == n.play_order: 1645 return x 1646 1647 def href_node(n): 1648 for x in self.iter(): 1649 if x is n: 1650 return 1651 if x.href == n.href: 1652 return x 1653 1654 for x in self.iter(): 1655 y = po_node(x) 1656 if y is not None: 1657 if x.href != y.href: 1658 x.play_order = getattr(href_node(x), 'play_order', 1659 self.next_play_order()) 1660 y = href_node(x) 1661 if y is not None: 1662 x.play_order = y.play_order 1663 1664 1665class PageList: 1666 """Collection of named "pages" to mapped positions within an OEB data model 1667 book's textual content. 1668 1669 Provides list-like access to the pages. 1670 """ 1671 1672 class Page: 1673 """Represents a mapping between a page name and a position within 1674 the book content. 1675 1676 Provides the following instance data attributes: 1677 1678 :attr:`name`: The name of this page. Generally a number. 1679 :attr:`href`: Book-internal URL at which point this page begins. 1680 :attr:`type`: Must be one of 'front' (for prefatory pages, as commonly 1681 labeled in print with small-case Roman numerals), 'normal' (for 1682 standard pages, as commonly labeled in print with Arabic numerals), 1683 or 'special' (for other pages, as commonly not labeled in any 1684 fashion in print, such as the cover and title pages). 1685 :attr:`klass`: Optional semantic class of this page. 1686 :attr:`id`: Optional unique identifier for this page. 1687 """ 1688 TYPES = {'front', 'normal', 'special'} 1689 1690 def __init__(self, name, href, type='normal', klass=None, id=None): 1691 self.name = str(name) 1692 self.href = urlnormalize(href) 1693 self.type = type if type in self.TYPES else 'normal' 1694 self.id = id 1695 self.klass = klass 1696 1697 def __init__(self): 1698 self.pages = [] 1699 1700 def add(self, name, href, type='normal', klass=None, id=None): 1701 """Create a new page and add it to the `PageList`.""" 1702 page = self.Page(name, href, type, klass, id) 1703 self.pages.append(page) 1704 return page 1705 1706 def __len__(self): 1707 return len(self.pages) 1708 1709 def __iter__(self): 1710 yield from self.pages 1711 1712 def __getitem__(self, index): 1713 return self.pages[index] 1714 1715 def pop(self, index=-1): 1716 return self.pages.pop(index) 1717 1718 def remove(self, page): 1719 return self.pages.remove(page) 1720 1721 def to_ncx(self, parent=None): 1722 plist = element(parent, NCX('pageList'), id=uuid_id()) 1723 values = {t: count(1) for t in ('front', 'normal', 'special')} 1724 for page in self.pages: 1725 id = page.id or uuid_id() 1726 type = page.type 1727 value = str(next(values[type])) 1728 attrib = {'id': id, 'value': value, 'type': type, 'playOrder': '0'} 1729 if page.klass: 1730 attrib['class'] = page.klass 1731 ptarget = element(plist, NCX('pageTarget'), attrib=attrib) 1732 label = element(ptarget, NCX('navLabel')) 1733 element(label, NCX('text')).text = page.name 1734 element(ptarget, NCX('content'), src=page.href) 1735 return plist 1736 1737 def to_page_map(self): 1738 pmap = etree.Element(OPF('page-map'), nsmap={None: OPF2_NS}) 1739 for page in self.pages: 1740 element(pmap, OPF('page'), name=page.name, href=page.href) 1741 return pmap 1742 1743 1744class OEBBook: 1745 """Representation of a book in the IDPF OEB data model.""" 1746 1747 COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') 1748 COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') 1749 1750 def __init__(self, logger, 1751 html_preprocessor, 1752 css_preprocessor=CSSPreProcessor(), 1753 encoding='utf-8', pretty_print=False, 1754 input_encoding='utf-8'): 1755 """Create empty book. Arguments: 1756 1757 :param:`encoding`: Default encoding for textual content read 1758 from an external container. 1759 :param:`pretty_print`: Whether or not the canonical string form 1760 of XML markup is pretty-printed. 1761 :param html_preprocessor: A callable that takes a unicode object 1762 and returns a unicode object. Will be called on all html files 1763 before they are parsed. 1764 :param css_preprocessor: A callable that takes a unicode object 1765 and returns a unicode object. Will be called on all CSS files 1766 before they are parsed. 1767 :param:`logger`: A Log object to use for logging all messages 1768 related to the processing of this book. It is accessible 1769 via the instance data members :attr:`logger,log`. 1770 1771 It provides the following public instance data members for 1772 accessing various parts of the OEB data model: 1773 1774 :attr:`metadata`: Metadata such as title, author name(s), etc. 1775 :attr:`manifest`: Manifest of all files included in the book, 1776 including MIME types and fallback information. 1777 :attr:`spine`: In-order list of manifest items which compose 1778 the textual content of the book. 1779 :attr:`guide`: Collection of references to standard positions 1780 within the text, such as the cover, preface, etc. 1781 :attr:`toc`: Hierarchical table of contents. 1782 :attr:`pages`: List of "pages," such as indexed to a print edition of 1783 the same text. 1784 """ 1785 _css_log_handler.log = logger 1786 self.encoding = encoding 1787 self.input_encoding = input_encoding 1788 self.html_preprocessor = html_preprocessor 1789 self.css_preprocessor = css_preprocessor 1790 self.pretty_print = pretty_print 1791 self.logger = self.log = logger 1792 self.version = '2.0' 1793 self.container = NullContainer(self.log) 1794 self.metadata = Metadata(self) 1795 self.uid = None 1796 self.manifest = Manifest(self) 1797 self.spine = Spine(self) 1798 self.guide = Guide(self) 1799 self.toc = TOC() 1800 self.pages = PageList() 1801 self.auto_generated_toc = True 1802 self._temp_files = [] 1803 1804 def clean_temp_files(self): 1805 for path in self._temp_files: 1806 try: 1807 os.remove(path) 1808 except: 1809 pass 1810 1811 @classmethod 1812 def generate(cls, opts): 1813 """Generate an OEBBook instance from command-line options.""" 1814 encoding = opts.encoding 1815 pretty_print = opts.pretty_print 1816 return cls(encoding=encoding, pretty_print=pretty_print) 1817 1818 def translate(self, text): 1819 """Translate :param:`text` into the book's primary language.""" 1820 lang = str(self.metadata.language[0]) 1821 lang = lang.split('-', 1)[0].lower() 1822 return translate(lang, text) 1823 1824 def decode(self, data): 1825 """Automatically decode :param:`data` into a `unicode` object.""" 1826 def fix_data(d): 1827 return d.replace('\r\n', '\n').replace('\r', '\n') 1828 if isinstance(data, str): 1829 return fix_data(data) 1830 bom_enc = None 1831 if data[:4] in (b'\0\0\xfe\xff', b'\xff\xfe\0\0'): 1832 bom_enc = {b'\0\0\xfe\xff':'utf-32-be', 1833 b'\xff\xfe\0\0':'utf-32-le'}[data[:4]] 1834 data = data[4:] 1835 elif data[:2] in (b'\xff\xfe', b'\xfe\xff'): 1836 bom_enc = {b'\xff\xfe':'utf-16-le', 'b\xfe\xff':'utf-16-be'}[data[:2]] 1837 data = data[2:] 1838 elif data[:3] == b'\xef\xbb\xbf': 1839 bom_enc = 'utf-8' 1840 data = data[3:] 1841 if bom_enc is not None: 1842 try: 1843 return fix_data(data.decode(bom_enc)) 1844 except UnicodeDecodeError: 1845 pass 1846 if self.input_encoding: 1847 try: 1848 return fix_data(data.decode(self.input_encoding, 'replace')) 1849 except UnicodeDecodeError: 1850 pass 1851 try: 1852 return fix_data(data.decode('utf-8')) 1853 except UnicodeDecodeError: 1854 pass 1855 data, _ = xml_to_unicode(data) 1856 return fix_data(data) 1857 1858 def to_opf1(self): 1859 """Produce OPF 1.2 representing the book's metadata and structure. 1860 1861 Returns a dictionary in which the keys are MIME types and the values 1862 are tuples of (default) filenames and lxml.etree element structures. 1863 """ 1864 package = etree.Element('package', 1865 attrib={'unique-identifier': self.uid.id}) 1866 self.metadata.to_opf1(package) 1867 self.manifest.to_opf1(package) 1868 self.spine.to_opf1(package) 1869 tours = element(package, 'tours') 1870 tour = element(tours, 'tour', 1871 attrib={'id': 'chaptertour', 'title': 'Chapter Tour'}) 1872 self.toc.to_opf1(tour) 1873 self.guide.to_opf1(package) 1874 return {OPF_MIME: ('content.opf', package)} 1875 1876 def _update_playorder(self, ncx): 1877 hrefs = set(map(urlnormalize, xpath(ncx, '//ncx:content/@src'))) 1878 playorder = {} 1879 next = 1 1880 selector = XPath('h:body//*[@id or @name]') 1881 for item in self.spine: 1882 base = item.href 1883 if base in hrefs: 1884 playorder[base] = next 1885 next += 1 1886 for elem in selector(item.data): 1887 added = False 1888 for attr in ('id', 'name'): 1889 id = elem.get(attr) 1890 if not id: 1891 continue 1892 href = '#'.join([base, id]) 1893 if href in hrefs: 1894 playorder[href] = next 1895 added = True 1896 if added: 1897 next += 1 1898 selector = XPath('ncx:content/@src') 1899 for i, elem in enumerate(xpath(ncx, '//*[@playOrder and ./ncx:content[@src]]')): 1900 href = urlnormalize(selector(elem)[0]) 1901 order = playorder.get(href, i) 1902 elem.attrib['playOrder'] = str(order) 1903 return 1904 1905 def _to_ncx(self): 1906 lang = str(self.metadata.language[0]) 1907 lang = lang.replace('_', '-') 1908 ncx = etree.Element(NCX('ncx'), 1909 attrib={'version': '2005-1', XML('lang'): lang}, 1910 nsmap={None: NCX_NS}) 1911 head = etree.SubElement(ncx, NCX('head')) 1912 etree.SubElement(head, NCX('meta'), 1913 name='dtb:uid', content=str(self.uid)) 1914 etree.SubElement(head, NCX('meta'), 1915 name='dtb:depth', content=str(self.toc.depth())) 1916 generator = ''.join(['calibre (', __version__, ')']) 1917 etree.SubElement(head, NCX('meta'), 1918 name='dtb:generator', content=generator) 1919 etree.SubElement(head, NCX('meta'), 1920 name='dtb:totalPageCount', content=str(len(self.pages))) 1921 maxpnum = etree.SubElement(head, NCX('meta'), 1922 name='dtb:maxPageNumber', content='0') 1923 title = etree.SubElement(ncx, NCX('docTitle')) 1924 text = etree.SubElement(title, NCX('text')) 1925 text.text = str(self.metadata.title[0]) 1926 navmap = etree.SubElement(ncx, NCX('navMap')) 1927 self.toc.to_ncx(navmap) 1928 if len(self.pages) > 0: 1929 plist = self.pages.to_ncx(ncx) 1930 value = max(int(x) for x in xpath(plist, '//@value')) 1931 maxpnum.attrib['content'] = str(value) 1932 self._update_playorder(ncx) 1933 return ncx 1934 1935 def to_opf2(self, page_map=False): 1936 """Produce OPF 2.0 representing the book's metadata and structure. 1937 1938 Returns a dictionary in which the keys are MIME types and the values 1939 are tuples of (default) filenames and lxml.etree element structures. 1940 """ 1941 results = {} 1942 package = etree.Element(OPF('package'), 1943 attrib={'version': '2.0', 'unique-identifier': self.uid.id}, 1944 nsmap={None: OPF2_NS}) 1945 self.metadata.to_opf2(package) 1946 manifest = self.manifest.to_opf2(package) 1947 spine = self.spine.to_opf2(package) 1948 self.guide.to_opf2(package) 1949 results[OPF_MIME] = ('content.opf', package) 1950 id, href = self.manifest.generate('ncx', 'toc.ncx') 1951 etree.SubElement(manifest, OPF('item'), id=id, href=href, 1952 attrib={'media-type': NCX_MIME}) 1953 spine.attrib['toc'] = id 1954 results[NCX_MIME] = (href, self._to_ncx()) 1955 if page_map and len(self.pages) > 0: 1956 id, href = self.manifest.generate('page-map', 'page-map.xml') 1957 etree.SubElement(manifest, OPF('item'), id=id, href=href, 1958 attrib={'media-type': PAGE_MAP_MIME}) 1959 spine.attrib['page-map'] = id 1960 results[PAGE_MAP_MIME] = (href, self.pages.to_page_map()) 1961 if self.spine.page_progression_direction in {'ltr', 'rtl'}: 1962 spine.attrib['page-progression-direction'] = self.spine.page_progression_direction 1963 return results 1964 1965 1966def rel_href(base_href, href): 1967 """Convert the URL provided in :param:`href` to a URL relative to the URL 1968 in :param:`base_href` """ 1969 if urlparse(href).scheme: 1970 return href 1971 if '/' not in base_href: 1972 return href 1973 base = list(filter(lambda x: x and x != '.', os.path.dirname(os.path.normpath(base_href)).replace(os.sep, '/').split('/'))) 1974 while True: 1975 try: 1976 idx = base.index('..') 1977 except ValueError: 1978 break 1979 if idx > 0: 1980 del base[idx-1:idx+1] 1981 else: 1982 break 1983 if not base: 1984 return href 1985 target, frag = urldefrag(href) 1986 target = target.split('/') 1987 index = 0 1988 for index in range(min(len(base), len(target))): 1989 if base[index] != target[index]: 1990 break 1991 else: 1992 index += 1 1993 relhref = (['..'] * (len(base) - index)) + target[index:] 1994 relhref = '/'.join(relhref) 1995 if frag: 1996 relhref = '#'.join((relhref, frag)) 1997 return relhref 1998