1#!/usr/local/bin/python3.8 2 3__license__ = 'GPL v3' 4__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' 5__docformat__ = 'restructuredtext en' 6 7''' 8lxml based OPF parser. 9''' 10 11import re, sys, functools, os, uuid, glob, io, json, copy 12 13from lxml import etree 14 15from calibre.ebooks import escape_xpath_attr 16from calibre.constants import __appname__, __version__, filesystem_encoding 17from calibre.ebooks.metadata.toc import TOC 18from calibre.ebooks.metadata.utils import parse_opf, pretty_print_opf as _pretty_print 19from calibre.ebooks.metadata import string_to_authors, MetaInformation, check_isbn 20from calibre.ebooks.metadata.book.base import Metadata 21from calibre.utils.date import parse_date, isoformat 22from calibre.utils.localization import get_lang, canonicalize_lang 23from calibre import prints, guess_type 24from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars 25from calibre.utils.config import tweaks 26from calibre.utils.xml_parse import safe_xml_fromstring 27from polyglot.builtins import iteritems 28from polyglot.urllib import unquote, urlparse 29 30pretty_print_opf = False 31 32 33class PrettyPrint: 34 35 def __enter__(self): 36 global pretty_print_opf 37 pretty_print_opf = True 38 39 def __exit__(self, *args): 40 global pretty_print_opf 41 pretty_print_opf = False 42 43 44pretty_print = PrettyPrint() 45 46 47class Resource: # {{{ 48 49 ''' 50 Represents a resource (usually a file on the filesystem or a URL pointing 51 to the web. Such resources are commonly referred to in OPF files. 52 53 They have the interface: 54 55 :member:`path` 56 :member:`mime_type` 57 :method:`href` 58 ''' 59 60 def __init__(self, href_or_path, basedir=os.getcwd(), is_path=True): 61 self.orig = href_or_path 62 self._href = None 63 self._basedir = basedir 64 self.path = None 65 self.fragment = '' 66 try: 67 self.mime_type = guess_type(href_or_path)[0] 68 except: 69 self.mime_type = None 70 if self.mime_type is None: 71 self.mime_type = 'application/octet-stream' 72 if is_path: 73 path = href_or_path 74 if not os.path.isabs(path): 75 path = os.path.abspath(os.path.join(basedir, path)) 76 if isinstance(path, bytes): 77 path = path.decode(filesystem_encoding) 78 self.path = path 79 else: 80 href_or_path = href_or_path 81 url = urlparse(href_or_path) 82 if url[0] not in ('', 'file'): 83 self._href = href_or_path 84 else: 85 pc = url[2] 86 if isinstance(pc, str): 87 pc = pc.encode('utf-8') 88 pc = pc.decode('utf-8') 89 self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep))) 90 self.fragment = url[-1] 91 92 def href(self, basedir=None): 93 ''' 94 Return a URL pointing to this resource. If it is a file on the filesystem 95 the URL is relative to `basedir`. 96 97 `basedir`: If None, the basedir of this resource is used (see :method:`set_basedir`). 98 If this resource has no basedir, then the current working directory is used as the basedir. 99 ''' 100 if basedir is None: 101 if self._basedir: 102 basedir = self._basedir 103 else: 104 basedir = os.getcwd() 105 if self.path is None: 106 return self._href 107 frag = ('#' + self.fragment) if self.fragment else '' 108 if self.path == basedir: 109 return frag 110 try: 111 rpath = os.path.relpath(self.path, basedir) 112 except ValueError: # On windows path and basedir could be on different drives 113 rpath = self.path 114 if isinstance(rpath, bytes): 115 rpath = rpath.decode(filesystem_encoding) 116 return rpath.replace(os.sep, '/')+frag 117 118 def set_basedir(self, path): 119 self._basedir = path 120 121 def basedir(self): 122 return self._basedir 123 124 def __repr__(self): 125 return 'Resource(%s, %s)'%(repr(self.path), repr(self.href())) 126 127# }}} 128 129 130class ResourceCollection: # {{{ 131 132 def __init__(self): 133 self._resources = [] 134 135 def __iter__(self): 136 yield from self._resources 137 138 def __len__(self): 139 return len(self._resources) 140 141 def __getitem__(self, index): 142 return self._resources[index] 143 144 def __bool__(self): 145 return len(self._resources) > 0 146 147 def __str__(self): 148 resources = map(repr, self) 149 return '[%s]'%', '.join(resources) 150 __unicode__ = __str__ 151 152 def __repr__(self): 153 return str(self) 154 155 def append(self, resource): 156 if not isinstance(resource, Resource): 157 raise ValueError('Can only append objects of type Resource') 158 self._resources.append(resource) 159 160 def remove(self, resource): 161 self._resources.remove(resource) 162 163 def replace(self, start, end, items): 164 'Same as list[start:end] = items' 165 self._resources[start:end] = items 166 167 @staticmethod 168 def from_directory_contents(top, topdown=True): 169 collection = ResourceCollection() 170 for spec in os.walk(top, topdown=topdown): 171 path = os.path.abspath(os.path.join(spec[0], spec[1])) 172 res = Resource.from_path(path) 173 res.set_basedir(top) 174 collection.append(res) 175 return collection 176 177 def set_basedir(self, path): 178 for res in self: 179 res.set_basedir(path) 180 181# }}} 182 183 184class ManifestItem(Resource): # {{{ 185 186 @staticmethod 187 def from_opf_manifest_item(item, basedir): 188 href = item.get('href', None) 189 if href: 190 res = ManifestItem(href, basedir=basedir, is_path=True) 191 mt = item.get('media-type', '').strip() 192 if mt: 193 res.mime_type = mt 194 return res 195 196 @property 197 def media_type(self): 198 return self.mime_type 199 200 @media_type.setter 201 def media_type(self, val): 202 self.mime_type = val 203 204 def __unicode__representation__(self): 205 return '<item id="%s" href="%s" media-type="%s" />'%(self.id, self.href(), self.media_type) 206 207 __str__ = __unicode__representation__ 208 209 def __repr__(self): 210 return str(self) 211 212 def __getitem__(self, index): 213 if index == 0: 214 return self.href() 215 if index == 1: 216 return self.media_type 217 raise IndexError('%d out of bounds.'%index) 218 219# }}} 220 221 222class Manifest(ResourceCollection): # {{{ 223 224 def append_from_opf_manifest_item(self, item, dir): 225 self.append(ManifestItem.from_opf_manifest_item(item, dir)) 226 id = item.get('id', '') 227 if not id: 228 id = 'id%d'%self.next_id 229 self[-1].id = id 230 self.next_id += 1 231 232 @staticmethod 233 def from_opf_manifest_element(items, dir): 234 m = Manifest() 235 for item in items: 236 try: 237 m.append_from_opf_manifest_item(item, dir) 238 except ValueError: 239 continue 240 return m 241 242 @staticmethod 243 def from_paths(entries): 244 ''' 245 `entries`: List of (path, mime-type) If mime-type is None it is autodetected 246 ''' 247 m = Manifest() 248 for path, mt in entries: 249 mi = ManifestItem(path, is_path=True) 250 if mt: 251 mi.mime_type = mt 252 mi.id = 'id%d'%m.next_id 253 m.next_id += 1 254 m.append(mi) 255 return m 256 257 def add_item(self, path, mime_type=None): 258 mi = ManifestItem(path, is_path=True) 259 if mime_type: 260 mi.mime_type = mime_type 261 mi.id = 'id%d'%self.next_id 262 self.next_id += 1 263 self.append(mi) 264 return mi.id 265 266 def __init__(self): 267 ResourceCollection.__init__(self) 268 self.next_id = 1 269 270 def item(self, id): 271 for i in self: 272 if i.id == id: 273 return i 274 275 def id_for_path(self, path): 276 path = os.path.normpath(os.path.abspath(path)) 277 for i in self: 278 if i.path and os.path.normpath(i.path) == path: 279 return i.id 280 281 def path_for_id(self, id): 282 for i in self: 283 if i.id == id: 284 return i.path 285 286 def type_for_id(self, id): 287 for i in self: 288 if i.id == id: 289 return i.mime_type 290 291# }}} 292 293 294class Spine(ResourceCollection): # {{{ 295 296 class Item(Resource): 297 298 def __init__(self, idfunc, *args, **kwargs): 299 Resource.__init__(self, *args, **kwargs) 300 self.is_linear = True 301 self.id = idfunc(self.path) 302 self.idref = None 303 304 def __repr__(self): 305 return 'Spine.Item(path=%r, id=%s, is_linear=%s)' % \ 306 (self.path, self.id, self.is_linear) 307 308 @staticmethod 309 def from_opf_spine_element(itemrefs, manifest): 310 s = Spine(manifest) 311 seen = set() 312 path_map = {i.id:i.path for i in s.manifest} 313 for itemref in itemrefs: 314 idref = itemref.get('idref', None) 315 if idref is not None: 316 path = path_map.get(idref) 317 if path and path not in seen: 318 r = Spine.Item(lambda x:idref, path, is_path=True) 319 r.is_linear = itemref.get('linear', 'yes') == 'yes' 320 r.idref = idref 321 s.append(r) 322 seen.add(path) 323 return s 324 325 @staticmethod 326 def from_paths(paths, manifest): 327 s = Spine(manifest) 328 for path in paths: 329 try: 330 s.append(Spine.Item(s.manifest.id_for_path, path, is_path=True)) 331 except: 332 continue 333 return s 334 335 def __init__(self, manifest): 336 ResourceCollection.__init__(self) 337 self.manifest = manifest 338 339 def replace(self, start, end, ids): 340 ''' 341 Replace the items between start (inclusive) and end (not inclusive) with 342 with the items identified by ids. ids can be a list of any length. 343 ''' 344 items = [] 345 for id in ids: 346 path = self.manifest.path_for_id(id) 347 if path is None: 348 raise ValueError('id %s not in manifest') 349 items.append(Spine.Item(lambda x: id, path, is_path=True)) 350 ResourceCollection.replace(start, end, items) 351 352 def linear_items(self): 353 for r in self: 354 if r.is_linear: 355 yield r.path 356 357 def nonlinear_items(self): 358 for r in self: 359 if not r.is_linear: 360 yield r.path 361 362 def items(self): 363 for i in self: 364 yield i.path 365 366# }}} 367 368 369class Guide(ResourceCollection): # {{{ 370 371 class Reference(Resource): 372 373 @staticmethod 374 def from_opf_resource_item(ref, basedir): 375 title, href, type = ref.get('title', ''), ref.get('href'), ref.get('type') 376 res = Guide.Reference(href, basedir, is_path=True) 377 res.title = title 378 res.type = type 379 return res 380 381 def __repr__(self): 382 ans = '<reference type="%s" href="%s" '%(self.type, self.href()) 383 if self.title: 384 ans += 'title="%s" '%self.title 385 return ans + '/>' 386 387 @staticmethod 388 def from_opf_guide(references, base_dir=os.getcwd()): 389 coll = Guide() 390 for ref in references: 391 try: 392 ref = Guide.Reference.from_opf_resource_item(ref, base_dir) 393 coll.append(ref) 394 except: 395 continue 396 return coll 397 398 def set_cover(self, path): 399 for i in tuple(self): 400 if 'cover' in i.type.lower(): 401 self.remove(i) 402 for typ in ('cover', 'other.ms-coverimage-standard', 'other.ms-coverimage'): 403 self.append(Guide.Reference(path, is_path=True)) 404 self[-1].type = typ 405 self[-1].title = '' 406 407# }}} 408 409 410class MetadataField: 411 412 def __init__(self, name, is_dc=True, formatter=None, none_is=None, 413 renderer=lambda x: str(x)): 414 self.name = name 415 self.is_dc = is_dc 416 self.formatter = formatter 417 self.none_is = none_is 418 self.renderer = renderer 419 420 def __real_get__(self, obj, type=None): 421 ans = obj.get_metadata_element(self.name) 422 if ans is None: 423 return None 424 ans = obj.get_text(ans) 425 if ans is None: 426 return ans 427 if self.formatter is not None: 428 try: 429 ans = self.formatter(ans) 430 except: 431 return None 432 if hasattr(ans, 'strip'): 433 ans = ans.strip() 434 return ans 435 436 def __get__(self, obj, type=None): 437 ans = self.__real_get__(obj, type) 438 if ans is None: 439 ans = self.none_is 440 return ans 441 442 def __set__(self, obj, val): 443 elem = obj.get_metadata_element(self.name) 444 if val is None: 445 if elem is not None: 446 elem.getparent().remove(elem) 447 return 448 if elem is None: 449 elem = obj.create_metadata_element(self.name, is_dc=self.is_dc) 450 obj.set_text(elem, self.renderer(val)) 451 452 453class TitleSortField(MetadataField): 454 455 def __get__(self, obj, type=None): 456 c = self.__real_get__(obj, type) 457 if c is None: 458 matches = obj.title_path(obj.metadata) 459 if matches: 460 for match in matches: 461 ans = match.get('{%s}file-as'%obj.NAMESPACES['opf'], None) 462 if not ans: 463 ans = match.get('file-as', None) 464 if ans: 465 c = ans 466 if not c: 467 c = self.none_is 468 else: 469 c = c.strip() 470 return c 471 472 def __set__(self, obj, val): 473 MetadataField.__set__(self, obj, val) 474 matches = obj.title_path(obj.metadata) 475 if matches: 476 for match in matches: 477 for attr in list(match.attrib): 478 if attr.endswith('file-as'): 479 del match.attrib[attr] 480 481 482def serialize_user_metadata(metadata_elem, all_user_metadata, tail='\n'+(' '*8)): 483 from calibre.utils.config import to_json 484 from calibre.ebooks.metadata.book.json_codec import (object_to_unicode, 485 encode_is_multiple) 486 487 for name, fm in all_user_metadata.items(): 488 try: 489 fm = copy.copy(fm) 490 encode_is_multiple(fm) 491 fm = object_to_unicode(fm) 492 fm = json.dumps(fm, default=to_json, ensure_ascii=False) 493 except: 494 prints('Failed to write user metadata:', name) 495 import traceback 496 traceback.print_exc() 497 continue 498 meta = metadata_elem.makeelement('meta') 499 meta.set('name', 'calibre:user_metadata:'+name) 500 meta.set('content', fm) 501 meta.tail = tail 502 metadata_elem.append(meta) 503 504 505def serialize_annotations(metadata_elem, annotations, tail='\n'+(' '*8)): 506 for item in annotations: 507 data = json.dumps(item, ensure_ascii=False) 508 if isinstance(data, bytes): 509 data = data.decode('utf-8') 510 meta = metadata_elem.makeelement('meta') 511 meta.set('name', 'calibre:annotation') 512 meta.set('content', data) 513 meta.tail = tail 514 metadata_elem.append(meta) 515 516 517def dump_dict(cats): 518 if not cats: 519 cats = {} 520 from calibre.ebooks.metadata.book.json_codec import object_to_unicode 521 return json.dumps(object_to_unicode(cats), ensure_ascii=False, 522 skipkeys=True) 523 524 525class OPF: # {{{ 526 527 MIMETYPE = 'application/oebps-package+xml' 528 NAMESPACES = { 529 None: "http://www.idpf.org/2007/opf", 530 'dc': "http://purl.org/dc/elements/1.1/", 531 'opf': "http://www.idpf.org/2007/opf", 532 } 533 META = '{%s}meta' % NAMESPACES['opf'] 534 xpn = NAMESPACES.copy() 535 xpn.pop(None) 536 xpn['re'] = 'http://exslt.org/regular-expressions' 537 XPath = functools.partial(etree.XPath, namespaces=xpn) 538 CONTENT = XPath('self::*[re:match(name(), "meta$", "i")]/@content') 539 TEXT = XPath('string()') 540 541 metadata_path = XPath('descendant::*[re:match(name(), "metadata", "i")]') 542 metadata_elem_path = XPath( 543 'descendant::*[re:match(name(), concat($name, "$"), "i") or (re:match(name(), "meta$", "i") ' 544 'and re:match(@name, concat("^calibre:", $name, "$"), "i"))]') 545 title_path = XPath('descendant::*[re:match(name(), "title", "i")]') 546 authors_path = XPath('descendant::*[re:match(name(), "creator", "i") and (@role="aut" or @opf:role="aut" or (not(@role) and not(@opf:role)))]') 547 editors_path = XPath('descendant::*[re:match(name(), "creator", "i") and (@role="edt" or @opf:role="edt")]') 548 bkp_path = XPath('descendant::*[re:match(name(), "contributor", "i") and (@role="bkp" or @opf:role="bkp")]') 549 tags_path = XPath('descendant::*[re:match(name(), "subject", "i")]') 550 isbn_path = XPath('descendant::*[re:match(name(), "identifier", "i") and ' 551 '(re:match(@scheme, "isbn", "i") or re:match(@opf:scheme, "isbn", "i"))]') 552 pubdate_path = XPath('descendant::*[re:match(name(), "date", "i")]') 553 raster_cover_path = XPath('descendant::*[re:match(name(), "meta", "i") and ' 554 're:match(@name, "cover", "i") and @content]') 555 guide_cover_path = XPath('descendant::*[local-name()="guide"]/*[local-name()="reference" and re:match(@type, "cover", "i")]/@href') 556 identifier_path = XPath('descendant::*[re:match(name(), "identifier", "i")]') 557 application_id_path = XPath('descendant::*[re:match(name(), "identifier", "i") and ' 558 '(re:match(@opf:scheme, "calibre|libprs500", "i") or re:match(@scheme, "calibre|libprs500", "i"))]') 559 uuid_id_path = XPath('descendant::*[re:match(name(), "identifier", "i") and ' 560 '(re:match(@opf:scheme, "uuid", "i") or re:match(@scheme, "uuid", "i"))]') 561 languages_path = XPath('descendant::*[local-name()="language"]') 562 563 manifest_path = XPath('descendant::*[re:match(name(), "manifest", "i")]/*[re:match(name(), "item", "i")]') 564 manifest_ppath = XPath('descendant::*[re:match(name(), "manifest", "i")]') 565 spine_path = XPath('descendant::*[re:match(name(), "spine", "i")]/*[re:match(name(), "itemref", "i")]') 566 guide_path = XPath('descendant::*[re:match(name(), "guide", "i")]/*[re:match(name(), "reference", "i")]') 567 568 publisher = MetadataField('publisher') 569 comments = MetadataField('description') 570 category = MetadataField('type') 571 rights = MetadataField('rights') 572 series = MetadataField('series', is_dc=False) 573 if tweaks['use_series_auto_increment_tweak_when_importing']: 574 series_index = MetadataField('series_index', is_dc=False, 575 formatter=float, none_is=None) 576 else: 577 series_index = MetadataField('series_index', is_dc=False, 578 formatter=float, none_is=1) 579 title_sort = TitleSortField('title_sort', is_dc=False) 580 rating = MetadataField('rating', is_dc=False, formatter=float) 581 publication_type = MetadataField('publication_type', is_dc=False) 582 timestamp = MetadataField('timestamp', is_dc=False, 583 formatter=parse_date, renderer=isoformat) 584 user_categories = MetadataField('user_categories', is_dc=False, 585 formatter=json.loads, 586 renderer=dump_dict) 587 author_link_map = MetadataField('author_link_map', is_dc=False, 588 formatter=json.loads, renderer=dump_dict) 589 590 def __init__(self, stream, basedir=os.getcwd(), unquote_urls=True, 591 populate_spine=True, try_to_guess_cover=True, preparsed_opf=None, read_toc=True): 592 self.try_to_guess_cover = try_to_guess_cover 593 self.basedir = self.base_dir = basedir 594 self.path_to_html_toc = self.html_toc_fragment = None 595 self.root = parse_opf(stream) if preparsed_opf is None else preparsed_opf 596 try: 597 self.package_version = float(self.root.get('version', None)) 598 except (AttributeError, TypeError, ValueError): 599 self.package_version = 0 600 self.metadata = self.metadata_path(self.root) 601 if not self.metadata: 602 self.metadata = [self.root.makeelement('{http://www.idpf.org/2007/opf}metadata')] 603 self.root.insert(0, self.metadata[0]) 604 self.metadata[0].tail = '\n' 605 self.metadata = self.metadata[0] 606 if unquote_urls: 607 self.unquote_urls() 608 self.manifest = Manifest() 609 m = self.manifest_path(self.root) 610 if m: 611 self.manifest = Manifest.from_opf_manifest_element(m, basedir) 612 self.spine = None 613 s = self.spine_path(self.root) 614 if populate_spine and s: 615 self.spine = Spine.from_opf_spine_element(s, self.manifest) 616 self.guide = None 617 guide = self.guide_path(self.root) 618 self.guide = Guide.from_opf_guide(guide, basedir) if guide else None 619 self.cover_data = (None, None) 620 if read_toc: 621 self.find_toc() 622 else: 623 self.toc = None 624 self.read_user_metadata() 625 626 def read_user_metadata(self): 627 self._user_metadata_ = {} 628 temp = Metadata('x', ['x']) 629 from calibre.utils.config import from_json 630 from calibre.ebooks.metadata.book.json_codec import decode_is_multiple 631 elems = self.root.xpath('//*[name() = "meta" and starts-with(@name,' 632 '"calibre:user_metadata:") and @content]') 633 for elem in elems: 634 name = elem.get('name') 635 name = ':'.join(name.split(':')[2:]) 636 if not name or not name.startswith('#'): 637 continue 638 fm = elem.get('content') 639 try: 640 fm = json.loads(fm, object_hook=from_json) 641 decode_is_multiple(fm) 642 temp.set_user_metadata(name, fm) 643 except: 644 prints('Failed to read user metadata:', name) 645 import traceback 646 traceback.print_exc() 647 continue 648 self._user_metadata_ = temp.get_all_user_metadata(True) 649 650 def to_book_metadata(self): 651 if self.package_version >= 3.0: 652 from calibre.ebooks.metadata.opf3 import read_metadata 653 return read_metadata(self.root) 654 ans = MetaInformation(self) 655 for n, v in self._user_metadata_.items(): 656 ans.set_user_metadata(n, v) 657 658 ans.set_identifiers(self.get_identifiers()) 659 660 return ans 661 662 def read_annotations(self): 663 for elem in self.root.xpath('//*[name() = "meta" and @name = "calibre:annotation" and @content]'): 664 try: 665 yield json.loads(elem.get('content')) 666 except Exception: 667 pass 668 669 def write_user_metadata(self): 670 elems = self.root.xpath('//*[name() = "meta" and starts-with(@name,' 671 '"calibre:user_metadata:") and @content]') 672 for elem in elems: 673 elem.getparent().remove(elem) 674 serialize_user_metadata(self.metadata, 675 self._user_metadata_) 676 677 def find_toc(self): 678 self.toc = None 679 try: 680 spine = self.XPath('descendant::*[re:match(name(), "spine", "i")]')(self.root) 681 toc = None 682 if spine: 683 spine = spine[0] 684 toc = spine.get('toc', None) 685 if toc is None and self.guide: 686 for item in self.guide: 687 if item.type and item.type.lower() == 'toc': 688 toc = item.path 689 if toc is None: 690 for item in self.manifest: 691 if 'toc' in item.href().lower(): 692 toc = item.path 693 if toc is None: 694 return 695 self.toc = TOC(base_path=self.base_dir) 696 is_ncx = getattr(self, 'manifest', None) is not None and \ 697 self.manifest.type_for_id(toc) is not None and \ 698 'dtbncx' in self.manifest.type_for_id(toc) 699 if is_ncx or toc.lower() in ('ncx', 'ncxtoc'): 700 path = self.manifest.path_for_id(toc) 701 if path: 702 self.toc.read_ncx_toc(path) 703 else: 704 f = glob.glob(os.path.join(self.base_dir, '*.ncx')) 705 if f: 706 self.toc.read_ncx_toc(f[0]) 707 else: 708 self.path_to_html_toc, self.html_toc_fragment = \ 709 toc.partition('#')[0], toc.partition('#')[-1] 710 if not os.access(self.path_to_html_toc, os.R_OK) or \ 711 not os.path.isfile(self.path_to_html_toc): 712 self.path_to_html_toc = None 713 self.toc.read_html_toc(toc) 714 except: 715 pass 716 717 def get_text(self, elem): 718 return ''.join(self.CONTENT(elem) or self.TEXT(elem)) 719 720 def set_text(self, elem, content): 721 if elem.tag == self.META: 722 elem.attrib['content'] = content 723 else: 724 elem.text = content 725 726 def itermanifest(self): 727 return self.manifest_path(self.root) 728 729 def create_manifest_item(self, href, media_type, append=False): 730 ids = {i.get('id', None) for i in self.itermanifest()} 731 manifest_id = 'id1' 732 c = 1 733 while manifest_id in ids: 734 c += 1 735 manifest_id = 'id%d'%c 736 if not media_type: 737 media_type = 'application/xhtml+xml' 738 ans = etree.Element('{%s}item'%self.NAMESPACES['opf'], 739 attrib={'id':manifest_id, 'href':href, 'media-type':media_type}) 740 ans.tail = '\n\t\t' 741 if append: 742 manifest = self.manifest_ppath(self.root)[0] 743 manifest.append(ans) 744 return ans 745 746 def replace_manifest_item(self, item, items): 747 items = [self.create_manifest_item(*i) for i in items] 748 for i, item2 in enumerate(items): 749 item2.set('id', item.get('id')+'.%d'%(i+1)) 750 manifest = item.getparent() 751 index = manifest.index(item) 752 manifest[index:index+1] = items 753 return [i.get('id') for i in items] 754 755 def iterspine(self): 756 return self.spine_path(self.root) 757 758 def spine_items(self): 759 for item in self.iterspine(): 760 idref = item.get('idref', '') 761 for x in self.itermanifest(): 762 if x.get('id', None) == idref: 763 yield x.get('href', '') 764 765 def first_spine_item(self): 766 items = self.iterspine() 767 if not items: 768 return None 769 idref = items[0].get('idref', '') 770 for x in self.itermanifest(): 771 if x.get('id', None) == idref: 772 return x.get('href', None) 773 774 def create_spine_item(self, idref): 775 ans = etree.Element('{%s}itemref'%self.NAMESPACES['opf'], idref=idref) 776 ans.tail = '\n\t\t' 777 return ans 778 779 def replace_spine_items_by_idref(self, idref, new_idrefs): 780 items = list(map(self.create_spine_item, new_idrefs)) 781 spine = self.XPath('/opf:package/*[re:match(name(), "spine", "i")]')(self.root)[0] 782 old = [i for i in self.iterspine() if i.get('idref', None) == idref] 783 for x in old: 784 i = spine.index(x) 785 spine[i:i+1] = items 786 787 def create_guide_element(self): 788 e = etree.SubElement(self.root, '{%s}guide'%self.NAMESPACES['opf']) 789 e.text = '\n ' 790 e.tail = '\n' 791 return e 792 793 def remove_guide(self): 794 self.guide = None 795 for g in self.root.xpath('./*[re:match(name(), "guide", "i")]', namespaces={'re':'http://exslt.org/regular-expressions'}): 796 self.root.remove(g) 797 798 def create_guide_item(self, type, title, href): 799 e = etree.Element('{%s}reference'%self.NAMESPACES['opf'], 800 type=type, title=title, href=href) 801 e.tail='\n' 802 return e 803 804 def add_guide_item(self, type, title, href): 805 g = self.root.xpath('./*[re:match(name(), "guide", "i")]', namespaces={'re':'http://exslt.org/regular-expressions'})[0] 806 g.append(self.create_guide_item(type, title, href)) 807 808 def iterguide(self): 809 return self.guide_path(self.root) 810 811 def unquote_urls(self): 812 def get_href(item): 813 raw = unquote(item.get('href', '')) 814 if not isinstance(raw, str): 815 raw = raw.decode('utf-8') 816 return raw 817 for item in self.itermanifest(): 818 item.set('href', get_href(item)) 819 for item in self.iterguide(): 820 item.set('href', get_href(item)) 821 822 @property 823 def title(self): 824 # TODO: Add support for EPUB 3 refinements 825 826 for elem in self.title_path(self.metadata): 827 title = self.get_text(elem) 828 if title and title.strip(): 829 return re.sub(r'\s+', ' ', title.strip()) 830 831 @title.setter 832 def title(self, val): 833 val = (val or '').strip() 834 titles = self.title_path(self.metadata) 835 if self.package_version < 3: 836 # EPUB 3 allows multiple title elements containing sub-titles, 837 # series and other things. We all loooove EPUB 3. 838 for title in titles: 839 title.getparent().remove(title) 840 titles = () 841 if val: 842 title = titles[0] if titles else self.create_metadata_element('title') 843 title.text = re.sub(r'\s+', ' ', str(val)) 844 845 @property 846 def authors(self): 847 ans = [] 848 for elem in self.authors_path(self.metadata): 849 ans.extend(string_to_authors(self.get_text(elem))) 850 if not ans: 851 for elem in self.editors_path(self.metadata): 852 ans.extend(string_to_authors(self.get_text(elem))) 853 return ans 854 855 @authors.setter 856 def authors(self, val): 857 remove = list(self.authors_path(self.metadata)) or list(self.editors_path(self.metadata)) 858 for elem in remove: 859 elem.getparent().remove(elem) 860 # Ensure new author element is at the top of the list 861 # for broken implementations that always use the first 862 # <dc:creator> element with no attention to the role 863 for author in reversed(val): 864 elem = self.metadata.makeelement('{%s}creator'% 865 self.NAMESPACES['dc'], nsmap=self.NAMESPACES) 866 elem.tail = '\n' 867 self.metadata.insert(0, elem) 868 elem.set('{%s}role'%self.NAMESPACES['opf'], 'aut') 869 self.set_text(elem, author.strip()) 870 871 @property 872 def author_sort(self): 873 matches = self.authors_path(self.metadata) or self.editors_path(self.metadata) 874 if matches: 875 for match in matches: 876 ans = match.get('{%s}file-as'%self.NAMESPACES['opf']) or match.get('file-as') 877 if ans: 878 return ans 879 880 @author_sort.setter 881 def author_sort(self, val): 882 matches = self.authors_path(self.metadata) or self.editors_path(self.metadata) 883 if matches: 884 for key in matches[0].attrib: 885 if key.endswith('file-as'): 886 matches[0].attrib.pop(key) 887 matches[0].set('{%s}file-as'%self.NAMESPACES['opf'], str(val)) 888 889 @property 890 def tags(self): 891 ans = [] 892 for tag in self.tags_path(self.metadata): 893 text = self.get_text(tag) 894 if text and text.strip(): 895 ans.extend([x.strip() for x in text.split(',')]) 896 return ans 897 898 @tags.setter 899 def tags(self, val): 900 for tag in list(self.tags_path(self.metadata)): 901 tag.getparent().remove(tag) 902 for tag in val: 903 elem = self.create_metadata_element('subject') 904 self.set_text(elem, str(tag)) 905 906 @property 907 def pubdate(self): 908 ans = None 909 for match in self.pubdate_path(self.metadata): 910 try: 911 val = parse_date(etree.tostring(match, encoding='unicode', 912 method='text', with_tail=False).strip()) 913 except: 914 continue 915 if ans is None or val < ans: 916 ans = val 917 return ans 918 919 @pubdate.setter 920 def pubdate(self, val): 921 least_val = least_elem = None 922 for match in self.pubdate_path(self.metadata): 923 try: 924 cval = parse_date(etree.tostring(match, encoding='unicode', 925 method='text', with_tail=False).strip()) 926 except: 927 match.getparent().remove(match) 928 else: 929 if not val: 930 match.getparent().remove(match) 931 if least_val is None or cval < least_val: 932 least_val, least_elem = cval, match 933 934 if val: 935 if least_val is None: 936 least_elem = self.create_metadata_element('date') 937 938 least_elem.attrib.clear() 939 least_elem.text = isoformat(val) 940 941 @property 942 def isbn(self): 943 for match in self.isbn_path(self.metadata): 944 return self.get_text(match) or None 945 946 @isbn.setter 947 def isbn(self, val): 948 uuid_id = None 949 for attr in self.root.attrib: 950 if attr.endswith('unique-identifier'): 951 uuid_id = self.root.attrib[attr] 952 break 953 954 matches = self.isbn_path(self.metadata) 955 if not val: 956 for x in matches: 957 xid = x.get('id', None) 958 is_package_identifier = uuid_id is not None and uuid_id == xid 959 if is_package_identifier: 960 self.set_text(x, str(uuid.uuid4())) 961 for attr in x.attrib: 962 if attr.endswith('scheme'): 963 x.attrib[attr] = 'uuid' 964 else: 965 x.getparent().remove(x) 966 return 967 if not matches: 968 attrib = {'{%s}scheme'%self.NAMESPACES['opf']: 'ISBN'} 969 matches = [self.create_metadata_element('identifier', 970 attrib=attrib)] 971 self.set_text(matches[0], str(val)) 972 973 def get_identifiers(self): 974 identifiers = {} 975 schemeless = [] 976 for x in self.XPath( 977 'descendant::*[local-name() = "identifier" and text()]')( 978 self.metadata): 979 found_scheme = False 980 for attr, val in iteritems(x.attrib): 981 if attr.endswith('scheme'): 982 typ = icu_lower(val) 983 val = etree.tostring(x, with_tail=False, encoding='unicode', 984 method='text').strip() 985 if val and typ not in ('calibre', 'uuid'): 986 if typ == 'isbn' and val.lower().startswith('urn:isbn:'): 987 val = val[len('urn:isbn:'):] 988 identifiers[typ] = val 989 found_scheme = True 990 break 991 if not found_scheme: 992 val = etree.tostring(x, with_tail=False, encoding='unicode', 993 method='text').strip() 994 if val.lower().startswith('urn:isbn:'): 995 val = check_isbn(val.split(':')[-1]) 996 if val is not None: 997 identifiers['isbn'] = val 998 else: 999 schemeless.append(val) 1000 1001 if schemeless and 'isbn' not in identifiers: 1002 for val in schemeless: 1003 if check_isbn(val, simple_sanitize=True) is not None: 1004 identifiers['isbn'] = check_isbn(val) 1005 break 1006 1007 return identifiers 1008 1009 def set_identifiers(self, identifiers): 1010 identifiers = identifiers.copy() 1011 uuid_id = None 1012 for attr in self.root.attrib: 1013 if attr.endswith('unique-identifier'): 1014 uuid_id = self.root.attrib[attr] 1015 break 1016 1017 for x in self.XPath( 1018 'descendant::*[local-name() = "identifier"]')( 1019 self.metadata): 1020 xid = x.get('id', None) 1021 is_package_identifier = uuid_id is not None and uuid_id == xid 1022 typ = {val.lower() for attr, val in iteritems(x.attrib) if attr.endswith('scheme')} 1023 if is_package_identifier: 1024 typ = tuple(typ) 1025 if typ and typ[0] in identifiers: 1026 self.set_text(x, identifiers.pop(typ[0])) 1027 continue 1028 if typ and not (typ & {'calibre', 'uuid'}): 1029 x.getparent().remove(x) 1030 1031 for typ, val in iteritems(identifiers): 1032 attrib = {'{%s}scheme'%self.NAMESPACES['opf']: typ.upper()} 1033 self.set_text(self.create_metadata_element( 1034 'identifier', attrib=attrib), str(val)) 1035 1036 @property 1037 def application_id(self): 1038 for match in self.application_id_path(self.metadata): 1039 return self.get_text(match) or None 1040 1041 @application_id.setter 1042 def application_id(self, val): 1043 removed_ids = set() 1044 for x in tuple(self.application_id_path(self.metadata)): 1045 removed_ids.add(x.get('id', None)) 1046 x.getparent().remove(x) 1047 1048 uuid_id = None 1049 for attr in self.root.attrib: 1050 if attr.endswith('unique-identifier'): 1051 uuid_id = self.root.attrib[attr] 1052 break 1053 attrib = {'{%s}scheme'%self.NAMESPACES['opf']: 'calibre'} 1054 if uuid_id and uuid_id in removed_ids: 1055 attrib['id'] = uuid_id 1056 self.set_text(self.create_metadata_element( 1057 'identifier', attrib=attrib), str(val)) 1058 1059 @property 1060 def uuid(self): 1061 for match in self.uuid_id_path(self.metadata): 1062 return self.get_text(match) or None 1063 1064 @uuid.setter 1065 def uuid(self, val): 1066 matches = self.uuid_id_path(self.metadata) 1067 if not matches: 1068 attrib = {'{%s}scheme'%self.NAMESPACES['opf']: 'uuid'} 1069 matches = [self.create_metadata_element('identifier', 1070 attrib=attrib)] 1071 self.set_text(matches[0], str(val)) 1072 1073 @property 1074 def language(self): 1075 ans = self.languages 1076 if ans: 1077 return ans[0] 1078 1079 @language.setter 1080 def language(self, val): 1081 self.languages = [val] 1082 1083 @property 1084 def languages(self): 1085 ans = [] 1086 for match in self.languages_path(self.metadata): 1087 t = self.get_text(match) 1088 if t and t.strip(): 1089 l = canonicalize_lang(t.strip()) 1090 if l: 1091 ans.append(l) 1092 return ans 1093 1094 @languages.setter 1095 def languages(self, val): 1096 matches = self.languages_path(self.metadata) 1097 for x in matches: 1098 x.getparent().remove(x) 1099 1100 for lang in val: 1101 l = self.create_metadata_element('language') 1102 self.set_text(l, str(lang)) 1103 1104 @property 1105 def raw_languages(self): 1106 for match in self.languages_path(self.metadata): 1107 t = self.get_text(match) 1108 if t and t.strip(): 1109 yield t.strip() 1110 1111 @property 1112 def book_producer(self): 1113 for match in self.bkp_path(self.metadata): 1114 return self.get_text(match) or None 1115 1116 @book_producer.setter 1117 def book_producer(self, val): 1118 matches = self.bkp_path(self.metadata) 1119 if not matches: 1120 matches = [self.create_metadata_element('contributor')] 1121 matches[0].set('{%s}role'%self.NAMESPACES['opf'], 'bkp') 1122 self.set_text(matches[0], str(val)) 1123 1124 def identifier_iter(self): 1125 yield from self.identifier_path(self.metadata) 1126 1127 @property 1128 def raw_unique_identifier(self): 1129 uuid_elem = None 1130 for attr in self.root.attrib: 1131 if attr.endswith('unique-identifier'): 1132 uuid_elem = self.root.attrib[attr] 1133 break 1134 if uuid_elem: 1135 matches = self.root.xpath('//*[@id=%s]'%escape_xpath_attr(uuid_elem)) 1136 if matches: 1137 for m in matches: 1138 raw = m.text 1139 if raw: 1140 return raw 1141 1142 @property 1143 def unique_identifier(self): 1144 raw = self.raw_unique_identifier 1145 if raw: 1146 return raw.rpartition(':')[-1] 1147 1148 @property 1149 def page_progression_direction(self): 1150 spine = self.XPath('descendant::*[re:match(name(), "spine", "i")][1]')(self.root) 1151 if spine: 1152 for k, v in iteritems(spine[0].attrib): 1153 if k == 'page-progression-direction' or k.endswith('}page-progression-direction'): 1154 return v 1155 1156 @property 1157 def primary_writing_mode(self): 1158 for m in self.XPath('//*[local-name()="meta" and @name="primary-writing-mode" and @content]')(self.root): 1159 return m.get('content') 1160 1161 def guess_cover(self): 1162 ''' 1163 Try to guess a cover. Needed for some old/badly formed OPF files. 1164 ''' 1165 if self.base_dir and os.path.exists(self.base_dir): 1166 for item in self.identifier_path(self.metadata): 1167 scheme = None 1168 for key in item.attrib.keys(): 1169 if key.endswith('scheme'): 1170 scheme = item.get(key) 1171 break 1172 if scheme is None: 1173 continue 1174 if item.text: 1175 prefix = item.text.replace('-', '') 1176 for suffix in ['.jpg', '.jpeg', '.gif', '.png', '.bmp']: 1177 cpath = os.access(os.path.join(self.base_dir, prefix+suffix), os.R_OK) 1178 if os.access(os.path.join(self.base_dir, prefix+suffix), os.R_OK): 1179 return cpath 1180 1181 @property 1182 def epub3_raster_cover(self): 1183 for item in self.itermanifest(): 1184 props = set((item.get('properties') or '').lower().split()) 1185 if 'cover-image' in props: 1186 mt = item.get('media-type', '') 1187 if mt and 'xml' not in mt and 'html' not in mt: 1188 return item.get('href', None) 1189 1190 @property 1191 def raster_cover(self): 1192 covers = self.raster_cover_path(self.metadata) 1193 if covers: 1194 cover_id = covers[0].get('content') 1195 for item in self.itermanifest(): 1196 if item.get('id', None) == cover_id: 1197 mt = item.get('media-type', '') 1198 if mt and 'xml' not in mt and 'html' not in mt: 1199 return item.get('href', None) 1200 for item in self.itermanifest(): 1201 if item.get('href', None) == cover_id: 1202 mt = item.get('media-type', '') 1203 if mt and 'xml' not in mt and 'html' not in mt: 1204 return item.get('href', None) 1205 elif self.package_version >= 3.0: 1206 return self.epub3_raster_cover 1207 1208 @property 1209 def guide_raster_cover(self): 1210 covers = self.guide_cover_path(self.root) 1211 if covers: 1212 mt_map = {i.get('href'):i for i in self.itermanifest()} 1213 for href in covers: 1214 if href: 1215 i = mt_map.get(href) 1216 if i is not None: 1217 iid, mt = i.get('id'), i.get('media-type') 1218 if iid and mt and mt.lower() in {'image/png', 'image/jpeg', 'image/jpg', 'image/gif'}: 1219 return i 1220 1221 @property 1222 def epub3_nav(self): 1223 if self.package_version >= 3.0: 1224 for item in self.itermanifest(): 1225 props = (item.get('properties') or '').lower().split() 1226 if 'nav' in props: 1227 mt = item.get('media-type') or '' 1228 if 'html' in mt.lower(): 1229 mid = item.get('id') 1230 if mid: 1231 path = self.manifest.path_for_id(mid) 1232 if path and os.path.exists(path): 1233 return path 1234 1235 @property 1236 def cover(self): 1237 if self.guide is not None: 1238 for t in ('cover', 'other.ms-coverimage-standard', 'other.ms-coverimage'): 1239 for item in self.guide: 1240 if item.type and item.type.lower() == t: 1241 return item.path 1242 try: 1243 if self.try_to_guess_cover: 1244 return self.guess_cover() 1245 except: 1246 pass 1247 1248 @cover.setter 1249 def cover(self, path): 1250 if self.guide is not None: 1251 self.guide.set_cover(path) 1252 for item in list(self.iterguide()): 1253 if 'cover' in item.get('type', ''): 1254 item.getparent().remove(item) 1255 1256 else: 1257 g = self.create_guide_element() 1258 self.guide = Guide() 1259 self.guide.set_cover(path) 1260 etree.SubElement(g, 'opf:reference', nsmap=self.NAMESPACES, 1261 attrib={'type':'cover', 'href':self.guide[-1].href()}) 1262 id = self.manifest.id_for_path(self.cover) 1263 if id is None: 1264 for t in ('cover', 'other.ms-coverimage-standard', 'other.ms-coverimage'): 1265 for item in self.guide: 1266 if item.type.lower() == t: 1267 self.create_manifest_item(item.href(), guess_type(path)[0]) 1268 1269 def get_metadata_element(self, name): 1270 matches = self.metadata_elem_path(self.metadata, name=name) 1271 if matches: 1272 return matches[-1] 1273 1274 def create_metadata_element(self, name, attrib=None, is_dc=True): 1275 if is_dc: 1276 name = '{%s}%s' % (self.NAMESPACES['dc'], name) 1277 else: 1278 attrib = attrib or {} 1279 attrib['name'] = 'calibre:' + name 1280 name = '{%s}%s' % (self.NAMESPACES['opf'], 'meta') 1281 nsmap = dict(self.NAMESPACES) 1282 del nsmap['opf'] 1283 elem = etree.SubElement(self.metadata, name, attrib=attrib, 1284 nsmap=nsmap) 1285 elem.tail = '\n' 1286 return elem 1287 1288 def render(self, encoding='utf-8'): 1289 for meta in self.raster_cover_path(self.metadata): 1290 # Ensure that the name attribute occurs before the content 1291 # attribute. Needed for Nooks. 1292 a = meta.attrib 1293 c = a.get('content', None) 1294 if c is not None: 1295 del a['content'] 1296 a['content'] = c 1297 # The PocketBook requires calibre:series_index to come after 1298 # calibre:series or it fails to read series info 1299 # We swap attributes instead of elements, as that avoids namespace 1300 # re-declarations 1301 smap = {} 1302 for child in self.metadata.xpath('./*[@name="calibre:series" or @name="calibre:series_index"]'): 1303 smap[child.get('name')] = (child, self.metadata.index(child)) 1304 if len(smap) == 2 and smap['calibre:series'][1] > smap['calibre:series_index'][1]: 1305 s, si = smap['calibre:series'][0], smap['calibre:series_index'][0] 1306 1307 def swap(attr): 1308 t = s.get(attr, '') 1309 s.set(attr, si.get(attr, '')), si.set(attr, t) 1310 swap('name'), swap('content') 1311 1312 self.write_user_metadata() 1313 if pretty_print_opf: 1314 _pretty_print(self.root) 1315 raw = etree.tostring(self.root, encoding=encoding, pretty_print=True) 1316 if not raw.lstrip().startswith(b'<?xml '): 1317 raw = ('<?xml version="1.0" encoding="%s"?>\n'%encoding.upper()).encode('ascii') + raw 1318 return raw 1319 1320 def smart_update(self, mi, replace_metadata=False, apply_null=False): 1321 for attr in ('title', 'authors', 'author_sort', 'title_sort', 1322 'publisher', 'series', 'series_index', 'rating', 1323 'isbn', 'tags', 'category', 'comments', 'book_producer', 1324 'pubdate', 'user_categories', 'author_link_map'): 1325 val = getattr(mi, attr, None) 1326 if attr == 'rating' and val: 1327 val = float(val) 1328 is_null = val is None or val in ((), [], (None, None), {}) or (attr == 'rating' and (not val or val < 0.1)) 1329 if is_null: 1330 if apply_null and attr in {'series', 'tags', 'isbn', 'comments', 'publisher', 'rating'}: 1331 setattr(self, attr, ([] if attr == 'tags' else None)) 1332 else: 1333 setattr(self, attr, val) 1334 langs = getattr(mi, 'languages', []) 1335 if langs == ['und']: 1336 langs = [] 1337 if apply_null or langs: 1338 self.languages = langs or [] 1339 temp = self.to_book_metadata() 1340 temp.remove_stale_user_metadata(mi) 1341 temp.smart_update(mi, replace_metadata=replace_metadata) 1342 if not replace_metadata and callable(getattr(temp, 'custom_field_keys', None)): 1343 # We have to replace non-null fields regardless of the value of 1344 # replace_metadata to match the behavior of the builtin fields 1345 # above. 1346 for x in temp.custom_field_keys(): 1347 meta = temp.get_user_metadata(x, make_copy=True) 1348 if meta is None: 1349 continue 1350 if meta['datatype'] == 'text' and meta['is_multiple']: 1351 val = mi.get(x, []) 1352 if val or apply_null: 1353 temp.set(x, val) 1354 elif meta['datatype'] in {'int', 'float', 'bool'}: 1355 missing = object() 1356 val = mi.get(x, missing) 1357 if val is missing: 1358 if apply_null: 1359 temp.set(x, None) 1360 elif apply_null or val is not None: 1361 temp.set(x, val) 1362 elif apply_null and mi.is_null(x) and not temp.is_null(x): 1363 temp.set(x, None) 1364 1365 self._user_metadata_ = temp.get_all_user_metadata(True) 1366 1367# }}} 1368 1369 1370class OPFCreator(Metadata): 1371 1372 def __init__(self, base_path, other): 1373 ''' 1374 Initialize. 1375 @param base_path: An absolute path to the folder in which this OPF file 1376 will eventually be. This is used by the L{create_manifest} method 1377 to convert paths to files into relative paths. 1378 ''' 1379 Metadata.__init__(self, title='', other=other) 1380 self.base_path = os.path.abspath(base_path) 1381 self.page_progression_direction = None 1382 self.primary_writing_mode = None 1383 if self.application_id is None: 1384 self.application_id = str(uuid.uuid4()) 1385 if not isinstance(self.toc, TOC): 1386 self.toc = None 1387 if not self.authors: 1388 self.authors = [_('Unknown')] 1389 if self.guide is None: 1390 self.guide = Guide() 1391 if self.cover: 1392 self.guide.set_cover(self.cover) 1393 1394 def create_manifest(self, entries): 1395 ''' 1396 Create <manifest> 1397 1398 `entries`: List of (path, mime-type) If mime-type is None it is autodetected 1399 ''' 1400 entries = list(map(lambda x: x if os.path.isabs(x[0]) else 1401 (os.path.abspath(os.path.join(self.base_path, x[0])), x[1]), 1402 entries)) 1403 self.manifest = Manifest.from_paths(entries) 1404 self.manifest.set_basedir(self.base_path) 1405 1406 def create_manifest_from_files_in(self, files_and_dirs, 1407 exclude=lambda x:False): 1408 entries = [] 1409 1410 def dodir(dir): 1411 for spec in os.walk(dir): 1412 root, files = spec[0], spec[-1] 1413 for name in files: 1414 path = os.path.join(root, name) 1415 if os.path.isfile(path) and not exclude(path): 1416 entries.append((path, None)) 1417 1418 for i in files_and_dirs: 1419 if os.path.isdir(i): 1420 dodir(i) 1421 else: 1422 entries.append((i, None)) 1423 1424 self.create_manifest(entries) 1425 1426 def create_spine(self, entries): 1427 ''' 1428 Create the <spine> element. Must first call :method:`create_manifest`. 1429 1430 `entries`: List of paths 1431 ''' 1432 entries = list(map(lambda x: x if os.path.isabs(x) else 1433 os.path.abspath(os.path.join(self.base_path, x)), entries)) 1434 self.spine = Spine.from_paths(entries, self.manifest) 1435 1436 def set_toc(self, toc): 1437 ''' 1438 Set the toc. You must call :method:`create_spine` before calling this 1439 method. 1440 1441 :param toc: A :class:`TOC` object 1442 ''' 1443 self.toc = toc 1444 1445 def create_guide(self, guide_element): 1446 self.guide = Guide.from_opf_guide(guide_element, self.base_path) 1447 self.guide.set_basedir(self.base_path) 1448 1449 def render(self, opf_stream=sys.stdout, ncx_stream=None, 1450 ncx_manifest_entry=None, encoding=None, process_guide=None): 1451 if encoding is None: 1452 encoding = 'utf-8' 1453 toc = getattr(self, 'toc', None) 1454 if self.manifest: 1455 self.manifest.set_basedir(self.base_path) 1456 if ncx_manifest_entry is not None and toc is not None: 1457 if not os.path.isabs(ncx_manifest_entry): 1458 ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry) 1459 remove = [i for i in self.manifest if i.id == 'ncx'] 1460 for item in remove: 1461 self.manifest.remove(item) 1462 self.manifest.append(ManifestItem(ncx_manifest_entry, self.base_path)) 1463 self.manifest[-1].id = 'ncx' 1464 self.manifest[-1].mime_type = 'application/x-dtbncx+xml' 1465 if self.guide is None: 1466 self.guide = Guide() 1467 if self.cover: 1468 cover = self.cover 1469 if not os.path.isabs(cover): 1470 cover = os.path.abspath(os.path.join(self.base_path, cover)) 1471 self.guide.set_cover(cover) 1472 self.guide.set_basedir(self.base_path) 1473 1474 # Actual rendering 1475 from lxml.builder import ElementMaker 1476 from calibre.ebooks.oeb.base import OPF2_NS, DC11_NS, CALIBRE_NS 1477 DNS = OPF2_NS+'___xx___' 1478 E = ElementMaker(namespace=DNS, nsmap={None:DNS}) 1479 M = ElementMaker(namespace=DNS, 1480 nsmap={'dc':DC11_NS, 'calibre':CALIBRE_NS, 'opf':OPF2_NS}) 1481 DC = ElementMaker(namespace=DC11_NS) 1482 1483 def DC_ELEM(tag, text, dc_attrs={}, opf_attrs={}): 1484 if text: 1485 elem = getattr(DC, tag)(clean_ascii_chars(text), **dc_attrs) 1486 else: 1487 elem = getattr(DC, tag)(**dc_attrs) 1488 for k, v in opf_attrs.items(): 1489 elem.set('{%s}%s'%(OPF2_NS, k), v) 1490 return elem 1491 1492 def CAL_ELEM(name, content): 1493 return M.meta(name=name, content=content) 1494 1495 metadata = M.metadata() 1496 a = metadata.append 1497 role = {} 1498 a(DC_ELEM('title', self.title if self.title else _('Unknown'), 1499 opf_attrs=role)) 1500 for i, author in enumerate(self.authors): 1501 fa = {'role':'aut'} 1502 if i == 0 and self.author_sort: 1503 fa['file-as'] = self.author_sort 1504 a(DC_ELEM('creator', author, opf_attrs=fa)) 1505 a(DC_ELEM('contributor', '%s (%s) [%s]'%(__appname__, __version__, 1506 'https://calibre-ebook.com'), opf_attrs={'role':'bkp', 1507 'file-as':__appname__})) 1508 a(DC_ELEM('identifier', str(self.application_id), 1509 opf_attrs={'scheme':__appname__}, 1510 dc_attrs={'id':__appname__+'_id'})) 1511 if getattr(self, 'pubdate', None) is not None: 1512 a(DC_ELEM('date', self.pubdate.isoformat())) 1513 langs = self.languages 1514 if not langs or langs == ['und']: 1515 langs = [get_lang().replace('_', '-').partition('-')[0]] 1516 for lang in langs: 1517 a(DC_ELEM('language', lang)) 1518 if self.comments: 1519 a(DC_ELEM('description', self.comments)) 1520 if self.publisher: 1521 a(DC_ELEM('publisher', self.publisher)) 1522 for key, val in iteritems(self.get_identifiers()): 1523 a(DC_ELEM('identifier', val, opf_attrs={'scheme':icu_upper(key)})) 1524 if self.rights: 1525 a(DC_ELEM('rights', self.rights)) 1526 if self.tags: 1527 for tag in self.tags: 1528 a(DC_ELEM('subject', tag)) 1529 if self.series: 1530 a(CAL_ELEM('calibre:series', self.series)) 1531 if self.series_index is not None: 1532 a(CAL_ELEM('calibre:series_index', self.format_series_index())) 1533 if self.title_sort: 1534 a(CAL_ELEM('calibre:title_sort', self.title_sort)) 1535 if self.rating is not None: 1536 a(CAL_ELEM('calibre:rating', str(self.rating))) 1537 if self.timestamp is not None: 1538 a(CAL_ELEM('calibre:timestamp', self.timestamp.isoformat())) 1539 if self.publication_type is not None: 1540 a(CAL_ELEM('calibre:publication_type', self.publication_type)) 1541 if self.user_categories: 1542 from calibre.ebooks.metadata.book.json_codec import object_to_unicode 1543 a(CAL_ELEM('calibre:user_categories', 1544 json.dumps(object_to_unicode(self.user_categories)))) 1545 if self.primary_writing_mode: 1546 a(M.meta(name='primary-writing-mode', content=self.primary_writing_mode)) 1547 manifest = E.manifest() 1548 if self.manifest is not None: 1549 for ref in self.manifest: 1550 href = ref.href() 1551 if isinstance(href, bytes): 1552 href = href.decode('utf-8') 1553 item = E.item(id=str(ref.id), href=href) 1554 item.set('media-type', ref.mime_type) 1555 manifest.append(item) 1556 spine = E.spine() 1557 if self.toc is not None: 1558 spine.set('toc', 'ncx') 1559 if self.page_progression_direction is not None: 1560 spine.set('page-progression-direction', self.page_progression_direction) 1561 if self.spine is not None: 1562 for ref in self.spine: 1563 if ref.id is not None: 1564 spine.append(E.itemref(idref=ref.id)) 1565 guide = E.guide() 1566 if self.guide is not None: 1567 for ref in self.guide: 1568 href = ref.href() 1569 if isinstance(href, bytes): 1570 href = href.decode('utf-8') 1571 item = E.reference(type=ref.type, href=href) 1572 if ref.title: 1573 item.set('title', ref.title) 1574 guide.append(item) 1575 if process_guide is not None: 1576 process_guide(E, guide) 1577 1578 serialize_user_metadata(metadata, self.get_all_user_metadata(False)) 1579 1580 root = E.package( 1581 metadata, 1582 manifest, 1583 spine, 1584 guide 1585 ) 1586 root.set('unique-identifier', __appname__+'_id') 1587 root.set('version', '2.0') 1588 raw = etree.tostring(root, pretty_print=True, xml_declaration=True, 1589 encoding=encoding) 1590 raw = raw.replace(DNS.encode('utf-8'), OPF2_NS.encode('utf-8')) 1591 opf_stream.write(raw) 1592 opf_stream.flush() 1593 if toc is not None and ncx_stream is not None: 1594 toc.render(ncx_stream, self.application_id) 1595 ncx_stream.flush() 1596 1597 1598def metadata_to_opf(mi, as_string=True, default_lang=None): 1599 from lxml import etree 1600 import textwrap 1601 from calibre.ebooks.oeb.base import OPF, DC 1602 1603 if not mi.application_id: 1604 mi.application_id = str(uuid.uuid4()) 1605 1606 if not mi.uuid: 1607 mi.uuid = str(uuid.uuid4()) 1608 1609 if not mi.book_producer: 1610 mi.book_producer = __appname__ + ' (%s) '%__version__ + \ 1611 '[https://calibre-ebook.com]' 1612 1613 if not mi.languages: 1614 lang = (get_lang().replace('_', '-').partition('-')[0] if default_lang 1615 is None else default_lang) 1616 mi.languages = [lang] 1617 1618 root = safe_xml_fromstring(textwrap.dedent( 1619 ''' 1620 <package xmlns="http://www.idpf.org/2007/opf" unique-identifier="uuid_id" version="2.0"> 1621 <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf"> 1622 <dc:identifier opf:scheme="%(a)s" id="%(a)s_id">%(id)s</dc:identifier> 1623 <dc:identifier opf:scheme="uuid" id="uuid_id">%(uuid)s</dc:identifier> 1624 </metadata> 1625 <guide/> 1626 </package> 1627 '''%dict(a=__appname__, id=mi.application_id, uuid=mi.uuid))) 1628 metadata = root[0] 1629 guide = root[1] 1630 metadata[0].tail = '\n'+(' '*8) 1631 1632 def factory(tag, text=None, sort=None, role=None, scheme=None, name=None, 1633 content=None): 1634 attrib = {} 1635 if sort: 1636 attrib[OPF('file-as')] = sort 1637 if role: 1638 attrib[OPF('role')] = role 1639 if scheme: 1640 attrib[OPF('scheme')] = scheme 1641 if name: 1642 attrib['name'] = name 1643 if content: 1644 attrib['content'] = content 1645 try: 1646 elem = metadata.makeelement(tag, attrib=attrib) 1647 except ValueError: 1648 elem = metadata.makeelement(tag, attrib={k:clean_xml_chars(v) for k, v in iteritems(attrib)}) 1649 elem.tail = '\n'+(' '*8) 1650 if text: 1651 try: 1652 elem.text = text.strip() 1653 except ValueError: 1654 elem.text = clean_ascii_chars(text.strip()) 1655 metadata.append(elem) 1656 1657 factory(DC('title'), mi.title) 1658 for au in mi.authors: 1659 factory(DC('creator'), au, mi.author_sort, 'aut') 1660 factory(DC('contributor'), mi.book_producer, __appname__, 'bkp') 1661 if hasattr(mi.pubdate, 'isoformat'): 1662 factory(DC('date'), isoformat(mi.pubdate)) 1663 if hasattr(mi, 'category') and mi.category: 1664 factory(DC('type'), mi.category) 1665 if mi.comments: 1666 factory(DC('description'), clean_ascii_chars(mi.comments)) 1667 if mi.publisher: 1668 factory(DC('publisher'), mi.publisher) 1669 for key, val in iteritems(mi.get_identifiers()): 1670 factory(DC('identifier'), val, scheme=icu_upper(key)) 1671 if mi.rights: 1672 factory(DC('rights'), mi.rights) 1673 for lang in mi.languages: 1674 if not lang or lang.lower() == 'und': 1675 continue 1676 factory(DC('language'), lang) 1677 if mi.tags: 1678 for tag in mi.tags: 1679 factory(DC('subject'), tag) 1680 meta = lambda n, c: factory('meta', name='calibre:'+n, content=c) 1681 if getattr(mi, 'author_link_map', None) is not None: 1682 meta('author_link_map', dump_dict(mi.author_link_map)) 1683 if mi.series: 1684 meta('series', mi.series) 1685 if mi.series_index is not None: 1686 meta('series_index', mi.format_series_index()) 1687 if mi.rating is not None: 1688 meta('rating', str(mi.rating)) 1689 if hasattr(mi.timestamp, 'isoformat'): 1690 meta('timestamp', isoformat(mi.timestamp)) 1691 if mi.publication_type: 1692 meta('publication_type', mi.publication_type) 1693 if mi.title_sort: 1694 meta('title_sort', mi.title_sort) 1695 if mi.user_categories: 1696 meta('user_categories', dump_dict(mi.user_categories)) 1697 1698 serialize_user_metadata(metadata, mi.get_all_user_metadata(False)) 1699 all_annotations = getattr(mi, 'all_annotations', None) 1700 if all_annotations: 1701 serialize_annotations(metadata, all_annotations) 1702 1703 metadata[-1].tail = '\n' +(' '*4) 1704 1705 if mi.cover: 1706 if not isinstance(mi.cover, str): 1707 mi.cover = mi.cover.decode(filesystem_encoding) 1708 guide.text = '\n'+(' '*8) 1709 r = guide.makeelement(OPF('reference'), 1710 attrib={'type':'cover', 'title':_('Cover'), 'href':mi.cover}) 1711 r.tail = '\n' +(' '*4) 1712 guide.append(r) 1713 if pretty_print_opf: 1714 _pretty_print(root) 1715 1716 return etree.tostring(root, pretty_print=True, encoding='utf-8', 1717 xml_declaration=True) if as_string else root 1718 1719 1720def test_m2o(): 1721 from calibre.utils.date import now as nowf 1722 mi = MetaInformation('test & title', ['a"1', "a'2"]) 1723 mi.title_sort = 'a\'"b' 1724 mi.author_sort = 'author sort' 1725 mi.pubdate = nowf() 1726 mi.language = 'en' 1727 mi.comments = 'what a fun book\n\n' 1728 mi.publisher = 'publisher' 1729 mi.set_identifiers({'isbn':'booo', 'dummy':'dummy'}) 1730 mi.tags = ['a', 'b'] 1731 mi.series = 's"c\'l&<>' 1732 mi.series_index = 3.34 1733 mi.rating = 3 1734 mi.timestamp = nowf() 1735 mi.publication_type = 'ooooo' 1736 mi.rights = 'yes' 1737 mi.cover = os.path.abspath('asd.jpg') 1738 opf = metadata_to_opf(mi) 1739 print(opf) 1740 newmi = MetaInformation(OPF(io.BytesIO(opf))) 1741 for attr in ('author_sort', 'title_sort', 'comments', 1742 'publisher', 'series', 'series_index', 'rating', 1743 'isbn', 'tags', 'cover_data', 'application_id', 1744 'language', 'cover', 1745 'book_producer', 'timestamp', 1746 'pubdate', 'rights', 'publication_type'): 1747 o, n = getattr(mi, attr), getattr(newmi, attr) 1748 if o != n and o.strip() != n.strip(): 1749 print('FAILED:', attr, getattr(mi, attr), '!=', getattr(newmi, attr)) 1750 if mi.get_identifiers() != newmi.get_identifiers(): 1751 print('FAILED:', 'identifiers', mi.get_identifiers(), end=' ') 1752 print('!=', newmi.get_identifiers()) 1753 1754 1755def suite(): 1756 import unittest 1757 1758 class OPFTest(unittest.TestCase): 1759 1760 def setUp(self): 1761 self.stream = io.BytesIO( 1762 b'''\ 1763 <?xml version="1.0" encoding="UTF-8"?> 1764 <package version="2.0" xmlns="http://www.idpf.org/2007/opf" > 1765 <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf"> 1766 <dc:title opf:file-as="Wow">A Cool & © ß Title</dc:title> 1767 <creator opf:role="aut" file-as="Monkey">Monkey Kitchen</creator> 1768 <creator opf:role="aut">Next</creator> 1769 <dc:subject>One</dc:subject><dc:subject>Two</dc:subject> 1770 <dc:identifier scheme="ISBN">123456789</dc:identifier> 1771 <dc:identifier scheme="dummy">dummy</dc:identifier> 1772 <meta name="calibre:series" content="A one book series" /> 1773 <meta name="calibre:rating" content="4"/> 1774 <meta name="calibre:publication_type" content="test"/> 1775 <meta name="calibre:series_index" content="2.5" /> 1776 </metadata> 1777 <manifest> 1778 <item id="1" href="a%20%7E%20b" media-type="text/txt" /> 1779 </manifest> 1780 </package> 1781 ''' 1782 ) 1783 self.opf = OPF(self.stream, os.getcwd()) 1784 1785 def testReading(self, opf=None): 1786 if opf is None: 1787 opf = self.opf 1788 self.assertEqual(opf.title, 'A Cool & \xa9 \xdf Title') 1789 self.assertEqual(opf.authors, 'Monkey Kitchen,Next'.split(',')) 1790 self.assertEqual(opf.author_sort, 'Monkey') 1791 self.assertEqual(opf.title_sort, 'Wow') 1792 self.assertEqual(opf.tags, ['One', 'Two']) 1793 self.assertEqual(opf.isbn, '123456789') 1794 self.assertEqual(opf.series, 'A one book series') 1795 self.assertEqual(opf.series_index, 2.5) 1796 self.assertEqual(opf.rating, 4) 1797 self.assertEqual(opf.publication_type, 'test') 1798 self.assertEqual(list(opf.itermanifest())[0].get('href'), 'a ~ b') 1799 self.assertEqual(opf.get_identifiers(), {'isbn':'123456789', 1800 'dummy':'dummy'}) 1801 1802 def testWriting(self): 1803 for test in [('title', 'New & Title'), ('authors', ['One', 'Two']), 1804 ('author_sort', "Kitchen"), ('tags', ['Three']), 1805 ('isbn', 'a'), ('rating', 3), ('series_index', 1), 1806 ('title_sort', 'ts')]: 1807 setattr(self.opf, *test) 1808 attr, val = test 1809 self.assertEqual(getattr(self.opf, attr), val) 1810 1811 self.opf.render() 1812 1813 def testCreator(self): 1814 opf = OPFCreator(os.getcwd(), self.opf) 1815 buf = io.BytesIO() 1816 opf.render(buf) 1817 raw = buf.getvalue() 1818 self.testReading(opf=OPF(io.BytesIO(raw), os.getcwd())) 1819 1820 def testSmartUpdate(self): 1821 self.opf.smart_update(MetaInformation(self.opf)) 1822 self.testReading() 1823 1824 return unittest.TestLoader().loadTestsFromTestCase(OPFTest) 1825 1826 1827def test(): 1828 import unittest 1829 unittest.TextTestRunner(verbosity=2).run(suite()) 1830 1831 1832def test_user_metadata(): 1833 mi = Metadata('Test title', ['test author1', 'test author2']) 1834 um = { 1835 '#myseries': {'#value#': 'test series\xe4', 'datatype':'text', 1836 'is_multiple': None, 'name': 'My Series'}, 1837 '#myseries_index': {'#value#': 2.45, 'datatype': 'float', 1838 'is_multiple': None}, 1839 '#mytags': {'#value#':['t1','t2','t3'], 'datatype':'text', 1840 'is_multiple': '|', 'name': 'My Tags'} 1841 } 1842 mi.set_all_user_metadata(um) 1843 raw = metadata_to_opf(mi) 1844 opfc = OPFCreator(os.getcwd(), other=mi) 1845 out = io.BytesIO() 1846 opfc.render(out) 1847 raw2 = out.getvalue() 1848 f = io.BytesIO(raw) 1849 opf = OPF(f) 1850 f2 = io.BytesIO(raw2) 1851 opf2 = OPF(f2) 1852 assert um == opf._user_metadata_ 1853 assert um == opf2._user_metadata_ 1854 print(opf.render()) 1855 1856 1857if __name__ == '__main__': 1858 # test_user_metadata() 1859 test_m2o() 1860 test() 1861