1#!/usr/local/bin/python3.8
2
3__license__   = 'GPL v3'
4__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
5__docformat__ = 'restructuredtext en'
6
7'''
8lxml based OPF parser.
9'''
10
11import re, sys, functools, os, uuid, glob, io, json, copy
12
13from lxml import etree
14
15from calibre.ebooks import escape_xpath_attr
16from calibre.constants import __appname__, __version__, filesystem_encoding
17from calibre.ebooks.metadata.toc import TOC
18from calibre.ebooks.metadata.utils import parse_opf, pretty_print_opf as _pretty_print
19from calibre.ebooks.metadata import string_to_authors, MetaInformation, check_isbn
20from calibre.ebooks.metadata.book.base import Metadata
21from calibre.utils.date import parse_date, isoformat
22from calibre.utils.localization import get_lang, canonicalize_lang
23from calibre import prints, guess_type
24from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
25from calibre.utils.config import tweaks
26from calibre.utils.xml_parse import safe_xml_fromstring
27from polyglot.builtins import iteritems
28from polyglot.urllib import unquote, urlparse
29
30pretty_print_opf = False
31
32
class PrettyPrint:

    '''
    Context manager that switches the module level ``pretty_print_opf`` flag
    on for the duration of a ``with`` block.

    Entries are counted per instance, so that when the same instance is
    entered again while it is already active, the flag is only switched off
    when the outermost block exits.
    '''

    # Number of ``with`` blocks currently active on this instance
    _depth = 0

    def __enter__(self):
        global pretty_print_opf
        self._depth += 1
        pretty_print_opf = True

    def __exit__(self, *args):
        global pretty_print_opf
        # Never let an unbalanced __exit__ drive the counter negative
        self._depth = max(0, self._depth - 1)
        if self._depth == 0:
            pretty_print_opf = False
42
43
44pretty_print = PrettyPrint()
45
46
class Resource:  # {{{

    '''
    Represents a resource (usually a file on the filesystem or a URL pointing
    to the web). Such resources are commonly referred to in OPF files.

    They have the interface:

    :member:`path`
    :member:`mime_type`
    :method:`href`
    '''

    def __init__(self, href_or_path, basedir=None, is_path=True):
        '''
        `href_or_path`: A filesystem path if `is_path` is True, otherwise a URL.
        `basedir`: Directory against which relative paths are resolved. If None,
        the current working directory at the time of the call is used.
        `is_path`: If False, `href_or_path` is parsed as a URL. URLs with a
        scheme other than ``file`` are stored verbatim and leave :member:`path`
        as None. All other URLs are mapped to a path under `basedir` and their
        fragment is remembered in :member:`fragment`.
        '''
        if basedir is None:
            # Resolved per call: a default of os.getcwd() in the signature
            # would be frozen to the directory at import time
            basedir = os.getcwd()
        self.orig = href_or_path
        self._href = None
        self._basedir = basedir
        self.path = None
        self.fragment = ''
        try:
            self.mime_type = guess_type(href_or_path)[0]
        except Exception:
            self.mime_type = None
        if self.mime_type is None:
            self.mime_type = 'application/octet-stream'
        if is_path:
            path = href_or_path
            # Decode before joining, as os.path.join() refuses to mix bytes
            # and str components
            if isinstance(path, bytes):
                path = path.decode(filesystem_encoding)
            if not os.path.isabs(path):
                base = basedir
                if isinstance(base, bytes):
                    base = base.decode(filesystem_encoding)
                path = os.path.abspath(os.path.join(base, path))
            self.path = path
        else:
            url = urlparse(href_or_path)
            if url.scheme not in ('', 'file'):
                self._href = href_or_path
            else:
                pc = url.path
                if isinstance(pc, bytes):
                    pc = pc.decode('utf-8')
                self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
                self.fragment = url.fragment

    def href(self, basedir=None):
        '''
        Return a URL pointing to this resource. If it is a file on the filesystem
        the URL is relative to `basedir`.

        `basedir`: If None, the basedir of this resource is used (see :method:`set_basedir`).
        If this resource has no basedir, then the current working directory is used as the basedir.
        '''
        if self.path is None:
            return self._href
        if basedir is None:
            basedir = self._basedir or os.getcwd()
        frag = ('#' + self.fragment) if self.fragment else ''
        if self.path == basedir:
            return frag
        try:
            rpath = os.path.relpath(self.path, basedir)
        except ValueError:  # On windows path and basedir could be on different drives
            rpath = self.path
        if isinstance(rpath, bytes):
            rpath = rpath.decode(filesystem_encoding)
        return rpath.replace(os.sep, '/')+frag

    def set_basedir(self, path):
        'Set the directory that :method:`href` is relative to by default'
        self._basedir = path

    def basedir(self):
        'Return the directory that :method:`href` is relative to by default'
        return self._basedir

    def __repr__(self):
        return 'Resource(%s, %s)'%(repr(self.path), repr(self.href()))
126
127# }}}
128
129
class ResourceCollection:  # {{{

    '''
    An ordered, list like container. :method:`append` only accepts
    :class:`Resource` objects.
    '''

    def __init__(self):
        self._resources = []

    def __iter__(self):
        yield from self._resources

    def __len__(self):
        return len(self._resources)

    def __getitem__(self, index):
        return self._resources[index]

    def __bool__(self):
        return len(self._resources) > 0

    def __str__(self):
        resources = map(repr, self)
        return '[%s]'%', '.join(resources)
    __unicode__ = __str__

    def __repr__(self):
        return str(self)

    def append(self, resource):
        'Add `resource` to the end. Raises ValueError if it is not a Resource.'
        if not isinstance(resource, Resource):
            raise ValueError('Can only append objects of type Resource')
        self._resources.append(resource)

    def remove(self, resource):
        'Remove the first occurrence of `resource`'
        self._resources.remove(resource)

    def replace(self, start, end, items):
        'Same as list[start:end] = items'
        self._resources[start:end] = items

    @staticmethod
    def from_directory_contents(top, topdown=True):
        '''
        Return a collection with one :class:`Resource` for every file found
        by walking the directory tree rooted at `top`. The basedir of every
        resource is set to `top`.

        `topdown`: Passed through to :func:`os.walk`.
        '''
        collection = ResourceCollection()
        for dirpath, _dirnames, filenames in os.walk(top, topdown=topdown):
            for name in filenames:
                path = os.path.abspath(os.path.join(dirpath, name))
                res = Resource(path, is_path=True)
                res.set_basedir(top)
                collection.append(res)
        return collection

    def set_basedir(self, path):
        'Set the basedir of every contained resource to `path`'
        for res in self:
            res.set_basedir(path)
180
181# }}}
182
183
class ManifestItem(Resource):  # {{{

    '''
    A :class:`Resource` that can be rendered as an OPF manifest ``<item>``.

    The string representation reads ``self.id``, which this class does not
    set itself. It has to be assigned before the item is rendered.
    '''

    @staticmethod
    def from_opf_manifest_item(item, basedir):
        '''
        Build a ManifestItem from the ``<item>`` element `item`, resolving its
        href against `basedir`. A non-empty ``media-type`` attribute overrides
        the guessed mime type. Returns None if the element has no href.
        '''
        href = item.get('href', None)
        if not href:
            return None
        entry = ManifestItem(href, basedir=basedir, is_path=True)
        declared = item.get('media-type', '').strip()
        if declared:
            entry.mime_type = declared
        return entry

    @property
    def media_type(self):
        'Alias for :member:`mime_type`'
        return self.mime_type

    @media_type.setter
    def media_type(self, val):
        self.mime_type = val

    def __unicode__representation__(self):
        fields = (self.id, self.href(), self.media_type)
        return '<item id="%s" href="%s" media-type="%s" />' % fields

    __str__ = __unicode__representation__

    def __repr__(self):
        return str(self)

    def __getitem__(self, index):
        'Tuple like access: index 0 is the href and index 1 the media type'
        if index not in (0, 1):
            raise IndexError('%d out of bounds.'%index)
        return self.href() if index == 0 else self.media_type
218
219# }}}
220
221
class Manifest(ResourceCollection):  # {{{

    '''
    A collection of :class:`ManifestItem` objects. ``next_id`` is a counter
    used to generate ids of the form ``id<N>`` for items that lack one.
    '''

    def __init__(self):
        ResourceCollection.__init__(self)
        self.next_id = 1

    def append_from_opf_manifest_item(self, item, dir):
        '''
        Append the ``<item>`` element `item`, resolved against `dir`. The id
        attribute of the element is used if it is non-empty, otherwise an id
        is generated. The counter is advanced in both cases.
        '''
        self.append(ManifestItem.from_opf_manifest_item(item, dir))
        self[-1].id = item.get('id', '') or 'id%d' % self.next_id
        self.next_id += 1

    @staticmethod
    def from_opf_manifest_element(items, dir):
        '''
        Build a Manifest from the ``<item>`` elements `items`. Elements for
        which appending raises ValueError are skipped.
        '''
        manifest = Manifest()
        for element in items:
            try:
                manifest.append_from_opf_manifest_item(element, dir)
            except ValueError:
                pass
        return manifest

    @staticmethod
    def from_paths(entries):
        '''
        `entries`: List of (path, mime-type) If mime-type is None it is autodetected
        '''
        manifest = Manifest()
        for path, mime_type in entries:
            manifest.add_item(path, mime_type)
        return manifest

    def add_item(self, path, mime_type=None):
        '''
        Append a new item for `path` with a generated id and return that id.
        A truthy `mime_type` overrides the guessed one.
        '''
        entry = ManifestItem(path, is_path=True)
        if mime_type:
            entry.mime_type = mime_type
        entry.id = 'id%d' % self.next_id
        self.next_id += 1
        self.append(entry)
        return entry.id

    def item(self, id):
        'Return the first item whose id equals `id`, or None'
        return next((entry for entry in self if entry.id == id), None)

    def id_for_path(self, path):
        'Return the id of the first item whose normalized path equals `path`, or None'
        wanted = os.path.normpath(os.path.abspath(path))
        for entry in self:
            if entry.path and os.path.normpath(entry.path) == wanted:
                return entry.id
        return None

    def path_for_id(self, id):
        'Return the path of the first item whose id equals `id`, or None'
        return next((entry.path for entry in self if entry.id == id), None)

    def type_for_id(self, id):
        'Return the mime type of the first item whose id equals `id`, or None'
        return next((entry.mime_type for entry in self if entry.id == id), None)
290
291# }}}
292
293
class Spine(ResourceCollection):  # {{{

    '''
    An ordered collection of :class:`Spine.Item` objects. Paths are looked up
    in the manifest passed to the constructor.
    '''

    class Item(Resource):

        '''
        A spine entry. `idfunc` is called once, in the constructor, with the
        resolved path of the resource and its result is stored as ``id``.
        '''

        def __init__(self, idfunc, *args, **kwargs):
            Resource.__init__(self, *args, **kwargs)
            self.is_linear = True
            self.id = idfunc(self.path)
            self.idref = None

        def __repr__(self):
            return 'Spine.Item(path=%r, id=%s, is_linear=%s)' % \
                    (self.path, self.id, self.is_linear)

    @staticmethod
    def from_opf_spine_element(itemrefs, manifest):
        '''
        Build a Spine from the ``<itemref>`` elements `itemrefs`. An itemref
        is skipped if it has no idref, if its idref has no path in `manifest`
        or if that path is already in the spine. An item is linear only if
        the ``linear`` attribute is absent or exactly ``yes``.
        '''
        s = Spine(manifest)
        seen = set()
        path_map = {i.id:i.path for i in s.manifest}
        for itemref in itemrefs:
            idref = itemref.get('idref', None)
            if idref is None:
                continue
            path = path_map.get(idref)
            if not path or path in seen:
                continue
            r = Spine.Item(lambda x, idref=idref: idref, path, is_path=True)
            r.is_linear = itemref.get('linear', 'yes') == 'yes'
            r.idref = idref
            s.append(r)
            seen.add(path)
        return s

    @staticmethod
    def from_paths(paths, manifest):
        '''
        Build a Spine from `paths`, using `manifest` to look up the id for
        every path. Paths for which no item can be created are skipped.
        '''
        s = Spine(manifest)
        for path in paths:
            try:
                s.append(Spine.Item(s.manifest.id_for_path, path, is_path=True))
            except Exception:
                # Best effort: ignore this path, but do not swallow
                # KeyboardInterrupt/SystemExit as a bare except would
                continue
        return s

    def __init__(self, manifest):
        ResourceCollection.__init__(self)
        self.manifest = manifest

    def replace(self, start, end, ids):
        '''
        Replace the items between start (inclusive) and end (not inclusive)
        with the items identified by ids. ids can be a list of any length.

        Raises ValueError if one of the ids is not in the manifest. In that
        case the spine is left unchanged.
        '''
        items = []
        for item_id in ids:
            path = self.manifest.path_for_id(item_id)
            if path is None:
                raise ValueError('id %s not in manifest' % item_id)
            item = Spine.Item(lambda x, item_id=item_id: item_id, path, is_path=True)
            # Same as for items created by from_opf_spine_element()
            item.idref = item_id
            items.append(item)
        # Called unbound, so self has to be passed explicitly
        ResourceCollection.replace(self, start, end, items)

    def linear_items(self):
        'Yield the paths of all linear items, in order'
        for r in self:
            if r.is_linear:
                yield r.path

    def nonlinear_items(self):
        'Yield the paths of all non-linear items, in order'
        for r in self:
            if not r.is_linear:
                yield r.path

    def items(self):
        'Yield the paths of all items, in order'
        for i in self:
            yield i.path
365
366# }}}
367
368
class Guide(ResourceCollection):  # {{{

    '''
    A collection of :class:`Guide.Reference` objects, built from the
    ``<reference>`` elements of an OPF guide.
    '''

    class Reference(Resource):

        '''
        A :class:`Resource` with additional ``title`` and ``type`` attributes.
        These are set by :method:`from_opf_resource_item` and by
        :method:`Guide.set_cover`, not by the constructor.
        '''

        @staticmethod
        def from_opf_resource_item(ref, basedir):
            '''
            Build a Reference from the ``<reference>`` element `ref`,
            resolving its href against `basedir`. ``type`` is None if the
            element has no type attribute.
            '''
            title, href, type = ref.get('title', ''), ref.get('href'), ref.get('type')
            res = Guide.Reference(href, basedir, is_path=True)
            res.title = title
            res.type = type
            return res

        def __repr__(self):
            ans = '<reference type="%s" href="%s" '%(self.type, self.href())
            if self.title:
                ans += 'title="%s" '%self.title
            return ans + '/>'

    @staticmethod
    def from_opf_guide(references, base_dir=None):
        '''
        Build a Guide from the ``<reference>`` elements `references`.
        Elements from which no Reference can be created are skipped.

        `base_dir`: Directory against which hrefs are resolved. If None, the
        current working directory at the time of the call is used.
        '''
        if base_dir is None:
            # Resolved per call rather than once at import time
            base_dir = os.getcwd()
        coll = Guide()
        for ref in references:
            try:
                coll.append(Guide.Reference.from_opf_resource_item(ref, base_dir))
            except Exception:
                continue
        return coll

    def set_cover(self, path):
        '''
        Remove every reference whose type contains ``cover`` (ignoring case)
        and append three new references to `path`, one for each of the cover
        types, all with an empty title.
        '''
        for i in tuple(self):
            # type is None for references that had no type attribute
            if 'cover' in (getattr(i, 'type', None) or '').lower():
                self.remove(i)
        for typ in ('cover', 'other.ms-coverimage-standard', 'other.ms-coverimage'):
            ref = Guide.Reference(path, is_path=True)
            ref.type = typ
            ref.title = ''
            self.append(ref)
406
407# }}}
408
409
class MetadataField:

    '''
    Descriptor that maps an attribute onto a metadata element of the object
    it is accessed on. That object has to provide get_metadata_element(),
    create_metadata_element(), get_text() and set_text().

    `name`: Name passed to get_metadata_element()/create_metadata_element()
    `is_dc`: Passed to create_metadata_element() when a new element is needed
    `formatter`: Optional callable applied to the text when reading. If it
    raises an exception, the field reads as missing.
    `none_is`: Value returned when the field is missing
    `renderer`: Callable that converts a value to text when writing
    '''

    def __init__(self, name, is_dc=True, formatter=None, none_is=None,
            renderer=lambda x: str(x)):
        self.name      = name
        self.is_dc     = is_dc
        self.formatter = formatter
        self.none_is   = none_is
        self.renderer  = renderer

    def __real_get__(self, obj, type=None):
        'Return the formatted, stripped value of the field or None if it is missing'
        ans = obj.get_metadata_element(self.name)
        if ans is None:
            return None
        ans = obj.get_text(ans)
        if ans is None:
            return ans
        if self.formatter is not None:
            try:
                ans = self.formatter(ans)
            except Exception:
                # Unparseable values are treated as missing
                return None
        if hasattr(ans, 'strip'):
            ans = ans.strip()
        return ans

    def __get__(self, obj, type=None):
        if obj is None:
            # Accessed on the class rather than an instance: return the
            # descriptor itself, following the usual descriptor convention
            return self
        ans = self.__real_get__(obj, type)
        if ans is None:
            ans = self.none_is
        return ans

    def __set__(self, obj, val):
        'Setting None removes the element, anything else creates/updates it'
        elem = obj.get_metadata_element(self.name)
        if val is None:
            if elem is not None:
                elem.getparent().remove(elem)
            return
        if elem is None:
            elem = obj.create_metadata_element(self.name, is_dc=self.is_dc)
        obj.set_text(elem, self.renderer(val))
451
452
class TitleSortField(MetadataField):

    '''
    A metadata field that, when it has no value of its own, falls back to the
    ``file-as`` attribute of the title elements.
    '''

    def __get__(self, obj, type=None):
        value = self.__real_get__(obj, type)
        if value is None:
            titles = obj.title_path(obj.metadata)
            if titles:
                qualified = '{%s}file-as' % obj.NAMESPACES['opf']
                for title in titles:
                    # The namespaced attribute wins over the plain one. There
                    # is no break, so the last title with a file-as is used.
                    candidate = title.get(qualified, None) or title.get('file-as', None)
                    if candidate:
                        value = candidate
        return value.strip() if value else self.none_is

    def __set__(self, obj, val):
        MetadataField.__set__(self, obj, val)
        # Remove all file-as attributes from the title elements
        for title in obj.title_path(obj.metadata) or ():
            stale = [attr for attr in title.attrib if attr.endswith('file-as')]
            for attr in stale:
                del title.attrib[attr]
480
481
def serialize_user_metadata(metadata_elem, all_user_metadata, tail='\n'+(' '*8)):
    '''
    Append one ``<meta>`` element per entry of `all_user_metadata` to
    `metadata_elem`. The element is named ``calibre:user_metadata:<name>``
    and its content is the JSON encoded field metadata.

    Entries that cannot be encoded are reported, together with a traceback,
    and skipped.

    `tail`: Text placed after every created element.
    '''
    import traceback
    from calibre.utils.config import to_json
    from calibre.ebooks.metadata.book.json_codec import (object_to_unicode,
                                                         encode_is_multiple)

    for name, fm in all_user_metadata.items():
        try:
            # Work on a shallow copy, as encode_is_multiple() is not passed
            # the caller's object
            fm = copy.copy(fm)
            encode_is_multiple(fm)
            fm = object_to_unicode(fm)
            fm = json.dumps(fm, default=to_json, ensure_ascii=False)
        except Exception:
            # Not a bare except: KeyboardInterrupt/SystemExit must propagate
            prints('Failed to write user metadata:', name)
            traceback.print_exc()
            continue
        meta = metadata_elem.makeelement('meta')
        meta.set('name', 'calibre:user_metadata:'+name)
        meta.set('content', fm)
        meta.tail = tail
        metadata_elem.append(meta)
503
504
def serialize_annotations(metadata_elem, annotations, tail='\n'+(' '*8)):
    '''
    Append one ``<meta name="calibre:annotation">`` element per entry of
    `annotations` to `metadata_elem`, with the JSON encoded entry as content.

    `tail`: Text placed after every created element.
    '''
    def as_text(annotation):
        encoded = json.dumps(annotation, ensure_ascii=False)
        return encoded.decode('utf-8') if isinstance(encoded, bytes) else encoded

    for annotation in annotations:
        content = as_text(annotation)
        node = metadata_elem.makeelement('meta')
        node.set('name', 'calibre:annotation')
        node.set('content', content)
        node.tail = tail
        metadata_elem.append(node)
515
516
def dump_dict(cats):
    '''
    Return `cats` encoded as a JSON string. Any falsy value is encoded as an
    empty dictionary. Non-ASCII characters are not escaped and keys that JSON
    cannot represent are skipped.
    '''
    from calibre.ebooks.metadata.book.json_codec import object_to_unicode
    payload = object_to_unicode(cats or {})
    return json.dumps(payload, ensure_ascii=False, skipkeys=True)
523
524
525class OPF:  # {{{
526
    # MIME type string for OPF package documents
    MIMETYPE         = 'application/oebps-package+xml'
    # The OPF namespace is registered twice: as the default (None) namespace
    # and under the explicit prefix 'opf'
    NAMESPACES       = {
                        None: "http://www.idpf.org/2007/opf",
                        'dc': "http://purl.org/dc/elements/1.1/",
                        'opf': "http://www.idpf.org/2007/opf",
                       }
    # Fully qualified tag name of <meta> elements in the OPF namespace
    META             = '{%s}meta' % NAMESPACES['opf']
    # Namespace map for XPath expressions: the None entry is dropped and the
    # EXSLT regular expressions namespace is added for re:match()
    xpn = NAMESPACES.copy()
    xpn.pop(None)
    xpn['re'] = 'http://exslt.org/regular-expressions'
    XPath = functools.partial(etree.XPath, namespaces=xpn)
    # Value of the content attribute, for elements whose name ends in "meta"
    CONTENT          = XPath('self::*[re:match(name(), "meta$", "i")]/@content')
    # String value of an element
    TEXT             = XPath('string()')

    # Pre-compiled queries for the metadata section. Element names are
    # matched with case insensitive regular expressions.
    metadata_path   = XPath('descendant::*[re:match(name(), "metadata", "i")]')
    # Takes a $name variable. Matches elements whose name ends in $name as
    # well as <meta> elements whose name attribute is calibre:$name
    metadata_elem_path = XPath(
        'descendant::*[re:match(name(), concat($name, "$"), "i") or (re:match(name(), "meta$", "i") '
        'and re:match(@name, concat("^calibre:", $name, "$"), "i"))]')
    title_path      = XPath('descendant::*[re:match(name(), "title", "i")]')
    # Creators with the role "aut" or with no role at all
    authors_path    = XPath('descendant::*[re:match(name(), "creator", "i") and (@role="aut" or @opf:role="aut" or (not(@role) and not(@opf:role)))]')
    editors_path    = XPath('descendant::*[re:match(name(), "creator", "i") and (@role="edt" or @opf:role="edt")]')
    bkp_path        = XPath('descendant::*[re:match(name(), "contributor", "i") and (@role="bkp" or @opf:role="bkp")]')
    tags_path       = XPath('descendant::*[re:match(name(), "subject", "i")]')
    isbn_path       = XPath('descendant::*[re:match(name(), "identifier", "i") and '
                            '(re:match(@scheme, "isbn", "i") or re:match(@opf:scheme, "isbn", "i"))]')
    pubdate_path    = XPath('descendant::*[re:match(name(), "date", "i")]')
    raster_cover_path = XPath('descendant::*[re:match(name(), "meta", "i") and '
            're:match(@name, "cover", "i") and @content]')
    guide_cover_path = XPath('descendant::*[local-name()="guide"]/*[local-name()="reference" and re:match(@type, "cover", "i")]/@href')
    identifier_path = XPath('descendant::*[re:match(name(), "identifier", "i")]')
    application_id_path = XPath('descendant::*[re:match(name(), "identifier", "i") and '
                            '(re:match(@opf:scheme, "calibre|libprs500", "i") or re:match(@scheme, "calibre|libprs500", "i"))]')
    uuid_id_path    = XPath('descendant::*[re:match(name(), "identifier", "i") and '
                            '(re:match(@opf:scheme, "uuid", "i") or re:match(@scheme, "uuid", "i"))]')
    languages_path  = XPath('descendant::*[local-name()="language"]')

    # Pre-compiled queries for the manifest, spine and guide. manifest_ppath
    # selects the manifest element itself, the others select its children.
    manifest_path   = XPath('descendant::*[re:match(name(), "manifest", "i")]/*[re:match(name(), "item", "i")]')
    manifest_ppath  = XPath('descendant::*[re:match(name(), "manifest", "i")]')
    spine_path      = XPath('descendant::*[re:match(name(), "spine", "i")]/*[re:match(name(), "itemref", "i")]')
    guide_path      = XPath('descendant::*[re:match(name(), "guide", "i")]/*[re:match(name(), "reference", "i")]')

    # Simple metadata fields, exposed as descriptors. Fields created with
    # is_dc=False are passed is_dc=False when their element is created.
    publisher       = MetadataField('publisher')
    comments        = MetadataField('description')
    category        = MetadataField('type')
    rights          = MetadataField('rights')
    series          = MetadataField('series', is_dc=False)
    # A missing series_index reads as None when the tweak is set and as 1
    # otherwise
    if tweaks['use_series_auto_increment_tweak_when_importing']:
        series_index    = MetadataField('series_index', is_dc=False,
                                        formatter=float, none_is=None)
    else:
        series_index    = MetadataField('series_index', is_dc=False,
                                        formatter=float, none_is=1)
    title_sort      = TitleSortField('title_sort', is_dc=False)
    rating          = MetadataField('rating', is_dc=False, formatter=float)
    publication_type = MetadataField('publication_type', is_dc=False)
    timestamp       = MetadataField('timestamp', is_dc=False,
                                    formatter=parse_date, renderer=isoformat)
    # These two fields are stored as JSON
    user_categories = MetadataField('user_categories', is_dc=False,
                                    formatter=json.loads,
                                    renderer=dump_dict)
    author_link_map = MetadataField('author_link_map', is_dc=False,
                                formatter=json.loads, renderer=dump_dict)
589
590    def __init__(self, stream, basedir=os.getcwd(), unquote_urls=True,
591            populate_spine=True, try_to_guess_cover=True, preparsed_opf=None, read_toc=True):
592        self.try_to_guess_cover = try_to_guess_cover
593        self.basedir  = self.base_dir = basedir
594        self.path_to_html_toc = self.html_toc_fragment = None
595        self.root = parse_opf(stream) if preparsed_opf is None else preparsed_opf
596        try:
597            self.package_version = float(self.root.get('version', None))
598        except (AttributeError, TypeError, ValueError):
599            self.package_version = 0
600        self.metadata = self.metadata_path(self.root)
601        if not self.metadata:
602            self.metadata = [self.root.makeelement('{http://www.idpf.org/2007/opf}metadata')]
603            self.root.insert(0, self.metadata[0])
604            self.metadata[0].tail = '\n'
605        self.metadata      = self.metadata[0]
606        if unquote_urls:
607            self.unquote_urls()
608        self.manifest = Manifest()
609        m = self.manifest_path(self.root)
610        if m:
611            self.manifest = Manifest.from_opf_manifest_element(m, basedir)
612        self.spine = None
613        s = self.spine_path(self.root)
614        if populate_spine and s:
615            self.spine = Spine.from_opf_spine_element(s, self.manifest)
616        self.guide = None
617        guide = self.guide_path(self.root)
618        self.guide = Guide.from_opf_guide(guide, basedir) if guide else None
619        self.cover_data = (None, None)
620        if read_toc:
621            self.find_toc()
622        else:
623            self.toc = None
624        self.read_user_metadata()
625
626    def read_user_metadata(self):
627        self._user_metadata_ = {}
628        temp = Metadata('x', ['x'])
629        from calibre.utils.config import from_json
630        from calibre.ebooks.metadata.book.json_codec import decode_is_multiple
631        elems = self.root.xpath('//*[name() = "meta" and starts-with(@name,'
632                '"calibre:user_metadata:") and @content]')
633        for elem in elems:
634            name = elem.get('name')
635            name = ':'.join(name.split(':')[2:])
636            if not name or not name.startswith('#'):
637                continue
638            fm = elem.get('content')
639            try:
640                fm = json.loads(fm, object_hook=from_json)
641                decode_is_multiple(fm)
642                temp.set_user_metadata(name, fm)
643            except:
644                prints('Failed to read user metadata:', name)
645                import traceback
646                traceback.print_exc()
647                continue
648        self._user_metadata_ = temp.get_all_user_metadata(True)
649
650    def to_book_metadata(self):
651        if self.package_version >= 3.0:
652            from calibre.ebooks.metadata.opf3 import read_metadata
653            return read_metadata(self.root)
654        ans = MetaInformation(self)
655        for n, v in self._user_metadata_.items():
656            ans.set_user_metadata(n, v)
657
658        ans.set_identifiers(self.get_identifiers())
659
660        return ans
661
662    def read_annotations(self):
663        for elem in self.root.xpath('//*[name() = "meta" and @name = "calibre:annotation" and @content]'):
664            try:
665                yield json.loads(elem.get('content'))
666            except Exception:
667                pass
668
669    def write_user_metadata(self):
670        elems = self.root.xpath('//*[name() = "meta" and starts-with(@name,'
671                '"calibre:user_metadata:") and @content]')
672        for elem in elems:
673            elem.getparent().remove(elem)
674        serialize_user_metadata(self.metadata,
675                self._user_metadata_)
676
677    def find_toc(self):
678        self.toc = None
679        try:
680            spine = self.XPath('descendant::*[re:match(name(), "spine", "i")]')(self.root)
681            toc = None
682            if spine:
683                spine = spine[0]
684                toc = spine.get('toc', None)
685            if toc is None and self.guide:
686                for item in self.guide:
687                    if item.type and item.type.lower() == 'toc':
688                        toc = item.path
689            if toc is None:
690                for item in self.manifest:
691                    if 'toc' in item.href().lower():
692                        toc = item.path
693            if toc is None:
694                return
695            self.toc = TOC(base_path=self.base_dir)
696            is_ncx = getattr(self, 'manifest', None) is not None and \
697                     self.manifest.type_for_id(toc) is not None and \
698                     'dtbncx' in self.manifest.type_for_id(toc)
699            if is_ncx or toc.lower() in ('ncx', 'ncxtoc'):
700                path = self.manifest.path_for_id(toc)
701                if path:
702                    self.toc.read_ncx_toc(path)
703                else:
704                    f = glob.glob(os.path.join(self.base_dir, '*.ncx'))
705                    if f:
706                        self.toc.read_ncx_toc(f[0])
707            else:
708                self.path_to_html_toc, self.html_toc_fragment = \
709                    toc.partition('#')[0], toc.partition('#')[-1]
710                if not os.access(self.path_to_html_toc, os.R_OK) or \
711                        not os.path.isfile(self.path_to_html_toc):
712                    self.path_to_html_toc = None
713                self.toc.read_html_toc(toc)
714        except:
715            pass
716
717    def get_text(self, elem):
718        return ''.join(self.CONTENT(elem) or self.TEXT(elem))
719
720    def set_text(self, elem, content):
721        if elem.tag == self.META:
722            elem.attrib['content'] = content
723        else:
724            elem.text = content
725
726    def itermanifest(self):
727        return self.manifest_path(self.root)
728
729    def create_manifest_item(self, href, media_type, append=False):
730        ids = {i.get('id', None) for i in self.itermanifest()}
731        manifest_id = 'id1'
732        c = 1
733        while manifest_id in ids:
734            c += 1
735            manifest_id = 'id%d'%c
736        if not media_type:
737            media_type = 'application/xhtml+xml'
738        ans = etree.Element('{%s}item'%self.NAMESPACES['opf'],
739                             attrib={'id':manifest_id, 'href':href, 'media-type':media_type})
740        ans.tail = '\n\t\t'
741        if append:
742            manifest = self.manifest_ppath(self.root)[0]
743            manifest.append(ans)
744        return ans
745
746    def replace_manifest_item(self, item, items):
747        items = [self.create_manifest_item(*i) for i in items]
748        for i, item2 in enumerate(items):
749            item2.set('id', item.get('id')+'.%d'%(i+1))
750        manifest = item.getparent()
751        index = manifest.index(item)
752        manifest[index:index+1] = items
753        return [i.get('id') for i in items]
754
755    def iterspine(self):
756        return self.spine_path(self.root)
757
758    def spine_items(self):
759        for item in self.iterspine():
760            idref = item.get('idref', '')
761            for x in self.itermanifest():
762                if x.get('id', None) == idref:
763                    yield x.get('href', '')
764
765    def first_spine_item(self):
766        items = self.iterspine()
767        if not items:
768            return None
769        idref = items[0].get('idref', '')
770        for x in self.itermanifest():
771            if x.get('id', None) == idref:
772                return x.get('href', None)
773
774    def create_spine_item(self, idref):
775        ans = etree.Element('{%s}itemref'%self.NAMESPACES['opf'], idref=idref)
776        ans.tail = '\n\t\t'
777        return ans
778
779    def replace_spine_items_by_idref(self, idref, new_idrefs):
780        items = list(map(self.create_spine_item, new_idrefs))
781        spine = self.XPath('/opf:package/*[re:match(name(), "spine", "i")]')(self.root)[0]
782        old = [i for i in self.iterspine() if i.get('idref', None) == idref]
783        for x in old:
784            i = spine.index(x)
785            spine[i:i+1] = items
786
787    def create_guide_element(self):
788        e = etree.SubElement(self.root, '{%s}guide'%self.NAMESPACES['opf'])
789        e.text = '\n        '
790        e.tail =  '\n'
791        return e
792
793    def remove_guide(self):
794        self.guide = None
795        for g in self.root.xpath('./*[re:match(name(), "guide", "i")]', namespaces={'re':'http://exslt.org/regular-expressions'}):
796            self.root.remove(g)
797
798    def create_guide_item(self, type, title, href):
799        e = etree.Element('{%s}reference'%self.NAMESPACES['opf'],
800                             type=type, title=title, href=href)
801        e.tail='\n'
802        return e
803
804    def add_guide_item(self, type, title, href):
805        g = self.root.xpath('./*[re:match(name(), "guide", "i")]', namespaces={'re':'http://exslt.org/regular-expressions'})[0]
806        g.append(self.create_guide_item(type, title, href))
807
808    def iterguide(self):
809        return self.guide_path(self.root)
810
811    def unquote_urls(self):
812        def get_href(item):
813            raw = unquote(item.get('href', ''))
814            if not isinstance(raw, str):
815                raw = raw.decode('utf-8')
816            return raw
817        for item in self.itermanifest():
818            item.set('href', get_href(item))
819        for item in self.iterguide():
820            item.set('href', get_href(item))
821
822    @property
823    def title(self):
824        # TODO: Add support for EPUB 3 refinements
825
826        for elem in self.title_path(self.metadata):
827            title = self.get_text(elem)
828            if title and title.strip():
829                return re.sub(r'\s+', ' ', title.strip())
830
831    @title.setter
832    def title(self, val):
833        val = (val or '').strip()
834        titles = self.title_path(self.metadata)
835        if self.package_version < 3:
836            # EPUB 3 allows multiple title elements containing sub-titles,
837            # series and other things. We all loooove EPUB 3.
838            for title in titles:
839                title.getparent().remove(title)
840            titles = ()
841        if val:
842            title = titles[0] if titles else self.create_metadata_element('title')
843            title.text = re.sub(r'\s+', ' ', str(val))
844
845    @property
846    def authors(self):
847        ans = []
848        for elem in self.authors_path(self.metadata):
849            ans.extend(string_to_authors(self.get_text(elem)))
850        if not ans:
851            for elem in self.editors_path(self.metadata):
852                ans.extend(string_to_authors(self.get_text(elem)))
853        return ans
854
855    @authors.setter
856    def authors(self, val):
857        remove = list(self.authors_path(self.metadata)) or list(self.editors_path(self.metadata))
858        for elem in remove:
859            elem.getparent().remove(elem)
860        # Ensure new author element is at the top of the list
861        # for broken implementations that always use the first
862        # <dc:creator> element with no attention to the role
863        for author in reversed(val):
864            elem = self.metadata.makeelement('{%s}creator'%
865                    self.NAMESPACES['dc'], nsmap=self.NAMESPACES)
866            elem.tail = '\n'
867            self.metadata.insert(0, elem)
868            elem.set('{%s}role'%self.NAMESPACES['opf'], 'aut')
869            self.set_text(elem, author.strip())
870
871    @property
872    def author_sort(self):
873        matches = self.authors_path(self.metadata) or self.editors_path(self.metadata)
874        if matches:
875            for match in matches:
876                ans = match.get('{%s}file-as'%self.NAMESPACES['opf']) or match.get('file-as')
877                if ans:
878                    return ans
879
880    @author_sort.setter
881    def author_sort(self, val):
882        matches = self.authors_path(self.metadata) or self.editors_path(self.metadata)
883        if matches:
884            for key in matches[0].attrib:
885                if key.endswith('file-as'):
886                    matches[0].attrib.pop(key)
887            matches[0].set('{%s}file-as'%self.NAMESPACES['opf'], str(val))
888
889    @property
890    def tags(self):
891        ans = []
892        for tag in self.tags_path(self.metadata):
893            text = self.get_text(tag)
894            if text and text.strip():
895                ans.extend([x.strip() for x in text.split(',')])
896        return ans
897
898    @tags.setter
899    def tags(self, val):
900        for tag in list(self.tags_path(self.metadata)):
901            tag.getparent().remove(tag)
902        for tag in val:
903            elem = self.create_metadata_element('subject')
904            self.set_text(elem, str(tag))
905
906    @property
907    def pubdate(self):
908        ans = None
909        for match in self.pubdate_path(self.metadata):
910            try:
911                val = parse_date(etree.tostring(match, encoding='unicode',
912                    method='text', with_tail=False).strip())
913            except:
914                continue
915            if ans is None or val < ans:
916                ans = val
917        return ans
918
919    @pubdate.setter
920    def pubdate(self, val):
921        least_val = least_elem = None
922        for match in self.pubdate_path(self.metadata):
923            try:
924                cval = parse_date(etree.tostring(match, encoding='unicode',
925                    method='text', with_tail=False).strip())
926            except:
927                match.getparent().remove(match)
928            else:
929                if not val:
930                    match.getparent().remove(match)
931                if least_val is None or cval < least_val:
932                    least_val, least_elem = cval, match
933
934        if val:
935            if least_val is None:
936                least_elem = self.create_metadata_element('date')
937
938            least_elem.attrib.clear()
939            least_elem.text = isoformat(val)
940
941    @property
942    def isbn(self):
943        for match in self.isbn_path(self.metadata):
944            return self.get_text(match) or None
945
946    @isbn.setter
947    def isbn(self, val):
948        uuid_id = None
949        for attr in self.root.attrib:
950            if attr.endswith('unique-identifier'):
951                uuid_id = self.root.attrib[attr]
952                break
953
954        matches = self.isbn_path(self.metadata)
955        if not val:
956            for x in matches:
957                xid = x.get('id', None)
958                is_package_identifier = uuid_id is not None and uuid_id == xid
959                if is_package_identifier:
960                    self.set_text(x, str(uuid.uuid4()))
961                    for attr in x.attrib:
962                        if attr.endswith('scheme'):
963                            x.attrib[attr] = 'uuid'
964                else:
965                    x.getparent().remove(x)
966            return
967        if not matches:
968            attrib = {'{%s}scheme'%self.NAMESPACES['opf']: 'ISBN'}
969            matches = [self.create_metadata_element('identifier',
970                                                    attrib=attrib)]
971        self.set_text(matches[0], str(val))
972
973    def get_identifiers(self):
974        identifiers = {}
975        schemeless = []
976        for x in self.XPath(
977            'descendant::*[local-name() = "identifier" and text()]')(
978                    self.metadata):
979            found_scheme = False
980            for attr, val in iteritems(x.attrib):
981                if attr.endswith('scheme'):
982                    typ = icu_lower(val)
983                    val = etree.tostring(x, with_tail=False, encoding='unicode',
984                            method='text').strip()
985                    if val and typ not in ('calibre', 'uuid'):
986                        if typ == 'isbn' and val.lower().startswith('urn:isbn:'):
987                            val = val[len('urn:isbn:'):]
988                        identifiers[typ] = val
989                    found_scheme = True
990                    break
991            if not found_scheme:
992                val = etree.tostring(x, with_tail=False, encoding='unicode',
993                            method='text').strip()
994                if val.lower().startswith('urn:isbn:'):
995                    val = check_isbn(val.split(':')[-1])
996                    if val is not None:
997                        identifiers['isbn'] = val
998                else:
999                    schemeless.append(val)
1000
1001        if schemeless and 'isbn' not in identifiers:
1002            for val in schemeless:
1003                if check_isbn(val, simple_sanitize=True) is not None:
1004                    identifiers['isbn'] = check_isbn(val)
1005                    break
1006
1007        return identifiers
1008
1009    def set_identifiers(self, identifiers):
1010        identifiers = identifiers.copy()
1011        uuid_id = None
1012        for attr in self.root.attrib:
1013            if attr.endswith('unique-identifier'):
1014                uuid_id = self.root.attrib[attr]
1015                break
1016
1017        for x in self.XPath(
1018            'descendant::*[local-name() = "identifier"]')(
1019                    self.metadata):
1020            xid = x.get('id', None)
1021            is_package_identifier = uuid_id is not None and uuid_id == xid
1022            typ = {val.lower() for attr, val in iteritems(x.attrib) if attr.endswith('scheme')}
1023            if is_package_identifier:
1024                typ = tuple(typ)
1025                if typ and typ[0] in identifiers:
1026                    self.set_text(x, identifiers.pop(typ[0]))
1027                continue
1028            if typ and not (typ & {'calibre', 'uuid'}):
1029                x.getparent().remove(x)
1030
1031        for typ, val in iteritems(identifiers):
1032            attrib = {'{%s}scheme'%self.NAMESPACES['opf']: typ.upper()}
1033            self.set_text(self.create_metadata_element(
1034                'identifier', attrib=attrib), str(val))
1035
1036    @property
1037    def application_id(self):
1038        for match in self.application_id_path(self.metadata):
1039            return self.get_text(match) or None
1040
1041    @application_id.setter
1042    def application_id(self, val):
1043        removed_ids = set()
1044        for x in tuple(self.application_id_path(self.metadata)):
1045            removed_ids.add(x.get('id', None))
1046            x.getparent().remove(x)
1047
1048        uuid_id = None
1049        for attr in self.root.attrib:
1050            if attr.endswith('unique-identifier'):
1051                uuid_id = self.root.attrib[attr]
1052                break
1053        attrib = {'{%s}scheme'%self.NAMESPACES['opf']: 'calibre'}
1054        if uuid_id and uuid_id in removed_ids:
1055            attrib['id'] = uuid_id
1056        self.set_text(self.create_metadata_element(
1057            'identifier', attrib=attrib), str(val))
1058
1059    @property
1060    def uuid(self):
1061        for match in self.uuid_id_path(self.metadata):
1062            return self.get_text(match) or None
1063
1064    @uuid.setter
1065    def uuid(self, val):
1066        matches = self.uuid_id_path(self.metadata)
1067        if not matches:
1068            attrib = {'{%s}scheme'%self.NAMESPACES['opf']: 'uuid'}
1069            matches = [self.create_metadata_element('identifier',
1070                                                    attrib=attrib)]
1071        self.set_text(matches[0], str(val))
1072
1073    @property
1074    def language(self):
1075        ans = self.languages
1076        if ans:
1077            return ans[0]
1078
1079    @language.setter
1080    def language(self, val):
1081        self.languages = [val]
1082
1083    @property
1084    def languages(self):
1085        ans = []
1086        for match in self.languages_path(self.metadata):
1087            t = self.get_text(match)
1088            if t and t.strip():
1089                l = canonicalize_lang(t.strip())
1090                if l:
1091                    ans.append(l)
1092        return ans
1093
1094    @languages.setter
1095    def languages(self, val):
1096        matches = self.languages_path(self.metadata)
1097        for x in matches:
1098            x.getparent().remove(x)
1099
1100        for lang in val:
1101            l = self.create_metadata_element('language')
1102            self.set_text(l, str(lang))
1103
1104    @property
1105    def raw_languages(self):
1106        for match in self.languages_path(self.metadata):
1107            t = self.get_text(match)
1108            if t and t.strip():
1109                yield t.strip()
1110
1111    @property
1112    def book_producer(self):
1113        for match in self.bkp_path(self.metadata):
1114            return self.get_text(match) or None
1115
1116    @book_producer.setter
1117    def book_producer(self, val):
1118        matches = self.bkp_path(self.metadata)
1119        if not matches:
1120            matches = [self.create_metadata_element('contributor')]
1121            matches[0].set('{%s}role'%self.NAMESPACES['opf'], 'bkp')
1122        self.set_text(matches[0], str(val))
1123
1124    def identifier_iter(self):
1125        yield from self.identifier_path(self.metadata)
1126
1127    @property
1128    def raw_unique_identifier(self):
1129        uuid_elem = None
1130        for attr in self.root.attrib:
1131            if attr.endswith('unique-identifier'):
1132                uuid_elem = self.root.attrib[attr]
1133                break
1134        if uuid_elem:
1135            matches = self.root.xpath('//*[@id=%s]'%escape_xpath_attr(uuid_elem))
1136            if matches:
1137                for m in matches:
1138                    raw = m.text
1139                    if raw:
1140                        return raw
1141
1142    @property
1143    def unique_identifier(self):
1144        raw = self.raw_unique_identifier
1145        if raw:
1146            return raw.rpartition(':')[-1]
1147
1148    @property
1149    def page_progression_direction(self):
1150        spine = self.XPath('descendant::*[re:match(name(), "spine", "i")][1]')(self.root)
1151        if spine:
1152            for k, v in iteritems(spine[0].attrib):
1153                if k == 'page-progression-direction' or k.endswith('}page-progression-direction'):
1154                    return v
1155
1156    @property
1157    def primary_writing_mode(self):
1158        for m in self.XPath('//*[local-name()="meta" and @name="primary-writing-mode" and @content]')(self.root):
1159            return m.get('content')
1160
1161    def guess_cover(self):
1162        '''
1163        Try to guess a cover. Needed for some old/badly formed OPF files.
1164        '''
1165        if self.base_dir and os.path.exists(self.base_dir):
1166            for item in self.identifier_path(self.metadata):
1167                scheme = None
1168                for key in item.attrib.keys():
1169                    if key.endswith('scheme'):
1170                        scheme = item.get(key)
1171                        break
1172                if scheme is None:
1173                    continue
1174                if item.text:
1175                    prefix = item.text.replace('-', '')
1176                    for suffix in ['.jpg', '.jpeg', '.gif', '.png', '.bmp']:
1177                        cpath = os.access(os.path.join(self.base_dir, prefix+suffix), os.R_OK)
1178                        if os.access(os.path.join(self.base_dir, prefix+suffix), os.R_OK):
1179                            return cpath
1180
1181    @property
1182    def epub3_raster_cover(self):
1183        for item in self.itermanifest():
1184            props = set((item.get('properties') or '').lower().split())
1185            if 'cover-image' in props:
1186                mt = item.get('media-type', '')
1187                if mt and 'xml' not in mt and 'html' not in mt:
1188                    return item.get('href', None)
1189
1190    @property
1191    def raster_cover(self):
1192        covers = self.raster_cover_path(self.metadata)
1193        if covers:
1194            cover_id = covers[0].get('content')
1195            for item in self.itermanifest():
1196                if item.get('id', None) == cover_id:
1197                    mt = item.get('media-type', '')
1198                    if mt and 'xml' not in mt and 'html' not in mt:
1199                        return item.get('href', None)
1200            for item in self.itermanifest():
1201                if item.get('href', None) == cover_id:
1202                    mt = item.get('media-type', '')
1203                    if mt and 'xml' not in mt and 'html' not in mt:
1204                        return item.get('href', None)
1205        elif self.package_version >= 3.0:
1206            return self.epub3_raster_cover
1207
1208    @property
1209    def guide_raster_cover(self):
1210        covers = self.guide_cover_path(self.root)
1211        if covers:
1212            mt_map = {i.get('href'):i for i in self.itermanifest()}
1213            for href in covers:
1214                if href:
1215                    i = mt_map.get(href)
1216                    if i is not None:
1217                        iid, mt = i.get('id'), i.get('media-type')
1218                        if iid and mt and mt.lower() in {'image/png', 'image/jpeg', 'image/jpg', 'image/gif'}:
1219                            return i
1220
1221    @property
1222    def epub3_nav(self):
1223        if self.package_version >= 3.0:
1224            for item in self.itermanifest():
1225                props = (item.get('properties') or '').lower().split()
1226                if 'nav' in props:
1227                    mt = item.get('media-type') or ''
1228                    if 'html' in mt.lower():
1229                        mid = item.get('id')
1230                        if mid:
1231                            path = self.manifest.path_for_id(mid)
1232                            if path and os.path.exists(path):
1233                                return path
1234
1235    @property
1236    def cover(self):
1237        if self.guide is not None:
1238            for t in ('cover', 'other.ms-coverimage-standard', 'other.ms-coverimage'):
1239                for item in self.guide:
1240                    if item.type and item.type.lower() == t:
1241                        return item.path
1242        try:
1243            if self.try_to_guess_cover:
1244                return self.guess_cover()
1245        except:
1246            pass
1247
1248    @cover.setter
1249    def cover(self, path):
1250        if self.guide is not None:
1251            self.guide.set_cover(path)
1252            for item in list(self.iterguide()):
1253                if 'cover' in item.get('type', ''):
1254                    item.getparent().remove(item)
1255
1256        else:
1257            g = self.create_guide_element()
1258            self.guide = Guide()
1259            self.guide.set_cover(path)
1260            etree.SubElement(g, 'opf:reference', nsmap=self.NAMESPACES,
1261                                attrib={'type':'cover', 'href':self.guide[-1].href()})
1262        id = self.manifest.id_for_path(self.cover)
1263        if id is None:
1264            for t in ('cover', 'other.ms-coverimage-standard', 'other.ms-coverimage'):
1265                for item in self.guide:
1266                    if item.type.lower() == t:
1267                        self.create_manifest_item(item.href(), guess_type(path)[0])
1268
1269    def get_metadata_element(self, name):
1270        matches = self.metadata_elem_path(self.metadata, name=name)
1271        if matches:
1272            return matches[-1]
1273
1274    def create_metadata_element(self, name, attrib=None, is_dc=True):
1275        if is_dc:
1276            name = '{%s}%s' % (self.NAMESPACES['dc'], name)
1277        else:
1278            attrib = attrib or {}
1279            attrib['name'] = 'calibre:' + name
1280            name = '{%s}%s' % (self.NAMESPACES['opf'], 'meta')
1281        nsmap = dict(self.NAMESPACES)
1282        del nsmap['opf']
1283        elem = etree.SubElement(self.metadata, name, attrib=attrib,
1284                                nsmap=nsmap)
1285        elem.tail = '\n'
1286        return elem
1287
1288    def render(self, encoding='utf-8'):
1289        for meta in self.raster_cover_path(self.metadata):
1290            # Ensure that the name attribute occurs before the content
1291            # attribute. Needed for Nooks.
1292            a = meta.attrib
1293            c = a.get('content', None)
1294            if c is not None:
1295                del a['content']
1296                a['content'] = c
1297        # The PocketBook requires calibre:series_index to come after
1298        # calibre:series or it fails to read series info
1299        # We swap attributes instead of elements, as that avoids namespace
1300        # re-declarations
1301        smap = {}
1302        for child in self.metadata.xpath('./*[@name="calibre:series" or @name="calibre:series_index"]'):
1303            smap[child.get('name')] = (child, self.metadata.index(child))
1304        if len(smap) == 2 and smap['calibre:series'][1] > smap['calibre:series_index'][1]:
1305            s, si = smap['calibre:series'][0], smap['calibre:series_index'][0]
1306
1307            def swap(attr):
1308                t = s.get(attr, '')
1309                s.set(attr, si.get(attr, '')), si.set(attr, t)
1310            swap('name'), swap('content')
1311
1312        self.write_user_metadata()
1313        if pretty_print_opf:
1314            _pretty_print(self.root)
1315        raw = etree.tostring(self.root, encoding=encoding, pretty_print=True)
1316        if not raw.lstrip().startswith(b'<?xml '):
1317            raw = ('<?xml version="1.0"  encoding="%s"?>\n'%encoding.upper()).encode('ascii') + raw
1318        return raw
1319
    def smart_update(self, mi, replace_metadata=False, apply_null=False):
        '''
        Copy metadata from ``mi`` into this OPF.

        :param mi: the object to read metadata from; attributes it lacks
                   are treated as None
        :param replace_metadata: passed through to the ``smart_update`` of
                   the temporary book metadata object used for the user
                   (custom) metadata
        :param apply_null: when true, null values in ``mi`` clear the
                   corresponding value here for a fixed set of fields, for
                   languages and for custom fields; when false null values
                   are ignored
        '''
        for attr in ('title', 'authors', 'author_sort', 'title_sort',
                     'publisher', 'series', 'series_index', 'rating',
                     'isbn', 'tags', 'category', 'comments', 'book_producer',
                     'pubdate', 'user_categories', 'author_link_map'):
            val = getattr(mi, attr, None)
            if attr == 'rating' and val:
                val = float(val)
            # Null means None, an empty container, (None, None), or for
            # ratings anything below 0.1
            is_null = val is None or val in ((), [], (None, None), {}) or (attr == 'rating' and (not val or val < 0.1))
            if is_null:
                # Only these fields are ever cleared; tags are cleared with
                # an empty list, everything else with None
                if apply_null and attr in {'series', 'tags', 'isbn', 'comments', 'publisher', 'rating'}:
                    setattr(self, attr, ([] if attr == 'tags' else None))
            else:
                setattr(self, attr, val)
        langs = getattr(mi, 'languages', [])
        # A lone 'und' entry is treated the same as no languages at all
        if langs == ['und']:
            langs = []
        if apply_null or langs:
            self.languages = langs or []
        # User metadata is merged in a temporary book metadata object and
        # stored back on this object at the end
        temp = self.to_book_metadata()
        temp.remove_stale_user_metadata(mi)
        temp.smart_update(mi, replace_metadata=replace_metadata)
        if not replace_metadata and callable(getattr(temp, 'custom_field_keys', None)):
            # We have to replace non-null fields regardless of the value of
            # replace_metadata to match the behavior of the builtin fields
            # above.
            for x in temp.custom_field_keys():
                meta = temp.get_user_metadata(x, make_copy=True)
                if meta is None:
                    continue
                if meta['datatype'] == 'text' and meta['is_multiple']:
                    val = mi.get(x, [])
                    if val or apply_null:
                        temp.set(x, val)
                elif meta['datatype'] in {'int', 'float', 'bool'}:
                    # A sentinel distinguishes "mi has no value for x" from
                    # "the value of x in mi is None"
                    missing = object()
                    val = mi.get(x, missing)
                    if val is missing:
                        if apply_null:
                            temp.set(x, None)
                    elif apply_null or val is not None:
                        temp.set(x, val)
                elif apply_null and mi.is_null(x) and not temp.is_null(x):
                    temp.set(x, None)

        self._user_metadata_ = temp.get_all_user_metadata(True)
1366
1367# }}}
1368
1369
class OPFCreator(Metadata):

    '''
    A :class:`Metadata` object that can also hold a manifest, spine, guide
    and table of contents, and that serializes all of it as a version 2.0
    OPF package with :meth:`render`.
    '''

    def __init__(self, base_path, other):
        '''
        Initialize.

        :param base_path: Path to the folder in which this OPF file will
            eventually be (it is made absolute). Relative paths given to
            :meth:`create_manifest` and :meth:`create_spine` are resolved
            against it.
        :param other: passed on to ``Metadata.__init__`` as ``other``
        '''
        Metadata.__init__(self, title='', other=other)
        self.base_path = os.path.abspath(base_path)
        self.page_progression_direction = None
        self.primary_writing_mode = None
        # Fill in defaults for whatever the source metadata did not provide
        if self.application_id is None:
            self.application_id = str(uuid.uuid4())
        if not isinstance(self.toc, TOC):
            self.toc = None
        if not self.authors:
            self.authors = [_('Unknown')]
        if self.guide is None:
            self.guide = Guide()
        if self.cover:
            self.guide.set_cover(self.cover)

    def create_manifest(self, entries):
        '''
        Create <manifest>

        `entries`: List of (path, mime-type) If mime-type is None it is autodetected
        '''
        # Relative paths are taken to be relative to base_path
        entries = list(map(lambda x: x if os.path.isabs(x[0]) else
                      (os.path.abspath(os.path.join(self.base_path, x[0])), x[1]),
                      entries))
        self.manifest = Manifest.from_paths(entries)
        self.manifest.set_basedir(self.base_path)

    def create_manifest_from_files_in(self, files_and_dirs,
            exclude=lambda x:False):
        '''
        Create the manifest from a list of files and folders.

        `files_and_dirs`: List of paths. Folders are walked recursively and
        every regular file in them for which `exclude` is false is added;
        paths that are not folders are added as they are, without
        consulting `exclude`.

        `exclude`: Callable taking a file path and returning True for files
        that should be left out.
        '''
        entries = []

        def dodir(dir):
            for spec in os.walk(dir):
                # spec is (dirpath, dirnames, filenames)
                root, files = spec[0], spec[-1]
                for name in files:
                    path = os.path.join(root, name)
                    if os.path.isfile(path) and not exclude(path):
                        entries.append((path, None))

        for i in files_and_dirs:
            if os.path.isdir(i):
                dodir(i)
            else:
                entries.append((i, None))

        self.create_manifest(entries)

    def create_spine(self, entries):
        '''
        Create the <spine> element. Must first call :method:`create_manifest`.

        `entries`: List of paths
        '''
        # Relative paths are taken to be relative to base_path
        entries = list(map(lambda x: x if os.path.isabs(x) else
                      os.path.abspath(os.path.join(self.base_path, x)), entries))
        self.spine = Spine.from_paths(entries, self.manifest)

    def set_toc(self, toc):
        '''
        Set the toc. You must call :method:`create_spine` before calling this
        method.

        :param toc: A :class:`TOC` object
        '''
        self.toc = toc

    def create_guide(self, guide_element):
        '''
        Replace the guide with one built by ``Guide.from_opf_guide`` from
        `guide_element`, using base_path as its base folder.
        '''
        self.guide = Guide.from_opf_guide(guide_element, self.base_path)
        self.guide.set_basedir(self.base_path)

    def render(self, opf_stream=sys.stdout, ncx_stream=None,
               ncx_manifest_entry=None, encoding=None, process_guide=None):
        '''
        Serialize this object as an OPF 2.0 package and write it to
        `opf_stream`.

        `opf_stream`: Stream the serialized package, which is bytes, is
        written to and which is then flushed.

        `ncx_stream`: If not None and a toc is set, the toc is rendered to
        this stream.

        `ncx_manifest_entry`: If not None and a toc is set, the path of the
        NCX file (relative paths are taken to be relative to base_path). It
        replaces any manifest item with the id ``ncx``.

        `encoding`: Encoding of the serialized package, utf-8 if None.

        `process_guide`: If not None, called with the element factory and
        the <guide> element once the guide has been filled in, before the
        package is assembled.
        '''
        if encoding is None:
            encoding = 'utf-8'
        toc = getattr(self, 'toc', None)
        if self.manifest:
            self.manifest.set_basedir(self.base_path)
            if ncx_manifest_entry is not None and toc is not None:
                if not os.path.isabs(ncx_manifest_entry):
                    ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry)
                # Replace any existing manifest item that has the id 'ncx'
                remove = [i for i in self.manifest if i.id == 'ncx']
                for item in remove:
                    self.manifest.remove(item)
                self.manifest.append(ManifestItem(ncx_manifest_entry, self.base_path))
                self.manifest[-1].id = 'ncx'
                self.manifest[-1].mime_type = 'application/x-dtbncx+xml'
        if self.guide is None:
            self.guide = Guide()
        if self.cover:
            cover = self.cover
            if not os.path.isabs(cover):
                cover = os.path.abspath(os.path.join(self.base_path, cover))
            self.guide.set_cover(cover)
        self.guide.set_basedir(self.base_path)

        # Actual rendering
        from lxml.builder import ElementMaker
        from calibre.ebooks.oeb.base import OPF2_NS, DC11_NS, CALIBRE_NS
        # The tree is built in a placeholder namespace, which is replaced
        # by the real OPF namespace in the serialized bytes at the end.
        # NOTE(review): presumably this lets the OPF namespace be both the
        # default namespace and the one bound to the opf prefix - confirm.
        DNS = OPF2_NS+'___xx___'
        E = ElementMaker(namespace=DNS, nsmap={None:DNS})
        M = ElementMaker(namespace=DNS,
                nsmap={'dc':DC11_NS, 'calibre':CALIBRE_NS, 'opf':OPF2_NS})
        DC = ElementMaker(namespace=DC11_NS)

        # The dict defaults below are never modified, so sharing them
        # between calls is harmless
        def DC_ELEM(tag, text, dc_attrs={}, opf_attrs={}):
            # Dublin Core element; opf_attrs are set in the OPF namespace
            if text:
                elem = getattr(DC, tag)(clean_ascii_chars(text), **dc_attrs)
            else:
                elem = getattr(DC, tag)(**dc_attrs)
            for k, v in opf_attrs.items():
                elem.set('{%s}%s'%(OPF2_NS, k), v)
            return elem

        def CAL_ELEM(name, content):
            # <meta name=... content=.../> element
            return M.meta(name=name, content=content)

        metadata = M.metadata()
        a = metadata.append
        role = {}
        a(DC_ELEM('title', self.title if self.title else _('Unknown'),
            opf_attrs=role))
        for i, author in enumerate(self.authors):
            fa = {'role':'aut'}
            # Only the first author carries the author sort value
            if i == 0 and self.author_sort:
                fa['file-as'] = self.author_sort
            a(DC_ELEM('creator', author, opf_attrs=fa))
        a(DC_ELEM('contributor', '%s (%s) [%s]'%(__appname__, __version__,
            'https://calibre-ebook.com'), opf_attrs={'role':'bkp',
                'file-as':__appname__}))
        # This identifier is the one the package's unique-identifier
        # attribute, set further down, refers to
        a(DC_ELEM('identifier', str(self.application_id),
            opf_attrs={'scheme':__appname__},
            dc_attrs={'id':__appname__+'_id'}))
        if getattr(self, 'pubdate', None) is not None:
            a(DC_ELEM('date', self.pubdate.isoformat()))
        langs = self.languages
        # Without a usable language, fall back to the primary subtag of
        # what get_lang() returns
        if not langs or langs == ['und']:
            langs = [get_lang().replace('_', '-').partition('-')[0]]
        for lang in langs:
            a(DC_ELEM('language', lang))
        if self.comments:
            a(DC_ELEM('description', self.comments))
        if self.publisher:
            a(DC_ELEM('publisher', self.publisher))
        for key, val in iteritems(self.get_identifiers()):
            a(DC_ELEM('identifier', val, opf_attrs={'scheme':icu_upper(key)}))
        if self.rights:
            a(DC_ELEM('rights', self.rights))
        if self.tags:
            for tag in self.tags:
                a(DC_ELEM('subject', tag))
        if self.series:
            a(CAL_ELEM('calibre:series', self.series))
            if self.series_index is not None:
                a(CAL_ELEM('calibre:series_index', self.format_series_index()))
        if self.title_sort:
            a(CAL_ELEM('calibre:title_sort', self.title_sort))
        if self.rating is not None:
            a(CAL_ELEM('calibre:rating', str(self.rating)))
        if self.timestamp is not None:
            a(CAL_ELEM('calibre:timestamp', self.timestamp.isoformat()))
        if self.publication_type is not None:
            a(CAL_ELEM('calibre:publication_type', self.publication_type))
        if self.user_categories:
            from calibre.ebooks.metadata.book.json_codec import object_to_unicode
            a(CAL_ELEM('calibre:user_categories',
                       json.dumps(object_to_unicode(self.user_categories))))
        if self.primary_writing_mode:
            a(M.meta(name='primary-writing-mode', content=self.primary_writing_mode))
        manifest = E.manifest()
        if self.manifest is not None:
            for ref in self.manifest:
                href = ref.href()
                if isinstance(href, bytes):
                    href = href.decode('utf-8')
                item = E.item(id=str(ref.id), href=href)
                item.set('media-type', ref.mime_type)
                manifest.append(item)
        spine = E.spine()
        if self.toc is not None:
            # 'ncx' is the id given to the NCX manifest item above
            spine.set('toc', 'ncx')
        if self.page_progression_direction is not None:
            spine.set('page-progression-direction', self.page_progression_direction)
        if self.spine is not None:
            for ref in self.spine:
                if ref.id is not None:
                    spine.append(E.itemref(idref=ref.id))
        guide = E.guide()
        if self.guide is not None:
            for ref in self.guide:
                href = ref.href()
                if isinstance(href, bytes):
                    href = href.decode('utf-8')
                item = E.reference(type=ref.type, href=href)
                if ref.title:
                    item.set('title', ref.title)
                guide.append(item)
        if process_guide is not None:
            process_guide(E, guide)

        serialize_user_metadata(metadata, self.get_all_user_metadata(False))

        root = E.package(
                metadata,
                manifest,
                spine,
                guide
        )
        root.set('unique-identifier', __appname__+'_id')
        root.set('version', '2.0')
        raw = etree.tostring(root, pretty_print=True, xml_declaration=True,
                encoding=encoding)
        # Swap the placeholder namespace for the real OPF namespace
        raw = raw.replace(DNS.encode('utf-8'), OPF2_NS.encode('utf-8'))
        opf_stream.write(raw)
        opf_stream.flush()
        if toc is not None and ncx_stream is not None:
            toc.render(ncx_stream, self.application_id)
            ncx_stream.flush()
1596
1597
def metadata_to_opf(mi, as_string=True, default_lang=None):
    '''
    Serialize the metadata in ``mi`` as an OPF 2.0 package document.

    ``mi`` is updated in place: empty ``application_id``, ``uuid``,
    ``book_producer`` and ``languages`` fields are filled in with defaults and
    a ``cover`` that is not a ``str`` is decoded using the filesystem encoding.

    :param mi: The metadata object to serialize
    :param as_string: If True return the serialized document (UTF-8 encoded,
        with an XML declaration), otherwise return the root lxml element
    :param default_lang: Language used when ``mi`` has no languages. If None,
        the part of ``get_lang()`` before the first ``_`` or ``-`` is used
    '''
    from lxml import etree
    import textwrap
    from xml.sax.saxutils import escape
    # NOTE: inside this function OPF is the tag name helper from oeb.base, it
    # shadows the module level OPF name used elsewhere in this file
    from calibre.ebooks.oeb.base import OPF, DC

    if not mi.application_id:
        mi.application_id = str(uuid.uuid4())

    if not mi.uuid:
        mi.uuid = str(uuid.uuid4())

    if not mi.book_producer:
        mi.book_producer = __appname__ + ' (%s) '%__version__ + \
            '[https://calibre-ebook.com]'

    if not mi.languages:
        lang = (get_lang().replace('_', '-').partition('-')[0] if default_lang
                is None else default_lang)
        mi.languages = [lang]

    # The identifier values are substituted into XML source text before it is
    # parsed, so they must be escaped: a value containing &, < or > would
    # otherwise yield a malformed (or structurally different) document.
    root = safe_xml_fromstring(textwrap.dedent(
    '''
    <package xmlns="http://www.idpf.org/2007/opf" unique-identifier="uuid_id" version="2.0">
        <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
            <dc:identifier opf:scheme="%(a)s" id="%(a)s_id">%(id)s</dc:identifier>
            <dc:identifier opf:scheme="uuid" id="uuid_id">%(uuid)s</dc:identifier>
            </metadata>
        <guide/>
    </package>
    '''%dict(a=__appname__, id=escape(str(mi.application_id)),
             uuid=escape(str(mi.uuid)))))
    metadata = root[0]
    guide = root[1]
    # Whitespace in .tail/.text is set by hand throughout to keep the
    # serialized document indented
    metadata[0].tail = '\n'+(' '*8)

    def factory(tag, text=None, sort=None, role=None, scheme=None, name=None,
            content=None):
        # Append a new child element named ``tag`` to <metadata>. Only truthy
        # arguments are turned into attributes/text.
        attrib = {}
        if sort:
            attrib[OPF('file-as')] = sort
        if role:
            attrib[OPF('role')] = role
        if scheme:
            attrib[OPF('scheme')] = scheme
        if name:
            attrib['name'] = name
        if content:
            attrib['content'] = content
        try:
            elem = metadata.makeelement(tag, attrib=attrib)
        except ValueError:
            # lxml rejected an attribute value, retry with cleaned up values
            elem = metadata.makeelement(tag, attrib={k:clean_xml_chars(v) for k, v in iteritems(attrib)})
        elem.tail = '\n'+(' '*8)
        if text:
            try:
                elem.text = text.strip()
            except ValueError:
                # lxml rejected the text, retry with cleaned up text
                elem.text = clean_ascii_chars(text.strip())
        metadata.append(elem)

    def meta(n, c):
        # Append a calibre specific <meta name="calibre:n" content="c"/>
        factory('meta', name='calibre:'+n, content=c)

    # Dublin Core elements
    factory(DC('title'), mi.title)
    for au in mi.authors:
        factory(DC('creator'), au, mi.author_sort, 'aut')
    factory(DC('contributor'), mi.book_producer, __appname__, 'bkp')
    if hasattr(mi.pubdate, 'isoformat'):
        factory(DC('date'), isoformat(mi.pubdate))
    if hasattr(mi, 'category') and mi.category:
        factory(DC('type'), mi.category)
    if mi.comments:
        factory(DC('description'), clean_ascii_chars(mi.comments))
    if mi.publisher:
        factory(DC('publisher'), mi.publisher)
    for key, val in iteritems(mi.get_identifiers()):
        factory(DC('identifier'), val, scheme=icu_upper(key))
    if mi.rights:
        factory(DC('rights'), mi.rights)
    for lang in mi.languages:
        # Empty and undetermined ("und") languages are not written
        if not lang or lang.lower() == 'und':
            continue
        factory(DC('language'), lang)
    if mi.tags:
        for tag in mi.tags:
            factory(DC('subject'), tag)

    # calibre specific <meta> elements
    if getattr(mi, 'author_link_map', None) is not None:
        meta('author_link_map', dump_dict(mi.author_link_map))
    if mi.series:
        meta('series', mi.series)
    if mi.series_index is not None:
        meta('series_index', mi.format_series_index())
    if mi.rating is not None:
        meta('rating', str(mi.rating))
    if hasattr(mi.timestamp, 'isoformat'):
        meta('timestamp', isoformat(mi.timestamp))
    if mi.publication_type:
        meta('publication_type', mi.publication_type)
    if mi.title_sort:
        meta('title_sort', mi.title_sort)
    if mi.user_categories:
        meta('user_categories', dump_dict(mi.user_categories))

    serialize_user_metadata(metadata, mi.get_all_user_metadata(False))
    all_annotations = getattr(mi, 'all_annotations', None)
    if all_annotations:
        serialize_annotations(metadata, all_annotations)

    # The last child is followed by the closing </metadata> tag, which is
    # indented one level less than the children
    metadata[-1].tail = '\n' +(' '*4)

    if mi.cover:
        if not isinstance(mi.cover, str):
            mi.cover = mi.cover.decode(filesystem_encoding)
        guide.text = '\n'+(' '*8)
        r = guide.makeelement(OPF('reference'),
                attrib={'type':'cover', 'title':_('Cover'), 'href':mi.cover})
        r.tail = '\n' +(' '*4)
        guide.append(r)
    if pretty_print_opf:
        _pretty_print(root)

    if not as_string:
        return root
    return etree.tostring(root, pretty_print=True, encoding='utf-8',
            xml_declaration=True)
1718
1719
def test_m2o():
    '''
    Round trip a sample metadata object through :func:`metadata_to_opf` and
    the OPF parser. The generated OPF is printed, followed by a ``FAILED:``
    line for every field that differs after the round trip.
    '''
    from calibre.utils.date import now as nowf
    mi = MetaInformation('test & title', ['a"1', "a'2"])
    mi.title_sort = 'a\'"b'
    mi.author_sort = 'author sort'
    mi.pubdate = nowf()
    mi.language = 'en'
    mi.comments = 'what a fun book\n\n'
    mi.publisher = 'publisher'
    mi.set_identifiers({'isbn':'booo', 'dummy':'dummy'})
    mi.tags = ['a', 'b']
    mi.series = 's"c\'l&<>'
    mi.series_index = 3.34
    mi.rating = 3
    mi.timestamp = nowf()
    mi.publication_type = 'ooooo'
    mi.rights = 'yes'
    mi.cover = os.path.abspath('asd.jpg')
    opf = metadata_to_opf(mi)
    print(opf)
    newmi = MetaInformation(OPF(io.BytesIO(opf)))
    for attr in ('author_sort', 'title_sort', 'comments',
                    'publisher', 'series', 'series_index', 'rating',
                    'isbn', 'tags', 'cover_data', 'application_id',
                    'language', 'cover',
                    'book_producer', 'timestamp',
                    'pubdate', 'rights', 'publication_type'):
        o, n = getattr(mi, attr), getattr(newmi, attr)
        if o != n:
            # Differences in surrounding whitespace are tolerated, but only
            # values that can actually be stripped are compared that way.
            # Numbers, dates, lists, None, etc. are reported as failures
            # instead of raising AttributeError.
            can_strip = hasattr(o, 'strip') and hasattr(n, 'strip')
            if not can_strip or o.strip() != n.strip():
                print('FAILED:', attr, o, '!=', n)
    if mi.get_identifiers() != newmi.get_identifiers():
        print('FAILED:', 'identifiers', mi.get_identifiers(), end=' ')
        print('!=', newmi.get_identifiers())
1753
1754
def suite():
    '''
    Build and return the unittest suite that exercises reading, writing,
    re-creating and smart-updating a small sample OPF document.
    '''
    import unittest

    # Sample document parsed afresh for every test
    sample_opf = b'''\
    <?xml version="1.0"  encoding="UTF-8"?>
    <package version="2.0" xmlns="http://www.idpf.org/2007/opf" >
    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
        <dc:title opf:file-as="Wow">A Cool &amp; &copy; &#223; Title</dc:title>
        <creator opf:role="aut" file-as="Monkey">Monkey Kitchen</creator>
        <creator opf:role="aut">Next</creator>
        <dc:subject>One</dc:subject><dc:subject>Two</dc:subject>
        <dc:identifier scheme="ISBN">123456789</dc:identifier>
        <dc:identifier scheme="dummy">dummy</dc:identifier>
        <meta name="calibre:series" content="A one book series" />
        <meta name="calibre:rating" content="4"/>
        <meta name="calibre:publication_type" content="test"/>
        <meta name="calibre:series_index" content="2.5" />
    </metadata>
    <manifest>
        <item id="1" href="a%20%7E%20b" media-type="text/txt" />
    </manifest>
    </package>
    '''

    class OPFTest(unittest.TestCase):

        def setUp(self):
            self.stream = io.BytesIO(sample_opf)
            self.opf = OPF(self.stream, os.getcwd())

        def testReading(self, opf=None):
            # Also called by the other tests to verify a re-parsed or
            # updated document, hence the optional argument
            book = self.opf if opf is None else opf
            expected_fields = (
                ('title', 'A Cool & \xa9 \xdf Title'),
                ('authors', ['Monkey Kitchen', 'Next']),
                ('author_sort', 'Monkey'),
                ('title_sort', 'Wow'),
                ('tags', ['One', 'Two']),
                ('isbn', '123456789'),
                ('series', 'A one book series'),
                ('series_index', 2.5),
                ('rating', 4),
                ('publication_type', 'test'),
            )
            for field, expected in expected_fields:
                self.assertEqual(getattr(book, field), expected)
            first_item = list(book.itermanifest())[0]
            self.assertEqual(first_item.get('href'), 'a ~ b')
            self.assertEqual(
                book.get_identifiers(),
                {'isbn':'123456789', 'dummy':'dummy'})

        def testWriting(self):
            updates = (
                ('title', 'New & Title'),
                ('authors', ['One', 'Two']),
                ('author_sort', "Kitchen"),
                ('tags', ['Three']),
                ('isbn', 'a'),
                ('rating', 3),
                ('series_index', 1),
                ('title_sort', 'ts'),
            )
            # Every value that is set must be read back unchanged
            for field, value in updates:
                setattr(self.opf, field, value)
                self.assertEqual(getattr(self.opf, field), value)

            self.opf.render()

        def testCreator(self):
            rendered = io.BytesIO()
            creator = OPFCreator(os.getcwd(), self.opf)
            creator.render(rendered)
            reparsed = OPF(io.BytesIO(rendered.getvalue()), os.getcwd())
            self.testReading(opf=reparsed)

        def testSmartUpdate(self):
            snapshot = MetaInformation(self.opf)
            self.opf.smart_update(snapshot)
            self.testReading()

    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(OPFTest)
1825
1826
def test():
    '''Run the tests returned by :func:`suite` with a verbose text runner.'''
    from unittest import TextTestRunner
    runner = TextTestRunner(verbosity=2)
    runner.run(suite())
1830
1831
def test_user_metadata():
    '''
    Check that user metadata survives serialization by both
    :func:`metadata_to_opf` and ``OPFCreator``, then print the OPF rendered
    from the first of the two parsed documents.
    '''
    custom_fields = {
        '#myseries': {'#value#': 'test series\xe4', 'datatype':'text',
            'is_multiple': None, 'name': 'My Series'},
        '#myseries_index': {'#value#': 2.45, 'datatype': 'float',
            'is_multiple': None},
        '#mytags': {'#value#':['t1','t2','t3'], 'datatype':'text',
            'is_multiple': '|', 'name': 'My Tags'}
        }
    book = Metadata('Test title', ['test author1', 'test author2'])
    book.set_all_user_metadata(custom_fields)

    # Serialize via metadata_to_opf first, then via OPFCreator
    direct_bytes = metadata_to_opf(book)
    creator_buf = io.BytesIO()
    OPFCreator(os.getcwd(), other=book).render(creator_buf)
    creator_bytes = creator_buf.getvalue()

    # Parse both documents back and compare with what was stored
    parsed_direct = OPF(io.BytesIO(direct_bytes))
    parsed_creator = OPF(io.BytesIO(creator_bytes))
    assert custom_fields == parsed_direct._user_metadata_
    assert custom_fields == parsed_creator._user_metadata_
    print(parsed_direct.render())
1855
1856
if __name__ == '__main__':
    # When executed as a script: run the metadata_to_opf round trip check and
    # then the unittest suite. The user metadata check is left disabled.
    # test_user_metadata()
    test_m2o()
    test()
1861