1'''
2Basic support for manipulating OEB 1.x/2.0 content and metadata.
3'''
4
5__license__   = 'GPL v3'
6__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
7__docformat__ = 'restructuredtext en'
8
9import os, re, logging, sys, numbers
10from collections import defaultdict
11from itertools import count
12from operator import attrgetter
13
14from lxml import etree, html
15from calibre import force_unicode
16from calibre.constants import filesystem_encoding, __version__
17from calibre.translations.dynamic import translate
18from calibre.utils.xml_parse import safe_xml_fromstring
19from calibre.ebooks.chardet import xml_to_unicode
20from calibre.ebooks.conversion.preprocess import CSSPreProcessor
21from calibre import (isbytestring, as_unicode, get_types_map)
22from calibre.ebooks.oeb.parse_utils import barename, XHTML_NS, namespace, XHTML, parse_html, NotHTML
23from calibre.utils.cleantext import clean_xml_chars
24from calibre.utils.short_uuid import uuid4
25from polyglot.builtins import iteritems, string_or_bytes, itervalues, codepoint_to_chr
26from polyglot.urllib import unquote as urlunquote, urldefrag, urljoin, urlparse, urlunparse
27from calibre.utils.icu import numeric_sort_key
28
# Namespace URIs referenced by the OEB/OPF/NCX handling code in this module.
XML_NS       = 'http://www.w3.org/XML/1998/namespace'
OEB_DOC_NS   = 'http://openebook.org/namespaces/oeb-document/1.0/'
OPF1_NS      = 'http://openebook.org/namespaces/oeb-package/1.0/'
OPF2_NS      = 'http://www.idpf.org/2007/opf'
# Both OPF package namespaces (1.0 and 2.0)
OPF_NSES     = {OPF1_NS, OPF2_NS}
DC09_NS      = 'http://purl.org/metadata/dublin_core'
DC10_NS      = 'http://purl.org/dc/elements/1.0/'
DC11_NS      = 'http://purl.org/dc/elements/1.1/'
# All Dublin Core namespace variants listed above
DC_NSES      = {DC09_NS, DC10_NS, DC11_NS}
XSI_NS       = 'http://www.w3.org/2001/XMLSchema-instance'
DCTERMS_NS   = 'http://purl.org/dc/terms/'
NCX_NS       = 'http://www.daisy.org/z3986/2005/ncx/'
SVG_NS       = 'http://www.w3.org/2000/svg'
XLINK_NS     = 'http://www.w3.org/1999/xlink'
CALIBRE_NS   = 'http://calibre.kovidgoyal.net/2009/metadata'
RE_NS        = 'http://exslt.org/regular-expressions'
MBP_NS       = 'http://www.mobipocket.com'
EPUB_NS      = 'http://www.idpf.org/2007/ops'
MATHML_NS    = 'http://www.w3.org/1998/Math/MathML'

# Prefix -> namespace URI map handed to lxml as ``namespaces`` by XPath() and
# xpath() below.
XPNSMAP      = {
        'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS, 'd09': DC09_NS,
        'd10': DC10_NS, 'd11': DC11_NS, 'xsi': XSI_NS, 'dt': DCTERMS_NS,
        'ncx': NCX_NS, 'svg': SVG_NS, 'xl': XLINK_NS, 're': RE_NS,
        'mathml': MATHML_NS, 'mbp': MBP_NS, 'calibre': CALIBRE_NS,
        'epub':EPUB_NS
}

# Prefix -> namespace URI maps for OPF 1.x and OPF 2.0 package documents
OPF1_NSMAP   = {'dc': DC11_NS, 'oebpackage': OPF1_NS}
OPF2_NSMAP   = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
                'xsi': XSI_NS, 'calibre': CALIBRE_NS}
60
61
def XML(name):
    """Return ``name`` qualified with the XML namespace as ``{uri}name``."""
    return '{{{}}}{}'.format(XML_NS, name)
64
65
def OPF(name):
    """Return ``name`` qualified with the OPF 2.0 namespace as ``{uri}name``."""
    return '{{{}}}{}'.format(OPF2_NS, name)
68
69
def DC(name):
    """Return ``name`` qualified with the Dublin Core 1.1 namespace as ``{uri}name``."""
    return '{{{}}}{}'.format(DC11_NS, name)
72
73
def XSI(name):
    """Return ``name`` qualified with the XML Schema instance namespace as ``{uri}name``."""
    return '{{{}}}{}'.format(XSI_NS, name)
76
77
def DCTERMS(name):
    """Return ``name`` qualified with the DC terms namespace as ``{uri}name``."""
    return '{{{}}}{}'.format(DCTERMS_NS, name)
80
81
def NCX(name):
    """Return ``name`` qualified with the NCX namespace as ``{uri}name``."""
    return '{{{}}}{}'.format(NCX_NS, name)
84
85
def SVG(name):
    """Return ``name`` qualified with the SVG namespace as ``{uri}name``."""
    return '{{{}}}{}'.format(SVG_NS, name)
88
89
def XLINK(name):
    """Return ``name`` qualified with the XLink namespace as ``{uri}name``."""
    return '{{{}}}{}'.format(XLINK_NS, name)
92
93
def CALIBRE(name):
    """Return ``name`` qualified with the calibre metadata namespace as ``{uri}name``."""
    return '{{{}}}{}'.format(CALIBRE_NS, name)
96
97
# CSS ``url(...)`` reference with optional single/double quotes,
# case-insensitive; group 1 is the URL.
_css_url_re = re.compile(r'url\s*\([\'"]{0,1}(.*?)[\'"]{0,1}\)', re.I)
# CSS ``@import "..."`` rule (double-quoted form only); group 1 is the target.
_css_import_re = re.compile(r'@import "(.*?)"')
# One run of non-space characters; iterlinks() uses it to split the
# ``archive`` attribute of <object> into individual entries.
_archive_re = re.compile(r'[^ ]+')

# Tags that should not be self closed in epub output
self_closing_bad_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details',
'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'figcaption', 'figure', 'footer',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'iframe', 'ins', 'kbd',
'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p',
'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small',
'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var',
'video', 'title', 'script', 'style'}
111
112
def css_text(x):
    """Return ``x.cssText`` as text.

    A bytes value is decoded as UTF-8 with undecodable bytes replaced; any
    other value is returned unchanged.
    """
    text = x.cssText
    return text.decode('utf-8', 'replace') if isinstance(text, bytes) else text
118
119
def as_string_type(pat, for_unicode):
    """Coerce ``pat`` to the requested string type using UTF-8.

    :param pat: A str or bytes value (anything else is returned untouched).
    :param for_unicode: If true return str, otherwise return bytes.
    """
    if for_unicode and isinstance(pat, bytes):
        return pat.decode('utf-8')
    if not for_unicode and isinstance(pat, str):
        return pat.encode('utf-8')
    return pat
128
129
def self_closing_pat(for_unicode):
    """Return the compiled regex matching self-closed ``self_closing_bad_tags``.

    The pattern is built once per string type (str or bytes) and cached as an
    attribute on this function object.
    """
    cache_name = 'unicode_ans' if for_unicode else 'bytes_ans'
    cached = getattr(self_closing_pat, cache_name, None)
    if cached is not None:
        return cached
    alternatives = '|'.join(self_closing_bad_tags)
    source = as_string_type(
        r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>' % alternatives, for_unicode)
    compiled = re.compile(source, flags=re.IGNORECASE)
    setattr(self_closing_pat, cache_name, compiled)
    return compiled
141
142
def close_self_closing_tags(raw):
    """Rewrite ``<tag .../>`` as ``<tag ...></tag>`` for tags matched by
    self_closing_pat(); works on both str and bytes input."""
    is_text = isinstance(raw, str)
    replacement = as_string_type(r'<\g<tag>\g<arg>></\g<tag>>', is_text)
    return self_closing_pat(is_text).sub(replacement, raw)
148
149
def uuid_id():
    """Return a new identifier: the result of uuid4() prefixed with 'u'."""
    suffix = uuid4()
    return 'u' + suffix
152
153
def itercsslinks(raw):
    """Yield ``(link, offset)`` pairs for links found in the CSS text ``raw``.

    All ``url(...)`` references are yielded first, followed by all
    ``@import "..."`` targets; ``offset`` is where the link starts in ``raw``.
    """
    for pattern in (_css_url_re, _css_import_re):
        for found in pattern.finditer(raw):
            yield found.group(1), found.start(1)
159
160
# Attribute names iterlinks() treats as links: lxml's HTML link attributes
# plus the namespaced xlink href and ``poster``.
_link_attrs = set(html.defs.link_attrs) | {XLINK('href'), 'poster'}
162
163
def iterlinks(root, find_links_in_css=True):
    '''
    Iterate over all links in a OEB Document.

    Yields ``(element, attribute, link, pos)`` tuples. ``attribute`` is None
    when the link was found in the text of a ``style`` element and ``pos`` is
    the offset of the link inside the attribute value or text.

    :param root: A valid lxml.etree element.
    :param find_links_in_css: If false, links inside ``style`` elements and
        ``style`` attributes are not reported.
    '''
    assert etree.iselement(root)

    for el in root.iter('*'):
        try:
            tag = barename(el.tag).lower()
        except Exception:
            continue
        attribs = el.attrib

        if tag != 'object':
            for name in attribs:
                if name in _link_attrs:
                    yield (el, name, attribs[name], 0)
        else:
            # <object> tags have attributes that are relative to
            # codebase
            has_codebase = 'codebase' in attribs
            codebase = el.get('codebase') if has_codebase else None
            if has_codebase:
                yield (el, 'codebase', codebase, 0)
            for name in ('classid', 'data'):
                if name not in attribs:
                    continue
                target = el.get(name)
                if codebase is not None:
                    target = urljoin(codebase, target)
                yield (el, name, target, 0)
            if 'archive' in attribs:
                for entry in _archive_re.finditer(el.get('archive')):
                    target = entry.group(0)
                    if codebase is not None:
                        target = urljoin(codebase, target)
                    yield (el, 'archive', target, entry.start())

        if find_links_in_css:
            if tag == 'style' and el.text:
                # url(...) references first, then @import targets
                for link, offset in itercsslinks(el.text):
                    yield (el, None, link, offset)
            if 'style' in attribs:
                for found in _css_url_re.finditer(attribs['style']):
                    yield (el, 'style', found.group(1), found.start(1))
213
214
def make_links_absolute(root, base_url):
    '''
    Rewrite every link in the document so that it is absolute, by joining
    it with ``base_url`` (the full URL where the document came from).
    '''
    rewrite_links(root, lambda href: urljoin(base_url, href))
224
225
def resolve_base_href(root):
    '''
    Apply and remove any ``<base href="...">`` tags found in the document.

    Every ``base`` element (namespaced or not) that has an ``href`` is removed
    from the tree. If several are present, the href of the last one is used.
    When a non-empty href was found, all links in the document are made
    absolute relative to it.

    :param root: The root lxml element of the document.
    '''
    base_href = None
    for base in root.xpath('//base[@href]|//h:base[@href]', namespaces=XPNSMAP):
        base_href = base.get('href')
        # extract() does what lxml.html's drop_tree() does, but also works on
        # plain etree elements, which have no drop_tree() method.
        extract(base)
    if not base_href:
        return
    # make_links_absolute() accepts only (root, base_url); it does not
    # resolve base hrefs itself, so there is no recursion to suppress.
    make_links_absolute(root, base_href)
236
237
def rewrite_links(root, link_repl_func, resolve_base_href=False):
    '''
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.

    :param root: The root lxml element of the document; modified in place.
    :param link_repl_func: Callable taking the (stripped) link and returning
        its replacement, or None to remove it.
    :param resolve_base_href: If true, ``<base href>`` tags are applied and
        removed before links are rewritten.
    '''
    from css_parser import replaceUrls, log, CSSParser
    log.setLevel(logging.WARN)
    log.raiseExceptions = False

    if resolve_base_href:
        # The parameter shadows the module-level function of the same name,
        # so look the function up explicitly instead of calling the flag.
        globals()['resolve_base_href'](root)
    # Links in attributes are handled here; links inside CSS are rewritten
    # by css_parser in the second pass below.
    for el, attrib, link, pos in iterlinks(root, find_links_in_css=False):
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        if new_link is None:
            # Remove the attribute or element content
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            continue
        if attrib is None:
            new = el.text[:pos] + new_link + el.text[pos+len(link):]
            el.text = new
        else:
            cur = el.attrib[attrib]
            if not pos and len(cur) == len(link):
                # Most common case
                el.attrib[attrib] = new_link
            else:
                new = cur[:pos] + new_link + cur[pos+len(link):]
                el.attrib[attrib] = new

    parser = CSSParser(raiseExceptions=False, log=_css_logger,
            fetcher=lambda x:(None, ''))
    for el in root.iter(etree.Element):
        try:
            tag = el.tag
        except UnicodeDecodeError:
            continue

        # <style> elements: only re-serialize stylesheets that actually
        # contain a url() or an @import
        if tag == XHTML('style') and el.text and \
                (_css_url_re.search(el.text) is not None or '@import' in
                        el.text):
            stylesheet = parser.parseString(el.text, validate=False)
            replaceUrls(stylesheet, link_repl_func)
            repl = css_text(stylesheet)
            el.text = '\n'+ clean_xml_chars(repl) + '\n'

        # style="..." attributes containing a url()
        text = el.get('style')
        if text and _css_url_re.search(text) is not None:
            try:
                stext = parser.parseStyle(text, validate=False)
            except Exception:
                # Parsing errors are raised by css_parser
                continue
            replaceUrls(stext, link_repl_func)
            repl = css_text(stext).replace('\n', ' ').replace('\r',
                    ' ')
            el.set('style', repl)
308
309
# MIME types, looked up by file extension in the map returned by
# get_types_map() or given literally.
types_map = get_types_map()
EPUB_MIME      = types_map['.epub']
XHTML_MIME     = types_map['.xhtml']
CSS_MIME       = types_map['.css']
NCX_MIME       = types_map['.ncx']
OPF_MIME       = types_map['.opf']
PAGE_MAP_MIME  = 'application/oebps-page-map+xml'
OEB_DOC_MIME   = 'text/x-oeb1-document'
OEB_CSS_MIME   = 'text/x-oeb1-css'
OPENTYPE_MIME  = types_map['.otf']
GIF_MIME       = types_map['.gif']
JPEG_MIME      = types_map['.jpeg']
PNG_MIME       = types_map['.png']
SVG_MIME       = types_map['.svg']
WEBP_MIME      = types_map['.webp']
BINARY_MIME    = 'application/octet-stream'

# CSS @namespace rule naming the XHTML namespace
XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS

# Groups of MIME types: stylesheets, documents (serialize() treats these as
# OEB documents), raster images and images.
OEB_STYLES        = {CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css', 'xhtml/css'}
OEB_DOCS          = {XHTML_MIME, 'text/html', OEB_DOC_MIME,
                         'text/x-oeb-document'}
OEB_RASTER_IMAGES = {GIF_MIME, JPEG_MIME, PNG_MIME, WEBP_MIME}
OEB_IMAGES        = {GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME}

MS_COVER_TYPE = 'other.ms-coverimage-standard'
336
# Named entity reference such as ``&nbsp;``; group 1 is the entity name.
# The hyphen is last in the character class so that it is a literal ``-``.
# Written as ``.-_`` it formed a range that also admitted characters such as
# ``;``, ``<`` and ``>`` into the entity name.
ENTITY_RE     = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9._:-]+);')
# A run of ASCII whitespace
COLLAPSE_RE   = re.compile(r'[ \t\r\n\v]+')
# A qualified name in ``{namespace}local`` form
QNAME_RE      = re.compile(r'^[{][^{}]+[}][^{}]+$')
# A name starting with ``prefix:local``
PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
# A leading XML declaration (``<?xml ... ?>``), allowing initial whitespace
XMLDECL_RE    = re.compile(r'^\s*<[?]xml.*?[?]>')
# A CSS ``url(...)`` with matching optional quotes; the URL is group ``url``
CSSURL_RE     = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''')
343
344
def element(parent, *args, **kwargs):
    """Create a new element from ``args``/``kwargs``.

    The element is appended to ``parent`` as a sub-element, or created
    stand-alone when ``parent`` is None.
    """
    if parent is None:
        return etree.Element(*args, **kwargs)
    return etree.SubElement(parent, *args, **kwargs)
349
350
def prefixname(name, nsrmap):
    """Convert a ``{namespace}local`` name to ``prefix:local`` form.

    :param nsrmap: Map of namespace URI to prefix.

    ``name`` is returned unchanged if it is not a qualified name or its
    namespace is not in ``nsrmap``; an empty prefix yields the bare local name.
    """
    if isqname(name):
        uri = namespace(name)
        if uri in nsrmap:
            prefix = nsrmap[uri]
            if not prefix:
                return barename(name)
            return ':'.join((prefix, barename(name)))
    return name
361
362
def isprefixname(name):
    """Tell whether ``name`` starts with a ``prefix:local`` pair.

    A falsy ``name`` is returned as-is.
    """
    if not name:
        return name
    return PREFIXNAME_RE.match(name) is not None
365
366
def qname(name, nsmap):
    """Convert a ``prefix:local`` name to ``{namespace}local`` form.

    :param nsmap: Map of prefix to namespace URI.

    ``name`` is returned unchanged if it is not a prefixed name or its prefix
    is not in ``nsmap``.
    """
    if isprefixname(name):
        prefix, _, local = name.partition(':')
        if prefix in nsmap:
            return '{%s}%s' % (nsmap[prefix], local)
    return name
374
375
def isqname(name):
    """Tell whether ``name`` has the ``{namespace}local`` form.

    A falsy ``name`` is returned as-is.
    """
    if not name:
        return name
    return QNAME_RE.match(name) is not None
378
379
def XPath(expr):
    """Compile ``expr`` into an ``etree.XPath`` using the XPNSMAP prefixes."""
    compiled = etree.XPath(expr, namespaces=XPNSMAP)
    return compiled
382
383
def xpath(elem, expr):
    """Evaluate ``expr`` against ``elem`` using the XPNSMAP prefixes."""
    result = elem.xpath(expr, namespaces=XPNSMAP)
    return result
386
387
def xml2str(root, pretty_print=False, strip_comments=False, with_tail=True):
    """Serialize ``root`` to UTF-8 encoded bytes with an XML declaration.

    When comments are kept, any ``--`` inside a comment is replaced by ``__``
    in the tree itself before serializing. When ``strip_comments`` is true,
    comments are removed from the serialized output instead.
    """
    if not strip_comments:
        # -- in comments trips up adobe digital editions
        for comment in root.iterdescendants(etree.Comment):
            body = comment.text
            if body and '--' in body:
                comment.text = body.replace('--', '__')

    serialized = etree.tostring(
        root, encoding='utf-8', xml_declaration=True,
        pretty_print=pretty_print, with_tail=with_tail)
    if not strip_comments:
        return serialized
    return re.sub(br'<!--.*?-->', b'', serialized, flags=re.DOTALL)
401
402
def xml2text(elem, pretty_print=False, method='text'):
    """Serialize ``elem`` (without its tail) to a unicode string using the
    given lxml output ``method``."""
    options = dict(method=method, encoding='unicode', with_tail=False,
                   pretty_print=pretty_print)
    return etree.tostring(elem, **options)
405
406
def escape_cdata(root):
    """Wrap the text of XHTML ``style`` and ``script`` descendants in CDATA.

    Only elements whose text contains ``<``, ``>`` or ``&`` are touched; any
    ``]]>`` in the text is rewritten as ``\\]\\]\\>`` first.
    """
    has_markup_chars = re.compile(r'[<>&]').search
    tags = tuple('{%s}%s' % (XHTML_NS, local) for local in ('style', 'script'))
    for node in root.iterdescendants(*tags):
        text = node.text
        if text and has_markup_chars(text) is not None:
            node.text = etree.CDATA(text.replace(']]>', r'\]\]\>'))
412
413
def serialize(data, media_type, pretty_print=False):
    """Convert ``data`` to bytes.

    * lxml elements are serialized with xml2str(); for OEB document media
      types, style/script text is CDATA-escaped first and self-closing tags
      are expanded afterwards.
    * str is encoded as UTF-8.
    * objects with a ``cssText`` attribute yield that text (UTF-8 encoded if
      it is a str) followed by a newline.
    * anything else is passed to ``bytes()``.
    """
    if isinstance(data, etree._Element):
        if media_type not in OEB_DOCS:
            return xml2str(data, pretty_print=pretty_print)
        escape_cdata(data)
        markup = xml2str(data, pretty_print=pretty_print)
        # Convert self closing div|span|a|video|audio|iframe|etc tags
        # to normally closed ones, as they are interpreted
        # incorrectly by some browser based renderers
        return close_self_closing_tags(markup)
    if isinstance(data, str):
        return data.encode('utf-8')
    if not hasattr(data, 'cssText'):
        return bytes(data)
    css = data.cssText
    if isinstance(css, str):
        css = css.encode('utf-8')
    return css + b'\n'
434
435
# The 128 ASCII characters as one-character strings and as one-byte bytes
ASCII_CHARS   = frozenset(chr(x) for x in range(128))
UNIBYTE_CHARS = frozenset(x.encode('ascii') for x in ASCII_CHARS)
USAFE         = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789' '_.-/~')
URL_SAFE      = frozenset(USAFE)
URL_SAFE_BYTES = frozenset(USAFE.encode('ascii'))
# URL_UNSAFE[0] holds the unsafe characters for str input, URL_UNSAFE[1] the
# unsafe one-byte values for bytes input. Iterating bytes yields ints, so
# URL_SAFE_BYTES contains ints; convert them to one-byte bytes objects before
# subtracting, otherwise nothing is removed from UNIBYTE_CHARS.
URL_UNSAFE = [ASCII_CHARS - URL_SAFE,
              UNIBYTE_CHARS - frozenset(bytes((x,)) for x in URL_SAFE_BYTES)]
del USAFE


def urlquote(href):
    """ Quote URL-unsafe characters, allowing IRI-safe characters.
    That is, this function returns valid IRIs not valid URIs. In particular,
    IRIs can contain non-ascii characters.

    :param href: A str or bytes value; the result has the same type.
    :return: ``href`` with every ASCII character outside letters, digits and
        ``_.-/~`` replaced by its ``%xx`` escape. Non-ASCII characters/bytes
        are passed through unchanged.
    """
    if isinstance(href, bytes):
        unsafe = URL_UNSAFE[1]
        quoted = []
        # Slice rather than iterate: iterating bytes yields ints, which can
        # neither be looked up in a set of bytes nor joined with b''.join.
        for i in range(len(href)):
            byte = href[i:i + 1]
            quoted.append(b'%%%02x' % ord(byte) if byte in unsafe else byte)
        return b''.join(quoted)
    unsafe = URL_UNSAFE[0]
    return ''.join('%%%02x' % ord(char) if char in unsafe else char
                   for char in href)
462
463
def urlnormalize(href):
    """Return ``href`` in normalized form: backslashes become forward
    slashes and each URL component is unquoted and then re-quoted, so that
    all and only URL-unsafe characters end up quoted.

    URLs without a scheme, or with the ``file`` scheme, are reduced to their
    path and fragment. Raises ValueError if the URL cannot be parsed.
    """
    try:
        parts = urlparse(href)
    except ValueError as e:
        raise ValueError('Failed to parse the URL: %r with underlying error: %s' % (href, as_unicode(e)))
    if not parts.scheme or parts.scheme == 'file':
        path, frag = urldefrag(href)
        parts = ('', '', path, '', '', frag)
    normalized = [
        urlquote(urlunquote(part.replace('\\', '/'))) for part in parts
    ]
    return urlunparse(normalized)
479
480
def extract(elem):
    """
    Detach ``elem`` (with its children and text) from its parent.

    Any tail text of ``elem`` is kept: it is appended to the tail of the
    previous sibling, or to the parent's text when there is no previous
    sibling. Elements without a parent are left untouched.
    """
    parent = elem.getparent()
    if parent is None:
        return
    tail = elem.tail
    if tail:
        sibling = elem.getprevious()
        if sibling is not None:
            sibling.tail = (sibling.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(elem)
496
497
class DummyHandler(logging.Handler):
    """Logging handler that forwards formatted records to ``self.log``.

    ``self.log`` starts out as None, in which case records are dropped. Once
    set, records at ERROR level or above go to ``log.error`` and all others
    to ``log.warn``.
    """

    def __init__(self):
        super().__init__(logging.WARNING)
        self.setFormatter(logging.Formatter('%(message)s'))
        self.log = None

    def emit(self, record):
        target = self.log
        if target is None:
            return
        text = self.format(record)
        if record.levelno >= logging.ERROR:
            target.error(text)
        else:
            target.warn(text)
511
512
# Logger named 'calibre.css', passed to CSSParser in rewrite_links(). It is
# limited to WARNING and above and feeds a DummyHandler, which drops records
# until its ``log`` attribute is set.
_css_logger = logging.getLogger('calibre.css')
_css_logger.setLevel(logging.WARNING)
_css_log_handler = DummyHandler()
_css_logger.addHandler(_css_log_handler)
517
518
class OEBError(Exception):
    """Raised for generic OEB-processing failures."""
522
523
class NullContainer:
    """An empty container.

    For use with book formats which do not support container-like access.
    Reading and writing always fail with :class:`OEBError`; no path exists
    and the name list is empty.
    """

    def __init__(self, log):
        self.log = log

    def read(self, path):
        """Always raise OEBError: there is nothing to read."""
        raise OEBError('Attempt to read from NullContainer')

    def write(self, path, data=None):
        """Always raise OEBError: nothing can be written.

        ``data`` is accepted (and ignored) so that the signature matches
        ``DirContainer.write(path, data)``; callers passing data get the
        intended OEBError rather than a TypeError.
        """
        raise OEBError('Attempt to write to NullContainer')

    def exists(self, path):
        """Return False: no path exists in an empty container."""
        return False

    def namelist(self):
        """Return an empty list of names."""
        return []
544
545
class DirContainer:
    """Filesystem directory container."""

    def __init__(self, path, log, ignore_opf=False):
        """
        :param path: Either a directory, or the path to an ``.opf`` file, in
            which case its directory becomes the root and the file the OPF.
        :param log: Logger object, stored as ``self.log``.
        :param ignore_opf: If true, do not search a directory for an OPF file.
        """
        self.log = log
        if isbytestring(path):
            path = path.decode(filesystem_encoding)
        self.opfname = None
        ext = os.path.splitext(path)[1].lower()
        if ext == '.opf':
            self.opfname = os.path.basename(path)
            self.rootdir = os.path.dirname(path)
            return
        self.rootdir = path
        if not ignore_opf:
            # Use the first .opf file found while walking the directory
            for name in self.namelist():
                if os.path.splitext(name)[1].lower() == '.opf':
                    self.opfname = name
                    return

    def _unquote(self, path):
        # unquote must run on a bytestring and will return a bytestring
        # If it runs on a unicode object, it returns a double encoded unicode
        # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8')
        # and the latter is correct
        if isinstance(path, str):
            path = path.encode('utf-8')
        return urlunquote(path).decode('utf-8')

    def read(self, path):
        """Return the bytes of the file at the (URL-quoted) ``path``.

        ``None`` means the OPF file found at construction time.
        """
        if path is None:
            path = self.opfname
        path = os.path.join(self.rootdir, self._unquote(path))
        with lopen(path, 'rb') as f:
            return f.read()

    def write(self, path, data):
        """Write ``data`` to the (URL-quoted) ``path``, creating any missing
        parent directories. Returns the result of the file's ``write()``."""
        path = os.path.join(self.rootdir, self._unquote(path))
        parent = os.path.dirname(path)
        # An empty dirname means the current directory: nothing to create.
        # exist_ok avoids failing if the directory appears between a check
        # and the creation.
        if parent:
            os.makedirs(parent, exist_ok=True)
        with lopen(path, 'wb') as f:
            return f.write(data)

    def exists(self, path):
        """Return True if the (URL-quoted) ``path`` names an existing file."""
        if not path:
            return False
        try:
            path = os.path.join(self.rootdir, self._unquote(path))
        except ValueError:  # Happens if path contains quoted special chars
            return False
        try:
            return os.path.isfile(path)
        except UnicodeEncodeError:
            # On linux, if LANG is unset, the os.stat call tries to encode the
            # unicode path using ASCII
            # To replicate try:
            # LANG=en_US.ASCII python -c "import os; os.stat(u'Espa\xf1a')"
            return os.path.isfile(path.encode(filesystem_encoding))

    def namelist(self):
        """Return the paths of all files below the root directory.

        Each path is the walked directory joined with the file name, using
        forward slashes. Byte names that cannot be decoded are skipped.
        """
        names = []
        for root, dirs, files in os.walk(self.rootdir):
            for fname in files:
                fname = os.path.join(root, fname)
                if isinstance(fname, bytes):
                    try:
                        fname = fname.decode(filesystem_encoding)
                    except Exception:
                        try:
                            fname = fname.decode('utf-8')
                        except Exception:
                            continue
                names.append(fname.replace('\\', '/'))
        return names
624
625
class Metadata:
    """A collection of OEB data model metadata.

    Provides access to the list of items associated with a particular metadata
    term via the term's local name using either Python container or attribute
    syntax.  Return an empty list for any terms with no currently associated
    metadata items.
    """

    DC_TERMS      = {'contributor', 'coverage', 'creator', 'date',
                         'description', 'format', 'identifier', 'language',
                         'publisher', 'relation', 'rights', 'source',
                         'subject', 'title', 'type'}
    CALIBRE_TERMS = {'series', 'series_index', 'rating', 'timestamp',
                         'publication_type', 'title_sort'}
    # Short attribute name -> fully qualified attribute name
    OPF_ATTRS     = {'role': OPF('role'), 'file-as': OPF('file-as'),
                     'scheme': OPF('scheme'), 'event': OPF('event'),
                     'type': XSI('type'), 'lang': XML('lang'), 'id': 'id'}
    OPF1_NSMAP    = {'dc': DC11_NS, 'oebpackage': OPF1_NS}
    OPF2_NSMAP    = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
                     'xsi': XSI_NS, 'calibre': CALIBRE_NS}

    class Item:
        """An item of OEB data model metadata.

        The metadata term or name may be accessed via the :attr:`term` or
        :attr:`name` attributes.  The metadata value or content may be accessed
        via the :attr:`value` or :attr:`content` attributes, or via Unicode or
        string representations of the object.

        OEB data model metadata attributes may be accessed either via their
        fully-qualified names using the Python container access syntax, or via
        their local names using Python attribute syntax.  Only attributes
        allowed by the OPF 2.0 specification are supported.
        """
        class Attribute:
            """Smart accessor for allowed OEB metadata item attributes."""

            def __init__(self, attr, allowed=None):
                # ``attr`` is either the attribute name, or a callable mapping
                # the item's term to the attribute name. ``allowed`` is the
                # list of terms the attribute is valid for (None: any term).
                if not callable(attr):
                    attr_, attr = attr, lambda term: attr_
                self.attr = attr
                self.allowed = allowed

            def term_attr(self, obj):
                """Return the attribute name to use for ``obj``.

                Raises AttributeError if the attribute is not allowed for the
                item's term. Terms outside the DC 1.1 namespace are treated
                as ``opf:meta``.
                """
                term = obj.term
                if namespace(term) != DC11_NS:
                    term = OPF('meta')
                allowed = self.allowed
                if allowed is not None and term not in allowed:
                    raise AttributeError(
                        'attribute %r not valid for metadata term %r' % (
                            self.attr(term), barename(obj.term)))
                return self.attr(term)

            def __get__(self, obj, cls):
                if obj is None:
                    return None
                return obj.attrib.get(self.term_attr(obj), '')

            def __set__(self, obj, value):
                obj.attrib[self.term_attr(obj)] = value

        def __init__(self, term, value, attrib={}, nsmap={}, **kwargs):
            # The default dicts are never mutated: both are copied here.
            self.attrib = attrib = dict(attrib)
            self.nsmap = nsmap = dict(nsmap)
            attrib.update(kwargs)
            if namespace(term) == OPF2_NS:
                term = barename(term)
            ns = namespace(term)
            local = barename(term).lower()
            if local in Metadata.DC_TERMS and (not ns or ns in DC_NSES):
                # Anything looking like Dublin Core is coerced
                term = DC(local)
            elif local in Metadata.CALIBRE_TERMS and ns in (CALIBRE_NS, ''):
                # Ditto for Calibre-specific metadata
                term = CALIBRE(local)
            self.term = term
            self.value = value
            # Iterate over a snapshot, as attrib is modified inside the loop
            for attr, attr_value in tuple(iteritems(attrib)):
                if isprefixname(attr_value):
                    attrib[attr] = qname(attr_value, nsmap)
                nsattr = Metadata.OPF_ATTRS.get(attr, attr)
                if nsattr == OPF('scheme') and namespace(term) != DC11_NS:
                    # The opf:meta element takes @scheme, not @opf:scheme
                    nsattr = 'scheme'
                if attr != nsattr:
                    attrib[nsattr] = attrib.pop(attr)

        @property
        def name(self):
            return self.term

        @property
        def content(self):
            return self.value

        @content.setter
        def content(self, value):
            self.value = value

        scheme  = Attribute(lambda term: 'scheme' if
                            term == OPF('meta') else OPF('scheme'),
                            [DC('identifier'), OPF('meta')])
        file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor'),
                                             DC('title')])
        role    = Attribute(OPF('role'), [DC('creator'), DC('contributor')])
        event   = Attribute(OPF('event'), [DC('date')])
        id      = Attribute('id')
        type    = Attribute(XSI('type'), [DC('date'), DC('format'),
                                          DC('type')])
        lang    = Attribute(XML('lang'), [DC('contributor'), DC('coverage'),
                                          DC('creator'), DC('publisher'),
                                          DC('relation'), DC('rights'),
                                          DC('source'), DC('subject'),
                                          OPF('meta')])

        def __getitem__(self, key):
            return self.attrib[key]

        def __setitem__(self, key, value):
            self.attrib[key] = value

        def __contains__(self, key):
            return key in self.attrib

        def get(self, key, default=None):
            return self.attrib.get(key, default)

        def __repr__(self):
            return 'Item(term=%r, value=%r, attrib=%r)' \
                % (barename(self.term), self.value, self.attrib)

        def __str__(self):
            return as_unicode(self.value)

        def to_opf1(self, dcmeta=None, xmeta=None, nsrmap={}):
            """Create the OPF 1.x element for this item.

            Dublin Core items become a title-cased ``dc:`` element under
            ``dcmeta``; everything else becomes a ``meta`` element with
            ``name``/``content`` attributes under ``xmeta``.
            """
            attrib = {}
            for key, value in self.attrib.items():
                if namespace(key) == OPF2_NS:
                    key = barename(key)
                attrib[key] = prefixname(value, nsrmap)
            if namespace(self.term) == DC11_NS:
                name = DC(icu_title(barename(self.term)))
                elem = element(dcmeta, name, attrib=attrib)
                elem.text = self.value
            else:
                elem = element(xmeta, 'meta', attrib=attrib)
                elem.attrib['name'] = prefixname(self.term, nsrmap)
                elem.attrib['content'] = prefixname(self.value, nsrmap)
            return elem

        def to_opf2(self, parent=None, nsrmap={}):
            """Create the OPF 2.0 element for this item under ``parent``.

            Dublin Core items become an element named after the term; all
            others become an ``opf:meta`` element with ``name``/``content``.
            """
            attrib = {}
            for key, value in self.attrib.items():
                attrib[key] = prefixname(value, nsrmap)
            if namespace(self.term) == DC11_NS:
                elem = element(parent, self.term, attrib=attrib)
                try:
                    elem.text = self.value
                except Exception:
                    # Fall back to the repr for values lxml refuses as text
                    elem.text = repr(self.value)
            else:
                elem = element(parent, OPF('meta'), attrib=attrib)
                elem.attrib['name'] = prefixname(self.term, nsrmap)
                elem.attrib['content'] = prefixname(self.value, nsrmap)
            return elem

    def __init__(self, oeb):
        self.oeb = oeb
        # Local term name -> list of Item objects
        self.items = defaultdict(list)
        self.primary_writing_mode = None

    def add(self, term, value, attrib={}, nsmap={}, **kwargs):
        """Add a new metadata item."""
        item = self.Item(term, value, attrib, nsmap, **kwargs)
        items = self.items[barename(item.term)]
        items.append(item)
        return item

    def iterkeys(self):
        yield from self.items
    __iter__ = iterkeys

    def clear(self, key):
        """Remove all items for ``key``, emptying the existing list in place."""
        del self.items[key][:]

    def filter(self, key, predicate):
        """Remove, in place, every item for ``key`` for which ``predicate``
        returns true."""
        l = self.items[key]
        l[:] = [x for x in l if not predicate(x)]

    def __getitem__(self, key):
        return self.items[key]

    def __contains__(self, key):
        return key in self.items

    def __getattr__(self, term):
        # Only called when normal attribute lookup fails. Go through __dict__
        # so that an instance without ``items`` (e.g. one created without
        # running __init__) raises AttributeError instead of recursing.
        try:
            items = self.__dict__['items']
        except KeyError:
            raise AttributeError(term) from None
        return items[term]

    @property
    def _nsmap(self):
        # Union of the namespace maps of all items
        nsmap = {}
        for term in self.items:
            for item in self.items[term]:
                nsmap.update(item.nsmap)
        return nsmap

    @property
    def _opf1_nsmap(self):
        # Build a new dict rather than deleting entries while iterating,
        # which raises RuntimeError.
        return {prefix: uri for prefix, uri in self._nsmap.items()
                if uri not in OPF_NSES and uri not in DC_NSES}

    @property
    def _opf2_nsmap(self):
        nsmap = self._nsmap
        nsmap.update(OPF2_NSMAP)
        return nsmap

    def to_opf1(self, parent=None):
        """Build and return the OPF 1.x ``metadata`` element for all items.

        An ``ms-chaptertour`` item is added if none is present.
        """
        nsmap = self._opf1_nsmap
        nsrmap = {value: key for key, value in iteritems(nsmap)}
        elem = element(parent, 'metadata', nsmap=nsmap)
        dcmeta = element(elem, 'dc-metadata', nsmap=OPF1_NSMAP)
        xmeta = element(elem, 'x-metadata')
        for term in self.items:
            for item in self.items[term]:
                item.to_opf1(dcmeta, xmeta, nsrmap=nsrmap)
        if 'ms-chaptertour' not in self.items:
            chaptertour = self.Item('ms-chaptertour', 'chaptertour')
            chaptertour.to_opf1(dcmeta, xmeta, nsrmap=nsrmap)
        return elem

    def to_opf2(self, parent=None):
        """Build and return the OPF 2.0 ``metadata`` element for all items.

        A ``primary-writing-mode`` meta element is appended when
        ``primary_writing_mode`` is set.
        """
        nsmap = self._opf2_nsmap
        nsrmap = {value: key for key, value in iteritems(nsmap)}
        elem = element(parent, OPF('metadata'), nsmap=nsmap)
        for term in self.items:
            for item in self.items[term]:
                item.to_opf2(elem, nsrmap=nsrmap)
        if self.primary_writing_mode:
            elem.append(elem.makeelement(OPF('meta'), attrib={'name':'primary-writing-mode', 'content':self.primary_writing_mode}))
        return elem
876
877
class Manifest:
    """Collection of files composing an OEB data model book.

    Provides access to the content of the files composing the book and
    attributes associated with those files, including their internal paths,
    unique identifiers, and MIME types.

    Itself acts as a :class:`set` of manifest items, and provides the following
    instance data members for dictionary-like access:

    :attr:`ids`: A dictionary in which the keys are the unique identifiers of
        the manifest items and the values are the items themselves.
    :attr:`hrefs`: A dictionary in which the keys are the internal paths of the
        manifest items and the values are the items themselves.
    """

    class Item:
        """An OEB data model book content file.

        Provides the following data members for accessing the file content and
        metadata associated with this particular file.

        :attr:`id`: Unique identifier.
        :attr:`href`: Book-internal path.
        :attr:`media_type`: MIME type of the file content.
        :attr:`fallback`: Unique id of any fallback manifest item associated
            with this manifest item.
        :attr:`spine_position`: Display/reading order index for book textual
            content.  `None` for manifest items which are not part of the
            book's textual content.
        :attr:`linear`: `True` for textual content items which are part of the
            primary linear reading order and `False` for textual content items
            which are not (such as footnotes).  Meaningless for items which
            have a :attr:`spine_position` of `None`.
        """

        def __init__(self, oeb, id, href, media_type,
                     fallback=None, loader=str, data=None):
            """Create a manifest item.

            A true *href* is converted with ``str`` and then normalized with
            ``urlnormalize``; the result is stored as both :attr:`href` and
            :attr:`path`.  When neither *loader* nor *data* is supplied
            (both `None`), content is read lazily with
            ``oeb.container.read``.
            """
            if href:
                href = str(href)
            self.oeb = oeb
            self.id = id
            self.href = self.path = urlnormalize(href)
            self.media_type = media_type
            self.fallback = fallback
            # Optional replacement for _fetch_css when parsing stylesheets.
            self.override_css_fetch = None
            # Whether _parse_css should inline @import-ed stylesheets.
            self.resolve_css_imports = True
            self.spine_position = None
            self.linear = True
            if loader is None and data is None:
                loader = oeb.container.read
            self._loader = loader
            self._data = data

        def __repr__(self):
            return 'Item(id=%r, href=%r, media_type=%r)' \
                % (self.id, self.href, self.media_type)

        # Parsing {{{
        def _parse_xml(self, data):
            """Parse *data* as XML; returns `None` for empty input."""
            if not data:
                return
            data = xml_to_unicode(data, strip_encoding_pats=True,
                    assume_utf8=True, resolve_entities=True)[0]
            return safe_xml_fromstring(data)

        def _parse_xhtml(self, data):
            """Parse *data* with ``parse_html``.

            If ``parse_html`` raises ``NotHTML`` the original, untouched
            data is parsed as plain XML instead.
            """
            orig_data = data
            fname = urlunquote(self.href)
            self.oeb.log.debug('Parsing', fname, '...')
            self.oeb.html_preprocessor.current_href = self.href
            try:
                data = parse_html(data, log=self.oeb.log,
                        decoder=self.oeb.decode,
                        preprocessor=self.oeb.html_preprocessor,
                        filename=fname, non_html_file_tags={'ncx'})
            except NotHTML:
                return self._parse_xml(orig_data)
            return data

        def _parse_txt(self, data):
            """Convert plain-text content into a parsed document.

            Content that already contains an ``<html>`` tag is parsed
            directly; anything else is first run through
            ``convert_markdown``, titled with the book's first title (or a
            translated 'Unknown').
            """
            has_html = '<html>'
            if isinstance(data, bytes):
                # Match the marker's type to the data being searched.
                has_html = has_html.encode('ascii')
            if has_html in data:
                return self._parse_xhtml(data)

            self.oeb.log.debug('Converting', self.href, '...')

            from calibre.ebooks.txt.processor import convert_markdown

            title = self.oeb.metadata.title
            if title:
                title = str(title[0])
            else:
                title = _('Unknown')

            return self._parse_xhtml(convert_markdown(data, title=title))

        def _parse_css(self, data):
            """Parse *data* into a css_parser stylesheet.

            The text is decoded and pre-processed by the book's hooks,
            ``@import`` rules are resolved when
            :attr:`resolve_css_imports` is true, and every ``@page`` rule is
            removed from the resulting sheet.
            """
            from css_parser import CSSParser, log, resolveImports
            from css_parser.css import CSSRule
            log.setLevel(logging.WARN)
            log.raiseExceptions = False
            self.oeb.log.debug('Parsing', self.href, '...')
            data = self.oeb.decode(data)
            data = self.oeb.css_preprocessor(data, add_namespace=False)
            parser = CSSParser(loglevel=logging.WARNING,
                               fetcher=self.override_css_fetch or self._fetch_css,
                               log=_css_logger)
            data = parser.parseString(data, href=self.href, validate=False)
            if self.resolve_css_imports:
                data = resolveImports(data)
            # Snapshot the rules first: they are removed while looping.
            for rule in tuple(data.cssRules.rulesOfType(CSSRule.PAGE_RULE)):
                data.cssRules.remove(rule)
            return data

        def _fetch_css(self, path):
            """Fetcher callback handed to the CSS parser for imports.

            Returns an ``(encoding, text)`` tuple for the manifest item at
            *path*, or ``(None, None)`` when the path is not in the manifest
            or the item's media type is not in ``OEB_STYLES``.
            """
            hrefs = self.oeb.manifest.hrefs
            if path not in hrefs:
                self.oeb.logger.warn('CSS import of missing file %r' % path)
                return (None, None)
            item = hrefs[path]
            if item.media_type not in OEB_STYLES:
                self.oeb.logger.warn('CSS import of non-CSS file %r' % path)
                return (None, None)
            data = item.data.cssText
            enc = None if isinstance(data, str) else 'utf-8'
            return (enc, data)

        # }}}

        @property
        def data(self):
            """Provides MIME type sensitive access to the manifest
            entry's associated content.

            - XHTML, HTML, and variant content is parsed as necessary to
              convert and return as an lxml.etree element in the XHTML
              namespace.
            - XML content is parsed and returned as an lxml.etree element.
            - CSS and CSS-variant content is parsed and returned as a css_parser
              CSS DOM stylesheet.
            - All other content is returned as a :class:`str` or :class:`bytes`
              object with no special parsing.

            The parsed result is cached, so later accesses return the same
            object.  Returns `None` when there is neither cached data nor a
            loader.
            """
            data = self._data
            if data is None:
                if self._loader is None:
                    return None
                data = self._loader(getattr(self, 'html_input_href',
                    self.href))
            try:
                mt = self.media_type.lower()
            except Exception:
                # media_type may be None or otherwise not a string.
                mt = 'application/octet-stream'
            if not isinstance(data, string_or_bytes):
                pass  # already parsed
            elif mt in OEB_DOCS:
                data = self._parse_xhtml(data)
            elif mt[-4:] in ('+xml', '/xml'):
                data = self._parse_xml(data)
            elif mt in OEB_STYLES:
                data = self._parse_css(data)
            elif mt == 'text/plain':
                self.oeb.log.warn('%s contains data in TXT format'%self.href,
                        'converting to HTML')
                data = self._parse_txt(data)
                # The item now holds a parsed document, so relabel it.
                self.media_type = XHTML_MIME
            self._data = data
            return data

        @data.setter
        def data(self, value):
            self._data = value

        @data.deleter
        def data(self):
            self._data = None

        def reparse_css(self):
            """Serialize the current content to text and parse it as CSS."""
            self._data = self._parse_css(str(self))

        def unload_data_from_memory(self, memory=None):
            """Drop raw ``bytes`` content from memory, to be reloaded on demand.

            Does nothing unless the cached data is a :class:`bytes` object.
            With *memory* left as `None` the bytes are written to a
            persistent temporary file (registered in ``oeb._temp_files``)
            that the replacement loader reads back and then deletes.
            Otherwise *memory* is treated as the path of a file to read the
            content from.
            """
            if isinstance(self._data, bytes):
                if memory is None:
                    from calibre.ptempfile import PersistentTemporaryFile
                    pt = PersistentTemporaryFile(suffix='_oeb_base_mem_unloader.img')
                    with pt:
                        pt.write(self._data)
                    self.oeb._temp_files.append(pt.name)

                    def loader(*args):
                        with open(pt.name, 'rb') as f:
                            ans = f.read()
                        # The temporary file is single-use.
                        os.remove(pt.name)
                        return ans
                    self._loader = loader
                else:
                    def loader2(*args):
                        with open(memory, 'rb') as f:
                            ans = f.read()
                        return ans
                    self._loader = loader2
                self._data = None

        @property
        def unicode_representation(self):
            """The item's content rendered as text."""
            data = self.data
            if isinstance(data, etree._Element):
                return xml2text(data, pretty_print=self.oeb.pretty_print)
            if isinstance(data, str):
                return data
            if hasattr(data, 'cssText'):
                return css_text(data)
            return str(data)

        @property
        def bytes_representation(self):
            """The item's content as produced by ``serialize``."""
            return serialize(self.data, self.media_type, pretty_print=self.oeb.pretty_print)

        def __str__(self):
            return self.unicode_representation

        # Items compare and hash by identity.
        def __eq__(self, other):
            return self is other

        def __ne__(self, other):
            return self is not other

        def __hash__(self):
            return id(self)

        @property
        def sort_key(self):
            """Tuple ordering items by spine position, media type, href, id.

            Items whose spine position is not a number sort after all spine
            items.
            """
            href = self.href
            if isinstance(href, bytes):
                href = force_unicode(href)
            sp = self.spine_position if isinstance(self.spine_position, numbers.Number) else sys.maxsize
            return sp, (self.media_type or '').lower(), numeric_sort_key(href), self.id

        def relhref(self, href):
            """Convert the URL provided in :param:`href` from a book-absolute
            reference to a reference relative to this manifest item.
            """
            return rel_href(self.href, href)

        def abshref(self, href):
            """Convert the URL provided in :param:`href` from a reference
            relative to this manifest item to a book-absolute reference.
            """
            try:
                purl = urlparse(href)
            except ValueError:
                return href
            scheme = purl.scheme
            # URLs with any scheme other than file: are returned untouched.
            if scheme and scheme != 'file':
                return href
            purl = list(purl)
            purl[0] = ''
            href = urlunparse(purl)
            path, frag = urldefrag(href)
            if not path:
                # Fragment-only (or empty) reference: it points at this item.
                if frag:
                    return '#'.join((self.href, frag))
                else:
                    return self.href
            if '/' not in self.href:
                return href
            dirname = os.path.dirname(self.href)
            href = os.path.join(dirname, href)
            href = os.path.normpath(href).replace('\\', '/')
            return href

    def __init__(self, oeb):
        """Create an empty manifest for the book *oeb*."""
        self.oeb = oeb
        self.items = set()
        self.ids = {}
        self.hrefs = {}

    def add(self, id, href, media_type, fallback=None, loader=None, data=None):
        """Add a new item to the book manifest.

        The item's :param:`id`, :param:`href`, and :param:`media_type` are all
        required.  A :param:`fallback` item-id is required for any items with a
        MIME type which is not one of the OPS core media types.  Either the
        item's data itself may be provided with :param:`data`, or a loader
        function for the data may be provided with :param:`loader`, or the
        item's data may later be set manually via the :attr:`data` attribute.
        """
        item = self.Item(
            self.oeb, id, href, media_type, fallback, loader, data)
        self.items.add(item)
        self.ids[item.id] = item
        self.hrefs[item.href] = item
        return item

    def remove(self, item):
        """Removes :param:`item` from the manifest.

        :param:`item` may be an item or the id of one.  The item is also
        removed from the book's spine if it is part of it.
        """
        if item in self.ids:
            item = self.ids[item]
        del self.ids[item.id]
        if item.href in self.hrefs:
            del self.hrefs[item.href]
        self.items.remove(item)
        if item in self.oeb.spine:
            self.oeb.spine.remove(item)

    def remove_duplicate_item(self, item):
        """Remove :param:`item` (or the item with that id) from :attr:`ids`
        and :attr:`items`, leaving :attr:`hrefs` and the spine untouched.
        """
        if item in self.ids:
            item = self.ids[item]
        del self.ids[item.id]
        self.items.remove(item)

    def generate(self, id=None, href=None):
        """Generate a new unique identifier and/or internal path for use in
        creating a new manifest item, using the provided :param:`id` and/or
        :param:`href` as bases.

        Returns a two-tuple of the new id and path.  If either :param:`id` or
        :param:`href` are `None` then the corresponding item in the return
        tuple will also be `None`.
        """
        if id is not None:
            base = id
            index = 1
            while id in self.ids:
                id = base + str(index)
                index += 1
        if href is not None:
            href = urlnormalize(href)
            base, ext = os.path.splitext(href)
            index = 1
            # Paths are compared case-insensitively.
            lhrefs = {x.lower() for x in self.hrefs}
            while href.lower() in lhrefs:
                href = base + str(index) + ext
                index += 1
            # Only stringify a real path: str(None) would hand the caller
            # the literal text 'None' instead of the documented None.
            href = str(href)
        return id, href

    def __iter__(self):
        yield from self.items

    def __len__(self):
        return len(self.items)

    def values(self):
        """Return the manifest items as a new list."""
        return list(self.items)

    def __contains__(self, item):
        return item in self.items

    def to_opf1(self, parent=None):
        """Serialize the manifest as an OPF 1 ``manifest`` element.

        Media types in ``OEB_DOCS`` / ``OEB_STYLES`` are written as
        ``OEB_DOC_MIME`` / ``OEB_CSS_MIME``.
        """
        elem = element(parent, 'manifest')
        for item in self.items:
            media_type = item.media_type
            if media_type in OEB_DOCS:
                media_type = OEB_DOC_MIME
            elif media_type in OEB_STYLES:
                media_type = OEB_CSS_MIME
            attrib = {'id': item.id, 'href': urlunquote(item.href),
                      'media-type': media_type}
            if item.fallback:
                attrib['fallback'] = item.fallback
            element(elem, 'item', attrib=attrib)
        return elem

    def to_opf2(self, parent=None):
        """Serialize the manifest as an OPF 2 ``manifest`` element.

        Items are written in :attr:`Item.sort_key` order; media types in
        ``OEB_DOCS`` / ``OEB_STYLES`` are written as ``XHTML_MIME`` /
        ``CSS_MIME``.
        """
        elem = element(parent, OPF('manifest'))
        for item in sorted(self.items, key=attrgetter('sort_key')):
            media_type = item.media_type
            if media_type in OEB_DOCS:
                media_type = XHTML_MIME
            elif media_type in OEB_STYLES:
                media_type = CSS_MIME
            attrib = {'id': item.id, 'href': urlunquote(item.href),
                      'media-type': media_type}
            if item.fallback:
                attrib['fallback'] = item.fallback
            element(elem, OPF('item'), attrib=attrib)
        return elem

    @property
    def main_stylesheet(self):
        """The explicitly assigned main stylesheet item, if any; otherwise
        the first item found whose media type is in ``OEB_STYLES``, or
        `None` when there is none.
        """
        ans = getattr(self, '_main_stylesheet', None)
        if ans is None:
            for item in self:
                # media_type may be None; treat that as "not a stylesheet"
                # (same guard as Item.sort_key) instead of raising.
                if (item.media_type or '').lower() in OEB_STYLES:
                    ans = item
                    break
        return ans

    @main_stylesheet.setter
    def main_stylesheet(self, item):
        self._main_stylesheet = item
1272
1273
class Spine:
    """Collection of manifest items composing an OEB data model book's main
    textual content.

    The spine manages which manifest items compose the book's main textual
    content and the sequence in which they appear.  Provides Python container
    access as a list-like object.
    """

    def __init__(self, oeb):
        """Create an empty spine for the book *oeb*."""
        self.oeb = oeb
        self.items = []
        self.page_progression_direction = None

    def _linear(self, linear):
        """Normalize a ``linear`` value.

        `None`, ``'yes'`` and ``'true'`` (case-insensitive) become `True`;
        ``'no'`` and ``'false'`` become `False`; any other value is
        returned as-is (lower-cased if it is a string).
        """
        if isinstance(linear, string_or_bytes):
            linear = linear.lower()
        if linear is None or linear in ('yes', 'true'):
            linear = True
        elif linear in ('no', 'false'):
            linear = False
        return linear

    def add(self, item, linear=None):
        """Append :param:`item` to the end of the `Spine`."""
        item.linear = self._linear(linear)
        item.spine_position = len(self.items)
        self.items.append(item)
        return item

    def insert(self, index, item, linear):
        """Insert :param:`item` at position :param:`index` in the `Spine`.

        :param:`index` follows :meth:`list.insert` semantics, so values past
        either end of the spine are clamped.  Every item's
        :attr:`spine_position` is refreshed afterwards and :param:`item` is
        returned.
        """
        item.linear = self._linear(linear)
        self.items.insert(index, item)
        # Renumber from the real list rather than from the requested index:
        # list.insert clamps out-of-range and negative indices, so the
        # requested index may not be where the item actually landed.
        for position, entry in enumerate(self.items):
            entry.spine_position = position
        return item

    def remove(self, item):
        """Remove :param:`item` from the `Spine`.

        The item is located through its :attr:`spine_position`, which is
        reset to `None`; following items are renumbered.
        """
        index = item.spine_position
        self.items.pop(index)
        for i in range(index, len(self.items)):
            self.items[i].spine_position = i
        item.spine_position = None

    def index(self, item):
        """Return the position of :param:`item` in the spine, or -1 if it is
        not part of it.
        """
        for i, x in enumerate(self):
            if item == x:
                return i
        return -1

    def __iter__(self):
        yield from self.items

    def __getitem__(self, index):
        return self.items[index]

    def __len__(self):
        return len(self.items)

    def __contains__(self, item):
        return (item in self.items)

    def to_opf1(self, parent=None):
        """Serialize as an OPF 1 ``spine`` element; only items whose
        :attr:`linear` is true get an ``itemref``.
        """
        elem = element(parent, 'spine')
        for item in self.items:
            if item.linear:
                element(elem, 'itemref', attrib={'idref': item.id})
        return elem

    def to_opf2(self, parent=None):
        """Serialize as an OPF 2 ``spine`` element; non-linear items are
        written with ``linear="no"``.
        """
        elem = element(parent, OPF('spine'))
        for item in self.items:
            attrib = {'idref': item.id}
            if not item.linear:
                attrib['linear'] = 'no'
            element(elem, OPF('itemref'), attrib=attrib)
        return elem
1354
1355
class Guide:
    """Collection of references to standard frequently-occurring sections
    within an OEB data model book.

    Provides dictionary-like access, in which the keys are the OEB reference
    type identifiers and the values are `Reference` objects.
    """

    class Reference:
        """Reference to a standard book section.

        Provides the following instance data members:

        :attr:`type`: Reference type identifier, as chosen from the list
            allowed in the OPF 2.0 specification.
        :attr:`title`: Human-readable section title.
        :attr:`href`: Book-internal URL of the referenced section.  May include
            a fragment identifier.
        """
        _TYPES_TITLES = [('cover', __('Cover')),
                         ('title-page', __('Title page')),
                         ('toc', __('Table of Contents')),
                         ('index', __('Index')),
                         ('glossary', __('Glossary')),
                         ('acknowledgements', __('Acknowledgements')),
                         ('bibliography', __('Bibliography')),
                         ('colophon', __('Colophon')),
                         ('copyright-page', __('Copyright')),
                         ('dedication', __('Dedication')),
                         ('epigraph', __('Epigraph')),
                         ('foreword', __('Foreword')),
                         ('loi', __('List of illustrations')),
                         ('lot', __('List of tables')),
                         ('notes', __('Notes')),
                         ('preface', __('Preface')),
                         ('text', __('Main text'))]
        # Default title for each known reference type.
        TITLES = dict(_TYPES_TITLES)
        # The set of known reference types.
        TYPES = frozenset(TITLES)
        # Position of each known type in the table above.
        ORDER = {t: i for i, (t, _) in enumerate(_TYPES_TITLES)}

        def __init__(self, oeb, type, title, href):
            """Create a reference of the given *type*.

            A type matching a known one case-insensitively is stored in
            lower case; any other type is prefixed with ``other.`` unless it
            already starts with that.  A missing title falls back to the
            translated default title of a known type.
            """
            self.oeb = oeb
            lowered = type.lower()
            if lowered in self.TYPES:
                type = lowered
            elif not (type in self.TYPES or type.startswith('other.')):
                type = 'other.' + type
            if type in self.TITLES and not title:
                title = oeb.translate(self.TITLES[type])
            self.type = type
            self.title = title
            self.href = urlnormalize(href)

        def __repr__(self):
            details = (self.type, self.title, self.href)
            return 'Reference(type=%r, title=%r, href=%r)' % details

        @property
        def item(self):
            """The manifest item associated with this reference."""
            without_fragment = urldefrag(self.href)[0]
            return self.oeb.manifest.hrefs.get(without_fragment, None)

    def __init__(self, oeb):
        """Create an empty guide for the book *oeb*."""
        self.refs = {}
        self.oeb = oeb

    @staticmethod
    def _reference_attrib(ref):
        # Attributes shared by the OPF 1 and OPF 2 <reference> elements.
        attrib = {'type': ref.type, 'href': urlunquote(ref.href)}
        if ref.title:
            attrib['title'] = ref.title
        return attrib

    def add(self, type, title, href):
        """Add a new reference to the `Guide`.

        The reference is stored under *type* exactly as passed in,
        replacing any reference already stored under that key.
        """
        reference = self.Reference(
            self.oeb, type, title, str(href) if href else href)
        self.refs[type] = reference
        return reference

    def remove(self, type):
        """Remove and return the reference stored under *type*; returns
        `None` when there is none."""
        return self.refs.pop(type, None)

    def remove_by_href(self, href):
        """Remove every reference whose href equals *href*."""
        # Collect the keys first; the mapping cannot shrink while iterated.
        doomed = [key for key, ref in iteritems(self.refs) if ref.href == href]
        for key in doomed:
            self.remove(key)

    def iterkeys(self):
        """Lazily yield the keys under which references are stored."""
        for key in self.refs:
            yield key
    __iter__ = iterkeys

    def values(self):
        """Return the references as a list ordered by the position of their
        type in the table of known types; unknown types sort last."""
        def rank(ref):
            return ref.ORDER.get(ref.type, 10000)
        return sorted(itervalues(self.refs), key=rank)

    def items(self):
        """Lazily yield ``(key, reference)`` pairs."""
        for pair in self.refs.items():
            yield pair

    def __getitem__(self, key):
        return self.refs[key]

    def get(self, key):
        """Return the reference stored under *key*, or `None`."""
        return self.refs.get(key)

    def __delitem__(self, key):
        del self.refs[key]

    def __contains__(self, key):
        return key in self.refs

    def __len__(self):
        return len(self.refs)

    def to_opf1(self, parent=None):
        """Serialize the guide as an OPF 1 ``guide`` element."""
        guide = element(parent, 'guide')
        for ref in self.refs.values():
            element(guide, 'reference', attrib=self._reference_attrib(ref))
        return guide

    def to_opf2(self, parent=None):
        """Serialize the guide as an OPF 2 ``guide`` element.

        Returns `None`, creating nothing, when the guide is empty.
        """
        if len(self) == 0:
            return
        guide = element(parent, OPF('guide'))
        for ref in self.refs.values():
            element(guide, OPF('reference'), attrib=self._reference_attrib(ref))
        return guide
1484
1485
class TOC:
    """Represents a hierarchical table of contents or navigation tree for
    accessing arbitrary semantic sections within an OEB data model book.

    Acts as a node within the navigation tree.  Provides list-like access to
    sub-nodes.  Provides the following node instance data attributes:

    :attr:`title`: The title of this navigation node.
    :attr:`href`: Book-internal URL referenced by this node.
    :attr:`klass`: Optional semantic class referenced by this node.
    :attr:`id`: Optional unique identifier for this node.
    :attr:`author`: Optional author attribution for periodicals <mbp:>
    :attr:`description`: Optional description attribute for periodicals <mbp:>
    :attr:`toc_thumbnail`: Optional toc thumbnail image
    """

    def __init__(self, title=None, href=None, klass=None, id=None,
            play_order=None, author=None, description=None, toc_thumbnail=None):
        """Create a node without children.

        A true *href* is normalized with ``urlnormalize``.  When
        *play_order* is `None` the node is given the next free play order.
        """
        self.title = title
        self.href = urlnormalize(href) if href else href
        self.klass = klass
        self.id = id
        self.nodes = []
        # next_play_order() reads self.play_order, so seed it first.
        self.play_order = 0
        if play_order is None:
            play_order = self.next_play_order()
        self.play_order = play_order
        self.author = author
        self.description = description
        self.toc_thumbnail = toc_thumbnail

    def add(self, title, href, klass=None, id=None, play_order=0, author=None, description=None, toc_thumbnail=None):
        """Create and return a new sub-node of this node."""
        child = TOC(title, href, klass, id, play_order, author, description, toc_thumbnail)
        self.nodes.append(child)
        return child

    def remove(self, node):
        """Detach *node* from the tree below this node.

        Returns `True` if the node was found and removed, `False` otherwise.
        """
        for child in self.nodes:
            if child is node:
                self.nodes.remove(child)
                return True
            if child.remove(node):
                return True
        return False

    def iter(self):
        """Iterate over this node and all descendants in depth-first order."""
        yield self
        for child in self.nodes:
            for descendant in child.iter():
                yield descendant

    def count(self):
        """Return the number of descendants of this node."""
        return sum(1 for _ in self.iter()) - 1

    def next_play_order(self):
        """Return one more than the largest play order in this subtree."""
        highest = max((node.play_order for node in self.iter()), default=0)
        return highest + 1

    def has_href(self, href):
        """Report whether this node or any descendant has exactly *href*."""
        return any(node.href == href for node in self.iter())

    def has_text(self, text):
        """Report whether this node or any descendant has a title equal to
        *text*, ignoring case."""
        return any(
            node.title and node.title.lower() == text.lower()
            for node in self.iter()
        )

    def iterdescendants(self, breadth_first=False):
        """Iterate over all descendant nodes.

        By default nodes come in depth-first order.  With *breadth_first*
        all immediate children are yielded first, then the descendants of
        each child in turn, produced the same way.
        """
        if not breadth_first:
            for child in self.nodes:
                yield from child.iter()
            return
        yield from self.nodes
        for child in self.nodes:
            yield from child.iterdescendants(breadth_first=True)

    def __iter__(self):
        """Iterate over all immediate child nodes."""
        for child in self.nodes:
            yield child

    def __getitem__(self, index):
        return self.nodes[index]

    def autolayer(self):
        """Make sequences of children pointing to the same content file into
        children of the first node referencing that file.
        """
        anchor = None
        # Iterate over a copy: children are moved out of self.nodes below.
        for child in tuple(self.nodes):
            if anchor and urldefrag(anchor.href)[0] == urldefrag(child.href)[0]:
                self.nodes.remove(child)
                anchor.nodes.append(child)
                continue
            anchor = child

    def depth(self):
        """The maximum depth of the navigation tree rooted at this node."""
        return max((child.depth() for child in self.nodes), default=0) + 1

    def get_lines(self, lvl=0):
        """Return one text line per node of this subtree, indented with
        *lvl* tabs plus one more per nesting level."""
        lines = [('\t'*lvl) + 'TOC: %s --> %s'%(self.title, self.href)]
        for child in self:
            lines += child.get_lines(lvl+1)
        return lines

    def __str__(self):
        return '\n'.join(self.get_lines())

    def to_opf1(self, tour):
        """Add a ``site`` element to *tour* for every descendant, in
        depth-first order, and return *tour*."""
        for child in self.nodes:
            site_attrib = {
                'title': child.title, 'href': urlunquote(child.href)}
            element(tour, 'site', attrib=site_attrib)
            child.to_opf1(tour)
        return tour

    def to_ncx(self, parent=None):
        """Serialize the descendants of this node as nested NCX
        ``navPoint`` elements under *parent*.

        A new ``navMap`` element is created when *parent* is `None`.
        Returns *parent*.
        """
        if parent is None:
            parent = etree.Element(NCX('navMap'))
        for child in self.nodes:
            point_id = child.id or uuid_id()
            # A play order of 0 is written out as 1.
            order = 1 if child.play_order == 0 else child.play_order
            attrib = {'id': point_id, 'playOrder': str(order)}
            if child.klass:
                attrib['class'] = child.klass
            point = element(parent, NCX('navPoint'), attrib=attrib)
            label = etree.SubElement(point, NCX('navLabel'))
            text = child.title
            if text:
                # Collapse each run of whitespace into a single space.
                text = re.sub(r'\s+', ' ', text)
            element(label, NCX('text')).text = text
            # Do not unescape this URL as ADE requires it to be escaped to
            # handle semi colons and other special characters in the file names
            element(point, NCX('content'), src=child.href)
            child.to_ncx(point)
        return parent

    def rationalize_play_orders(self):
        '''
        Ensure that all nodes with the same play_order have the same href and
        with different play_orders have different hrefs.
        '''
        def earlier_with_play_order(target):
            # First node before `target` (tree order) sharing its play order.
            for candidate in self.iter():
                if candidate is target:
                    return None
                if candidate.play_order == target.play_order:
                    return candidate

        def earlier_with_href(target):
            # First node before `target` (tree order) sharing its href.
            for candidate in self.iter():
                if candidate is target:
                    return None
                if candidate.href == target.href:
                    return candidate

        for node in self.iter():
            clash = earlier_with_play_order(node)
            if clash is not None and node.href != clash.href:
                # Same play order but a different target: take the play
                # order of an earlier node with this href, else a new one.
                node.play_order = getattr(
                    earlier_with_href(node), 'play_order',
                    self.next_play_order())
            twin = earlier_with_href(node)
            if twin is not None:
                node.play_order = twin.play_order
1663
1664
class PageList:
    """Ordered collection of the book's "pages": named markers mapping a page
    label onto a position within the OEB book's textual content.

    Supports list-like access (length, iteration, indexing, ``pop`` and
    ``remove``) over the contained :class:`PageList.Page` objects.
    """

    class Page:
        """A single mapping from a page name to a position in the book content.

        Instance data attributes:

        :attr:`name`: The name of this page.  Generally a number.
        :attr:`href`: Book-internal URL at which point this page begins.
        :attr:`type`: One of 'front' (prefatory pages, commonly labeled in
            print with small-case Roman numerals), 'normal' (standard pages,
            commonly labeled in print with Arabic numerals), or 'special'
            (other pages, commonly unlabeled in print, such as the cover and
            title pages).  Any other value is stored as 'normal'.
        :attr:`klass`: Optional semantic class of this page.
        :attr:`id`: Optional unique identifier for this page.
        """
        TYPES = {'front', 'normal', 'special'}

        def __init__(self, name, href, type='normal', klass=None, id=None):
            self.name = str(name)
            self.href = urlnormalize(href)
            if type in self.TYPES:
                self.type = type
            else:
                # Unrecognised page types silently fall back to 'normal'.
                self.type = 'normal'
            self.id, self.klass = id, klass

    def __init__(self):
        self.pages = []

    def add(self, name, href, type='normal', klass=None, id=None):
        """Build a new page from the arguments, append it to this
        `PageList` and return it."""
        new_page = self.Page(name, href, type, klass, id)
        self.pages.append(new_page)
        return new_page

    def __len__(self):
        return len(self.pages)

    def __iter__(self):
        for page in self.pages:
            yield page

    def __getitem__(self, index):
        return self.pages[index]

    def pop(self, index=-1):
        """Remove and return the page at `index` (the last page by default)."""
        return self.pages.pop(index)

    def remove(self, page):
        """Remove the first occurrence of `page` from the list."""
        return self.pages.remove(page)

    def to_ncx(self, parent=None):
        """Serialize the pages as an NCX ``pageList`` element created under
        `parent`, and return that element."""
        page_list = element(parent, NCX('pageList'), id=uuid_id())
        # One running counter per page type; numbering starts at 1 for each.
        counters = dict.fromkeys(('front', 'normal', 'special'), 0)
        for page in self.pages:
            target_id = page.id or uuid_id()
            kind = page.type
            counters[kind] += 1
            attrib = {
                'id': target_id,
                'value': str(counters[kind]),
                'type': kind,
                'playOrder': '0',
            }
            if page.klass:
                attrib['class'] = page.klass
            target = element(page_list, NCX('pageTarget'), attrib=attrib)
            nav_label = element(target, NCX('navLabel'))
            element(nav_label, NCX('text')).text = page.name
            element(target, NCX('content'), src=page.href)
        return page_list

    def to_page_map(self):
        """Serialize the pages as a ``page-map`` element with one ``page``
        child (name and href) per page, and return it."""
        page_map = etree.Element(OPF('page-map'), nsmap={None: OPF2_NS})
        for page in self.pages:
            element(page_map, OPF('page'), name=page.name, href=page.href)
        return page_map
1742
1743
class OEBBook:
    """Representation of a book in the IDPF OEB data model."""

    COVER_SVG_XP    = XPath('h:body//svg:svg[position() = 1]')
    COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')

    def __init__(self, logger,
            html_preprocessor,
            css_preprocessor=CSSPreProcessor(),
            encoding='utf-8', pretty_print=False,
            input_encoding='utf-8'):
        """Create empty book.  Arguments:

        :param:`logger`: A Log object to use for logging all messages
            related to the processing of this book.  It is accessible
            via the instance data members :attr:`logger,log`.
        :param html_preprocessor: A callable that takes a unicode object
            and returns a unicode object. Will be called on all html files
            before they are parsed.
        :param css_preprocessor: A callable that takes a unicode object
            and returns a unicode object. Will be called on all CSS files
            before they are parsed.
        :param:`encoding`: Default encoding for textual content read
            from an external container.
        :param:`pretty_print`: Whether or not the canonical string form
            of XML markup is pretty-printed.
        :param:`input_encoding`: Encoding that :meth:`decode` tries (with
            undecodable bytes replaced) for byte strings that were not
            successfully decoded via a byte-order mark.

        It provides the following public instance data members for
        accessing various parts of the OEB data model:

        :attr:`metadata`: Metadata such as title, author name(s), etc.
        :attr:`manifest`: Manifest of all files included in the book,
            including MIME types and fallback information.
        :attr:`spine`: In-order list of manifest items which compose
            the textual content of the book.
        :attr:`guide`: Collection of references to standard positions
            within the text, such as the cover, preface, etc.
        :attr:`toc`: Hierarchical table of contents.
        :attr:`pages`: List of "pages," such as indexed to a print edition of
            the same text.
        """
        # Module-level side effect: the shared CSS log handler is pointed at
        # this book's logger.
        _css_log_handler.log = logger
        self.encoding = encoding
        self.input_encoding = input_encoding
        self.html_preprocessor = html_preprocessor
        self.css_preprocessor = css_preprocessor
        self.pretty_print = pretty_print
        self.logger = self.log = logger
        self.version = '2.0'
        self.container = NullContainer(self.log)
        self.metadata = Metadata(self)
        self.uid = None
        self.manifest = Manifest(self)
        self.spine = Spine(self)
        self.guide = Guide(self)
        self.toc = TOC()
        self.pages = PageList()
        self.auto_generated_toc = True
        self._temp_files = []

    def clean_temp_files(self):
        """Delete every path recorded in ``self._temp_files``, ignoring
        failures for individual files."""
        for path in self._temp_files:
            try:
                os.remove(path)
            except Exception:
                # Best-effort cleanup: a file that is already gone or cannot
                # be removed must not abort the rest of the cleanup.
                pass

    @classmethod
    def generate(cls, opts):
        """Generate an OEBBook instance from command-line options."""
        encoding = opts.encoding
        pretty_print = opts.pretty_print
        # NOTE(review): __init__ above requires `logger` and
        # `html_preprocessor`, which are not passed here, so unless a subclass
        # overrides __init__ this call raises TypeError. Left unchanged because
        # the correct values cannot be determined from this class alone.
        return cls(encoding=encoding, pretty_print=pretty_print)

    def translate(self, text):
        """Translate :param:`text` into the book's primary language."""
        lang = str(self.metadata.language[0])
        # Only the primary subtag is used, e.g. 'en-US' -> 'en'.
        lang = lang.split('-', 1)[0].lower()
        return translate(lang, text)

    def decode(self, data):
        """Automatically decode :param:`data` into a `unicode` object.

        Text input is returned with line endings normalized to ``\\n``.
        Byte input is decoded by trying, in order: the encoding indicated by
        a leading UTF-32/UTF-16/UTF-8 byte-order mark (which is stripped),
        :attr:`input_encoding` with undecodable bytes replaced, strict UTF-8,
        and finally encoding detection via ``xml_to_unicode``.  Line endings
        are normalized in every case.
        """
        def fix_data(d):
            return d.replace('\r\n', '\n').replace('\r', '\n')
        if isinstance(data, str):
            return fix_data(data)
        bom_enc = None
        # The UTF-32 BOMs must be tested before the UTF-16 ones because the
        # UTF-32-LE BOM starts with the UTF-16-LE BOM.
        if data[:4] in (b'\0\0\xfe\xff', b'\xff\xfe\0\0'):
            bom_enc = {b'\0\0\xfe\xff':'utf-32-be',
                    b'\xff\xfe\0\0':'utf-32-le'}[data[:4]]
            data = data[4:]
        elif data[:2] in (b'\xff\xfe', b'\xfe\xff'):
            # Both keys must be bytes: a str key here made the lookup for the
            # big-endian BOM fail with KeyError.
            bom_enc = {b'\xff\xfe':'utf-16-le', b'\xfe\xff':'utf-16-be'}[data[:2]]
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            bom_enc = 'utf-8'
            data = data[3:]
        if bom_enc is not None:
            try:
                return fix_data(data.decode(bom_enc))
            except UnicodeDecodeError:
                pass
        if self.input_encoding:
            try:
                return fix_data(data.decode(self.input_encoding, 'replace'))
            except UnicodeDecodeError:
                pass
        try:
            return fix_data(data.decode('utf-8'))
        except UnicodeDecodeError:
            pass
        data, _ = xml_to_unicode(data)
        return fix_data(data)

    def to_opf1(self):
        """Produce OPF 1.2 representing the book's metadata and structure.

        Returns a dictionary in which the keys are MIME types and the values
        are tuples of (default) filenames and lxml.etree element structures.
        """
        package = etree.Element('package',
            attrib={'unique-identifier': self.uid.id})
        self.metadata.to_opf1(package)
        self.manifest.to_opf1(package)
        self.spine.to_opf1(package)
        tours = element(package, 'tours')
        tour = element(tours, 'tour',
            attrib={'id': 'chaptertour', 'title': 'Chapter Tour'})
        self.toc.to_opf1(tour)
        self.guide.to_opf1(package)
        return {OPF_MIME: ('content.opf', package)}

    def _update_playorder(self, ncx):
        """Rewrite the ``playOrder`` attributes in :param:`ncx` so they follow
        the order in which the referenced targets occur in the spine.

        Elements whose target is not found in the spine get their position
        among the ``playOrder``-bearing elements instead.
        """
        hrefs = set(map(urlnormalize, xpath(ncx, '//ncx:content/@src')))
        playorder = {}
        next_order = 1
        selector = XPath('h:body//*[@id or @name]')
        for item in self.spine:
            base = item.href
            if base in hrefs:
                playorder[base] = next_order
                next_order += 1
            for elem in selector(item.data):
                added = False
                # An element referenced through both its id and its name
                # still consumes only a single play order value.
                for attr in ('id', 'name'):
                    anchor = elem.get(attr)
                    if not anchor:
                        continue
                    href = '#'.join([base, anchor])
                    if href in hrefs:
                        playorder[href] = next_order
                        added = True
                if added:
                    next_order += 1
        selector = XPath('ncx:content/@src')
        for i, elem in enumerate(xpath(ncx, '//*[@playOrder and ./ncx:content[@src]]')):
            href = urlnormalize(selector(elem)[0])
            order = playorder.get(href, i)
            elem.attrib['playOrder'] = str(order)

    def _to_ncx(self):
        """Build and return the NCX element tree (head metadata, document
        title, navMap and, when pages exist, the pageList) for this book."""
        lang = str(self.metadata.language[0])
        lang = lang.replace('_', '-')
        ncx = etree.Element(NCX('ncx'),
            attrib={'version': '2005-1', XML('lang'): lang},
            nsmap={None: NCX_NS})
        head = etree.SubElement(ncx, NCX('head'))
        etree.SubElement(head, NCX('meta'),
            name='dtb:uid', content=str(self.uid))
        etree.SubElement(head, NCX('meta'),
            name='dtb:depth', content=str(self.toc.depth()))
        generator = ''.join(['calibre (', __version__, ')'])
        etree.SubElement(head, NCX('meta'),
            name='dtb:generator', content=generator)
        etree.SubElement(head, NCX('meta'),
            name='dtb:totalPageCount', content=str(len(self.pages)))
        # Placeholder value; replaced below once the page list is serialized.
        maxpnum = etree.SubElement(head, NCX('meta'),
            name='dtb:maxPageNumber', content='0')
        title = etree.SubElement(ncx, NCX('docTitle'))
        text = etree.SubElement(title, NCX('text'))
        text.text = str(self.metadata.title[0])
        navmap = etree.SubElement(ncx, NCX('navMap'))
        self.toc.to_ncx(navmap)
        if len(self.pages) > 0:
            plist = self.pages.to_ncx(ncx)
            value = max(int(x) for x in xpath(plist, '//@value'))
            maxpnum.attrib['content'] = str(value)
        self._update_playorder(ncx)
        return ncx

    def to_opf2(self, page_map=False):
        """Produce OPF 2.0 representing the book's metadata and structure.

        :param page_map: When true and the book has pages, a page-map
            document is also produced and referenced from the spine.

        Returns a dictionary in which the keys are MIME types and the values
        are tuples of (default) filenames and lxml.etree element structures.
        """
        results = {}
        package = etree.Element(OPF('package'),
            attrib={'version': '2.0', 'unique-identifier': self.uid.id},
            nsmap={None: OPF2_NS})
        self.metadata.to_opf2(package)
        manifest = self.manifest.to_opf2(package)
        spine = self.spine.to_opf2(package)
        self.guide.to_opf2(package)
        results[OPF_MIME] = ('content.opf', package)
        ncx_id, ncx_href = self.manifest.generate('ncx', 'toc.ncx')
        etree.SubElement(manifest, OPF('item'), id=ncx_id, href=ncx_href,
                         attrib={'media-type': NCX_MIME})
        spine.attrib['toc'] = ncx_id
        results[NCX_MIME] = (ncx_href, self._to_ncx())
        if page_map and len(self.pages) > 0:
            pmap_id, pmap_href = self.manifest.generate('page-map', 'page-map.xml')
            etree.SubElement(manifest, OPF('item'), id=pmap_id, href=pmap_href,
                             attrib={'media-type': PAGE_MAP_MIME})
            spine.attrib['page-map'] = pmap_id
            results[PAGE_MAP_MIME] = (pmap_href, self.pages.to_page_map())
        if self.spine.page_progression_direction in {'ltr', 'rtl'}:
            spine.attrib['page-progression-direction'] = self.spine.page_progression_direction
        return results
1964
1965
def rel_href(base_href, href):
    """Return the URL in :param:`href` rewritten so that it is relative to
    the directory of the URL in :param:`base_href`.

    :param:`href` is returned unchanged when it has a URL scheme or when
    :param:`base_href` does not lie inside any directory.
    """
    if urlparse(href).scheme or '/' not in base_href:
        return href
    base_dir = os.path.dirname(os.path.normpath(base_href)).replace(os.sep, '/')
    base = [part for part in base_dir.split('/') if part and part != '.']
    # Collapse "<dir>/.." pairs; a leading '..' has nothing to cancel against
    # and ends the collapsing.
    while '..' in base:
        pos = base.index('..')
        if pos <= 0:
            break
        del base[pos - 1:pos + 1]
    if not base:
        return href
    target, frag = urldefrag(href)
    target = target.split('/')
    # Number of leading path components shared by the base directory and
    # the target.
    shared = 0
    for ours, theirs in zip(base, target):
        if ours != theirs:
            break
        shared += 1
    # Climb out of the unshared part of the base, then descend into the
    # unshared part of the target.
    parts = ['..'] * (len(base) - shared) + target[shared:]
    relhref = '/'.join(parts)
    return '#'.join((relhref, frag)) if frag else relhref
1998