base.py - OpenGrok cross reference for /dports/deskutils/calibre/calibre-src-5.34.0/src/calibre/ebooks/oeb/base.py

'''
Basic support for manipulating OEB 1.x/2.0 content and metadata.
'''

__license__   = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
__docformat__ = 'restructuredtext en'

import os, re, logging, sys, numbers
from collections import defaultdict
from itertools import count
from operator import attrgetter

from lxml import etree, html
from calibre import force_unicode
from calibre.constants import filesystem_encoding, __version__
from calibre.translations.dynamic import translate
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.conversion.preprocess import CSSPreProcessor
from calibre import (isbytestring, as_unicode, get_types_map)
from calibre.ebooks.oeb.parse_utils import barename, XHTML_NS, namespace, XHTML, parse_html, NotHTML
from calibre.utils.cleantext import clean_xml_chars
from calibre.utils.short_uuid import uuid4
from polyglot.builtins import iteritems, string_or_bytes, itervalues, codepoint_to_chr
from polyglot.urllib import unquote as urlunquote, urldefrag, urljoin, urlparse, urlunparse
from calibre.utils.icu import numeric_sort_key

XML_NS       = 'http://www.w3.org/XML/1998/namespace'
OEB_DOC_NS   = 'http://openebook.org/namespaces/oeb-document/1.0/'
OPF1_NS      = 'http://openebook.org/namespaces/oeb-package/1.0/'
OPF2_NS      = 'http://www.idpf.org/2007/opf'
OPF_NSES     = {OPF1_NS, OPF2_NS}
DC09_NS      = 'http://purl.org/metadata/dublin_core'
DC10_NS      = 'http://purl.org/dc/elements/1.0/'
DC11_NS      = 'http://purl.org/dc/elements/1.1/'
DC_NSES      = {DC09_NS, DC10_NS, DC11_NS}
XSI_NS       = 'http://www.w3.org/2001/XMLSchema-instance'
DCTERMS_NS   = 'http://purl.org/dc/terms/'
NCX_NS       = 'http://www.daisy.org/z3986/2005/ncx/'
SVG_NS       = 'http://www.w3.org/2000/svg'
XLINK_NS     = 'http://www.w3.org/1999/xlink'
CALIBRE_NS   = 'http://calibre.kovidgoyal.net/2009/metadata'
RE_NS        = 'http://exslt.org/regular-expressions'
MBP_NS       = 'http://www.mobipocket.com'
EPUB_NS      = 'http://www.idpf.org/2007/ops'
MATHML_NS    = 'http://www.w3.org/1998/Math/MathML'

XPNSMAP      = {
        'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS, 'd09': DC09_NS,
        'd10': DC10_NS, 'd11': DC11_NS, 'xsi': XSI_NS, 'dt': DCTERMS_NS,
        'ncx': NCX_NS, 'svg': SVG_NS, 'xl': XLINK_NS, 're': RE_NS,
        'mathml': MATHML_NS, 'mbp': MBP_NS, 'calibre': CALIBRE_NS,
        'epub':EPUB_NS
}

OPF1_NSMAP   = {'dc': DC11_NS, 'oebpackage': OPF1_NS}
OPF2_NSMAP   = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
                'xsi': XSI_NS, 'calibre': CALIBRE_NS}


def XML(name):
    return '{%s}%s' % (XML_NS, name)


def OPF(name):
    return '{%s}%s' % (OPF2_NS, name)


def DC(name):
    return '{%s}%s' % (DC11_NS, name)


def XSI(name):
    return '{%s}%s' % (XSI_NS, name)


def DCTERMS(name):
    return '{%s}%s' % (DCTERMS_NS, name)


def NCX(name):
    return '{%s}%s' % (NCX_NS, name)


def SVG(name):
    return '{%s}%s' % (SVG_NS, name)


def XLINK(name):
    return '{%s}%s' % (XLINK_NS, name)


def CALIBRE(name):
    return '{%s}%s' % (CALIBRE_NS, name)


_css_url_re = re.compile(r'url\s*\([\'"]{0,1}(.*?)[\'"]{0,1}\)', re.I)
_css_import_re = re.compile(r'@import "(.*?)"')
_archive_re = re.compile(r'[^ ]+')

# Tags that should not be self closed in epub output
self_closing_bad_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b',
'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details',
'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'figcaption', 'figure', 'footer',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'iframe', 'ins', 'kbd',
'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p',
'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small',
'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var',
'video', 'title', 'script', 'style'}


def css_text(x):
    ans = x.cssText
    if isinstance(ans, bytes):
        ans = ans.decode('utf-8', 'replace')
    return ans


def as_string_type(pat, for_unicode):
    if for_unicode:
        if isinstance(pat, bytes):
            pat = pat.decode('utf-8')
    else:
        if isinstance(pat, str):
            pat = pat.encode('utf-8')
    return pat


def self_closing_pat(for_unicode):
    attr = 'unicode_ans' if for_unicode else 'bytes_ans'
    ans = getattr(self_closing_pat, attr, None)
    if ans is None:
        sub = '|'.join(self_closing_bad_tags)
        template = r'<(?P<tag>%s)(?=[\s/])(?P<arg>[^>]*)/>'
        pat = template % sub
        pat = as_string_type(pat, for_unicode)
        ans = re.compile(pat, flags=re.IGNORECASE)
        setattr(self_closing_pat, attr, ans)
    return ans


def close_self_closing_tags(raw):
    for_unicode = isinstance(raw, str)
    repl = as_string_type(r'<\g<tag>\g<arg>></\g<tag>>', for_unicode)
    pat = self_closing_pat(for_unicode)
    return pat.sub(repl, raw)


def uuid_id():
    return 'u' + uuid4()


def itercsslinks(raw):
    for match in _css_url_re.finditer(raw):
        yield match.group(1), match.start(1)
    for match in _css_import_re.finditer(raw):
        yield match.group(1), match.start(1)


_link_attrs = set(html.defs.link_attrs) | {XLINK('href'), 'poster'}


def iterlinks(root, find_links_in_css=True):
    '''
    Iterate over all links in a OEB Document.

    :param root: A valid lxml.etree element.
    '''
    assert etree.iselement(root)

    for el in root.iter('*'):
        try:
            tag = barename(el.tag).lower()
        except Exception:
            continue
        attribs = el.attrib

        if tag == 'object':
            codebase = None
            # <object> tags have attributes that are relative to
            # codebase
            if 'codebase' in attribs:
                codebase = el.get('codebase')
                yield (el, 'codebase', codebase, 0)
            for attrib in 'classid', 'data':
                if attrib in attribs:
                    value = el.get(attrib)
                    if codebase is not None:
                        value = urljoin(codebase, value)
                    yield (el, attrib, value, 0)
            if 'archive' in attribs:
                for match in _archive_re.finditer(el.get('archive')):
                    value = match.group(0)
                    if codebase is not None:
                        value = urljoin(codebase, value)
                    yield (el, 'archive', value, match.start())
        else:
            for attr in attribs:
                if attr in _link_attrs:
                    yield (el, attr, attribs[attr], 0)

        if not find_links_in_css:
            continue
        if tag == 'style' and el.text:
            for match in _css_url_re.finditer(el.text):
                yield (el, None, match.group(1), match.start(1))
            for match in _css_import_re.finditer(el.text):
                yield (el, None, match.group(1), match.start(1))
        if 'style' in attribs:
            for match in _css_url_re.finditer(attribs['style']):
                yield (el, 'style', match.group(1), match.start(1))


def make_links_absolute(root, base_url):
    '''
    Make all links in the document absolute, given the
    ``base_url`` for the document (the full URL where the document
    came from)
    '''
    def link_repl(href):
        return urljoin(base_url, href)
    rewrite_links(root, link_repl)


def resolve_base_href(root):
    base_href = None
    basetags = root.xpath('//base[@href]|//h:base[@href]',
            namespaces=XPNSMAP)
    for b in basetags:
        base_href = b.get('href')
        b.drop_tree()
    if not base_href:
        return
    make_links_absolute(root, base_href, resolve_base_href=False)


def rewrite_links(root, link_repl_func, resolve_base_href=False):
    '''
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.
    '''
    from css_parser import replaceUrls, log, CSSParser
    log.setLevel(logging.WARN)
    log.raiseExceptions = False

    if resolve_base_href:
        resolve_base_href(root)
    for el, attrib, link, pos in iterlinks(root, find_links_in_css=False):
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        if new_link is None:
            # Remove the attribute or element content
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            continue
        if attrib is None:
            new = el.text[:pos] + new_link + el.text[pos+len(link):]
            el.text = new
        else:
            cur = el.attrib[attrib]
            if not pos and len(cur) == len(link):
                # Most common case
                el.attrib[attrib] = new_link
            else:
                new = cur[:pos] + new_link + cur[pos+len(link):]
                el.attrib[attrib] = new

    parser = CSSParser(raiseExceptions=False, log=_css_logger,
            fetcher=lambda x:(None, ''))
    for el in root.iter(etree.Element):
        try:
            tag = el.tag
        except UnicodeDecodeError:
            continue

        if tag == XHTML('style') and el.text and \
                (_css_url_re.search(el.text) is not None or '@import' in
                        el.text):
            stylesheet = parser.parseString(el.text, validate=False)
            replaceUrls(stylesheet, link_repl_func)
            repl = css_text(stylesheet)
            el.text = '\n'+ clean_xml_chars(repl) + '\n'

        text = el.get('style')
        if text and _css_url_re.search(text) is not None:
            try:
                stext = parser.parseStyle(text, validate=False)
            except Exception:
                # Parsing errors are raised by css_parser
                continue
            replaceUrls(stext, link_repl_func)
            repl = css_text(stext).replace('\n', ' ').replace('\r',
                    ' ')
            el.set('style', repl)


types_map = get_types_map()
EPUB_MIME      = types_map['.epub']
XHTML_MIME     = types_map['.xhtml']
CSS_MIME       = types_map['.css']
NCX_MIME       = types_map['.ncx']
OPF_MIME       = types_map['.opf']
PAGE_MAP_MIME  = 'application/oebps-page-map+xml'
OEB_DOC_MIME   = 'text/x-oeb1-document'
OEB_CSS_MIME   = 'text/x-oeb1-css'
OPENTYPE_MIME  = types_map['.otf']
GIF_MIME       = types_map['.gif']
JPEG_MIME      = types_map['.jpeg']
PNG_MIME       = types_map['.png']
SVG_MIME       = types_map['.svg']
WEBP_MIME      = types_map['.webp']
BINARY_MIME    = 'application/octet-stream'

XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS

OEB_STYLES        = {CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css', 'xhtml/css'}
OEB_DOCS          = {XHTML_MIME, 'text/html', OEB_DOC_MIME,
                         'text/x-oeb-document'}
OEB_RASTER_IMAGES = {GIF_MIME, JPEG_MIME, PNG_MIME, WEBP_MIME}
OEB_IMAGES        = {GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME}

MS_COVER_TYPE = 'other.ms-coverimage-standard'

ENTITY_RE     = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);')
COLLAPSE_RE   = re.compile(r'[ \t\r\n\v]+')
QNAME_RE      = re.compile(r'^[{][^{}]+[}][^{}]+$')
PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+')
XMLDECL_RE    = re.compile(r'^\s*<[?]xml.*?[?]>')
CSSURL_RE     = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''')


def element(parent, *args, **kwargs):
    if parent is not None:
        return etree.SubElement(parent, *args, **kwargs)
    return etree.Element(*args, **kwargs)


def prefixname(name, nsrmap):
    if not isqname(name):
        return name
    ns = namespace(name)
    if ns not in nsrmap:
        return name
    prefix = nsrmap[ns]
    if not prefix:
        return barename(name)
    return ':'.join((prefix, barename(name)))


def isprefixname(name):
    return name and PREFIXNAME_RE.match(name) is not None


def qname(name, nsmap):
    if not isprefixname(name):
        return name
    prefix, local = name.split(':', 1)
    if prefix not in nsmap:
        return name
    return '{%s}%s' % (nsmap[prefix], local)


def isqname(name):
    return name and QNAME_RE.match(name) is not None


def XPath(expr):
    return etree.XPath(expr, namespaces=XPNSMAP)


def xpath(elem, expr):
    return elem.xpath(expr, namespaces=XPNSMAP)


def xml2str(root, pretty_print=False, strip_comments=False, with_tail=True):
    if not strip_comments:
        # -- in comments trips up adobe digital editions
        for x in root.iterdescendants(etree.Comment):
            if x.text and '--' in x.text:
                x.text = x.text.replace('--', '__')
    ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
                          pretty_print=pretty_print, with_tail=with_tail)

    if strip_comments:
        ans = re.compile(br'<!--.*?-->', re.DOTALL).sub(b'', ans)

    return ans


def xml2text(elem, pretty_print=False, method='text'):
    return etree.tostring(elem, method=method, encoding='unicode', with_tail=False, pretty_print=pretty_print)


def escape_cdata(root):
    pat = re.compile(r'[<>&]')
    for elem in root.iterdescendants('{%s}style' % XHTML_NS, '{%s}script' % XHTML_NS):
        if elem.text and pat.search(elem.text) is not None:
            elem.text = etree.CDATA(elem.text.replace(']]>', r'\]\]\>'))


def serialize(data, media_type, pretty_print=False):
    if isinstance(data, etree._Element):
        is_oeb_doc = media_type in OEB_DOCS
        if is_oeb_doc:
            escape_cdata(data)
        ans = xml2str(data, pretty_print=pretty_print)
        if is_oeb_doc:
            # Convert self closing div|span|a|video|audio|iframe|etc tags
            # to normally closed ones, as they are interpreted
            # incorrectly by some browser based renderers
            ans = close_self_closing_tags(ans)
        return ans
    if isinstance(data, str):
        return data.encode('utf-8')
    if hasattr(data, 'cssText'):
        data = data.cssText
        if isinstance(data, str):
            data = data.encode('utf-8')
        return data + b'\n'
    return bytes(data)


ASCII_CHARS   = frozenset(codepoint_to_chr(x) for x in range(128))
UNIBYTE_CHARS = frozenset(x.encode('ascii') for x in ASCII_CHARS)
USAFE         = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789' '_.-/~')
URL_SAFE      = frozenset(USAFE)
URL_SAFE_BYTES = frozenset(USAFE.encode('ascii'))
URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE_BYTES]
del USAFE


def urlquote(href):
    """ Quote URL-unsafe characters, allowing IRI-safe characters.
    That is, this function returns valid IRIs not valid URIs. In particular,
    IRIs can contain non-ascii characters.  """
    result = []
    isbytes = isinstance(href, bytes)
    unsafe = URL_UNSAFE[int(isbytes)]
    esc, join = "%%%02x", ''
    if isbytes:
        esc, join = esc.encode('ascii'), b''
    for char in href:
        if char in unsafe:
            char = esc % ord(char)
        result.append(char)
    return join.join(result)


def urlnormalize(href):
    """Convert a URL into normalized form, with all and only URL-unsafe
    characters URL quoted.
    """
    try:
        parts = urlparse(href)
    except ValueError as e:
        raise ValueError('Failed to parse the URL: %r with underlying error: %s' % (href, as_unicode(e)))
    if not parts.scheme or parts.scheme == 'file':
        path, frag = urldefrag(href)
        parts = ('', '', path, '', '', frag)
    parts = (part.replace('\\', '/') for part in parts)
    parts = (urlunquote(part) for part in parts)
    parts = (urlquote(part) for part in parts)
    return urlunparse(parts)


def extract(elem):
    """
    Removes this element from the tree, including its children and
    text.  The tail text is joined to the previous element or
    parent.
    """
    parent = elem.getparent()
    if parent is not None:
        if elem.tail:
            previous = elem.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + elem.tail
            else:
                previous.tail = (previous.tail or '') + elem.tail
        parent.remove(elem)


class DummyHandler(logging.Handler):

    def __init__(self):
        logging.Handler.__init__(self, logging.WARNING)
        self.setFormatter(logging.Formatter('%(message)s'))
        self.log = None

    def emit(self, record):
        if self.log is not None:
            msg = self.format(record)
            f = self.log.error if record.levelno >= logging.ERROR \
                    else self.log.warn
            f(msg)


_css_logger = logging.getLogger('calibre.css')
_css_logger.setLevel(logging.WARNING)
_css_log_handler = DummyHandler()
_css_logger.addHandler(_css_log_handler)


class OEBError(Exception):
    """Generic OEB-processing error."""
    pass


class NullContainer:
    """An empty container.

    For use with book formats which do not support container-like access.
    """

    def __init__(self, log):
        self.log = log

    def read(self, path):
        raise OEBError('Attempt to read from NullContainer')

    def write(self, path):
        raise OEBError('Attempt to write to NullContainer')

    def exists(self, path):
        return False

    def namelist(self):
        return []


class DirContainer:
    """Filesystem directory container."""

    def __init__(self, path, log, ignore_opf=False):
        self.log = log
        if isbytestring(path):
            path = path.decode(filesystem_encoding)
        self.opfname = None
        ext = os.path.splitext(path)[1].lower()
        if ext == '.opf':
            self.opfname = os.path.basename(path)
            self.rootdir = os.path.dirname(path)
            return
        self.rootdir = path
        if not ignore_opf:
            for path in self.namelist():
                ext = os.path.splitext(path)[1].lower()
                if ext == '.opf':
                    self.opfname = path
                    return

    def _unquote(self, path):
        # unquote must run on a bytestring and will return a bytestring
        # If it runs on a unicode object, it returns a double encoded unicode
        # string: unquote(u'%C3%A4') != unquote(b'%C3%A4').decode('utf-8')
        # and the latter is correct
        if isinstance(path, str):
            path = path.encode('utf-8')
        return urlunquote(path).decode('utf-8')

    def read(self, path):
        if path is None:
            path = self.opfname
        path = os.path.join(self.rootdir, self._unquote(path))
        with lopen(path, 'rb') as f:
            return f.read()

    def write(self, path, data):
        path = os.path.join(self.rootdir, self._unquote(path))
        dir = os.path.dirname(path)
        if not os.path.isdir(dir):
            os.makedirs(dir)
        with lopen(path, 'wb') as f:
            return f.write(data)

    def exists(self, path):
        if not path:
            return False
        try:
            path = os.path.join(self.rootdir, self._unquote(path))
        except ValueError:  # Happens if path contains quoted special chars
            return False
        try:
            return os.path.isfile(path)
        except UnicodeEncodeError:
            # On linux, if LANG is unset, the os.stat call tries to encode the
            # unicode path using ASCII
            # To replicate try:
            # LANG=en_US.ASCII python -c "import os; os.stat(u'Espa\xf1a')"
            return os.path.isfile(path.encode(filesystem_encoding))

    def namelist(self):
        names = []
        base = self.rootdir
        for root, dirs, files in os.walk(base):
            for fname in files:
                fname = os.path.join(root, fname)
                if isinstance(fname, bytes):
                    try:
                        fname = fname.decode(filesystem_encoding)
                    except Exception:
                        try:
                            fname = fname.decode('utf-8')
                        except Exception:
                            continue
                fname = fname.replace('\\', '/')
                names.append(fname)
        return names


class Metadata:
    """A collection of OEB data model metadata.

    Provides access to the list of items associated with a particular metadata
    term via the term's local name using either Python container or attribute
    syntax.  Return an empty list for any terms with no currently associated
    metadata items.
    """

    DC_TERMS      = {'contributor', 'coverage', 'creator', 'date',
                         'description', 'format', 'identifier', 'language',
                         'publisher', 'relation', 'rights', 'source',
                         'subject', 'title', 'type'}
    CALIBRE_TERMS = {'series', 'series_index', 'rating', 'timestamp',
                         'publication_type', 'title_sort'}
    OPF_ATTRS     = {'role': OPF('role'), 'file-as': OPF('file-as'),
                     'scheme': OPF('scheme'), 'event': OPF('event'),
                     'type': XSI('type'), 'lang': XML('lang'), 'id': 'id'}
    OPF1_NSMAP    = {'dc': DC11_NS, 'oebpackage': OPF1_NS}
    OPF2_NSMAP    = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,
                     'xsi': XSI_NS, 'calibre': CALIBRE_NS}

    class Item:
        """An item of OEB data model metadata.

        The metadata term or name may be accessed via the :attr:`term` or
        :attr:`name` attributes.  The metadata value or content may be accessed
        via the :attr:`value` or :attr:`content` attributes, or via Unicode or
        string representations of the object.

        OEB data model metadata attributes may be accessed either via their
        fully-qualified names using the Python container access syntax, or via
        their local names using Python attribute syntax.  Only attributes
        allowed by the OPF 2.0 specification are supported.
        """
        class Attribute:
            """Smart accessor for allowed OEB metadata item attributes."""

            def __init__(self, attr, allowed=None):
                if not callable(attr):
                    attr_, attr = attr, lambda term: attr_
                self.attr = attr
                self.allowed = allowed

            def term_attr(self, obj):
                term = obj.term
                if namespace(term) != DC11_NS:
                    term = OPF('meta')
                allowed = self.allowed
                if allowed is not None and term not in allowed:
                    raise AttributeError(
                        'attribute %r not valid for metadata term %r' % (
                            self.attr(term), barename(obj.term)))
                return self.attr(term)

            def __get__(self, obj, cls):
                if obj is None:
                    return None
                return obj.attrib.get(self.term_attr(obj), '')

            def __set__(self, obj, value):
                obj.attrib[self.term_attr(obj)] = value

        def __init__(self, term, value, attrib={}, nsmap={}, **kwargs):
            self.attrib = attrib = dict(attrib)
            self.nsmap = nsmap = dict(nsmap)
            attrib.update(kwargs)
            if namespace(term) == OPF2_NS:
                term = barename(term)
            ns = namespace(term)
            local = barename(term).lower()
            if local in Metadata.DC_TERMS and (not ns or ns in DC_NSES):
                # Anything looking like Dublin Core is coerced
                term = DC(local)
            elif local in Metadata.CALIBRE_TERMS and ns in (CALIBRE_NS, ''):
                # Ditto for Calibre-specific metadata
                term = CALIBRE(local)
            self.term = term
            self.value = value
            for attr, value in tuple(iteritems(attrib)):
                if isprefixname(value):
                    attrib[attr] = qname(value, nsmap)
                nsattr = Metadata.OPF_ATTRS.get(attr, attr)
                if nsattr == OPF('scheme') and namespace(term) != DC11_NS:
                    # The opf:meta element takes @scheme, not @opf:scheme
                    nsattr = 'scheme'
                if attr != nsattr:
                    attrib[nsattr] = attrib.pop(attr)

        @property
        def name(self):
            return self.term

        @property
        def content(self):
            return self.value

        @content.setter
        def content(self, value):
            self.value = value

        scheme  = Attribute(lambda term: 'scheme' if
                            term == OPF('meta') else OPF('scheme'),
                            [DC('identifier'), OPF('meta')])
        file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor'),
                                             DC('title')])
        role    = Attribute(OPF('role'), [DC('creator'), DC('contributor')])
        event   = Attribute(OPF('event'), [DC('date')])
        id      = Attribute('id')
        type    = Attribute(XSI('type'), [DC('date'), DC('format'),
                                          DC('type')])
        lang    = Attribute(XML('lang'), [DC('contributor'), DC('coverage'),
                                          DC('creator'), DC('publisher'),
                                          DC('relation'), DC('rights'),
                                          DC('source'), DC('subject'),
                                          OPF('meta')])

        def __getitem__(self, key):
            return self.attrib[key]

        def __setitem__(self, key, value):
            self.attrib[key] = value

        def __contains__(self, key):
            return key in self.attrib

        def get(self, key, default=None):
            return self.attrib.get(key, default)

        def __repr__(self):
            return 'Item(term=%r, value=%r, attrib=%r)' \
                % (barename(self.term), self.value, self.attrib)

        def __str__(self):
            return as_unicode(self.value)

        def to_opf1(self, dcmeta=None, xmeta=None, nsrmap={}):
            attrib = {}
            for key, value in self.attrib.items():
                if namespace(key) == OPF2_NS:
                    key = barename(key)
                attrib[key] = prefixname(value, nsrmap)
            if namespace(self.term) == DC11_NS:
                name = DC(icu_title(barename(self.term)))
                elem = element(dcmeta, name, attrib=attrib)
                elem.text = self.value
            else:
                elem = element(xmeta, 'meta', attrib=attrib)
                elem.attrib['name'] = prefixname(self.term, nsrmap)
                elem.attrib['content'] = prefixname(self.value, nsrmap)
            return elem

        def to_opf2(self, parent=None, nsrmap={}):
            attrib = {}
            for key, value in self.attrib.items():
                attrib[key] = prefixname(value, nsrmap)
            if namespace(self.term) == DC11_NS:
                elem = element(parent, self.term, attrib=attrib)
                try:
                    elem.text = self.value
                except:
                    elem.text = repr(self.value)
            else:
                elem = element(parent, OPF('meta'), attrib=attrib)
                elem.attrib['name'] = prefixname(self.term, nsrmap)
                elem.attrib['content'] = prefixname(self.value, nsrmap)
            return elem

    def __init__(self, oeb):
        self.oeb = oeb
        self.items = defaultdict(list)
        self.primary_writing_mode = None

    def add(self, term, value, attrib={}, nsmap={}, **kwargs):
        """Add a new metadata item."""
        item = self.Item(term, value, attrib, nsmap, **kwargs)
        items = self.items[barename(item.term)]
        items.append(item)
        return item

    def iterkeys(self):
        yield from self.items
    __iter__ = iterkeys

    def clear(self, key):
        l = self.items[key]
        for x in list(l):
            l.remove(x)

    def filter(self, key, predicate):
        l = self.items[key]
        for x in list(l):
            if predicate(x):
                l.remove(x)

    def __getitem__(self, key):
        return self.items[key]

    def __contains__(self, key):
        return key in self.items

    def __getattr__(self, term):
        return self.items[term]

    @property
    def _nsmap(self):
        nsmap = {}
        for term in self.items:
            for item in self.items[term]:
                nsmap.update(item.nsmap)
        return nsmap

    @property
    def _opf1_nsmap(self):
        nsmap = self._nsmap
        for key, value in nsmap.items():
            if value in OPF_NSES or value in DC_NSES:
                del nsmap[key]
        return nsmap

    @property
    def _opf2_nsmap(self):
        nsmap = self._nsmap
        nsmap.update(OPF2_NSMAP)
        return nsmap

    def to_opf1(self, parent=None):
        nsmap = self._opf1_nsmap
        nsrmap = {value: key for key, value in iteritems(nsmap)}
        elem = element(parent, 'metadata', nsmap=nsmap)
        dcmeta = element(elem, 'dc-metadata', nsmap=OPF1_NSMAP)
        xmeta = element(elem, 'x-metadata')
        for term in self.items:
            for item in self.items[term]:
                item.to_opf1(dcmeta, xmeta, nsrmap=nsrmap)
        if 'ms-chaptertour' not in self.items:
            chaptertour = self.Item('ms-chaptertour', 'chaptertour')
            chaptertour.to_opf1(dcmeta, xmeta, nsrmap=nsrmap)
        return elem

    def to_opf2(self, parent=None):
        nsmap = self._opf2_nsmap
        nsrmap = {value: key for key, value in iteritems(nsmap)}
        elem = element(parent, OPF('metadata'), nsmap=nsmap)
        for term in self.items:
            for item in self.items[term]:
                item.to_opf2(elem, nsrmap=nsrmap)
        if self.primary_writing_mode:
            elem.append(elem.makeelement(OPF('meta'), attrib={'name':'primary-writing-mode', 'content':self.primary_writing_mode}))
        return elem


class Manifest:
    """Collection of files composing an OEB data model book.

    Provides access to the content of the files composing the book and
    attributes associated with those files, including their internal paths,
    unique identifiers, and MIME types.

    Itself acts as a :class:`set` of manifest items, and provides the following
    instance data member for dictionary-like access:

    :attr:`ids`: A dictionary in which the keys are the unique identifiers of
        the manifest items and the values are the items themselves.
    :attr:`hrefs`: A dictionary in which the keys are the internal paths of the
        manifest items and the values are the items themselves.
    """

    class Item:
        """An OEB data model book content file.

        Provides the following data members for accessing the file content and
        metadata associated with this particular file.

        :attr:`id`: Unique identifier.
        :attr:`href`: Book-internal path.
        :attr:`media_type`: MIME type of the file content.
        :attr:`fallback`: Unique id of any fallback manifest item associated
            with this manifest item.
        :attr:`spine_position`: Display/reading order index for book textual
            content.  `None` for manifest items which are not part of the
            book's textual content.
        :attr:`linear`: `True` for textual content items which are part of the
            primary linear reading order and `False` for textual content items
            which are not (such as footnotes).  Meaningless for items which
            have a :attr:`spine_position` of `None`.
        """

        def __init__(self, oeb, id, href, media_type,
                     fallback=None, loader=str, data=None):
            if href:
                href = str(href)
            self.oeb = oeb
            self.id = id
            self.href = self.path = urlnormalize(href)
            self.media_type = media_type
            self.fallback = fallback
            self.override_css_fetch = None
            self.resolve_css_imports = True
            self.spine_position = None
            self.linear = True
            if loader is None and data is None:
                loader = oeb.container.read
            self._loader = loader
            self._data = data

        def __repr__(self):
            return 'Item(id=%r, href=%r, media_type=%r)' \
                % (self.id, self.href, self.media_type)

        # Parsing {{{
        def _parse_xml(self, data):
            if not data:
                return
            data = xml_to_unicode(data, strip_encoding_pats=True,
                    assume_utf8=True, resolve_entities=True)[0]
            return safe_xml_fromstring(data)

        def _parse_xhtml(self, data):
            orig_data = data
            fname = urlunquote(self.href)
            self.oeb.log.debug('Parsing', fname, '...')
            self.oeb.html_preprocessor.current_href = self.href
            try:
                data = parse_html(data, log=self.oeb.log,
                        decoder=self.oeb.decode,
                        preprocessor=self.oeb.html_preprocessor,
                        filename=fname, non_html_file_tags={'ncx'})
            except NotHTML:
                return self._parse_xml(orig_data)
            return data

        def _parse_txt(self, data):
            has_html = '<html>'
            if isinstance(data, bytes):
                has_html = has_html.encode('ascii')
            if has_html in data:
                return self._parse_xhtml(data)

            self.oeb.log.debug('Converting', self.href, '...')

            from calibre.ebooks.txt.processor import convert_markdown

            title = self.oeb.metadata.title
            if title:
                title = str(title[0])
            else:
                title = _('Unknown')

            return self._parse_xhtml(convert_markdown(data, title=title))

        def _parse_css(self, data):
            from css_parser import CSSParser, log, resolveImports
            from css_parser.css import CSSRule
            log.setLevel(logging.WARN)
            log.raiseExceptions = False
            self.oeb.log.debug('Parsing', self.href, '...')
            data = self.oeb.decode(data)
            data = self.oeb.css_preprocessor(data, add_namespace=False)
            parser = CSSParser(loglevel=logging.WARNING,
                               fetcher=self.override_css_fetch or self._fetch_css,
                               log=_css_logger)
            data = parser.parseString(data, href=self.href, validate=False)
            if self.resolve_css_imports:
                data = resolveImports(data)
            for rule in tuple(data.cssRules.rulesOfType(CSSRule.PAGE_RULE)):
                data.cssRules.remove(rule)
            return data

        def _fetch_css(self, path):
            hrefs = self.oeb.manifest.hrefs
            if path not in hrefs:
                self.oeb.logger.warn('CSS import of missing file %r' % path)
                return (None, None)
            item = hrefs[path]
            if item.media_type not in OEB_STYLES:
                self.oeb.logger.warn('CSS import of non-CSS file %r' % path)
                return (None, None)
            data = item.data.cssText
            enc = None if isinstance(data, str) else 'utf-8'
            return (enc, data)

        # }}}

        @property
        def data(self):
            """Provides MIME type sensitive access to the manifest
            entry's associated content.

            - XHTML, HTML, and variant content is parsed as necessary to
              convert and return as an lxml.etree element in the XHTML
              namespace.
            - XML content is parsed and returned as an lxml.etree element.
            - CSS and CSS-variant content is parsed and returned as a css_parser
              CSS DOM stylesheet.
            - All other content is returned as a :class:`str` or :class:`bytes`
              object with no special parsing.
            """
            data = self._data
            if data is None:
                if self._loader is None:
                    return None
                data = self._loader(getattr(self, 'html_input_href',
                    self.href))
            try:
                mt = self.media_type.lower()
            except Exception:
                mt = 'application/octet-stream'
            if not isinstance(data, string_or_bytes):
                pass  # already parsed
            elif mt in OEB_DOCS:
                data = self._parse_xhtml(data)
            elif mt[-4:] in ('+xml', '/xml'):
                data = self._parse_xml(data)
            elif mt in OEB_STYLES:
                data = self._parse_css(data)
            elif mt == 'text/plain':
                self.oeb.log.warn('%s contains data in TXT format'%self.href,
                        'converting to HTML')
                data = self._parse_txt(data)
                self.media_type = XHTML_MIME
            self._data = data
            return data

        @data.setter
        def data(self, value):
            self._data = value

        @data.deleter
        def data(self):
            self._data = None

        def reparse_css(self):
            self._data = self._parse_css(str(self))

        def unload_data_from_memory(self, memory=None):
            if isinstance(self._data, bytes):
                if memory is None:
                    from calibre.ptempfile import PersistentTemporaryFile
                    pt = PersistentTemporaryFile(suffix='_oeb_base_mem_unloader.img')
                    with pt:
                        pt.write(self._data)
                    self.oeb._temp_files.append(pt.name)

                    def loader(*args):
                        with open(pt.name, 'rb') as f:
                            ans = f.read()
                        os.remove(pt.name)
                        return ans
                    self._loader = loader
                else:
                    def loader2(*args):
                        with open(memory, 'rb') as f:
                            ans = f.read()
                        return ans
                    self._loader = loader2
                self._data = None

        @property
        def unicode_representation(self):
            data = self.data
            if isinstance(data, etree._Element):
                return xml2text(data, pretty_print=self.oeb.pretty_print)
            if isinstance(data, str):
                return data
            if hasattr(data, 'cssText'):
                return css_text(data)
            return str(data)

        @property
        def bytes_representation(self):
            return serialize(self.data, self.media_type, pretty_print=self.oeb.pretty_print)

        def __str__(self):
            return self.unicode_representation

        def __eq__(self, other):
            return self is other

        def __ne__(self, other):
            return self is not other

        def __hash__(self):
            return id(self)

        @property
        def sort_key(self):
            href = self.href
            if isinstance(href, bytes):
                href = force_unicode(href)
            sp = self.spine_position if isinstance(self.spine_position, numbers.Number) else sys.maxsize
            return sp, (self.media_type or '').lower(), numeric_sort_key(href), self.id

        def relhref(self, href):
            """Convert the URL provided in :param:`href` from a book-absolute
            reference to a reference relative to this manifest item.
            """
            return rel_href(self.href, href)

        def abshref(self, href):
            """Convert the URL provided in :param:`href` from a reference
            relative to this manifest item to a book-absolute reference.
            """
            try:
                purl = urlparse(href)
            except ValueError:
                return href
            scheme = purl.scheme
            if scheme and scheme != 'file':
                return href
            purl = list(purl)
            purl[0] = ''
            href = urlunparse(purl)
            path, frag = urldefrag(href)
            if not path:
                if frag:
                    return '#'.join((self.href, frag))
                else:
                    return self.href
            if '/' not in self.href:
                return href
            dirname = os.path.dirname(self.href)
            href = os.path.join(dirname, href)
            href = os.path.normpath(href).replace('\\', '/')
            return href

    def __init__(self, oeb):
        self.oeb = oeb
        self.items = set()
        self.ids = {}
        self.hrefs = {}

    def add(self, id, href, media_type, fallback=None, loader=None, data=None):
        """Add a new item to the book manifest.

        The item's :param:`id`, :param:`href`, and :param:`media_type` are all
        required.  A :param:`fallback` item-id is required for any items with a
        MIME type which is not one of the OPS core media types.  Either the
        item's data itself may be provided with :param:`data`, or a loader
        function for the data may be provided with :param:`loader`, or the
        item's data may later be set manually via the :attr:`data` attribute.
        """
        item = self.Item(
            self.oeb, id, href, media_type, fallback, loader, data)
        self.items.add(item)
        self.ids[item.id] = item
        self.hrefs[item.href] = item
        return item

    def remove(self, item):
        """Removes :param:`item` from the manifest."""
        if item in self.ids:
            item = self.ids[item]
        del self.ids[item.id]
        if item.href in self.hrefs:
            del self.hrefs[item.href]
        self.items.remove(item)
        if item in self.oeb.spine:
            self.oeb.spine.remove(item)

    def remove_duplicate_item(self, item):
        if item in self.ids:
            item = self.ids[item]
        del self.ids[item.id]
        self.items.remove(item)

    def generate(self, id=None, href=None):
        """Generate a new unique identifier and/or internal path for use in
        creating a new manifest item, using the provided :param:`id` and/or
        :param:`href` as bases.

        Returns an two-tuple of the new id and path.  If either :param:`id` or
        :param:`href` are `None` then the corresponding item in the return
        tuple will also be `None`.
        """
        if id is not None:
            base = id
            index = 1
            while id in self.ids:
                id = base + str(index)
                index += 1
        if href is not None:
            href = urlnormalize(href)
            base, ext = os.path.splitext(href)
            index = 1
            lhrefs = {x.lower() for x in self.hrefs}
            while href.lower() in lhrefs:
                href = base + str(index) + ext
                index += 1
        return id, str(href)

    def __iter__(self):
        yield from self.items

    def __len__(self):
        return len(self.items)

    def values(self):
        return list(self.items)

    def __contains__(self, item):
        return item in self.items

    def to_opf1(self, parent=None):
        elem = element(parent, 'manifest')
        for item in self.items:
            media_type = item.media_type
            if media_type in OEB_DOCS:
                media_type = OEB_DOC_MIME
            elif media_type in OEB_STYLES:
                media_type = OEB_CSS_MIME
            attrib = {'id': item.id, 'href': urlunquote(item.href),
                      'media-type': media_type}
            if item.fallback:
                attrib['fallback'] = item.fallback
            element(elem, 'item', attrib=attrib)
        return elem

    def to_opf2(self, parent=None):
        elem = element(parent, OPF('manifest'))
        for item in sorted(self.items, key=attrgetter('sort_key')):
            media_type = item.media_type
            if media_type in OEB_DOCS:
                media_type = XHTML_MIME
            elif media_type in OEB_STYLES:
                media_type = CSS_MIME
            attrib = {'id': item.id, 'href': urlunquote(item.href),
                      'media-type': media_type}
            if item.fallback:
                attrib['fallback'] = item.fallback
            element(elem, OPF('item'), attrib=attrib)
        return elem

    @property
    def main_stylesheet(self):
        ans = getattr(self, '_main_stylesheet', None)
        if ans is None:
            for item in self:
                if item.media_type.lower() in OEB_STYLES:
                    ans = item
                    break
        return ans

    @main_stylesheet.setter
    def main_stylesheet(self, item):
        self._main_stylesheet = item


class Spine:
    """Collection of manifest items composing an OEB data model book's main
    textual content.

    The spine manages which manifest items compose the book's main textual
    content and the sequence in which they appear.  Provides Python container
    access as a list-like object.
    """

    def __init__(self, oeb):
        self.oeb = oeb
        self.items = []
        self.page_progression_direction = None

    def _linear(self, linear):
        if isinstance(linear, string_or_bytes):
            linear = linear.lower()
        if linear is None or linear in ('yes', 'true'):
            linear = True
        elif linear in ('no', 'false'):
            linear = False
        return linear

    def add(self, item, linear=None):
        """Append :param:`item` to the end of the `Spine`."""
        item.linear = self._linear(linear)
        item.spine_position = len(self.items)
        self.items.append(item)
        return item

    def insert(self, index, item, linear):
        """Insert :param:`item` at position :param:`index` in the `Spine`."""
        item.linear = self._linear(linear)
        item.spine_position = index
        self.items.insert(index, item)
        for i in range(index, len(self.items)):
            self.items[i].spine_position = i
        return item

    def remove(self, item):
        """Remove :param:`item` from the `Spine`."""
        index = item.spine_position
        self.items.pop(index)
        for i in range(index, len(self.items)):
            self.items[i].spine_position = i
        item.spine_position = None

    def index(self, item):
        for i, x in enumerate(self):
            if item == x:
                return i
        return -1

    def __iter__(self):
        yield from self.items

    def __getitem__(self, index):
        return self.items[index]

    def __len__(self):
        return len(self.items)

    def __contains__(self, item):
        return (item in self.items)

    def to_opf1(self, parent=None):
        elem = element(parent, 'spine')
        for item in self.items:
            if item.linear:
                element(elem, 'itemref', attrib={'idref': item.id})
        return elem

    def to_opf2(self, parent=None):
        elem = element(parent, OPF('spine'))
        for item in self.items:
            attrib = {'idref': item.id}
            if not item.linear:
                attrib['linear'] = 'no'
            element(elem, OPF('itemref'), attrib=attrib)
        return elem


class Guide:
    """Collection of references to standard frequently-occurring sections
    within an OEB data model book.

    Provides dictionary-like access, in which the keys are the OEB reference
    type identifiers and the values are `Reference` objects.
    """

    class Reference:
        """Reference to a standard book section.

        Provides the following instance data members:

        :attr:`type`: Reference type identifier, as chosen from the list
            allowed in the OPF 2.0 specification.
        :attr:`title`: Human-readable section title.
        :attr:`href`: Book-internal URL of the referenced section.  May include
            a fragment identifier.
        """
        _TYPES_TITLES = [('cover', __('Cover')),
                         ('title-page', __('Title page')),
                         ('toc', __('Table of Contents')),
                         ('index', __('Index')),
                         ('glossary', __('Glossary')),
                         ('acknowledgements', __('Acknowledgements')),
                         ('bibliography', __('Bibliography')),
                         ('colophon', __('Colophon')),
                         ('copyright-page', __('Copyright')),
                         ('dedication', __('Dedication')),
                         ('epigraph', __('Epigraph')),
                         ('foreword', __('Foreword')),
                         ('loi', __('List of illustrations')),
                         ('lot', __('List of tables')),
                         ('notes', __('Notes')),
                         ('preface', __('Preface')),
                         ('text', __('Main text'))]
        TITLES = dict(_TYPES_TITLES)
        TYPES = frozenset(TITLES)
        ORDER = {t: i for i, (t, _) in enumerate(_TYPES_TITLES)}

        def __init__(self, oeb, type, title, href):
            self.oeb = oeb
            if type.lower() in self.TYPES:
                type = type.lower()
            elif type not in self.TYPES and \
                 not type.startswith('other.'):
                type = 'other.' + type
            if not title and type in self.TITLES:
                title = oeb.translate(self.TITLES[type])
            self.type = type
            self.title = title
            self.href = urlnormalize(href)

        def __repr__(self):
            return 'Reference(type=%r, title=%r, href=%r)' \
                % (self.type, self.title, self.href)

        @property
        def item(self):
            """The manifest item associated with this reference."""
            path = urldefrag(self.href)[0]
            hrefs = self.oeb.manifest.hrefs
            return hrefs.get(path, None)

    def __init__(self, oeb):
        self.oeb = oeb
        self.refs = {}

    def add(self, type, title, href):
        """Add a new reference to the `Guide`."""
        if href:
            href = str(href)
        ref = self.Reference(self.oeb, type, title, href)
        self.refs[type] = ref
        return ref

    def remove(self, type):
        return self.refs.pop(type, None)

    def remove_by_href(self, href):
        remove = [r for r, i in iteritems(self.refs) if i.href == href]
        for r in remove:
            self.remove(r)

    def iterkeys(self):
        yield from self.refs
    __iter__ = iterkeys

    def values(self):
        return sorted(itervalues(self.refs), key=lambda ref: ref.ORDER.get(ref.type, 10000))

    def items(self):
        yield from self.refs.items()

    def __getitem__(self, key):
        return self.refs[key]

    def get(self, key):
        return self.refs.get(key)

    def __delitem__(self, key):
        del self.refs[key]

    def __contains__(self, key):
        return key in self.refs

    def __len__(self):
        return len(self.refs)

    def to_opf1(self, parent=None):
        elem = element(parent, 'guide')
        for ref in self.refs.values():
            attrib = {'type': ref.type, 'href': urlunquote(ref.href)}
            if ref.title:
                attrib['title'] = ref.title
            element(elem, 'reference', attrib=attrib)
        return elem

    def to_opf2(self, parent=None):
        if not len(self):
            return
        elem = element(parent, OPF('guide'))
        for ref in self.refs.values():
            attrib = {'type': ref.type, 'href': urlunquote(ref.href)}
            if ref.title:
                attrib['title'] = ref.title
            element(elem, OPF('reference'), attrib=attrib)
        return elem


class TOC:
    """Represents a hierarchical table of contents or navigation tree for
    accessing arbitrary semantic sections within an OEB data model book.

    Acts as a node within the navigation tree.  Provides list-like access to
    sub-nodes.  Provides the follow node instance data attributes:

    :attr:`title`: The title of this navigation node.
    :attr:`href`: Book-internal URL referenced by this node.
    :attr:`klass`: Optional semantic class referenced by this node.
    :attr:`id`: Option unique identifier for this node.
    :attr:`author`: Optional author attribution for periodicals <mbp:>
    :attr:`description`: Optional description attribute for periodicals <mbp:>
    :attr:`toc_thumbnail`: Optional toc thumbnail image
    """

    def __init__(self, title=None, href=None, klass=None, id=None,
            play_order=None, author=None, description=None, toc_thumbnail=None):
        self.title = title
        self.href = urlnormalize(href) if href else href
        self.klass = klass
        self.id = id
        self.nodes = []
        self.play_order = 0
        if play_order is None:
            play_order = self.next_play_order()
        self.play_order = play_order
        self.author = author
        self.description = description
        self.toc_thumbnail = toc_thumbnail

    def add(self, title, href, klass=None, id=None, play_order=0, author=None, description=None, toc_thumbnail=None):
        """Create and return a new sub-node of this node."""
        node = TOC(title, href, klass, id, play_order, author, description, toc_thumbnail)
        self.nodes.append(node)
        return node

    def remove(self, node):
        for child in self.nodes:
            if child is node:
                self.nodes.remove(child)
                return True
            else:
                if child.remove(node):
                    return True
        return False

    def iter(self):
        """Iterate over this node and all descendants in depth-first order."""
        yield self
        for child in self.nodes:
            yield from child.iter()

    def count(self):
        return len(list(self.iter())) - 1

    def next_play_order(self):
        entries = [x.play_order for x in self.iter()]
        base = max(entries) if entries else 0
        return base+1

    def has_href(self, href):
        for x in self.iter():
            if x.href == href:
                return True
        return False

    def has_text(self, text):
        for x in self.iter():
            if x.title and x.title.lower() == text.lower():
                return True
        return False

    def iterdescendants(self, breadth_first=False):
        """Iterate over all descendant nodes in depth-first order."""
        if breadth_first:
            for child in self.nodes:
                yield child
            for child in self.nodes:
                yield from child.iterdescendants(breadth_first=True)
        else:
            for child in self.nodes:
                yield from child.iter()

    def __iter__(self):
        """Iterate over all immediate child nodes."""
        yield from self.nodes

    def __getitem__(self, index):
        return self.nodes[index]

    def autolayer(self):
        """Make sequences of children pointing to the same content file into
        children of the first node referencing that file.
        """
        prev = None
        for node in list(self.nodes):
            if prev and urldefrag(prev.href)[0] == urldefrag(node.href)[0]:
                self.nodes.remove(node)
                prev.nodes.append(node)
            else:
                prev = node

    def depth(self):
        """The maximum depth of the navigation tree rooted at this node."""
        try:
            return max(node.depth() for node in self.nodes) + 1
        except ValueError:
            return 1

    def get_lines(self, lvl=0):
        ans = [('\t'*lvl) + 'TOC: %s --> %s'%(self.title, self.href)]
        for child in self:
            ans.extend(child.get_lines(lvl+1))
        return ans

    def __str__(self):
        return '\n'.join(self.get_lines())

    def to_opf1(self, tour):
        for node in self.nodes:
            element(tour, 'site', attrib={
                'title': node.title, 'href': urlunquote(node.href)})
            node.to_opf1(tour)
        return tour

    def to_ncx(self, parent=None):
        if parent is None:
            parent = etree.Element(NCX('navMap'))
        for node in self.nodes:
            id = node.id or uuid_id()
            po = node.play_order
            if po == 0:
                po = 1
            attrib = {'id': id, 'playOrder': str(po)}
            if node.klass:
                attrib['class'] = node.klass
            point = element(parent, NCX('navPoint'), attrib=attrib)
            label = etree.SubElement(point, NCX('navLabel'))
            title = node.title
            if title:
                title = re.sub(r'\s+', ' ', title)
            element(label, NCX('text')).text = title
            # Do not unescape this URL as ADE requires it to be escaped to
            # handle semi colons and other special characters in the file names
            element(point, NCX('content'), src=node.href)
            node.to_ncx(point)
        return parent

    def rationalize_play_orders(self):
        '''
        Ensure that all nodes with the same play_order have the same href and
        with different play_orders have different hrefs.
        '''
        def po_node(n):
            for x in self.iter():
                if x is n:
                    return
                if x.play_order == n.play_order:
                    return x

        def href_node(n):
            for x in self.iter():
                if x is n:
                    return
                if x.href == n.href:
                    return x

        for x in self.iter():
            y = po_node(x)
            if y is not None:
                if x.href != y.href:
                    x.play_order = getattr(href_node(x), 'play_order',
                            self.next_play_order())
            y = href_node(x)
            if y is not None:
                x.play_order = y.play_order


class PageList:
    """Collection of named "pages" to mapped positions within an OEB data model
    book's textual content.

    Provides list-like access to the pages.
    """

    class Page:
        """Represents a mapping between a page name and a position within
        the book content.

        Provides the following instance data attributes:

        :attr:`name`: The name of this page.  Generally a number.
        :attr:`href`: Book-internal URL at which point this page begins.
        :attr:`type`: Must be one of 'front' (for prefatory pages, as commonly
            labeled in print with small-case Roman numerals), 'normal' (for
            standard pages, as commonly labeled in print with Arabic numerals),
            or 'special' (for other pages, as commonly not labeled in any
            fashion in print, such as the cover and title pages).
        :attr:`klass`: Optional semantic class of this page.
        :attr:`id`: Optional unique identifier for this page.
        """
        TYPES = {'front', 'normal', 'special'}

        def __init__(self, name, href, type='normal', klass=None, id=None):
            self.name = str(name)
            self.href = urlnormalize(href)
            self.type = type if type in self.TYPES else 'normal'
            self.id = id
            self.klass = klass

    def __init__(self):
        self.pages = []

    def add(self, name, href, type='normal', klass=None, id=None):
        """Create a new page and add it to the `PageList`."""
        page = self.Page(name, href, type, klass, id)
        self.pages.append(page)
        return page

    def __len__(self):
        return len(self.pages)

    def __iter__(self):
        yield from self.pages

    def __getitem__(self, index):
        return self.pages[index]

    def pop(self, index=-1):
        return self.pages.pop(index)

    def remove(self, page):
        return self.pages.remove(page)

    def to_ncx(self, parent=None):
        plist = element(parent, NCX('pageList'), id=uuid_id())
        values = {t: count(1) for t in ('front', 'normal', 'special')}
        for page in self.pages:
            id = page.id or uuid_id()
            type = page.type
            value = str(next(values[type]))
            attrib = {'id': id, 'value': value, 'type': type, 'playOrder': '0'}
            if page.klass:
                attrib['class'] = page.klass
            ptarget = element(plist, NCX('pageTarget'), attrib=attrib)
            label = element(ptarget, NCX('navLabel'))
            element(label, NCX('text')).text = page.name
            element(ptarget, NCX('content'), src=page.href)
        return plist

    def to_page_map(self):
        pmap = etree.Element(OPF('page-map'), nsmap={None: OPF2_NS})
        for page in self.pages:
            element(pmap, OPF('page'), name=page.name, href=page.href)
        return pmap


class OEBBook:
    """Representation of a book in the IDPF OEB data model."""

    COVER_SVG_XP    = XPath('h:body//svg:svg[position() = 1]')
    COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')

    def __init__(self, logger,
            html_preprocessor,
            css_preprocessor=CSSPreProcessor(),
            encoding='utf-8', pretty_print=False,
            input_encoding='utf-8'):
        """Create empty book.  Arguments:

        :param:`encoding`: Default encoding for textual content read
            from an external container.
        :param:`pretty_print`: Whether or not the canonical string form
            of XML markup is pretty-printed.
        :param html_preprocessor: A callable that takes a unicode object
            and returns a unicode object. Will be called on all html files
            before they are parsed.
        :param css_preprocessor: A callable that takes a unicode object
            and returns a unicode object. Will be called on all CSS files
            before they are parsed.
        :param:`logger`: A Log object to use for logging all messages
            related to the processing of this book.  It is accessible
            via the instance data members :attr:`logger,log`.

        It provides the following public instance data members for
        accessing various parts of the OEB data model:

        :attr:`metadata`: Metadata such as title, author name(s), etc.
        :attr:`manifest`: Manifest of all files included in the book,
            including MIME types and fallback information.
        :attr:`spine`: In-order list of manifest items which compose
            the textual content of the book.
        :attr:`guide`: Collection of references to standard positions
            within the text, such as the cover, preface, etc.
        :attr:`toc`: Hierarchical table of contents.
        :attr:`pages`: List of "pages," such as indexed to a print edition of
            the same text.
        """
        _css_log_handler.log = logger
        self.encoding = encoding
        self.input_encoding = input_encoding
        self.html_preprocessor = html_preprocessor
        self.css_preprocessor = css_preprocessor
        self.pretty_print = pretty_print
        self.logger = self.log = logger
        self.version = '2.0'
        self.container = NullContainer(self.log)
        self.metadata = Metadata(self)
        self.uid = None
        self.manifest = Manifest(self)
        self.spine = Spine(self)
        self.guide = Guide(self)
        self.toc = TOC()
        self.pages = PageList()
        self.auto_generated_toc = True
        self._temp_files = []

    def clean_temp_files(self):
        for path in self._temp_files:
            try:
                os.remove(path)
            except:
                pass

    @classmethod
    def generate(cls, opts):
        """Generate an OEBBook instance from command-line options."""
        encoding = opts.encoding
        pretty_print = opts.pretty_print
        return cls(encoding=encoding, pretty_print=pretty_print)

    def translate(self, text):
        """Translate :param:`text` into the book's primary language."""
        lang = str(self.metadata.language[0])
        lang = lang.split('-', 1)[0].lower()
        return translate(lang, text)

    def decode(self, data):
        """Automatically decode :param:`data` into a `unicode` object."""
        def fix_data(d):
            return d.replace('\r\n', '\n').replace('\r', '\n')
        if isinstance(data, str):
            return fix_data(data)
        bom_enc = None
        if data[:4] in (b'\0\0\xfe\xff', b'\xff\xfe\0\0'):
            bom_enc = {b'\0\0\xfe\xff':'utf-32-be',
                    b'\xff\xfe\0\0':'utf-32-le'}[data[:4]]
            data = data[4:]
        elif data[:2] in (b'\xff\xfe', b'\xfe\xff'):
            bom_enc = {b'\xff\xfe':'utf-16-le', 'b\xfe\xff':'utf-16-be'}[data[:2]]
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            bom_enc = 'utf-8'
            data = data[3:]
        if bom_enc is not None:
            try:
                return fix_data(data.decode(bom_enc))
            except UnicodeDecodeError:
                pass
        if self.input_encoding:
            try:
                return fix_data(data.decode(self.input_encoding, 'replace'))
            except UnicodeDecodeError:
                pass
        try:
            return fix_data(data.decode('utf-8'))
        except UnicodeDecodeError:
            pass
        data, _ = xml_to_unicode(data)
        return fix_data(data)

    def to_opf1(self):
        """Produce OPF 1.2 representing the book's metadata and structure.

        Returns a dictionary in which the keys are MIME types and the values
        are tuples of (default) filenames and lxml.etree element structures.
        """
        package = etree.Element('package',
            attrib={'unique-identifier': self.uid.id})
        self.metadata.to_opf1(package)
        self.manifest.to_opf1(package)
        self.spine.to_opf1(package)
        tours = element(package, 'tours')
        tour = element(tours, 'tour',
            attrib={'id': 'chaptertour', 'title': 'Chapter Tour'})
        self.toc.to_opf1(tour)
        self.guide.to_opf1(package)
        return {OPF_MIME: ('content.opf', package)}

    def _update_playorder(self, ncx):
        hrefs = set(map(urlnormalize, xpath(ncx, '//ncx:content/@src')))
        playorder = {}
        next = 1
        selector = XPath('h:body//*[@id or @name]')
        for item in self.spine:
            base = item.href
            if base in hrefs:
                playorder[base] = next
                next += 1
            for elem in selector(item.data):
                added = False
                for attr in ('id', 'name'):
                    id = elem.get(attr)
                    if not id:
                        continue
                    href = '#'.join([base, id])
                    if href in hrefs:
                        playorder[href] = next
                        added = True
                if added:
                    next += 1
        selector = XPath('ncx:content/@src')
        for i, elem in enumerate(xpath(ncx, '//*[@playOrder and ./ncx:content[@src]]')):
            href = urlnormalize(selector(elem)[0])
            order = playorder.get(href, i)
            elem.attrib['playOrder'] = str(order)
        return

    def _to_ncx(self):
        lang = str(self.metadata.language[0])
        lang = lang.replace('_', '-')
        ncx = etree.Element(NCX('ncx'),
            attrib={'version': '2005-1', XML('lang'): lang},
            nsmap={None: NCX_NS})
        head = etree.SubElement(ncx, NCX('head'))
        etree.SubElement(head, NCX('meta'),
            name='dtb:uid', content=str(self.uid))
        etree.SubElement(head, NCX('meta'),
            name='dtb:depth', content=str(self.toc.depth()))
        generator = ''.join(['calibre (', __version__, ')'])
        etree.SubElement(head, NCX('meta'),
            name='dtb:generator', content=generator)
        etree.SubElement(head, NCX('meta'),
            name='dtb:totalPageCount', content=str(len(self.pages)))
        maxpnum = etree.SubElement(head, NCX('meta'),
            name='dtb:maxPageNumber', content='0')
        title = etree.SubElement(ncx, NCX('docTitle'))
        text = etree.SubElement(title, NCX('text'))
        text.text = str(self.metadata.title[0])
        navmap = etree.SubElement(ncx, NCX('navMap'))
        self.toc.to_ncx(navmap)
        if len(self.pages) > 0:
            plist = self.pages.to_ncx(ncx)
            value = max(int(x) for x in xpath(plist, '//@value'))
            maxpnum.attrib['content'] = str(value)
        self._update_playorder(ncx)
        return ncx

    def to_opf2(self, page_map=False):
        """Produce OPF 2.0 representing the book's metadata and structure.

        Returns a dictionary in which the keys are MIME types and the values
        are tuples of (default) filenames and lxml.etree element structures.
        """
        results = {}
        package = etree.Element(OPF('package'),
            attrib={'version': '2.0', 'unique-identifier': self.uid.id},
            nsmap={None: OPF2_NS})
        self.metadata.to_opf2(package)
        manifest = self.manifest.to_opf2(package)
        spine = self.spine.to_opf2(package)
        self.guide.to_opf2(package)
        results[OPF_MIME] = ('content.opf', package)
        id, href = self.manifest.generate('ncx', 'toc.ncx')
        etree.SubElement(manifest, OPF('item'), id=id, href=href,
                         attrib={'media-type': NCX_MIME})
        spine.attrib['toc'] = id
        results[NCX_MIME] = (href, self._to_ncx())
        if page_map and len(self.pages) > 0:
            id, href = self.manifest.generate('page-map', 'page-map.xml')
            etree.SubElement(manifest, OPF('item'), id=id, href=href,
                             attrib={'media-type': PAGE_MAP_MIME})
            spine.attrib['page-map'] = id
            results[PAGE_MAP_MIME] = (href, self.pages.to_page_map())
        if self.spine.page_progression_direction in {'ltr', 'rtl'}:
            spine.attrib['page-progression-direction'] = self.spine.page_progression_direction
        return results


def rel_href(base_href, href):
    """Convert the URL provided in :param:`href` to a URL relative to the URL
    in :param:`base_href`  """
    if urlparse(href).scheme:
        return href
    if '/' not in base_href:
        return href
    base = list(filter(lambda x: x and x != '.', os.path.dirname(os.path.normpath(base_href)).replace(os.sep, '/').split('/')))
    while True:
        try:
            idx = base.index('..')
        except ValueError:
            break
        if idx > 0:
            del base[idx-1:idx+1]
        else:
            break
    if not base:
        return href
    target, frag = urldefrag(href)
    target = target.split('/')
    index = 0
    for index in range(min(len(base), len(target))):
        if base[index] != target[index]:
            break
    else:
        index += 1
    relhref = (['..'] * (len(base) - index)) + target[index:]
    relhref = '/'.join(relhref)
    if frag:
        relhref = '#'.join((relhref, frag))
    return relhref