1#!/usr/local/bin/python3.8
2# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
3
4
5__license__   = 'GPL v3'
6__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
7__docformat__ = 'restructuredtext en'
8
9import io
10import os
11import re
12import shutil
13import struct
14import textwrap
15from lxml import etree, html
16
17from calibre import entity_to_unicode, guess_type, xml_entity_to_unicode
18from calibre.ebooks import DRMError, unit_convert
19from calibre.ebooks.chardet import strip_encoding_declarations
20from calibre.ebooks.compression.palmdoc import decompress_doc
21from calibre.ebooks.metadata import MetaInformation
22from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
23from calibre.ebooks.metadata.toc import TOC
24from calibre.ebooks.mobi import MobiError
25from calibre.ebooks.mobi.huffcdic import HuffReader
26from calibre.ebooks.mobi.reader.headers import BookHeader
27from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
28from calibre.utils.img import AnimatedGIF, gif_data_to_png_data, save_cover_data_to
29from calibre.utils.imghdr import what
30from calibre.utils.logging import default_log
31from polyglot.builtins import iteritems
32
33
class TopazError(ValueError):
    '''Raised when the input is an Amazon Topaz book, which cannot be read.'''
36
37
class KFXError(ValueError):
    '''Raised when the input is an Amazon KFX container, which this reader
    cannot process.'''

    def __init__(self):
        url = 'https://www.mobileread.com/forums/showthread.php?t=283371'
        msg = _(
            'This is an Amazon KFX book. It cannot be processed.'
            ' See {} for information on how to handle KFX books.'
        ).format(url)
        ValueError.__init__(self, msg)
45
46
47class MobiReader:
48    PAGE_BREAK_PAT = re.compile(
49        r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
50        re.IGNORECASE)
51    IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
52
53    def __init__(self, filename_or_stream, log=None, user_encoding=None, debug=None,
54            try_extra_data_fix=False):
55        self.log = log or default_log
56        self.debug = debug
57        self.embedded_mi = None
58        self.warned_about_trailing_entry_corruption = False
59        self.base_css_rules = textwrap.dedent('''
60                body { text-align: justify }
61
62                blockquote { margin: 0em 0em 0em 2em; }
63
64                p { margin: 0em; text-indent: 1.5em }
65
66                .bold { font-weight: bold }
67
68                .italic { font-style: italic }
69
70                .underline { text-decoration: underline }
71
72                .mbp_pagebreak {
73                    page-break-after: always; margin: 0; display: block
74                }
75                ''')
76        self.tag_css_rules = {}
77        self.left_margins = {}
78        self.text_indents = {}
79
80        if hasattr(filename_or_stream, 'read'):
81            stream = filename_or_stream
82            stream.seek(0)
83        else:
84            stream = open(filename_or_stream, 'rb')
85
86        raw = stream.read()
87        if raw.startswith(b'TPZ'):
88            raise TopazError(_('This is an Amazon Topaz book. It cannot be processed.'))
89        if raw.startswith(b'\xeaDRMION\xee'):
90            raise KFXError()
91
92        self.header   = raw[0:72]
93        self.name     = self.header[:32].replace(b'\x00', b'')
94        self.num_sections, = struct.unpack('>H', raw[76:78])
95
96        self.ident = self.header[0x3C:0x3C + 8].upper()
97        if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
98            raise MobiError('Unknown book type: %s' % repr(self.ident))
99
100        self.sections = []
101        self.section_headers = []
102        for i in range(self.num_sections):
103            offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8])
104            flags, val = a1, a2 << 16 | a3 << 8 | a4
105            self.section_headers.append((offset, flags, val))
106
107        def section(section_number):
108            if section_number == self.num_sections - 1:
109                end_off = len(raw)
110            else:
111                end_off = self.section_headers[section_number + 1][0]
112            off = self.section_headers[section_number][0]
113            return raw[off:end_off]
114
115        for i in range(self.num_sections):
116            self.sections.append((section(i), self.section_headers[i]))
117
118        self.book_header = bh = BookHeader(self.sections[0][0], self.ident,
119            user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
120        self.name = self.name.decode(self.book_header.codec, 'replace')
121        self.kf8_type = None
122        k8i = getattr(self.book_header.exth, 'kf8_header', None)
123
124        # Ancient PRC files from Baen can have random values for
125        # mobi_version, so be conservative
126        if (self.book_header.mobi_version == 8 and hasattr(self.book_header,
127            'skelidx')):
128            self.kf8_type = 'standalone'
129        elif k8i is not None:  # Check for joint mobi 6 and kf 8 file
130            try:
131                raw = self.sections[k8i-1][0]
132            except:
133                raw = None
134            if raw == b'BOUNDARY':
135                try:
136                    self.book_header = BookHeader(self.sections[k8i][0],
137                            self.ident, user_encoding, self.log)
138                    self.book_header.kf8_first_image_index = self.book_header.first_image_index + k8i
139                    self.book_header.mobi6_records = bh.records
140
141                    # Need the first_image_index from the mobi 6 header as well
142                    for x in ('first_image_index',):
143                        setattr(self.book_header, x, getattr(bh, x))
144
145                    # We need to do this because the MOBI 6 text extract code
146                    # does not know anything about the kf8 offset
147                    if hasattr(self.book_header, 'huff_offset'):
148                        self.book_header.huff_offset += k8i
149
150                    self.kf8_type = 'joint'
151                    self.kf8_boundary = k8i-1
152                except:
153                    self.book_header = bh
154
155    def check_for_drm(self):
156        if self.book_header.encryption_type != 0:
157            try:
158                name = self.book_header.exth.mi.title
159            except:
160                name = self.name
161            if not name:
162                name = self.name
163            raise DRMError(name)
164
    def extract_content(self, output_dir, parse_cache):
        '''
        Convert the decompressed MOBI markup into index.html, styles.css and
        OPF/NCX metadata inside ``output_dir``.

        :param output_dir: destination directory for index.html, the OPF/NCX
            and the images/ subdirectory
        :param parse_cache: dict mapping file paths to parsed lxml trees,
            populated here so later pipeline stages can skip re-parsing

        NOTE(review): some writes below use cwd-relative paths ('styles.css') --
        presumably the caller chdirs into output_dir first; confirm before
        calling this from a different working directory.
        '''
        output_dir = os.path.abspath(output_dir)
        self.check_for_drm()
        # Decompress the text records; sets self.mobi_html / self.processed_html
        processed_records = self.extract_text()
        if self.debug is not None:
            parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
        self.add_anchors()
        self.processed_html = self.processed_html.decode(self.book_header.codec,
            'ignore')
        # Repair common MOBI markup damage: doubled and merged closing tags
        self.processed_html = self.processed_html.replace('</</', '</')
        self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
                self.processed_html)
        # Strip BOM characters that survived decoding
        self.processed_html = self.processed_html.replace('\ufeff', '')
        # Remove tags of the form <xyz: ...> as they can cause issues further
        # along the pipeline
        self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
                self.processed_html)

        self.processed_html = strip_encoding_declarations(self.processed_html)
        # Resolve XML entities into unicode characters
        self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
            self.processed_html)
        image_name_map = self.extract_images(processed_records, output_dir)
        self.replace_page_breaks()
        self.cleanup_html()

        self.log.debug('Parsing HTML...')
        self.processed_html = clean_xml_chars(self.processed_html)
        try:
            root = html.fromstring(self.processed_html)
            # Stray control characters can make lxml recover into many <html>
            # roots; retry with them removed
            if len(root.xpath('//html')) > 5:
                root = html.fromstring(self.processed_html.replace('\x0c',
                    '').replace('\x14', ''))
        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            try:
                root = html.fromstring(self.processed_html)
            except Exception:
                self.log.warning('MOBI markup could not be parsed by lxml using html5-parser')
                # Happens on windows with python 3 where lxml causes libxml to die with an
                # error about using UCS-4 little endian encoding if certain
                # characters are present in the input
                from html5_parser import parse
                root = parse(self.processed_html, keep_doctype=False, namespace_elements=False, maybe_xhtml=False, sanitize_names=True)
        if root.xpath('descendant::p/descendant::p'):
            # Nested <p> tags mean lxml's recovery went wrong; reparse with
            # html5-parser instead
            from html5_parser import parse
            self.log.warning('Malformed markup, parsing using html5-parser')
            self.processed_html = strip_encoding_declarations(self.processed_html)
            # These trip up the html5 parser causing all content to be placed
            # under the <guide> tag
            self.processed_html = re.sub(r'<metadata>.+?</metadata>', '', self.processed_html, flags=re.I)
            self.processed_html = re.sub(r'<guide>.+?</guide>', '', self.processed_html, flags=re.I)
            try:
                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
            except Exception:
                self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
                self.processed_html = self.remove_random_bytes(self.processed_html)
                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
            if len(root.xpath('body/descendant::*')) < 1:
                # There are probably stray </html>s in the markup
                self.processed_html = self.processed_html.replace('</html>',
                        '')
                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)

        if root.tag != 'html':
            # Re-root everything under a synthetic <html><body>
            self.log.warn('File does not have opening <html> tag')
            nroot = html.fromstring('<html><head></head><body></body></html>')
            bod = nroot.find('body')
            for child in list(root):
                child.getparent().remove(child)
                bod.append(child)
            root = nroot

        htmls = list(root.xpath('//html'))

        if len(htmls) > 1:
            self.log.warn('Markup contains multiple <html> tags, merging.')
            # Merge all <head> and <body> sections
            for h in htmls:
                p = h.getparent()
                if hasattr(p, 'remove'):
                    p.remove(h)
            bodies, heads = root.xpath('//body'), root.xpath('//head')
            for x in root:
                root.remove(x)
            head, body = map(root.makeelement, ('head', 'body'))
            for h in heads:
                for x in h:
                    h.remove(x)
                    head.append(x)
            for b in bodies:
                for x in b:
                    b.remove(x)
                    body.append(x)
            root.append(head), root.append(body)
        # Scripts cannot survive the conversion pipeline
        for x in root.xpath('//script'):
            x.getparent().remove(x)

        # Ensure a <head> exists with a stylesheet link, charset meta and title
        head = root.xpath('//head')
        if head:
            head = head[0]
        else:
            head = root.makeelement('head', {})
            root.insert(0, head)
        head.text = '\n\t'
        link = head.makeelement('link', {'type':'text/css',
            'href':'styles.css', 'rel':'stylesheet'})
        head.insert(0, link)
        link.tail = '\n\t'
        title = head.xpath('descendant::title')
        m = head.makeelement('meta', {'http-equiv':'Content-Type',
            'content':'text/html; charset=utf-8'})
        head.insert(0, m)
        if not title:
            title = head.makeelement('title', {})
            try:
                title.text = self.book_header.title
            except ValueError:
                title.text = clean_ascii_chars(self.book_header.title)
            title.tail = '\n\t'
            head.insert(0, title)
            head.text = '\n\t'

        self.upshift_markup(root, image_name_map)
        guides = root.xpath('//guide')
        guide = guides[0] if guides else None
        metadata_elems = root.xpath('//metadata')
        if metadata_elems and self.book_header.exth is None:
            self.read_embedded_metadata(root, metadata_elems[0], guide)
        for elem in guides + metadata_elems:
            elem.getparent().remove(elem)
        htmlfile = os.path.join(output_dir, 'index.html')
        try:
            # Guide hrefs are bare fragments; prefix them with the html file name
            for ref in guide.xpath('descendant::reference'):
                if 'href' in ref.attrib:
                    ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
        except AttributeError:
            # guide is None when the book has no <guide> element
            pass

        def write_as_utf8(path, data):
            # Write str data as UTF-8 bytes; lopen is calibre's builtin
            # platform-safe open()
            if isinstance(data, str):
                data = data.encode('utf-8')
            with lopen(path, 'wb') as f:
                f.write(data)

        parse_cache[htmlfile] = root
        self.htmlfile = htmlfile
        ncx = io.BytesIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
        opf.render(lopen(self.created_opf_path, 'wb'), ncx,
            ncx_manifest_entry=ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
            write_as_utf8(ncx_path, ncx)

        css = [self.base_css_rules, '\n\n']
        for cls, rule in self.tag_css_rules.items():
            css.append('.%s { %s }\n\n' % (cls, rule))
        # NOTE(review): cwd-relative path -- assumes cwd == output_dir; confirm
        write_as_utf8('styles.css', ''.join(css))

        if self.book_header.exth is not None or self.embedded_mi is not None:
            # NOTE(review): this re-renders the OPF/NCX written above to the
            # same paths -- looks redundant; confirm whether both renders are
            # intentional before touching this
            self.log.debug('Creating OPF...')
            ncx = io.BytesIO()
            opf, ncx_manifest_entry  = self.create_opf(htmlfile, guide, root)
            opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
                ncx_manifest_entry)
            ncx = ncx.getvalue()
            if ncx:
                write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)
336
337    def read_embedded_metadata(self, root, elem, guide):
338        raw = b'<?xml version="1.0" encoding="utf-8" ?>\n<package>' + \
339                html.tostring(elem, encoding='utf-8') + b'</package>'
340        stream = io.BytesIO(raw)
341        opf = OPF(stream)
342        self.embedded_mi = opf.to_book_metadata()
343        if guide is not None:
344            for ref in guide.xpath('descendant::reference'):
345                if 'cover' in ref.get('type', '').lower():
346                    href = ref.get('href', '')
347                    if href.startswith('#'):
348                        href = href[1:]
349                    anchors = root.xpath('//*[@id="%s"]' % href)
350                    if anchors:
351                        cpos = anchors[0]
352                        reached = False
353                        for elem in root.iter():
354                            if elem is cpos:
355                                reached = True
356                            if reached and elem.tag == 'img':
357                                cover = elem.get('src', None)
358                                self.embedded_mi.cover = cover
359                                elem.getparent().remove(elem)
360                                break
361                    break
362
363    def cleanup_html(self):
364        self.log.debug('Cleaning up HTML...')
365        self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
366        if self.book_header.ancient and b'<html' not in self.mobi_html[:300].lower():
367            self.processed_html = '<html><p>' + self.processed_html.replace('\n\n', '<p>') + '</html>'
368        self.processed_html = self.processed_html.replace('\r\n', '\n')
369        self.processed_html = self.processed_html.replace('> <', '>\n<')
370        self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
371        self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
372        self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', self.processed_html)
373        # Swap inline and block level elements, and order block level elements according to priority
374        # - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
375        self.processed_html = re.sub(
376            r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', r'\g<para>'+r'\g<styletags>', self.processed_html)
377        self.processed_html = re.sub(
378            r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', r'\g<styletags>'+r'\g<para>', self.processed_html)
379        self.processed_html = re.sub(
380            r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', r'\g<para>'+r'\g<blockquote>', self.processed_html)
381        self.processed_html = re.sub(
382            r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', r'\g<blockquote>'+r'\g<para>', self.processed_html)
383        bods = htmls = 0
384        for x in re.finditer('</body>|</html>', self.processed_html):
385            if x == '</body>':
386                bods +=1
387            else:
388                htmls += 1
389            if bods > 1 and htmls > 1:
390                break
391        if bods > 1:
392            self.processed_html = self.processed_html.replace('</body>', '')
393        if htmls > 1:
394            self.processed_html = self.processed_html.replace('</html>', '')
395
396    def remove_random_bytes(self, html):
397        return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01|\x02|\x03|\x04|\x05|\x06|\x07',
398                    '', html)
399
400    def ensure_unit(self, raw, unit='px'):
401        if re.search(r'\d+$', raw) is not None:
402            raw += unit
403        return raw
404
    def upshift_markup(self, root, image_name_map=None):
        '''
        Rewrite MOBI-specific markup in the lxml tree *root* into plain
        (X)HTML + CSS classes: presentational attributes (height, width,
        align, color, ...) become rules in ``self.tag_css_rules``, legacy tags
        (<font>, <i>, <b>, <u>, office smart tags) are converted, image
        recindex attributes become src paths, and filepos anchors/links are
        normalized.

        :param root: parsed lxml tree, mutated in place
        :param image_name_map: maps image record numbers to extracted file
            names; defaults to '%05d.jpg' naming
        '''
        self.log.debug('Converting style information to CSS...')
        image_name_map = image_name_map or {}
        # Legacy HTML font size keywords -> <font size=N> values
        size_map = {
            'xx-small': '0.5',
            'x-small': '1',
            'small': '2',
            'medium': '3',
            'large': '4',
            'x-large': '5',
            'xx-large': '6',
            }

        def barename(x):
            # Strip any namespace prefix ('mbp:pagebreak' -> 'pagebreak')
            return x.rpartition(':')[-1]

        mobi_version = self.book_header.mobi_version
        # Inline NCX elements are handled elsewhere; drop them from the markup
        for x in root.xpath('//ncx'):
            x.getparent().remove(x)
        svg_tags = []
        forwardable_anchors = []
        pagebreak_anchors = []
        BLOCK_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'p'}
        for i, tag in enumerate(root.iter(etree.Element)):
            tag.attrib.pop('xmlns', '')
            # NOTE(review): deleting from tag.attrib while iterating it --
            # appears to work with lxml's attrib proxy, but confirm before
            # changing the iteration
            for x in tag.attrib:
                if ':' in x:
                    del tag.attrib[x]
            if tag.tag and barename(tag.tag) == 'svg':
                svg_tags.append(tag)
            # Microsoft Office smart tags and MOBI content/form wrappers:
            # neutralize into span/div with no attributes
            if tag.tag and barename(tag.tag.lower()) in \
                ('country-region', 'place', 'placetype', 'placename',
                    'state', 'city', 'street', 'address', 'content', 'form'):
                tag.tag = 'div' if tag.tag in ('content', 'form') else 'span'
                for key in tag.attrib.keys():
                    tag.attrib.pop(key)
                continue
            styles, attrib = [], tag.attrib
            if 'style' in attrib:
                style = attrib.pop('style').strip()
                if style:
                    styles.append(style)
            if 'height' in attrib:
                height = attrib.pop('height').strip()
                if (
                        height and '<' not in height and '>' not in height and
                        re.search(r'\d+', height)):
                    if tag.tag in ('table', 'td', 'tr'):
                        pass
                    elif tag.tag == 'img':
                        tag.set('height', height)
                    else:
                        if tag.tag == 'div' and not tag.text and \
                                (not tag.tail or not tag.tail.strip()) and \
                                not len(list(tag.iterdescendants())):
                            # Paragraph spacer
                            # Insert nbsp so that the element is never
                            # discarded by a renderer
                            tag.text = '\u00a0'  # nbsp
                            styles.append('height: %s' %
                                    self.ensure_unit(height))
                        else:
                            styles.append('margin-top: %s' % self.ensure_unit(height))
            if 'width' in attrib:
                width = attrib.pop('width').strip()
                if width and re.search(r'\d+', width):
                    if tag.tag in ('table', 'td', 'tr'):
                        pass
                    elif tag.tag == 'img':
                        tag.set('width', width)
                    else:
                        # MOBI uses width as text-indent on non-image tags
                        ewidth = self.ensure_unit(width)
                        styles.append('text-indent: %s' % ewidth)
                        try:
                            # Remember the numeric indent for TOC level detection
                            ewidth_val = unit_convert(ewidth, 12, 500, 166)
                            self.text_indents[tag] = ewidth_val
                        except:
                            pass
                        if width.startswith('-'):
                            # Negative indent: compensate with a left margin
                            styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
                            try:
                                ewidth_val = unit_convert(ewidth[1:], 12, 500, 166)
                                self.left_margins[tag] = ewidth_val
                            except:
                                pass

            if 'align' in attrib:
                align = attrib.pop('align').strip()
                if align:
                    align = align.lower()
                    if align == 'baseline':
                        styles.append('vertical-align: '+align)
                    else:
                        styles.append('text-align: %s' % align)
            if tag.tag == 'hr':
                if mobi_version == 1:
                    # MOBI 1 uses <hr> as a page break marker
                    tag.tag = 'div'
                    styles.append('page-break-before: always')
                    styles.append('display: block')
                    styles.append('margin: 0')
            elif tag.tag == 'i':
                tag.tag = 'span'
                tag.attrib['class'] = 'italic'
            elif tag.tag == 'u':
                tag.tag = 'span'
                tag.attrib['class'] = 'underline'
            elif tag.tag == 'b':
                tag.tag = 'span'
                tag.attrib['class'] = 'bold'
            elif tag.tag == 'font':
                sz = tag.get('size', '').lower()
                try:
                    float(sz)
                except ValueError:
                    # Keyword size ('small', 'x-large', ...) -> numeric value
                    if sz in list(size_map.keys()):
                        attrib['size'] = size_map[sz]
            elif tag.tag == 'img':
                # Map the MOBI record index attributes onto a src path
                recindex = None
                for attr in self.IMAGE_ATTRS:
                    recindex = attrib.pop(attr, None) or recindex
                if recindex is not None:
                    try:
                        recindex = int(recindex)
                    except Exception:
                        pass
                    else:
                        attrib['src'] = 'images/' + image_name_map.get(recindex, '%05d.jpg' % recindex)
                for attr in ('width', 'height'):
                    if attr in attrib:
                        val = attrib[attr]
                        if val.lower().endswith('em'):
                            try:
                                nval = float(val[:-2])
                                nval *= 16 * (168.451/72)  # Assume this was set using the Kindle profile
                                attrib[attr] = "%dpx"%int(nval)
                            except:
                                del attrib[attr]
                        elif val.lower().endswith('%'):
                            # Percentage sizes are meaningless here; drop them
                            del attrib[attr]
            elif tag.tag == 'pre':
                if not tag.text:
                    tag.tag = 'div'

            if (attrib.get('class', None) == 'mbp_pagebreak' and tag.tag ==
                    'div' and 'filepos-id' in attrib):
                pagebreak_anchors.append(tag)

            if 'color' in attrib:
                styles.append('color: ' + attrib.pop('color'))
            if 'bgcolor' in attrib:
                styles.append('background-color: ' + attrib.pop('bgcolor'))

            if 'filepos-id' in attrib:
                attrib['id'] = attrib.pop('filepos-id')
                if 'name' in attrib and attrib['name'] != attrib['id']:
                    attrib['name'] = attrib['id']
            if 'filepos' in attrib:
                # Internal links carry byte offsets; turn them into fragments
                filepos = attrib.pop('filepos')
                try:
                    attrib['href'] = "#filepos%d" % int(filepos)
                except ValueError:
                    pass
            if (tag.tag == 'a' and attrib.get('id', '').startswith('filepos') and
                    not tag.text and len(tag) == 0 and (tag.tail is None or not
                        tag.tail.strip()) and getattr(tag.getnext(), 'tag',
                            None) in BLOCK_TAGS):
                # This is an empty anchor immediately before a block tag, move
                # the id onto the block tag instead
                forwardable_anchors.append(tag)

            if styles:
                # Deduplicate identical rules into a single shared class
                ncls = None
                rule = '; '.join(styles)
                for sel, srule in self.tag_css_rules.items():
                    if srule == rule:
                        ncls = sel
                        break
                if ncls is None:
                    ncls = 'calibre_%d' % i
                    self.tag_css_rules[ncls] = rule
                cls = attrib.get('class', '')
                cls = cls + (' ' if cls else '') + ncls
                attrib['class'] = cls

        # Replace <svg> wrappers with the images they contain
        for tag in svg_tags:
            images = tag.xpath('descendant::img[@src]')
            parent = tag.getparent()

            if images and hasattr(parent, 'find'):
                index = parent.index(tag)
                for img in images:
                    img.getparent().remove(img)
                    img.tail = img.text = None
                    parent.insert(index, img)

            if hasattr(parent, 'remove'):
                parent.remove(tag)

        # Pagebreak divs cannot carry anchor ids reliably; move the id onto a
        # fresh <a> right after the pagebreak
        for tag in pagebreak_anchors:
            anchor = tag.attrib['id']
            del tag.attrib['id']
            if 'name' in tag.attrib:
                del tag.attrib['name']
            p = tag.getparent()
            a = p.makeelement('a')
            a.attrib['id'] = anchor
            p.insert(p.index(tag)+1, a)
            if getattr(a.getnext(), 'tag', None) in BLOCK_TAGS:
                forwardable_anchors.append(a)

        # Move empty filepos anchors onto the following block tag (or inside
        # it when the block already has an id)
        for tag in forwardable_anchors:
            block = tag.getnext()
            tag.getparent().remove(tag)

            if 'id' in block.attrib:
                tag.tail = block.text
                block.text = None
                block.insert(0, tag)
            else:
                block.attrib['id'] = tag.attrib['id']

        # WebKit fails to navigate to anchors located on <br> tags
        # NOTE(review): '/body/br[@id]' is an absolute path from the root
        # element -- with an <html> root it matches nothing; possibly meant
        # '//br[@id]'. Confirm before changing.
        for br in root.xpath('/body/br[@id]'):
            br.tag = 'div'
629
630    def get_left_whitespace(self, tag):
631
632        def whitespace(tag):
633            lm = ti = 0.0
634            if tag.tag == 'p':
635                ti = unit_convert('1.5em', 12, 500, 166)
636            if tag.tag == 'blockquote':
637                lm = unit_convert('2em', 12, 500, 166)
638            lm = self.left_margins.get(tag, lm)
639            ti = self.text_indents.get(tag, ti)
640            try:
641                lm = float(lm)
642            except:
643                lm = 0.0
644            try:
645                ti = float(ti)
646            except:
647                ti = 0.0
648            return lm + ti
649
650        parent = tag
651        ans = 0.0
652        while parent is not None:
653            ans += whitespace(parent)
654            parent = parent.getparent()
655
656        return ans
657
    def create_opf(self, htmlfile, guide=None, root=None):
        '''
        Build an OPFCreator (metadata, cover, manifest, spine, guide, TOC)
        for the extracted book.

        :param htmlfile: absolute path of the extracted index.html
        :param guide: the lxml <guide> element from the markup, or None
        :param root: the parsed markup tree; needed to locate TOC anchors
        :return: (opf, ncx_manifest_entry) where ncx_manifest_entry is
            'toc.ncx' when a TOC was found, else None

        NOTE(review): several paths below ('styles.css', 'images/...') are
        cwd-relative -- presumably cwd == dirname(htmlfile); confirm.
        '''
        # Prefer EXTH metadata, then metadata embedded in the markup
        mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
        if mi is None:
            mi = MetaInformation(self.book_header.title, [_('Unknown')])
        opf = OPFCreator(os.path.dirname(htmlfile), mi)
        if hasattr(self.book_header.exth, 'cover_offset'):
            # cover_offset is relative to the first image record
            opf.cover = 'images/%05d.jpg' % (self.book_header.exth.cover_offset + 1)
        elif mi.cover is not None:
            opf.cover = mi.cover
        else:
            # Fall back to the first extracted image, if it exists on disk
            opf.cover = 'images/%05d.jpg' % 1
            if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
                * opf.cover.split('/'))):
                opf.cover = None

        # Copy the cover to a stable name so later stages can find it
        cover = opf.cover
        cover_copied = None
        if cover is not None:
            cover = cover.replace('/', os.sep)
            if os.path.exists(cover):
                ncover = 'images'+os.sep+'calibre_cover.jpg'
                if os.path.exists(ncover):
                    os.remove(ncover)
                shutil.copyfile(cover, ncover)
                cover_copied = os.path.abspath(ncover)
                opf.cover = ncover.replace(os.sep, '/')

        manifest = [(htmlfile, 'application/xhtml+xml'),
            (os.path.abspath('styles.css'), 'text/css')]
        bp = os.path.dirname(htmlfile)
        added = set()
        for i in getattr(self, 'image_names', []):
            path = os.path.join(bp, 'images', i)
            added.add(path)
            manifest.append((path, guess_type(path)[0] or 'image/jpeg'))
        if cover_copied is not None:
            manifest.append((cover_copied, 'image/jpeg'))

        opf.create_manifest(manifest)
        opf.create_spine([os.path.basename(htmlfile)])
        toc = None
        if guide is not None:
            opf.create_guide(guide)
            for ref in opf.guide:
                if ref.type.lower() == 'toc':
                    toc = ref.href()

        ncx_manifest_entry = None
        if toc:
            # Build a flat TOC from the links that follow the toc anchor in
            # document order; structure_toc() adds nesting later
            ncx_manifest_entry = 'toc.ncx'
            elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1])
            tocobj = None
            ent_pat = re.compile(r'&(\S+?);')
            if elems:
                tocobj = TOC()
                found = False
                reached = False
                for x in root.iter():
                    if x == elems[-1]:
                        reached = True
                        continue
                    if reached and x.tag == 'a':
                        href = x.get('href', '')
                        # Only internal (fragment) links belong in the TOC
                        if href and re.match(r'\w+://', href) is None:
                            try:
                                text = ' '.join([t.strip() for t in
                                    x.xpath('descendant::text()')])
                            except:
                                text = ''
                            text = ent_pat.sub(entity_to_unicode, text)
                            item = tocobj.add_item(toc.partition('#')[0], href[1:],
                                text)
                            # Visual indentation, used to infer nesting levels
                            item.left_space = int(self.get_left_whitespace(x))
                            found = True
                    # A pagebreak after at least one entry ends the TOC section
                    if reached and found and x.get('class', None) == 'mbp_pagebreak':
                        break
            if tocobj is not None:
                tocobj = self.structure_toc(tocobj)
                opf.set_toc(tocobj)

        return opf, ncx_manifest_entry
739
740    def structure_toc(self, toc):
741        indent_vals = set()
742        for item in toc:
743            indent_vals.add(item.left_space)
744        if len(indent_vals) > 6 or len(indent_vals) < 2:
745            # Too many or too few levels, give up
746            return toc
747        indent_vals = sorted(indent_vals)
748
749        last_found = [None for i in indent_vals]
750
751        newtoc = TOC()
752
753        def find_parent(level):
754            candidates = last_found[:level]
755            for x in reversed(candidates):
756                if x is not None:
757                    return x
758            return newtoc
759
760        for item in toc:
761            level = indent_vals.index(item.left_space)
762            parent = find_parent(level)
763            last_found[level] = parent.add_item(item.href, item.fragment,
764                        item.text)
765
766        return newtoc
767
768    def sizeof_trailing_entries(self, data):
769        def sizeof_trailing_entry(ptr, psize):
770            bitpos, result = 0, 0
771            while True:
772                v = ord(ptr[psize-1:psize])
773                result |= (v & 0x7F) << bitpos
774                bitpos += 7
775                psize -= 1
776                if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0):
777                    return result
778
779        num = 0
780        size = len(data)
781        flags = self.book_header.extra_flags >> 1
782        while flags:
783            if flags & 1:
784                try:
785                    num += sizeof_trailing_entry(data, size - num)
786                except IndexError:
787                    self.warn_about_trailing_entry_corruption()
788                    return 0
789            flags >>= 1
790        if self.book_header.extra_flags & 1:
791            off = size - num - 1
792            num += (ord(data[off:off+1]) & 0x3) + 1
793        return num
794
795    def warn_about_trailing_entry_corruption(self):
796        if not self.warned_about_trailing_entry_corruption:
797            self.warned_about_trailing_entry_corruption = True
798            self.log.warn('The trailing data entries in this MOBI file are corrupted, you might see corrupted text in the output')
799
800    def text_section(self, index):
801        data = self.sections[index][0]
802        trail_size = self.sizeof_trailing_entries(data)
803        return data[:len(data)-trail_size]
804
805    def extract_text(self, offset=1):
806        self.log.debug('Extracting text...')
807        text_sections = [self.text_section(i) for i in range(offset,
808            min(self.book_header.records + offset, len(self.sections)))]
809        processed_records = list(range(offset-1, self.book_header.records +
810            offset))
811
812        self.mobi_html = b''
813
814        if self.book_header.compression_type == b'DH':
815            huffs = [self.sections[i][0] for i in
816                range(self.book_header.huff_offset,
817                    self.book_header.huff_offset + self.book_header.huff_number)]
818            processed_records += list(range(self.book_header.huff_offset,
819                self.book_header.huff_offset + self.book_header.huff_number))
820            huff = HuffReader(huffs)
821            unpack = huff.unpack
822
823        elif self.book_header.compression_type == b'\x00\x02':
824            unpack = decompress_doc
825
826        elif self.book_header.compression_type == b'\x00\x01':
827            unpack = lambda x: x
828        else:
829            raise MobiError('Unknown compression algorithm: %r' % self.book_header.compression_type)
830        self.mobi_html = b''.join(map(unpack, text_sections))
831        if self.mobi_html.endswith(b'#'):
832            self.mobi_html = self.mobi_html[:-1]
833
834        if self.book_header.ancient and b'<html' not in self.mobi_html[:300].lower():
835            self.mobi_html = self.mobi_html.replace(b'\r ', b'\n\n ')
836        self.mobi_html = self.mobi_html.replace(b'\0', b'')
837        if self.book_header.codec == 'cp1252':
838            self.mobi_html = self.mobi_html.replace(b'\x1e', b'')  # record separator
839            self.mobi_html = self.mobi_html.replace(b'\x02', b'')  # start of text
840        return processed_records
841
842    def replace_page_breaks(self):
843        self.processed_html = self.PAGE_BREAK_PAT.sub(
844            r'<div \1 class="mbp_pagebreak" />',
845            self.processed_html)
846
847    def add_anchors(self):
848        self.log.debug('Adding anchors...')
849        positions = set()
850        link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
851            re.IGNORECASE)
852        for match in link_pattern.finditer(self.mobi_html):
853            positions.add(int(match.group(1)))
854        pos = 0
855        processed_html = []
856        end_tag_re = re.compile(br'<\s*/')
857        for end in sorted(positions):
858            if end == 0:
859                continue
860            oend = end
861            l = self.mobi_html.find(b'<', end)
862            r = self.mobi_html.find(b'>', end)
863            anchor = b'<a id="filepos%d"></a>'
864            if r > -1 and (r < l or l == end or l == -1):
865                p = self.mobi_html.rfind(b'<', 0, end + 1)
866                if (pos < end and p > -1 and not end_tag_re.match(self.mobi_html[p:r]) and
867                        not self.mobi_html[p:r + 1].endswith(b'/>')):
868                    anchor = b' filepos-id="filepos%d"'
869                    end = r
870                else:
871                    end = r + 1
872            processed_html.append(self.mobi_html[pos:end] + (anchor % oend))
873            pos = end
874        processed_html.append(self.mobi_html[pos:])
875        processed_html = b''.join(processed_html)
876
877        # Remove anchors placed inside entities
878        self.processed_html = re.sub(br'&([^;]*?)(<a id="filepos\d+"></a>)([^;]*);',
879                br'&\1\3;\2', processed_html)
880
881    def extract_images(self, processed_records, output_dir):
882        self.log.debug('Extracting images...')
883        output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
884        if not os.path.exists(output_dir):
885            os.makedirs(output_dir)
886        image_index = 0
887        self.image_names = []
888        image_name_map = {}
889        start = getattr(self.book_header, 'first_image_index', -1)
890        if start > self.num_sections or start < 0:
891            # BAEN PRC files have bad headers
892            start = 0
893        for i in range(start, self.num_sections):
894            if i in processed_records:
895                continue
896            processed_records.append(i)
897            data  = self.sections[i][0]
898            image_index += 1
899            if data[:4] in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
900                    b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}:
901                # This record is a known non image type, no need to try to
902                # load the image
903                continue
904
905            try:
906                imgfmt = what(None, data)
907            except Exception:
908                continue
909            if imgfmt not in {'jpg', 'jpeg', 'gif', 'png', 'bmp'}:
910                continue
911            if imgfmt == 'jpeg':
912                imgfmt = 'jpg'
913            if imgfmt == 'gif':
914                try:
915                    data = gif_data_to_png_data(data)
916                    imgfmt = 'png'
917                except AnimatedGIF:
918                    pass
919            path = os.path.join(output_dir, '%05d.%s' % (image_index, imgfmt))
920            image_name_map[image_index] = os.path.basename(path)
921            if imgfmt == 'png':
922                with open(path, 'wb') as f:
923                    f.write(data)
924            else:
925                try:
926                    save_cover_data_to(data, path, minify_to=(10000, 10000))
927                except Exception:
928                    continue
929            self.image_names.append(os.path.basename(path))
930        return image_name_map
931
932
def test_mbp_regex():
    # Table of raw input -> expected text once pagebreak tags are stripped
    # (the substitution keeps only the captured attribute group).
    cases = {
        '<mbp:pagebreak></mbp:pagebreak>': '',
        '<mbp:pagebreak xxx></mbp:pagebreak>yyy': ' xxxyyy',
        '<mbp:pagebreak> </mbp:pagebreak>': '',
        '<mbp:pagebreak>xxx': 'xxx',
        '<mbp:pagebreak/>xxx': 'xxx',
        '<mbp:pagebreak sdf/ >xxx': ' sdfxxx',
        '<mbp:pagebreak / >': ' ',
        '</mbp:pagebreak>': '',
        '</mbp:pagebreak sdf>': ' sdf',
        '</mbp:pagebreak><mbp:pagebreak></mbp:pagebreak>xxx': 'xxx',
    }
    for raw, expected in iteritems(cases):
        got = MobiReader.PAGE_BREAK_PAT.sub(r'\1', raw)
        if got != expected:
            raise Exception('%r != %r for %r'%(got, expected, raw))
949