#!/usr/local/bin/python3.8
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai


__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import io
import os
import re
import shutil
import struct
import textwrap
from lxml import etree, html

from calibre import entity_to_unicode, guess_type, xml_entity_to_unicode
from calibre.ebooks import DRMError, unit_convert
from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPF, OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.huffcdic import HuffReader
from calibre.ebooks.mobi.reader.headers import BookHeader
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
from calibre.utils.img import AnimatedGIF, gif_data_to_png_data, save_cover_data_to
from calibre.utils.imghdr import what
from calibre.utils.logging import default_log
from polyglot.builtins import iteritems


class TopazError(ValueError):
    pass


class KFXError(ValueError):

    def __init__(self):
        ValueError.__init__(self, _(
            'This is an Amazon KFX book. It cannot be processed.'
            ' See {} for information on how to handle KFX books.'
        ).format('https://www.mobileread.com/forums/showthread.php?t=283371'))


class MobiReader:
    '''Parse a MOBI 6 (PalmDoc) file and extract its HTML, images and metadata.'''

    PAGE_BREAK_PAT = re.compile(
        r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
        re.IGNORECASE)
    IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')

    def __init__(self, filename_or_stream, log=None, user_encoding=None, debug=None,
            try_extra_data_fix=False):
        '''
        Read the PDB container and parse the MOBI header.

        :param filename_or_stream: path to a MOBI file or a seekable binary stream
        :param log: optional calibre Log object (defaults to default_log)
        :param user_encoding: override for the text codec declared in the header
        :param try_extra_data_fix: passed through to BookHeader to work around
            files with bad extra-data flags
        :raises TopazError: for Amazon Topaz containers
        :raises KFXError: for Amazon KFX containers
        :raises MobiError: if the PDB type is not BOOKMOBI/TEXTREAD
        '''
        self.log = log or default_log
        self.debug = debug
        self.embedded_mi = None
        self.warned_about_trailing_entry_corruption = False
        self.base_css_rules = textwrap.dedent('''
                body { text-align: justify }

                blockquote { margin: 0em 0em 0em 2em; }

                p { margin: 0em; text-indent: 1.5em }

                .bold { font-weight: bold }

                .italic { font-style: italic }

                .underline { text-decoration: underline }

                .mbp_pagebreak {
                    page-break-after: always; margin: 0; display: block
                }
                ''')
        self.tag_css_rules = {}
        self.left_margins = {}
        self.text_indents = {}

        if hasattr(filename_or_stream, 'read'):
            stream = filename_or_stream
            stream.seek(0)
        else:
            stream = open(filename_or_stream, 'rb')

        raw = stream.read()
        # Reject containers this reader cannot handle before touching the header
        if raw.startswith(b'TPZ'):
            raise TopazError(_('This is an Amazon Topaz book. It cannot be processed.'))
        if raw.startswith(b'\xeaDRMION\xee'):
            raise KFXError()

        # PDB header: 32-byte name, type/creator at 0x3C, section count at 76
        self.header = raw[0:72]
        self.name = self.header[:32].replace(b'\x00', b'')
        self.num_sections, = struct.unpack('>H', raw[76:78])

        self.ident = self.header[0x3C:0x3C + 8].upper()
        if self.ident not in (b'BOOKMOBI', b'TEXTREAD'):
            raise MobiError('Unknown book type: %s' % repr(self.ident))

        # Section table: 8 bytes per entry (offset, flags, 24-bit value)
        self.sections = []
        self.section_headers = []
        for i in range(self.num_sections):
            offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8])
            flags, val = a1, a2 << 16 | a3 << 8 | a4
            self.section_headers.append((offset, flags, val))

        def section(section_number):
            # A section runs from its offset to the next section's offset
            # (or end of file for the last one)
            if section_number == self.num_sections - 1:
                end_off = len(raw)
            else:
                end_off = self.section_headers[section_number + 1][0]
            off = self.section_headers[section_number][0]
            return raw[off:end_off]

        for i in range(self.num_sections):
            self.sections.append((section(i), self.section_headers[i]))

        self.book_header = bh = BookHeader(self.sections[0][0], self.ident,
                user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
        self.name = self.name.decode(self.book_header.codec, 'replace')
        self.kf8_type = None
        k8i = getattr(self.book_header.exth, 'kf8_header', None)

        # Ancient PRC files from Baen can have random values for
        # mobi_version, so be conservative
        if (self.book_header.mobi_version == 8 and hasattr(self.book_header,
            'skelidx')):
            self.kf8_type = 'standalone'
        elif k8i is not None:  # Check for joint mobi 6 and kf 8 file
            try:
                raw = self.sections[k8i-1][0]
            except Exception:
                raw = None
            if raw == b'BOUNDARY':
                try:
                    self.book_header = BookHeader(self.sections[k8i][0],
                            self.ident, user_encoding, self.log)
                    self.book_header.kf8_first_image_index = self.book_header.first_image_index + k8i
                    self.book_header.mobi6_records = bh.records

                    # Need the first_image_index from the mobi 6 header as well
                    for x in ('first_image_index',):
                        setattr(self.book_header, x, getattr(bh, x))

                    # We need to do this because the MOBI 6 text extract code
                    # does not know anything about the kf8 offset
                    if hasattr(self.book_header, 'huff_offset'):
                        self.book_header.huff_offset += k8i

                    self.kf8_type = 'joint'
                    self.kf8_boundary = k8i-1
                except Exception:
                    # KF8 part unreadable: fall back to the MOBI 6 header
                    self.book_header = bh

    def check_for_drm(self):
        '''Raise :class:`DRMError` (with the best title we can find) if the book is encrypted.'''
        if self.book_header.encryption_type != 0:
            try:
                name = self.book_header.exth.mi.title
            except Exception:
                name = self.name
            if not name:
                name = self.name
            raise DRMError(name)

    def extract_content(self, output_dir, parse_cache):
        '''
        Extract text, images and metadata into output_dir.

        Writes index.html, styles.css, images/ and, when metadata is present,
        an OPF (and possibly NCX). The parsed lxml tree is stored in
        parse_cache keyed by the html file path.
        NOTE(review): styles.css is written via a relative path, so the caller
        is expected to have chdir'ed into output_dir — confirm at call sites.
        '''
        output_dir = os.path.abspath(output_dir)
        self.check_for_drm()
        processed_records = self.extract_text()
        if self.debug is not None:
            parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
        self.add_anchors()
        self.processed_html = self.processed_html.decode(self.book_header.codec,
            'ignore')
        self.processed_html = self.processed_html.replace('</</', '</')
        self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
                self.processed_html)
        self.processed_html = self.processed_html.replace('\ufeff', '')
        # Remove tags of the form <xyz: ...> as they can cause issues further
        # along the pipeline
        self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
                self.processed_html)

        self.processed_html = strip_encoding_declarations(self.processed_html)
        self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
            self.processed_html)
        image_name_map = self.extract_images(processed_records, output_dir)
        self.replace_page_breaks()
        self.cleanup_html()

        self.log.debug('Parsing HTML...')
        self.processed_html = clean_xml_chars(self.processed_html)
        try:
            root = html.fromstring(self.processed_html)
            if len(root.xpath('//html')) > 5:
                root = html.fromstring(self.processed_html.replace('\x0c',
                    '').replace('\x14', ''))
        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            try:
                root = html.fromstring(self.processed_html)
            except Exception:
                self.log.warning('MOBI markup could not be parsed by lxml using html5-parser')
                # Happens on windows with python 3 where lxml causes libxml to die with an
                # error about using UCS-4 little endian encoding if certain
                # characters are present in the input
                from html5_parser import parse
                root = parse(self.processed_html, keep_doctype=False, namespace_elements=False, maybe_xhtml=False, sanitize_names=True)
        if root.xpath('descendant::p/descendant::p'):
            from html5_parser import parse
            self.log.warning('Malformed markup, parsing using html5-parser')
            self.processed_html = strip_encoding_declarations(self.processed_html)
            # These trip up the html5 parser causing all content to be placed
            # under the <guide> tag
            self.processed_html = re.sub(r'<metadata>.+?</metadata>', '', self.processed_html, flags=re.I)
            self.processed_html = re.sub(r'<guide>.+?</guide>', '', self.processed_html, flags=re.I)
            try:
                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
            except Exception:
                self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
                self.processed_html = self.remove_random_bytes(self.processed_html)
                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
            if len(root.xpath('body/descendant::*')) < 1:
                # There are probably stray </html>s in the markup
                self.processed_html = self.processed_html.replace('</html>',
                        '')
                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)

        if root.tag != 'html':
            self.log.warn('File does not have opening <html> tag')
            nroot = html.fromstring('<html><head></head><body></body></html>')
            bod = nroot.find('body')
            for child in list(root):
                child.getparent().remove(child)
                bod.append(child)
            root = nroot

        htmls = list(root.xpath('//html'))

        if len(htmls) > 1:
            self.log.warn('Markup contains multiple <html> tags, merging.')
            # Merge all <head> and <body> sections
            for h in htmls:
                p = h.getparent()
                if hasattr(p, 'remove'):
                    p.remove(h)
            bodies, heads = root.xpath('//body'), root.xpath('//head')
            # Iterate over copies: removing children while iterating an lxml
            # element skips alternate children
            for x in list(root):
                root.remove(x)
            head, body = map(root.makeelement, ('head', 'body'))
            for h in heads:
                for x in list(h):
                    h.remove(x)
                    head.append(x)
            for b in bodies:
                for x in list(b):
                    b.remove(x)
                    body.append(x)
            root.append(head), root.append(body)
        for x in root.xpath('//script'):
            x.getparent().remove(x)

        head = root.xpath('//head')
        if head:
            head = head[0]
        else:
            head = root.makeelement('head', {})
            root.insert(0, head)
        head.text = '\n\t'
        link = head.makeelement('link', {'type':'text/css',
            'href':'styles.css', 'rel':'stylesheet'})
        head.insert(0, link)
        link.tail = '\n\t'
        title = head.xpath('descendant::title')
        m = head.makeelement('meta', {'http-equiv':'Content-Type',
            'content':'text/html; charset=utf-8'})
        head.insert(0, m)
        if not title:
            title = head.makeelement('title', {})
            try:
                title.text = self.book_header.title
            except ValueError:
                title.text = clean_ascii_chars(self.book_header.title)
            title.tail = '\n\t'
            head.insert(0, title)
            head.text = '\n\t'

        self.upshift_markup(root, image_name_map)
        guides = root.xpath('//guide')
        guide = guides[0] if guides else None
        metadata_elems = root.xpath('//metadata')
        if metadata_elems and self.book_header.exth is None:
            self.read_embedded_metadata(root, metadata_elems[0], guide)
        for elem in guides + metadata_elems:
            elem.getparent().remove(elem)
        htmlfile = os.path.join(output_dir, 'index.html')
        try:
            for ref in guide.xpath('descendant::reference'):
                if 'href' in ref.attrib:
                    ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
        except AttributeError:
            # guide is None
            pass

        def write_as_utf8(path, data):
            # NOTE(review): lopen is calibre's long-path-aware open, injected
            # as a builtin — confirm availability in this deployment
            if isinstance(data, str):
                data = data.encode('utf-8')
            with lopen(path, 'wb') as f:
                f.write(data)

        parse_cache[htmlfile] = root
        self.htmlfile = htmlfile
        ncx = io.BytesIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
        opf.render(lopen(self.created_opf_path, 'wb'), ncx,
            ncx_manifest_entry=ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
            write_as_utf8(ncx_path, ncx)

        css = [self.base_css_rules, '\n\n']
        for cls, rule in self.tag_css_rules.items():
            css.append('.%s { %s }\n\n' % (cls, rule))
        write_as_utf8('styles.css', ''.join(css))

        if self.book_header.exth is not None or self.embedded_mi is not None:
            self.log.debug('Creating OPF...')
            ncx = io.BytesIO()
            opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
            opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
                ncx_manifest_entry)
            ncx = ncx.getvalue()
            if ncx:
                write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)

    def read_embedded_metadata(self, root, elem, guide):
        '''Parse a <metadata> element embedded in the markup into self.embedded_mi,
        pulling the cover image out of the guide's cover reference if present.'''
        raw = b'<?xml version="1.0" encoding="utf-8" ?>\n<package>' + \
            html.tostring(elem, encoding='utf-8') + b'</package>'
        stream = io.BytesIO(raw)
        opf = OPF(stream)
        self.embedded_mi = opf.to_book_metadata()
        if guide is not None:
            for ref in guide.xpath('descendant::reference'):
                if 'cover' in ref.get('type', '').lower():
                    href = ref.get('href', '')
                    if href.startswith('#'):
                        href = href[1:]
                    anchors = root.xpath('//*[@id="%s"]' % href)
                    if anchors:
                        cpos = anchors[0]
                        reached = False
                        # The cover is the first <img> at or after the anchor
                        for elem in root.iter():
                            if elem is cpos:
                                reached = True
                            if reached and elem.tag == 'img':
                                cover = elem.get('src', None)
                                self.embedded_mi.cover = cover
                                elem.getparent().remove(elem)
                                break
                    break

    def cleanup_html(self):
        '''Normalize the raw extracted markup so lxml/html5 parsers can cope with it.'''
        self.log.debug('Cleaning up HTML...')
        self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
        if self.book_header.ancient and b'<html' not in self.mobi_html[:300].lower():
            self.processed_html = '<html><p>' + self.processed_html.replace('\n\n', '<p>') + '</html>'
        self.processed_html = self.processed_html.replace('\r\n', '\n')
        self.processed_html = self.processed_html.replace('> <', '>\n<')
        self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
        self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
        self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', self.processed_html)
        # Swap inline and block level elements, and order block level elements according to priority
        # - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
        self.processed_html = re.sub(
            r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', r'\g<para>'+r'\g<styletags>', self.processed_html)
        self.processed_html = re.sub(
            r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', r'\g<styletags>'+r'\g<para>', self.processed_html)
        self.processed_html = re.sub(
            r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', r'\g<para>'+r'\g<blockquote>', self.processed_html)
        self.processed_html = re.sub(
            r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', r'\g<blockquote>'+r'\g<para>', self.processed_html)
        bods = htmls = 0
        for x in re.finditer('</body>|</html>', self.processed_html):
            # BUGFIX: x is a re.Match object; comparing it to the string was
            # always False, so duplicate </body> tags were never counted
            if x.group() == '</body>':
                bods += 1
            else:
                htmls += 1
            if bods > 1 and htmls > 1:
                break
        if bods > 1:
            self.processed_html = self.processed_html.replace('</body>', '')
        if htmls > 1:
            self.processed_html = self.processed_html.replace('</html>', '')

    def remove_random_bytes(self, html):
        '''Strip control/stray bytes that make lxml's parser fail.'''
        return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01|\x02|\x03|\x04|\x05|\x06|\x07',
                '', html)

    def ensure_unit(self, raw, unit='px'):
        '''Append *unit* to a bare numeric CSS length, e.g. "10" -> "10px".'''
        if re.search(r'\d+$', raw) is not None:
            raw += unit
        return raw

    def upshift_markup(self, root, image_name_map=None):
        '''Convert MOBI presentational markup (height/width/align attributes,
        <font>, filepos anchors, etc.) into CSS classes collected in
        self.tag_css_rules, and rewrite <img> recindex attributes to the
        extracted image file names.'''
        self.log.debug('Converting style information to CSS...')
        image_name_map = image_name_map or {}
        # Legacy HTML font size keywords -> <font size=""> numeric values
        size_map = {
            'xx-small': '0.5',
            'x-small': '1',
            'small': '2',
            'medium': '3',
            'large': '4',
            'x-large': '5',
            'xx-large': '6',
        }

        def barename(x):
            return x.rpartition(':')[-1]

        mobi_version = self.book_header.mobi_version
        for x in root.xpath('//ncx'):
            x.getparent().remove(x)
        svg_tags = []
        forwardable_anchors = []
        pagebreak_anchors = []
        BLOCK_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'p'}
        for i, tag in enumerate(root.iter(etree.Element)):
            tag.attrib.pop('xmlns', '')
            # Copy keys before deleting: do not mutate during iteration
            for x in list(tag.attrib):
                if ':' in x:
                    del tag.attrib[x]
            if tag.tag and barename(tag.tag) == 'svg':
                svg_tags.append(tag)
            if tag.tag and barename(tag.tag.lower()) in \
                ('country-region', 'place', 'placetype', 'placename',
                    'state', 'city', 'street', 'address', 'content', 'form'):
                # MS Office smart tags and similar: neutralize to span/div
                tag.tag = 'div' if tag.tag in ('content', 'form') else 'span'
                for key in tag.attrib.keys():
                    tag.attrib.pop(key)
                continue
            styles, attrib = [], tag.attrib
            if 'style' in attrib:
                style = attrib.pop('style').strip()
                if style:
                    styles.append(style)
            if 'height' in attrib:
                height = attrib.pop('height').strip()
                if (
                        height and '<' not in height and '>' not in height and
                        re.search(r'\d+', height)):
                    if tag.tag in ('table', 'td', 'tr'):
                        pass
                    elif tag.tag == 'img':
                        tag.set('height', height)
                    else:
                        if tag.tag == 'div' and not tag.text and \
                                (not tag.tail or not tag.tail.strip()) and \
                                not len(list(tag.iterdescendants())):
                            # Paragraph spacer
                            # Insert nbsp so that the element is never
                            # discarded by a renderer
                            tag.text = '\u00a0'  # nbsp
                            styles.append('height: %s' %
                                    self.ensure_unit(height))
                        else:
                            styles.append('margin-top: %s' % self.ensure_unit(height))
            if 'width' in attrib:
                width = attrib.pop('width').strip()
                if width and re.search(r'\d+', width):
                    if tag.tag in ('table', 'td', 'tr'):
                        pass
                    elif tag.tag == 'img':
                        tag.set('width', width)
                    else:
                        ewidth = self.ensure_unit(width)
                        styles.append('text-indent: %s' % ewidth)
                        try:
                            ewidth_val = unit_convert(ewidth, 12, 500, 166)
                            self.text_indents[tag] = ewidth_val
                        except Exception:
                            pass
                        if width.startswith('-'):
                            # Negative text-indent: emulate with a left margin
                            styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
                            try:
                                ewidth_val = unit_convert(ewidth[1:], 12, 500, 166)
                                self.left_margins[tag] = ewidth_val
                            except Exception:
                                pass

            if 'align' in attrib:
                align = attrib.pop('align').strip()
                if align:
                    align = align.lower()
                    if align == 'baseline':
                        styles.append('vertical-align: '+align)
                    else:
                        styles.append('text-align: %s' % align)
            if tag.tag == 'hr':
                if mobi_version == 1:
                    # In MOBI 1 an <hr> means a page break
                    tag.tag = 'div'
                    styles.append('page-break-before: always')
                    styles.append('display: block')
                    styles.append('margin: 0')
            elif tag.tag == 'i':
                tag.tag = 'span'
                tag.attrib['class'] = 'italic'
            elif tag.tag == 'u':
                tag.tag = 'span'
                tag.attrib['class'] = 'underline'
            elif tag.tag == 'b':
                tag.tag = 'span'
                tag.attrib['class'] = 'bold'
            elif tag.tag == 'font':
                sz = tag.get('size', '').lower()
                try:
                    float(sz)
                except ValueError:
                    if sz in size_map:
                        attrib['size'] = size_map[sz]
            elif tag.tag == 'img':
                recindex = None
                for attr in self.IMAGE_ATTRS:
                    recindex = attrib.pop(attr, None) or recindex
                if recindex is not None:
                    try:
                        recindex = int(recindex)
                    except Exception:
                        pass
                    else:
                        attrib['src'] = 'images/' + image_name_map.get(recindex, '%05d.jpg' % recindex)
                for attr in ('width', 'height'):
                    if attr in attrib:
                        val = attrib[attr]
                        if val.lower().endswith('em'):
                            try:
                                nval = float(val[:-2])
                                nval *= 16 * (168.451/72)  # Assume this was set using the Kindle profile
                                attrib[attr] = "%dpx"%int(nval)
                            except Exception:
                                del attrib[attr]
                        elif val.lower().endswith('%'):
                            del attrib[attr]
            elif tag.tag == 'pre':
                if not tag.text:
                    tag.tag = 'div'

            if (attrib.get('class', None) == 'mbp_pagebreak' and tag.tag ==
                    'div' and 'filepos-id' in attrib):
                pagebreak_anchors.append(tag)

            if 'color' in attrib:
                styles.append('color: ' + attrib.pop('color'))
            if 'bgcolor' in attrib:
                styles.append('background-color: ' + attrib.pop('bgcolor'))

            if 'filepos-id' in attrib:
                attrib['id'] = attrib.pop('filepos-id')
                if 'name' in attrib and attrib['name'] != attrib['id']:
                    attrib['name'] = attrib['id']
            if 'filepos' in attrib:
                filepos = attrib.pop('filepos')
                try:
                    attrib['href'] = "#filepos%d" % int(filepos)
                except ValueError:
                    pass
            if (tag.tag == 'a' and attrib.get('id', '').startswith('filepos') and
                    not tag.text and len(tag) == 0 and (tag.tail is None or not
                        tag.tail.strip()) and getattr(tag.getnext(), 'tag',
                            None) in BLOCK_TAGS):
                # This is an empty anchor immediately before a block tag, move
                # the id onto the block tag instead
                forwardable_anchors.append(tag)

            if styles:
                # De-duplicate identical rules into a shared class
                ncls = None
                rule = '; '.join(styles)
                for sel, srule in self.tag_css_rules.items():
                    if srule == rule:
                        ncls = sel
                        break
                if ncls is None:
                    ncls = 'calibre_%d' % i
                    self.tag_css_rules[ncls] = rule
                cls = attrib.get('class', '')
                cls = cls + (' ' if cls else '') + ncls
                attrib['class'] = cls

        for tag in svg_tags:
            images = tag.xpath('descendant::img[@src]')
            parent = tag.getparent()

            if images and hasattr(parent, 'find'):
                index = parent.index(tag)
                for img in images:
                    img.getparent().remove(img)
                    img.tail = img.text = None
                    parent.insert(index, img)

            # Remove the <svg> wrapper entirely
            if hasattr(parent, 'remove'):
                parent.remove(tag)

        for tag in pagebreak_anchors:
            # Move the anchor out of the pagebreak div into a following <a>
            anchor = tag.attrib['id']
            del tag.attrib['id']
            if 'name' in tag.attrib:
                del tag.attrib['name']
            p = tag.getparent()
            a = p.makeelement('a')
            a.attrib['id'] = anchor
            p.insert(p.index(tag)+1, a)
            if getattr(a.getnext(), 'tag', None) in BLOCK_TAGS:
                forwardable_anchors.append(a)

        for tag in forwardable_anchors:
            block = tag.getnext()
            tag.getparent().remove(tag)

            if 'id' in block.attrib:
                # Block already has an id: keep the anchor as first child
                tag.tail = block.text
                block.text = None
                block.insert(0, tag)
            else:
                block.attrib['id'] = tag.attrib['id']

        # WebKit fails to navigate to anchors located on <br> tags
        # BUGFIX: the old absolute XPath '/body/br[@id]' was evaluated from the
        # document root (which is <html>) and could never match anything
        for br in root.xpath('body/br[@id]'):
            br.tag = 'div'

    def get_left_whitespace(self, tag):
        '''Return the cumulative left offset (margins + text indents, in px)
        of *tag* and all its ancestors; used to infer TOC nesting levels.'''

        def whitespace(tag):
            lm = ti = 0.0
            if tag.tag == 'p':
                ti = unit_convert('1.5em', 12, 500, 166)
            if tag.tag == 'blockquote':
                lm = unit_convert('2em', 12, 500, 166)
            lm = self.left_margins.get(tag, lm)
            ti = self.text_indents.get(tag, ti)
            try:
                lm = float(lm)
            except Exception:
                lm = 0.0
            try:
                ti = float(ti)
            except Exception:
                ti = 0.0
            return lm + ti

        parent = tag
        ans = 0.0
        while parent is not None:
            ans += whitespace(parent)
            parent = parent.getparent()

        return ans

    def create_opf(self, htmlfile, guide=None, root=None):
        '''Build an OPFCreator (metadata, manifest, spine, guide and TOC) for
        the extracted book. Returns (opf, ncx_manifest_entry).'''
        mi = getattr(self.book_header.exth, 'mi', self.embedded_mi)
        if mi is None:
            mi = MetaInformation(self.book_header.title, [_('Unknown')])
        opf = OPFCreator(os.path.dirname(htmlfile), mi)
        if hasattr(self.book_header.exth, 'cover_offset'):
            opf.cover = 'images/%05d.jpg' % (self.book_header.exth.cover_offset + 1)
        elif mi.cover is not None:
            opf.cover = mi.cover
        else:
            # Fall back to the first extracted image, if it exists
            opf.cover = 'images/%05d.jpg' % 1
            if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
                * opf.cover.split('/'))):
                opf.cover = None

        cover = opf.cover
        cover_copied = None
        if cover is not None:
            cover = cover.replace('/', os.sep)
            if os.path.exists(cover):
                ncover = 'images'+os.sep+'calibre_cover.jpg'
                if os.path.exists(ncover):
                    os.remove(ncover)
                shutil.copyfile(cover, ncover)
                cover_copied = os.path.abspath(ncover)
                opf.cover = ncover.replace(os.sep, '/')

        manifest = [(htmlfile, 'application/xhtml+xml'),
            (os.path.abspath('styles.css'), 'text/css')]
        bp = os.path.dirname(htmlfile)
        added = set()
        for i in getattr(self, 'image_names', []):
            path = os.path.join(bp, 'images', i)
            added.add(path)
            manifest.append((path, guess_type(path)[0] or 'image/jpeg'))
        if cover_copied is not None:
            manifest.append((cover_copied, 'image/jpeg'))

        opf.create_manifest(manifest)
        opf.create_spine([os.path.basename(htmlfile)])
        toc = None
        if guide is not None:
            opf.create_guide(guide)
            for ref in opf.guide:
                if ref.type.lower() == 'toc':
                    toc = ref.href()

        ncx_manifest_entry = None
        if toc:
            ncx_manifest_entry = 'toc.ncx'
            elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1])
            tocobj = None
            ent_pat = re.compile(r'&(\S+?);')
            if elems:
                tocobj = TOC()
                found = False
                reached = False
                # Walk the document after the TOC anchor, collecting links
                # until the next page break
                for x in root.iter():
                    if x == elems[-1]:
                        reached = True
                        continue
                    if reached and x.tag == 'a':
                        href = x.get('href', '')
                        if href and re.match(r'\w+://', href) is None:
                            try:
                                text = ' '.join([t.strip() for t in
                                    x.xpath('descendant::text()')])
                            except Exception:
                                text = ''
                            text = ent_pat.sub(entity_to_unicode, text)
                            item = tocobj.add_item(toc.partition('#')[0], href[1:],
                                text)
                            item.left_space = int(self.get_left_whitespace(x))
                            found = True
                    if reached and found and x.get('class', None) == 'mbp_pagebreak':
                        break
            if tocobj is not None:
                tocobj = self.structure_toc(tocobj)
                opf.set_toc(tocobj)

        return opf, ncx_manifest_entry

    def structure_toc(self, toc):
        '''Turn a flat TOC into a nested one using each entry's left_space as
        an indentation level; returns the original TOC when the levels are
        too uniform or too noisy to be meaningful.'''
        indent_vals = set()
        for item in toc:
            indent_vals.add(item.left_space)
        if len(indent_vals) > 6 or len(indent_vals) < 2:
            # Too many or too few levels, give up
            return toc
        indent_vals = sorted(indent_vals)

        last_found = [None for i in indent_vals]

        newtoc = TOC()

        def find_parent(level):
            # Closest ancestor at a shallower indent level
            candidates = last_found[:level]
            for x in reversed(candidates):
                if x is not None:
                    return x
            return newtoc

        for item in toc:
            level = indent_vals.index(item.left_space)
            parent = find_parent(level)
            last_found[level] = parent.add_item(item.href, item.fragment,
                    item.text)

        return newtoc

    def sizeof_trailing_entries(self, data):
        '''Return the number of trailing (non-text) bytes at the end of a text
        record, as described by the header's extra_flags bitfield.'''
        def sizeof_trailing_entry(ptr, psize):
            # Backward variable-width integer: 7 bits per byte, high bit set
            # on the final (first-read) byte
            bitpos, result = 0, 0
            while True:
                v = ord(ptr[psize-1:psize])
                result |= (v & 0x7F) << bitpos
                bitpos += 7
                psize -= 1
                if (v & 0x80) != 0 or (bitpos >= 28) or (psize == 0):
                    return result

        num = 0
        size = len(data)
        flags = self.book_header.extra_flags >> 1
        while flags:
            if flags & 1:
                try:
                    num += sizeof_trailing_entry(data, size - num)
                except (IndexError, TypeError):
                    # BUGFIX: an empty slice makes ord() raise TypeError, which
                    # the old IndexError-only clause let escape on corrupt files
                    self.warn_about_trailing_entry_corruption()
                    return 0
            flags >>= 1
        if self.book_header.extra_flags & 1:
            # Low flag bit: multibyte-overlap count in the last byte's low 2 bits
            off = size - num - 1
            num += (ord(data[off:off+1]) & 0x3) + 1
        return num

    def warn_about_trailing_entry_corruption(self):
        # Only warn once per book
        if not self.warned_about_trailing_entry_corruption:
            self.warned_about_trailing_entry_corruption = True
            self.log.warn('The trailing data entries in this MOBI file are corrupted, you might see corrupted text in the output')

    def text_section(self, index):
        '''Return the text payload of a record, with trailing entries stripped.'''
        data = self.sections[index][0]
        trail_size = self.sizeof_trailing_entries(data)
        return data[:len(data)-trail_size]

    def extract_text(self, offset=1):
        '''Decompress all text records into self.mobi_html (bytes) and return
        the list of record indices consumed.'''
        self.log.debug('Extracting text...')
        text_sections = [self.text_section(i) for i in range(offset,
            min(self.book_header.records + offset, len(self.sections)))]
        processed_records = list(range(offset-1, self.book_header.records +
            offset))

        self.mobi_html = b''

        if self.book_header.compression_type == b'DH':
            # HUFF/CDIC compression
            huffs = [self.sections[i][0] for i in
                range(self.book_header.huff_offset,
                    self.book_header.huff_offset + self.book_header.huff_number)]
            processed_records += list(range(self.book_header.huff_offset,
                self.book_header.huff_offset + self.book_header.huff_number))
            huff = HuffReader(huffs)
            unpack = huff.unpack

        elif self.book_header.compression_type == b'\x00\x02':
            # PalmDoc LZ77 compression
            unpack = decompress_doc

        elif self.book_header.compression_type == b'\x00\x01':
            # Uncompressed
            unpack = lambda x: x
        else:
            raise MobiError('Unknown compression algorithm: %r' % self.book_header.compression_type)
        self.mobi_html = b''.join(map(unpack, text_sections))
        if self.mobi_html.endswith(b'#'):
            self.mobi_html = self.mobi_html[:-1]

        if self.book_header.ancient and b'<html' not in self.mobi_html[:300].lower():
            self.mobi_html = self.mobi_html.replace(b'\r ', b'\n\n ')
        self.mobi_html = self.mobi_html.replace(b'\0', b'')
        if self.book_header.codec == 'cp1252':
            self.mobi_html = self.mobi_html.replace(b'\x1e', b'')  # record separator
            self.mobi_html = self.mobi_html.replace(b'\x02', b'')  # start of text
        return processed_records

    def replace_page_breaks(self):
        '''Rewrite <mbp:pagebreak> tags as styled <div class="mbp_pagebreak"> markers.'''
        self.processed_html = self.PAGE_BREAK_PAT.sub(
            r'<div \1 class="mbp_pagebreak" />',
            self.processed_html)

    def add_anchors(self):
        '''Insert <a id="filepos..."> anchors (or filepos-id attributes on
        existing tags) at every byte offset referenced by a filepos link, and
        store the result in self.processed_html (still bytes).'''
        self.log.debug('Adding anchors...')
        positions = set()
        link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
            re.IGNORECASE)
        for match in link_pattern.finditer(self.mobi_html):
            positions.add(int(match.group(1)))
        pos = 0
        processed_html = []
        end_tag_re = re.compile(br'<\s*/')
        for end in sorted(positions):
            if end == 0:
                continue
            oend = end
            l = self.mobi_html.find(b'<', end)
            r = self.mobi_html.find(b'>', end)
            anchor = b'<a id="filepos%d"></a>'
            if r > -1 and (r < l or l == end or l == -1):
                # Inside a tag: attach the anchor to the tag itself (unless it
                # is a closing or self-closing tag)
                p = self.mobi_html.rfind(b'<', 0, end + 1)
                if (pos < end and p > -1 and not end_tag_re.match(self.mobi_html[p:r]) and
                        not self.mobi_html[p:r + 1].endswith(b'/>')):
                    anchor = b' filepos-id="filepos%d"'
                    end = r
                else:
                    end = r + 1
            processed_html.append(self.mobi_html[pos:end] + (anchor % oend))
            pos = end
        processed_html.append(self.mobi_html[pos:])
        processed_html = b''.join(processed_html)

        # Remove anchors placed inside entities
        self.processed_html = re.sub(br'&([^;]*?)(<a id="filepos\d+"></a>)([^;]*);',
                br'&\1\3;\2', processed_html)

    def extract_images(self, processed_records, output_dir):
        '''Write all image records to output_dir/images and return a map of
        record index -> extracted file name. Appends each consumed record
        index to *processed_records*.'''
        self.log.debug('Extracting images...')
        output_dir = os.path.abspath(os.path.join(output_dir, 'images'))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        image_index = 0
        self.image_names = []
        image_name_map = {}
        start = getattr(self.book_header, 'first_image_index', -1)
        if start > self.num_sections or start < 0:
            # BAEN PRC files have bad headers
            start = 0
        # Parallel set for O(1) membership tests (the list can be large)
        seen = set(processed_records)
        for i in range(start, self.num_sections):
            if i in seen:
                continue
            processed_records.append(i)
            seen.add(i)
            data = self.sections[i][0]
            image_index += 1
            if data[:4] in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
                    b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}:
                # This record is a known non image type, no need to try to
                # load the image
                continue

            try:
                imgfmt = what(None, data)
            except Exception:
                continue
            if imgfmt not in {'jpg', 'jpeg', 'gif', 'png', 'bmp'}:
                continue
            if imgfmt == 'jpeg':
                imgfmt = 'jpg'
            if imgfmt == 'gif':
                try:
                    data = gif_data_to_png_data(data)
                    imgfmt = 'png'
                except AnimatedGIF:
                    pass
            path = os.path.join(output_dir, '%05d.%s' % (image_index, imgfmt))
            image_name_map[image_index] = os.path.basename(path)
            if imgfmt == 'png':
                with open(path, 'wb') as f:
                    f.write(data)
            else:
                try:
                    save_cover_data_to(data, path, minify_to=(10000, 10000))
                except Exception:
                    continue
            self.image_names.append(os.path.basename(path))
        return image_name_map


def test_mbp_regex():
    for raw, m in iteritems({
        '<mbp:pagebreak></mbp:pagebreak>':'',
        '<mbp:pagebreak xxx></mbp:pagebreak>yyy':' xxxyyy',
        '<mbp:pagebreak> </mbp:pagebreak>':'',
        '<mbp:pagebreak>xxx':'xxx',
        '<mbp:pagebreak/>xxx':'xxx',
        '<mbp:pagebreak sdf/ >xxx':' sdfxxx',
        '<mbp:pagebreak / >':' ',
        '</mbp:pagebreak>':'',
        '</mbp:pagebreak sdf>':' sdf',
        '</mbp:pagebreak><mbp:pagebreak></mbp:pagebreak>xxx':'xxx',
    }):
        ans = MobiReader.PAGE_BREAK_PAT.sub(r'\1', raw)
        if ans != m:
            raise Exception('%r != %r for %r'%(ans, m, raw))