#!/usr/local/bin/python3.8
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai


__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import struct
import re
import os
from collections import namedtuple
from itertools import repeat
from uuid import uuid4

from lxml import etree

from calibre.ebooks.mobi.reader.headers import NULL_INDEX
from calibre.ebooks.mobi.reader.index import read_index
from calibre.ebooks.mobi.reader.ncx import read_ncx, build_toc
from calibre.ebooks.mobi.reader.markup import expand_mobi8_markup
from calibre.ebooks.mobi.reader.containers import Container, find_imgtype
from calibre.ebooks.metadata.opf2 import Guide, OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.mobi.utils import read_font_record
from calibre.ebooks.oeb.parse_utils import parse_html
from calibre.ebooks.oeb.base import XPath, XHTML, xml2text
from polyglot.builtins import as_unicode
from polyglot.urllib import urldefrag

# One output text file: its number, type, file name, [start, end) range in
# the raw markup and the aid of its first div
Part = namedtuple('Part',
    'num type filename start end aid')

# One entry of the div table
Elem = namedtuple('Elem',
    'insert_pos toc_text file_number sequence_number start_pos '
    'length')

# How a non-text flow is stored: inline in the text or as a separate file
FlowInfo = namedtuple('FlowInfo',
        'type format dir fname')


def locate_beg_end_of_tag(ml, aid):
    ''' Locate the beginning and ending positions of the first tag in ml
    that has the specified aid attribute.

    :param ml: The markup to search, as bytes
    :param aid: The value of the aid attribute, as bytes
    :return: ``(start, end)`` where start is the index of the tag's ``<``
        and end is the index of the first ``>`` after it, or ``(0, 0)`` if no
        tag matches.
    '''
    # Escape the aid so that it is always matched literally, even if it
    # contains characters that are special in regular expressions
    pattern = br'''<[^>]*\said\s*=\s*['"]%s['"][^>]*>''' % re.escape(aid)
    m = re.compile(pattern, re.IGNORECASE).search(ml)
    if m is None:
        return 0, 0
    plt = m.start()
    pgt = ml.find(b'>', plt+1)
    return plt, pgt


def reverse_tag_iter(block):
    ''' Iterate over all tags in block in reverse order, i.e. last tag
    to first tag. Each tag is yielded as the bytes from its ``<`` to its
    ``>`` inclusive. '''
    end = len(block)
    while True:
        pgt = block.rfind(b'>', 0, end)
        if pgt == -1:
            break
        plt = block.rfind(b'<', 0, pgt)
        if plt == -1:
            break
        yield block[plt:pgt+1]
        end = plt


def get_first_resource_index(first_image_index, num_of_text_records, first_text_record_number):
    ''' Return first_image_index, unless it is unset (-1 or NULL_INDEX), in
    which case the index of the record immediately following the text
    records is returned instead. '''
    first_resource_index = first_image_index
    if first_resource_index in {-1, NULL_INDEX}:
        first_resource_index = num_of_text_records + first_text_record_number
    return first_resource_index


class Mobi8Reader:

    ''' Convert the KF8 content held by a MOBI 6 reader object into HTML,
    CSS and resource files plus an OPF and NCX, all written into the
    current working directory. Calling the instance runs the conversion and
    returns the name of the OPF file. '''

    def __init__(self, mobi6_reader, log, for_tweak=False):
        '''
        :param mobi6_reader: The reader object that provides the book header,
            the record sections and the extracted text
        :param log: Logger used for warnings and debug output
        :param for_tweak: If True, write_opf() additionally removes
            debugging and placeholder files
        '''
        self.for_tweak = for_tweak
        self.mobi6_reader, self.log = mobi6_reader, log
        self.header = mobi6_reader.book_header
        self.encrypted_fonts = []
        # These are applied with match() to single tags, see get_id_tag()
        self.id_re = re.compile(br'''<[^>]+\s(?:id|ID)\s*=\s*['"]([^'"]+)['"]''')
        self.name_re = re.compile(br'''<\s*a\s*\s(?:name|NAME)\s*=\s*['"]([^'"]+)['"]''')
        self.aid_re = re.compile(br'''<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]''')

    def __call__(self):
        ''' Run the full conversion and return the name of the generated
        OPF file. Also writes the raw markup to debug-raw.html. '''
        self.mobi6_reader.check_for_drm()
        self.aid_anchor_suffix = uuid4().hex.encode('utf-8')
        bh = self.mobi6_reader.book_header
        if self.mobi6_reader.kf8_type == 'joint':
            offset = self.mobi6_reader.kf8_boundary + 2
            self.resource_offsets = [
                (get_first_resource_index(bh.first_image_index, bh.mobi6_records, 1), offset - 2),
                (get_first_resource_index(bh.kf8_first_image_index, bh.records, offset), len(self.mobi6_reader.sections)),
            ]
        else:
            offset = 1
            self.resource_offsets = [(get_first_resource_index(bh.first_image_index, bh.records, offset), len(self.mobi6_reader.sections))]

        self.processed_records = self.mobi6_reader.extract_text(offset=offset)
        self.raw_ml = self.mobi6_reader.mobi_html
        with open('debug-raw.html', 'wb') as f:
            f.write(self.raw_ml)

        self.kf8_sections = self.mobi6_reader.sections[offset-1:]

        self.cover_offset = getattr(self.header.exth, 'cover_offset', None)
        self.linked_aids = set()

        self.read_indices()
        self.build_parts()
        guide = self.create_guide()
        ncx = self.create_ncx()
        resource_map = self.extract_resources(self.mobi6_reader.sections)
        spine = self.expand_text(resource_map)
        return self.write_opf(guide, ncx, spine, resource_map)

    def read_indices(self):
        ''' Read the FDST, skeleton, div and guide ("other") indices
        referenced by the header into self.flow_table, self.files,
        self.elems and self.guide respectively. An index that the header
        marks as absent (NULL_INDEX) yields an empty result.

        :raises ValueError: If the FDST record lacks the FDST signature
        '''
        self.flow_table = ()

        if self.header.fdstidx != NULL_INDEX:
            header = self.kf8_sections[self.header.fdstidx][0]
            if header[:4] != b'FDST':
                raise ValueError('KF8 does not have a valid FDST record')
            sec_start, num_sections = struct.unpack_from(b'>LL', header, 4)
            secs = struct.unpack_from(b'>%dL' % (num_sections*2),
                    header, sec_start)
            # The values alternate between flow start and flow end
            self.flow_table = tuple(zip(secs[::2], secs[1::2]))

        self.files = []
        if self.header.skelidx != NULL_INDEX:
            table = read_index(self.kf8_sections, self.header.skelidx,
                    self.header.codec)[0]
            File = namedtuple('File',
                'file_number name divtbl_count start_position length')

            for i, text in enumerate(table):
                tag_map = table[text]
                self.files.append(File(i, text, tag_map[1][0],
                    tag_map[6][0], tag_map[6][1]))

        self.elems = []
        if self.header.dividx != NULL_INDEX:
            table, cncx = read_index(self.kf8_sections, self.header.dividx,
                    self.header.codec)
            for text in table:
                tag_map = table[text]
                toc_text = cncx[tag_map[2][0]]
                self.elems.append(Elem(int(text), toc_text, tag_map[3][0],
                    tag_map[4][0], tag_map[6][0], tag_map[6][1]))

        self.guide = []
        if self.header.othidx != NULL_INDEX:
            table, cncx = read_index(self.kf8_sections, self.header.othidx,
                    self.header.codec)
            Item = namedtuple('Item',
                'type title pos_fid')

            for ref_type in table:
                tag_map = table[ref_type]
                # ref_type, ref_title, div/frag number
                title = cncx[tag_map[1][0]]
                fileno = None
                if 3 in tag_map:
                    fileno = tag_map[3][0]
                if 6 in tag_map:
                    # Tag 6 takes precedence over tag 3 and is stored whole,
                    # create_guide() only uses values of length two
                    fileno = tag_map[6]
                if isinstance(ref_type, bytes):
                    ref_type = ref_type.decode(self.header.codec)
                self.guide.append(Item(ref_type, title, fileno))

    def build_parts(self):
        ''' Split the raw markup into flows and rebuild the individual text
        files from the skeleton and div tables.

        Fills self.parts (the markup of every text file), self.partinfo (a
        Part for every text file), self.flows (the non-text flows, entry 0
        is emptied) and self.flowinfo (a FlowInfo for every flow). '''
        raw_ml = self.mobi6_reader.mobi_html
        self.flows = []
        self.flowinfo = []
        ft = self.flow_table if self.flow_table else [(0, len(raw_ml))]

        # now split the raw_ml into its flow pieces
        for start, end in ft:
            self.flows.append(raw_ml[start:end])

        # the first piece represents the xhtml text
        text = self.flows[0]
        self.flows[0] = b''

        # walk the <skeleton> and <div> tables to build original source xhtml
        # files *without* destroying any file position information needed for
        # later href processing and create final list of file separation start:
        # stop points and etc in partinfo
        self.parts = []
        self.partinfo = []
        divptr = 0
        baseptr = 0
        for skelnum, skelname, divcnt, skelpos, skellen in self.files:
            baseptr = skelpos + skellen
            skeleton = text[skelpos:baseptr]
            inspos_warned = False
            for i in range(divcnt):
                insertpos, idtext, filenum, seqnum, startpos, length = \
                                    self.elems[divptr]
                if i == 0:
                    # The first div of a skeleton names the file
                    aidtext = idtext[12:-2]
                    filename = 'part%04d.html' % filenum
                part = text[baseptr:baseptr + length]
                insertpos = insertpos - skelpos
                head = skeleton[:insertpos]
                tail = skeleton[insertpos:]
                if (tail.find(b'>') < tail.find(b'<') or head.rfind(b'>') <
                    head.rfind(b'<')):
                    # There is an incomplete tag in either the head or tail.
                    # This can happen for some badly formed KF8 files, see for
                    # example, https://bugs.launchpad.net/bugs/1082669
                    if not inspos_warned:
                        self.log.warn(
                            'The div table for %s has incorrect insert '
                            'positions. Calculating manually.'%skelname)
                        inspos_warned = True
                    bp, ep = locate_beg_end_of_tag(skeleton, aidtext if
                        isinstance(aidtext, bytes) else aidtext.encode('utf-8'))
                    if bp != ep:
                        insertpos = ep + 1 + startpos

                skeleton = skeleton[0:insertpos] + part + skeleton[insertpos:]
                baseptr = baseptr + length
                divptr += 1
            self.parts.append(skeleton)
            if divcnt < 1:
                # Empty file
                aidtext = str(uuid4())
                filename = aidtext + '.html'
            self.partinfo.append(Part(skelnum, 'text', filename, skelpos,
                baseptr, aidtext))

        # The primary css style sheet is typically stored next followed by any
        # snippets of code that were previously inlined in the
        # original xhtml but have been stripped out and placed here.
        # This can include local CDATA snippets and svg sections.

        # The problem is that for most browsers and ereaders, you can not
        # use <img src="imageXXXX.svg" /> to import any svg image that itself
        # properly uses an <image/> tag to import some raster image - it
        # should work according to the spec but does not for almost all browsers
        # and ereaders and causes epub validation issues because those raster
        # images are in manifest but not in xhtml text - since they only
        # referenced from an svg image

        # So we need to check the remaining flow pieces to see if they are css
        # or svg images. if svg images, we must check if they have an <image/>
        # and if so inline them into the xhtml text pieces.

        # there may be other sorts of pieces stored here but until we see one
        # in the wild to reverse engineer we won't be able to tell

        self.flowinfo.append(FlowInfo(None, None, None, None))
        svg_tag_pattern = re.compile(br'''(<svg[^>]*>)''', re.IGNORECASE)
        image_tag_pattern = re.compile(br'''(<(?:svg:)?image[^>]*>)''', re.IGNORECASE)
        for j, flowpart in enumerate(self.flows[1:], 1):
            nstr = '%04d' % j
            m = svg_tag_pattern.search(flowpart)
            if m is not None:
                # svg
                typ = 'svg'
                start = m.start()
                m2 = image_tag_pattern.search(flowpart)
                if m2 is not None:
                    fmt = 'inline'
                    dirname = None
                    fname = None
                    # strip off anything before <svg if inlining
                    # The replacement must be bytes, like the pattern and
                    # the data being processed
                    flowpart = re.sub(br'(</?)svg:', br'\1', flowpart[start:])
                else:
                    fmt = 'file'
                    dirname = "images"
                    fname = 'svgimg' + nstr + '.svg'
            else:
                # search for CDATA and if exists inline it
                if flowpart.find(b'[CDATA[') >= 0:
                    typ = 'css'
                    flowpart = b'<style type="text/css">\n' + flowpart + b'\n</style>\n'
                    fmt = 'inline'
                    dirname = None
                    fname = None
                else:
                    # css - assume as standalone css file
                    typ = 'css'
                    fmt = 'file'
                    dirname = "styles"
                    fname = nstr + '.css'

            self.flows[j] = flowpart
            self.flowinfo.append(FlowInfo(typ, fmt, dirname, fname))

    def get_file_info(self, pos):
        ''' Get information about the part (file) that exists at pos in
        the raw markup. If no part contains pos, a Part with every field set
        to None is returned. '''
        for part in self.partinfo:
            if part.start <= pos < part.end:
                return part
        return Part(*repeat(None, len(Part._fields)))

    def get_id_tag_by_pos_fid(self, posfid, offset):
        ''' Resolve a kindle:pos:fid style link target.

        :param posfid: Index into the div table (self.elems)
        :param offset: Offset relative to the insert position of that div
        :return: ``(href, idtext)`` where href is ``type/filename`` of the
            file containing the target and idtext is the anchor as returned
            by get_id_tag()
        :raises ValueError: If no file contains the target position
        '''
        # first convert kindle:pos:fid and offset info to position in file
        pos = self.elems[posfid].insert_pos + offset
        fi = self.get_file_info(pos)
        # an existing "id=" must exist in original xhtml otherwise it would not
        # have worked for linking. Amazon seems to have added its own
        # additional "aid=" inside tags whose contents seem to represent some
        # position information encoded into Base32 name.

        # so find the closest "id=" before position the file by actually
        # searching in that file
        idtext = self.get_id_tag(pos)
        return '%s/%s'%(fi.type, fi.filename), idtext

    def get_id_tag(self, pos):
        ''' Find the nearest tag at or before pos that can serve as a link
        target and return its anchor as bytes.

        Tags are examined from pos backwards. A tag with an id attribute, or
        an <a> tag with a name attribute, yields that attribute's value. A
        tag with only an aid attribute yields the aid joined to
        self.aid_anchor_suffix, and the aid is recorded in self.linked_aids.
        If no such tag exists, b'' is returned, meaning the start of the
        file.

        :raises ValueError: If no file contains pos
        '''
        fi = self.get_file_info(pos)
        if fi.num is None and fi.start is None:
            raise ValueError('No file contains pos: %d'%pos)
        textblock = self.parts[fi.num]
        npos = pos - fi.start
        pgt = textblock.find(b'>', npos)
        plt = textblock.find(b'<', npos)
        # if npos inside a tag then search all text before the its end of tag marker
        # else not in a tag need to search the preceding tag
        if plt == npos or pgt < plt:
            npos = pgt + 1
        textblock = textblock[0:npos]
        for tag in reverse_tag_iter(textblock):
            m = self.id_re.match(tag) or self.name_re.match(tag)
            if m is not None:
                return m.group(1)
            # For some files, kindlegen apparently creates links to tags
            # without HTML anchors, using the AID instead.
            # See https://www.mobileread.com/forums/showthread.php?t=259557
            m = self.aid_re.match(tag)
            if m is not None:
                self.linked_aids.add(m.group(1).decode('utf-8'))
                return m.group(1) + b'-' + self.aid_anchor_suffix

        # No tag found, link to start of file
        return b''

    def create_guide(self):
        ''' Build the OPF guide from the guide index read by read_indices().
        If the guide has no start reference and the EXTH header provides a
        start offset that lies inside a file, a start reference for it is
        added. '''
        guide = Guide()
        has_start = False
        for ref_type, ref_title, pos_fid in self.guide:
            try:
                if len(pos_fid) != 2:
                    continue
            except TypeError:
                continue  # thumbnailstandard record, ignore it
            linktgt, idtext = self.get_id_tag_by_pos_fid(*pos_fid)
            if idtext:
                if isinstance(idtext, bytes):
                    idtext = idtext.decode(self.header.codec)
                linktgt += '#' + idtext
            g = Guide.Reference(linktgt, os.getcwd())
            g.title, g.type = ref_title, ref_type
            if g.title == 'start' or g.type == 'text':
                has_start = True
            guide.append(g)

        so = self.header.exth.start_offset
        if so not in {None, NULL_INDEX} and not has_start:
            fi = self.get_file_info(so)
            if fi.filename is not None:
                idtext = self.get_id_tag(so).decode(self.header.codec)
                linktgt = fi.filename
                if idtext:
                    linktgt += '#' + idtext
                g = Guide.Reference('%s/%s'%(fi.type, linktgt), os.getcwd())
                g.title, g.type = 'start', 'text'
                guide.append(g)

        return guide

    def create_ncx(self):
        ''' Read the NCX index, add an href and idtag to every entry and
        build the table of contents from the entries.

        Entries that have a pos_fid which cannot be resolved are dropped
        with a warning.

        :raises ValueError: If an entry without a pos_fid has a position
            that is not inside any file
        '''
        index_entries = read_ncx(self.kf8_sections, self.header.ncxidx,
                self.header.codec)
        remove = []

        # Add href and anchor info to the index entries
        for entry in index_entries:
            pos_fid = entry['pos_fid']
            if pos_fid is None:
                pos = entry['pos']
                fi = self.get_file_info(pos)
                if fi.filename is None:
                    raise ValueError('Index entry has invalid pos: %d'%pos)
                idtag = self.get_id_tag(pos)
                href = '%s/%s'%(fi.type, fi.filename)
            else:
                try:
                    href, idtag = self.get_id_tag_by_pos_fid(*pos_fid)
                except ValueError:
                    self.log.warn('Invalid entry in NCX (title: %s), ignoring'
                                  %entry['text'])
                    remove.append(entry)
                    continue

            entry['href'] = href
            entry['idtag'] = as_unicode(idtag, self.header.codec or 'utf-8')

        for e in remove:
            index_entries.remove(e)

        # Build the TOC object
        return build_toc(index_entries)

    def extract_resources(self, sections):
        ''' Write the fonts and images found in the resource records to the
        fonts and images directories, which are created in the current
        working directory.

        :param sections: All sections of the book, each resource's data is
            the first item of its section
        :return: A list with one item per processed record: the href of the
            file written for that record, or None if no file was written.
            CONTBOUNDARY records do not get an item.
        '''
        from calibre.ebooks.mobi.writer2.resources import PLACEHOLDER_GIF
        resource_map = []
        container = None
        for x in ('fonts', 'images'):
            os.mkdir(x)

        for start, end in self.resource_offsets:
            for i, sec in enumerate(sections[start:end]):
                fname_idx = i+1
                data = sec[0]
                typ = data[:4]
                href = None
                if typ in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', b'BOUN',
                        b'FDST', b'DATP', b'AUDI', b'VIDE', b'RESC', b'CMET', b'PAGE'}:
                    pass  # Ignore these records
                elif typ == b'FONT':
                    font = read_font_record(data)
                    href = "fonts/%05d.%s" % (fname_idx, font['ext'])
                    if font['err']:
                        self.log.warn('Reading font record %d failed: %s'%(
                            fname_idx, font['err']))
                        if font['headers']:
                            self.log.debug('Font record headers: %s'%font['headers'])
                    with open(href.replace('/', os.sep), 'wb') as f:
                        f.write(font['font_data'] if font['font_data'] else
                                font['raw_data'])
                    if font['encrypted']:
                        self.encrypted_fonts.append(href)
                elif typ == b'CONT':
                    if data == b'CONTBOUNDARY':
                        container = None
                        continue
                    container = Container(data)
                elif typ == b'CRES':
                    data, imgtype = container.load_image(data)
                    if data is not None:
                        href = 'images/%05d.%s'%(container.resource_index, imgtype)
                        with open(href.replace('/', os.sep), 'wb') as f:
                            f.write(data)
                elif typ == b'\xa0\xa0\xa0\xa0' and len(data) == 4 and container is not None:
                    container.resource_index += 1
                elif container is None:
                    # Outside of a container every other record is written
                    # out as an image, except for the placeholder GIF
                    if not (len(data) == len(PLACEHOLDER_GIF) and data == PLACEHOLDER_GIF):
                        imgtype = find_imgtype(data)
                        href = 'images/%05d.%s'%(fname_idx, imgtype)
                        with open(href.replace('/', os.sep), 'wb') as f:
                            f.write(data)

                resource_map.append(href)

        return resource_map

    def expand_text(self, resource_map):
        ''' Expand the KF8 markup via expand_mobi8_markup() and return its
        result, which is used as the spine by write_opf(). '''
        return expand_mobi8_markup(self, resource_map, self.log)

    def write_opf(self, guide, toc, spine, resource_map):
        ''' Write metadata.opf and toc.ncx into the current working
        directory and return the name of the OPF file.

        :param guide: The guide as created by create_guide()
        :param toc: The table of contents as created by create_ncx()
        :param spine: The spine as created by expand_text()
        :param resource_map: The list returned by extract_resources(), used
            to look up the cover
        '''
        mi = self.header.exth.mi
        if (self.cover_offset is not None and self.cover_offset <
                len(resource_map)):
            mi.cover = resource_map[self.cover_offset]

        if len(list(toc)) < 2:
            self.log.warn('KF8 has no metadata Table of Contents')

            # Fall back to the inline ToC referenced by the guide, if any
            for ref in guide:
                if ref.type == 'toc':
                    href = ref.href()
                    href, frag = urldefrag(href)
                    if os.path.exists(href.replace('/', os.sep)):
                        try:
                            toc = self.read_inline_toc(href, frag)
                        except Exception:
                            self.log.exception('Failed to read inline ToC')

        opf = OPFCreator(os.getcwd(), mi)
        opf.guide = guide

        def exclude(path):
            return os.path.basename(path) == 'debug-raw.html'

        # If there are no images then the azw3 input plugin dumps all
        # binary records as .unknown images, remove them
        if self.for_tweak and os.path.exists('images') and os.path.isdir('images'):
            files = os.listdir('images')
            unknown = [x for x in files if x.endswith('.unknown')]
            if len(files) == len(unknown):
                for f in files:
                    os.remove('images/'+f)

        if self.for_tweak:
            try:
                os.remove('debug-raw.html')
            except OSError:
                # Best effort, the file is excluded from the manifest anyway
                pass

        opf.create_manifest_from_files_in([os.getcwd()], exclude=exclude)
        for entry in opf.manifest:
            if entry.mime_type == 'text/html':
                entry.mime_type = 'application/xhtml+xml'
        opf.create_spine(spine)
        opf.set_toc(toc)
        ppd = getattr(self.header.exth, 'page_progression_direction', None)
        if ppd in {'ltr', 'rtl', 'default'}:
            opf.page_progression_direction = ppd
        pwm = getattr(self.header.exth, 'primary_writing_mode', None)
        if pwm is not None:
            opf.primary_writing_mode = pwm

        with open('metadata.opf', 'wb') as of, open('toc.ncx', 'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx')
        return 'metadata.opf'

    def read_inline_toc(self, href, frag):
        ''' Build a TOC from the links in an HTML file.

        :param href: Path of the HTML file, using / as the separator
        :param frag: Optional id of the element at which the ToC starts. If
            it is empty or not found, the ToC starts at the <body> element,
            or at the start of the document if there is no <body>.
        :return: A TOC whose nesting follows the nesting depth of the <a>
            elements in the HTML. Links with the same text and target are
            only added once.
        '''
        ans = TOC()
        base_href = '/'.join(href.split('/')[:-1])
        with open(href.replace('/', os.sep), 'rb') as f:
            raw = f.read().decode(self.header.codec)
        root = parse_html(raw, log=self.log)
        body = XPath('//h:body')(root)
        reached = False
        if body:
            start = body[0]
        else:
            start = None
            reached = True
        if frag:
            elems = XPath('//*[@id="%s"]'%frag)(root)
            if elems:
                start = elems[0]

        def node_depth(elem):
            # Number of ancestors of elem
            depth = 0
            parent = elem.getparent()
            while parent is not None:
                parent = parent.getparent()
                depth += 1
            return depth

        # Layer the ToC based on nesting order in the source HTML
        current_depth = None
        parent = ans
        seen = set()
        links = []
        for elem in root.iterdescendants(etree.Element):
            if reached and elem.tag == XHTML('a') and elem.get('href',
                    False):
                href = elem.get('href')
                href, frag = urldefrag(href)
                href = base_href + '/' + href
                text = xml2text(elem).strip()
                if (text, href, frag) in seen:
                    continue
                seen.add((text, href, frag))
                links.append((text, href, frag, node_depth(elem)))
            elif elem is start:
                reached = True

        # Map the depths that actually occur onto consecutive levels
        depths = sorted({x[-1] for x in links})
        depth_map = {x:i for i, x in enumerate(depths)}
        for text, href, frag, depth in links:
            depth = depth_map[depth]
            if current_depth is None:
                current_depth = 0
                parent.add_item(href, frag, text)
            elif current_depth == depth:
                parent.add_item(href, frag, text)
            elif current_depth < depth:
                # Descend a single level, below the most recent item
                parent = parent[-1] if len(parent) > 0 else parent
                parent.add_item(href, frag, text)
                current_depth += 1
            else:
                delta = current_depth - depth
                while delta > 0 and parent.parent is not None:
                    parent = parent.parent
                    delta -= 1
                parent.add_item(href, frag, text)
                current_depth = depth
        return ans