#!/usr/local/bin/python3.8
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai


__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import copy, logging
from functools import partial
from collections import defaultdict, namedtuple
from io import BytesIO
from struct import pack

import css_parser
from css_parser.css import CSSRule
from lxml import etree

from calibre import isbytestring, force_unicode
from calibre.ebooks.mobi.utils import (create_text_record, to_base,
        is_guide_ref_start)
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
        extract, XHTML, urlnormalize)
from calibre.ebooks.oeb.normalize_css import condense_sheet
from calibre.ebooks.oeb.parse_utils import barename
from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex,
        ChunkIndex, GuideIndex, NonLinearNCXIndex)
from calibre.ebooks.mobi.writer8.mobi import KF8Book
from calibre.ebooks.mobi.writer8.tbs import apply_trailing_byte_sequences
from calibre.ebooks.mobi.writer8.toc import TOCAdder
from polyglot.builtins import iteritems

# Media types whose manifest data is treated as a parsed XML tree
XML_DOCS = OEB_DOCS | {SVG_MIME}

# References to record numbers in KF8 are stored as base-32 encoded integers,
# with 4 digits
to_ref = partial(to_base, base=32, min_num_digits=4)


def _in_table(elem):
    ''' Return True if any ancestor of `elem` is a <table> element.

    Walks the parent chain iteratively, so arbitrarily deep markup cannot
    exhaust the interpreter's recursion limit. '''
    parent = elem.getparent()
    while parent is not None:
        if barename(parent.tag).lower() == 'table':
            return True
        parent = parent.getparent()
    return False


class KF8Writer:

    ''' Build the KF8 text flows, text records and indices for an OEB book.

    All the work is done in the constructor; the results are left on the
    instance as attributes (``records``, ``flows``, ``fdst_records``,
    ``skel_records``, ``chunk_records``, ``ncx_records``, ``guide_records``
    and so on) for the consumer of the writer to read. '''

    def __init__(self, oeb, opts, resources):
        ''' Run the complete KF8 generation pipeline.

        :param oeb: The OEB book. Its ``log``, ``manifest``, ``spine``,
            ``toc`` and ``guide`` attributes are used.
        :param opts: Conversion options. ``dont_compress``, ``expand_css``
            and ``mobi_periodical`` are read, along with the optional
            ``mobi_passthrough``.
        :param resources: Resource container. Its ``item_map``, ``records``
            and ``mime_map`` attributes are used to build resource links.
        '''
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.compress = not self.opts.dont_compress
        self.has_tbs = False
        self.log.info('Creating KF8 output')

        # Create an inline ToC if one does not already exist
        self.toc_adder = TOCAdder(oeb, opts)
        self.used_images = set()
        self.resources = resources
        self.flows = [None]  # First flow item is reserved for the text
        self.records = [None]  # Placeholder for zeroth record

        self.log('\tGenerating KF8 markup...')
        self.dup_data()
        self.cleanup_markup()
        self.replace_resource_links()
        self.extract_css_into_flows()
        self.extract_svg_into_flows()
        self.replace_internal_links_with_placeholders()
        self.insert_aid_attributes()
        self.chunk_it_up()
        # Dump the cloned data as it is no longer needed
        del self._data_cache
        self.create_text_records()
        self.log('\tCreating indices...')
        self.create_fdst_records()
        self.create_indices()
        self.create_guide()
        # We do not want to use this ToC for MOBI 6, so remove it
        self.toc_adder.remove_generated_toc()

    def dup_data(self):
        ''' Duplicate data so that any changes we make to markup/CSS only
        affect KF8 output and not MOBI 6 output '''
        self._data_cache = {}
        # Suppress css_parser logging output as it is duplicated anyway earlier
        # in the pipeline
        css_parser.log.setLevel(logging.CRITICAL)
        for item in self.oeb.manifest:
            if item.media_type in XML_DOCS:
                self._data_cache[item.href] = copy.deepcopy(item.data)
            elif item.media_type in OEB_STYLES:
                # I can't figure out how to make an efficient copy of the
                # in-memory CSSStylesheet, as deepcopy doesn't work (raises an
                # exception)
                self._data_cache[item.href] = css_parser.parseString(
                        item.data.cssText, validate=False)

    def data(self, item):
        ''' Return the private copy of the data for `item` if one was made
        by :meth:`dup_data`, otherwise the item's own data. '''
        return self._data_cache.get(item.href, item.data)

    def cleanup_markup(self):
        ''' Strip markup from the spine documents that is either useless or
        would clash with the attributes this writer generates. '''
        for item in self.oeb.spine:
            root = self.data(item)

            # Remove empty script tags as they are pointless
            for tag in XPath('//h:script')(root):
                if not tag.text and not tag.get('src', False):
                    tag.getparent().remove(tag)

            # Remove [ac]id attributes as they are used by this code for anchor
            # to offset mapping
            for tag in XPath('//*[@aid or @cid]')(root):
                tag.attrib.pop('aid', None)
                tag.attrib.pop('cid', None)

    def replace_resource_links(self):
        ''' Replace links to resources (raster images/fonts) with pointers to
        the MOBI record containing the resource. The pointers are of the form:
        kindle:embed:XXXX?mime=image/* The ?mime= is apparently optional and
        not used for fonts. '''

        def pointer(item, oref):
            # Map a reference found in `item` to a kindle:embed: pointer.
            # References that are not known resources are returned unchanged.
            ref = urlnormalize(item.abshref(oref))
            idx = self.resources.item_map.get(ref, None)
            if idx is not None:
                # Anything whose record does not start with FONT is treated
                # as an image
                is_image = self.resources.records[idx-1][:4] not in {b'FONT'}
                idx = to_ref(idx)
                if is_image:
                    self.used_images.add(ref)
                    return 'kindle:embed:%s?mime=%s'%(idx,
                            self.resources.mime_map[ref])
                else:
                    return 'kindle:embed:%s'%idx
            return oref

        for item in self.oeb.manifest:

            if item.media_type in XML_DOCS:
                root = self.data(item)
                for tag in XPath('//h:img|//svg:image')(root):
                    for attr, ref in iteritems(tag.attrib):
                        # Compare on the local name so that namespaced
                        # attributes such as xlink:href are matched as well
                        if attr.split('}')[-1].lower() in {'src', 'href'}:
                            tag.attrib[attr] = pointer(item, ref)

                for tag in XPath('//h:style')(root):
                    if tag.text:
                        sheet = css_parser.parseString(tag.text, validate=False)
                        replacer = partial(pointer, item)
                        css_parser.replaceUrls(sheet, replacer,
                                ignoreImportRules=True)
                        repl = sheet.cssText
                        if isbytestring(repl):
                            repl = repl.decode('utf-8')
                        tag.text = '\n'+ repl + '\n'

            elif item.media_type in OEB_STYLES:
                sheet = self.data(item)
                replacer = partial(pointer, item)
                css_parser.replaceUrls(sheet, replacer, ignoreImportRules=True)

    def extract_css_into_flows(self):
        ''' Move every stylesheet into its own flow.

        Manifest stylesheets and the contents of inline <style> tags are
        appended to ``self.flows``. <link> hrefs and @import rules that
        refer to them are rewritten to ``kindle:flow:`` pointers, and each
        non-empty <style> tag is replaced by a <link> to its flow. '''
        inlines = defaultdict(list)  # Ensure identical <style>s not repeated
        sheets = {}
        passthrough = getattr(self.opts, 'mobi_passthrough', False)

        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                sheet = self.data(item)
                if not passthrough and not self.opts.expand_css and hasattr(item.data, 'cssText'):
                    condense_sheet(sheet)
                sheets[item.href] = len(self.flows)
                self.flows.append(sheet)

        def fix_import_rules(item, sheet):
            # Point the @import rules of `sheet` at the flows of the
            # stylesheets they import. Relative hrefs are resolved against
            # `item`. Returns True if any rule was changed.
            changed = False
            for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
                if rule.href:
                    href = item.abshref(rule.href)
                    idx = sheets.get(href, None)
                    if idx is not None:
                        idx = to_ref(idx)
                        rule.href = 'kindle:flow:%s?mime=text/css'%idx
                        changed = True
            return changed

        for item in self.oeb.spine:
            root = self.data(item)

            for link in XPath('//h:link[@href]')(root):
                href = item.abshref(link.get('href'))
                idx = sheets.get(href, None)
                if idx is not None:
                    idx = to_ref(idx)
                    link.set('href', 'kindle:flow:%s?mime=text/css'%idx)

            for tag in XPath('//h:style')(root):
                p = tag.getparent()
                idx = p.index(tag)
                raw = tag.text
                if not raw or not raw.strip():
                    extract(tag)
                    continue
                sheet = css_parser.parseString(raw, validate=False)
                if fix_import_rules(item, sheet):
                    raw = force_unicode(sheet.cssText, 'utf-8')

                repl = etree.Element(XHTML('link'), type='text/css',
                        rel='stylesheet')
                repl.tail='\n'
                p.insert(idx, repl)
                extract(tag)
                # The href is filled in below, once the flow index for this
                # stylesheet text is known
                inlines[raw].append(repl)

        for raw, elems in iteritems(inlines):
            idx = to_ref(len(self.flows))
            self.flows.append(raw)
            for link in elems:
                link.set('href', 'kindle:flow:%s?mime=text/css'%idx)

        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                sheet = self.data(item)
                if hasattr(sheet, 'cssRules'):
                    fix_import_rules(item, sheet)

        # Serialize the parsed sheets only now, after all rewriting is done
        for i, sheet in enumerate(tuple(self.flows)):
            if hasattr(sheet, 'cssText'):
                self.flows[i] = force_unicode(sheet.cssText, 'utf-8')

    def extract_svg_into_flows(self):
        ''' Move SVG content into flows.

        Standalone SVG manifest items and inline <svg> elements are
        serialized and appended to ``self.flows``. Inline <svg> elements are
        replaced by <img> tags, and <img> tags that refer to SVG manifest
        items get their src rewritten to a ``kindle:flow:`` pointer. '''
        images = {}

        for item in self.oeb.manifest:
            if item.media_type == SVG_MIME:
                data = self.data(item)
                images[item.href] = len(self.flows)
                self.flows.append(etree.tostring(data, encoding='UTF-8',
                    with_tail=True, xml_declaration=True))

        for item in self.oeb.spine:
            root = self.data(item)

            for svg in XPath('//svg:svg')(root):
                raw = etree.tostring(svg, encoding='unicode', with_tail=False)
                idx = len(self.flows)
                self.flows.append(raw)
                p = svg.getparent()
                pos = p.index(svg)
                img = etree.Element(XHTML('img'),
                        src="kindle:flow:%s?mime=image/svg+xml"%to_ref(idx))
                p.insert(pos, img)
                extract(svg)

            for img in XPath('//h:img[@src]')(root):
                src = img.get('src')
                abshref = item.abshref(src)
                idx = images.get(abshref, None)
                if idx is not None:
                    img.set('src', 'kindle:flow:%s?mime=image/svg+xml'%
                            to_ref(idx))

    def replace_internal_links_with_placeholders(self):
        ''' Replace links between spine documents with unique placeholders.

        ``self.link_map`` maps each placeholder to the ``(href, fragment)``
        it stands for. The placeholders are resolved later, see
        :meth:`chunk_it_up`. '''
        self.link_map = {}
        count = 0
        hrefs = {item.href for item in self.oeb.spine}
        for item in self.oeb.spine:
            root = self.data(item)

            for a in XPath('//h:a[@href]')(root):
                # Every link is counted, not just the ones that are replaced
                count += 1
                ref = item.abshref(a.get('href'))
                href, _, frag = ref.partition('#')
                try:
                    href = urlnormalize(href)
                except ValueError:
                    # a non utf-8 quoted url? Since we cannot interpret it, pass it through.
                    pass
                if href in hrefs:
                    placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count)
                    self.link_map[placeholder] = (href, frag)
                    a.set('href', placeholder)

    def insert_aid_attributes(self):
        ''' Tag elements with the aid/cid attributes used for anchor to
        offset mapping.

        ``self.id_map`` maps ``(href, id)`` to the aid or cid assigned to
        the element with that id. The key ``(href, '')`` maps to the aid of
        the <body> of the document. '''
        self.id_map = {}
        cid = 0
        for i, item in enumerate(self.oeb.spine):
            root = self.data(item)
            # Give each spine item its own range of aid values
            aidbase = i * int(1e6)
            j = 0

            for tag in root.iterdescendants(etree.Element):
                id_ = tag.attrib.get('id', None)
                if id_ is None and tag.tag == XHTML('a'):
                    # Can happen during tweaking
                    id_ = tag.attrib.get('name', None)
                    if id_ is not None:
                        tag.attrib['id'] = id_
                tagname = barename(tag.tag).lower()
                if id_ is not None or tagname in aid_able_tags:
                    if tagname == 'table' or _in_table(tag):
                        # The Kindle renderer barfs on large tables that have
                        # aid on any of their tags. See
                        # https://bugs.launchpad.net/bugs/1489495
                        if id_:
                            cid += 1
                            val = 'c%d' % cid
                            self.id_map[(item.href, id_)] = val
                            tag.set('cid', val)
                    else:
                        aid = to_base(aidbase + j, base=32)
                        tag.set('aid', aid)
                        if tag.tag == XHTML('body'):
                            self.id_map[(item.href, '')] = aid
                        if id_ is not None:
                            self.id_map[(item.href, id_)] = aid

                        j += 1

    def chunk_it_up(self):
        ''' Run the Chunker over the markup.

        The link placeholders are mapped to aids, falling back to the aid of
        the target document's body when the fragment is not known. The
        chunker's ``skel_table``, ``chunk_table`` and ``aid_offset_map`` are
        copied onto this object and its text becomes flow zero. '''
        placeholder_map = {}
        for placeholder, x in iteritems(self.link_map):
            href, frag = x
            aid = self.id_map.get(x, None)
            if aid is None:
                aid = self.id_map.get((href, ''))
            placeholder_map[placeholder] = aid
        chunker = Chunker(self.oeb, self.data, placeholder_map)

        for x in ('skel_table', 'chunk_table', 'aid_offset_map'):
            setattr(self, x, getattr(chunker, x))

        self.flows[0] = chunker.text

    def create_text_records(self):
        ''' Encode the flows as UTF-8, concatenate them and split the result
        into (optionally compressed) text records appended to
        ``self.records``.

        Also sets ``text_length``, ``uncompressed_record_lengths``,
        ``last_text_record_idx`` and ``first_non_text_record_idx``. '''
        self.flows = [x.encode('utf-8') if isinstance(x, str) else x for x
                in self.flows]
        text = b''.join(self.flows)
        self.text_length = len(text)
        text = BytesIO(text)
        nrecords = 0
        records_size = 0
        self.uncompressed_record_lengths = []

        if self.compress:
            self.log.info('\tCompressing markup...')

        while text.tell() < self.text_length:
            data, overlap = create_text_record(text)
            self.uncompressed_record_lengths.append(len(data))
            if self.compress:
                data = compress_doc(data)

            # Each record ends with the overlap bytes followed by a single
            # byte giving the length of the overlap
            data += overlap
            data += pack(b'>B', len(overlap))

            self.records.append(data)
            records_size += len(data)
            nrecords += 1

        self.last_text_record_idx = nrecords
        self.first_non_text_record_idx = nrecords + 1
        # Pad so that the next records starts at a 4 byte boundary. The
        # padding needed is the distance to the next multiple of four, not
        # the remainder itself.
        remainder = records_size % 4
        if remainder:
            self.records.append(b'\x00' * (4 - remainder))
            self.first_non_text_record_idx += 1

    def create_fdst_records(self):
        ''' Build the FDST record, which lists the (start, end) byte offsets
        of every flow in the concatenated text. '''
        FDST = namedtuple('Flow', 'start end')
        entries = []
        self.fdst_table = []
        for i, flow in enumerate(self.flows):
            # Flows are contiguous: each one starts where the previous ended
            start = 0 if i == 0 else self.fdst_table[-1].end
            self.fdst_table.append(FDST(start, start + len(flow)))
            entries.extend(self.fdst_table[-1])
        rec = (b'FDST' + pack(b'>LL', 12, len(self.fdst_table)) +
                pack(b'>%dL'%len(entries), *entries))
        self.fdst_records = [rec]
        self.fdst_count = len(self.fdst_table)

    def create_indices(self):
        ''' Create the skeleton, chunk and NCX index records.

        If the book has no ToC entries, ``self.ncx_records`` is left empty.
        Also sets ``self.has_tbs`` from the result of applying trailing byte
        sequences to the text records. '''
        self.skel_records = SkelIndex(self.skel_table)()
        self.chunk_records = ChunkIndex(self.chunk_table)()
        self.ncx_records = []
        toc = self.oeb.toc
        entries = []
        is_periodical = self.opts.mobi_periodical
        if toc.count() < 1:
            self.log.warn('Document has no ToC, MOBI will have no NCX index')
            return

        # Flatten the ToC into a depth first list
        fl = toc.iterdescendants()
        for i, item in enumerate(fl):
            entry = {'id': id(item), 'index': i, 'label':(item.title or
                _('Unknown')), 'children':[]}
            entry['depth'] = getattr(item, 'ncx_hlvl', 0)
            p = getattr(item, 'ncx_parent', None)
            if p is not None:
                entry['parent_id'] = p
            # Record parent and depth on the children, which are visited
            # later in this same loop
            for child in item:
                child.ncx_parent = entry['id']
                child.ncx_hlvl = entry['depth'] + 1
                entry['children'].append(id(child))
            if is_periodical:
                if item.author:
                    entry['author'] = item.author
                if item.description:
                    entry['description'] = item.description
            entries.append(entry)
            href = item.href or ''
            href, frag = href.partition('#')[0::2]
            aid = self.id_map.get((href, frag), None)
            if aid is None:
                aid = self.id_map.get((href, ''), None)
            if aid is None:
                # Unknown target, point at the first chunk
                pos, fid = 0, 0
                chunk = self.chunk_table[pos]
                offset = chunk.insert_pos + fid
            else:
                pos, fid, offset = self.aid_offset_map[aid]

            entry['pos_fid'] = (pos, fid)
            entry['offset'] = offset

        # The Kindle requires entries to be sorted by (depth, playorder)
        # However, I cannot figure out how to deal with non linear ToCs, i.e.
        # ToCs whose nth entry at depth d has an offset after its n+k entry at
        # the same depth, so we sort on (depth, offset) instead. This re-orders
        # the ToC to be linear. A non-linear ToC causes section to section
        # jumping to not work. kindlegen somehow handles non-linear tocs, but I
        # cannot figure out how.
        entries = sorted(entries,
                key=lambda entry: (entry['depth'], entry['offset']))
        # Always False as we are using the linearized entries. A ToC is
        # non-linear when sorting on (depth, index) gives a different order.
        is_non_linear = False

        if is_non_linear:
            for entry in entries:
                entry['kind'] = 'chapter'

        for i, entry in enumerate(entries):
            entry['index'] = i
        id_to_index = {entry['id']:entry['index'] for entry in entries}

        # Write the hierarchical information
        for entry in entries:
            children = entry.pop('children')
            if children:
                entry['first_child'] = id_to_index[children[0]]
                entry['last_child'] = id_to_index[children[-1]]
            if 'parent_id' in entry:
                entry['parent'] = id_to_index[entry.pop('parent_id')]

        # Write the lengths
        def get_next_start(entry):
            # An entry ends where the next entry at the same or a shallower
            # depth starts, or at the end of the text flow
            enders = [e['offset'] for e in entries if e['depth'] <=
                    entry['depth'] and e['offset'] > entry['offset']]
            if enders:
                return min(enders)
            return len(self.flows[0])
        for entry in entries:
            entry['length'] = get_next_start(entry) - entry['offset']

        self.has_tbs = apply_trailing_byte_sequences(entries, self.records,
                self.uncompressed_record_lengths)
        idx_type = NonLinearNCXIndex if is_non_linear else NCXIndex
        self.ncx_records = idx_type(entries)()

    def create_guide(self):
        ''' Create the guide index records from the OEB guide.

        Guide references whose target cannot be mapped to an aid are
        skipped. ``self.start_offset`` is set to the offset of the reference
        that marks the start of the book, if there is one. '''
        self.start_offset = None
        self.guide_table = []
        self.guide_records = []
        GuideRef = namedtuple('GuideRef', 'title type pos_fid')
        for ref in self.oeb.guide.values():
            href, frag = ref.href.partition('#')[0::2]
            aid = self.id_map.get((href, frag), None)
            if aid is None:
                aid = self.id_map.get((href, ''))
            if aid is None:
                continue
            pos, fid, offset = self.aid_offset_map[aid]
            if is_guide_ref_start(ref):
                self.start_offset = offset
            self.guide_table.append(GuideRef(ref.title or
                _('Unknown'), ref.type, (pos, fid)))

        if self.guide_table:
            self.guide_table.sort(key=lambda x:x.type)  # Needed by the Kindle
            self.guide_records = GuideIndex(self.guide_table)()


def create_kf8_book(oeb, opts, resources, for_joint=False):
    ''' Generate the KF8 data for `oeb` and wrap it in a KF8Book.

    `for_joint` is passed through to KF8Book unchanged. '''
    writer = KF8Writer(oeb, opts, resources)
    return KF8Book(writer, for_joint=for_joint)