1#!/usr/local/bin/python3.8
2# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
3
4
5__license__   = 'GPL v3'
6__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
7__docformat__ = 'restructuredtext en'
8
9import copy, logging
10from functools import partial
11from collections import defaultdict, namedtuple
12from io import BytesIO
13from struct import pack
14
15import css_parser
16from css_parser.css import CSSRule
17from lxml import etree
18
19from calibre import isbytestring, force_unicode
20from calibre.ebooks.mobi.utils import (create_text_record, to_base,
21        is_guide_ref_start)
22from calibre.ebooks.compression.palmdoc import compress_doc
23from calibre.ebooks.oeb.base import (OEB_DOCS, OEB_STYLES, SVG_MIME, XPath,
24        extract, XHTML, urlnormalize)
25from calibre.ebooks.oeb.normalize_css import condense_sheet
26from calibre.ebooks.oeb.parse_utils import barename
27from calibre.ebooks.mobi.writer8.skeleton import Chunker, aid_able_tags, to_href
28from calibre.ebooks.mobi.writer8.index import (NCXIndex, SkelIndex,
29        ChunkIndex, GuideIndex, NonLinearNCXIndex)
30from calibre.ebooks.mobi.writer8.mobi import KF8Book
31from calibre.ebooks.mobi.writer8.tbs import apply_trailing_byte_sequences
32from calibre.ebooks.mobi.writer8.toc import TOCAdder
33from polyglot.builtins import iteritems
34
# Media types whose manifest items are handled as XML documents by the KF8
# writer: the OEB document types plus SVG
XML_DOCS = OEB_DOCS | {SVG_MIME}

# References to record numbers in KF8 are stored as base-32 encoded integers,
# with 4 digits
to_ref = partial(to_base, base=32, min_num_digits=4)
40
41
class KF8Writer:
    '''
    Build the markup, flows, text records and indices of a KF8 book from an
    OEB book.

    All of the work is done by the constructor, which runs the processing
    steps in a fixed order. The results are left on the instance as
    attributes (``records``, ``flows``, ``fdst_records``, ``skel_records``,
    ``chunk_records``, ``ncx_records``, ``guide_records``, ...).
    '''

    def __init__(self, oeb, opts, resources):
        '''
        Run the complete KF8 generation pipeline.

        :param oeb: The OEB book. Its ``log``, ``logger``, ``manifest``,
            ``spine``, ``toc`` and ``guide`` attributes are used.
        :param opts: Conversion options. ``dont_compress``, ``expand_css``
            and ``mobi_periodical`` are read, as is the optional
            ``mobi_passthrough``.
        :param resources: Object exposing ``item_map``, ``records`` and
            ``mime_map``, used to turn links to images/fonts into record
            pointers.
        '''
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.compress = not self.opts.dont_compress
        self.has_tbs = False
        self.log.info('Creating KF8 output')

        # Create an inline ToC if one does not already exist
        self.toc_adder = TOCAdder(oeb, opts)
        self.used_images = set()
        self.resources = resources
        self.flows = [None]  # First flow item is reserved for the text
        self.records = [None]  # Placeholder for zeroth record

        # The order of the following steps matters: each one works on the
        # output of the previous ones
        self.log('\tGenerating KF8 markup...')
        self.dup_data()
        self.cleanup_markup()
        self.replace_resource_links()
        self.extract_css_into_flows()
        self.extract_svg_into_flows()
        self.replace_internal_links_with_placeholders()
        self.insert_aid_attributes()
        self.chunk_it_up()
        # Dump the cloned data as it is no longer needed. After this point
        # self.data() can no longer be called.
        del self._data_cache
        self.create_text_records()
        self.log('\tCreating indices...')
        self.create_fdst_records()
        self.create_indices()
        self.create_guide()
        # We do not want to use this ToC for MOBI 6, so remove it
        self.toc_adder.remove_generated_toc()

    def dup_data(self):
        ''' Duplicate data so that any changes we make to markup/CSS only
        affect KF8 output and not MOBI 6 output '''
        self._data_cache = {}
        # Suppress css_parser logging output as it is duplicated anyway earlier
        # in the pipeline
        css_parser.log.setLevel(logging.CRITICAL)
        for item in self.oeb.manifest:
            if item.media_type in XML_DOCS:
                self._data_cache[item.href] = copy.deepcopy(item.data)
            elif item.media_type in OEB_STYLES:
                # I can't figure out how to make an efficient copy of the
                # in-memory CSSStylesheet, as deepcopy doesn't work (raises an
                # exception)
                self._data_cache[item.href] = css_parser.parseString(
                        item.data.cssText, validate=False)

    def data(self, item):
        ''' Return the private copy of the data of ``item`` created by
        :meth:`dup_data`, or ``item.data`` itself if no copy was made for
        it. '''
        return self._data_cache.get(item.href, item.data)

    def cleanup_markup(self):
        ''' Remove empty <script> tags and any pre-existing aid/cid
        attributes from the (copied) markup of every spine item. '''
        for item in self.oeb.spine:
            root = self.data(item)

            # Remove empty script tags as they are pointless. lxml's
            # parent.remove() would also throw away the tail text that follows
            # the tag, so use extract(), the helper used for every other
            # element removal in this class, which is expected to re-attach
            # the tail.
            for tag in XPath('//h:script')(root):
                if not tag.text and not tag.get('src', False):
                    extract(tag)

            # Remove [ac]id attributes as they are used by this code for anchor
            # to offset mapping
            for tag in XPath('//*[@aid or @cid]')(root):
                tag.attrib.pop('aid', None)
                tag.attrib.pop('cid', None)

    def replace_resource_links(self):
        ''' Replace links to resources (raster images/fonts) with pointers to
        the MOBI record containing the resource. The pointers are of the form:
        kindle:embed:XXXX?mime=image/* The ?mime= is apparently optional and
        not used for fonts. '''

        def pointer(item, oref):
            # Map the link oref (relative to item) to a kindle:embed pointer.
            # Links that do not refer to a known resource are returned
            # unchanged.
            ref = urlnormalize(item.abshref(oref))
            idx = self.resources.item_map.get(ref, None)
            if idx is None:
                return oref
            # Anything that is not a FONT record is treated as an image
            is_image = self.resources.records[idx-1][:4] != b'FONT'
            idx = to_ref(idx)
            if is_image:
                self.used_images.add(ref)
                return 'kindle:embed:%s?mime=%s'%(idx,
                        self.resources.mime_map[ref])
            return 'kindle:embed:%s'%idx

        for item in self.oeb.manifest:
            replacer = partial(pointer, item)

            if item.media_type in XML_DOCS:
                root = self.data(item)
                for tag in XPath('//h:img|//svg:image')(root):
                    for attr, ref in iteritems(tag.attrib):
                        # Strip any namespace so that xlink:href also matches
                        if attr.split('}')[-1].lower() in {'src', 'href'}:
                            tag.attrib[attr] = pointer(item, ref)

                for tag in XPath('//h:style')(root):
                    if tag.text:
                        sheet = css_parser.parseString(tag.text, validate=False)
                        css_parser.replaceUrls(sheet, replacer,
                                ignoreImportRules=True)
                        repl = sheet.cssText
                        if isbytestring(repl):
                            repl = repl.decode('utf-8')
                        tag.text = '\n'+ repl + '\n'

            elif item.media_type in OEB_STYLES:
                sheet = self.data(item)
                css_parser.replaceUrls(sheet, replacer, ignoreImportRules=True)

    def extract_css_into_flows(self):
        ''' Move all CSS into flows. Every stylesheet in the manifest and
        every distinct inline <style> becomes an entry in ``self.flows``,
        and the <link>/<style> tags and @import rules that refer to them are
        rewritten to ``kindle:flow:XXXX?mime=text/css`` pointers. '''
        inlines = defaultdict(list)  # Ensure identical <style>s not repeated
        sheets = {}
        passthrough = getattr(self.opts, 'mobi_passthrough', False)

        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                sheet = self.data(item)
                if not passthrough and not self.opts.expand_css and hasattr(item.data, 'cssText'):
                    condense_sheet(sheet)
                sheets[item.href] = len(self.flows)
                self.flows.append(sheet)

        def fix_import_rules(sheet, item):
            # Point the @import rules of sheet (which belongs to item) at the
            # flows of the imported stylesheets. Returns True if any rule was
            # changed.
            changed = False
            for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
                if rule.href:
                    href = item.abshref(rule.href)
                    idx = sheets.get(href, None)
                    if idx is not None:
                        idx = to_ref(idx)
                        rule.href = 'kindle:flow:%s?mime=text/css'%idx
                        changed = True
            return changed

        for item in self.oeb.spine:
            root = self.data(item)

            for link in XPath('//h:link[@href]')(root):
                href = item.abshref(link.get('href'))
                idx = sheets.get(href, None)
                if idx is not None:
                    idx = to_ref(idx)
                    link.set('href', 'kindle:flow:%s?mime=text/css'%idx)

            for tag in XPath('//h:style')(root):
                p = tag.getparent()
                idx = p.index(tag)
                raw = tag.text
                if not raw or not raw.strip():
                    extract(tag)
                    continue
                sheet = css_parser.parseString(raw, validate=False)
                if fix_import_rules(sheet, item):
                    raw = force_unicode(sheet.cssText, 'utf-8')

                # Replace the <style> with a <link>; its href is filled in
                # below, once the flow number of the CSS is known
                repl = etree.Element(XHTML('link'), type='text/css',
                        rel='stylesheet')
                repl.tail='\n'
                p.insert(idx, repl)
                extract(tag)
                inlines[raw].append(repl)

        for raw, elems in iteritems(inlines):
            idx = to_ref(len(self.flows))
            self.flows.append(raw)
            for link in elems:
                link.set('href', 'kindle:flow:%s?mime=text/css'%idx)

        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                sheet = self.data(item)
                if hasattr(sheet, 'cssRules'):
                    fix_import_rules(sheet, item)

        # Serialize the stylesheet objects, now that they are fully processed
        for i, sheet in enumerate(tuple(self.flows)):
            if hasattr(sheet, 'cssText'):
                self.flows[i] = force_unicode(sheet.cssText, 'utf-8')

    def extract_svg_into_flows(self):
        ''' Move SVG into flows. Every SVG manifest item and every inline
        <svg> element in the spine becomes an entry in ``self.flows``.
        Inline <svg> elements are replaced by <img> tags and <img> tags that
        refer to SVG manifest items are re-pointed, in both cases using
        ``kindle:flow:XXXX?mime=image/svg+xml`` pointers. '''
        images = {}

        for item in self.oeb.manifest:
            if item.media_type == SVG_MIME:
                data = self.data(item)
                images[item.href] = len(self.flows)
                self.flows.append(etree.tostring(data, encoding='UTF-8',
                    with_tail=True, xml_declaration=True))

        for item in self.oeb.spine:
            root = self.data(item)

            for svg in XPath('//svg:svg')(root):
                raw = etree.tostring(svg, encoding='unicode', with_tail=False)
                idx = len(self.flows)
                self.flows.append(raw)
                p = svg.getparent()
                pos = p.index(svg)
                img = etree.Element(XHTML('img'),
                        src="kindle:flow:%s?mime=image/svg+xml"%to_ref(idx))
                p.insert(pos, img)
                extract(svg)

            for img in XPath('//h:img[@src]')(root):
                src = img.get('src')
                abshref = item.abshref(src)
                idx = images.get(abshref, None)
                if idx is not None:
                    img.set('src', 'kindle:flow:%s?mime=image/svg+xml'%
                            to_ref(idx))

    def replace_internal_links_with_placeholders(self):
        ''' Replace the href of every <a> that points into the spine with a
        unique placeholder of the form ``kindle:pos:fid:0000:off:XXXX``.
        ``self.link_map`` maps each placeholder to the ``(href, fragment)``
        it stands for. '''
        self.link_map = {}
        count = 0
        hrefs = {item.href for item in self.oeb.spine}
        for item in self.oeb.spine:
            root = self.data(item)

            for a in XPath('//h:a[@href]')(root):
                # Every link is counted, not only the internal ones
                count += 1
                ref = item.abshref(a.get('href'))
                href, _, frag = ref.partition('#')
                try:
                    href = urlnormalize(href)
                except ValueError:
                    # a non utf-8 quoted url? Since we cannot interpret it, pass it through.
                    pass
                if href in hrefs:
                    placeholder = 'kindle:pos:fid:0000:off:%s'%to_href(count)
                    self.link_map[placeholder] = (href, frag)
                    a.set('href', placeholder)

    def insert_aid_attributes(self):
        ''' Add an ``aid`` attribute to every tag that has an id or is one of
        the aid_able_tags. Tags that are, or are inside, a table get a
        ``cid`` attribute instead, and only if they have an id.
        ``self.id_map`` maps ``(href, id)`` to the aid/cid of the tag, with
        ``(href, '')`` mapping to the aid of the <body>. '''
        self.id_map = {}
        cid = 0

        def in_table(elem):
            # True if any ancestor of elem is a <table>
            p = elem.getparent()
            while p is not None:
                if barename(p.tag).lower() == 'table':
                    return True
                p = p.getparent()
            return False

        for i, item in enumerate(self.oeb.spine):
            root = self.data(item)
            # Offsetting by the spine position keeps the aids of different
            # spine items from colliding
            aidbase = i * int(1e6)
            j = 0

            for tag in root.iterdescendants(etree.Element):
                id_ = tag.attrib.get('id', None)
                if id_ is None and tag.tag == XHTML('a'):
                    # Can happen during tweaking
                    id_ = tag.attrib.get('name', None)
                    if id_ is not None:
                        tag.attrib['id'] = id_
                tagname = barename(tag.tag).lower()
                if id_ is not None or tagname in aid_able_tags:
                    if tagname == 'table' or in_table(tag):
                        # The Kindle renderer barfs on large tables that have
                        # aid on any of their tags. See
                        # https://bugs.launchpad.net/bugs/1489495
                        if id_:
                            cid += 1
                            val = 'c%d' % cid
                            self.id_map[(item.href, id_)] = val
                            tag.set('cid', val)
                    else:
                        aid = to_base(aidbase + j, base=32)
                        tag.set('aid', aid)
                        if tag.tag == XHTML('body'):
                            self.id_map[(item.href, '')] = aid
                        if id_ is not None:
                            self.id_map[(item.href, id_)] = aid

                        j += 1

    def chunk_it_up(self):
        ''' Resolve the link placeholders to aids and run the Chunker over
        the book. The chunker's ``skel_table``, ``chunk_table`` and
        ``aid_offset_map`` are copied onto this object and its text becomes
        the first flow. '''
        placeholder_map = {}
        for placeholder, x in iteritems(self.link_map):
            href, frag = x
            aid = self.id_map.get(x, None)
            if aid is None:
                # Unknown anchor, fall back to the top of the file. This can
                # still be None if the file has no entry in id_map.
                aid = self.id_map.get((href, ''))
            placeholder_map[placeholder] = aid
        chunker = Chunker(self.oeb, self.data, placeholder_map)

        for x in ('skel_table', 'chunk_table', 'aid_offset_map'):
            setattr(self, x, getattr(chunker, x))

        self.flows[0] = chunker.text

    @staticmethod
    def _alignment_padding(size, boundary=4):
        ''' Return the zero bytes needed to round ``size`` up to the next
        multiple of ``boundary`` (an empty bytestring if it already is
        one). '''
        return b'\x00' * (-size % boundary)

    def create_text_records(self):
        ''' Encode all flows as UTF-8, concatenate them and split the result
        into text records, which are appended to ``self.records``.

        Each record consists of the (optionally compressed) record data,
        followed by the overlap bytes returned by create_text_record and a
        single byte holding the number of overlap bytes. A record of zero
        bytes is appended if needed to bring the total size of the text
        records up to a multiple of four.

        Sets ``text_length``, ``uncompressed_record_lengths``,
        ``last_text_record_idx`` and ``first_non_text_record_idx``. '''
        self.flows = [x.encode('utf-8') if isinstance(x, str) else x for x
                in self.flows]
        text = b''.join(self.flows)
        self.text_length = len(text)
        text = BytesIO(text)
        nrecords = 0
        records_size = 0
        self.uncompressed_record_lengths = []

        if self.compress:
            self.oeb.logger.info('\tCompressing markup...')

        while text.tell() < self.text_length:
            data, overlap = create_text_record(text)
            self.uncompressed_record_lengths.append(len(data))
            if self.compress:
                data = compress_doc(data)

            data += overlap
            data += pack(b'>B', len(overlap))

            self.records.append(data)
            records_size += len(data)
            nrecords += 1

        self.last_text_record_idx = nrecords
        self.first_non_text_record_idx = nrecords + 1
        # Pad so that the next records starts at a 4 byte boundary. The
        # number of bytes needed is (-records_size % 4); padding with
        # (records_size % 4) bytes misses the boundary for remainders 1 and 3.
        padding = self._alignment_padding(records_size)
        if padding:
            self.records.append(padding)
            self.first_non_text_record_idx += 1

    def create_fdst_records(self):
        ''' Create the FDST record, which stores the (start, end) offsets of
        every flow in the concatenated text. Must be called after
        :meth:`create_text_records` as it needs the flows to be bytestrings.

        Sets ``fdst_table``, ``fdst_records`` and ``fdst_count``. '''
        FDST = namedtuple('Flow', 'start end')
        entries = []
        self.fdst_table = []
        for i, flow in enumerate(self.flows):
            # Every flow starts where the previous one ended
            start = 0 if i == 0 else self.fdst_table[-1].end
            self.fdst_table.append(FDST(start, start + len(flow)))
            entries.extend(self.fdst_table[-1])
        # Header: the magic, the constant 12 (the size of this header, which
        # presumably serves as the offset to the entries) and the number of
        # flows
        rec = (b'FDST' + pack(b'>LL', 12, len(self.fdst_table)) +
                pack(b'>%dL'%len(entries), *entries))
        self.fdst_records = [rec]
        self.fdst_count = len(self.fdst_table)

    def create_indices(self):
        ''' Create the skeleton, chunk and NCX index records.

        Sets ``skel_records``, ``chunk_records`` and ``ncx_records``. The
        latter is left empty if the book has no ToC. Also sets ``has_tbs``
        from the result of apply_trailing_byte_sequences. '''
        self.skel_records = SkelIndex(self.skel_table)()
        self.chunk_records = ChunkIndex(self.chunk_table)()
        self.ncx_records = []
        toc = self.oeb.toc
        entries = []
        is_periodical = self.opts.mobi_periodical
        if toc.count() < 1:
            self.log.warn('Document has no ToC, MOBI will have no NCX index')
            return

        # Flatten the ToC into a depth first list
        fl = toc.iterdescendants()
        for i, item in enumerate(fl):
            entry = {'id': id(item), 'index': i, 'label':(item.title or
                _('Unknown')), 'children':[]}
            # ncx_hlvl and ncx_parent are set on an item when its parent is
            # processed (see the loop over the children below), so this
            # relies on parents being visited before their children
            entry['depth'] = getattr(item, 'ncx_hlvl', 0)
            p = getattr(item, 'ncx_parent', None)
            if p is not None:
                entry['parent_id'] = p
            for child in item:
                child.ncx_parent = entry['id']
                child.ncx_hlvl = entry['depth'] + 1
                entry['children'].append(id(child))
            if is_periodical:
                if item.author:
                    entry['author'] = item.author
                if item.description:
                    entry['description'] = item.description
            entries.append(entry)
            href = item.href or ''
            href, frag = href.partition('#')[0::2]
            aid = self.id_map.get((href, frag), None)
            if aid is None:
                aid = self.id_map.get((href, ''), None)
            if aid is None:
                # Unknown target, point the entry at the first chunk
                pos, fid = 0, 0
                chunk = self.chunk_table[pos]
                offset = chunk.insert_pos + fid
            else:
                pos, fid, offset = self.aid_offset_map[aid]

            entry['pos_fid'] = (pos, fid)
            entry['offset'] = offset

        # The Kindle requires entries to be sorted by (depth, playorder)
        # However, I cannot figure out how to deal with non linear ToCs, i.e.
        # ToCs whose nth entry at depth d has an offset after its n+k entry at
        # the same depth, so we sort on (depth, offset) instead. This re-orders
        # the ToC to be linear. A non-linear ToC causes section to section
        # jumping to not work. kindlegen somehow handles non-linear tocs, but I
        # cannot figure out how.
        original = sorted(entries,
                key=lambda entry: (entry['depth'], entry['index']))
        linearized = sorted(entries,
                key=lambda entry: (entry['depth'], entry['offset']))
        is_non_linear = original != linearized
        entries = linearized
        # The detection result above is deliberately discarded, so the
        # non-linear code paths below are currently never taken
        is_non_linear = False  # False as we are using the linearized entries

        if is_non_linear:
            for entry in entries:
                entry['kind'] = 'chapter'

        # Renumber the entries to match their new order
        for i, entry in enumerate(entries):
            entry['index'] = i
        id_to_index = {entry['id']:entry['index'] for entry in entries}

        # Write the hierarchical information
        for entry in entries:
            children = entry.pop('children')
            if children:
                entry['first_child'] = id_to_index[children[0]]
                entry['last_child'] = id_to_index[children[-1]]
            if 'parent_id' in entry:
                entry['parent'] = id_to_index[entry.pop('parent_id')]

        # Write the lengths
        def get_next_start(entry):
            # An entry extends up to the nearest following entry at the same
            # or a shallower depth, or to the end of the text if there is none
            enders = [e['offset'] for e in entries if e['depth'] <=
                    entry['depth'] and e['offset'] > entry['offset']]
            if enders:
                return min(enders)
            return len(self.flows[0])
        for entry in entries:
            entry['length'] = get_next_start(entry) - entry['offset']

        self.has_tbs = apply_trailing_byte_sequences(entries, self.records,
                self.uncompressed_record_lengths)
        idx_type = NonLinearNCXIndex if is_non_linear else NCXIndex
        self.ncx_records = idx_type(entries)()

    def create_guide(self):
        ''' Create the guide index records from the guide of the book.
        Guide references whose target cannot be resolved to an aid are
        skipped.

        Sets ``guide_table``, ``guide_records`` and ``start_offset``, the
        latter being the offset of the reference for which
        is_guide_ref_start is true, or None if there is no such
        reference. '''
        self.start_offset = None
        self.guide_table = []
        self.guide_records = []
        GuideRef = namedtuple('GuideRef', 'title type pos_fid')
        for ref in self.oeb.guide.values():
            href, frag = ref.href.partition('#')[0::2]
            aid = self.id_map.get((href, frag), None)
            if aid is None:
                aid = self.id_map.get((href, ''))
            if aid is None:
                continue
            pos, fid, offset = self.aid_offset_map[aid]
            if is_guide_ref_start(ref):
                self.start_offset = offset
            self.guide_table.append(GuideRef(ref.title or
                _('Unknown'), ref.type, (pos, fid)))

        if self.guide_table:
            self.guide_table.sort(key=lambda x:x.type)  # Needed by the Kindle
            self.guide_records = GuideIndex(self.guide_table)()
494
495
def create_kf8_book(oeb, opts, resources, for_joint=False):
    '''
    Run the KF8 writer over ``oeb`` and wrap the result in a KF8Book.

    :param oeb: The OEB book, passed through to KF8Writer
    :param opts: The conversion options, passed through to KF8Writer
    :param resources: The book resources, passed through to KF8Writer
    :param for_joint: Passed through unchanged to KF8Book
    :return: The KF8Book built from the writer
    '''
    return KF8Book(KF8Writer(oeb, opts, resources), for_joint=for_joint)
499