1#!/usr/local/bin/python3.8
2# vim:fileencoding=utf-8
3
4
5__license__ = 'GPL v3'
6__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
7
8import copy, os, re
9from polyglot.builtins import string_or_bytes
10
11from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS
12from calibre.ebooks.oeb.polish.errors import MalformedMarkup
13from calibre.ebooks.oeb.polish.toc import node_from_loc
14from calibre.ebooks.oeb.polish.replace import LinkRebaser
15from polyglot.builtins import iteritems
16from polyglot.urllib import urlparse
17
18
class AbortError(ValueError):
    '''Raised when a requested split or merge cannot be carried out.'''
21
22
def in_table(node):
    '''
    Return True if ``node`` or any of its ancestors is a namespaced
    ``table`` element (tag ending in ``}table``), False otherwise.
    '''
    cursor = node
    # Climb until we either run off the top of the tree or reach a table
    while cursor is not None and not cursor.tag.endswith('}table'):
        cursor = cursor.getparent()
    return cursor is not None
29
30
def adjust_split_point(split_point, log):
    '''
    Walk the split point up through its ancestors for as long as nothing
    precedes it inside the parent. This handles the common case:
    <div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
    h2, where the split should really happen at the div.

    :param split_point: The Element originally chosen as the split point
    :param log: Logger, a debug message is emitted if the point was moved
    :return: The (possibly unchanged) Element to split at
    '''
    candidate = split_point
    parent = candidate.getparent()
    while parent is not None:
        # Never climb onto the document skeleton
        if barename(parent.tag) in {'body', 'html'}:
            break
        # Text in the parent before the candidate counts as content
        leading = parent.text
        if leading and leading.strip():
            break
        # So does any preceding sibling
        if parent.index(candidate) > 0:
            break
        candidate, parent = parent, parent.getparent()

    if candidate is not split_point:
        log.debug('Adjusted split point to ancestor')

    return candidate
54
55
def get_body(root):
    '''
    Return the result of looking up the ``h:body`` child of ``root``, with
    the ``h`` prefix resolved through XPNSMAP.
    '''
    body = root.find('h:body', namespaces=XPNSMAP)
    return body
58
59
def do_split(split_point, log, before=True):
    '''
    Split tree into a *before* and an *after* tree at ``split_point``.

    The tree that contains ``split_point`` is deep-copied twice and only the
    copies are pruned: the first copy keeps what comes before the split, the
    second keeps what comes after it.

    :param split_point: The Element at which to split
    :param log: Logger handed to :func:`adjust_split_point` (only used when ``before`` is True)
    :param before: If True tree is split before split_point, otherwise after split_point
    :return: before_tree, after_tree
    '''
    if before:
        # We cannot adjust for after since moving an after split point to a
        # parent will cause breakage if the parent contains any content
        # after the original split point
        split_point = adjust_split_point(split_point, log)
    tree         = split_point.getroottree()
    # Remember the location of the split point so that the corresponding
    # element can be looked up again in each copy
    path         = tree.getpath(split_point)

    tree, tree2  = copy.deepcopy(tree), copy.deepcopy(tree)
    root, root2  = tree.getroot(), tree2.getroot()
    body, body2  = map(get_body, (root, root2))
    split_point  = root.xpath(path)[0]
    split_point2 = root2.xpath(path)[0]

    def nix_element(elem, top=True):
        # Remove elem unless top is False in which case replace elem by its
        # children
        parent = elem.getparent()
        if top:
            parent.remove(elem)
        else:
            index = parent.index(elem)
            parent[index:index+1] = list(elem.iterchildren())

    # Tree 1: keep everything up to the split point, drop everything after
    # it. The loop relies on the descendants being visited in document
    # order; a tuple snapshot is taken because the tree is modified while
    # walking it.
    hit_split_point = False
    keep_descendants = False
    split_point_descendants = frozenset(split_point.iterdescendants())
    for elem in tuple(body.iterdescendants()):
        if elem is split_point:
            hit_split_point = True
            if before:
                # Splitting before: the split point itself belongs to Tree 2
                nix_element(elem)
            else:
                # We want to keep the descendants of the split point in
                # Tree 1
                keep_descendants = True
                # We want the split point element, but not its tail
                elem.tail = '\n'

            continue
        if hit_split_point:
            if keep_descendants:
                if elem in split_point_descendants:
                    # elem is a descendant keep it
                    continue
                else:
                    # We are out of split_point, so prevent further set
                    # lookups of split_point_descendants
                    keep_descendants = False
            # Everything else that comes after the split point is dropped
            nix_element(elem)

    # Tree 2: strip what precedes the split point, stopping as soon as the
    # split point is reached so that everything after it is left alone
    ancestors = frozenset(XPath('ancestor::*')(split_point2))
    for elem in tuple(body2.iterdescendants()):
        if elem is split_point2:
            if not before:
                # Keep the split point element's tail, if it contains non-whitespace
                # text
                tail = elem.tail
                if tail and not tail.isspace():
                    # Move the tail onto whatever precedes the split point
                    # (parent text or previous sibling's tail) before the
                    # element is removed
                    parent = elem.getparent()
                    idx = parent.index(elem)
                    if idx == 0:
                        parent.text = (parent.text or '') + tail
                    else:
                        sib = parent[idx-1]
                        sib.tail = (sib.tail or '') + tail
                # Remove the element itself
                nix_element(elem)
            break
        if elem in ancestors:
            # We have to preserve the ancestors as they could have CSS
            # styles that are inherited/applicable, like font or
            # width. So we only remove the text, if any.
            elem.text = '\n'
        else:
            # Replace the element by its children; the children are part of
            # the snapshot being iterated, so they are handled in turn
            nix_element(elem, top=False)

    # Discard any text that sat directly at the start of the body
    body2.text = '\n'

    return tree, tree2
150
151
class SplitLinkReplacer:
    '''
    URL rewriting callable used after a file has been split in two. Links
    that resolve to the top file and whose fragment is one of the anchors
    that ended up in the bottom file are redirected to the bottom file.
    ``replaced`` is set to True once at least one URL has been changed.
    '''

    def __init__(self, base, bottom_anchors, top_name, bottom_name, container):
        self.base = base
        self.container = container
        self.top_name = top_name
        self.bottom_name = bottom_name
        self.bottom_anchors = bottom_anchors
        self.replaced = False

    def __call__(self, url):
        # Purely local links never need to change
        if url and url.startswith('#'):
            return url
        # Only links that target the file that was split are of interest
        if self.container.href_to_name(url, self.base) != self.top_name:
            return url
        fragment = urlparse(url).fragment
        if not fragment or fragment not in self.bottom_anchors:
            return url
        redirected = self.container.name_to_href(self.bottom_name, self.base) + '#' + fragment
        self.replaced = True
        return redirected
171
172
def split(container, name, loc_or_xpath, before=True, totals=None):
    '''
    Split the file specified by name at the position specified by loc_or_xpath.
    Splitting automatically migrates all links and references to the affected
    files.

    :param loc_or_xpath: Should be an XPath expression such as
        //h:div[@id="split_here"]. Can also be a *loc* which is used internally to
        implement splitting in the preview panel.
    :param before: If True the split occurs before the identified element otherwise after it.
    :param totals: Used internally
    :return: The name of the newly created file, which holds the content after the split point
    :raises AbortError: If the split point is inside a table or is the <body> tag
    :raises MalformedMarkup: If a *loc* cannot be resolved even after re-parsing the file
    :raises IndexError: If an XPath expression does not match any element
    '''

    root = container.parsed(name)
    if isinstance(loc_or_xpath, str):
        # Supply the standard prefix map so that prefixed expressions such
        # as the documented //h:div[...] can be evaluated, in the same way
        # multisplit() evaluates its expression
        split_point = root.xpath(loc_or_xpath, namespaces=XPNSMAP)[0]
    else:
        try:
            split_point = node_from_loc(root, loc_or_xpath, totals=totals)
        except MalformedMarkup:
            # The webkit HTML parser and the container parser have yielded
            # different node counts, this can happen if the file is valid XML
            # but contains constructs like nested <p> tags. So force parse it
            # with the HTML 5 parser and try again.
            raw = container.raw_data(name)
            root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
            try:
                split_point = node_from_loc(root, loc_or_xpath, totals=totals)
            except MalformedMarkup:
                raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool'
                                        ' before splitting') % name)
            container.replace(name, root)
    if in_table(split_point):
        raise AbortError('Cannot split inside tables')
    if split_point.tag.endswith('}body'):
        raise AbortError('Cannot split on the <body> tag')
    tree1, tree2 = do_split(split_point, container.log, before=before)
    root1, root2 = tree1.getroot(), tree2.getroot()
    # The empty anchor stands for links to the file as a whole; those keep
    # pointing at the top file
    anchors_in_top = frozenset(root1.xpath('//*/@id')) | frozenset(root1.xpath('//*/@name')) | {''}
    anchors_in_bottom = frozenset(root2.xpath('//*/@id')) | frozenset(root2.xpath('//*/@name'))

    # Pick the first unused <base>_split<N>.<ext> name for the new file,
    # dropping any _split<N> suffix the current name already carries
    base, ext = name.rpartition('.')[0::2]
    base = re.sub(r'_split\d+$', '', base)
    nname, s = None, 0
    while not nname or container.exists(nname):
        s += 1
        nname = '%s_split%d.%s' % (base, s, ext)
    manifest_item = container.generate_item(nname, media_type=container.mime_map[name])
    bottom_name = container.href_to_name(manifest_item.get('href'), container.opf_name)

    # Fix links in the split trees
    for r in (root1, root2):
        for a in r.xpath('//*[@href]'):
            url = a.get('href')
            if url.startswith('#'):
                fname = name
            else:
                fname = container.href_to_name(url, name)
            if fname == name:
                purl = urlparse(url)
                if purl.fragment in anchors_in_top:
                    if r is root2:
                        # Bottom file linking to an anchor that stayed in the top file
                        a.set('href', '%s#%s' % (container.name_to_href(name, bottom_name), purl.fragment))
                    else:
                        a.set('href', '#' + purl.fragment)
                elif purl.fragment in anchors_in_bottom:
                    if r is root1:
                        # Top file linking to an anchor that moved to the bottom file
                        a.set('href', '%s#%s' % (container.name_to_href(bottom_name, name), purl.fragment))
                    else:
                        a.set('href', '#' + purl.fragment)

    # Fix all links in the container that point to anchors in the bottom tree
    for fname, media_type in iteritems(container.mime_map):
        if fname not in {name, bottom_name}:
            repl = SplitLinkReplacer(fname, anchors_in_bottom, name, bottom_name, container)
            container.replace_links(fname, repl)

    container.replace(name, root1)
    container.replace(bottom_name, root2)

    # Place the new file in the spine directly after the file that was split,
    # carrying over its linear setting
    spine = container.opf_xpath('//opf:spine')[0]
    for spine_item, spine_name, linear in container.spine_iter:
        if spine_name == name:
            break
    index = spine.index(spine_item) + 1

    si = spine.makeelement(OPF('itemref'), idref=manifest_item.get('id'))
    if not linear:
        si.set('linear', 'no')
    container.insert_into_xml(spine, si, index=index)
    container.dirty(container.opf_name)
    return bottom_name
264
265
def multisplit(container, name, xpath, before=True):
    '''
    Split the specified file at multiple locations (all tags that match the specified XPath expression). See also: :func:`split`.
    Splitting automatically migrates all links and references to the affected
    files.

    :param before: If True the splits occur before the identified element otherwise after it.
    :return: The names of the newly created files, in the order they were created
    '''
    marker = 'calibre-split-point'
    matches = container.parsed(name).xpath(xpath, namespaces=XPNSMAP)
    if not matches:
        raise AbortError(_('The expression %s did not match any nodes') % xpath)

    # Validate every split point before anything is modified
    for node in matches:
        if in_table(node):
            raise AbortError('Cannot split inside tables')
        if node.tag.endswith('}body'):
            raise AbortError('Cannot split on the <body> tag')

    # Tag the split points so they can be found again after each split
    for num, node in enumerate(matches):
        node.set(marker, str(num))

    # Each split is applied to the file produced by the previous one
    all_names = [name]
    for num in range(len(matches)):
        produced = split(container, all_names[-1], '//*[@calibre-split-point="%d"]' % num, before=before)
        all_names.append(produced)

    # Strip the temporary marker attribute from every file involved
    for fname in all_names:
        for tagged in container.parsed(fname).xpath('//*[@calibre-split-point]'):
            del tagged.attrib[marker]
        container.dirty(fname)

    return all_names[1:]
299
300
class MergeLinkReplacer:
    '''
    URL rewriting callable used after files have been merged into a master
    file. Any link that resolves to a file present in ``anchor_map`` is
    redirected to the master file, with its fragment translated through that
    file's anchor mapping. ``replaced`` is set to True once at least one URL
    has been changed.
    '''

    def __init__(self, base, anchor_map, master, container):
        self.base = base
        self.master = master
        self.container = container
        self.anchor_map = anchor_map
        self.replaced = False

    def __call__(self, url):
        # Purely local links are left untouched
        if url and url.startswith('#'):
            return url
        target = self.container.href_to_name(url, self.base)
        mapping = self.anchor_map.get(target, None)
        if mapping is None:
            # Not a link to one of the merged files
            return url
        fragment = urlparse(url).fragment or ''
        fragment = mapping.get(fragment, fragment)
        redirected = self.container.name_to_href(self.master, self.base) + '#' + fragment
        self.replaced = True
        return redirected
322
323
def add_text(body, text):
    '''
    Append ``text`` to the end of the content of ``body``: onto the tail of
    the last child if there is one, otherwise onto the text of ``body`` itself.
    '''
    if len(body) == 0:
        body.text = (body.text or '') + text
        return
    last = body[-1]
    last.tail = (last.tail or '') + text
329
330
def all_anchors(root):
    '''Return the set of all ``id`` and ``name`` attribute values found in ``root``.'''
    anchors = set(root.xpath('//*/@id'))
    anchors.update(root.xpath('//*/@name'))
    return anchors
333
334
def all_stylesheets(container, name):
    '''
    Yield the container names of the stylesheets linked from the <head> of
    the HTML file ``name``.

    Only <link> elements that carry an href and whose ``type`` attribute is
    either absent or equal to ``text/css`` are reported.

    :param container: The container that holds the file
    :param name: The name of the HTML file to inspect
    '''
    root = container.parsed(name)
    for link in XPath('//h:head/h:link[@href]')(root):
        if link.get('type', 'text/css') == 'text/css':
            # Every href must be resolved relative to the HTML file itself.
            # Do not rebind ``name`` here, otherwise the second and later
            # links would be resolved relative to the previous stylesheet.
            yield container.href_to_name(link.get('href'), name)
341
342
def unique_anchor(seen_anchors, current):
    '''
    Return ``current`` if it is not in ``seen_anchors``, otherwise the first
    of ``current_1``, ``current_2``, ... that is not in ``seen_anchors``.
    ``seen_anchors`` is not modified.
    '''
    if current not in seen_anchors:
        return current
    suffix = 1
    while '%s_%d' % (current, suffix) in seen_anchors:
        suffix += 1
    return '%s_%d' % (current, suffix)
350
351
def remove_name_attributes(root):
    '''
    Get rid of every ``name`` attribute in ``root``. An element that has no
    ``id`` gets its ``name`` value as ``id``; an element that already has an
    ``id`` simply loses its ``name``.
    '''
    for elem in root.xpath('//*[@name]'):
        anchor = elem.attrib.pop('name')
        if 'id' not in elem.attrib:
            elem.set('id', anchor)
358
359
def merge_html(container, names, master, insert_page_breaks=False):
    '''
    Merge the HTML files in ``names`` into the file ``master``.

    The body content of every file other than master is appended to the last
    <body> of master, stylesheets linked only by the merged files are linked
    from master, ids that clash are renamed, the merged files are removed
    from the container and all links in the container that pointed to them
    are redirected to master.

    :param container: The container holding the files
    :param names: The names of the files to merge (master itself is skipped)
    :param master: The name of the file that receives the merged content
    :param insert_page_breaks: If True, ``page-break-before: always`` is added to
        the style of the first element contributed by each merged file
    :return: A dict mapping the name of every merged file to the id of the
        first element it contributed to master
    '''
    p = container.parsed
    root = p(master)

    # Ensure master has a <head>
    head = root.find('h:head', namespaces=XPNSMAP)
    if head is None:
        head = root.makeelement(XHTML('head'))
        container.insert_into_xml(root, head, 0)

    seen_anchors = all_anchors(root)
    seen_stylesheets = set(all_stylesheets(container, master))
    master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1]
    master_base = os.path.dirname(master)
    # Per merged file: old anchor -> anchor to use in master. The empty key
    # stands for links to the file without any fragment.
    anchor_map = {n:{} for n in names if n != master}
    first_anchor_map = {}

    for name in names:
        if name == master:
            continue
        # Insert new stylesheets into master
        for sheet in all_stylesheets(container, name):
            if sheet not in seen_stylesheets:
                seen_stylesheets.add(sheet)
                link = head.makeelement(XHTML('link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master))
                container.insert_into_xml(head, link)

        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name, master))

        root = p(name)
        # Collect the content of all bodies as a flat list of leading text
        # strings and child elements
        children = []
        for body in p(name).findall('h:body', namespaces=XPNSMAP):
            children.append(body.text if body.text and body.text.strip() else '\n\n')
            children.extend(body)

        # Find the first element; it becomes the target for links to the
        # merged file as a whole
        first_child = ''
        for first_child in children:
            if not isinstance(first_child, string_or_bytes):
                break
        if isinstance(first_child, string_or_bytes):
            # body contained only text, no tags
            first_child = body.makeelement(XHTML('p'))
            first_child.text, children[0] = children[0], first_child

        amap = anchor_map[name]
        remove_name_attributes(root)

        for elem in root.xpath('//*[@id]'):
            val = elem.get('id')
            if not val:
                continue
            if val in seen_anchors:
                nval = unique_anchor(seen_anchors, val)
                elem.set('id', nval)
                amap[val] = nval
                # Record the replacement as used, otherwise a later clash on
                # the same id would be given the very same replacement and
                # the merged document would contain duplicate ids
                seen_anchors.add(nval)
            else:
                seen_anchors.add(val)

        if 'id' not in first_child.attrib:
            first_child.set('id', unique_anchor(seen_anchors, 'top'))
            seen_anchors.add(first_child.get('id'))
        first_anchor_map[name] = first_child.get('id')

        if insert_page_breaks:
            first_child.set('style', first_child.get('style', '') + '; page-break-before: always')

        amap[''] = first_child.get('id')

        # Fix links that point to local changed anchors
        for a in XPath('//h:a[starts-with(@href, "#")]')(root):
            q = a.get('href')[1:]
            if q in amap:
                a.set('href', '#' + amap[q])

        # Append the collected content to master: text is added to the end
        # of the existing content, elements are copied
        for child in children:
            if isinstance(child, string_or_bytes):
                add_text(master_body, child)
            else:
                master_body.append(copy.deepcopy(child))

        container.remove_item(name, remove_from_guide=False)

    # Fix all links in the container that point to merged files
    for fname, media_type in iteritems(container.mime_map):
        repl = MergeLinkReplacer(fname, anchor_map, master, container)
        container.replace_links(fname, repl)

    return first_anchor_map
450
451
def merge_css(container, names, master):
    '''
    Merge the stylesheets in ``names`` into the stylesheet ``master``.

    The rules of every sheet other than master (minus its charset rules) are
    added to master and the sheet is removed from the container. HTML files
    that linked to a removed sheet have that link removed and, if they do
    not already link to master, get a link to master instead.
    '''
    parsed = container.parsed
    master_sheet = parsed(master)
    master_dir = os.path.dirname(master)
    merged = set()

    for sheet_name in names:
        if sheet_name == master:
            continue
        # Rebase links if master is in a different directory
        if os.path.dirname(sheet_name) != master_dir:
            container.replace_links(sheet_name, LinkRebaser(container, sheet_name, master))

        sheet = parsed(sheet_name)

        # Drop charset rules before copying the remaining rules to master
        charset_rules = [rule for rule in sheet.cssRules if rule.type == rule.CHARSET_RULE]
        for rule in charset_rules:
            sheet.deleteRule(sheet.cssRules.index(rule))
        for rule in sheet.cssRules:
            master_sheet.add(rule)

        container.remove_item(sheet_name)
        merged.add(sheet_name)

    # Replace links to the merged sheets in the HTML files with a single
    # link to the master sheet
    for doc_name, media_type in iteritems(container.mime_map):
        if media_type not in OEB_DOCS:
            continue
        root = parsed(doc_name)
        removed = False
        for link in XPath('//h:link[@href]')(root):
            if container.href_to_name(link.get('href'), doc_name) in merged:
                container.remove_from_xml(link)
                removed = True
        if not removed:
            continue
        container.dirty(doc_name)
        if master in set(all_stylesheets(container, doc_name)):
            continue
        head = root.find('h:head', namespaces=XPNSMAP)
        if head is not None:
            link = head.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(master, doc_name))
            container.insert_into_xml(head, link)
494
495
def merge(container, category, names, master):
    '''
    Merge the specified files into a single file, automatically migrating all
    links and references to the affected files. The files must all be either HTML or CSS files.

    :param category: Must be either ``'text'`` for HTML files or ``'styles'`` for CSS files
    :param names: The list of files to be merged
    :param master: Which of the merged files is the *master* file, that is, the file that will remain after merging.
    :raises AbortError: If the category is not supported, fewer than two files
        are given or master is not one of the files
    '''
    mergers = {'text': merge_html, 'styles': merge_css}
    if category not in mergers:
        raise AbortError('Cannot merge files of type: %s' % category)
    if len(names) < 2:
        raise AbortError('Must specify at least two files to be merged')
    if master not in names:
        raise AbortError('The master file (%s) must be one of the files being merged' % master)

    do_merge = mergers[category]
    do_merge(container, names, master)

    container.dirty(master)
518