#!/usr/local/bin/python3.8
# vim:fileencoding=utf-8


__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

import copy, os, re
from polyglot.builtins import string_or_bytes

from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS
from calibre.ebooks.oeb.polish.errors import MalformedMarkup
from calibre.ebooks.oeb.polish.toc import node_from_loc
from calibre.ebooks.oeb.polish.replace import LinkRebaser
from polyglot.builtins import iteritems
from polyglot.urllib import urlparse


class AbortError(ValueError):
    ''' Raised when a split or merge request cannot be carried out. '''
    pass


def in_table(node):
    '''
    Return True if ``node`` or any of its ancestors has a tag ending in
    ``}table``, False otherwise.
    '''
    while node is not None:
        if node.tag.endswith('}table'):
            return True
        node = node.getparent()
    return False


def adjust_split_point(split_point, log):
    '''
    Move the split point up its ancestor chain if it has no content
    before it. This handles the common case:
    <div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
    h2.

    :param split_point: The Element originally chosen as the split point
    :param log: Logger, its ``debug()`` method is called when the split point is moved
    :return: The (possibly adjusted) split point element
    '''
    sp = split_point
    while True:
        parent = sp.getparent()
        if (
            parent is None or
            barename(parent.tag) in {'body', 'html'} or
            (parent.text and parent.text.strip()) or
            parent.index(sp) > 0
        ):
            # Stop climbing: there is no parent, the parent is body/html,
            # or something (text or an earlier sibling) precedes sp
            break
        sp = parent

    if sp is not split_point:
        log.debug('Adjusted split point to ancestor')

    return sp


def get_body(root):
    ''' Return the first ``h:body`` child of ``root`` (None if there is none). '''
    return root.find('h:body', namespaces=XPNSMAP)


def do_split(split_point, log, before=True):
    '''
    Split tree into a *before* and an *after* tree at ``split_point``.

    Both returned trees are deep copies, the tree that ``split_point``
    belongs to is not modified.

    :param split_point: The Element at which to split
    :param log: Logger, passed on to :func:`adjust_split_point`
    :param before: If True tree is split before split_point, otherwise after split_point
    :return: before_tree, after_tree
    '''
    if before:
        # We cannot adjust for after since moving an after split point to a
        # parent will cause breakage if the parent contains any content
        # after the original split point
        split_point = adjust_split_point(split_point, log)
    tree = split_point.getroottree()
    path = tree.getpath(split_point)

    # Work on two independent copies and locate the split point in each of
    # them via its path
    tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree)
    root, root2 = tree.getroot(), tree2.getroot()
    body, body2 = map(get_body, (root, root2))
    split_point = root.xpath(path)[0]
    split_point2 = root2.xpath(path)[0]

    def nix_element(elem, top=True):
        # Remove elem unless top is False in which case replace elem by its
        # children
        parent = elem.getparent()
        if top:
            parent.remove(elem)
        else:
            index = parent.index(elem)
            parent[index:index+1] = list(elem.iterchildren())

    # Tree 1
    hit_split_point = False
    keep_descendants = False
    split_point_descendants = frozenset(split_point.iterdescendants())
    # Iterate over a snapshot, since the tree is modified inside the loop
    for elem in tuple(body.iterdescendants()):
        if elem is split_point:
            hit_split_point = True
            if before:
                nix_element(elem)
            else:
                # We want to keep the descendants of the split point in
                # Tree 1
                keep_descendants = True
                # We want the split point element, but not its tail
                elem.tail = '\n'

            continue
        if hit_split_point:
            if keep_descendants:
                if elem in split_point_descendants:
                    # elem is a descendant keep it
                    continue
                else:
                    # We are out of split_point, so prevent further set
                    # lookups of split_point_descendants
                    keep_descendants = False
            nix_element(elem)

    # Tree 2
    ancestors = frozenset(XPath('ancestor::*')(split_point2))
    # Iterate over a snapshot, since the tree is modified inside the loop
    for elem in tuple(body2.iterdescendants()):
        if elem is split_point2:
            if not before:
                # Keep the split point element's tail, if it contains non-whitespace
                # text
                tail = elem.tail
                if tail and not tail.isspace():
                    parent = elem.getparent()
                    idx = parent.index(elem)
                    if idx == 0:
                        parent.text = (parent.text or '') + tail
                    else:
                        sib = parent[idx-1]
                        sib.tail = (sib.tail or '') + tail
                # Remove the element itself
                nix_element(elem)
            break
        if elem in ancestors:
            # We have to preserve the ancestors as they could have CSS
            # styles that are inherited/applicable, like font or
            # width. So we only remove the text, if any.
            elem.text = '\n'
        else:
            nix_element(elem, top=False)

    body2.text = '\n'

    return tree, tree2


class SplitLinkReplacer:
    '''
    Link rewriting callable used after a split: links found in the file
    ``base`` that point at ``top_name`` with a fragment listed in
    ``bottom_anchors`` are redirected to ``bottom_name``.
    '''

    def __init__(self, base, bottom_anchors, top_name, bottom_name, container):
        self.bottom_anchors, self.bottom_name = bottom_anchors, bottom_name
        self.container, self.top_name = container, top_name
        self.base = base
        # Set to True once at least one URL has been rewritten
        self.replaced = False

    def __call__(self, url):
        ''' Return the replacement for ``url`` (``url`` itself if no change is needed). '''
        if url and url.startswith('#'):
            return url
        name = self.container.href_to_name(url, self.base)
        if name != self.top_name:
            return url
        purl = urlparse(url)
        if purl.fragment and purl.fragment in self.bottom_anchors:
            url = self.container.name_to_href(self.bottom_name, self.base) + '#' + purl.fragment
            self.replaced = True
        return url


def split(container, name, loc_or_xpath, before=True, totals=None):
    '''
    Split the file specified by name at the position specified by loc_or_xpath.
    Splitting automatically migrates all links and references to the affected
    files.

    :param loc_or_xpath: Should be an XPath expression such as
        //h:div[@id="split_here"]. Can also be a *loc* which is used internally to
        implement splitting in the preview panel.
    :param before: If True the split occurs before the identified element otherwise after it.
    :param totals: Used internally
    :return: The name of the newly created file that holds the bottom part
    :raises AbortError: If the expression matches nothing, or the split point is
        inside a table or is the <body> tag
    :raises MalformedMarkup: If a *loc* cannot be resolved even after re-parsing
        the file with the HTML 5 parser
    '''

    root = container.parsed(name)
    if isinstance(loc_or_xpath, str):
        # Pass the namespace map, as multisplit() does, so that prefixed
        # expressions such as //h:div can be evaluated
        matches = root.xpath(loc_or_xpath, namespaces=XPNSMAP)
        if not matches:
            raise AbortError(_('The expression %s did not match any nodes') % loc_or_xpath)
        split_point = matches[0]
    else:
        try:
            split_point = node_from_loc(root, loc_or_xpath, totals=totals)
        except MalformedMarkup:
            # The webkit HTML parser and the container parser have yielded
            # different node counts, this can happen if the file is valid XML
            # but contains constructs like nested <p> tags. So force parse it
            # with the HTML 5 parser and try again.
            raw = container.raw_data(name)
            root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
            try:
                split_point = node_from_loc(root, loc_or_xpath, totals=totals)
            except MalformedMarkup:
                raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool'
                                        ' before splitting') % name)
            container.replace(name, root)
    if in_table(split_point):
        raise AbortError('Cannot split inside tables')
    if split_point.tag.endswith('}body'):
        raise AbortError('Cannot split on the <body> tag')
    tree1, tree2 = do_split(split_point, container.log, before=before)
    root1, root2 = tree1.getroot(), tree2.getroot()
    # The empty string is included in the top anchors, so a link with no
    # fragment is treated as pointing into the top file
    anchors_in_top = frozenset(root1.xpath('//*/@id')) | frozenset(root1.xpath('//*/@name')) | {''}
    anchors_in_bottom = frozenset(root2.xpath('//*/@id')) | frozenset(root2.xpath('//*/@name'))
    base, ext = name.rpartition('.')[0::2]
    base = re.sub(r'_split\d+$', '', base)
    # Find the first unused <base>_split<N>.<ext> name
    nname, s = None, 0
    while not nname or container.exists(nname):
        s += 1
        nname = '%s_split%d.%s' % (base, s, ext)
    manifest_item = container.generate_item(nname, media_type=container.mime_map[name])
    bottom_name = container.href_to_name(manifest_item.get('href'), container.opf_name)

    # Fix links in the split trees
    for r in (root1, root2):
        for a in r.xpath('//*[@href]'):
            url = a.get('href')
            if url.startswith('#'):
                fname = name
            else:
                fname = container.href_to_name(url, name)
            if fname == name:
                purl = urlparse(url)
                if purl.fragment in anchors_in_top:
                    if r is root2:
                        a.set('href', '%s#%s' % (container.name_to_href(name, bottom_name), purl.fragment))
                    else:
                        a.set('href', '#' + purl.fragment)
                elif purl.fragment in anchors_in_bottom:
                    if r is root1:
                        a.set('href', '%s#%s' % (container.name_to_href(bottom_name, name), purl.fragment))
                    else:
                        a.set('href', '#' + purl.fragment)

    # Fix all links in the container that point to anchors in the bottom tree
    for fname, media_type in iteritems(container.mime_map):
        if fname not in {name, bottom_name}:
            repl = SplitLinkReplacer(fname, anchors_in_bottom, name, bottom_name, container)
            container.replace_links(fname, repl)

    container.replace(name, root1)
    container.replace(bottom_name, root2)

    spine = container.opf_xpath('//opf:spine')[0]
    # spine_item and linear are deliberately used after the loop: they
    # describe the entry at which the loop stopped
    for spine_item, spine_name, linear in container.spine_iter:
        if spine_name == name:
            break
    index = spine.index(spine_item) + 1

    si = spine.makeelement(OPF('itemref'), idref=manifest_item.get('id'))
    if not linear:
        si.set('linear', 'no')
    container.insert_into_xml(spine, si, index=index)
    container.dirty(container.opf_name)
    return bottom_name


def multisplit(container, name, xpath, before=True):
    '''
    Split the specified file at multiple locations (all tags that match the specified XPath expression). See also: :func:`split`.
    Splitting automatically migrates all links and references to the affected
    files.

    :param before: If True the splits occur before the identified element otherwise after it.
    :return: The list of names of the newly created files
    :raises AbortError: If the expression matches nothing, or any matched node is
        inside a table or is the <body> tag
    '''
    root = container.parsed(name)
    nodes = root.xpath(xpath, namespaces=XPNSMAP)
    if not nodes:
        raise AbortError(_('The expression %s did not match any nodes') % xpath)
    # Validate every split point before modifying anything
    for split_point in nodes:
        if in_table(split_point):
            raise AbortError('Cannot split inside tables')
        if split_point.tag.endswith('}body'):
            raise AbortError('Cannot split on the <body> tag')

    # Tag every split point with a temporary attribute, so that it can be
    # found again by split() in whichever file it ends up in
    for i, tag in enumerate(nodes):
        tag.set('calibre-split-point', str(i))

    current = name
    all_names = [name]
    for i in range(len(nodes)):
        current = split(container, current, '//*[@calibre-split-point="%d"]' % i, before=before)
        all_names.append(current)

    # Remove the temporary marker attributes again
    for x in all_names:
        for tag in container.parsed(x).xpath('//*[@calibre-split-point]'):
            tag.attrib.pop('calibre-split-point')
        container.dirty(x)

    return all_names[1:]


class MergeLinkReplacer:
    '''
    Link rewriting callable used after a merge: links found in the file
    ``base`` that point at a file present in ``anchor_map`` are redirected
    to ``master``, with their fragment translated via that file's anchor map.
    '''

    def __init__(self, base, anchor_map, master, container):
        self.container, self.anchor_map = container, anchor_map
        self.master = master
        self.base = base
        # Set to True once at least one URL has been rewritten
        self.replaced = False

    def __call__(self, url):
        ''' Return the replacement for ``url`` (``url`` itself if no change is needed). '''
        if url and url.startswith('#'):
            return url
        name = self.container.href_to_name(url, self.base)
        amap = self.anchor_map.get(name, None)
        if amap is None:
            return url
        purl = urlparse(url)
        frag = purl.fragment or ''
        frag = amap.get(frag, frag)
        url = self.container.name_to_href(self.master, self.base) + '#' + frag
        self.replaced = True
        return url


def add_text(body, text):
    '''
    Append ``text`` to the end of ``body``: to the tail of its last child
    if it has children, otherwise to its own text.
    '''
    if len(body) > 0:
        body[-1].tail = (body[-1].tail or '') + text
    else:
        body.text = (body.text or '') + text


def all_anchors(root):
    ''' Return the set of all ``id`` and ``name`` attribute values in ``root``. '''
    return set(root.xpath('//*/@id')) | set(root.xpath('//*/@name'))


def all_stylesheets(container, name):
    '''
    Yield the container names of the stylesheets linked from the <head> of
    the HTML file ``name``. Only <link> tags whose ``type`` is ``text/css``
    (the default when the attribute is missing) are considered.
    '''
    for link in XPath('//h:head/h:link[@href]')(container.parsed(name)):
        # Every href must be resolved relative to the HTML file itself, so
        # the name parameter must not be rebound inside this loop
        sheet_name = container.href_to_name(link.get('href'), name)
        typ = link.get('type', 'text/css')
        if typ == 'text/css':
            yield sheet_name


def unique_anchor(seen_anchors, current):
    '''
    Return ``current`` if it is not in ``seen_anchors``, otherwise the first
    of ``current_1``, ``current_2``, ... that is not. ``seen_anchors`` is not
    modified, recording the result is the caller's responsibility.
    '''
    c = 0
    ans = current
    while ans in seen_anchors:
        c += 1
        ans = '%s_%d' % (current, c)
    return ans


def remove_name_attributes(root):
    # Remove all name attributes, replacing them with id attributes
    # Elements that have both simply lose the name attribute
    for elem in root.xpath('//*[@id and @name]'):
        del elem.attrib['name']
    for elem in root.xpath('//*[@name]'):
        elem.set('id', elem.attrib.pop('name'))


def merge_html(container, names, master, insert_page_breaks=False):
    '''
    Merge the HTML files ``names`` into ``master``: stylesheet links and body
    content are copied into master, clashing anchors are renamed, the merged
    files are removed from the container and links throughout the container
    are updated.

    :param names: The names of the files to merge (``master`` itself is skipped)
    :param master: The name of the file that receives the merged content
    :param insert_page_breaks: If True, ``page-break-before: always`` is added to
        the style of the first element of every merged file
    :return: A mapping of merged file name to the id of its first element in master
    '''
    p = container.parsed
    root = p(master)

    # Ensure master has a <head>
    head = root.find('h:head', namespaces=XPNSMAP)
    if head is None:
        head = root.makeelement(XHTML('head'))
        container.insert_into_xml(root, head, 0)

    seen_anchors = all_anchors(root)
    seen_stylesheets = set(all_stylesheets(container, master))
    master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1]
    master_base = os.path.dirname(master)
    anchor_map = {n:{} for n in names if n != master}
    first_anchor_map = {}

    for name in names:
        if name == master:
            continue
        # Insert new stylesheets into master
        for sheet in all_stylesheets(container, name):
            if sheet not in seen_stylesheets:
                seen_stylesheets.add(sheet)
                link = head.makeelement(XHTML('link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master))
                container.insert_into_xml(head, link)

        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name, master))

        root = p(name)
        # Collect the leading text and the child elements of every <body>
        children = []
        for body in p(name).findall('h:body', namespaces=XPNSMAP):
            children.append(body.text if body.text and body.text.strip() else '\n\n')
            children.extend(body)

        # Find the first element (as opposed to text) among the children
        first_child = ''
        for first_child in children:
            if not isinstance(first_child, string_or_bytes):
                break
        if isinstance(first_child, string_or_bytes):
            # body contained only text, no tags
            first_child = body.makeelement(XHTML('p'))
            first_child.text, children[0] = children[0], first_child

        amap = anchor_map[name]
        remove_name_attributes(root)

        for elem in root.xpath('//*[@id]'):
            val = elem.get('id')
            if not val:
                continue
            if val in seen_anchors:
                nval = unique_anchor(seen_anchors, val)
                elem.set('id', nval)
                amap[val] = nval
                # Record the new name as well, otherwise the same
                # "unique" name could be handed out again later
                seen_anchors.add(nval)
            else:
                seen_anchors.add(val)

        if 'id' not in first_child.attrib:
            first_child.set('id', unique_anchor(seen_anchors, 'top'))
            seen_anchors.add(first_child.get('id'))
        first_anchor_map[name] = first_child.get('id')

        if insert_page_breaks:
            # Avoid a leading or doubled semicolon when the existing style
            # is empty or already ends with one
            style = first_child.get('style', '').strip().rstrip(';').rstrip()
            page_break = 'page-break-before: always'
            first_child.set('style', ('%s; %s' % (style, page_break)) if style else page_break)

        # Links to the merged file without a fragment go to its first element
        amap[''] = first_child.get('id')

        # Fix links that point to local changed anchors
        for a in XPath('//h:a[starts-with(@href, "#")]')(root):
            q = a.get('href')[1:]
            if q in amap:
                a.set('href', '#' + amap[q])

        for child in children:
            if isinstance(child, string_or_bytes):
                add_text(master_body, child)
            else:
                master_body.append(copy.deepcopy(child))

        container.remove_item(name, remove_from_guide=False)

    # Fix all links in the container that point to merged files
    for fname, media_type in iteritems(container.mime_map):
        repl = MergeLinkReplacer(fname, anchor_map, master, container)
        container.replace_links(fname, repl)

    return first_anchor_map


def merge_css(container, names, master):
    '''
    Merge the stylesheets ``names`` into ``master``: their rules (except
    charset rules) are added to master, the merged sheets are removed from the
    container and <link> tags that referenced them are replaced by a link to
    master.

    :param names: The names of the stylesheets to merge (``master`` itself is skipped)
    :param master: The name of the stylesheet that receives the merged rules
    '''
    p = container.parsed
    msheet = p(master)
    master_base = os.path.dirname(master)
    merged = set()

    for name in names:
        if name == master:
            continue
        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name, master))

        sheet = p(name)

        # Remove charset rules
        charset_rules = [r for r in sheet.cssRules if r.type == r.CHARSET_RULE]
        for rule in charset_rules:
            sheet.deleteRule(sheet.cssRules.index(rule))
        for rule in sheet.cssRules:
            msheet.add(rule)

        container.remove_item(name)
        merged.add(name)

    # Remove links to merged stylesheets in the html files, replacing with a
    # link to the master sheet
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_DOCS:
            removed = False
            root = p(name)
            for link in XPath('//h:link[@href]')(root):
                q = container.href_to_name(link.get('href'), name)
                if q in merged:
                    container.remove_from_xml(link)
                    removed = True
            if removed:
                container.dirty(name)
                if master not in set(all_stylesheets(container, name)):
                    head = root.find('h:head', namespaces=XPNSMAP)
                    if head is not None:
                        link = head.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(master, name))
                        container.insert_into_xml(head, link)


def merge(container, category, names, master):
    '''
    Merge the specified files into a single file, automatically migrating all
    links and references to the affected files. The files must all be either HTML or CSS files.

    :param category: Must be either ``'text'`` for HTML files or ``'styles'`` for CSS files
    :param names: The list of files to be merged
    :param master: Which of the merged files is the *master* file, that is, the file that will remain after merging.
    :raises AbortError: If the category is unsupported, fewer than two files are
        specified or master is not one of the files
    '''
    if category not in {'text', 'styles'}:
        raise AbortError('Cannot merge files of type: %s' % category)
    if len(names) < 2:
        raise AbortError('Must specify at least two files to be merged')
    if master not in names:
        raise AbortError('The master file (%s) must be one of the files being merged' % master)

    if category == 'text':
        merge_html(container, names, master)
    elif category == 'styles':
        merge_css(container, names, master)

    container.dirty(master)