#!/usr/local/bin/python3.8
# vim:fileencoding=utf-8


__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

import sys, os, re, math, errno, uuid, numbers
from collections import OrderedDict, defaultdict

from lxml import html
from lxml.html.builder import (
    HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1)

from calibre import guess_type
from calibre.ebooks.docx.container import DOCX, fromstring
from calibre.ebooks.docx.names import XML, generate_anchor
from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
from calibre.ebooks.docx.numbering import Numbering
from calibre.ebooks.docx.fonts import Fonts, is_symbol_font, map_symbol_text
from calibre.ebooks.docx.images import Images
from calibre.ebooks.docx.tables import Tables
from calibre.ebooks.docx.footnotes import Footnotes
from calibre.ebooks.docx.cleanup import cleanup_markup
from calibre.ebooks.docx.theme import Theme
from calibre.ebooks.docx.toc import create_toc
from calibre.ebooks.docx.fields import Fields
from calibre.ebooks.docx.settings import Settings
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
from polyglot.builtins import iteritems, itervalues


NBSP = '\xa0'


class Text:
    '''
    Accumulates the text of a run while child elements are being appended.

    Text fragments are collected in ``buf`` and are written to ``attr``
    (``'text'`` or ``'tail'``) of the current element ``elem`` whenever a new
    element is added. Iterating over the object yields every element that has
    been the current element, in order.
    '''

    def __init__(self, elem, attr, buf):
        self.elem, self.attr, self.buf = elem, attr, buf
        self.elems = [self.elem]

    def add_elem(self, elem):
        ''' Flush the buffered text onto the current element and make
        ``elem`` the current element; subsequent text becomes its tail. '''
        self.elems.append(elem)
        setattr(self.elem, self.attr, ''.join(self.buf))
        self.elem, self.attr, self.buf = elem, 'tail', []

    def __iter__(self):
        return iter(self.elems)


def html_lang(docx_lang):
    ''' Return the two letter (ISO 639-1) form of ``docx_lang`` as produced by
    lang_as_iso639_1(), or None if the language cannot be canonicalized, is
    undetermined ('und'), or has no such form. '''
    lang = canonicalize_lang(docx_lang)
    if lang and lang != 'und':
        lang = lang_as_iso639_1(lang)
        if lang:
            return lang


class Convert:
    '''
    Convert a DOCX file into HTML.

    Calling the instance runs the conversion and writes index.html, docx.css,
    metadata.opf and (if non-empty) toc.ncx into ``dest_dir``. The path to
    metadata.opf is returned.
    '''

    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
        '''
        :param path_or_stream: Passed through to the DOCX container
        :param dest_dir: Output directory, defaults to the current working directory
        :param log: Passed through to the DOCX container, whose log is then used
        :param detect_cover: Passed through to cleanup_markup()
        :param notes_text: Heading used for the notes section, defaults to _('Notes')
        :param notes_nopb: Passed through to Styles.generate_css()
        :param nosupsub: Passed through to Styles.generate_css()
        '''
        self.docx = DOCX(path_or_stream, log=log)
        self.namespace = self.docx.namespace
        # Two or more consecutive whitespace characters
        self.ms_pat = re.compile(r'\s{2,}')
        # Any newline, carriage return or tab
        self.ws_pat = re.compile(r'[\n\r\t]')
        self.log = self.docx.log
        self.detect_cover = detect_cover
        self.notes_text = notes_text or _('Notes')
        self.notes_nopb = notes_nopb
        self.nosupsub = nosupsub
        self.dest_dir = dest_dir or os.getcwd()
        self.mi = self.docx.metadata
        self.body = BODY()
        self.theme = Theme(self.namespace)
        self.settings = Settings(self.namespace)
        self.tables = Tables(self.namespace)
        self.fields = Fields(self.namespace)
        self.styles = Styles(self.namespace, self.tables)
        self.images = Images(self.namespace, self.log)
        # Maps generated HTML elements to the DOCX elements they came from
        self.object_map = OrderedDict()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
                TITLE(self.mi.title or _('Unknown')),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ),
            self.body
        )
        # Whitespace only, so that the serialized <head> is indented
        self.html.text='\n\t'
        self.html[0].text='\n\t\t'
        self.html[0].tail='\n'
        for child in self.html[0]:
            child.tail = '\n\t\t'
        self.html[0][-1].tail = '\n\t'
        self.html[1].text = self.html[1].tail = '\n'
        lang = html_lang(self.mi.language)
        if lang:
            self.html.set('lang', lang)
            self.doc_lang = lang
        else:
            self.doc_lang = None

    def __call__(self):
        ''' Run the full conversion and return the path to the generated
        metadata.opf (see write()). '''
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.resolve_alternate_content(doc)
        self.fields(doc, self.log)
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        self.layers = OrderedDict()
        self.framed = [[]]
        self.frame_map = {}
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)
        self.link_source_map = {}
        self.toc_anchor = None
        self.block_runs = []
        paras = []

        self.log.debug('Converting Word markup to HTML')

        self.read_page_properties(doc)
        self.current_rels = relationships_by_id
        for wp, page_properties in iteritems(self.page_map):
            self.current_page = page_properties
            if wp.tag.endswith('}p'):
                p = self.convert_p(wp)
                self.body.append(p)
                paras.append(wp)

        self.read_block_anchors(doc)
        self.styles.apply_contextual_spacing(paras)
        self.mark_block_runs(paras)
        # Apply page breaks at the start of every section, except the first
        # section (since that will be the start of the file)
        self.styles.apply_section_page_breaks(self.section_starts[1:])

        notes_header = None
        # Notes use their own relationships, restore the document ones afterwards
        orig_rid_map = self.images.rid_map
        if self.footnotes.has_notes:
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            for anchor, text, note in self.footnotes:
                dl = DL(id=anchor)
                dl.set('class', 'footnote')
                self.body.append(dl)
                dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text)))
                dl[-1][0].tail = ']'
                dl.append(DD())
                paras = []
                self.images.rid_map = self.current_rels = note.rels[0]
                for wp in note:
                    if wp.tag.endswith('}tbl'):
                        self.tables.register(wp, self.styles)
                        self.page_map[wp] = self.current_page
                    else:
                        p = self.convert_p(wp)
                        dl[-1].append(p)
                        paras.append(wp)
                self.styles.apply_contextual_spacing(paras)
                self.mark_block_runs(paras)

        for p, wp in iteritems(self.object_map):
            if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
                # Paragraph uses tabs for indentation, convert to text-indent
                parent = p[0]
                tabs = []
                # Collect the leading tabs, stopping at the first tab that is
                # followed by text or at the first non-tab element
                for child in parent:
                    if child.get('class', None) == 'tab':
                        tabs.append(child)
                        if child.tail:
                            break
                    else:
                        break
                indent = len(tabs) * self.settings.default_tab_stop
                style = self.styles.resolve(wp)
                if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
                    if style.text_indent is not inherit:
                        indent = float(style.text_indent[:-2]) + indent
                    style.text_indent = '%.3gpt' % indent
                    parent.text = tabs[-1].tail or ''
                    for i in tabs:
                        parent.remove(i)

        self.images.rid_map = orig_rid_map

        self.resolve_links()

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map, self.page_map)

        numbered = []
        for html_obj, obj in iteritems(self.object_map):
            # calibre_num_id has the form "level:numbering id"
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
        self.apply_frames()

        if len(self.body) > 0:
            # Whitespace only, one top level element per line in the output
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        self.log.debug('Converting styles to CSS')
        self.styles.generate_classes()
        for html_obj, obj in iteritems(self.object_map):
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        for html_obj, css in iteritems(self.framed_map):
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        if notes_header is not None:
            # Make the notes header look like the first heading in the body
            # (only the first h1/h2/h3 child is considered)
            for h in self.namespace.children(self.body, 'h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        self.fields.polish_markup(self.object_map)

        self.log.debug('Cleaning up redundant markup generated by Word')
        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)

        return self.write(doc)

    def read_page_properties(self, doc):
        ''' Build self.page_map (paragraph/table -> PageProperties of its
        section) and self.section_starts (first element of every section). '''
        current = []
        self.page_map = OrderedDict()
        self.section_starts = []

        for p in self.namespace.descendants(doc, 'w:p', 'w:tbl'):
            if p.tag.endswith('}tbl'):
                self.tables.register(p, self.styles)
                current.append(p)
                continue
            sect = tuple(self.namespace.descendants(p, 'w:sectPr'))
            if sect:
                # This paragraph closes a section, everything collected so far
                # (and the paragraph itself) belongs to it
                pr = PageProperties(self.namespace, sect)
                paras = current + [p]
                for x in paras:
                    self.page_map[x] = pr
                self.section_starts.append(paras[0])
                current = []
            else:
                current.append(p)

        if current:
            # The remaining elements use the <w:sectPr> that is a direct child
            # of <w:body>
            self.section_starts.append(current[0])
            last = self.namespace.XPath('./w:body/w:sectPr')(doc)
            pr = PageProperties(self.namespace, last)
            for x in current:
                self.page_map[x] = pr

    def resolve_alternate_content(self, doc):
        # For proprietary extensions in Word documents use the fallback, spec
        # compliant form
        # See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility
        for ac in self.namespace.descendants(doc, 'mc:AlternateContent'):
            choices = self.namespace.XPath('./mc:Choice')(ac)
            fallbacks = self.namespace.XPath('./mc:Fallback')(ac)
            if fallbacks:
                # Only drop the choices when there is a fallback to use instead
                for choice in choices:
                    ac.remove(choice)

    def read_styles(self, relationships_by_type):
        ''' Load settings, footnotes/endnotes, fonts, theme, styles and
        numbering from the container. Sets self.numbering, self.footnotes and
        self.fonts. A part that cannot be read is logged and skipped. '''

        def get_name(rtype, defname):
            ''' Name of the part with relationship type ``rtype``. If there is
            no such relationship, fall back to ``defname`` in the same
            directory as the main document, provided that it exists. '''
            name = relationships_by_type.get(rtype, None)
            if name is None:
                cname = self.docx.document_name.split('/')
                cname[-1] = defname
                candidate = '/'.join(cname)
                if self.docx.exists(candidate):
                    # This used to be ``name = name``, which left name as None
                    # and made the fallback a no-op
                    name = candidate
            if name and name.startswith('word/word') and not self.docx.exists(name):
                # Strip the doubled leading directory
                name = name.partition('/')[2]
            return name

        nname = get_name(self.namespace.names['NUMBERING'], 'numbering.xml')
        sname = get_name(self.namespace.names['STYLES'], 'styles.xml')
        sename = get_name(self.namespace.names['SETTINGS'], 'settings.xml')
        fname = get_name(self.namespace.names['FONTS'], 'fontTable.xml')
        tname = get_name(self.namespace.names['THEMES'], 'theme1.xml')
        foname = get_name(self.namespace.names['FOOTNOTES'], 'footnotes.xml')
        enname = get_name(self.namespace.names['ENDNOTES'], 'endnotes.xml')
        numbering = self.numbering = Numbering(self.namespace)
        footnotes = self.footnotes = Footnotes(self.namespace)
        fonts = self.fonts = Fonts(self.namespace)

        foraw = enraw = None
        forel, enrel = ({}, {}), ({}, {})
        if sename is not None:
            try:
                seraw = self.docx.read(sename)
            except KeyError:
                self.log.warn('Settings %s do not exist' % sename)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise
                self.log.warn('Settings %s file missing' % sename)
            else:
                self.settings(fromstring(seraw))

        if foname is not None:
            try:
                foraw = self.docx.read(foname)
            except KeyError:
                self.log.warn('Footnotes %s do not exist' % foname)
            else:
                forel = self.docx.get_relationships(foname)
        if enname is not None:
            try:
                enraw = self.docx.read(enname)
            except KeyError:
                self.log.warn('Endnotes %s do not exist' % enname)
            else:
                enrel = self.docx.get_relationships(enname)
        footnotes(fromstring(foraw) if foraw else None, forel, fromstring(enraw) if enraw else None, enrel)

        if fname is not None:
            embed_relationships = self.docx.get_relationships(fname)[0]
            try:
                raw = self.docx.read(fname)
            except KeyError:
                self.log.warn('Fonts table %s does not exist' % fname)
            else:
                fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)

        if tname is not None:
            try:
                raw = self.docx.read(tname)
            except KeyError:
                # This used to report the styles part (sname) instead
                self.log.warn('Theme %s does not exist' % tname)
            else:
                self.theme(fromstring(raw))

        styles_loaded = False
        if sname is not None:
            try:
                raw = self.docx.read(sname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.styles(fromstring(raw), fonts, self.theme)
                styles_loaded = True
        if not styles_loaded:
            # Styles must always be initialized, even without a styles part
            self.styles(None, fonts, self.theme)

        if nname is not None:
            try:
                raw = self.docx.read(nname)
            except KeyError:
                self.log.warn('Numbering styles %s do not exist' % nname)
            else:
                numbering(fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])

        self.styles.resolve_numbering(numbering)

    def write(self, doc):
        ''' Write index.html, docx.css (if there is any CSS), metadata.opf and
        toc.ncx into self.dest_dir. An empty toc.ncx is deleted. Returns the
        path to metadata.opf. '''
        toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace)
        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
        with lopen(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)
        css = self.styles.generate_css(self.dest_dir, self.docx, self.notes_nopb, self.nosupsub)
        if css:
            with lopen(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
                f.write(css.encode('utf-8'))

        opf = OPFCreator(self.dest_dir, self.mi)
        opf.toc = toc
        opf.create_manifest_from_files_in([self.dest_dir])
        for item in opf.manifest:
            if item.media_type == 'text/html':
                item.media_type = guess_type('a.xhtml')[0]
        opf.create_spine(['index.html'])
        if self.cover_image is not None:
            opf.guide.set_cover(self.cover_image)

        def process_guide(E, guide):
            # Point the guide at the Table of Contents field, if one was found
            if self.toc_anchor is not None:
                guide.append(E.reference(
                    href='index.html#' + self.toc_anchor, title=_('Table of Contents'), type='toc'))
        toc_file = os.path.join(self.dest_dir, 'toc.ncx')
        with lopen(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(toc_file, 'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx', process_guide=process_guide)
        if os.path.getsize(toc_file) == 0:
            os.remove(toc_file)
        return os.path.join(self.dest_dir, 'metadata.opf')

    def read_block_anchors(self, doc):
        ''' Map named bookmarks that are direct children of <w:body> to the id
        of the next converted paragraph, giving that paragraph an id if it
        does not already have one. '''
        doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
        if doc_anchors:
            current_bm = set()
            rmap = {v:k for k, v in iteritems(self.object_map)}
            for p in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
                if p.tag.endswith('}p'):
                    if current_bm and p in rmap:
                        para = rmap[p]
                        if 'id' not in para.attrib:
                            para.set('id', generate_anchor(next(iter(current_bm)), frozenset(itervalues(self.anchor_map))))
                        for name in current_bm:
                            self.anchor_map[name] = para.get('id')
                        current_bm = set()
                elif p in doc_anchors:
                    anchor = self.namespace.get(p, 'w:name')
                    if anchor:
                        current_bm.add(anchor)

    def convert_p(self, p):
        ''' Convert the <w:p> element ``p`` into a <p> (or <h1>-<h6>) element,
        converting its runs and recording its bookmarks and hyperlinks.
        Returns the new element. '''
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.frame_map[p] = style.frame
        self.add_frame(dest, style.frame)

        current_anchor = None
        current_hyperlink = None
        hl_xpath = self.namespace.XPath('ancestor::w:hyperlink[1]')

        def p_parent(x):
            # Ensure that nested <w:p> tags are handled. These can occur if a
            # textbox is present inside a paragraph.
            while True:
                x = x.getparent()
                try:
                    if x.tag.endswith('}p'):
                        return x
                except AttributeError:
                    break

        for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'):
            if p_parent(x) is not p:
                continue
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    # A pending bookmark goes on the paragraph if this is its
                    # first run, otherwise on the run itself
                    (dest if len(dest) == 0 else span).set('id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    try:
                        hl = hl_xpath(x)[0]
                        self.link_map[hl].append(span)
                        self.link_source_map[hl] = self.current_rels
                        x.set('is-link', '1')
                    except IndexError:
                        # This run is not inside a hyperlink
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = self.namespace.get(x, 'w:name')
                if anchor and anchor not in self.anchor_map and anchor != '_GoBack':
                    # _GoBack is a special bookmark inserted by Word 2010 for
                    # the return to previous edit feature, we ignore it
                    old_anchor = current_anchor
                    self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(itervalues(self.anchor_map)))
                    if old_anchor is not None:
                        # The previous anchor was not applied to any element
                        for a, t in tuple(self.anchor_map.items()):
                            if t == old_anchor:
                                self.anchor_map[a] = current_anchor
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x
            elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '):
                old_anchor = current_anchor
                anchor = str(uuid.uuid4())
                self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(itervalues(self.anchor_map)))
                self.toc_anchor = current_anchor
                if old_anchor is not None:
                    # The previous anchor was not applied to any element
                    for a, t in tuple(iteritems(self.anchor_map)):
                        if t == old_anchor:
                            self.anchor_map[a] = current_anchor
        if current_anchor is not None:
            if dest.get('id'):
                # this bookmark was at the end of the paragraph
                if len(dest):
                    if dest[-1].get('id'):
                        self.anchor_map[current_anchor] = dest[-1].get('id')
                    else:
                        dest[-1].set('id', current_anchor)
                else:
                    self.anchor_map[current_anchor] = dest.get('id')
            else:
                # This paragraph had no <w:r> descendants
                dest.set('id', current_anchor)
            current_anchor = None

        m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
        if m is not None:
            # HTML only has six heading levels
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n
            dest.set('data-heading-level', str(n))

        if style.bidi is True:
            dest.set('dir', 'rtl')

        # Group consecutive runs that have the same border, so that the border
        # can be drawn once around the whole group.
        # NOTE(review): when a run with a different border is encountered it is
        # not used to start the next group, and a group that is still open when
        # the loop ends is never added to common_borders -- confirm that this
        # is intended.
        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(style):
                border_runs.append((span, style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        for border_run in common_borders:
            spans = []
            bs = {}
            for span, style in border_run:
                style.get_border_css(bs)
                style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        # style is still the paragraph style here, as it is only rebound in
        # the loops above when dest has children
        if not dest.text and len(dest) == 0 and not style.has_visible_border():
            # Empty paragraph add a non-breaking space so that it is rendered
            # by WebKit
            dest.text = NBSP

        # If the last element in a block is a <br> the <br> is not rendered in
        # HTML, unless it is followed by a trailing space. Word, on the other
        # hand inserts a blank line for trailing <br>s.
        if len(dest) > 0 and not dest[-1].tail:
            if dest[-1].tag == 'br':
                dest[-1].tail = NBSP
            elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
                dest[-1][-1].tail = NBSP

        return dest

    def wrap_elems(self, elems, wrapper):
        ''' Move ``elems`` into ``wrapper``, which is inserted at the position
        of the first element. The tail of the last element becomes the tail of
        the wrapper. Returns ``wrapper``. '''
        p = elems[0].getparent()
        idx = p.index(elems[0])
        p.insert(idx, wrapper)
        wrapper.tail = elems[-1].tail
        elems[-1].tail = None
        for elem in elems:
            try:
                p.remove(elem)
            except ValueError:
                # Probably a hyperlink that spans multiple
                # paragraphs,theoretically we should break this up into
                # multiple hyperlinks, but I can't be bothered.
                elem.getparent().remove(elem)
            wrapper.append(elem)
        return wrapper

    def resolve_links(self):
        ''' Turn the spans recorded for <w:hyperlink> elements and hyperlink
        fields into <a> elements and wrap linked images in <a> elements.
        Populates self.resolved_link_map (<w:hyperlink> -> <a>). '''
        self.resolved_link_map = {}
        for hyperlink, spans in iteritems(self.link_map):
            relationships_by_id = self.link_source_map[hyperlink]
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            self.resolved_link_map[hyperlink] = span
            tgt = self.namespace.get(hyperlink, 'w:tgtFrame')
            if tgt:
                span.set('target', tgt)
            tt = self.namespace.get(hyperlink, 'w:tooltip')
            if tt:
                span.set('title', tt)
            # A relationship id takes precedence over an anchor
            rid = self.namespace.get(hyperlink, 'r:id')
            if rid and rid in relationships_by_id:
                span.set('href', relationships_by_id[rid])
                continue
            anchor = self.namespace.get(hyperlink, 'w:anchor')
            if anchor and anchor in self.anchor_map:
                span.set('href', '#' + self.anchor_map[anchor])
                continue
            self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
                          (rid, anchor))
            # hrefs that point nowhere give epubcheck a hernia. The element
            # should be styled explicitly by Word anyway.
            # span.set('href', '#')
        rmap = {v:k for k, v in iteritems(self.object_map)}
        for hyperlink, runs in self.fields.hyperlink_fields:
            spans = [rmap[r] for r in runs if r in rmap]
            if not spans:
                continue
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            tgt = hyperlink.get('target', None)
            if tgt:
                span.set('target', tgt)
            tt = hyperlink.get('title', None)
            if tt:
                span.set('title', tt)
            url = hyperlink.get('url', None)
            if url is None:
                anchor = hyperlink.get('anchor', None)
                if anchor in self.anchor_map:
                    span.set('href', '#' + self.anchor_map[anchor])
                    continue
                self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
            else:
                # The url may itself be the name of a bookmark
                if url in self.anchor_map:
                    span.set('href', '#' + self.anchor_map[url])
                    continue
                span.set('href', url)

        for img, link, relationships_by_id in self.images.links:
            parent = img.getparent()
            idx = parent.index(img)
            a = A(img)
            a.tail, img.tail = img.tail, None
            parent.insert(idx, a)
            tgt = link.get('target', None)
            if tgt:
                a.set('target', tgt)
            tt = link.get('title', None)
            if tt:
                a.set('title', tt)
            rid = link['id']
            if rid in relationships_by_id:
                dest = relationships_by_id[rid]
                if dest.startswith('#'):
                    # Internal link, no href is set if the bookmark is unknown
                    if dest[1:] in self.anchor_map:
                        a.set('href', '#' + self.anchor_map[dest[1:]])
                else:
                    a.set('href', dest)

    def convert_run(self, run):
        ''' Convert the <w:r> element ``run`` into a <span> containing its
        text, line breaks, images, note references and tabs. Returns the
        <span>. '''
        ans = SPAN()
        self.object_map[ans] = run
        text = Text(ans, 'text', [])

        for child in run:
            if self.namespace.is_tag(child, 'w:t'):
                if not child.text:
                    continue
                space = child.get(XML('space'), None)
                preserve = False
                ctext = child.text
                if space != 'preserve':
                    # Remove leading and trailing whitespace. Word ignores
                    # leading and trailing whitespace without preserve
                    ctext = ctext.strip(' \n\r\t')
                # Only use a <span> with white-space:pre-wrap if this element
                # actually needs it, i.e. if it has more than one
                # consecutive space or it has newlines or tabs.
                multi_spaces = self.ms_pat.search(ctext) is not None
                preserve = multi_spaces or self.ws_pat.search(ctext) is not None
                if preserve:
                    text.add_elem(SPAN(ctext, style="white-space:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(ctext)
            elif self.namespace.is_tag(child, 'w:cr'):
                text.add_elem(BR())
                ans.append(text.elem)
            elif self.namespace.is_tag(child, 'w:br'):
                typ = self.namespace.get(child, 'w:type')
                if typ in {'column', 'page'}:
                    br = BR(style='page-break-after:always')
                else:
                    clear = child.get('clear', None)
                    if clear in {'all', 'left', 'right'}:
                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
            elif self.namespace.is_tag(child, 'w:drawing') or self.namespace.is_tag(child, 'w:pict'):
                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
                    text.add_elem(img)
                    ans.append(text.elem)
            elif self.namespace.is_tag(child, 'w:footnoteReference') or self.namespace.is_tag(child, 'w:endnoteReference'):
                anchor, name = self.footnotes.get_ref(child)
                if anchor and name:
                    # The id is the target of the back link in the notes section
                    l = A(name, id='back_%s' % anchor, href='#' + anchor, title=name)
                    l.set('class', 'noteref')
                    text.add_elem(l)
                    ans.append(text.elem)
            elif self.namespace.is_tag(child, 'w:tab'):
                # Six non-breaking spaces for every 36 units of the default tab stop
                spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
                text.add_elem(SPAN(NBSP * spaces))
                ans.append(text.elem)
                ans[-1].set('class', 'tab')
            elif self.namespace.is_tag(child, 'w:noBreakHyphen'):
                text.buf.append('\u2011')
            elif self.namespace.is_tag(child, 'w:softHyphen'):
                text.buf.append('\u00ad')
        if text.buf:
            # Flush the remaining text
            setattr(text.elem, text.attr, ''.join(text.buf))

        style = self.styles.resolve_run(run)
        if style.vert_align in {'superscript', 'subscript'}:
            if ans.text or len(ans):
                ans.set('data-docx-vert', 'sup' if style.vert_align == 'superscript' else 'sub')
        if style.lang is not inherit:
            lang = html_lang(style.lang)
            if lang is not None and lang != self.doc_lang:
                ans.set('lang', lang)
        if style.rtl is True:
            ans.set('dir', 'rtl')
        if is_symbol_font(style.font_family):
            for elem in text:
                if elem.text:
                    elem.text = map_symbol_text(elem.text, style.font_family)
                if elem.tail:
                    elem.tail = map_symbol_text(elem.tail, style.font_family)
            style.font_family = 'sans-serif'
        return ans

    def add_frame(self, html_obj, style):
        ''' Record ``html_obj`` as belonging to the current run of framed
        paragraphs in self.framed. A ``style`` of inherit (no frame) closes
        the current run, if it is non-empty. '''
        last_run = self.framed[-1]
        if style is inherit:
            if last_run:
                self.framed.append([])
            return

        if last_run:
            # NOTE(review): both branches append to the same list, so a
            # paragraph with a different frame style joins the current run
            # instead of starting a new one -- confirm that this is intended.
            if last_run[-1][1] == style:
                last_run.append((html_obj, style))
            else:
                self.framed[-1].append((html_obj, style))
        else:
            last_run.append((html_obj, style))

    def apply_frames(self):
        ''' Wrap every run of framed paragraphs and every block run (see
        mark_block_runs()) in a <div>, recording its CSS in self.framed_map. '''
        for run in filter(None, self.framed):
            # The style of the first paragraph is used for the whole run
            style = run[0][1]
            paras = tuple(x[0] for x in run)
            parent = paras[0].getparent()
            idx = parent.index(paras[0])
            frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]])
            self.styles.register(css, 'frame')

        if not self.block_runs:
            return
        rmap = {v:k for k, v in iteritems(self.object_map)}
        for border_style, blocks in self.block_runs:
            paras = tuple(rmap[p] for p in blocks)
            for p in paras:
                if p.tag == 'li':
                    has_li = True
                    break
            else:
                has_li = False
            parent = paras[0].getparent()
            if parent.tag in ('ul', 'ol'):
                # Wrap the whole list
                ul = parent
                parent = ul.getparent()
                idx = parent.index(ul)
                frame = DIV(ul)
            elif has_li:
                def top_level_tag(x):
                    # The ancestor of x (or x itself) that is a child of parent
                    while True:
                        q = x.getparent()
                        if q is parent or q is None:
                            break
                        x = q
                    return x
                paras = tuple(map(top_level_tag, paras))
                idx = parent.index(paras[0])
                frame = DIV(*paras)
            else:
                idx = parent.index(paras[0])
                frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = border_style.css
            self.styles.register(css, 'frame')

    def mark_block_runs(self, paras):
        ''' Find runs of two or more consecutive paragraphs that are in the
        same frame and have identical borders. For runs whose first paragraph
        has a visible border, the border is moved onto a single style that is
        recorded in self.block_runs. '''

        def process_run(run):
            max_left = max_right = 0
            has_visible_border = None
            for p in run:
                style = self.styles.resolve_paragraph(p)
                if has_visible_border is None:
                    # Decided by the first paragraph in the run
                    has_visible_border = style.has_visible_border()
                if isinstance(style.margin_left, numbers.Number):
                    max_left = max(style.margin_left, max_left)
                if isinstance(style.margin_right, numbers.Number):
                    max_right = max(style.margin_right, max_right)
                if has_visible_border:
                    style.margin_left = style.margin_right = inherit
                if p is not run[0]:
                    style.padding_top = 0
                else:
                    border_style = style.clone_border_styles()
                    if has_visible_border:
                        # The outer margins move from the paragraphs to the block
                        border_style.margin_top, style.margin_top = style.margin_top, inherit
                if p is not run[-1]:
                    style.padding_bottom = 0
                else:
                    if has_visible_border:
                        border_style.margin_bottom, style.margin_bottom = style.margin_bottom, inherit
                style.clear_borders()
                if p is not run[-1]:
                    style.apply_between_border()
            if has_visible_border:
                border_style.margin_left, border_style.margin_right = max_left,max_right
                self.block_runs.append((border_style, run))

        run = []
        for p in paras:
            if run and self.frame_map.get(p) == self.frame_map.get(run[-1]):
                style = self.styles.resolve_paragraph(p)
                last_style = self.styles.resolve_paragraph(run[-1])
                if style.has_identical_borders(last_style):
                    run.append(p)
                    continue
            if len(run) > 1:
                process_run(run)
            run = [p]
        if len(run) > 1:
            process_run(run)


if __name__ == '__main__':
    import shutil
    from calibre.utils.logging import default_log
    default_log.filter_level = default_log.DEBUG
    dest_dir = os.path.join(os.getcwd(), 'docx_input')
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)
    os.mkdir(dest_dir)
    Convert(sys.argv[-1], dest_dir=dest_dir, log=default_log)()