1#!/usr/local/bin/python3.8
2# vim:fileencoding=utf-8
3
4
5__license__ = 'GPL v3'
6__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
7
8import sys, os, re, math, errno, uuid, numbers
9from collections import OrderedDict, defaultdict
10
11from lxml import html
12from lxml.html.builder import (
13    HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1)
14
15from calibre import guess_type
16from calibre.ebooks.docx.container import DOCX, fromstring
17from calibre.ebooks.docx.names import XML, generate_anchor
18from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
19from calibre.ebooks.docx.numbering import Numbering
20from calibre.ebooks.docx.fonts import Fonts, is_symbol_font, map_symbol_text
21from calibre.ebooks.docx.images import Images
22from calibre.ebooks.docx.tables import Tables
23from calibre.ebooks.docx.footnotes import Footnotes
24from calibre.ebooks.docx.cleanup import cleanup_markup
25from calibre.ebooks.docx.theme import Theme
26from calibre.ebooks.docx.toc import create_toc
27from calibre.ebooks.docx.fields import Fields
28from calibre.ebooks.docx.settings import Settings
29from calibre.ebooks.metadata.opf2 import OPFCreator
30from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
31from polyglot.builtins import iteritems, itervalues
32
33
# Non-breaking space (U+00A0). Used in this module as the text of empty
# paragraphs, as the tail of trailing <br> elements and as the filler for tab
# placeholder spans.
NBSP = '\xa0'
35
36
class Text:
    """Buffer text fragments and flush them into the right element slot.

    Fragments are collected in ``buf``. They belong in the ``attr`` attribute
    (``'text'`` or ``'tail'``) of the current ``elem``. Every element that
    has been current is remembered and can be iterated over.
    """

    def __init__(self, elem, attr, buf):
        self.elem = elem
        self.attr = attr
        self.buf = buf
        self.elems = [elem]

    def add_elem(self, elem):
        """Flush the buffer into the current slot and make *elem* current.

        After this call, buffered text is destined for ``elem.tail``.
        """
        self.elems.append(elem)
        flushed = ''.join(self.buf)
        setattr(self.elem, self.attr, flushed)
        self.elem = elem
        self.attr = 'tail'
        self.buf = []

    def __iter__(self):
        return iter(self.elems)
50
51
def html_lang(docx_lang):
    """Return the ISO 639-1 code for *docx_lang*, or None.

    None is returned when the language cannot be canonicalized, when it
    canonicalizes to ``'und'`` or when no ISO 639-1 form is available.
    """
    canonical = canonicalize_lang(docx_lang)
    if not canonical or canonical == 'und':
        return None
    short = lang_as_iso639_1(canonical)
    if not short:
        return None
    return short
58
59
60class Convert:
61
62    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
63        self.docx = DOCX(path_or_stream, log=log)
64        self.namespace = self.docx.namespace
65        self.ms_pat = re.compile(r'\s{2,}')
66        self.ws_pat = re.compile(r'[\n\r\t]')
67        self.log = self.docx.log
68        self.detect_cover = detect_cover
69        self.notes_text = notes_text or _('Notes')
70        self.notes_nopb = notes_nopb
71        self.nosupsub = nosupsub
72        self.dest_dir = dest_dir or os.getcwd()
73        self.mi = self.docx.metadata
74        self.body = BODY()
75        self.theme = Theme(self.namespace)
76        self.settings = Settings(self.namespace)
77        self.tables = Tables(self.namespace)
78        self.fields = Fields(self.namespace)
79        self.styles = Styles(self.namespace, self.tables)
80        self.images = Images(self.namespace, self.log)
81        self.object_map = OrderedDict()
82        self.html = HTML(
83            HEAD(
84                META(charset='utf-8'),
85                TITLE(self.mi.title or _('Unknown')),
86                LINK(rel='stylesheet', type='text/css', href='docx.css'),
87            ),
88            self.body
89        )
90        self.html.text='\n\t'
91        self.html[0].text='\n\t\t'
92        self.html[0].tail='\n'
93        for child in self.html[0]:
94            child.tail = '\n\t\t'
95        self.html[0][-1].tail = '\n\t'
96        self.html[1].text = self.html[1].tail = '\n'
97        lang = html_lang(self.mi.language)
98        if lang:
99            self.html.set('lang', lang)
100            self.doc_lang = lang
101        else:
102            self.doc_lang = None
103
    def __call__(self):
        """Run the conversion and write the output files.

        Converts the paragraphs of the main document and of the notes to
        HTML, then applies tab indentation, links, style cascading, table
        markup, numbering, frames and CSS classes before cleaning up the
        markup. Returns whatever ``self.write()`` returns.
        """
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.resolve_alternate_content(doc)
        self.fields(doc, self.log)
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        # Per-conversion state filled in by convert_p()/convert_run() and
        # consumed by the later passes below
        self.layers = OrderedDict()
        self.framed = [[]]
        self.frame_map = {}
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)
        self.link_source_map = {}
        self.toc_anchor = None
        self.block_runs = []
        paras = []

        self.log.debug('Converting Word markup to HTML')

        self.read_page_properties(doc)
        self.current_rels = relationships_by_id
        # page_map contains tables as well as paragraphs, only the paragraphs
        # are converted here
        for wp, page_properties in iteritems(self.page_map):
            self.current_page = page_properties
            if wp.tag.endswith('}p'):
                p = self.convert_p(wp)
                self.body.append(p)
                paras.append(wp)

        self.read_block_anchors(doc)
        self.styles.apply_contextual_spacing(paras)
        self.mark_block_runs(paras)
        # Apply page breaks at the start of every section, except the first
        # section (since that will be the start of the file)
        self.styles.apply_section_page_breaks(self.section_starts[1:])

        notes_header = None
        # Remembered so that it can be restored after the notes have been
        # converted with their own relationship maps
        orig_rid_map = self.images.rid_map
        if self.footnotes.has_notes:
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            for anchor, text, note in self.footnotes:
                # One <dl class="footnote"> per note: the <dt> holds the
                # back-link, the <dd> holds the converted note paragraphs
                dl = DL(id=anchor)
                dl.set('class', 'footnote')
                self.body.append(dl)
                dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text)))
                dl[-1][0].tail = ']'
                dl.append(DD())
                paras = []
                # Relationship ids inside a note are looked up in the note's
                # own relationship map
                self.images.rid_map = self.current_rels = note.rels[0]
                for wp in note:
                    if wp.tag.endswith('}tbl'):
                        self.tables.register(wp, self.styles)
                        self.page_map[wp] = self.current_page
                    else:
                        p = self.convert_p(wp)
                        dl[-1].append(p)
                        paras.append(wp)
                self.styles.apply_contextual_spacing(paras)
                self.mark_block_runs(paras)

        for p, wp in iteritems(self.object_map):
            if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
                # Paragraph uses tabs for indentation, convert to text-indent
                parent = p[0]
                tabs = []
                # Collect the leading tab spans, stopping after the first one
                # that is followed by text
                for child in parent:
                    if child.get('class', None) == 'tab':
                        tabs.append(child)
                        if child.tail:
                            break
                    else:
                        break
                indent = len(tabs) * self.settings.default_tab_stop
                style = self.styles.resolve(wp)
                # Only rewrite when the existing text-indent is unset or is
                # expressed in points, so that the tab width can be added to it
                if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
                    if style.text_indent is not inherit:
                        indent = float(style.text_indent[:-2]) + indent
                    style.text_indent = '%.3gpt' % indent
                    parent.text = tabs[-1].tail or ''
                    for i in tabs:
                        parent.remove(i)

        self.images.rid_map = orig_rid_map

        self.resolve_links()

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map, self.page_map)

        numbered = []
        for html_obj, obj in iteritems(self.object_map):
            # calibre_num_id is parsed as "<level>:<numbering id>"; it is
            # presumably set on the Word element by the styles/numbering code
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
        self.apply_frames()

        # Whitespace between the children of <body> for readable output
        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        self.log.debug('Converting styles to CSS')
        self.styles.generate_classes()
        for html_obj, obj in iteritems(self.object_map):
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        for html_obj, css in iteritems(self.framed_map):
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        if notes_header is not None:
            # Give the notes header the tag and class of the first heading
            # returned by namespace.children(); only that first one is used
            for h in self.namespace.children(self.body, 'h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        self.fields.polish_markup(self.object_map)

        self.log.debug('Cleaning up redundant markup generated by Word')
        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)

        return self.write(doc)
244
245    def read_page_properties(self, doc):
246        current = []
247        self.page_map = OrderedDict()
248        self.section_starts = []
249
250        for p in self.namespace.descendants(doc, 'w:p', 'w:tbl'):
251            if p.tag.endswith('}tbl'):
252                self.tables.register(p, self.styles)
253                current.append(p)
254                continue
255            sect = tuple(self.namespace.descendants(p, 'w:sectPr'))
256            if sect:
257                pr = PageProperties(self.namespace, sect)
258                paras = current + [p]
259                for x in paras:
260                    self.page_map[x] = pr
261                self.section_starts.append(paras[0])
262                current = []
263            else:
264                current.append(p)
265
266        if current:
267            self.section_starts.append(current[0])
268            last = self.namespace.XPath('./w:body/w:sectPr')(doc)
269            pr = PageProperties(self.namespace, last)
270            for x in current:
271                self.page_map[x] = pr
272
273    def resolve_alternate_content(self, doc):
274        # For proprietary extensions in Word documents use the fallback, spec
275        # compliant form
276        # See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility
277        for ac in self.namespace.descendants(doc, 'mc:AlternateContent'):
278            choices = self.namespace.XPath('./mc:Choice')(ac)
279            fallbacks = self.namespace.XPath('./mc:Fallback')(ac)
280            if fallbacks:
281                for choice in choices:
282                    ac.remove(choice)
283
284    def read_styles(self, relationships_by_type):
285
286        def get_name(rtype, defname):
287            name = relationships_by_type.get(rtype, None)
288            if name is None:
289                cname = self.docx.document_name.split('/')
290                cname[-1] = defname
291                if self.docx.exists('/'.join(cname)):
292                    name = name
293            if name and name.startswith('word/word') and not self.docx.exists(name):
294                name = name.partition('/')[2]
295            return name
296
297        nname = get_name(self.namespace.names['NUMBERING'], 'numbering.xml')
298        sname = get_name(self.namespace.names['STYLES'], 'styles.xml')
299        sename = get_name(self.namespace.names['SETTINGS'], 'settings.xml')
300        fname = get_name(self.namespace.names['FONTS'], 'fontTable.xml')
301        tname = get_name(self.namespace.names['THEMES'], 'theme1.xml')
302        foname = get_name(self.namespace.names['FOOTNOTES'], 'footnotes.xml')
303        enname = get_name(self.namespace.names['ENDNOTES'], 'endnotes.xml')
304        numbering = self.numbering = Numbering(self.namespace)
305        footnotes = self.footnotes = Footnotes(self.namespace)
306        fonts = self.fonts = Fonts(self.namespace)
307
308        foraw = enraw = None
309        forel, enrel = ({}, {}), ({}, {})
310        if sename is not None:
311            try:
312                seraw = self.docx.read(sename)
313            except KeyError:
314                self.log.warn('Settings %s do not exist' % sename)
315            except OSError as e:
316                if e.errno != errno.ENOENT:
317                    raise
318                self.log.warn('Settings %s file missing' % sename)
319            else:
320                self.settings(fromstring(seraw))
321
322        if foname is not None:
323            try:
324                foraw = self.docx.read(foname)
325            except KeyError:
326                self.log.warn('Footnotes %s do not exist' % foname)
327            else:
328                forel = self.docx.get_relationships(foname)
329        if enname is not None:
330            try:
331                enraw = self.docx.read(enname)
332            except KeyError:
333                self.log.warn('Endnotes %s do not exist' % enname)
334            else:
335                enrel = self.docx.get_relationships(enname)
336        footnotes(fromstring(foraw) if foraw else None, forel, fromstring(enraw) if enraw else None, enrel)
337
338        if fname is not None:
339            embed_relationships = self.docx.get_relationships(fname)[0]
340            try:
341                raw = self.docx.read(fname)
342            except KeyError:
343                self.log.warn('Fonts table %s does not exist' % fname)
344            else:
345                fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)
346
347        if tname is not None:
348            try:
349                raw = self.docx.read(tname)
350            except KeyError:
351                self.log.warn('Styles %s do not exist' % sname)
352            else:
353                self.theme(fromstring(raw))
354
355        styles_loaded = False
356        if sname is not None:
357            try:
358                raw = self.docx.read(sname)
359            except KeyError:
360                self.log.warn('Styles %s do not exist' % sname)
361            else:
362                self.styles(fromstring(raw), fonts, self.theme)
363                styles_loaded = True
364        if not styles_loaded:
365            self.styles(None, fonts, self.theme)
366
367        if nname is not None:
368            try:
369                raw = self.docx.read(nname)
370            except KeyError:
371                self.log.warn('Numbering styles %s do not exist' % nname)
372            else:
373                numbering(fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])
374
375        self.styles.resolve_numbering(numbering)
376
377    def write(self, doc):
378        toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace)
379        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
380        with lopen(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
381            f.write(raw)
382        css = self.styles.generate_css(self.dest_dir, self.docx, self.notes_nopb, self.nosupsub)
383        if css:
384            with lopen(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
385                f.write(css.encode('utf-8'))
386
387        opf = OPFCreator(self.dest_dir, self.mi)
388        opf.toc = toc
389        opf.create_manifest_from_files_in([self.dest_dir])
390        for item in opf.manifest:
391            if item.media_type == 'text/html':
392                item.media_type = guess_type('a.xhtml')[0]
393        opf.create_spine(['index.html'])
394        if self.cover_image is not None:
395            opf.guide.set_cover(self.cover_image)
396
397        def process_guide(E, guide):
398            if self.toc_anchor is not None:
399                guide.append(E.reference(
400                    href='index.html#' + self.toc_anchor, title=_('Table of Contents'), type='toc'))
401        toc_file = os.path.join(self.dest_dir, 'toc.ncx')
402        with lopen(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(toc_file, 'wb') as ncx:
403            opf.render(of, ncx, 'toc.ncx', process_guide=process_guide)
404        if os.path.getsize(toc_file) == 0:
405            os.remove(toc_file)
406        return os.path.join(self.dest_dir, 'metadata.opf')
407
408    def read_block_anchors(self, doc):
409        doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
410        if doc_anchors:
411            current_bm = set()
412            rmap = {v:k for k, v in iteritems(self.object_map)}
413            for p in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
414                if p.tag.endswith('}p'):
415                    if current_bm and p in rmap:
416                        para = rmap[p]
417                        if 'id' not in para.attrib:
418                            para.set('id', generate_anchor(next(iter(current_bm)), frozenset(itervalues(self.anchor_map))))
419                        for name in current_bm:
420                            self.anchor_map[name] = para.get('id')
421                        current_bm = set()
422                elif p in doc_anchors:
423                    anchor = self.namespace.get(p, 'w:name')
424                    if anchor:
425                        current_bm.add(anchor)
426
    def convert_p(self, p):
        """Convert the Word paragraph *p* into an HTML block element.

        The runs of the paragraph are converted with ``convert_run()`` and
        appended to the result. Bookmarks, hyperlinks and TOC field
        instructions found on the way are recorded in ``self.anchor_map``,
        ``self.link_map``, ``self.link_source_map`` and ``self.toc_anchor``.

        :return: the new element, a ``<p>`` or, for paragraphs whose style
            name is ``heading N``, the matching ``<hN>``
        """
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.frame_map[p] = style.frame
        self.add_frame(dest, style.frame)

        # Generated id of a bookmark that has not yet been put on an element
        current_anchor = None
        current_hyperlink = None
        hl_xpath = self.namespace.XPath('ancestor::w:hyperlink[1]')

        def p_parent(x):
            # Ensure that nested <w:p> tags are handled. These can occur if a
            # textbox is present inside a paragraph.
            while True:
                x = x.getparent()
                try:
                    if x.tag.endswith('}p'):
                        return x
                except AttributeError:
                    break

        for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'):
            # Skip anything whose nearest enclosing paragraph is not p
            if p_parent(x) is not p:
                continue
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    # The pending bookmark goes on the paragraph itself when
                    # this is its first run, otherwise on the span of this run
                    (dest if len(dest) == 0 else span).set('id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    try:
                        hl = hl_xpath(x)[0]
                        self.link_map[hl].append(span)
                        self.link_source_map[hl] = self.current_rels
                        x.set('is-link', '1')
                    except IndexError:
                        # This run has no enclosing hyperlink
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = self.namespace.get(x, 'w:name')
                if anchor and anchor not in self.anchor_map and anchor != '_GoBack':
                    # _GoBack is a special bookmark inserted by Word 2010 for
                    # the return to previous edit feature, we ignore it
                    old_anchor = current_anchor
                    self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(itervalues(self.anchor_map)))
                    if old_anchor is not None:
                        # The previous anchor was not applied to any element
                        for a, t in tuple(self.anchor_map.items()):
                            if t == old_anchor:
                                self.anchor_map[a] = current_anchor
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x
            elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '):
                # A TOC field instruction, give its location an anchor under a
                # random, unique, name
                old_anchor = current_anchor
                anchor = str(uuid.uuid4())
                self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(itervalues(self.anchor_map)))
                self.toc_anchor = current_anchor
                if old_anchor is not None:
                    # The previous anchor was not applied to any element
                    for a, t in tuple(iteritems(self.anchor_map)):
                        if t == old_anchor:
                            self.anchor_map[a] = current_anchor
        if current_anchor is not None:
            if dest.get('id'):
                # this bookmark was at the end of the paragraph
                if len(dest):
                    if dest[-1].get('id'):
                        self.anchor_map[current_anchor] = dest[-1].get('id')
                    else:
                        dest[-1].set('id', current_anchor)
                else:
                    self.anchor_map[current_anchor] = dest.get('id')
            else:
                # This paragraph had no <w:r> descendants
                dest.set('id', current_anchor)
            current_anchor = None

        # Style names of the form "heading N" become <hN>, with N clamped to
        # the range 1-6
        m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n
            dest.set('data-heading-level', str(n))

        if style.bidi is True:
            dest.set('dir', 'rtl')

        # Collect consecutive spans for which same_border() holds against the
        # previous span. Note that this loop rebinds the name "style" to run
        # styles.
        # NOTE(review): a group that is still open when the loop ends is never
        # added to common_borders, and the span that closes a group is not
        # used to start the next one -- confirm that this is intended.
        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(style):
                border_runs.append((span, style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        # Move the border CSS of every collected group onto a single wrapper
        # <span> around the group
        for border_run in common_borders:
            spans = []
            bs = {}
            for span, style in border_run:
                style.get_border_css(bs)
                style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        # dest has no children here, so neither loop above ran and "style" is
        # still the paragraph style
        if not dest.text and len(dest) == 0 and not style.has_visible_border():
            # Empty paragraph add a non-breaking space so that it is rendered
            # by WebKit
            dest.text = NBSP

        # If the last element in a block is a <br> the <br> is not rendered in
        # HTML, unless it is followed by a trailing space. Word, on the other
        # hand inserts a blank line for trailing <br>s.
        if len(dest) > 0 and not dest[-1].tail:
            if dest[-1].tag == 'br':
                dest[-1].tail = NBSP
            elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
                dest[-1][-1].tail = NBSP

        return dest
555
556    def wrap_elems(self, elems, wrapper):
557        p = elems[0].getparent()
558        idx = p.index(elems[0])
559        p.insert(idx, wrapper)
560        wrapper.tail = elems[-1].tail
561        elems[-1].tail = None
562        for elem in elems:
563            try:
564                p.remove(elem)
565            except ValueError:
566                # Probably a hyperlink that spans multiple
567                # paragraphs,theoretically we should break this up into
568                # multiple hyperlinks, but I can't be bothered.
569                elem.getparent().remove(elem)
570            wrapper.append(elem)
571        return wrapper
572
573    def resolve_links(self):
574        self.resolved_link_map = {}
575        for hyperlink, spans in iteritems(self.link_map):
576            relationships_by_id = self.link_source_map[hyperlink]
577            span = spans[0]
578            if len(spans) > 1:
579                span = self.wrap_elems(spans, SPAN())
580            span.tag = 'a'
581            self.resolved_link_map[hyperlink] = span
582            tgt = self.namespace.get(hyperlink, 'w:tgtFrame')
583            if tgt:
584                span.set('target', tgt)
585            tt = self.namespace.get(hyperlink, 'w:tooltip')
586            if tt:
587                span.set('title', tt)
588            rid = self.namespace.get(hyperlink, 'r:id')
589            if rid and rid in relationships_by_id:
590                span.set('href', relationships_by_id[rid])
591                continue
592            anchor = self.namespace.get(hyperlink, 'w:anchor')
593            if anchor and anchor in self.anchor_map:
594                span.set('href', '#' + self.anchor_map[anchor])
595                continue
596            self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
597                          (rid, anchor))
598            # hrefs that point nowhere give epubcheck a hernia. The element
599            # should be styled explicitly by Word anyway.
600            # span.set('href', '#')
601        rmap = {v:k for k, v in iteritems(self.object_map)}
602        for hyperlink, runs in self.fields.hyperlink_fields:
603            spans = [rmap[r] for r in runs if r in rmap]
604            if not spans:
605                continue
606            span = spans[0]
607            if len(spans) > 1:
608                span = self.wrap_elems(spans, SPAN())
609            span.tag = 'a'
610            tgt = hyperlink.get('target', None)
611            if tgt:
612                span.set('target', tgt)
613            tt = hyperlink.get('title', None)
614            if tt:
615                span.set('title', tt)
616            url = hyperlink.get('url', None)
617            if url is None:
618                anchor = hyperlink.get('anchor', None)
619                if anchor in self.anchor_map:
620                    span.set('href', '#' + self.anchor_map[anchor])
621                    continue
622                self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
623            else:
624                if url in self.anchor_map:
625                    span.set('href', '#' + self.anchor_map[url])
626                    continue
627                span.set('href', url)
628
629        for img, link, relationships_by_id in self.images.links:
630            parent = img.getparent()
631            idx = parent.index(img)
632            a = A(img)
633            a.tail, img.tail = img.tail, None
634            parent.insert(idx, a)
635            tgt = link.get('target', None)
636            if tgt:
637                a.set('target', tgt)
638            tt = link.get('title', None)
639            if tt:
640                a.set('title', tt)
641            rid = link['id']
642            if rid in relationships_by_id:
643                dest = relationships_by_id[rid]
644                if dest.startswith('#'):
645                    if dest[1:] in self.anchor_map:
646                        a.set('href', '#' + self.anchor_map[dest[1:]])
647                else:
648                    a.set('href', dest)
649
650    def convert_run(self, run):
651        ans = SPAN()
652        self.object_map[ans] = run
653        text = Text(ans, 'text', [])
654
655        for child in run:
656            if self.namespace.is_tag(child, 'w:t'):
657                if not child.text:
658                    continue
659                space = child.get(XML('space'), None)
660                preserve = False
661                ctext = child.text
662                if space != 'preserve':
663                    # Remove leading and trailing whitespace. Word ignores
664                    # leading and trailing whitespace without preserve
665                    ctext = ctext.strip(' \n\r\t')
666                # Only use a <span> with white-space:pre-wrap if this element
667                # actually needs it, i.e. if it has more than one
668                # consecutive space or it has newlines or tabs.
669                multi_spaces = self.ms_pat.search(ctext) is not None
670                preserve = multi_spaces or self.ws_pat.search(ctext) is not None
671                if preserve:
672                    text.add_elem(SPAN(ctext, style="white-space:pre-wrap"))
673                    ans.append(text.elem)
674                else:
675                    text.buf.append(ctext)
676            elif self.namespace.is_tag(child, 'w:cr'):
677                text.add_elem(BR())
678                ans.append(text.elem)
679            elif self.namespace.is_tag(child, 'w:br'):
680                typ = self.namespace.get(child, 'w:type')
681                if typ in {'column', 'page'}:
682                    br = BR(style='page-break-after:always')
683                else:
684                    clear = child.get('clear', None)
685                    if clear in {'all', 'left', 'right'}:
686                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
687                    else:
688                        br = BR()
689                text.add_elem(br)
690                ans.append(text.elem)
691            elif self.namespace.is_tag(child, 'w:drawing') or self.namespace.is_tag(child, 'w:pict'):
692                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
693                    text.add_elem(img)
694                    ans.append(text.elem)
695            elif self.namespace.is_tag(child, 'w:footnoteReference') or self.namespace.is_tag(child, 'w:endnoteReference'):
696                anchor, name = self.footnotes.get_ref(child)
697                if anchor and name:
698                    l = A(name, id='back_%s' % anchor, href='#' + anchor, title=name)
699                    l.set('class', 'noteref')
700                    text.add_elem(l)
701                    ans.append(text.elem)
702            elif self.namespace.is_tag(child, 'w:tab'):
703                spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
704                text.add_elem(SPAN(NBSP * spaces))
705                ans.append(text.elem)
706                ans[-1].set('class', 'tab')
707            elif self.namespace.is_tag(child, 'w:noBreakHyphen'):
708                text.buf.append('\u2011')
709            elif self.namespace.is_tag(child, 'w:softHyphen'):
710                text.buf.append('\u00ad')
711        if text.buf:
712            setattr(text.elem, text.attr, ''.join(text.buf))
713
714        style = self.styles.resolve_run(run)
715        if style.vert_align in {'superscript', 'subscript'}:
716            if ans.text or len(ans):
717                ans.set('data-docx-vert', 'sup' if style.vert_align == 'superscript' else 'sub')
718        if style.lang is not inherit:
719            lang = html_lang(style.lang)
720            if lang is not None and lang != self.doc_lang:
721                ans.set('lang', lang)
722        if style.rtl is True:
723            ans.set('dir', 'rtl')
724        if is_symbol_font(style.font_family):
725            for elem in text:
726                if elem.text:
727                    elem.text = map_symbol_text(elem.text, style.font_family)
728                if elem.tail:
729                    elem.tail = map_symbol_text(elem.tail, style.font_family)
730            style.font_family = 'sans-serif'
731        return ans
732
733    def add_frame(self, html_obj, style):
734        last_run = self.framed[-1]
735        if style is inherit:
736            if last_run:
737                self.framed.append([])
738            return
739
740        if last_run:
741            if last_run[-1][1] == style:
742                last_run.append((html_obj, style))
743            else:
744                self.framed[-1].append((html_obj, style))
745        else:
746            last_run.append((html_obj, style))
747
748    def apply_frames(self):
749        for run in filter(None, self.framed):
750            style = run[0][1]
751            paras = tuple(x[0] for x in run)
752            parent = paras[0].getparent()
753            idx = parent.index(paras[0])
754            frame = DIV(*paras)
755            parent.insert(idx, frame)
756            self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]])
757            self.styles.register(css, 'frame')
758
759        if not self.block_runs:
760            return
761        rmap = {v:k for k, v in iteritems(self.object_map)}
762        for border_style, blocks in self.block_runs:
763            paras = tuple(rmap[p] for p in blocks)
764            for p in paras:
765                if p.tag == 'li':
766                    has_li = True
767                    break
768            else:
769                has_li = False
770            parent = paras[0].getparent()
771            if parent.tag in ('ul', 'ol'):
772                ul = parent
773                parent = ul.getparent()
774                idx = parent.index(ul)
775                frame = DIV(ul)
776            elif has_li:
777                def top_level_tag(x):
778                    while True:
779                        q = x.getparent()
780                        if q is parent or q is None:
781                            break
782                        x = q
783                    return x
784                paras = tuple(map(top_level_tag, paras))
785                idx = parent.index(paras[0])
786                frame = DIV(*paras)
787            else:
788                idx = parent.index(paras[0])
789                frame = DIV(*paras)
790            parent.insert(idx, frame)
791            self.framed_map[frame] = css = border_style.css
792            self.styles.register(css, 'frame')
793
794    def mark_block_runs(self, paras):
795
796        def process_run(run):
797            max_left = max_right = 0
798            has_visible_border = None
799            for p in run:
800                style = self.styles.resolve_paragraph(p)
801                if has_visible_border is None:
802                    has_visible_border = style.has_visible_border()
803                if isinstance(style.margin_left, numbers.Number):
804                    max_left = max(style.margin_left, max_left)
805                if isinstance(style.margin_right, numbers.Number):
806                    max_right = max(style.margin_right, max_right)
807                if has_visible_border:
808                    style.margin_left = style.margin_right = inherit
809                if p is not run[0]:
810                    style.padding_top = 0
811                else:
812                    border_style = style.clone_border_styles()
813                    if has_visible_border:
814                        border_style.margin_top, style.margin_top = style.margin_top, inherit
815                if p is not run[-1]:
816                    style.padding_bottom = 0
817                else:
818                    if has_visible_border:
819                        border_style.margin_bottom, style.margin_bottom = style.margin_bottom, inherit
820                style.clear_borders()
821                if p is not run[-1]:
822                    style.apply_between_border()
823            if has_visible_border:
824                border_style.margin_left, border_style.margin_right = max_left,max_right
825                self.block_runs.append((border_style, run))
826
827        run = []
828        for p in paras:
829            if run and self.frame_map.get(p) == self.frame_map.get(run[-1]):
830                style = self.styles.resolve_paragraph(p)
831                last_style = self.styles.resolve_paragraph(run[-1])
832                if style.has_identical_borders(last_style):
833                    run.append(p)
834                    continue
835            if len(run) > 1:
836                process_run(run)
837            run = [p]
838        if len(run) > 1:
839            process_run(run)
840
841
if __name__ == '__main__':
    # Developer entry point: convert the file named by the last command line
    # argument into a freshly created ./docx_input directory, with the log
    # filter level set to DEBUG.
    import shutil
    from calibre.utils.logging import default_log

    default_log.filter_level = default_log.DEBUG
    dest_dir = os.path.join(os.getcwd(), 'docx_input')
    # Discard the output of any previous run so the directory starts empty
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)
    os.mkdir(dest_dir)
    input_path = sys.argv[-1]
    converter = Convert(input_path, dest_dir=dest_dir, log=default_log)
    converter()
851