#!/usr/local/bin/python3.8
# vim:fileencoding=utf-8


__license__ = 'GPL v3'
__copyright__ = '2015, Kovid Goyal <kovid at kovidgoyal.net>'

import numbers
from collections import Counter, defaultdict
from operator import attrgetter

from lxml import etree

from calibre.ebooks import parse_css_length
from calibre.ebooks.docx.writer.utils import convert_color, int_or_zero
from calibre.utils.localization import lang_as_iso639_1
from polyglot.builtins import iteritems
from tinycss.css21 import CSS21Parser

css_parser = CSS21Parser()

# The four box edges, and the per-edge attribute name templates that are
# stored on block level styles (see read_css_block_borders)
border_edges = ('left', 'top', 'right', 'bottom')
border_props = ('padding_%s', 'border_%s_width', 'border_%s_style', 'border_%s_color')
# Sentinel used by TextStyle to mark a border/padding property whose value is
# not the same on all four edges
ignore = object()


def parse_css_font_family(raw):
    '''
    Yield the font family names found in ``raw``, the value of a CSS
    font-family property, in the order they are listed.

    Only string and identifier tokens are yielded. Iteration stops at the
    first ``inherit`` keyword. Nothing is yielded if the value cannot be
    parsed into a declaration.
    '''
    decl, errs = css_parser.parse_style_attr('font-family:' + raw)
    if decl:
        for token in decl[0].value:
            # Exact membership test: a substring test against 'STRING IDENT'
            # would also accept any token type that happens to be a substring
            # of that text
            if token.type in ('STRING', 'IDENT'):
                val = token.value
                if val == 'inherit':
                    break
                yield val


def css_font_family_to_docx(raw):
    '''
    Return the first font family in the CSS font-family value ``raw``, with
    the generic CSS families mapped to concrete font names. Returns None if
    no family is found.
    '''
    generic = {'serif':'Cambria', 'sansserif':'Candara', 'sans-serif':'Candara', 'fantasy':'Comic Sans', 'cursive':'Segoe Script'}
    for ff in parse_css_font_family(raw):
        # Only the first listed family is used
        return generic.get(ff.lower(), ff)


def bmap(x):
    ''' Map a truth value to the strings 'on' / 'off' used for toggle properties. '''
    return 'on' if x else 'off'


def is_dropcaps(html_tag, tag_style):
    '''
    Return True if ``html_tag`` should be treated as a drop cap: it has fewer
    than two children, fewer than five characters of text and is floated left.
    '''
    return len(html_tag) < 2 and len(etree.tostring(html_tag, method='text', encoding='unicode', with_tail=False)) < 5 and tag_style['float'] == 'left'


class CombinedStyle:

    '''
    A paragraph style made by pairing a block style (``bs``) with a text
    style (``rs``). ``blocks`` are the blocks that use this pairing.
    ``id``, ``name``, ``seq`` and ``outline_level`` start out as None and are
    assigned by StylesManager.finalize().
    '''

    def __init__(self, bs, rs, blocks, namespace):
        self.bs, self.rs, self.blocks = bs, rs, blocks
        self.namespace = namespace
        self.id = self.name = self.seq = None
        self.outline_level = None

    def apply(self):
        ''' Link every block to this style and set the parent style of each of its runs to ``rs``. '''
        for block in self.blocks:
            block.linked_style = self
            for run in block.runs:
                run.parent_style = self.rs

    def serialize(self, styles, normal_style):
        '''
        Create the <w:style> element for this style as a child of ``styles``.
        Properties are written relative to ``normal_style``, which is also
        used as the basedOn style unless this style is ``normal_style``.
        '''
        makeelement = self.namespace.makeelement
        w = lambda x: '{%s}%s' % (self.namespace.namespaces['w'], x)
        block = makeelement(styles, 'w:style', w_styleId=self.id, w_type='paragraph')
        makeelement(block, 'w:name', w_val=self.name)
        makeelement(block, 'w:qFormat')
        if self is not normal_style:
            makeelement(block, 'w:basedOn', w_val=normal_style.id)
        if self.seq == 0:
            block.set(w('default'), '1')
        pPr = makeelement(block, 'w:pPr')
        self.bs.serialize_properties(pPr, normal_style.bs)
        if self.outline_level is not None:
            makeelement(pPr, 'w:outlineLvl', w_val=str(self.outline_level + 1))
        rPr = makeelement(block, 'w:rPr')
        self.rs.serialize_properties(rPr, normal_style.rs)


class FloatSpec:

    '''
    Frame properties for a floating HTML element. The element is either a
    drop cap (see is_dropcaps()) or a general frame whose alignment, size and
    spacing are read from ``tag_style``. Border and padding values are read
    in both cases. ``blocks`` is created empty here and is read by
    serialize().
    '''

    def __init__(self, namespace, html_tag, tag_style):
        self.makeelement = namespace.makeelement
        self.is_dropcaps = is_dropcaps(html_tag, tag_style)
        self.blocks = []
        if self.is_dropcaps:
            self.dropcaps_lines = 3
        else:
            self.x_align = tag_style['float']
            self.w = self.h = None
            if tag_style._get('width') != 'auto':
                self.w = int(20 * max(tag_style['min-width'], tag_style['width']))
            if tag_style._get('height') == 'auto':
                self.h_rule = 'auto'
            else:
                if tag_style['min-height'] > 0:
                    self.h_rule, self.h = 'atLeast', tag_style['min-height']
                else:
                    self.h_rule, self.h = 'exact', tag_style['height']
                self.h = int(20 * self.h)
            self.h_space = int(20 * max(tag_style['margin-right'], tag_style['margin-left']))
            self.v_space = int(20 * max(tag_style['margin-top'], tag_style['margin-bottom']))

        read_css_block_borders(self, tag_style)

    def serialize(self, block, parent):
        '''
        Add the frame, indent, spacing and border elements for ``block`` as
        children of ``parent``. ``block`` is compared by identity against the
        first and last entries of ``self.blocks``.
        '''
        if self.is_dropcaps:
            attrs = dict(w_dropCap='drop', w_lines=str(self.dropcaps_lines), w_wrap='around', w_vAnchor='text', w_hAnchor='text')
        else:
            attrs = dict(
                w_wrap='around', w_vAnchor='text', w_hAnchor='text', w_xAlign=self.x_align, w_y='1',
                w_hSpace=str(self.h_space), w_vSpace=str(self.v_space), w_hRule=self.h_rule
            )
            if self.w is not None:
                attrs['w_w'] = str(self.w)
            if self.h is not None:
                attrs['w_h'] = str(self.h)
        self.makeelement(parent, 'w:framePr', **attrs)
        # Margins are already applied by the frame style, so override them to
        # be zero on individual blocks
        self.makeelement(parent, 'w:ind', w_left='0', w_leftChars='0', w_right='0', w_rightChars='0')
        attrs = {}
        if block is self.blocks[0]:
            attrs.update(dict(w_before='0', w_beforeLines='0'))
        if block is self.blocks[-1]:
            attrs.update(dict(w_after='0', w_afterLines='0'))
        if attrs:
            self.makeelement(parent, 'w:spacing', **attrs)
        # Similarly apply the same border and padding properties to all blocks
        # in this floatspec
        bdr = self.makeelement(parent, 'w:pBdr')
        for edge in border_edges:
            padding = getattr(self, 'padding_' + edge)
            width = getattr(self, 'border_%s_width' % edge)
            bstyle = getattr(self, 'border_%s_style' % edge)
            self.makeelement(
                bdr, 'w:'+edge, w_space=str(padding), w_val=bstyle, w_sz=str(width), w_color=getattr(self, 'border_%s_color' % edge))


class DOCXStyle:

    '''
    Base class for styles that are compared and hashed by value. Subclasses
    list the attributes that define the style in ALL_PROPS and must set all
    of them before calling DOCXStyle.__init__(), which computes the hash.
    '''

    ALL_PROPS = ()
    TYPE = 'paragraph'

    def __init__(self, namespace):
        self.namespace = namespace
        # Qualify a local name with the w: namespace
        self.w = lambda x: '{%s}%s' % (namespace.namespaces['w'], x)
        self.id = self.name = None
        self.next_style = None
        self.calculate_hash()

    def calculate_hash(self):
        ''' Cache the hash of the current values of the ALL_PROPS attributes. '''
        self._hash = hash(tuple(
            getattr(self, x) for x in self.ALL_PROPS))

    def makeelement(self, parent, name, **attrs):
        '''
        Create (without attaching it to ``parent``) an element called
        ``name`` with attributes ``attrs``, all in the w: namespace.
        '''
        return parent.makeelement(self.w(name), **{self.w(k):v for k, v in iteritems(attrs)})

    def __hash__(self):
        return self._hash

    def __eq__(self, other):
        for x in self.ALL_PROPS:
            if getattr(self, x) != getattr(other, x, None):
                return False
        return True

    def __ne__(self, other):
        return not self == other

    def __repr__(self):
        root = etree.Element(self.__class__.__name__, nsmap={'w':self.namespace.namespaces['w']})
        # serialize() requires a reference style, so the style is serialized
        # against itself. encoding='unicode' makes tostring() return str, as
        # __repr__ must, instead of bytes.
        # NOTE(review): presumably this still fails while id/name are None,
        # i.e. before StylesManager.finalize() has run -- confirm
        return etree.tostring(self.serialize(root, self), pretty_print=True, encoding='unicode')
    __str__ = __repr__

    def serialize(self, styles, normal_style):
        '''
        Append a style element carrying this style's id, type and name to
        ``styles`` and return it. A basedOn child referring to
        ``normal_style`` is added unless this style is ``normal_style``.
        '''
        makeelement = self.makeelement
        style = makeelement(styles, 'style', styleId=self.id, type=self.TYPE)
        style.append(makeelement(style, 'name', val=self.name))
        if self is not normal_style:
            style.append(makeelement(style, 'basedOn', val=normal_style.id))
        styles.append(style)
        return style


# Map of CSS border-style values to the values written to DOCX
LINE_STYLES = {
    'none'  : 'none',
    'hidden': 'none',
    'dotted': 'dotted',
    'dashed': 'dashed',
    'solid' : 'single',
    'double': 'double',
    'groove': 'threeDEngrave',
    'ridge' : 'threeDEmboss',
    'inset' : 'inset',
    'outset': 'outset',
}


class TextStyle(DOCXStyle):

    '''
    Character level style computed from the resolved CSS of a run of text.
    When ``is_parent_style`` is True the background color, padding and
    border are not read from ``css``.
    '''

    ALL_PROPS = ('font_family', 'font_size', 'bold', 'italic', 'color',
                 'background_color', 'underline', 'strike', 'dstrike', 'caps',
                 'shadow', 'small_caps', 'spacing', 'vertical_align', 'padding',
                 'border_style', 'border_width', 'border_color')
    TYPE = 'character'

    def __init__(self, namespace, css, is_parent_style=False):
        self.font_family = css_font_family_to_docx(css['font-family'])
        try:
            self.font_size = max(0, int(float(css['font-size']) * 2))  # stylizer normalizes all font sizes into pts
        except (ValueError, TypeError, AttributeError):
            self.font_size = None

        fw = css['font-weight']
        self.bold = (fw.lower() if hasattr(fw, 'lower') else fw) in {'bold', 'bolder'} or int_or_zero(fw) >= 700
        self.italic = css['font-style'].lower() in {'italic', 'oblique'}
        self.color = convert_color(css['color'])
        self.background_color = None if is_parent_style else convert_color(css.backgroundColor)
        td = set((css.effective_text_decoration or '').split())
        self.underline = 'underline' in td
        # line-through together with overline is rendered as a double strike
        self.dstrike = 'line-through' in td and 'overline' in td
        self.strike = not self.dstrike and 'line-through' in td
        self.text_transform = css['text-transform']  # TODO: If lowercase or capitalize, transform the actual text
        self.caps = self.text_transform == 'uppercase'
        self.small_caps = css['font-variant'].lower() in {'small-caps', 'smallcaps'}
        self.shadow = css['text-shadow'] not in {'none', None}
        try:
            self.spacing = int(float(css['letter-spacing']) * 20)
        except (ValueError, TypeError, AttributeError):
            self.spacing = None
        va = css.first_vertical_align
        if isinstance(va, numbers.Number):
            # A numeric vertical-align is stored as a string and later
            # written as a position rather than a vertAlign
            self.vertical_align = str(int(va * 2))
        else:
            val = {
                'top':'superscript', 'text-top':'superscript', 'sup':'superscript', 'super':'superscript',
                'bottom':'subscript', 'text-bottom':'subscript', 'sub':'subscript'}.get(va)
            self.vertical_align = val or 'baseline'

        self.padding = self.border_color = self.border_width = self.border_style = None
        if not is_parent_style:
            # DOCX does not support individual borders/padding for inline content
            # so a property is only used if it is the same on all four edges,
            # otherwise it is marked with the ignore sentinel
            for edge in border_edges:
                # In DOCX padding can only be a positive integer
                try:
                    padding = max(0, int(css['padding-' + edge]))
                except ValueError:
                    padding = 0
                if self.padding is None:
                    self.padding = padding
                elif self.padding != padding:
                    self.padding = ignore
                val = css['border-%s-width' % edge]
                if not isinstance(val, numbers.Number):
                    val = {'thin':0.2, 'medium':1, 'thick':2}.get(val, 0)
                val = min(96, max(2, int(val * 8)))
                if self.border_width is None:
                    self.border_width = val
                elif self.border_width != val:
                    self.border_width = ignore
                color = convert_color(css['border-%s-color' % edge])
                if self.border_color is None:
                    self.border_color = color
                elif self.border_color != color:
                    self.border_color = ignore
                style = LINE_STYLES.get(css['border-%s-style' % edge].lower(), 'none')
                if self.border_style is None:
                    self.border_style = style
                elif self.border_style != style:
                    self.border_style = ignore

        # Replace unset or inconsistent values with the defaults
        if self.padding in (None, ignore):
            self.padding = 0
        if self.border_width in (None, ignore):
            self.border_width = 0
        if self.border_style in (None, ignore):
            self.border_style = 'none'
        if self.border_color in (None, ignore):
            self.border_color = 'auto'
        if self.border_style == 'none':
            self.border_width, self.border_color = 0, 'auto'

        DOCXStyle.__init__(self, namespace)

    def serialize_borders(self, bdr, normal_style):
        '''
        Set on the element ``bdr`` the border attributes that differ from
        ``normal_style`` (all of them if this is ``normal_style``) and return
        ``bdr``.
        '''
        w = self.w
        is_normal_style = self is normal_style
        if is_normal_style or self.padding != normal_style.padding:
            bdr.set(w('space'), str(self.padding))
        if is_normal_style or self.border_width != normal_style.border_width:
            bdr.set(w('sz'), str(self.border_width))
        if is_normal_style or self.border_style != normal_style.border_style:
            bdr.set(w('val'), self.border_style)
        if is_normal_style or self.border_color != normal_style.border_color:
            bdr.set(w('color'), self.border_color)
        return bdr

    def serialize(self, styles, normal_style):
        '''
        Append the character style element for this style to ``styles`` and
        return it. The rPr child is only added if it has any content.
        '''
        makeelement = self.makeelement
        style_root = DOCXStyle.serialize(self, styles, normal_style)
        style = makeelement(style_root, 'rPr')
        self.serialize_properties(style, normal_style)
        if len(style) > 0:
            style_root.append(style)
        return style_root

    def serialize_properties(self, rPr, normal_style):
        '''
        Append to ``rPr`` one child element per property that differs from
        ``normal_style``. If this style is ``normal_style`` all properties
        are written.
        '''
        makeelement = self.makeelement
        is_normal_style = self is normal_style
        if is_normal_style or self.font_family != normal_style.font_family:
            rPr.append(makeelement(
                rPr, 'rFonts', **{k:self.font_family for k in 'ascii cs eastAsia hAnsi'.split()}))

        for name, attr, vmap in (('sz', 'font_size', str), ('b', 'bold', bmap), ('i', 'italic', bmap)):
            val = getattr(self, attr)
            if is_normal_style or getattr(normal_style, attr) != val:
                # Each of these is written twice, as <name> and <name>Cs
                for suffix in ('', 'Cs'):
                    rPr.append(makeelement(rPr, name + suffix, val=vmap(val)))

        def check_attr(attr):
            val = getattr(self, attr)
            return is_normal_style or (val != getattr(normal_style, attr))

        if check_attr('color'):
            rPr.append(makeelement(rPr, 'color', val=self.color or 'auto'))
        if check_attr('background_color'):
            rPr.append(makeelement(rPr, 'shd', fill=self.background_color or 'auto'))
        if check_attr('underline'):
            rPr.append(makeelement(rPr, 'u', val='single' if self.underline else 'none'))
        if check_attr('dstrike'):
            rPr.append(makeelement(rPr, 'dstrike', val=bmap(self.dstrike)))
        if check_attr('strike'):
            rPr.append(makeelement(rPr, 'strike', val=bmap(self.strike)))
        if check_attr('caps'):
            rPr.append(makeelement(rPr, 'caps', val=bmap(self.caps)))
        if check_attr('small_caps'):
            rPr.append(makeelement(rPr, 'smallCaps', val=bmap(self.small_caps)))
        if check_attr('shadow'):
            rPr.append(makeelement(rPr, 'shadow', val=bmap(self.shadow)))
        if check_attr('spacing'):
            rPr.append(makeelement(rPr, 'spacing', val=str(self.spacing or 0)))
        if is_normal_style:
            rPr.append(makeelement(rPr, 'vertAlign', val=self.vertical_align if self.vertical_align in {'superscript', 'subscript'} else 'baseline'))
        elif self.vertical_align != normal_style.vertical_align:
            if self.vertical_align in {'superscript', 'subscript', 'baseline'}:
                rPr.append(makeelement(rPr, 'vertAlign', val=self.vertical_align))
            else:
                # Numeric vertical alignment, see __init__
                rPr.append(makeelement(rPr, 'position', val=self.vertical_align))

        bdr = self.serialize_borders(makeelement(rPr, 'bdr'), normal_style)
        if bdr.attrib:
            rPr.append(bdr)


class DescendantTextStyle:

    '''
    Character style holding only the properties of ``child_style`` that
    differ from ``parent_style``. The differences are stored in
    ``self.properties`` as a tuple of (element name, frozenset of attribute
    items) pairs, which is also what equality and hashing are based on.
    '''

    def __init__(self, parent_style, child_style):
        self.id = self.name = None
        self.makeelement = child_style.makeelement

        p = []

        def add(name, **props):
            p.append((name, frozenset(iteritems(props))))

        def vals(attr):
            return getattr(parent_style, attr), getattr(child_style, attr)

        def check(attr):
            pval, cval = vals(attr)
            return pval != cval

        if parent_style.font_family != child_style.font_family:
            add('rFonts', **{k:child_style.font_family for k in 'ascii cs eastAsia hAnsi'.split()})

        for name, attr in (('sz', 'font_size'), ('b', 'bold'), ('i', 'italic')):
            pval, cval = vals(attr)
            if pval != cval:
                val = 'on' if attr in {'bold', 'italic'} else str(cval)  # bold, italic are toggle properties
                for suffix in ('', 'Cs'):
                    add(name + suffix, val=val)

        if check('color'):
            add('color', val=child_style.color or 'auto')
        if check('background_color'):
            add('shd', fill=child_style.background_color or 'auto')
        if check('underline'):
            add('u', val='single' if child_style.underline else 'none')
        if check('dstrike'):
            add('dstrike', val=bmap(child_style.dstrike))
        if check('strike'):
            add('strike', val='on')  # toggle property
        if check('caps'):
            add('caps', val='on')  # toggle property
        if check('small_caps'):
            add('smallCaps', val='on')  # toggle property
        if check('shadow'):
            add('shadow', val='on')  # toggle property
        if check('spacing'):
            add('spacing', val=str(child_style.spacing or 0))
        if check('vertical_align'):
            val = child_style.vertical_align
            if val in {'superscript', 'subscript', 'baseline'}:
                add('vertAlign', val=val)
            else:
                add('position', val=val)

        bdr = {}
        if check('padding'):
            bdr['space'] = str(child_style.padding)
        if check('border_width'):
            bdr['sz'] = str(child_style.border_width)
        if check('border_style'):
            bdr['val'] = child_style.border_style
        if check('border_color'):
            bdr['color'] = child_style.border_color
        if bdr:
            add('bdr', **bdr)
        self.properties = tuple(p)
        self._hash = hash(self.properties)

    def __hash__(self):
        return self._hash

    def __eq__(self, other):
        return self.properties == other.properties

    def __ne__(self, other):
        return self.properties != other.properties

    def serialize(self, styles):
        '''
        Append the character style element for this style, with one rPr child
        per stored property, to ``styles`` and return it.
        '''
        makeelement = self.makeelement
        style = makeelement(styles, 'style', styleId=self.id, type='character')
        style.append(makeelement(style, 'name', val=self.name))
        rpr = makeelement(style, 'rPr')
        style.append(rpr)
        for name, attrs in self.properties:
            rpr.append(makeelement(style, name, **dict(attrs)))
        styles.append(style)
        return style


def read_css_block_borders(self, css, store_css_style=False):
    '''
    Set the per-edge padding_*, margin_*, css_margin_* and border_*_width,
    border_*_color and border_*_style attributes on ``self`` from ``css``.
    If ``css`` is None defaults are used. If ``store_css_style`` is True the
    lower-cased CSS border style is also stored as border_*_css_style.
    '''
    for edge in border_edges:
        if css is None:
            setattr(self, 'padding_' + edge, 0)
            setattr(self, 'margin_' + edge, 0)
            setattr(self, 'css_margin_' + edge, '')
            setattr(self, 'border_%s_width' % edge, 2)
            setattr(self, 'border_%s_color' % edge, None)
            setattr(self, 'border_%s_style' % edge, 'none')
            if store_css_style:
                setattr(self, 'border_%s_css_style' % edge, 'none')
        else:
            # In DOCX padding can only be a positive integer
            try:
                setattr(self, 'padding_' + edge, max(0, int(css['padding-' + edge])))
            except ValueError:
                setattr(self, 'padding_' + edge, 0)  # invalid value for padding
            # In DOCX margin must be a positive integer in twips (twentieth of a point)
            try:
                setattr(self, 'margin_' + edge, max(0, int(css['margin-' + edge] * 20)))
            except ValueError:
                setattr(self, 'margin_' + edge, 0)  # e.g.: margin: auto
            setattr(self, 'css_margin_' + edge, css._style.get('margin-' + edge, ''))
            val = css['border-%s-width' % edge]
            if not isinstance(val, numbers.Number):
                val = {'thin':0.2, 'medium':1, 'thick':2}.get(val, 0)
            val = min(96, max(2, int(val * 8)))
            setattr(self, 'border_%s_width' % edge, val)
            setattr(self, 'border_%s_color' % edge, convert_color(css['border-%s-color' % edge]) or 'auto')
            setattr(self, 'border_%s_style' % edge, LINE_STYLES.get(css['border-%s-style' % edge].lower(), 'none'))
            if store_css_style:
                setattr(self, 'border_%s_css_style' % edge, css['border-%s-style' % edge].lower())


class BlockStyle(DOCXStyle):

    '''
    Paragraph level style computed from the resolved CSS of a block. ``css``
    may be None, in which case defaults are used. For table cells
    (``is_table_cell``) the borders, padding, margins and background color of
    the block are all cleared. ``parent_bg`` is used as the background color
    when the block has none of its own.
    '''

    ALL_PROPS = tuple(
        'text_align css_text_indent text_indent line_height background_color'.split(
        ) + ['margin_' + edge for edge in border_edges
        ] + ['css_margin_' + edge for edge in border_edges
        ] + [x%edge for edge in border_edges for x in border_props]
    )

    def __init__(self, namespace, css, html_block, is_table_cell=False, parent_bg=None):
        read_css_block_borders(self, css)
        if is_table_cell:
            for edge in border_edges:
                setattr(self, 'border_%s_style' % edge, 'none')
                setattr(self, 'border_%s_width' % edge, 0)
                setattr(self, 'padding_' + edge, 0)
                setattr(self, 'margin_' + edge, 0)
        if css is None:
            self.text_indent = 0
            self.css_text_indent = None
            self.line_height = 280
            self.background_color = None
            self.text_align = 'left'
        else:
            try:
                self.text_indent = int(css['text-indent'] * 20)
                self.css_text_indent = css._get('text-indent')
            except (TypeError, ValueError):
                self.text_indent = 0
                self.css_text_indent = None
            try:
                self.line_height = max(0, int(css.lineHeight * 20))
            except (TypeError, ValueError):
                # Fall back to 1.2 times the font size
                self.line_height = max(0, int(1.2 * css.fontSize * 20))
            self.background_color = None if is_table_cell else convert_color(css['background-color'])
            if not is_table_cell and self.background_color is None:
                self.background_color = parent_bg
            try:
                ws = css['white-space'].lower()
                preserve_whitespace = ws in {'pre', 'pre-wrap'}
            except Exception:
                preserve_whitespace = False
            try:
                aval = css['text-align'].lower()
                if preserve_whitespace:
                    # Blocks with preserved whitespace are always start aligned
                    aval = 'start'
                self.text_align = {'start':'left', 'left':'left', 'end':'right', 'right':'right', 'center':'center', 'justify':'both', 'centre':'center'}.get(
                    aval, 'left')
            except AttributeError:
                self.text_align = 'left'

        DOCXStyle.__init__(self, namespace)

    def serialize_borders(self, bdr, normal_style):
        '''
        Append to ``bdr`` one element per edge whose padding or border
        differs from ``normal_style`` (or, if this is ``normal_style``, is
        non-zero) and return ``bdr``.
        '''
        w = self.w
        for edge in border_edges:
            e = bdr.makeelement(w(edge))
            padding = getattr(self, 'padding_' + edge)
            if (self is normal_style and padding > 0) or (padding != getattr(normal_style, 'padding_' + edge)):
                e.set(w('space'), str(padding))
            width = getattr(self, 'border_%s_width' % edge)
            bstyle = getattr(self, 'border_%s_style' % edge)
            if (self is normal_style and width > 0 and bstyle != 'none'
                    ) or width != getattr(normal_style, 'border_%s_width' % edge
                    ) or bstyle != getattr(normal_style, 'border_%s_style' % edge):
                e.set(w('val'), bstyle)
                e.set(w('sz'), str(width))
                # The color is None for styles created without CSS, fall back
                # to the same default that is used for CSS without a color
                e.set(w('color'), getattr(self, 'border_%s_color' % edge) or 'auto')
            if e.attrib:
                bdr.append(e)
        return bdr

    def serialize(self, styles, normal_style):
        '''
        Append the paragraph style element for this style to ``styles`` and
        return it. The pPr child is only added if it has any content.
        '''
        makeelement = self.makeelement
        style_root = DOCXStyle.serialize(self, styles, normal_style)
        style = makeelement(style_root, 'pPr')
        self.serialize_properties(style, normal_style)
        if len(style) > 0:
            style_root.append(style)
        return style_root

    def serialize_properties(self, pPr, normal_style):
        '''
        Append to ``pPr`` the spacing, indent, shading, border, justification
        and next-style elements for the properties that differ from
        ``normal_style``. Margins and text indents specified in em/ex units
        are written using the Lines/Chars variants of the attributes.
        '''
        makeelement, w = self.makeelement, self.w
        spacing = makeelement(pPr, 'spacing')
        for edge, attr in iteritems({'top':'before', 'bottom':'after'}):
            getter = attrgetter('css_margin_' + edge)
            css_val, css_unit = parse_css_length(getter(self))
            if css_unit in ('em', 'ex'):
                lines = max(0, int(css_val * (50 if css_unit == 'ex' else 100)))
                if (self is normal_style and lines > 0) or getter(self) != getter(normal_style):
                    spacing.set(w(attr + 'Lines'), str(lines))
            else:
                getter = attrgetter('margin_' + edge)
                val = getter(self)
                if (self is normal_style and val > 0) or val != getter(normal_style):
                    spacing.set(w(attr), str(val))

        if self is normal_style or self.line_height != normal_style.line_height:
            spacing.set(w('line'), str(self.line_height))
            spacing.set(w('lineRule'), 'atLeast')

        if spacing.attrib:
            pPr.append(spacing)

        ind = makeelement(pPr, 'ind')
        for edge in ('left', 'right'):
            getter = attrgetter('css_margin_' + edge)
            css_val, css_unit = parse_css_length(getter(self))
            if css_unit in ('em', 'ex'):
                chars = max(0, int(css_val * (50 if css_unit == 'ex' else 100)))
                if (self is normal_style and chars > 0) or getter(self) != getter(normal_style):
                    ind.set(w(edge + 'Chars'), str(chars))
            else:
                getter = attrgetter('margin_' + edge)
                val = getter(self)
                if (self is normal_style and val > 0) or val != getter(normal_style):
                    ind.set(w(edge), str(val))
                    ind.set(w(edge + 'Chars'), '0')  # This is needed to override any declaration in the parent style
        css_val, css_unit = parse_css_length(self.css_text_indent)
        if css_unit in ('em', 'ex'):
            chars = int(css_val * (50 if css_unit == 'ex' else 100))
            # A negative text indent is written as a hanging indent
            if css_val >= 0:
                if (self is normal_style and chars > 0) or self.css_text_indent != normal_style.css_text_indent:
                    ind.set(w('firstLineChars'), str(chars))
            else:
                if (self is normal_style and chars < 0) or self.css_text_indent != normal_style.css_text_indent:
                    ind.set(w('hangingChars'), str(abs(chars)))
        else:
            val = self.text_indent
            if val >= 0:
                if (self is normal_style and val > 0) or self.text_indent != normal_style.text_indent:
                    ind.set(w('firstLine'), str(val))
                    ind.set(w('firstLineChars'), '0')  # This is needed to override any declaration in the parent style
            else:
                if (self is normal_style and val < 0) or self.text_indent != normal_style.text_indent:
                    ind.set(w('hanging'), str(abs(val)))
                    ind.set(w('hangingChars'), '0')
        if ind.attrib:
            pPr.append(ind)

        if (self is normal_style and self.background_color) or self.background_color != normal_style.background_color:
            pPr.append(makeelement(pPr, 'shd', val='clear', color='auto', fill=self.background_color or 'auto'))

        pbdr = self.serialize_borders(pPr.makeelement(w('pBdr')), normal_style)
        if len(pbdr):
            pPr.append(pbdr)

        if self is normal_style or self.text_align != normal_style.text_align:
            pPr.append(makeelement(pPr, 'jc', val=self.text_align))

        if self is not normal_style and self.next_style is not None:
            pPr.append(makeelement(pPr, 'next', val=self.next_style))


class StylesManager:

    '''
    Creates de-duplicated block and text styles while the document is being
    converted, then (in finalize()) names them and combines them into the
    paragraph and character styles that serialize() writes out.
    '''

    def __init__(self, namespace, log, document_lang):
        self.namespace = namespace
        self.document_lang = lang_as_iso639_1(document_lang) or 'en'
        self.log = log
        # Styles are stored as style -> style so that an equal, previously
        # created instance can be looked up and re-used
        self.block_styles, self.text_styles = {}, {}
        self.styles_for_html_blocks = {}

    def create_text_style(self, css_style, is_parent_style=False):
        '''
        Return the TextStyle for ``css_style``, re-using the existing
        instance if an equal style has already been created.
        '''
        ans = TextStyle(self.namespace, css_style, is_parent_style=is_parent_style)
        existing = self.text_styles.get(ans, None)
        if existing is None:
            self.text_styles[ans] = ans
        else:
            ans = existing
        return ans

    def create_block_style(self, css_style, html_block, is_table_cell=False, parent_bg=None):
        '''
        Return the BlockStyle for ``css_style``, re-using the existing
        instance if an equal style has already been created. The style is
        also recorded as the style for ``html_block``.
        '''
        ans = BlockStyle(self.namespace, css_style, html_block, is_table_cell=is_table_cell, parent_bg=parent_bg)
        existing = self.block_styles.get(ans, None)
        if existing is None:
            self.block_styles[ans] = ans
        else:
            ans = existing
        self.styles_for_html_blocks[html_block] = ans
        return ans

    def finalize(self, all_blocks):
        '''
        Assign ids, names and sequence numbers to all styles used by
        ``all_blocks``.

        Each block that has runs is paired with its most heavily weighted run
        style to form a CombinedStyle. The most used combined style becomes
        the 'Normal' style and one combined style per heading tag becomes a
        'Heading N' style. Blocks without runs contribute "pure" block
        styles. Finally, a DescendantTextStyle is created for every run whose
        style differs from the text style of its block's combined style, and
        ``self.primary_heading_style`` is chosen.
        '''
        block_counts, run_counts = Counter(), Counter()
        block_rmap, run_rmap = defaultdict(list), defaultdict(list)
        used_pairs = defaultdict(list)
        heading_styles = defaultdict(list)
        headings = frozenset('h1 h2 h3 h4 h5 h6'.split())
        pure_block_styles = set()

        for block in all_blocks:
            bs = block.style
            block_counts[bs] += 1
            block_rmap[block.style].append(block)
            local_run_counts = Counter()
            for run in block.runs:
                count = run.style_weight
                run_counts[run.style] += count
                local_run_counts[run.style] += count
                run_rmap[run.style].append(run)
            if local_run_counts:
                # The dominant run style of the block
                rs = local_run_counts.most_common(1)[0][0]
                used_pairs[(bs, rs)].append(block)
                if block.html_tag in headings:
                    heading_styles[block.html_tag].append((bs, rs))
            else:
                pure_block_styles.add(bs)

        self.pure_block_styles = sorted(pure_block_styles, key=block_counts.__getitem__)
        # Number of digits needed to zero pad the style numbers
        bnum = len(str(max(1, len(pure_block_styles) - 1)))
        for i, bs in enumerate(self.pure_block_styles):
            bs.id = bs.name = '%0{}d Block'.format(bnum) % i
            bs.seq = i
            if i == 0:
                self.normal_pure_block_style = bs

        counts = Counter()
        smap = {}
        for (bs, rs), blocks in iteritems(used_pairs):
            s = CombinedStyle(bs, rs, blocks, self.namespace)
            smap[(bs, rs)] = s
            counts[s] += sum(1 for b in blocks if not b.is_empty())
        for i, heading_tag in enumerate(sorted(heading_styles)):
            styles = sorted((smap[k] for k in heading_styles[heading_tag]), key=counts.__getitem__)
            # Skip styles already claimed by an earlier heading tag
            styles = list(filter(lambda s:s.outline_level is None, styles))
            if styles:
                heading_style = styles[-1]
                heading_style.outline_level = i

        snum = len(str(max(1, len(counts) - 1)))
        heading_styles = []
        for i, (style, count) in enumerate(counts.most_common()):
            if i == 0:
                self.normal_style = style
                style.id = style.name = 'Normal'
            else:
                if style.outline_level is None:
                    val = 'Para %0{}d'.format(snum) % i
                else:
                    val = 'Heading %d' % (style.outline_level + 1)
                    heading_styles.append(style)
                style.id = style.name = val
            style.seq = i
        self.combined_styles = sorted(counts, key=attrgetter('seq'))
        for ls in self.combined_styles:
            ls.apply()

        descendant_style_map = {}
        ds_counts = Counter()
        for block in all_blocks:
            for run in block.runs:
                if run.parent_style is not run.style and run.parent_style and run.style:
                    ds = DescendantTextStyle(run.parent_style, run.style)
                    if ds.properties:
                        run.descendant_style = descendant_style_map.get(ds)
                        if run.descendant_style is None:
                            run.descendant_style = descendant_style_map[ds] = ds
                        ds_counts[run.descendant_style] += run.style_weight
        rnum = len(str(max(1, len(ds_counts) - 1)))
        for i, (text_style, count) in enumerate(ds_counts.most_common()):
            text_style.id = 'Text%d' % i
            text_style.name = '%0{}d Text'.format(rnum) % i
            text_style.seq = i
        self.descendant_text_styles = sorted(descendant_style_map, key=attrgetter('seq'))

        self.log.debug('%d Text Styles %d Combined styles' % tuple(map(len, (
            self.descendant_text_styles, self.combined_styles))))

        self.primary_heading_style = None
        if heading_styles:
            heading_styles.sort(key=attrgetter('outline_level'))
            self.primary_heading_style = heading_styles[0]
        else:
            # No headings, use the combined style with the largest font
            ms = 0
            for s in self.combined_styles:
                # font_size is None when the CSS font size could not be
                # parsed, treat that as zero instead of comparing None to int
                font_size = s.rs.font_size or 0
                if font_size > ms:
                    self.primary_heading_style = s
                    ms = font_size

    def serialize(self, styles):
        '''
        Set every attribute of the first lang element under ``styles`` to
        the document language, then serialize all combined, descendant text
        and pure block styles into ``styles``.
        '''
        lang = styles.xpath('descendant::*[local-name()="lang"]')[0]
        for k in tuple(lang.attrib):
            lang.attrib[k] = self.document_lang
        for style in self.combined_styles:
            style.serialize(styles, self.normal_style)
        for style in self.descendant_text_styles:
            style.serialize(styles)
        for style in sorted(self.pure_block_styles, key=attrgetter('seq')):
            style.serialize(styles, self.normal_pure_block_style)