#!/usr/local/bin/python3.8
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab


# this program works in concert with the output from KindleUnpack

'''
Convert from Mobi ML to XHTML
'''

from __future__ import division, absolute_import, print_function

import os
import sys
import re

# Maps the name of a "special" markup construct to the tag type reported by
# MobiMLConverter.parsetag() and the (negative) slice end used to strip the
# construct's closing delimiter ('>' or '-->') from the raw tag text.
SPECIAL_HANDLING_TAGS = {
    '?xml'     : ('xmlheader', -1),
    '!--'      : ('comment', -3),
    '!DOCTYPE' : ('doctype', -1),
}

SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment']

# Tags that are always written out in self-closing form; any explicit end tag
# for them is discarded by MobiMLConverter.processml().
SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference']


class MobiMLConverter(object):
    """Convert a Mobi markup file into XHTML plus a generated stylesheet.

    The markup is read once in __init__ and then consumed by processml(),
    which walks it with parseml()/parsetag(), rewrites each tag with
    processtag() and collects the CSS rules created along the way.
    """

    PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
    IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')

    # Named font sizes mapped to the numeric 1..7 scale.
    _SIZE_MAP = {
        'xx-small': '1',
        'x-small': '2',
        'small': '3',
        'medium': '4',
        'large': '5',
        'x-large': '6',
        'xx-large': '7',
    }

    # Numeric 1..7 font sizes mapped to the em value written to the CSS.
    _SIZE_TO_EM_MAP = {
        '1': '.65em',
        '2': '.75em',
        '3': '1em',
        '4': '1.125em',
        '5': '1.25em',
        '6': '1.5em',
        '7': '2em',
    }

    def __init__(self, filename):
        """Read the markup in *filename* and initialise the conversion state.

        Raises whatever open()/read() raise when the file cannot be read.
        """
        self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n'
        self.base_css_rules += 'p { margin: 0em }\n'
        self.base_css_rules += '.bold { font-weight: bold }\n'
        self.base_css_rules += '.italic { font-style: italic }\n'
        self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n'
        # generated class name -> css rule text
        self.tag_css_rules = {}
        self.tag_css_rule_cnt = 0
        # stack of currently open (non-font) tag names
        self.path = []
        self.filename = filename
        # use a context manager so the input handle is closed deterministically
        with open(self.filename, 'r') as mlfile:
            self.wipml = mlfile.read()
        # current parse offset into self.wipml
        self.pos = 0
        self.opfname = self.filename.rsplit('.',1)[0] + '.opf'
        self.opos = 0
        # extra markup injected into <head> by processml()
        self.meta = ''
        self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css')
        self.current_font_size = 3
        # stack of open font start tags as (ttype, tname, tattr) tuples
        self.font_history = []

    def cleanup_html(self):
        """Normalise self.wipml in place before it is parsed."""
        self.wipml = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.wipml)
        self.wipml = self.wipml.replace('\r\n', '\n')
        self.wipml = self.wipml.replace('> <', '>\n<')
        self.wipml = self.wipml.replace('<mbp: ', '<mbp:')
        self.wipml = self.wipml.replace('<br></br>','<br/>')

    def replace_page_breaks(self):
        """Replace each run of mbp:pagebreak tags in self.wipml with one div."""
        self.wipml = self.PAGE_BREAK_PAT.sub(
            '<div class="mbp_pagebreak" />',
            self.wipml)

    # parse leading text of ml and tag
    def parseml(self):
        """Return the next piece of markup starting at self.pos.

        Returns None once the input is exhausted, (text, None) for a run of
        plain text, or (None, tag) for a complete tag or comment.  self.pos
        is always advanced past whatever is returned.
        """
        p = self.pos
        ml = self.wipml
        if p >= len(ml):
            return None
        if ml[p] != '<':
            res = ml.find('<', p)
            if res == -1:
                res = len(ml)
            self.pos = res
            return ml[p:res], None
        # handle comment as a special case to deal with multi-line comments
        if ml[p:p+4] == '<!--':
            te = ml.find('-->', p+1)
            if te != -1:
                te = te + 2
        else:
            te = ml.find('>', p+1)
            ntb = ml.find('<', p+1)
            # a new '<' before the closing '>' (or with no closing '>' at
            # all) means this '<' does not start a real tag: emit it as text
            if ntb != -1 and (te == -1 or ntb < te):
                self.pos = ntb
                return ml[p:ntb], None
        if te == -1:
            # Unterminated tag or comment.  Hand back the remainder as text;
            # using te + 1 == 0 here would rewind the parser to the start of
            # the document and never terminate.
            self.pos = len(ml)
            return ml[p:], None
        self.pos = te + 1
        return None, ml[p:te+1]

    # parses string version of tag to identify its name,
    # its type 'begin', 'end' or 'single',
    # plus build a hashtable of its attributes
    # code is written to handle the possibility of very poor formatting
    def parsetag(self, s):
        """Split the raw tag string *s* into (ttype, tname, tattr).

        ttype is 'begin', 'end', 'single', 'single_ext' or one of
        SPECIAL_HANDLING_TYPES.  tname is lower-cased, except for '!DOCTYPE'.
        tattr maps lower-cased attribute names to their values; for special
        tags it holds the raw content under the key 'special'.
        """
        p = 1
        ttype = None
        tattr = {}
        while s[p:p+1] == ' ':
            p += 1
        if s[p:p+1] == '/':
            ttype = 'end'
            p += 1
            while s[p:p+1] == ' ':
                p += 1
        if s[p:p+3] == '!--':
            # a comment needs no whitespace after its opener, so it cannot be
            # recognised by scanning for a delimiter as ordinary names are
            p += 3
            tname = '!--'
        else:
            b = p
            # the empty string marks end of input and must stop the scan
            while s[p:p+1] not in ('', '>', '/', ' ', '"', "'", "\r", "\n"):
                p += 1
            tname = s[b:p].lower()
        if tname == '!doctype':
            tname = '!DOCTYPE'
        # special cases
        if tname in SPECIAL_HANDLING_TAGS:
            ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
            tattr['special'] = s[p:backstep]
        if ttype is None:
            # parse any attributes
            while s.find('=', p) != -1:
                while s[p:p+1] == ' ':
                    p += 1
                b = p
                # guaranteed to stop: an '=' exists at or after p
                while s[p:p+1] != '=':
                    p += 1
                aname = s[b:p].lower().rstrip(' ')
                p += 1
                while s[p:p+1] == ' ':
                    p += 1
                quote = s[p:p+1]
                if quote in ('"', "'"):
                    p += 1
                    b = p
                    # only the quote character that opened the value closes
                    # it, so the other kind may appear inside the value
                    while s[p:p+1] not in ('', quote):
                        p += 1
                    val = s[b:p]
                    if s[p:p+1] == '' and val.endswith('>'):
                        # unterminated quote: drop the tag's closing '>'
                        val = val[:-1]
                    p += 1
                else:
                    b = p
                    while s[p:p+1] not in ('', '>', '/', ' '):
                        p += 1
                    val = s[b:p]
                tattr[aname] = val
        # label beginning and single tags
        if ttype is None:
            ttype = 'begin'
            if s.find(' /', p) >= 0:
                ttype = 'single_ext'
            elif s.find('/', p) >= 0:
                ttype = 'single'
        return ttype, tname, tattr

    # main routine to convert from mobi markup language to html
    def processml(self):
        """Convert self.wipml and return (htmlstr, css, cssname).

        htmlstr is the XHTML document, css the text of the stylesheet (base
        rules plus every rule generated by processtag()), and cssname the
        path the stylesheet is expected to be written to.
        """
        html_done = False
        head_done = False
        body_done = False

        out = []
        self.replace_page_breaks()
        self.cleanup_html()

        start_types = ('begin', 'single', 'single_ext')

        # now parse the cleaned up ml into standard xhtml
        while True:

            r = self.parseml()
            if not r:
                break

            text, tag = r

            if text:
                out.append(text)

            if not tag:
                continue

            ttype, tname, tattr = self.parsetag(tag)

            # If we run into a DTD or xml declarations inside the body ... bail.
            # Comments are identified by their type (their name is '!--'), and
            # must not end the conversion.
            if tname in SPECIAL_HANDLING_TAGS and ttype != 'comment' and body_done:
                out.append('\n</body></html>')
                break

            # make sure self-closing tags actually self-close
            if ttype == 'begin' and tname in SELF_CLOSING_TAGS:
                ttype = 'single'

            # make sure any end tags of self-closing tags are discarded
            if ttype == 'end' and tname in SELF_CLOSING_TAGS:
                continue

            # remove embedded guide and references from old mobis
            if tname in ('guide', 'ncx', 'reference') and ttype in start_types:
                tname = 'removeme:{0}'.format(tname)
                tattr = None
            if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end':
                # self.path may be empty when the end tag has no start tag
                if self.path and self.path[-1] == 'removeme:{0}'.format(tname):
                    tname = 'removeme:{0}'.format(tname)
                    tattr = None

            # Get rid of font tags that only have a color attribute.
            if tname == 'font' and ttype in start_types:
                if 'color' in tattr and len(tattr) == 1:
                    tname = 'removeme:{0}'.format(tname)
                    tattr = None

            # Get rid of empty spans in the markup.
            if tname == 'span' and ttype in start_types and not len(tattr):
                tname = 'removeme:{0}'.format(tname)

            # need to handle fonts outside of the normal methods
            # so fonts tags won't be added to the self.path since we keep track
            # of font tags separately with self.font_history
            if tname == 'font' and ttype == 'begin':
                # check for nested font start tags
                if len(self.font_history) > 0:
                    # inject a font end tag
                    out.append(self.processtag(('end', 'font', None)))
                self.font_history.append((ttype, tname, tattr))
                # handle the current font start tag
                out.append(self.processtag((ttype, tname, tattr)))
                continue

            # check for nested font tags and unnest them
            if tname == 'font' and ttype == 'end':
                if not self.font_history:
                    # stray end tag with no open font: emitting it would
                    # unbalance the output, so drop it
                    continue
                self.font_history.pop()
                # handle this font end tag
                out.append(self.processtag(('end', 'font', None)))
                # check if we were nested
                if len(self.font_history) > 0:
                    # inject a copy of the most recent font start tag from history
                    out.append(self.processtag(self.font_history[-1]))
                continue

            # keep track of nesting path
            if ttype == 'begin':
                self.path.append(tname)
            elif ttype == 'end':
                if not self.path or tname != self.path[-1]:
                    print('improper nesting: ', self.path, tname, ttype)
                    if tname not in self.path:
                        # handle case of end tag with no beginning by injecting empty begin tag
                        out.append(self.processtag(('begin', tname, None)))
                        print(" - fixed by injecting empty start tag ", tname)
                        self.path.append(tname)
                    elif len(self.path) > 1 and tname == self.path[-2]:
                        # handle case of dangling missing end
                        out.append(self.processtag(('end', self.path[-1], None)))
                        print(" - fixed by injecting end tag ", self.path[-1])
                        self.path.pop()
                self.path.pop()

            # processtag() returns '' for every 'removeme:' tag, which is how
            # those tags are dropped; the text between them is kept.
            out.append(self.processtag((ttype, tname, tattr)))

            # handle potential issue of multiple html, head, and body sections
            if tname == 'html' and ttype == 'begin' and not html_done:
                out.append('\n')
                html_done = True

            if tname == 'head' and ttype == 'begin' and not head_done:
                out.append('\n')
                # also add in metadata and style link tags
                out.append(self.meta)
                out.append('<link href="styles.css" rel="stylesheet" type="text/css" />\n')
                head_done = True

            if tname == 'body' and ttype == 'begin' and not body_done:
                out.append('\n')
                body_done = True

        htmlstr = ''.join(out)

        # handle issue of possibly missing html, head, and body tags
        # I have not seen this but the original did something like this so ...
        if not body_done:
            htmlstr = '<body>\n' + htmlstr + '</body>\n'
        if not head_done:
            headstr = '<head>\n'
            headstr += self.meta
            headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
            headstr += '</head>\n'
            htmlstr = headstr + htmlstr
        if not html_done:
            htmlstr = '<html>\n' + htmlstr + '</html>\n'

        # finally add DOCTYPE info
        htmlstr = '<?xml version="1.0"?>\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n' + htmlstr

        css = self.base_css_rules
        for cls, rule in self.tag_css_rules.items():
            css += '.%s { %s }\n' % (cls, rule)

        return (htmlstr, css, self.cssname)

    def ensure_unit(self, raw, unit='px'):
        """Return *raw* with *unit* appended when it ends in a digit."""
        if re.search(r'\d+$', raw) is not None:
            raw += unit
        return raw

    # flatten possibly modified tag back to string
    def taginfo_tostring(self, taginfo):
        """Serialise a (ttype, tname, tattr) tuple back into tag text.

        Returns '' when the type or the name is None.  Attribute values are
        written as given, without any escaping.
        """
        (ttype, tname, tattr) = taginfo
        if ttype is None or tname is None:
            return ''
        if ttype == 'end':
            return '</%s>' % tname
        if ttype in SPECIAL_HANDLING_TYPES and tattr is not None and 'special' in tattr:
            info = tattr['special']
            if ttype == 'comment':
                return '<%s %s-->' % (tname, info)
            else:
                return '<%s %s>' % (tname, info)
        res = ['<%s' % tname]
        if tattr is not None:
            for key in tattr:
                res.append(' %s="%s"' % (key, tattr[key]))
        if ttype == 'single':
            res.append('/>')
        elif ttype == 'single_ext':
            res.append(' />')
        else:
            res.append('>')
        return "".join(res)

    # routines to convert from mobi ml tags attributes to xhtml attributes and styles
    def processtag(self, taginfo):
        """Rewrite one (ttype, tname, tattr) tag and return it as a string.

        Presentational attributes are moved into generated CSS classes that
        are recorded in self.tag_css_rules.  Tags whose name is None or
        starts with 'removeme' produce ''.

        A non-empty tattr is modified in place.  processml() depends on
        this: it passes a stored font start tag through here a second time,
        and by then its size/face have already been replaced by the
        generated class.
        """
        # current tag to work on
        (ttype, tname, tattr) = taginfo
        if not tattr:
            tattr = {}

        styles = []

        if tname is None or tname.startswith('removeme'):
            return ''

        # have not seen an example of this yet so keep it here to be safe
        # until this is better understood
        if tname in ('country-region', 'place', 'placetype', 'placename',
                     'state', 'city', 'street', 'address', 'content'):
            tname = 'div' if tname == 'content' else 'span'
            # drop every attribute; popping while iterating the dict raises
            tattr.clear()

        # handle general case of style, height, width, bgcolor in any tag
        if 'style' in tattr:
            style = tattr.pop('style').strip()
            if style:
                styles.append(style)

        if 'align' in tattr:
            align = tattr.pop('align').strip()
            if align:
                if tname in ('table', 'td', 'tr'):
                    pass
                else:
                    styles.append('text-align: %s' % align)

        if 'height' in tattr:
            height = tattr.pop('height').strip()
            if height and '<' not in height and '>' not in height and re.search(r'\d+', height):
                if tname in ('table', 'td', 'tr'):
                    pass
                elif tname == 'img':
                    tattr['height'] = height
                else:
                    styles.append('margin-top: %s' % self.ensure_unit(height))

        if 'width' in tattr:
            width = tattr.pop('width').strip()
            if width and re.search(r'\d+', width):
                if tname in ('table', 'td', 'tr'):
                    pass
                elif tname == 'img':
                    tattr['width'] = width
                else:
                    styles.append('text-indent: %s' % self.ensure_unit(width))
                    if width.startswith('-'):
                        styles.append('margin-left: %s' % self.ensure_unit(width[1:]))

        # no proprietary html allowed
        if 'bgcolor' in tattr and tname == 'div':
            del tattr['bgcolor']

        # This is deliberately not chained to the bgcolor test above: a font
        # start tag carrying bgcolor must still become a span, because its
        # end tag (which has no attributes) always does.
        if tname == 'font':
            # Change font tags to span tags
            tname = 'span'
            if ttype in ('begin', 'single', 'single_ext'):
                # move the face attribute to css font-family
                if 'face' in tattr:
                    face = tattr.pop('face').strip()
                    styles.append('font-family: "%s"' % face)

                # Monitor the constantly changing font sizes, change them to
                # ems and move them to css.
                if 'size' in tattr:
                    raw = tattr.pop('size').strip().lower()
                    level = None
                    if raw in self._SIZE_MAP:
                        level = int(self._SIZE_MAP[raw])
                    else:
                        try:
                            num = float(raw)
                        except ValueError:
                            # unrecognised size: emit no font-size at all
                            num = None
                        if num is not None:
                            if raw.startswith(('-', '+')):
                                # signed sizes are relative to the current one
                                num += self.current_font_size
                            # clamp to the 1..7 scale the em table covers
                            level = int(min(7, max(1, num)))
                    if level is not None:
                        styles.append('font-size: %s' % self._SIZE_TO_EM_MAP[str(level)])
                        self.current_font_size = level

        elif tname == 'img':
            for attr in ('width', 'height'):
                if attr in tattr:
                    val = tattr[attr]
                    if val.lower().endswith('em'):
                        try:
                            nval = float(val[:-2])
                            nval *= 16 * (168.451/72)  # Assume this was set using the Kindle profile
                            tattr[attr] = "%dpx" % int(nval)
                        except (ValueError, OverflowError):
                            # not a usable number: drop the attribute
                            del tattr[attr]
                    elif val.lower().endswith('%'):
                        del tattr[attr]

        # convert the anchor tags
        if 'filepos-id' in tattr:
            tattr['id'] = tattr.pop('filepos-id')
            if 'name' in tattr and tattr['name'] != tattr['id']:
                tattr['name'] = tattr['id']

        if 'filepos' in tattr:
            filepos = tattr.pop('filepos')
            try:
                tattr['href'] = "#filepos%d" % int(filepos)
            except ValueError:
                pass

        if styles:
            ncls = None
            rule = '; '.join(styles)
            # reuse the class of an identical rule when one already exists
            for sel, srule in self.tag_css_rules.items():
                if srule == rule:
                    ncls = sel
                    break
            if ncls is None:
                self.tag_css_rule_cnt += 1
                ncls = 'rule_%d' % self.tag_css_rule_cnt
                self.tag_css_rules[ncls] = rule
            cls = tattr.get('class', '')
            cls = cls + (' ' if cls else '') + ncls
            tattr['class'] = cls

        # convert updated tag back to string representation
        if len(tattr) == 0:
            tattr = None
        taginfo = (ttype, tname, tattr)
        return self.taginfo_tostring(taginfo)


# main only left in for testing outside of plugin

def main(argv=sys.argv):
    """Convert the Mobi ML file named in argv[1].

    Writes '<input stem>_converted.html' and the stylesheet next to the input
    file.  Returns 0 on success, or 1 when the argument count is wrong or a
    ValueError is raised during the conversion.
    """
    if len(argv) != 2:
        print('Usage: %s <mobiml file>' % (argv[0] if argv else 'mobiml2xhtml'), file=sys.stderr)
        return 1

    infile = argv[1]

    try:
        print('Converting Mobi Markup Language to XHTML')
        mlc = MobiMLConverter(infile)
        print('Processing ...')
        htmlstr, css, cssname = mlc.processml()
        outname = infile.rsplit('.',1)[0] + '_converted.html'
        # context managers make sure both outputs are flushed and closed
        with open(outname, 'w') as htmlfile:
            htmlfile.write(htmlstr)
        with open(cssname, 'w') as cssfile:
            cssfile.write(css)
        print('Completed')
        print('XHTML version of book can be found at: ' + outname)

    except ValueError as e:
        print("Error: %s" % e)
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())