1#!/usr/local/bin/python3.8
2# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
3
4
5# this program works in concert with the output from KindleUnpack
6
7'''
8Convert from Mobi ML to XHTML
9'''
10
11from __future__ import division, absolute_import, print_function
12
13import os
14import sys
15import re
16
# Tags that are not ordinary elements.  Each entry maps the tag name as it
# appears right after '<' to a pair (tag type, backstep), where backstep is
# the negative slice end used to cut the closing delimiter ('>' or '-->')
# off the raw tag text when its payload is extracted.
SPECIAL_HANDLING_TAGS = {
    '?xml': ('xmlheader', -1),
    '!--': ('comment', -3),
    '!DOCTYPE': ('doctype', -1),
}

# The tag types produced by the table above.
SPECIAL_HANDLING_TYPES = [
    'xmlheader',
    'doctype',
    'comment',
]

# Elements that never have content; they are always emitted self-closed and
# any explicit end tag for them is discarded.
SELF_CLOSING_TAGS = [
    'br',
    'hr',
    'input',
    'img',
    'image',
    'meta',
    'spacer',
    'link',
    'frame',
    'base',
    'col',
    'reference',
]
26
class MobiMLConverter(object):
    """Convert a Mobi markup language file into XHTML plus a stylesheet.

    The input file is read completely at construction time.  Calling
    processml() walks the markup once, repairs common nesting problems,
    moves presentational attributes into generated CSS classes and returns
    the resulting document, the stylesheet text and the stylesheet path.
    """

    PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
    IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')

    # characters treated as separators inside a tag
    _WHITESPACE = (' ', '\t', '\r', '\n')
    # characters that terminate a tag name
    _TAG_NAME_STOPS = ('>', '/', '"', "'") + _WHITESPACE
    # tag types that open an element (as opposed to 'end' and special types)
    _OPENING_TYPES = ('begin', 'single', 'single_ext')
    # tags whose align/height/width attributes are dropped without a style
    _TABLE_TAGS = ('table', 'td', 'tr')
    # tags replaced by a plain span (or div for 'content') with no attributes
    _GENERIC_TAGS = ('country-region', 'place', 'placetype', 'placename',
                     'state', 'city', 'street', 'address', 'content')
    # font size keywords mapped onto the numeric 1..7 font size scale
    _FONT_SIZE_KEYWORDS = {
        'xx-small': 1,
        'x-small': 2,
        'small': 3,
        'medium': 4,
        'large': 5,
        'x-large': 6,
        'xx-large': 7,
    }
    # numeric font size mapped to the css length that is emitted for it
    _FONT_SIZE_EMS = {
        1: '.65em',
        2: '.75em',
        3: '1em',
        4: '1.125em',
        5: '1.25em',
        6: '1.5em',
        7: '2em',
    }

    def __init__(self, filename):
        """Read the markup in *filename* and initialise the conversion state.

        Raises whatever open() or read() raise when the file is unreadable.
        """
        self.base_css_rules = (
            'blockquote { margin: 0em 0em 0em 1.25em }\n'
            'p { margin: 0em }\n'
            '.bold { font-weight: bold }\n'
            '.italic { font-style: italic }\n'
            '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n'
        )
        # generated css: class name -> rule text, plus a counter for naming
        self.tag_css_rules = {}
        self.tag_css_rule_cnt = 0
        # names of the currently open (non font) tags, innermost last
        self.path = []
        self.filename = filename
        # use a context manager so the handle is closed right after reading
        with open(self.filename, 'r') as mlfile:
            self.wipml = mlfile.read()
        # current parse offset into self.wipml
        self.pos = 0
        self.opfname = self.filename.rsplit('.', 1)[0] + '.opf'
        self.opos = 0
        # extra markup inserted at the start of the head section
        self.meta = ''
        self.cssname = os.path.join(os.path.dirname(self.filename), 'styles.css')
        self.current_font_size = 3
        # taginfo tuples of the font tags that are currently open
        self.font_history = []

    def cleanup_html(self):
        """Normalise the raw markup in self.wipml before it is parsed."""
        self.wipml = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.wipml)
        self.wipml = self.wipml.replace('\r\n', '\n')
        self.wipml = self.wipml.replace('> <', '>\n<')
        self.wipml = self.wipml.replace('<mbp: ', '<mbp:')
        self.wipml = self.wipml.replace('<br></br>', '<br/>')

    def replace_page_breaks(self):
        """Replace each run of mbp:pagebreak tags with one page break div."""
        self.wipml = self.PAGE_BREAK_PAT.sub(
            '<div class="mbp_pagebreak" />',
            self.wipml)

    # parse leading text of ml and tag
    def parseml(self):
        """Return the next piece of markup and advance self.pos past it.

        Returns None at the end of the input, (text, None) for a run of
        text and (None, tag) for a complete tag including its delimiters.
        A '<' that is followed by another '<' before any '>' is returned as
        text.  A '<' or '<!--' that is never closed is returned as text up
        to the end of the input; without that the position used to be reset
        to 0, which made the caller loop forever.
        """
        ml = self.wipml
        start = self.pos
        if start >= len(ml):
            return None
        if ml[start] != '<':
            nxt = ml.find('<', start)
            if nxt == -1:
                nxt = len(ml)
            self.pos = nxt
            return ml[start:nxt], None
        # handle comment as a special case to deal with multi-line comments
        if ml[start:start + 4] == '<!--':
            close = ml.find('-->', start + 1)
            if close != -1:
                close += 2
        else:
            close = ml.find('>', start + 1)
            nxt = ml.find('<', start + 1)
            if nxt != -1 and (close == -1 or nxt < close):
                # stray '<': hand everything up to the next '<' back as text
                self.pos = nxt
                return ml[start:nxt], None
        if close == -1:
            # unterminated tag or comment: the remainder is plain text
            self.pos = len(ml)
            return ml[start:], None
        self.pos = close + 1
        return None, ml[start:close + 1]

    def _skip_whitespace(self, s, p):
        """Return the first index at or after p in s that is not whitespace."""
        while s[p:p + 1] in self._WHITESPACE:
            p += 1
        return p

    # parses string version of tag to identify its name,
    # its type 'begin', 'end' or 'single',
    # plus build a hashtable of its attributes
    # code is written to handle the possibility of very poor formatting
    def parsetag(self, s):
        """Split the raw tag text s into (ttype, tname, tattr).

        ttype is 'begin', 'end', 'single', 'single_ext' or one of the types
        in SPECIAL_HANDLING_TAGS.  tname is lower-cased (except '!DOCTYPE').
        tattr maps lower-cased attribute names to their values; for special
        tags it only holds the raw payload under the key 'special'.
        Attributes of end tags are ignored.
        """
        n = len(s)
        tattr = {}
        ttype = None

        p = self._skip_whitespace(s, 1)
        if s[p:p + 1] == '/':
            ttype = 'end'
            p = self._skip_whitespace(s, p + 1)

        # get the tag name
        b = p
        while p < n and s[p] not in self._TAG_NAME_STOPS:
            p += 1
        tname = s[b:p].lower()
        if s.startswith('<!--'):
            # the comment text may follow '<!--' without a separating space,
            # so the name is fixed here instead of taken from the scan above
            tname = '!--'
            p = 4
        elif tname == '!doctype':
            tname = '!DOCTYPE'

        # special cases
        if tname in SPECIAL_HANDLING_TAGS:
            ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
            tattr['special'] = s[p:backstep]

        if ttype is None:
            # parse any attributes
            while s.find('=', p) != -1:
                p = self._skip_whitespace(s, p)
                eq = s.find('=', p)
                aname = s[p:eq].lower().strip()
                p = self._skip_whitespace(s, eq + 1)
                quote = s[p:p + 1]
                if quote in ('"', "'"):
                    p += 1
                    # only the same kind of quote closes the value
                    close = s.find(quote, p)
                    if close == -1:
                        # unterminated quote: take the rest of the tag
                        close = n - 1 if s.endswith('>') else n
                        close = max(close, p)
                        val = s[p:close]
                        p = close
                    else:
                        val = s[p:close]
                        p = close + 1
                else:
                    b = p
                    while p < n:
                        ch = s[p]
                        if ch == '>' or ch in self._WHITESPACE:
                            break
                        # a slash ends the value only as part of the
                        # closing '/>', so urls keep their slashes
                        if ch == '/' and s[p + 1:].strip() == '>':
                            break
                        p += 1
                    val = s[b:p]
                tattr[aname] = val

        # label beginning and single tags
        if ttype is None:
            ttype = 'begin'
            if s.find(' /', p) >= 0:
                ttype = 'single_ext'
            elif s.find('/', p) >= 0:
                ttype = 'single'
        return ttype, tname, tattr

    def _reconcile_end_tag(self, tname):
        """Pop tname off self.path and return any markup needed to repair nesting.

        An end tag with no matching open tag gets an empty start tag
        injected in front of it.  An end tag that matches an outer open tag
        gets end tags injected for every tag still open inside it.
        """
        injected = []
        if not self.path or tname != self.path[-1]:
            print('improper nesting: ', self.path, tname, 'end')
            if tname not in self.path:
                # handle case of end tag with no beginning by injecting empty begin tag
                injected.append(self.processtag(('begin', tname, None)))
                print("     - fixed by injecting empty start tag ", tname)
                self.path.append(tname)
            else:
                # handle case of dangling missing ends
                while self.path[-1] != tname:
                    dangling = self.path.pop()
                    injected.append(self.processtag(('end', dangling, None)))
                    print("     - fixed by injecting end tag ", dangling)
        self.path.pop()
        return ''.join(injected)

    # main routine to convert from mobi markup language to html
    def processml(self):
        """Convert self.wipml and return (htmlstr, css, cssname).

        htmlstr is the complete XHTML document, css the text of the
        stylesheet (base rules plus the generated classes) and cssname the
        path the stylesheet is meant to be written to.  The parse position
        and nesting state live on the instance, so this is meant to be
        called once per converter.
        """
        html_done = False
        head_done = False
        body_done = False

        out = []
        emit = out.append
        self.replace_page_breaks()
        self.cleanup_html()

        # now parse the cleaned up ml into standard xhtml
        while True:
            r = self.parseml()
            if not r:
                break

            text, tag = r
            if text:
                emit(text)
            if not tag:
                continue

            ttype, tname, tattr = self.parsetag(tag)

            # If we run into a DTD or xml declarations inside the body ... bail.
            # Comments are told apart by tag type; their tag name is '!--'.
            if tname in SPECIAL_HANDLING_TAGS and ttype != 'comment' and body_done:
                emit('\n</body></html>')
                break

            # make sure self-closing tags actually self-close
            if ttype == 'begin' and tname in SELF_CLOSING_TAGS:
                ttype = 'single'

            # make sure any end tags of self-closing tags are discarded
            if ttype == 'end' and tname in SELF_CLOSING_TAGS:
                continue

            opening = ttype in self._OPENING_TYPES

            # remove embedded guide and references from old mobis
            if tname in ('guide', 'ncx', 'reference') and opening:
                tname = 'removeme:{0}'.format(tname)
                tattr = None
            if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end':
                if self.path and self.path[-1] == 'removeme:{0}'.format(tname):
                    tname = 'removeme:{0}'.format(tname)
                    tattr = None

            # Get rid of font tags that only have a color attribute.
            if tname == 'font' and opening:
                if 'color' in tattr and len(tattr) == 1:
                    tname = 'removeme:{0}'.format(tname)
                    tattr = None

            # Get rid of empty spans in the markup.
            if tname == 'span' and opening and not tattr:
                tname = 'removeme:{0}'.format(tname)

            # need to handle fonts outside of the normal methods
            # so fonts tags won't be added to the self.path since we keep track
            # of font tags separately with self.font_history
            if tname == 'font' and ttype == 'begin':
                # check for nested font start tags
                if self.font_history:
                    # inject a font end tag
                    emit(self.processtag(('end', 'font', None)))
                # processtag rewrites the attribute dict in place, so the
                # tuple kept here later re-renders to the same start tag
                self.font_history.append((ttype, tname, tattr))
                # handle the current font start tag
                emit(self.processtag((ttype, tname, tattr)))
                continue

            # check for nested font tags and unnest them
            if tname == 'font' and ttype == 'end':
                if not self.font_history:
                    # stray end tag with no open font: nothing to close
                    continue
                self.font_history.pop()
                # handle this font end tag
                emit(self.processtag(('end', 'font', None)))
                # check if we were nested
                if self.font_history:
                    # inject a copy of the most recent font start tag from history
                    emit(self.processtag(self.font_history[-1]))
                continue

            # keep track of nesting path
            if ttype == 'begin':
                self.path.append(tname)
            elif ttype == 'end':
                emit(self._reconcile_end_tag(tname))

            # processtag returns '' for the 'removeme:' names, so only the
            # tags themselves are dropped; the text between them is kept
            emit(self.processtag((ttype, tname, tattr)))

            # handle potential issue of multiple html, head, and body sections
            if tname == 'html' and ttype == 'begin' and not html_done:
                emit('\n')
                html_done = True

            if tname == 'head' and ttype == 'begin' and not head_done:
                emit('\n')
                # also add in metadata and style link tags
                emit(self.meta)
                emit('<link href="styles.css" rel="stylesheet" type="text/css" />\n')
                head_done = True

            if tname == 'body' and ttype == 'begin' and not body_done:
                emit('\n')
                body_done = True

        htmlstr = ''.join(out)

        # handle issue of possibly missing html, head, and body tags
        if not body_done:
            htmlstr = '<body>\n' + htmlstr + '</body>\n'
        if not head_done:
            headstr = '<head>\n'
            headstr += self.meta
            headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
            headstr += '</head>\n'
            htmlstr = headstr + htmlstr
        if not html_done:
            htmlstr = '<html>\n' + htmlstr + '</html>\n'

        # finally add DOCTYPE info
        htmlstr = '<?xml version="1.0"?>\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n' + htmlstr

        css = self.base_css_rules
        for cls, rule in self.tag_css_rules.items():
            css += '.%s { %s }\n' % (cls, rule)

        return (htmlstr, css, self.cssname)

    def ensure_unit(self, raw, unit='px'):
        """Return raw with unit appended when raw ends in a digit."""
        if re.search(r'\d+$', raw) is not None:
            raw += unit
        return raw

    # flatten possibly modified tag back to string
    def taginfo_tostring(self, taginfo):
        """Render a (ttype, tname, tattr) tuple back into tag text.

        Returns '' when the type or name is None.  Attribute values are
        written in double quotes, so double quotes inside a value are
        written as &quot;.
        """
        (ttype, tname, tattr) = taginfo
        if ttype is None or tname is None:
            return ''
        if ttype == 'end':
            return '</%s>' % tname
        if tattr and 'special' in tattr and ttype in SPECIAL_HANDLING_TYPES:
            info = tattr['special']
            if ttype == 'comment':
                return '<%s %s-->' % (tname, info)
            return '<%s %s>' % (tname, info)
        parts = ['<%s' % tname]
        if tattr is not None:
            for key in tattr:
                val = ('%s' % tattr[key]).replace('"', '&quot;')
                parts.append(' %s="%s"' % (key, val))
        if ttype == 'single':
            parts.append('/>')
        elif ttype == 'single_ext':
            parts.append(' />')
        else:
            parts.append('>')
        return "".join(parts)

    def _resolve_font_size(self, raw):
        """Translate a font size attribute into a number from 1 to 7.

        Accepts the size keywords, absolute numbers and signed numbers that
        are relative to self.current_font_size.  Numbers are clamped to the
        1..7 range.  Returns None when the value cannot be interpreted.
        """
        sz = raw.strip().lower()
        if sz in self._FONT_SIZE_KEYWORDS:
            return self._FONT_SIZE_KEYWORDS[sz]
        try:
            num = float(sz)
        except ValueError:
            return None
        if sz.startswith(('-', '+')):
            num += self.current_font_size
        if num != num:
            # NaN cannot be clamped or converted to an int
            return None
        return int(min(7, max(1, num)))

    # routines to convert from mobi ml tags attributes to xhtml attributes and styles
    def processtag(self, taginfo):
        """Convert one (ttype, tname, tattr) tuple and return it as tag text.

        Presentational attributes are turned into css declarations that are
        registered in self.tag_css_rules and referenced through the class
        attribute.  font tags become span tags and filepos attributes become
        id/href anchors.  Returns '' for tags marked 'removeme'.

        The tattr dict, when given and non-empty, is modified in place.
        """
        # current tag to work on
        (ttype, tname, tattr) = taginfo
        if not tattr:
            tattr = {}

        styles = []

        if tname is None or tname.startswith('removeme'):
            return ''

        # have not seen an example of this yet so keep it here to be safe
        # until this is better understood
        if tname in self._GENERIC_TAGS:
            tname = 'div' if tname == 'content' else 'span'
            tattr.clear()

        # handle general case of style, height, width, bgcolor in any tag
        if 'style' in tattr:
            style = tattr.pop('style').strip()
            if style:
                styles.append(style)

        if 'align' in tattr:
            align = tattr.pop('align').strip()
            if align and tname not in self._TABLE_TAGS:
                styles.append('text-align: %s' % align)

        if 'height' in tattr:
            height = tattr.pop('height').strip()
            if height and '<' not in height and '>' not in height and re.search(r'\d+', height):
                if tname == 'img':
                    tattr['height'] = height
                elif tname not in self._TABLE_TAGS:
                    styles.append('margin-top: %s' % self.ensure_unit(height))

        if 'width' in tattr:
            width = tattr.pop('width').strip()
            if width and re.search(r'\d+', width):
                if tname == 'img':
                    tattr['width'] = width
                elif tname not in self._TABLE_TAGS:
                    styles.append('text-indent: %s' % self.ensure_unit(width))
                    if width.startswith('-'):
                        styles.append('margin-left: %s' % self.ensure_unit(width[1:]))

        # no proprietary html allowed
        if tname == 'div' and 'bgcolor' in tattr:
            del tattr['bgcolor']

        # tag specific handling; this must run whether or not the tag has a
        # bgcolor, otherwise a font start tag stays 'font' while its end tag
        # is turned into a span
        if tname == 'font':
            # Change font tags to span tags
            tname = 'span'
            if ttype in self._OPENING_TYPES:
                # move the face attribute to css font-family
                if 'face' in tattr:
                    face = tattr.pop('face').strip()
                    styles.append('font-family: "%s"' % face)

                # Track the font size, convert it to ems and move it to css.
                # A size that cannot be interpreted is dropped.
                if 'size' in tattr:
                    size = self._resolve_font_size(tattr.pop('size'))
                    if size is not None:
                        styles.append('font-size: %s' % self._FONT_SIZE_EMS[size])
                        self.current_font_size = size

        elif tname == 'img':
            for attr in ('width', 'height'):
                if attr in tattr:
                    val = tattr[attr]
                    if val.lower().endswith('em'):
                        try:
                            nval = float(val[:-2])
                            nval *= 16 * (168.451/72)  # Assume this was set using the Kindle profile
                            tattr[attr] = "%dpx" % int(nval)
                        except (ValueError, OverflowError):
                            del tattr[attr]
                    elif val.lower().endswith('%'):
                        del tattr[attr]

        # convert the anchor tags
        if 'filepos-id' in tattr:
            tattr['id'] = tattr.pop('filepos-id')
            if 'name' in tattr and tattr['name'] != tattr['id']:
                tattr['name'] = tattr['id']

        if 'filepos' in tattr:
            filepos = tattr.pop('filepos')
            try:
                tattr['href'] = "#filepos%d" % int(filepos)
            except ValueError:
                # a filepos that is not a number is dropped without a link
                pass

        if styles:
            rule = '; '.join(styles)
            # reuse the class of an identical rule when there is one
            ncls = next((sel for sel, srule in self.tag_css_rules.items()
                         if srule == rule), None)
            if ncls is None:
                self.tag_css_rule_cnt += 1
                ncls = 'rule_%d' % self.tag_css_rule_cnt
                self.tag_css_rules[ncls] = rule
            cls = tattr.get('class', '')
            cls = cls + (' ' if cls else '') + ncls
            tattr['class'] = cls

        # convert updated tag back to string representation
        if len(tattr) == 0:
            tattr = None
        taginfo = (ttype, tname, tattr)
        return self.taginfo_tostring(taginfo)
499
500''' main only left in for testing outside of plugin '''
501
def main(argv=sys.argv):
    """Convert the Mobi markup file named in argv[1] to XHTML.

    Writes the document next to the input as '<name>_converted.html' and
    the stylesheet to the path reported by the converter.

    Returns 0 on success and 1 when the argument count is wrong or the
    conversion fails with a ValueError or an I/O error.
    """
    if len(argv) != 2:
        prog = argv[0] if argv else 'prog'
        print('Usage: %s <mobiml-file>' % prog, file=sys.stderr)
        return 1
    infile = argv[1]

    try:
        print('Converting Mobi Markup Language to XHTML')
        mlc = MobiMLConverter(infile)
        print('Processing ...')
        htmlstr, css, cssname = mlc.processml()
        outname = infile.rsplit('.', 1)[0] + '_converted.html'
        # context managers make sure both outputs are flushed and closed
        with open(outname, 'w') as htmlfile:
            htmlfile.write(htmlstr)
        with open(cssname, 'w') as cssfile:
            cssfile.write(css)
        print('Completed')
        print('XHTML version of book can be found at: ' + outname)

    except (ValueError, IOError, OSError) as e:
        print("Error: %s" % e)
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
528