1''' CHM File decoding support '''
2__license__ = 'GPL v3'
3__copyright__  = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
4                 ' and Alex Bramley <a.bramley at gmail.com>.'
5
6import os
7
8from calibre.customize.conversion import InputFormatPlugin
9from calibre.ptempfile import TemporaryDirectory
10from calibre.constants import filesystem_encoding
11from polyglot.builtins import as_bytes
12
13
class CHMInput(InputFormatPlugin):
    '''
    Input plugin that converts CHM files to OEB.

    The CHM is extracted to a temporary directory, an HTML root page is
    generated from its contents file, and the resulting HTML is turned into
    an OEB book through the HTML input plugin.
    '''

    name        = 'CHM Input'
    author      = 'Kovid Goyal and Alex Bramley'
    description = _('Convert CHM files to OEB')
    # NOTE(review): presumably the set of file extensions this plugin is
    # registered for -- confirm against InputFormatPlugin.
    file_types  = {'chm'}
    commit_name = 'chm_input'
21
22    def _chmtohtml(self, output_dir, chm_path, no_images, log, debug_dump=False):
23        from calibre.ebooks.chm.reader import CHMReader
24        log.debug('Opening CHM file')
25        rdr = CHMReader(chm_path, log, input_encoding=self.opts.input_encoding)
26        log.debug('Extracting CHM to %s' % output_dir)
27        rdr.extract_content(output_dir, debug_dump=debug_dump)
28        self._chm_reader = rdr
29        return rdr.hhc_path
30
31    def convert(self, stream, options, file_ext, log, accelerators):
32        from calibre.ebooks.chm.metadata import get_metadata_from_reader
33        from calibre.customize.ui import plugin_for_input_format
34        self.opts = options
35
36        log.debug('Processing CHM...')
37        with TemporaryDirectory('_chm2oeb') as tdir:
38            if not isinstance(tdir, str):
39                tdir = tdir.decode(filesystem_encoding)
40            html_input = plugin_for_input_format('html')
41            for opt in html_input.options:
42                setattr(options, opt.option.name, opt.recommended_value)
43            no_images = False  # options.no_images
44            chm_name = stream.name
45            # chm_data = stream.read()
46
47            # closing stream so CHM can be opened by external library
48            stream.close()
49            log.debug('tdir=%s' % tdir)
50            log.debug('stream.name=%s' % stream.name)
51            debug_dump = False
52            odi = options.debug_pipeline
53            if odi:
54                debug_dump = os.path.join(odi, 'input')
55            mainname = self._chmtohtml(tdir, chm_name, no_images, log,
56                    debug_dump=debug_dump)
57            mainpath = os.path.join(tdir, mainname)
58
59            try:
60                metadata = get_metadata_from_reader(self._chm_reader)
61            except Exception:
62                log.exception('Failed to read metadata, using filename')
63                from calibre.ebooks.metadata.book.base import Metadata
64                metadata = Metadata(os.path.basename(chm_name))
65            encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
66            self._chm_reader.CloseCHM()
67            # print((tdir, mainpath))
68            # from calibre import ipython
69            # ipython()
70
71            options.debug_pipeline = None
72            options.input_encoding = 'utf-8'
73            uenc = encoding
74            if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
75                uenc = 'utf-8'
76            htmlpath, toc = self._create_html_root(mainpath, log, uenc)
77            oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
78            options.debug_pipeline = odi
79            if toc.count() > 1:
80                oeb.toc = self.parse_html_toc(oeb.spine[0])
81                oeb.manifest.remove(oeb.spine[0])
82                oeb.auto_generated_toc = False
83        return oeb
84
85    def parse_html_toc(self, item):
86        from calibre.ebooks.oeb.base import TOC, XPath
87        dx = XPath('./h:div')
88        ax = XPath('./h:a[1]')
89
90        def do_node(parent, div):
91            for child in dx(div):
92                a = ax(child)[0]
93                c = parent.add(a.text, a.attrib['href'])
94                do_node(c, child)
95
96        toc = TOC()
97        root = XPath('//h:div[1]')(item.data)[0]
98        do_node(toc, root)
99        return toc
100
101    def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
102        # use HTMLInput plugin to generate book
103        from calibre.customize.builtins import HTMLInput
104        opts.breadth_first = True
105        htmlinput = HTMLInput(None)
106        oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi)
107        return oeb
108
109    def _create_html_root(self, hhcpath, log, encoding):
110        from lxml import html
111        from polyglot.urllib import unquote as _unquote
112        from calibre.ebooks.oeb.base import urlquote
113        from calibre.ebooks.chardet import xml_to_unicode
114        hhcdata = self._read_file(hhcpath)
115        hhcdata = hhcdata.decode(encoding)
116        hhcdata = xml_to_unicode(hhcdata, verbose=True,
117                            strip_encoding_pats=True, resolve_entities=True)[0]
118        hhcroot = html.fromstring(hhcdata)
119        toc = self._process_nodes(hhcroot)
120        # print("=============================")
121        # print("Printing hhcroot")
122        # print(etree.tostring(hhcroot, pretty_print=True))
123        # print("=============================")
124        log.debug('Found %d section nodes' % toc.count())
125        htmlpath = os.path.splitext(hhcpath)[0] + ".html"
126        base = os.path.dirname(os.path.abspath(htmlpath))
127
128        def unquote(x):
129            if isinstance(x, str):
130                x = x.encode('utf-8')
131            return _unquote(x).decode('utf-8')
132
133        def unquote_path(x):
134            y = unquote(x)
135            if (not os.path.exists(os.path.join(base, x)) and os.path.exists(os.path.join(base, y))):
136                x = y
137            return x
138
139        def donode(item, parent, base, subpath):
140            for child in item:
141                title = child.title
142                if not title:
143                    continue
144                raw = unquote_path(child.href or '')
145                rsrcname = os.path.basename(raw)
146                rsrcpath = os.path.join(subpath, rsrcname)
147                if (not os.path.exists(os.path.join(base, rsrcpath)) and os.path.exists(os.path.join(base, raw))):
148                    rsrcpath = raw
149
150                if '%' not in rsrcpath:
151                    rsrcpath = urlquote(rsrcpath)
152                if not raw:
153                    rsrcpath = ''
154                c = DIV(A(title, href=rsrcpath))
155                donode(child, c, base, subpath)
156                parent.append(c)
157
158        with open(htmlpath, 'wb') as f:
159            if toc.count() > 1:
160                from lxml.html.builder import HTML, BODY, DIV, A
161                path0 = toc[0].href
162                path0 = unquote_path(path0)
163                subpath = os.path.dirname(path0)
164                base = os.path.dirname(f.name)
165                root = DIV()
166                donode(toc, root, base, subpath)
167                raw = html.tostring(HTML(BODY(root)), encoding='utf-8',
168                                   pretty_print=True)
169                f.write(raw)
170            else:
171                f.write(as_bytes(hhcdata))
172        return htmlpath, toc
173
174    def _read_file(self, name):
175        with lopen(name, 'rb') as f:
176            data = f.read()
177        return data
178
179    def add_node(self, node, toc, ancestor_map):
180        from calibre.ebooks.chm.reader import match_string
181        if match_string(node.attrib.get('type', ''), 'text/sitemap'):
182            p = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]')
183            parent = p[0] if p else None
184            toc = ancestor_map.get(parent, toc)
185            title = href = ''
186            for param in node.xpath('./param'):
187                if match_string(param.attrib['name'], 'name'):
188                    title = param.attrib['value']
189                elif match_string(param.attrib['name'], 'local'):
190                    href = param.attrib['value']
191            child = toc.add(title or _('Unknown'), href)
192            ancestor_map[node] = child
193
194    def _process_nodes(self, root):
195        from calibre.ebooks.oeb.base import TOC
196        toc = TOC()
197        ancestor_map = {}
198        for node in root.xpath('//object'):
199            self.add_node(node, toc, ancestor_map)
200        return toc
201