''' CHM File decoding support '''
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
                ' and Alex Bramley <a.bramley at gmail.com>.'

import os

from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from calibre.constants import filesystem_encoding
from polyglot.builtins import as_bytes


class CHMInput(InputFormatPlugin):
    '''
    Input plugin that extracts a CHM file into a temporary directory,
    builds an HTML root document from the CHM table of contents (the .hhc
    file) and hands that document to the HTML input plugin to produce an
    OEB book.
    '''

    name = 'CHM Input'
    author = 'Kovid Goyal and Alex Bramley'
    description = _('Convert CHM files to OEB')
    file_types = {'chm'}
    commit_name = 'chm_input'

    def _chmtohtml(self, output_dir, chm_path, no_images, log, debug_dump=False):
        '''
        Open the CHM at ``chm_path`` and extract its content to ``output_dir``.

        The reader is stored on ``self._chm_reader`` so that the caller can
        query it afterwards (and is responsible for closing it).
        ``no_images`` is accepted for interface compatibility but is not used
        here. ``debug_dump`` is forwarded to the reader's ``extract_content``.

        Returns the reader's ``hhc_path``.
        '''
        from calibre.ebooks.chm.reader import CHMReader
        log.debug('Opening CHM file')
        reader = CHMReader(chm_path, log, input_encoding=self.opts.input_encoding)
        log.debug('Extracting CHM to %s' % output_dir)
        reader.extract_content(output_dir, debug_dump=debug_dump)
        self._chm_reader = reader
        return reader.hhc_path

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Convert the CHM file behind ``stream`` into an OEB book.

        ``stream`` must expose a ``name`` attribute holding the path of the
        CHM file; the stream itself is closed before extraction. ``options``
        is modified: every option of the HTML input plugin is reset to its
        recommended value and ``input_encoding`` is set to ``'utf-8'``.
        ``file_ext`` and ``accelerators`` are not used by this method.

        Returns the OEB book created by the HTML input plugin. When the CHM
        table of contents has more than one entry, the generated root HTML
        file is turned into the book's TOC and removed from the manifest.
        '''
        from calibre.ebooks.chm.metadata import get_metadata_from_reader
        from calibre.customize.ui import plugin_for_input_format
        self.opts = options

        log.debug('Processing CHM...')
        with TemporaryDirectory('_chm2oeb') as tdir:
            if not isinstance(tdir, str):
                tdir = tdir.decode(filesystem_encoding)
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            no_images = False  # options.no_images
            chm_name = stream.name

            # closing stream so CHM can be opened by external library
            stream.close()
            log.debug('tdir=%s' % tdir)
            log.debug('stream.name=%s' % chm_name)
            debug_dump = False
            odi = options.debug_pipeline
            if odi:
                debug_dump = os.path.join(odi, 'input')
            mainname = self._chmtohtml(tdir, chm_name, no_images, log,
                                       debug_dump=debug_dump)
            mainpath = os.path.join(tdir, mainname)

            try:
                try:
                    metadata = get_metadata_from_reader(self._chm_reader)
                except Exception:
                    log.exception('Failed to read metadata, using filename')
                    from calibre.ebooks.metadata.book.base import Metadata
                    metadata = Metadata(os.path.basename(chm_name))
                encoding = (self._chm_reader.get_encoding() or
                            options.input_encoding or 'cp1252')
            finally:
                # Always release the CHM, even if querying the reader failed.
                self._chm_reader.CloseCHM()

            # The hhc file is decoded with the CHM's encoding unless the
            # reader lists it among the files it already re-encoded.
            uenc = encoding
            if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
                uenc = 'utf-8'

            # debug_pipeline is switched off while the HTML input plugin runs
            # and restored afterwards, whether or not that step succeeds.
            options.debug_pipeline = None
            options.input_encoding = 'utf-8'
            try:
                htmlpath, toc = self._create_html_root(mainpath, log, uenc)
                oeb = self._create_oebbook_html(htmlpath, tdir, options, log,
                                                metadata)
            finally:
                options.debug_pipeline = odi

            if toc.count() > 1:
                # The first spine item is the generated root HTML file; it
                # only exists to carry the TOC, so drop it from the book.
                oeb.toc = self.parse_html_toc(oeb.spine[0])
                oeb.manifest.remove(oeb.spine[0])
                oeb.auto_generated_toc = False
        return oeb

    def parse_html_toc(self, item):
        '''
        Build a TOC from the nested ``<div><a href=...>title</a>...</div>``
        structure found in ``item.data``.

        Each ``div`` child contributes one TOC entry taken from its first
        ``a`` element; nested ``div`` elements become child entries.
        '''
        from calibre.ebooks.oeb.base import TOC, XPath
        child_divs = XPath('./h:div')
        first_anchor = XPath('./h:a[1]')

        def do_node(parent, div):
            for child in child_divs(div):
                a = first_anchor(child)[0]
                entry = parent.add(a.text, a.attrib['href'])
                do_node(entry, child)

        toc = TOC()
        root = XPath('//h:div[1]')(item.data)[0]
        do_node(toc, root)
        return toc

    def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
        '''
        Create an OEB book from ``htmlpath`` using the HTML input plugin.

        Sets ``opts.breadth_first`` to True before delegating.
        '''
        # use HTMLInput plugin to generate book
        from calibre.customize.builtins import HTMLInput
        opts.breadth_first = True
        htmlinput = HTMLInput(None)
        return htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi)

    def _create_html_root(self, hhcpath, log, encoding):
        '''
        Write an HTML root file next to ``hhcpath`` (same name, ``.html``
        extension) and return ``(htmlpath, toc)``.

        The hhc file is decoded using ``encoding`` and parsed into a TOC. If
        the TOC has more than one entry, the root file contains nested
        ``div``/``a`` elements mirroring the TOC; otherwise the decoded hhc
        data itself is written out.
        '''
        from lxml import html
        from lxml.html.builder import HTML, BODY, DIV, A
        from polyglot.urllib import unquote as _unquote
        from calibre.ebooks.oeb.base import urlquote
        from calibre.ebooks.chardet import xml_to_unicode
        hhcdata = self._read_file(hhcpath)
        hhcdata = hhcdata.decode(encoding)
        hhcdata = xml_to_unicode(hhcdata, verbose=True,
                                 strip_encoding_pats=True,
                                 resolve_entities=True)[0]
        hhcroot = html.fromstring(hhcdata)
        toc = self._process_nodes(hhcroot)
        log.debug('Found %d section nodes' % toc.count())
        htmlpath = os.path.splitext(hhcpath)[0] + ".html"
        base = os.path.dirname(os.path.abspath(htmlpath))

        def unquote(x):
            if isinstance(x, str):
                x = x.encode('utf-8')
            return _unquote(x).decode('utf-8')

        def unquote_path(x):
            # Prefer the unquoted form only when the quoted form does not
            # exist on disk but the unquoted one does.
            y = unquote(x)
            if (not os.path.exists(os.path.join(base, x)) and
                    os.path.exists(os.path.join(base, y))):
                x = y
            return x

        def donode(item, parent, base, subpath):
            for child in item:
                title = child.title
                if not title:
                    continue
                raw = unquote_path(child.href or '')
                rsrcname = os.path.basename(raw)
                rsrcpath = os.path.join(subpath, rsrcname)
                # Fall back to the href as given when the file is not found
                # in the directory of the first TOC entry.
                if (not os.path.exists(os.path.join(base, rsrcpath)) and
                        os.path.exists(os.path.join(base, raw))):
                    rsrcpath = raw

                # A '%' is taken to mean the path is already quoted.
                if '%' not in rsrcpath:
                    rsrcpath = urlquote(rsrcpath)
                if not raw:
                    rsrcpath = ''
                c = DIV(A(title, href=rsrcpath))
                donode(child, c, base, subpath)
                parent.append(c)

        with open(htmlpath, 'wb') as f:
            if toc.count() > 1:
                path0 = unquote_path(toc[0].href)
                subpath = os.path.dirname(path0)
                root = DIV()
                donode(toc, root, base, subpath)
                raw = html.tostring(HTML(BODY(root)), encoding='utf-8',
                                    pretty_print=True)
                f.write(raw)
            else:
                f.write(as_bytes(hhcdata))
        return htmlpath, toc

    def _read_file(self, name):
        '''Return the raw bytes of the file called ``name``.'''
        # NOTE(review): lopen is not imported in this file; presumably it is
        # installed as a builtin by calibre at startup -- confirm.
        with lopen(name, 'rb') as f:
            data = f.read()
        return data

    def add_node(self, node, toc, ancestor_map):
        '''
        Add a TOC entry for ``node`` if it is a ``text/sitemap`` object.

        The entry is attached to the TOC entry recorded in ``ancestor_map``
        for the closest enclosing ``li``'s first ``object``, or to ``toc``
        when there is none. The new entry is recorded in ``ancestor_map``
        under ``node``. The title comes from the ``name`` param and the href
        from the ``local`` param; a missing or empty title becomes
        ``_('Unknown')``.
        '''
        from calibre.ebooks.chm.reader import match_string
        if match_string(node.attrib.get('type', ''), 'text/sitemap'):
            p = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]')
            parent = p[0] if p else None
            toc = ancestor_map.get(parent, toc)
            title = href = ''
            for param in node.xpath('./param'):
                # Use .get() so a <param> lacking name/value in a malformed
                # hhc file is ignored rather than raising KeyError.
                pname = param.attrib.get('name', '')
                if match_string(pname, 'name'):
                    title = param.attrib.get('value', '')
                elif match_string(pname, 'local'):
                    href = param.attrib.get('value', '')
            child = toc.add(title or _('Unknown'), href)
            ancestor_map[node] = child

    def _process_nodes(self, root):
        '''Build and return a TOC from every ``object`` element under ``root``.'''
        from calibre.ebooks.oeb.base import TOC
        toc = TOC()
        ancestor_map = {}
        for node in root.xpath('//object'):
            self.add_node(node, toc, ancestor_map)
        return toc