''' CHM File decoding support '''
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
                ' and Alex Bramley <a.bramley at gmail.com>.'

import codecs
import os
import re

from calibre import guess_type as guess_mimetype
from calibre.constants import filesystem_encoding, iswindows
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata.toc import TOC
from chm.chm import CHMFile, chmlib
from polyglot.builtins import as_unicode


def match_string(s1, s2_already_lowered):
    '''
    Case-insensitively compare ``s1`` with an already lower-cased string.

    Returns False if either argument is None.
    '''
    if s1 is None or s2_already_lowered is None:
        return False
    return s1.lower() == s2_already_lowered


def check_all_prev_empty(tag):
    '''
    Return True if ``tag`` and every sibling before it contain no visible text.

    Only nodes whose class is exactly NavigableString are inspected; every
    other node type is treated as empty. The walk is iterative so that a very
    long run of siblings cannot exhaust the recursion limit.
    '''
    while tag is not None:
        # Exact class comparison is deliberate: it matches the historical
        # behaviour, where NavigableString subclasses are not inspected.
        if tag.__class__ == NavigableString and not check_empty(tag):
            return False
        tag = tag.previousSibling
    return True


def check_empty(s, rex=re.compile(r'\S')):
    '''Return True if ``s`` contains no non-whitespace character.'''
    return rex.search(s) is None


def _valid_codec_name(name):
    '''
    Return ``name`` as a str if Python knows a codec by that name, else None.

    ``name`` may be bytes (decoded as ASCII) or str. Any failure is treated
    as "no usable encoding" because the value comes from the CHM file itself.
    '''
    if not name:
        return None
    try:
        if isinstance(name, bytes):
            name = name.decode('ascii')
        codecs.lookup(name)
    except Exception:
        # Best effort: a bogus encoding name must never abort loading.
        return None
    return name


class CHMError(Exception):
    '''Raised when a CHM file cannot be opened or a member cannot be read.'''
    pass


class CHMReader(CHMFile):
    '''
    Reader that extracts the members of a CHM file and cleans up its HTML.
    '''

    def __init__(self, input, log, input_encoding=None):
        '''
        Open the CHM file at ``input``.

        :param input: Path to the CHM file, as str or bytes.
        :param log: Logger; ``warn()`` and ``exception()`` are used.
        :param input_encoding: Optional encoding used to decode HTML members
            before clean-up.
        :raises CHMError: If ``LoadCHM`` reports failure.
        '''
        CHMFile.__init__(self)
        if isinstance(input, str):
            enc = 'mbcs' if iswindows else filesystem_encoding
            try:
                input = input.encode(enc)
            except UnicodeEncodeError:
                # The path cannot be represented in the target encoding, so
                # work on a copy of the file with a temporary name instead.
                from calibre.ptempfile import PersistentTemporaryFile
                with PersistentTemporaryFile(suffix='.chm') as t, open(input, 'rb') as src:
                    t.write(src.read())
                input = t.name
        if not self.LoadCHM(input):
            raise CHMError("Unable to open CHM file '%s'"%(input,))
        self.log = log
        self.input_encoding = input_encoding
        self._sourcechm = input
        self._contents = None
        self._playorder = 0
        self._metadata = False
        self._extracted = False
        self.re_encoded_files = set()
        self.get_encodings()
        # NOTE(review): home/topics are presumably set by CHMFile.LoadCHM --
        # confirm against the chm bindings.
        if self.home:
            self.home = self.decode_hhp_filename(self.home)
        if self.topics:
            self.topics = self.decode_hhp_filename(self.topics)

        # location of '.hhc' file, which is the CHM TOC.
        # decode_hhp_filename() returns None when nothing resolves; fall back
        # to '' so that hhc_path becomes the '.hhc' sentinel that
        # ExtractFiles() knows how to recover from.
        base = self.topics or self.home or ''
        self.root = os.path.splitext(base.lstrip('/'))[0]
        self.hhc_path = self.root + ".hhc"

    def decode_hhp_filename(self, path):
        '''
        Decode a bytes member name into a str that resolves inside the CHM.

        str input is returned unchanged. Otherwise each candidate encoding is
        tried in turn and the first decoding that ``ResolveObject`` accepts is
        returned. Returns None if no candidate resolves.
        '''
        if isinstance(path, str):
            return path
        candidates = (self.encoding_from_system_file, self.encoding_from_lcid,
                      'cp1252', 'cp1251', 'latin1', 'utf-8')
        for enc in candidates:
            if not enc:
                continue
            try:
                q = path.decode(enc)
            except UnicodeDecodeError:
                continue
            res, ui = self.ResolveObject(q)
            if res == chmlib.CHM_RESOLVE_SUCCESS:
                return q
        return None

    def get_encodings(self):
        '''
        Populate ``encoding_from_system_file`` and ``encoding_from_lcid``.

        Each attribute is set to a codec name Python recognises, or None.
        '''
        self.encoding_from_system_file = _valid_codec_name(self.GetEncoding())
        self.encoding_from_lcid = None
        lcid = self.GetLCID()
        if lcid is not None:
            # The first element of the LCID result is used as the codec name.
            self.encoding_from_lcid = _valid_codec_name(lcid[0])

    def get_encoding(self):
        '''Return the best known encoding for this file, defaulting to cp1252.'''
        return self.encoding_from_system_file or self.encoding_from_lcid or 'cp1252'

    def _parse_toc(self, ul, basedir=os.getcwd()):
        '''
        Build a TOC from the ``<li>`` children of ``ul``, recursing into
        nested ``<ul>`` elements.

        Note that the ``basedir`` default is evaluated once, when the class
        body is executed.

        NOTE(review): relies on ``self._deentity``, which is not defined in
        this file -- confirm it is provided elsewhere.
        '''
        toc = TOC(play_order=self._playorder, base_path=basedir, text='')
        self._playorder += 1
        for li in ul('li', recursive=False):
            href = li.object('param', {'name': 'Local'})[0]['value']
            # partition() tolerates more than one '#' in the value, which
            # would make a two-way unpack of split('#') raise.
            href, sep, frag = href.partition('#')
            if not sep:
                frag = None
            name = self._deentity(li.object('param', {'name': 'Name'})[0]['value'])
            toc.add_item(href, frag, name, play_order=self._playorder)
            self._playorder += 1
            if li.ul:
                # Propagate basedir so nested entries share the parent's base
                # path instead of silently using the default.
                child = self._parse_toc(li.ul, basedir)
                child.parent = toc
                toc.append(child)
        return toc

    def ResolveObject(self, path):
        '''Resolve ``path`` inside the CHM, encoding str paths as UTF-8.'''
        # filenames are utf-8 encoded in the chm index as far as I can
        # determine, see
        # https://tika.apache.org/1.11/api/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.html
        if not isinstance(path, bytes):
            path = path.encode('utf-8')
        return CHMFile.ResolveObject(self, path)

    def GetFile(self, path):
        '''
        Return the data of the CHM member ``path``.

        :raises CHMError: If the member cannot be resolved or is zero bytes.
        '''
        # have to have abs paths for ResolveObject, but Contents() deliberately
        # makes them relative. So we don't have to worry, re-add the leading /.
        # note this path refers to the internal CHM structure
        if not path.startswith('/'):
            path = '/' + path
        res, ui = self.ResolveObject(path)
        if res != chmlib.CHM_RESOLVE_SUCCESS:
            raise CHMError(f"Unable to locate {path!r} within CHM file {self.filename!r}")
        size, data = self.RetrieveObject(ui)
        if size == 0:
            raise CHMError(f"{path!r} is zero bytes in length!")
        return data

    def get_home(self):
        '''Return the data of the member named by ``self.home``.'''
        return self.GetFile(self.home)

    def ExtractFiles(self, output_dir=os.getcwd(), debug_dump=False):
        '''
        Extract every member into ``output_dir``, clean up the HTML files and
        settle on ``self.hhc_path``.

        :param output_dir: Destination directory. The default is evaluated
            once, when the class body is executed.
        :param debug_dump: If truthy, a directory under which a copy of the
            raw extraction is stored as ``debug_dump``.
        '''
        html_files = set()
        # Members whose resolved location is outside this prefix are refused.
        root_prefix = os.path.join(os.path.abspath(output_dir), '')
        for path in self.Contents():
            # fix file names with ";<junk>" at the end, see _reformat().
            # Strip on the member name only, so that a ';' inside output_dir
            # itself cannot truncate the destination.
            fpath = path.split(';')[0]
            lpath = os.path.join(output_dir, fpath)
            if not os.path.abspath(lpath).startswith(root_prefix):
                # Member names come from the (untrusted) CHM file; never
                # write outside the requested directory.
                self.log.warn('%r would be extracted outside the output directory, skipping'%path)
                continue
            self._ensure_dir(lpath)
            try:
                data = self.GetFile(path)
            except Exception:
                self.log.exception('Failed to extract %s from CHM, ignoring'%path)
                continue
            try:
                with open(lpath, 'wb') as f:
                    f.write(data)
            except Exception:
                if iswindows and len(lpath) > 250:
                    self.log.warn('%r filename too long, skipping'%path)
                    continue
                raise
            try:
                mimetype = guess_mimetype(path)[0]
            except Exception:
                # Type detection is best effort; an undetectable file is
                # simply not treated as HTML.
                mimetype = None
            if mimetype and 'html' in mimetype:
                html_files.add(lpath)

        if debug_dump:
            import shutil
            shutil.copytree(output_dir, os.path.join(debug_dump, 'debug_dump'))
        for lpath in html_files:
            with open(lpath, 'r+b') as f:
                data = f.read()
                data = self._reformat(data, lpath)
                if isinstance(data, str):
                    data = data.encode('utf-8')
                f.seek(0)
                f.truncate()
                f.write(data)

        self._extracted = True
        files = [y for y in os.listdir(output_dir) if
                 os.path.isfile(os.path.join(output_dir, y))]
        if self.hhc_path not in files:
            # Try a case-insensitive match.
            wanted = self.hhc_path.lower()
            for f in files:
                if f.lower() == wanted:
                    self.hhc_path = f
                    break
        if self.hhc_path not in files and files:
            # Fall back to the first top-level HTML file. rpartition() looks
            # at the last extension, so 'a.b.html' is recognised as well.
            html_exts = {'html', 'htm', 'xhtm', 'xhtml'}
            for f in files:
                _, dot, ext = f.rpartition('.')
                if dot and ext.lower() in html_exts:
                    self.hhc_path = f
                    break

        if self.hhc_path == '.hhc' and self.hhc_path not in files:
            from calibre import walk
            for x in walk(output_dir):
                if os.path.basename(x).lower() in ('index.htm', 'index.html',
                        'contents.htm', 'contents.html'):
                    self.hhc_path = os.path.relpath(x, output_dir)
                    break

        # Last resort: any top-level file. Keep hhc_path when it names an
        # existing file in a sub-directory (as found by the walk above).
        if (self.hhc_path not in files and files and
                not os.path.isfile(os.path.join(output_dir, self.hhc_path))):
            self.hhc_path = files[0]

    @staticmethod
    def _is_nav_table(table):
        '''Return True if the first image in ``table`` has an alt text
        containing prev, next or team.'''
        try:
            alt = table.img['alt'].lower()
        except (TypeError, KeyError, AttributeError):
            # No image, no alt attribute, or an alt that is not a string.
            return False
        return any(word in alt for word in ('prev', 'next', 'team'))

    def _reformat(self, data, htmlpath):
        '''
        Clean up the HTML in ``data`` and return the result.

        Scripts, navigation tables and a leading ``<br>`` are removed, broken
        image references are repaired and a lone single-cell wrapper table is
        unwrapped. ``htmlpath`` is the on-disk location of the file; on
        success its absolute path is added to ``self.re_encoded_files``.
        If parsing or serialising fails, the (possibly decoded) input is
        returned instead.
        '''
        if self.input_encoding:
            data = data.decode(self.input_encoding)
        try:
            data = xml_to_unicode(data, strip_encoding_pats=True)[0]
            soup = BeautifulSoup(data)
        except ValueError:
            # hit some strange encoding problems...
            self.log.exception("Unable to parse html for cleaning, leaving it")
            return data
        # nuke javascript...
        for script in soup('script'):
            script.extract()
        # See if everything is inside a <head> tag
        # https://bugs.launchpad.net/bugs/1273512
        body = soup.find('body')
        if body is not None and body.parent.name == 'head':
            html = soup.find('html')
            if html is not None:
                html.insert(len(html), body)

        # Remove forward and back nav bars from the top/bottom of each page,
        # they disrupt the reading flow and waste space. Only the first and
        # last tables are considered, and only when they sit at the very
        # start/end of their parent and contain an image with an alt
        # attribute containing prev, next or team.
        t = soup('table')
        if t:
            first, last = t[0], t[-1]
            if (first.previousSibling is None or
                    first.previousSibling.previousSibling is None):
                if self._is_nav_table(first):
                    first.extract()
            if (last.nextSibling is None or
                    last.nextSibling.nextSibling is None):
                if self._is_nav_table(last):
                    last.extract()
        # for some very odd reason each page's content appears to be in a table
        # too. and this table has sub-tables for random asides... grr.

        # remove br at top of page if present after nav bars removed
        br = soup('br')
        if br and check_all_prev_empty(br[0].previousSibling):
            br[0].extract()

        # some images seem to be broken in some chm's :/
        base = os.path.dirname(htmlpath)
        for img in soup('img', src=True):
            src = img['src']
            ipath = os.path.join(base, *src.split('/'))
            if os.path.exists(ipath):
                continue
            # Drop the ";<junk>" suffix, mirroring what ExtractFiles() does
            # to the file names on disk.
            src = src.split(';')[0]
            if not src:
                continue
            ipath = os.path.join(base, *src.split('/'))
            if not os.path.exists(ipath):
                while src.startswith('../'):
                    src = src[3:]
            img['src'] = src
        try:
            # if there is only a single table with a single element
            # in the body, replace it by the contents of this single element
            tables = soup.body.findAll('table', recursive=False)
            if tables and len(tables) == 1:
                trs = tables[0].findAll('tr', recursive=False)
                if trs and len(trs) == 1:
                    tds = trs[0].findAll('td', recursive=False)
                    if tds and len(tds) == 1:
                        tdContents = tds[0].contents
                        tableIdx = soup.body.contents.index(tables[0])
                        tables[0].extract()
                        while tdContents:
                            soup.body.insert(tableIdx, tdContents.pop())
        except Exception:
            # Purely cosmetic step; on any failure keep the document as is.
            pass
        # do not prettify, it would reformat the <pre> tags!
        try:
            ans = soup.decode_contents()
            self.re_encoded_files.add(os.path.abspath(htmlpath))
            return ans
        except RuntimeError:
            return data

    def Contents(self):
        '''
        Return the list of member paths in the CHM, relative and without
        directory entries. The list is computed once and cached.
        '''
        if self._contents is not None:
            return self._contents
        paths = []

        def get_paths(chm, ui, ctx):
            # these are supposed to be UTF-8 in CHM as best as I can determine
            # see https://tika.apache.org/1.11/api/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.html
            path = as_unicode(ui.path, 'utf-8')
            # skip directories
            # note this path refers to the internal CHM structure
            if path[-1] != '/':
                # and make paths relative
                paths.append(path.lstrip('/'))
        chmlib.chm_enumerate(self.file, chmlib.CHM_ENUMERATE_NORMAL, get_paths, None)
        self._contents = paths
        return self._contents

    def _ensure_dir(self, path):
        '''Create the parent directory of ``path`` if it does not exist.'''
        parent = os.path.dirname(path)
        if parent:
            # exist_ok avoids the race between checking and creating.
            os.makedirs(parent, exist_ok=True)

    def extract_content(self, output_dir=os.getcwd(), debug_dump=False):
        '''Extract the CHM into ``output_dir``; see ExtractFiles().'''
        self.ExtractFiles(output_dir=output_dir, debug_dump=debug_dump)