1''' CHM File decoding support '''
2__license__ = 'GPL v3'
3__copyright__  = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
4                 ' and Alex Bramley <a.bramley at gmail.com>.'
5
6import codecs
7import os
8import re
9
10from calibre import guess_type as guess_mimetype
11from calibre.constants import filesystem_encoding, iswindows
12from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
13from calibre.ebooks.chardet import xml_to_unicode
14from calibre.ebooks.metadata.toc import TOC
15from chm.chm import CHMFile, chmlib
16from polyglot.builtins import as_unicode
17
18
def match_string(s1, s2_already_lowered):
    '''Compare ``s1`` case-insensitively against an already lower-cased string.

    Returns ``False`` when either argument is ``None``; otherwise returns
    whether ``s1.lower()`` equals ``s2_already_lowered``.
    '''
    if s1 is None or s2_already_lowered is None:
        return False
    return s1.lower() == s2_already_lowered
24
25
def check_all_prev_empty(tag):
    '''Report whether ``tag`` and everything before it holds no visible text.

    Walks backwards along the ``previousSibling`` links starting at ``tag``.
    Returns ``False`` as soon as a ``NavigableString`` containing a
    non-whitespace character is met, and ``True`` once the chain ends
    (``None``). Nodes of any other class are stepped over without being
    inspected.

    The walk is a loop rather than recursion so that a node preceded by a
    very long run of siblings cannot exhaust the interpreter's recursion
    limit.
    '''
    node = tag
    while node is not None:
        # Exact class comparison: only plain NavigableString nodes are
        # examined, every other node type is treated as empty.
        if node.__class__ == NavigableString and not check_empty(node):
            return False
        node = node.previousSibling
    return True
32
33
def check_empty(s, rex=re.compile(r'\S')):
    '''Return ``True`` when ``rex`` finds nothing in ``s``.

    With the default pattern this means ``s`` contains no non-whitespace
    character (an empty string counts as empty).
    '''
    first_hit = rex.search(s)
    return first_hit is None
36
37
class CHMError(Exception):
    '''Error raised by the CHM reader when a CHM file or one of its entries cannot be read.'''
40
41
42class CHMReader(CHMFile):
43
44    def __init__(self, input, log, input_encoding=None):
45        CHMFile.__init__(self)
46        if isinstance(input, str):
47            enc = 'mbcs' if iswindows else filesystem_encoding
48            try:
49                input = input.encode(enc)
50            except UnicodeEncodeError:
51                from calibre.ptempfile import PersistentTemporaryFile
52                with PersistentTemporaryFile(suffix='.chm') as t:
53                    t.write(open(input, 'rb').read())
54                input = t.name
55        if not self.LoadCHM(input):
56            raise CHMError("Unable to open CHM file '%s'"%(input,))
57        self.log = log
58        self.input_encoding = input_encoding
59        self._sourcechm = input
60        self._contents = None
61        self._playorder = 0
62        self._metadata = False
63        self._extracted = False
64        self.re_encoded_files = set()
65        self.get_encodings()
66        if self.home:
67            self.home = self.decode_hhp_filename(self.home)
68        if self.topics:
69            self.topics = self.decode_hhp_filename(self.topics)
70
71        # location of '.hhc' file, which is the CHM TOC.
72        base = self.topics or self.home
73        self.root = os.path.splitext(base.lstrip('/'))[0]
74        self.hhc_path = self.root + ".hhc"
75
76    def decode_hhp_filename(self, path):
77        if isinstance(path, str):
78            return path
79        for enc in (self.encoding_from_system_file, self.encoding_from_lcid, 'cp1252', 'cp1251', 'latin1', 'utf-8'):
80            if enc:
81                try:
82                    q = path.decode(enc)
83                except UnicodeDecodeError:
84                    continue
85                res, ui = self.ResolveObject(q)
86                if res == chmlib.CHM_RESOLVE_SUCCESS:
87                    return q
88
89    def get_encodings(self):
90        self.encoding_from_system_file = self.encoding_from_lcid = None
91        q = self.GetEncoding()
92        if q:
93            try:
94                if isinstance(q, bytes):
95                    q = q.decode('ascii')
96                    codecs.lookup(q)
97                    self.encoding_from_system_file = q
98            except Exception:
99                pass
100
101        lcid = self.GetLCID()
102        if lcid is not None:
103            q = lcid[0]
104            if q:
105                try:
106                    if isinstance(q, bytes):
107                        q = q.decode('ascii')
108                        codecs.lookup(q)
109                        self.encoding_from_lcid = q
110                except Exception:
111                    pass
112
113    def get_encoding(self):
114        return self.encoding_from_system_file or self.encoding_from_lcid or 'cp1252'
115
116    def _parse_toc(self, ul, basedir=os.getcwd()):
117        toc = TOC(play_order=self._playorder, base_path=basedir, text='')
118        self._playorder += 1
119        for li in ul('li', recursive=False):
120            href = li.object('param', {'name': 'Local'})[0]['value']
121            if href.count('#'):
122                href, frag = href.split('#')
123            else:
124                frag = None
125            name = self._deentity(li.object('param', {'name': 'Name'})[0]['value'])
126            # print "========>", name
127            toc.add_item(href, frag, name, play_order=self._playorder)
128            self._playorder += 1
129            if li.ul:
130                child = self._parse_toc(li.ul)
131                child.parent = toc
132                toc.append(child)
133        # print toc
134        return toc
135
136    def ResolveObject(self, path):
137        # filenames are utf-8 encoded in the chm index as far as I can
138        # determine, see https://tika.apache.org/1.11/api/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.html
139        if not isinstance(path, bytes):
140            path = path.encode('utf-8')
141        return CHMFile.ResolveObject(self, path)
142
143    def GetFile(self, path):
144        # have to have abs paths for ResolveObject, but Contents() deliberately
145        # makes them relative. So we don't have to worry, re-add the leading /.
146        # note this path refers to the internal CHM structure
147        if path[0] != '/':
148            path = '/' + path
149        res, ui = self.ResolveObject(path)
150        if res != chmlib.CHM_RESOLVE_SUCCESS:
151            raise CHMError(f"Unable to locate {path!r} within CHM file {self.filename!r}")
152        size, data = self.RetrieveObject(ui)
153        if size == 0:
154            raise CHMError(f"{path!r} is zero bytes in length!")
155        return data
156
157    def get_home(self):
158        return self.GetFile(self.home)
159
160    def ExtractFiles(self, output_dir=os.getcwd(), debug_dump=False):
161        html_files = set()
162        for path in self.Contents():
163            fpath = path
164            lpath = os.path.join(output_dir, fpath)
165            self._ensure_dir(lpath)
166            try:
167                data = self.GetFile(path)
168            except:
169                self.log.exception('Failed to extract %s from CHM, ignoring'%path)
170                continue
171            if lpath.find(';') != -1:
172                # fix file names with ";<junk>" at the end, see _reformat()
173                lpath = lpath.split(';')[0]
174            try:
175                with open(lpath, 'wb') as f:
176                    f.write(data)
177                try:
178                    if 'html' in guess_mimetype(path)[0]:
179                        html_files.add(lpath)
180                except:
181                    pass
182            except:
183                if iswindows and len(lpath) > 250:
184                    self.log.warn('%r filename too long, skipping'%path)
185                    continue
186                raise
187
188        if debug_dump:
189            import shutil
190            shutil.copytree(output_dir, os.path.join(debug_dump, 'debug_dump'))
191        for lpath in html_files:
192            with lopen(lpath, 'r+b') as f:
193                data = f.read()
194                data = self._reformat(data, lpath)
195                if isinstance(data, str):
196                    data = data.encode('utf-8')
197                f.seek(0)
198                f.truncate()
199                f.write(data)
200
201        self._extracted = True
202        files = [y for y in os.listdir(output_dir) if
203                os.path.isfile(os.path.join(output_dir, y))]
204        if self.hhc_path not in files:
205            for f in files:
206                if f.lower() == self.hhc_path.lower():
207                    self.hhc_path = f
208                    break
209        if self.hhc_path not in files and files:
210            for f in files:
211                if f.partition('.')[-1].lower() in {'html', 'htm', 'xhtm',
212                        'xhtml'}:
213                    self.hhc_path = f
214                    break
215
216        if self.hhc_path == '.hhc' and self.hhc_path not in files:
217            from calibre import walk
218            for x in walk(output_dir):
219                if os.path.basename(x).lower() in ('index.htm', 'index.html',
220                        'contents.htm', 'contents.html'):
221                    self.hhc_path = os.path.relpath(x, output_dir)
222                    break
223
224        if self.hhc_path not in files and files:
225            self.hhc_path = files[0]
226
    def _reformat(self, data, htmlpath):
        '''Clean up one extracted HTML file and return the new markup.

        :param data: content of the file; decoded with
            ``self.input_encoding`` first when that is set, then passed
            through ``xml_to_unicode``.
        :param htmlpath: path of the file on disk, used to check whether the
            images it references exist.
        :return: the cleaned markup produced by ``soup.decode_contents()``.
            ``data`` is returned instead when parsing raises ``ValueError``
            or serialising raises ``RuntimeError``.

        On success the absolute ``htmlpath`` is added to
        ``self.re_encoded_files``.
        '''
        if self.input_encoding:
            data = data.decode(self.input_encoding)
        try:
            data = xml_to_unicode(data, strip_encoding_pats=True)[0]
            soup = BeautifulSoup(data)
        except ValueError:
            # hit some strange encoding problems...
            self.log.exception("Unable to parse html for cleaning, leaving it")
            return data
        # remove all <script> elements
        [s.extract() for s in soup('script')]
        # See if everything is inside a <head> tag
        # https://bugs.launchpad.net/bugs/1273512
        body = soup.find('body')
        if body is not None and body.parent.name == 'head':
            # move the misplaced <body> to the end of <html>
            html = soup.find('html')
            html.insert(len(html), body)

        # remove forward and back nav bars from the top/bottom of each page
        # because they disrupt the flow of the text and generally waste space.
        # since we can't use [a,b] syntax to select arbitrary items from a list
        # we'll have to do this manually...
        # only remove the tables, if they have an image with an alt attribute
        # containing prev, next or team
        t = soup('table')
        if t:
            # first table, only when at most one sibling precedes it
            if (t[0].previousSibling is None or t[0].previousSibling.previousSibling is None):
                try:
                    alt = t[0].img['alt'].lower()
                    if alt.find('prev') != -1 or alt.find('next') != -1 or alt.find('team') != -1:
                        t[0].extract()
                except:
                    # no <img> or no alt attribute: leave the table alone
                    pass
            # last table, only when at most one sibling follows it
            if (t[-1].nextSibling is None or t[-1].nextSibling.nextSibling is None):
                try:
                    alt = t[-1].img['alt'].lower()
                    if alt.find('prev') != -1 or alt.find('next') != -1 or alt.find('team') != -1:
                        t[-1].extract()
                except:
                    # no <img> or no alt attribute: leave the table alone
                    pass
        # for some very odd reason each page's content appears to be in a table
        # too. and this table has sub-tables for random asides... grr.

        # remove br at top of page if present after nav bars removed
        br = soup('br')
        if br:
            if check_all_prev_empty(br[0].previousSibling):
                br[0].extract()

        # some images seem to be broken in some chm's :/
        base = os.path.dirname(htmlpath)
        for img in soup('img', src=True):
            src = img['src']
            ipath = os.path.join(base, *src.split('/'))
            if os.path.exists(ipath):
                # the referenced file exists, nothing to repair
                continue
            # drop a ";<junk>" suffix from the reference
            src = src.split(';')[0]
            if not src:
                continue
            ipath = os.path.join(base, *src.split('/'))
            if not os.path.exists(ipath):
                # still missing: strip every leading "../" from the reference
                while src.startswith('../'):
                    src = src[3:]
            img['src'] = src
        try:
            # if there is only a single table with a single element
            # in the body, replace it by the contents of this single element
            tables = soup.body.findAll('table', recursive=False)
            if tables and len(tables) == 1:
                trs = tables[0].findAll('tr', recursive=False)
                if trs and len(trs) == 1:
                    tds = trs[0].findAll('td', recursive=False)
                    if tds and len(tds) == 1:
                        tdContents = tds[0].contents
                        tableIdx = soup.body.contents.index(tables[0])
                        tables[0].extract()
                        # popping from the end and always inserting at
                        # tableIdx keeps the cell's children in order
                        while tdContents:
                            soup.body.insert(tableIdx, tdContents.pop())
        except:
            # best effort: any failure here leaves the tree as it is
            pass
        # do not prettify, it would reformat the <pre> tags!
        try:
            ans = soup.decode_contents()
            self.re_encoded_files.add(os.path.abspath(htmlpath))
            return ans
        except RuntimeError:
            return data
315
316    def Contents(self):
317        if self._contents is not None:
318            return self._contents
319        paths = []
320
321        def get_paths(chm, ui, ctx):
322            # these are supposed to be UTF-8 in CHM as best as I can determine
323            # see https://tika.apache.org/1.11/api/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.html
324            path = as_unicode(ui.path, 'utf-8')
325            # skip directories
326            # note this path refers to the internal CHM structure
327            if path[-1] != '/':
328                # and make paths relative
329                paths.append(path.lstrip('/'))
330        chmlib.chm_enumerate(self.file, chmlib.CHM_ENUMERATE_NORMAL, get_paths, None)
331        self._contents = paths
332        return self._contents
333
334    def _ensure_dir(self, path):
335        dir = os.path.dirname(path)
336        if not os.path.isdir(dir):
337            os.makedirs(dir)
338
339    def extract_content(self, output_dir=os.getcwd(), debug_dump=False):
340        self.ExtractFiles(output_dir=output_dir, debug_dump=debug_dump)
341