1#!/usr/local/bin/python3.8
2# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
3
4
5__license__   = 'GPL v3'
6__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
7__docformat__ = 'restructuredtext en'
8
9import os
10import re
11import tempfile
12from functools import partial
13from urllib.parse import quote
14
15from calibre.constants import isbsd, islinux
16from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
17from calibre.utils.filenames import ascii_filename
18from calibre.utils.imghdr import what
19from calibre.utils.localization import get_lang
20from polyglot.builtins import as_unicode
21
22
def sanitize_file_name(x):
    """Return a version of the file name *x* that is safe to use as an href.

    The name is first passed through ``ascii_filename``, runs of whitespace
    are collapsed to a single space, and the characters ``? & = ; # / \\``
    (as well as their percent-encoded forms, matched case-insensitively) are
    replaced by underscores. Whitespace around the name and around the
    extension separator is removed, as are trailing dots.

    A name without any ``.`` is returned without an extension; it does not
    gain a leading dot.
    """
    ans = re.sub(r'\s+', ' ', ascii_filename(x))
    for ch in '?&=;#/\\':
        ans = ans.replace(ch, '_')
        # Also neutralise the percent-encoded spelling, e.g. %3F or %3f
        ans = re.sub(re.escape(quote(ch, safe='')), '_', ans, flags=re.I)
    ans = ans.strip().rstrip('.')
    stem, dot, ext = ans.rpartition('.')
    if not dot:
        # rpartition() puts the whole string in the last slot when there is
        # no separator; re-joining with '.' would turn 'name' into '.name'.
        return ans.strip()
    return (stem.strip() + '.' + ext.strip()).rstrip('.')
32
33
class HTMLInput(InputFormatPlugin):
    """Input plugin that converts HTML and OPF files to an OEB book."""

    name        = 'HTML Input'
    author      = 'Kovid Goyal'
    description = _('Convert HTML and OPF files to an OEB')
    file_types  = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
    commit_name = 'html_input'

    # Conversion options contributed by this plugin
    options = {
        OptionRecommendation(name='breadth_first',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Traverse links in HTML files breadth first. Normally, '
                    'they are traversed depth first.'
                   )
        ),

        OptionRecommendation(name='max_levels',
            recommended_value=5, level=OptionRecommendation.LOW,
            help=_('Maximum levels of recursion when following links in '
                   'HTML files. Must be non-negative. 0 implies that no '
                   'links in the root HTML file are followed. Default is '
                   '%default.'
                   )
        ),

        OptionRecommendation(name='dont_package',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Normally this input plugin re-arranges all the input '
                'files into a standard folder hierarchy. Only use this option '
                'if you know what you are doing as it can result in various '
                'nasty side effects in the rest of the conversion pipeline.'
                )
        ),

    }

    def convert(self, stream, opts, file_ext, log,
                accelerators):
        """Convert the input file into an OEB book and return it.

        :param stream: Open input file. Its ``name`` attribute supplies the
            path of the file to convert.
        :param opts: Conversion options.
        :param file_ext: Extension of the input file. ``'opf'`` is handed
            directly to the plumber's ``create_oebbook``; anything else is
            treated as HTML.
        :param log: Logger used for the conversion.
        :param accelerators: Not used by this plugin.
        :raises ValueError: If ``opts.dont_package`` is set for a non-OPF
            input file.
        """
        # Reset the cached file system case sensitivity for this run
        self._is_case_sensitive = None
        basedir = os.getcwd()
        self.opts = opts

        fname = None
        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)
            fname = os.path.basename(stream.name)

        if file_ext != 'opf':
            if opts.dont_package:
                raise ValueError('The --dont-package option is not supported for an HTML input file')
            from calibre.ebooks.metadata.html import get_metadata
            mi = get_metadata(stream)
            if fname:
                # Start from the metadata derived from the file name and let
                # the metadata read from the HTML itself update it.
                from calibre.ebooks.metadata.meta import metadata_from_filename
                fmi = metadata_from_filename(fname)
                fmi.smart_update(mi)
                mi = fmi
            oeb = self.create_oebbook(stream.name, basedir, opts, log, mi)
            return oeb

        from calibre.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, stream.name, opts,
                encoding=opts.input_encoding)

    def is_case_sensitive(self, path):
        """Return whether the file system appears to be case sensitive.

        The answer is probed using *path*: if both its lower-cased and its
        upper-cased spelling exist, the file system is considered case
        insensitive. The probed result is cached on the instance. When
        *path* is empty or does not exist, nothing is cached and the answer
        falls back to the platform (``islinux or isbsd``).
        """
        if getattr(self, '_is_case_sensitive', None) is not None:
            return self._is_case_sensitive
        if not path or not os.path.exists(path):
            return islinux or isbsd
        self._is_case_sensitive = not (os.path.exists(path.lower()) and os.path.exists(path.upper()))
        return self._is_case_sensitive

    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
        """Build an OEB book from *htmlpath* and the local files it links to.

        :param htmlpath: Path of the root HTML file.
        :param basedir: Directory passed to ``get_filelist`` together with
            *htmlpath*.
        :param opts: Conversion options.
        :param log: Logger used for the conversion.
        :param mi: Metadata used to populate the OEB metadata.
        :return: The populated OEB book. It is also stored as ``self.oeb``.
        """
        import css_parser
        import logging
        import uuid

        from calibre import guess_type
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.html.input import get_filelist
        from calibre.ebooks.metadata import string_to_authors
        from calibre.ebooks.oeb.base import (
            BINARY_MIME, OEB_STYLES, DirContainer, rewrite_links, urldefrag,
            urlnormalize, urlquote, xpath
        )
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        from calibre.utils.localization import canonicalize_lang
        css_parser.log.setLevel(logging.WARN)
        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log, None, opts, self,
                encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        # Fill in the metadata, supplying defaults for missing fields
        metadata = oeb.metadata
        meta_info_to_oeb_metadata(mi, metadata, log)
        if not metadata.language:
            lang = canonicalize_lang(getattr(opts, 'language', None))
            if not lang:
                oeb.logger.warn('Language not specified')
                lang = get_lang().replace('_', '-')
            metadata.add('language', lang)
        if not metadata.creator:
            authors = getattr(opts, 'authors', None)
            if authors:
                authors = string_to_authors(authors)
            if not authors:
                oeb.logger.warn('Creator not specified')
                authors = [self.oeb.translate(__('Unknown'))]
            for aut in authors:
                metadata.add('creator', aut)
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate(__('Unknown')))
        bookid = str(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                # Use the identifier that actually carries an id attribute,
                # not simply the first identifier in the list.
                self.oeb.uid = ident
                break

        # Add every non-binary file to the manifest and the spine
        filelist = get_filelist(htmlpath, basedir, opts, log)
        filelist = [f for f in filelist if not f.is_binary]
        htmlfile_map = {}
        for f in filelist:
            path = f.path
            oeb.container = DirContainer(os.path.dirname(path), log,
                    ignore_opf=True)
            bname = os.path.basename(path)
            item_id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname))
            htmlfile_map[path] = href
            item = oeb.manifest.add(item_id, href, 'text/html')
            if path == htmlpath and '%' in path:
                bname = urlquote(bname)
            # Name of the file on disk, as opposed to the manifest href
            item.html_input_href = bname
            oeb.spine.add(item, True)

        # Maps the path of every file added so far to its manifest href
        self.added_resources = {}
        self.log = log
        self.log('Normalizing filename cases')
        for path, href in htmlfile_map.items():
            if not self.is_case_sensitive(path):
                path = path.lower()
            self.added_resources[path] = href
        # Stash the lazily imported helpers for use by resource_adder()
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
        self.urldefrag = urldefrag
        self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME
        self.stylesheets_to_process = []

        self.log('Rewriting HTML links')
        for f in filelist:
            path = f.path
            dpath = os.path.dirname(path)
            oeb.container = DirContainer(dpath, log, ignore_opf=True)
            href = htmlfile_map[path]
            try:
                item = oeb.manifest.hrefs[href]
            except KeyError:
                item = oeb.manifest.hrefs[urlnormalize(href)]
            rewrite_links(item.data, partial(self.resource_adder, base=dpath))

        # Rewriting the URLs of a stylesheet can queue further stylesheets,
        # so keep going until the queue is drained.
        while self.stylesheets_to_process:
            sheet = self.stylesheets_to_process.pop()
            css_parser.replaceUrls(sheet.data, partial(self.resource_adder, base=sheet.html_input_dirpath))
        for item in oeb.manifest:
            if item.media_type in self.OEB_STYLES:
                item.resolve_css_imports = True
                item.override_css_fetch = None
                item.reparse_css()

        # Auto-generate a Table of Contents with one entry per linear spine
        # item. Title and header are stored together with their item so that
        # a document without a <title> cannot shift the labels of the
        # documents that follow it.
        toc = self.oeb.toc
        self.oeb.auto_generated_toc = True
        entries = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = re.sub(r'\s+', ' ', title.strip())
            header = '(unlabled)'
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                text = ''.join(xpath(html, expr % tag))
                text = re.sub(r'\s+', ' ', text.strip())
                if text:
                    header = text
                    break
            entries.append((item, title, header))

        # Titles are preferred; when they are not unique the headers are used
        # for every entry instead.
        titles = [entry[1] for entry in entries if entry[1]]
        use_headers = len(titles) > len(set(titles))
        for item, title, header in entries:
            label = header if use_headers or not title else title
            toc.add(label, item.href)

        oeb.container = DirContainer(os.getcwd(), oeb.log, ignore_opf=True)
        return oeb

    def link_to_local_path(self, link_, base=None):
        """Resolve *link_* to a local file system path.

        :param link_: The link, as text or as UTF-8 encoded bytes.
        :param base: Base passed to ``Link``; the current working directory
            is used when it is not given.
        :return: A ``(path, fragment)`` tuple. Both are ``None`` when the
            link cannot be decoded or processed, is not a local resource or
            has an empty path.
        """
        from calibre.ebooks.html.input import Link
        if not isinstance(link_, str):
            try:
                link_ = link_.decode('utf-8', 'strict')
            except Exception:
                self.log.warn('Failed to decode link %r. Ignoring'%link_)
                return None, None
        try:
            parsed = Link(link_, base if base else os.getcwd())
        except Exception:
            self.log.exception('Failed to process link: %r'%link_)
            return None, None
        if parsed.path is None:
            # Not a local resource
            return None, None
        link = parsed.path.replace('/', os.sep).strip()
        frag = parsed.fragment
        if not link:
            return None, None
        return link, frag

    def resource_adder(self, link_, base=None):
        """Add the local file *link_* points to to the book, returning its new href.

        Used as the link rewriting callback for both HTML files and
        stylesheets. Every file is added to the manifest only once; later
        links to it reuse the href recorded in ``self.added_resources``.

        :param link_: The link as found in the document.
        :param base: Directory against which relative links are resolved.
        :return: The manifest href of the resource (with the original
            fragment re-attached), *link_* unchanged when it does not point
            to a readable local file, or ``None`` for a link to a plain text
            file.
        """
        from polyglot.urllib import quote
        link, frag = self.link_to_local_path(link_, base=base)
        if link is None:
            return link_
        try:
            if base and not os.path.isabs(link):
                link = os.path.join(base, link)
            link = os.path.abspath(link)
        except Exception:
            return link_
        if not os.access(link, os.R_OK):
            return link_
        if os.path.isdir(link):
            self.log.warn(link_, 'is a link to a directory. Ignoring.')
            return link_
        if not self.is_case_sensitive(tempfile.gettempdir()):
            link = link.lower()
        if link not in self.added_resources:
            guessed = self.guess_type(os.path.basename(link))[0]
            media_type = guessed or self.BINARY_MIME
            is_stylesheet = media_type in self.OEB_STYLES
            bhref = os.path.basename(link)
            item_id, href = self.oeb.manifest.generate(id='added', href=sanitize_file_name(bhref))
            if media_type == 'text/plain':
                self.log.warn('Ignoring link to text file %r'%link_)
                return None
            if media_type == self.BINARY_MIME:
                # Check for the common case, images
                try:
                    img = what(link)
                except OSError:
                    pass
                else:
                    if img:
                        media_type = self.guess_type('dummy.'+img)[0] or self.BINARY_MIME

            self.oeb.log.debug('Added', link, 'with href:', href)
            self.oeb.container = self.DirContainer(os.path.dirname(link),
                    self.oeb.log, ignore_opf=True)
            # Load into memory
            item = self.oeb.manifest.add(item_id, href, media_type)
            # bhref refers to an already existing file. The read() method of
            # DirContainer will call unquote on it before trying to read the
            # file, therefore we quote it here.
            if isinstance(bhref, str):
                bhref = bhref.encode('utf-8')
            item.html_input_href = as_unicode(quote(bhref))
            if is_stylesheet:
                # Queue the stylesheet so that create_oebbook() rewrites the
                # URLs inside it as well; its imports are not resolved until
                # that has happened.
                item.html_input_dirpath = os.path.dirname(link)
                item.resolve_css_imports = False
                item.override_css_fetch = lambda url: (None, '')
                self.stylesheets_to_process.append(item)
            # Accessing the attribute loads the data while the container
            # still points at the directory of this file.
            item.data
            self.added_resources[link] = href

        nlink = self.added_resources[link]
        if frag:
            nlink = '#'.join((nlink, frag))
        return nlink
316