#!/usr/local/bin/python3.8
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai


__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import os
import re
import tempfile
from functools import partial
from urllib.parse import quote

from calibre.constants import isbsd, islinux
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.utils.filenames import ascii_filename
from calibre.utils.imghdr import what
from calibre.utils.localization import get_lang
from polyglot.builtins import as_unicode


def sanitize_file_name(x):
    '''
    Return a cleaned-up version of the file name ``x`` for use as an href.

    The name is first passed through ``ascii_filename()``, runs of whitespace
    are collapsed to a single space, and each of the characters ``?&=;#/\\``
    (as well as its percent-encoded form, matched case-insensitively) is
    replaced by an underscore. Surrounding whitespace and trailing dots are
    removed, and whitespace around the final dot is trimmed.

    A name that contains no dot is returned without adding one.
    '''
    ans = re.sub(r'\s+', ' ', ascii_filename(x))
    for ch in '?&=;#/\\':
        ans = ans.replace(ch, '_')
        # Also neutralise the percent-encoded spelling, e.g. %3F or %3f
        q = quote(ch, safe='')
        ans = re.sub(re.escape(q), '_', ans, flags=re.I)
    ans = ans.strip().rstrip('.')
    stem, sep, ext = ans.rpartition('.')
    if not sep:
        # rpartition() puts the whole string in the last slot when there is
        # no dot; re-joining with '.' would produce a bogus leading dot.
        return ext.strip()
    return (stem.strip() + '.' + ext.strip()).rstrip('.')


class HTMLInput(InputFormatPlugin):
    '''
    Input plugin that turns an HTML file (plus the local files it links to)
    or an OPF file into an OEB book.
    '''

    name = 'HTML Input'
    author = 'Kovid Goyal'
    description = _('Convert HTML and OPF files to an OEB')
    file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
    commit_name = 'html_input'

    options = {
        OptionRecommendation(name='breadth_first',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Traverse links in HTML files breadth first. Normally, '
                    'they are traversed depth first.'
                   )
        ),

        OptionRecommendation(name='max_levels',
            recommended_value=5, level=OptionRecommendation.LOW,
            help=_('Maximum levels of recursion when following links in '
                   'HTML files. Must be non-negative. 0 implies that no '
                   'links in the root HTML file are followed. Default is '
                   '%default.'
                   )
        ),

        OptionRecommendation(name='dont_package',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Normally this input plugin re-arranges all the input '
                'files into a standard folder hierarchy. Only use this option '
                'if you know what you are doing as it can result in various '
                'nasty side effects in the rest of the conversion pipeline.'
                )
        ),

    }

    def convert(self, stream, opts, file_ext, log,
                accelerators):
        '''
        Convert the input ``stream`` into an OEB book.

        For any extension other than ``opf`` the metadata is read from the
        HTML (merged over metadata derived from the file name, when the stream
        has a name) and the book is built by :meth:`create_oebbook`. For
        ``opf`` input the work is delegated to the plumber's
        ``create_oebbook``.

        :raises ValueError: if ``opts.dont_package`` is set for non-OPF input.
        '''
        # Reset the cached answer of is_case_sensitive() for this conversion
        self._is_case_sensitive = None
        basedir = os.getcwd()
        self.opts = opts

        fname = None
        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)
            fname = os.path.basename(stream.name)

        if file_ext != 'opf':
            if opts.dont_package:
                raise ValueError('The --dont-package option is not supported for an HTML input file')
            from calibre.ebooks.metadata.html import get_metadata
            mi = get_metadata(stream)
            if fname:
                from calibre.ebooks.metadata.meta import metadata_from_filename
                fmi = metadata_from_filename(fname)
                # Start from the file name metadata and update it with what
                # was read from the HTML itself
                fmi.smart_update(mi)
                mi = fmi
            oeb = self.create_oebbook(stream.name, basedir, opts, log, mi)
            return oeb

        from calibre.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, stream.name, opts,
                encoding=opts.input_encoding)

    def is_case_sensitive(self, path):
        '''
        Return whether the filesystem is treated as case sensitive.

        A previously computed answer is reused. If ``path`` is empty or does
        not exist, the answer defaults to ``islinux or isbsd`` and is not
        cached. Otherwise the filesystem is considered case insensitive only
        when both the lower-cased and the upper-cased ``path`` exist.
        '''
        if getattr(self, '_is_case_sensitive', None) is not None:
            return self._is_case_sensitive
        if not path or not os.path.exists(path):
            return islinux or isbsd
        self._is_case_sensitive = not (os.path.exists(path.lower()) and os.path.exists(path.upper()))
        return self._is_case_sensitive

    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
        '''
        Build an OEB book from the HTML file at ``htmlpath``.

        Fills in the book metadata from ``mi`` (with fallbacks for language,
        creator and title), adds every non-binary file returned by
        ``get_filelist()`` to the manifest and spine, rewrites links in those
        files via :meth:`resource_adder`, and generates a table of contents
        from the document titles (or headings when titles repeat).

        :return: the populated OEB book
        '''
        import css_parser
        import logging
        import uuid

        from calibre import guess_type
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.html.input import get_filelist
        from calibre.ebooks.metadata import string_to_authors
        from calibre.ebooks.oeb.base import (
            BINARY_MIME, OEB_STYLES, DirContainer, rewrite_links, urldefrag,
            urlnormalize, urlquote, xpath
        )
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        from calibre.utils.localization import canonicalize_lang
        css_parser.log.setLevel(logging.WARN)
        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log, None, opts, self,
                encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        # Metadata, with fallbacks for anything that is missing
        metadata = oeb.metadata
        meta_info_to_oeb_metadata(mi, metadata, log)
        if not metadata.language:
            lang = canonicalize_lang(getattr(opts, 'language', None))
            if not lang:
                oeb.logger.warn('Language not specified')
                lang = get_lang().replace('_', '-')
            metadata.add('language', lang)
        if not metadata.creator:
            a = getattr(opts, 'authors', None)
            if a:
                a = string_to_authors(a)
            if not a:
                oeb.logger.warn('Creator not specified')
                a = [self.oeb.translate(__('Unknown'))]
            for aut in a:
                metadata.add('creator', aut)
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate(__('Unknown')))
        bookid = str(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                # NOTE(review): this assigns the first identifier rather than
                # the matching `ident` -- confirm that is intended.
                self.oeb.uid = metadata.identifier[0]
                break

        # Add all the (non-binary) HTML files to the manifest and the spine
        filelist = get_filelist(htmlpath, basedir, opts, log)
        filelist = [f for f in filelist if not f.is_binary]
        htmlfile_map = {}
        for f in filelist:
            path = f.path
            oeb.container = DirContainer(os.path.dirname(path), log,
                    ignore_opf=True)
            bname = os.path.basename(path)
            item_id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname))
            htmlfile_map[path] = href
            item = oeb.manifest.add(item_id, href, 'text/html')
            if path == htmlpath and '%' in path:
                bname = urlquote(bname)
            item.html_input_href = bname
            oeb.spine.add(item, True)

        # Maps (case-normalized) local paths to the hrefs they were added as
        self.added_resources = {}
        self.log = log
        self.log('Normalizing filename cases')
        for path, href in htmlfile_map.items():
            if not self.is_case_sensitive(path):
                path = path.lower()
            self.added_resources[path] = href
        # Stash the helpers imported above for use by resource_adder()
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
        self.urldefrag = urldefrag
        self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME
        self.stylesheets_to_process = []

        self.log('Rewriting HTML links')
        for f in filelist:
            path = f.path
            dpath = os.path.dirname(path)
            oeb.container = DirContainer(dpath, log, ignore_opf=True)
            href = htmlfile_map[path]
            try:
                item = oeb.manifest.hrefs[href]
            except KeyError:
                item = oeb.manifest.hrefs[urlnormalize(href)]
            rewrite_links(item.data, partial(self.resource_adder, base=dpath))

        # resource_adder() queues every stylesheet it adds; rewriting URLs in
        # a sheet can queue further sheets, hence the loop until exhaustion.
        while self.stylesheets_to_process:
            sheet = self.stylesheets_to_process.pop()
            css_parser.replaceUrls(sheet.data, partial(self.resource_adder, base=sheet.html_input_dirpath))
        # Undo the temporary settings applied by resource_adder() and reparse
        for item in oeb.manifest:
            if item.media_type in self.OEB_STYLES:
                item.resolve_css_imports = True
                item.override_css_fetch = None
                item.reparse_css()

        # Auto-generate a table of contents
        toc = self.oeb.toc
        self.oeb.auto_generated_toc = True
        titles = []
        headers = []
        header_expr = '/h:html/h:body//h:%s[position()=1]/text()'
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = re.sub(r'\s+', ' ', title.strip())
            if title:
                titles.append(title)
            headers.append('(unlabled)')
            # Use the text of the first heading-like tag that has any
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                header = ''.join(xpath(html, header_expr % tag))
                header = re.sub(r'\s+', ' ', header.strip())
                if header:
                    headers[-1] = header
                    break
        # Prefer <title> text, unless some titles are repeated
        use = titles
        if len(titles) > len(set(titles)):
            use = headers
        # NOTE(review): `use` only has entries for linear items (and `titles`
        # skips files without a title), so this pairing can drift when the
        # spine contains untitled or non-linear items -- confirm.
        for title, item in zip(use, self.oeb.spine):
            if not item.linear:
                continue
            toc.add(title, item.href)

        oeb.container = DirContainer(os.getcwd(), oeb.log, ignore_opf=True)
        return oeb

    def link_to_local_path(self, link_, base=None):
        '''
        Resolve ``link_`` to a local filesystem path.

        :param link_: the link, as text or UTF-8 encoded bytes
        :param base: directory used to resolve the link, defaults to the
                     current working directory
        :return: ``(path, fragment)``, or ``(None, None)`` if the link cannot
                 be decoded or processed, or does not refer to a local file
        '''
        from calibre.ebooks.html.input import Link
        if not isinstance(link_, str):
            try:
                # 'strict' is the real name of the error handler; the bogus
                # name used previously only worked because the handler is
                # looked up lazily.
                link_ = link_.decode('utf-8', 'strict')
            except Exception:
                self.log.warn('Failed to decode link %r. Ignoring'%link_)
                return None, None
        try:
            l = Link(link_, base if base else os.getcwd())
        except Exception:
            self.log.exception('Failed to process link: %r'%link_)
            return None, None
        if l.path is None:
            # Not a local resource
            return None, None
        link = l.path.replace('/', os.sep).strip()
        frag = l.fragment
        if not link:
            return None, None
        return link, frag

    def resource_adder(self, link_, base=None):
        '''
        Link rewriting callback: add the local file that ``link_`` points to
        to the manifest (once) and return the href it was added under.

        :param link_: the link as found in the document
        :param base: directory against which relative links are resolved
        :return: the new href (with the original fragment re-attached),
                 ``link_`` unchanged when it is not a readable local file or
                 is a directory, or ``None`` for links to plain text files
        '''
        from polyglot.urllib import quote
        link, frag = self.link_to_local_path(link_, base=base)
        if link is None:
            return link_
        try:
            if base and not os.path.isabs(link):
                link = os.path.join(base, link)
            link = os.path.abspath(link)
        except Exception:
            return link_
        if not os.access(link, os.R_OK):
            return link_
        if os.path.isdir(link):
            self.log.warn(link_, 'is a link to a directory. Ignoring.')
            return link_
        if not self.is_case_sensitive(tempfile.gettempdir()):
            link = link.lower()
        if link not in self.added_resources:
            guessed = self.guess_type(os.path.basename(link))[0]
            media_type = guessed or self.BINARY_MIME
            is_stylesheet = media_type in self.OEB_STYLES
            bhref = os.path.basename(link)
            item_id, href = self.oeb.manifest.generate(id='added', href=sanitize_file_name(bhref))
            if media_type == 'text/plain':
                self.log.warn('Ignoring link to text file %r'%link_)
                return None
            if media_type == self.BINARY_MIME:
                # Check for the common case, images
                try:
                    img = what(link)
                except OSError:
                    pass
                else:
                    if img:
                        media_type = self.guess_type('dummy.'+img)[0] or self.BINARY_MIME

            self.oeb.log.debug('Added', link, 'with href:', href)
            self.oeb.container = self.DirContainer(os.path.dirname(link),
                    self.oeb.log, ignore_opf=True)
            # Load into memory
            item = self.oeb.manifest.add(item_id, href, media_type)
            # bhref refers to an already existing file. The read() method of
            # DirContainer will call unquote on it before trying to read the
            # file, therefore we quote it here.
            if isinstance(bhref, str):
                bhref = bhref.encode('utf-8')
            item.html_input_href = as_unicode(quote(bhref))
            if is_stylesheet:
                # Defer @import resolution; create_oebbook() rewrites the URLs
                # in queued sheets and then restores these settings.
                item.html_input_dirpath = os.path.dirname(link)
                item.resolve_css_imports = False
                item.override_css_fetch = lambda url: (None, '')
                self.stylesheets_to_process.append(item)
            # Attribute access for its side effect (see "Load into memory")
            item.data
            self.added_resources[link] = href

        nlink = self.added_resources[link]
        if frag:
            nlink = '#'.join((nlink, frag))
        return nlink