#!/usr/local/bin/python3.8


__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

'''
Fetch a webpage and its links recursively. The webpages are saved to disk in
UTF-8 encoding with any charset declarations removed.
'''


import os
import re
import socket
import sys
import threading
import time
import traceback
from urllib.request import urlopen

from calibre import browser, relpath, unicode_path
from calibre.constants import filesystem_encoding, iswindows
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.config import OptionParser
from calibre.utils.filenames import ascii_filename
from calibre.utils.img import image_from_data, image_to_data
from calibre.utils.imghdr import what
from calibre.utils.logging import Log
from calibre.web.fetch.utils import rescale_image
from polyglot.http_client import responses
from polyglot.urllib import (
    URLError, quote, url2pathname, urljoin, urlparse, urlsplit, urlunparse,
    urlunsplit
)


class AbortArticle(Exception):
    pass


class FetchError(Exception):
    pass


class closing:

    'Context to automatically close something at the end of a block.'

    def __init__(self, thing):
        self.thing = thing

    def __enter__(self):
        return self.thing

    def __exit__(self, *exc_info):
        try:
            self.thing.close()
        except Exception:
            pass


def canonicalize_url(url):
    # mechanize does not handle quoting automatically
    if re.search(r'\s+', url) is not None:
        purl = list(urlparse(url))
        for i in range(2, 6):
            purl[i] = quote(purl[i])
        url = urlunparse(purl)
    return url


bad_url_counter = 0


def basename(url):
    try:
        parts = urlsplit(url)
        path = url2pathname(parts.path)
        res = os.path.basename(path)
    except Exception:
        global bad_url_counter
        bad_url_counter += 1
        return 'bad_url_%d.html' % bad_url_counter
    if not os.path.splitext(res)[1]:
        return 'index.html'
    return res


def save_soup(soup, target):
    for meta in soup.findAll('meta', content=True):
        if 'charset' in meta['content'].lower():
            meta.extract()
    for meta in soup.findAll('meta', charset=True):
        meta.extract()
    head = soup.find('head')
    if head is not None:
        nm = soup.new_tag('meta', charset='utf-8')
        head.insert(0, nm)

    selfdir = os.path.dirname(target)

    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

    html = str(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))


class response(bytes):

    def __new__(cls, *args):
        obj = super().__new__(cls, *args)
        obj.newurl = None
        return obj


def default_is_link_wanted(url, tag):
    raise NotImplementedError()


class RecursiveFetcher:
    LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                ('.exe\\s*$', '.mp3\\s*$', '.ogg\\s*$', '^\\s*mailto:', '^\\s*$'))
    # ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in
    #                        (
    #
    #                        )
    #                        )
    CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
    default_timeout = socket.getdefaulttimeout()  # Needed here as it is used in __del__
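
    # The fetcher mirrors a page and everything it links to under options.dir.
    # Most behaviour is pulled off the options object below; the hooks
    # (preprocess_html, postprocess_html, skip_ad_pages, is_link_wanted, ...)
    # are the customisation points that caller code can supply.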
    def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
        bd = options.dir
        if not isinstance(bd, str):
            bd = bd.decode(filesystem_encoding)

        self.base_dir = os.path.abspath(os.path.expanduser(bd))
        if not os.path.exists(self.base_dir):
            os.makedirs(self.base_dir)
        self.log = log
        self.verbose = options.verbose
        self.timeout = options.timeout
        self.encoding = options.encoding
        self.browser = options.browser if hasattr(options, 'browser') else browser()
        self.max_recursions = options.max_recursions
        self.match_regexps = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
        self.filter_regexps = [re.compile(i, re.IGNORECASE) for i in options.filter_regexps]
        self.max_files = options.max_files
        self.delay = options.delay
        self.last_fetch_at = 0.
        self.filemap = {}
        self.imagemap = image_map
        self.imagemap_lock = threading.RLock()
        self.stylemap = css_map
        self.image_url_processor = None
        self.stylemap_lock = threading.RLock()
        self.downloaded_paths = []
        self.current_dir = self.base_dir
        self.files = 0
        self.preprocess_regexps = getattr(options, 'preprocess_regexps', [])
        self.remove_tags = getattr(options, 'remove_tags', [])
        self.remove_tags_after = getattr(options, 'remove_tags_after', None)
        self.remove_tags_before = getattr(options, 'remove_tags_before', None)
        self.keep_only_tags = getattr(options, 'keep_only_tags', [])
        self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
        self.preprocess_raw_html = getattr(options, 'preprocess_raw_html',
                lambda raw, url: raw)
        self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None)
        self.postprocess_html_ext = getattr(options, 'postprocess_html', None)
        self.preprocess_image_ext = getattr(options, 'preprocess_image', None)
        self._is_link_wanted = getattr(options, 'is_link_wanted',
                default_is_link_wanted)
        self.compress_news_images_max_size = getattr(options, 'compress_news_images_max_size', None)
        self.compress_news_images = getattr(options, 'compress_news_images', False)
        self.compress_news_images_auto_size = getattr(options, 'compress_news_images_auto_size', 16)
        self.scale_news_images = getattr(options, 'scale_news_images', None)
        self.download_stylesheets = not options.no_stylesheets
        self.show_progress = True
        self.failed_links = []
        self.job_info = job_info

    def get_soup(self, src, url=None):
        nmassage = []
        nmassage.extend(self.preprocess_regexps)
        # Remove comments, as extracting tags can otherwise leave detritus in
        # the form of multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        for pat, repl in nmassage:
            usrc = pat.sub(repl, usrc)
        soup = BeautifulSoup(usrc)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
            for pat, repl in nmassage:
                replace = pat.sub(repl, replace)
            soup = BeautifulSoup(replace)

        if self.keep_only_tags:
            body = soup.new_tag('body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass
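
        # remove_beyond() implements remove_tags_after/remove_tags_before: from
        # a matched tag it extracts every sibling in the given direction
        # ('nextSibling' or 'previousSibling'), then climbs to the parent and
        # repeats until it reaches <body>.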
        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)

    def fetch_url(self, url):
        data = None
        self.log.debug('Fetching', url)
        st = time.monotonic()

        # Check for a URL pointing to the local filesystem and special case it
        # for efficiency and robustness. Bypasses delay checking as it does not
        # apply to local fetches. Ensures that unicode paths that are not
        # representable in the filesystem_encoding work.
        is_local = 0
        if url.startswith('file://'):
            is_local = 7
        elif url.startswith('file:'):
            is_local = 5
        if is_local > 0:
            url = url[is_local:]
            if iswindows and url.startswith('/'):
                url = url[1:]
            with open(url, 'rb') as f:
                data = response(f.read())
                data.newurl = 'file:'+url  # This is what mechanize does for local URLs
            self.log.debug('Fetched %s in %.1f seconds' % (url, time.monotonic() - st))
            return data

        delta = time.monotonic() - self.last_fetch_at
        if delta < self.delay:
            time.sleep(self.delay - delta)
        url = canonicalize_url(url)
        open_func = getattr(self.browser, 'open_novisit', self.browser.open)
        try:
            with closing(open_func(url, timeout=self.timeout)) as f:
                data = response(f.read()+f.read())
                data.newurl = f.geturl()
        except URLError as err:
            if hasattr(err, 'code') and err.code in responses:
                raise FetchError(responses[err.code])
            is_temp = False
            reason = getattr(err, 'reason', None)
            if isinstance(reason, socket.gaierror):
                # see man gai_strerror() for details
                if getattr(reason, 'errno', None) in (socket.EAI_AGAIN, socket.EAI_NONAME):
                    is_temp = True
            if is_temp:  # Connection reset by peer or Name or service not known
                self.log.debug('Temporary error, retrying in 1 second')
                time.sleep(1)
                with closing(open_func(url, timeout=self.timeout)) as f:
                    data = response(f.read()+f.read())
                    data.newurl = f.geturl()
            else:
                raise err
        finally:
            self.last_fetch_at = time.monotonic()
        self.log.debug('Fetched %s in %f seconds' % (url, time.monotonic() - st))
        return data

    def start_fetch(self, url):
        soup = BeautifulSoup('<a href="'+url+'" />')
        res = self.process_links(soup, url, 0, into_dir='')
        self.log.debug(url, 'saved to', res)
        return res

    def is_link_ok(self, url):
        for i in self.__class__.LINK_FILTER:
            if i.search(url):
                return False
        return True

    def is_link_wanted(self, url, tag):
        try:
            return self._is_link_wanted(url, tag)
        except NotImplementedError:
            pass
        except Exception:
            return False
        if self.filter_regexps:
            for f in self.filter_regexps:
                if f.search(url):
                    return False
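        # match_regexps act as a whitelist: when any are given, a link must
        # match at least one of them to be followed. filter_regexps were
        # checked first, matching the documented behaviour of --filter-regexp.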
        if self.match_regexps:
            for m in self.match_regexps:
                if m.search(url):
                    return True
            return False
        return True

    def process_stylesheets(self, soup, baseurl):
        diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
            try:
                mtype = tag['type']
            except KeyError:
                mtype = 'text/css' if tag.name.lower() == 'style' else ''
            if mtype.lower() != 'text/css':
                continue
            if tag.has_attr('href'):
                iurl = tag['href']
                if not urlsplit(iurl).scheme:
                    iurl = urljoin(baseurl, iurl, False)
                found_cached = False
                with self.stylemap_lock:
                    if iurl in self.stylemap:
                        tag['href'] = self.stylemap[iurl]
                        found_cached = True
                if found_cached:
                    continue
                try:
                    data = self.fetch_url(iurl)
                except Exception:
                    self.log.exception('Could not fetch stylesheet ', iurl)
                    continue
                stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                with self.stylemap_lock:
                    self.stylemap[iurl] = stylepath
                with open(stylepath, 'wb') as x:
                    x.write(data)
                tag['href'] = stylepath
            else:
                for ns in tag.findAll(text=True):
                    src = str(ns)
                    m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                    if m:
                        iurl = m.group(1)
                        if not urlsplit(iurl).scheme:
                            iurl = urljoin(baseurl, iurl, False)
                        found_cached = False
                        with self.stylemap_lock:
                            if iurl in self.stylemap:
                                ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                                found_cached = True
                        if found_cached:
                            continue
                        try:
                            data = self.fetch_url(iurl)
                        except Exception:
                            self.log.exception('Could not fetch stylesheet ', iurl)
                            continue
                        c += 1
                        stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                        with self.stylemap_lock:
                            self.stylemap[iurl] = stylepath
                        with open(stylepath, 'wb') as x:
                            x.write(data)
                        ns.replaceWith(src.replace(m.group(1), stylepath))

    def rescale_image(self, data):
        return rescale_image(data, self.scale_news_images, self.compress_news_images_max_size, self.compress_news_images_auto_size)

    def process_images(self, soup, baseurl):
        diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        c = 0
        for tag in soup.findAll('img', src=True):
            iurl = tag['src']
            if iurl.startswith('data:'):
                try:
                    data = urlopen(iurl).read()
                except Exception:
                    self.log.exception('Failed to decode embedded image')
                    continue
            else:
                if callable(self.image_url_processor):
                    iurl = self.image_url_processor(baseurl, iurl)
                if not urlsplit(iurl).scheme:
                    iurl = urljoin(baseurl, iurl, False)
                found_in_cache = False
                with self.imagemap_lock:
                    if iurl in self.imagemap:
                        tag['src'] = self.imagemap[iurl]
                        found_in_cache = True
                if found_in_cache:
                    continue
                try:
                    data = self.fetch_url(iurl)
                    if data == b'GIF89a\x01':
                        # Skip empty GIF files as PIL errors on them anyway
                        continue
                except Exception:
                    self.log.exception('Could not fetch image ', iurl)
                    continue
            c += 1
            fname = ascii_filename('img'+str(c))
            data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
            if data is None:
                continue
            itype = what(None, data)
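            # what() does not always identify SVG (there is no fixed binary
            # signature to test), so additionally sniff the start of the
            # payload for an <svg> tag.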
            if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
                # SVG image
                imgpath = os.path.join(diskpath, fname+'.svg')
                with self.imagemap_lock:
                    self.imagemap[iurl] = imgpath
                with open(imgpath, 'wb') as x:
                    x.write(data)
                tag['src'] = imgpath
            else:
                try:
                    # Ensure image is valid
                    img = image_from_data(data)
                    if itype not in {'png', 'jpg', 'jpeg'}:
                        itype = 'png' if itype == 'gif' else 'jpeg'
                        data = image_to_data(img, fmt=itype)
                    if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                        try:
                            data = self.rescale_image(data)
                        except Exception:
                            self.log.exception('failed to compress image '+iurl)
                    # Moon+ apparently cannot handle .jpeg files
                    if itype == 'jpeg':
                        itype = 'jpg'
                    imgpath = os.path.join(diskpath, fname+'.'+itype)
                    with self.imagemap_lock:
                        self.imagemap[iurl] = imgpath
                    with open(imgpath, 'wb') as x:
                        x.write(data)
                    tag['src'] = imgpath
                except Exception:
                    traceback.print_exc()
                    continue

    def absurl(self, baseurl, tag, key, filter=True):
        iurl = tag[key]
        parts = urlsplit(iurl)
        if not parts.netloc and not parts.path and not parts.query:
            return None
        if not parts.scheme:
            iurl = urljoin(baseurl, iurl, False)
        if not self.is_link_ok(iurl):
            self.log.debug('Skipping invalid link:', iurl)
            return None
        if filter and not self.is_link_wanted(iurl, tag):
            self.log.debug('Filtered link: '+iurl)
            return None
        return iurl

    def normurl(self, url):
        parts = list(urlsplit(url))
        parts[4] = ''
        return urlunsplit(parts)

    def localize_link(self, tag, key, path):
        parts = urlsplit(tag[key])
        suffix = ('#'+parts.fragment) if parts.fragment else ''
        tag[key] = path+suffix

    def process_return_links(self, soup, baseurl):
        for tag in soup.findAll('a', href=True):
            iurl = self.absurl(baseurl, tag, 'href')
            if not iurl:
                continue
            nurl = self.normurl(iurl)
            if nurl in self.filemap:
                self.localize_link(tag, 'href', self.filemap[nurl])

    def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
        res = ''
        diskpath = os.path.join(self.current_dir, into_dir)
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        prev_dir = self.current_dir
        try:
            self.current_dir = diskpath
            tags = list(soup.findAll('a', href=True))

            for c, tag in enumerate(tags):
                if self.show_progress:
                    print('.', end=' ')
                    sys.stdout.flush()
                iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0)
                if not iurl:
                    continue
                nurl = self.normurl(iurl)
                if nurl in self.filemap:
                    self.localize_link(tag, 'href', self.filemap[nurl])
                    continue
                if self.files > self.max_files:
                    return res
                linkdir = 'link'+str(c) if into_dir else ''
                linkdiskpath = os.path.join(diskpath, linkdir)
                if not os.path.exists(linkdiskpath):
                    os.mkdir(linkdiskpath)
                try:
                    self.current_dir = linkdiskpath
                    dsrc = self.fetch_url(iurl)
                    newbaseurl = dsrc.newurl
                    if len(dsrc) == 0 or \
                       len(re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0:
                        raise ValueError('No content at URL %r' % iurl)
                    if callable(self.encoding):
                        dsrc = self.encoding(dsrc)
                    elif self.encoding is not None:
                        dsrc = dsrc.decode(self.encoding, 'replace')
                    else:
                        dsrc = xml_to_unicode(dsrc, self.verbose)[0]

                    st = time.monotonic()
                    soup = self.get_soup(dsrc, url=iurl)
                    self.log.debug('Parsed %s in %.1f seconds' % (iurl, time.monotonic() - st))
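
                    # A <base href="..."> tag, if present, overrides the URL
                    # against which the page's relative links are resolved, so
                    # honour it before downloading images and stylesheets.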
                    base = soup.find('base', href=True)
                    if base is not None:
                        newbaseurl = base['href']
                    self.log.debug('Processing images...')
                    self.process_images(soup, newbaseurl)
                    if self.download_stylesheets:
                        self.process_stylesheets(soup, newbaseurl)

                    _fname = basename(iurl)
                    if not isinstance(_fname, str):
                        _fname = _fname.decode('latin1', 'replace')
                    _fname = _fname.replace('%', '').replace(os.sep, '')
                    _fname = ascii_filename(_fname)
                    _fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
                    res = os.path.join(linkdiskpath, _fname)
                    self.downloaded_paths.append(res)
                    self.filemap[nurl] = res
                    if recursion_level < self.max_recursions:
                        self.log.debug('Processing links...')
                        self.process_links(soup, newbaseurl, recursion_level+1)
                    else:
                        self.process_return_links(soup, newbaseurl)
                        self.log.debug('Recursion limit reached. Skipping links in', iurl)

                    if newbaseurl and not newbaseurl.startswith('/'):
                        for atag in soup.findAll('a', href=lambda x: x and x.startswith('/')):
                            atag['href'] = urljoin(newbaseurl, atag['href'], True)
                    if callable(self.postprocess_html_ext):
                        soup = self.postprocess_html_ext(soup,
                                c == 0 and recursion_level == 0 and not getattr(self, 'called_first', False),
                                self.job_info)

                    if c == 0 and recursion_level == 0:
                        self.called_first = True

                    save_soup(soup, res)
                    self.localize_link(tag, 'href', res)
                except Exception as err:
                    if isinstance(err, AbortArticle):
                        raise
                    self.failed_links.append((iurl, traceback.format_exc()))
                    self.log.exception('Could not fetch link', iurl)
                finally:
                    self.current_dir = diskpath
                    self.files += 1
        finally:
            self.current_dir = prev_dir
            if self.show_progress:
                print()
        return res


def option_parser(usage=_('%prog URL\n\nWhere URL is for example https://google.com')):
    parser = OptionParser(usage=usage)
    parser.add_option('-d', '--base-dir',
                      help=_('Base folder into which URL is saved. Default is %default'),
                      default='.', type='string', dest='dir')
    parser.add_option('-t', '--timeout',
                      help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),
                      default=10.0, type='float', dest='timeout')
    parser.add_option('-r', '--max-recursions', default=1,
                      help=_('Maximum number of levels to recurse i.e. depth of links to follow. Default %default'),
                      type='int', dest='max_recursions')
    parser.add_option('-n', '--max-files', default=sys.maxsize, type='int', dest='max_files',
                      help=_('The maximum number of files to download. This only applies to files from <a href> tags. Default is %default'))
    parser.add_option('--delay', default=0, dest='delay', type='float',
                      help=_('Minimum interval in seconds between consecutive fetches. Default is %default s'))
    parser.add_option('--encoding', default=None,
                      help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.'))
    parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
                      help=_('Only links that match this regular expression will be followed. '
                             'This option can be specified multiple times, in which case as long '
                             'as a link matches any one regexp, it will be followed. By default all '
                             'links are followed.'))
    parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
                      help=_('Any link that matches this regular expression will be ignored.'
                             ' This option can be specified multiple times, in which case as'
                             ' long as any regexp matches a link, it will be ignored. By'
                             ' default, no links are ignored. If both filter regexp and match'
                             ' regexp are specified, then filter regexp is applied first.'))
    parser.add_option('--dont-download-stylesheets', action='store_true', default=False,
                      help=_('Do not download CSS stylesheets.'), dest='no_stylesheets')
    parser.add_option('--verbose', help=_('Show detailed output information. Useful for debugging'),
                      default=False, action='store_true', dest='verbose')
    return parser


def create_fetcher(options, image_map={}, log=None):
    if log is None:
        log = Log(level=Log.DEBUG) if options.verbose else Log()
    return RecursiveFetcher(options, log, image_map=image_map)


def main(args=sys.argv):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        return 1

    fetcher = create_fetcher(options)
    fetcher.start_fetch(args[1])
    return 0


if __name__ == '__main__':
    sys.exit(main())
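
# A minimal programmatic usage sketch (illustrative only: the import path,
# URL and output folder below are assumptions, not part of this module):
#
#   from calibre.web.fetch.simple import option_parser, create_fetcher
#   opts, leftover = option_parser().parse_args(['-d', '/tmp/mirror', 'https://example.com'])
#   fetcher = create_fetcher(opts)
#   index = fetcher.start_fetch(leftover[0])  # path of the saved top-level page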