4__license__   = 'GPL v3'
5__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
8Fetch a webpage and its links recursively. The webpages are saved to disk in
9UTF-8 encoding with any charset declarations removed.
13import os
14import re
15import socket
16import sys
17import threading
18import time
19import traceback
20from urllib.request import urlopen
22from calibre import browser, relpath, unicode_path
23from calibre.constants import filesystem_encoding, iswindows
24from calibre.ebooks.BeautifulSoup import BeautifulSoup
25from calibre.ebooks.chardet import xml_to_unicode
26from calibre.utils.config import OptionParser
27from calibre.utils.filenames import ascii_filename
28from calibre.utils.img import image_from_data, image_to_data
29from calibre.utils.imghdr import what
30from calibre.utils.logging import Log
31from calibre.web.fetch.utils import rescale_image
32from polyglot.http_client import responses
33from polyglot.urllib import (
34    URLError, quote, url2pathname, urljoin, urlparse, urlsplit, urlunparse,
35    urlunsplit
class AbortArticle(Exception):
    '''
    Signals that processing of the current article must be abandoned.

    RecursiveFetcher.process_links re-raises this exception instead of
    recording it as a failed link.
    '''
class FetchError(Exception):
    '''
    Raised by RecursiveFetcher.fetch_url when a URLError carries a status
    code that is present in the `responses` table; the exception message is
    the table entry for that code.
    '''
class closing:
    '''
    Context manager that hands back the wrapped object and calls its
    close() method when the block ends.

    Any Exception raised by close() is deliberately discarded; exceptions
    raised inside the with-block itself are not suppressed.
    '''

    def __init__(self, thing):
        # The object whose close() is called on exit
        self.thing = thing

    def __enter__(self):
        return self.thing

    def __exit__(self, *exc_info):
        # Best-effort cleanup: a failing close() must not mask the outcome
        # of the block, so errors from it are swallowed.
        try:
            self.thing.close()
        except Exception:
            return None
def canonicalize_url(url):
    '''
    Return `url` with its path, params, query and fragment percent-quoted
    when the URL contains any whitespace; otherwise return it unchanged.
    '''
    # mechanize does not handle quoting automatically
    if re.search(r'\s+', url) is None:
        return url
    pieces = list(urlparse(url))
    # Indices 2..5 of the parsed URL are path, params, query and fragment;
    # scheme and netloc are left alone.
    pieces[2:6] = [quote(piece) for piece in pieces[2:6]]
    return urlunparse(pieces)
# Number of URLs for which basename() had to invent a file name
bad_url_counter = 0


def basename(url):
    '''
    Return a file name derived from the last path component of `url`.

    If the URL cannot be parsed, a unique placeholder of the form
    ``bad_url_<n>.html`` is returned and the module-level counter is
    advanced. If the last component has no extension, ``index.html`` is
    returned.
    '''
    global bad_url_counter
    try:
        parts = urlsplit(url)
        path = url2pathname(parts.path)
        res = os.path.basename(path)
    except Exception:
        # Only ordinary errors are turned into a placeholder name;
        # KeyboardInterrupt and SystemExit must propagate.
        bad_url_counter += 1
        return 'bad_url_%d.html'%bad_url_counter
    if not os.path.splitext(res)[1]:
        return 'index.html'
    return res
def save_soup(soup, target):
    '''
    Serialize `soup` to the file `target` as UTF-8.

    Existing charset declarations are removed and, when the document has a
    <head>, a single ``<meta charset="utf-8">`` is inserted at its start.
    src/href attributes of img, link and a tags that point at existing
    files via absolute paths are rewritten relative to the folder
    containing `target`, using forward slashes.
    '''
    # <meta content="...charset=..."> style declarations
    for meta in soup.findAll('meta', content=True):
        declares_charset = 'charset' in meta['content'].lower()
        if declares_charset:
            meta.extract()
    # <meta charset="..."> style declarations
    for meta in soup.findAll('meta', charset=True):
        meta.extract()

    head = soup.find('head')
    if head is not None:
        head.insert(0, soup.new_tag('meta', charset='utf-8'))

    selfdir = os.path.dirname(target)

    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            if not path:
                continue
            if os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                relative = relpath(path, selfdir).replace(os.sep, '/')
                tag[key] = unicode_path(relative)

    markup = str(soup)
    with open(target, 'wb') as f:
        f.write(markup.encode('utf-8'))
class response(bytes):
    '''
    bytes subclass holding downloaded data together with a `newurl`
    attribute, which starts out as None and is filled in by the code that
    performed the fetch.
    '''

    def __new__(cls, *args):
        instance = super().__new__(cls, *args)
        instance.newurl = None
        return instance
def default_is_link_wanted(url, tag):
    '''
    Placeholder link predicate that always raises NotImplementedError.

    RecursiveFetcher.is_link_wanted treats that exception as "no custom
    predicate supplied" and falls back to its regular-expression filters.
    '''
    raise NotImplementedError()
class RecursiveFetcher:
    '''
    Fetch a page and, recursively, the pages it links to, writing the HTML,
    images and stylesheets to disk.
    '''
    # URLs matching any of these patterns are never followed (see
    # is_link_ok): links ending in .exe/.mp3/.ogg, mailto: links and blank
    # links. The dot is escaped so that only a real file extension matches
    # (an unescaped '.' also rejected URLs such as ".../annexe").
    LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                (r'\.exe\s*$', r'\.mp3\s*$', r'\.ogg\s*$', r'^\s*mailto:', r'^\s*$'))
    # Captures the URL inside a CSS "@import url(...)" rule
    CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
    default_timeout = socket.getdefaulttimeout()  # Needed here as it is used in __del__
138    def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
139        bd = options.dir
140        if not isinstance(bd, str):
141            bd = bd.decode(filesystem_encoding)
143        self.base_dir = os.path.abspath(os.path.expanduser(bd))
144        if not os.path.exists(self.base_dir):
145            os.makedirs(self.base_dir)
146        self.log = log
147        self.verbose = options.verbose
148        self.timeout = options.timeout
149        self.encoding = options.encoding
150        self.browser = options.browser if hasattr(options, 'browser') else browser()
151        self.max_recursions = options.max_recursions
152        self.match_regexps  = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
153        self.filter_regexps = [re.compile(i, re.IGNORECASE) for i in options.filter_regexps]
154        self.max_files = options.max_files
155        self.delay = options.delay
156        self.last_fetch_at = 0.
157        self.filemap = {}
158        self.imagemap = image_map
159        self.imagemap_lock = threading.RLock()
160        self.stylemap = css_map
161        self.image_url_processor = None
162        self.stylemap_lock = threading.RLock()
163        self.downloaded_paths = []
164        self.current_dir = self.base_dir
165        self.files = 0
166        self.preprocess_regexps  = getattr(options, 'preprocess_regexps', [])
167        self.remove_tags         = getattr(options, 'remove_tags', [])
168        self.remove_tags_after   = getattr(options, 'remove_tags_after', None)
169        self.remove_tags_before  = getattr(options, 'remove_tags_before', None)
170        self.keep_only_tags      = getattr(options, 'keep_only_tags', [])
171        self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
172        self.preprocess_raw_html = getattr(options, 'preprocess_raw_html',
173                lambda raw, url: raw)
174        self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None)
175        self.postprocess_html_ext = getattr(options, 'postprocess_html', None)
176        self.preprocess_image_ext = getattr(options, 'preprocess_image', None)
177        self._is_link_wanted     = getattr(options, 'is_link_wanted',
178                default_is_link_wanted)
179        self.compress_news_images_max_size = getattr(options, 'compress_news_images_max_size', None)
180        self.compress_news_images = getattr(options, 'compress_news_images', False)
181        self.compress_news_images_auto_size = getattr(options, 'compress_news_images_auto_size', 16)
182        self.scale_news_images = getattr(options, 'scale_news_images', None)
183        self.download_stylesheets = not options.no_stylesheets
184        self.show_progress = True
185        self.failed_links = []
186        self.job_info = job_info
188    def get_soup(self, src, url=None):
189        nmassage = []
190        nmassage.extend(self.preprocess_regexps)
191        # Remove comments as they can leave detritus when extracting tags leaves
192        # multiple nested comments
193        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
194        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
195        usrc = self.preprocess_raw_html(usrc, url)
196        for pat, repl in nmassage:
197            usrc = pat.sub(repl, usrc)
198        soup = BeautifulSoup(usrc)
200        replace = self.prepreprocess_html_ext(soup)
201        if replace is not None:
202            replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
203            for pat, repl in nmassage:
204                replace = pat.sub(repl, replace)
205            soup = BeautifulSoup(replace)
207        if self.keep_only_tags:
208            body = soup.new_tag('body')
209            try:
210                if isinstance(self.keep_only_tags, dict):
211                    self.keep_only_tags = [self.keep_only_tags]
212                for spec in self.keep_only_tags:
213                    for tag in soup.find('body').findAll(**spec):
214                        body.insert(len(body.contents), tag)
215                soup.find('body').replaceWith(body)
216            except AttributeError:  # soup has no body element
217                pass
219        def remove_beyond(tag, next):
220            while tag is not None and getattr(tag, 'name', None) != 'body':
221                after = getattr(tag, next)
222                while after is not None:
223                    ns = getattr(tag, next)
224                    after.extract()
225                    after = ns
226                tag = tag.parent
228        if self.remove_tags_after is not None:
229            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
230            for spec in rt:
231                tag = soup.find(**spec)
232                remove_beyond(tag, 'nextSibling')
234        if self.remove_tags_before is not None:
235            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
236            for spec in rt:
237                tag = soup.find(**spec)
238                remove_beyond(tag, 'previousSibling')
240        for kwds in self.remove_tags:
241            for tag in soup.findAll(**kwds):
242                tag.extract()
243        return self.preprocess_html_ext(soup)
245    def fetch_url(self, url):
246        data = None
247        self.log.debug('Fetching', url)
248        st = time.monotonic()
250        # Check for a URL pointing to the local filesystem and special case it
251        # for efficiency and robustness. Bypasses delay checking as it does not
252        # apply to local fetches. Ensures that unicode paths that are not
253        # representable in the filesystem_encoding work.
254        is_local = 0
255        if url.startswith('file://'):
256            is_local = 7
257        elif url.startswith('file:'):
258            is_local = 5
259        if is_local > 0:
260            url = url[is_local:]
261            if iswindows and url.startswith('/'):
262                url = url[1:]
263            with open(url, 'rb') as f:
264                data = response(f.read())
265                data.newurl = 'file:'+url  # This is what mechanize does for
266                # local URLs
267            self.log.debug('Fetched %s in %.1f seconds' % (url, time.monotonic() - st))
268            return data
270        delta = time.monotonic() - self.last_fetch_at
271        if delta < self.delay:
272            time.sleep(self.delay - delta)
273        url = canonicalize_url(url)
274        open_func = getattr(self.browser, 'open_novisit', self.browser.open)
275        try:
276            with closing(open_func(url, timeout=self.timeout)) as f:
277                data = response(f.read()+f.read())
278                data.newurl = f.geturl()
279        except URLError as err:
280            if hasattr(err, 'code') and err.code in responses:
281                raise FetchError(responses[err.code])
282            is_temp = False
283            reason = getattr(err, 'reason', None)
284            if isinstance(reason, socket.gaierror):
285                # see man gai_strerror() for details
286                if getattr(reason, 'errno', None) in (socket.EAI_AGAIN, socket.EAI_NONAME):
287                    is_temp = True
288            if is_temp:  # Connection reset by peer or Name or service not known
289                self.log.debug('Temporary error, retrying in 1 second')
290                time.sleep(1)
291                with closing(open_func(url, timeout=self.timeout)) as f:
292                    data = response(f.read()+f.read())
293                    data.newurl = f.geturl()
294            else:
295                raise err
296        finally:
297            self.last_fetch_at = time.monotonic()
298        self.log.debug('Fetched %s in %f seconds' % (url, time.monotonic() - st))
299        return data
301    def start_fetch(self, url):
302        soup = BeautifulSoup('<a href="'+url+'" />')
303        res = self.process_links(soup, url, 0, into_dir='')
304        self.log.debug(url, 'saved to', res)
305        return res
307    def is_link_ok(self, url):
308        for i in self.__class__.LINK_FILTER:
309            if i.search(url):
310                return False
311        return True
313    def is_link_wanted(self, url, tag):
314        try:
315            return self._is_link_wanted(url, tag)
316        except NotImplementedError:
317            pass
318        except:
319            return False
320        if self.filter_regexps:
321            for f in self.filter_regexps:
322                if f.search(url):
323                    return False
324        if self.match_regexps:
325            for m in self.match_regexps:
326                if m.search(url):
327                    return True
328            return False
329        return True
331    def process_stylesheets(self, soup, baseurl):
332        diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
333        if not os.path.exists(diskpath):
334            os.mkdir(diskpath)
335        for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
336            try:
337                mtype = tag['type']
338            except KeyError:
339                mtype = 'text/css' if tag.name.lower() == 'style' else ''
340            if mtype.lower() != 'text/css':
341                continue
342            if tag.has_attr('href'):
343                iurl = tag['href']
344                if not urlsplit(iurl).scheme:
345                    iurl = urljoin(baseurl, iurl, False)
346                found_cached = False
347                with self.stylemap_lock:
348                    if iurl in self.stylemap:
349                        tag['href'] = self.stylemap[iurl]
350                        found_cached = True
351                if found_cached:
352                    continue
353                try:
354                    data = self.fetch_url(iurl)
355                except Exception:
356                    self.log.exception('Could not fetch stylesheet ', iurl)
357                    continue
358                stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
359                with self.stylemap_lock:
360                    self.stylemap[iurl] = stylepath
361                with open(stylepath, 'wb') as x:
362                    x.write(data)
363                tag['href'] = stylepath
364            else:
365                for ns in tag.findAll(text=True):
366                    src = str(ns)
367                    m = self.__class__.CSS_IMPORT_PATTERN.search(src)
368                    if m:
369                        iurl = m.group(1)
370                        if not urlsplit(iurl).scheme:
371                            iurl = urljoin(baseurl, iurl, False)
372                        found_cached = False
373                        with self.stylemap_lock:
374                            if iurl in self.stylemap:
375                                ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
376                                found_cached = True
377                        if found_cached:
378                            continue
379                        try:
380                            data = self.fetch_url(iurl)
381                        except Exception:
382                            self.log.exception('Could not fetch stylesheet ', iurl)
383                            continue
384                        c += 1
385                        stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
386                        with self.stylemap_lock:
387                            self.stylemap[iurl] = stylepath
388                        with open(stylepath, 'wb') as x:
389                            x.write(data)
390                        ns.replaceWith(src.replace(m.group(1), stylepath))
392    def rescale_image(self, data):
393        return rescale_image(data, self.scale_news_images, self.compress_news_images_max_size, self.compress_news_images_auto_size)
395    def process_images(self, soup, baseurl):
396        diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
397        if not os.path.exists(diskpath):
398            os.mkdir(diskpath)
399        c = 0
400        for tag in soup.findAll('img', src=True):
401            iurl = tag['src']
402            if iurl.startswith('data:'):
403                try:
404                    data = urlopen(iurl).read()
405                except Exception:
406                    self.log.exception('Failed to decode embedded image')
407                    continue
408            else:
409                if callable(self.image_url_processor):
410                    iurl = self.image_url_processor(baseurl, iurl)
411                if not urlsplit(iurl).scheme:
412                    iurl = urljoin(baseurl, iurl, False)
413                found_in_cache = False
414                with self.imagemap_lock:
415                    if iurl in self.imagemap:
416                        tag['src'] = self.imagemap[iurl]
417                        found_in_cache = True
418                if found_in_cache:
419                    continue
420                try:
421                    data = self.fetch_url(iurl)
422                    if data == b'GIF89a\x01':
423                        # Skip empty GIF files as PIL errors on them anyway
424                        continue
425                except Exception:
426                    self.log.exception('Could not fetch image ', iurl)
427                    continue
428            c += 1
429            fname = ascii_filename('img'+str(c))
430            data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
431            if data is None:
432                continue
433            itype = what(None, data)
434            if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
435                # SVG image
436                imgpath = os.path.join(diskpath, fname+'.svg')
437                with self.imagemap_lock:
438                    self.imagemap[iurl] = imgpath
439                with open(imgpath, 'wb') as x:
440                    x.write(data)
441                tag['src'] = imgpath
442            else:
443                try:
444                    # Ensure image is valid
445                    img = image_from_data(data)
446                    if itype not in {'png', 'jpg', 'jpeg'}:
447                        itype = 'png' if itype == 'gif' else 'jpeg'
448                        data = image_to_data(img, fmt=itype)
449                    if self.compress_news_images and itype in {'jpg','jpeg'}:
450                        try:
451                            data = self.rescale_image(data)
452                        except Exception:
453                            self.log.exception('failed to compress image '+iurl)
454                    # Moon+ apparently cannot handle .jpeg files
455                    if itype == 'jpeg':
456                        itype = 'jpg'
457                    imgpath = os.path.join(diskpath, fname+'.'+itype)
458                    with self.imagemap_lock:
459                        self.imagemap[iurl] = imgpath
460                    with open(imgpath, 'wb') as x:
461                        x.write(data)
462                    tag['src'] = imgpath
463                except Exception:
464                    traceback.print_exc()
465                    continue
467    def absurl(self, baseurl, tag, key, filter=True):
468        iurl = tag[key]
469        parts = urlsplit(iurl)
470        if not parts.netloc and not parts.path and not parts.query:
471            return None
472        if not parts.scheme:
473            iurl = urljoin(baseurl, iurl, False)
474        if not self.is_link_ok(iurl):
475            self.log.debug('Skipping invalid link:', iurl)
476            return None
477        if filter and not self.is_link_wanted(iurl, tag):
478            self.log.debug('Filtered link: '+iurl)
479            return None
480        return iurl
482    def normurl(self, url):
483        parts = list(urlsplit(url))
484        parts[4] = ''
485        return urlunsplit(parts)
487    def localize_link(self, tag, key, path):
488        parts = urlsplit(tag[key])
489        suffix = ('#'+parts.fragment) if parts.fragment else ''
490        tag[key] = path+suffix
492    def process_return_links(self, soup, baseurl):
493        for tag in soup.findAll('a', href=True):
494            iurl = self.absurl(baseurl, tag, 'href')
495            if not iurl:
496                continue
497            nurl = self.normurl(iurl)
498            if nurl in self.filemap:
499                self.localize_link(tag, 'href', self.filemap[nurl])
501    def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
502        res = ''
503        diskpath = os.path.join(self.current_dir, into_dir)
504        if not os.path.exists(diskpath):
505            os.mkdir(diskpath)
506        prev_dir = self.current_dir
507        try:
508            self.current_dir = diskpath
509            tags = list(soup.findAll('a', href=True))
511            for c, tag in enumerate(tags):
512                if self.show_progress:
513                    print('.', end=' ')
514                    sys.stdout.flush()
515                sys.stdout.flush()
516                iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0)
517                if not iurl:
518                    continue
519                nurl = self.normurl(iurl)
520                if nurl in self.filemap:
521                    self.localize_link(tag, 'href', self.filemap[nurl])
522                    continue
523                if self.files > self.max_files:
524                    return res
525                linkdir = 'link'+str(c) if into_dir else ''
526                linkdiskpath = os.path.join(diskpath, linkdir)
527                if not os.path.exists(linkdiskpath):
528                    os.mkdir(linkdiskpath)
529                try:
530                    self.current_dir = linkdiskpath
531                    dsrc = self.fetch_url(iurl)
532                    newbaseurl = dsrc.newurl
533                    if len(dsrc) == 0 or \
534                       len(re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0:
535                        raise ValueError('No content at URL %r'%iurl)
536                    if callable(self.encoding):
537                        dsrc = self.encoding(dsrc)
538                    elif self.encoding is not None:
539                        dsrc = dsrc.decode(self.encoding, 'replace')
540                    else:
541                        dsrc = xml_to_unicode(dsrc, self.verbose)[0]
543                    st = time.monotonic()
544                    soup = self.get_soup(dsrc, url=iurl)
545                    self.log.debug('Parsed %s in %.1f seconds' % (iurl, time.monotonic() - st))
547                    base = soup.find('base', href=True)
548                    if base is not None:
549                        newbaseurl = base['href']
550                    self.log.debug('Processing images...')
551                    self.process_images(soup, newbaseurl)
552                    if self.download_stylesheets:
553                        self.process_stylesheets(soup, newbaseurl)
555                    _fname = basename(iurl)
556                    if not isinstance(_fname, str):
557                        _fname.decode('latin1', 'replace')
558                    _fname = _fname.replace('%', '').replace(os.sep, '')
559                    _fname = ascii_filename(_fname)
560                    _fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
561                    res = os.path.join(linkdiskpath, _fname)
562                    self.downloaded_paths.append(res)
563                    self.filemap[nurl] = res
564                    if recursion_level < self.max_recursions:
565                        self.log.debug('Processing links...')
566                        self.process_links(soup, newbaseurl, recursion_level+1)
567                    else:
568                        self.process_return_links(soup, newbaseurl)
569                        self.log.debug('Recursion limit reached. Skipping links in', iurl)
571                    if newbaseurl and not newbaseurl.startswith('/'):
572                        for atag in soup.findAll('a', href=lambda x: x and x.startswith('/')):
573                            atag['href'] = urljoin(newbaseurl, atag['href'], True)
574                    if callable(self.postprocess_html_ext):
575                        soup = self.postprocess_html_ext(soup,
576                                c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
577                                self.job_info)
579                        if c==0 and recursion_level == 0:
580                            self.called_first = True
582                    save_soup(soup, res)
583                    self.localize_link(tag, 'href', res)
584                except Exception as err:
585                    if isinstance(err, AbortArticle):
586                        raise
587                    self.failed_links.append((iurl, traceback.format_exc()))
588                    self.log.exception('Could not fetch link', iurl)
589                finally:
590                    self.current_dir = diskpath
591                    self.files += 1
592        finally:
593            self.current_dir = prev_dir
594        if self.show_progress:
595            print()
596        return res
def option_parser(usage=_('%prog URL\n\nWhere URL is for example https://google.com')):
    '''
    Build the command line option parser.

    The destinations of the options defined here (dir, timeout,
    max_recursions, max_files, delay, encoding, match_regexps,
    filter_regexps, no_stylesheets, verbose) are the attributes that
    RecursiveFetcher reads from its `options` argument.
    '''
    parser = OptionParser(usage=usage)
    add = parser.add_option

    # Where and how to download
    add('-d', '--base-dir', dest='dir', type='string', default='.',
        help=_('Base folder into which URL is saved. Default is %default'))
    add('-t', '--timeout', dest='timeout', type='float', default=10.0,
        help=_('Timeout in seconds to wait for a response from the server. Default: %default s'))
    add('-r', '--max-recursions', dest='max_recursions', type='int', default=1,
        help=_('Maximum number of levels to recurse i.e. depth of links to follow. Default %default'))
    add('-n', '--max-files', dest='max_files', type='int', default=sys.maxsize,
        help=_('The maximum number of files to download. This only applies to files from <a href> tags. Default is %default'))
    add('--delay', dest='delay', type='float', default=0,
        help=_('Minimum interval in seconds between consecutive fetches. Default is %default s'))
    add('--encoding', default=None,
        help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.'))

    # Which links to follow
    add('--match-regexp', dest='match_regexps', action='append', default=[],
        help=_('Only links that match this regular expression will be followed. '
               'This option can be specified multiple times, in which case as long '
               'as a link matches any one regexp, it will be followed. By default all '
               'links are followed.'))
    add('--filter-regexp', dest='filter_regexps', action='append', default=[],
        help=_('Any link that matches this regular expression will be ignored.'
               ' This option can be specified multiple times, in which case as'
               ' long as any regexp matches a link, it will be ignored. By'
               ' default, no links are ignored. If both filter regexp and match'
               ' regexp are specified, then filter regexp is applied first.'))

    # Miscellaneous switches
    add('--dont-download-stylesheets', dest='no_stylesheets', action='store_true', default=False,
        help=_('Do not download CSS stylesheets.'))
    add('--verbose', dest='verbose', action='store_true', default=False,
        help=_('Show detailed output information. Useful for debugging'))
    return parser
def create_fetcher(options, image_map=None, log=None):
    '''
    Create a RecursiveFetcher for `options`.

    :param options: Parsed options, as produced by option_parser().
    :param image_map: Optional dict mapping image URL to saved path that
        the fetcher should use; a fresh dict is created when omitted.
        (Previously this argument was accepted but ignored.)
    :param log: Logger to use. When omitted a Log is created, at DEBUG
        level if options.verbose is set.
    '''
    if log is None:
        log = Log(level=Log.DEBUG) if options.verbose else Log()
    if image_map is None:
        image_map = {}
    return RecursiveFetcher(options, log, image_map=image_map)
def main(args=sys.argv):
    '''
    Command line entry point: fetch the single URL given in `args`.

    `args` is expected to hold the program name followed by exactly one
    URL; otherwise the help text is printed and 1 is returned.
    '''
    parser = option_parser()
    options, remaining = parser.parse_args(args)
    # remaining[0] is the program name, remaining[1] the URL
    if len(remaining) != 2:
        parser.print_help()
        return 1

    create_fetcher(options).start_fetch(remaining[1])


if __name__ == '__main__':
    sys.exit(main())