#!/usr/local/bin/python3.8


__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

'''
Fetch a webpage and its links recursively. The webpages are saved to disk in
UTF-8 encoding with any charset declarations removed.
'''
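
# A minimal programmatic usage sketch (assuming this module is importable as
# calibre.web.fetch.simple; the option names come from option_parser() below):
#
#   from calibre.web.fetch.simple import create_fetcher, option_parser
#   opts, args = option_parser().parse_args(['-d', '/tmp/fetched', 'https://example.com'])
#   fetcher = create_fetcher(opts)
#   index = fetcher.start_fetch('https://example.com')  # path of the saved page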


import os
import re
import socket
import sys
import threading
import time
import traceback
from urllib.request import urlopen

from calibre import browser, relpath, unicode_path
from calibre.constants import filesystem_encoding, iswindows
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.config import OptionParser
from calibre.utils.filenames import ascii_filename
from calibre.utils.img import image_from_data, image_to_data
from calibre.utils.imghdr import what
from calibre.utils.logging import Log
from calibre.web.fetch.utils import rescale_image
from polyglot.http_client import responses
from polyglot.urllib import (
    URLError, quote, url2pathname, urljoin, urlparse, urlsplit, urlunparse,
    urlunsplit
)


class AbortArticle(Exception):
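    'Raised to abort the current article; process_links() re-raises it instead of recording the link as failed.'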
    pass


class FetchError(Exception):
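    'Raised when a remote URL cannot be fetched; its message is the HTTP reason phrase when one is available.'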
    pass


class closing:

    'Context to automatically close something at the end of a block.'

    def __init__(self, thing):
        self.thing = thing

    def __enter__(self):
        return self.thing

    def __exit__(self, *exc_info):
        try:
            self.thing.close()
        except Exception:
            pass


def canonicalize_url(url):
    # mechanize does not handle quoting automatically
    if re.search(r'\s+', url) is not None:
        purl = list(urlparse(url))
        for i in range(2, 6):
            purl[i] = quote(purl[i])
        url = urlunparse(purl)
    return url


bad_url_counter = 0


def basename(url):
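    '''
    Return a filename derived from the path component of url. Falls back to
    index.html when the name has no extension, and to a generated
    bad_url_N.html name when the URL cannot be parsed at all.
    '''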
    try:
        parts = urlsplit(url)
        path = url2pathname(parts.path)
        res = os.path.basename(path)
    except Exception:
        global bad_url_counter
        bad_url_counter += 1
        return 'bad_url_%d.html'%bad_url_counter
    if not os.path.splitext(res)[1]:
        return 'index.html'
    return res


def save_soup(soup, target):
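    '''
    Serialize soup to target as UTF-8 HTML, removing any charset
    declarations, inserting a <meta charset="utf-8"> tag and rewriting
    absolute local file paths to be relative to the folder containing target.
    '''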
    for meta in soup.findAll('meta', content=True):
        if 'charset' in meta['content'].lower():
            meta.extract()
    for meta in soup.findAll('meta', charset=True):
        meta.extract()
    head = soup.find('head')
    if head is not None:
        nm = soup.new_tag('meta', charset='utf-8')
        head.insert(0, nm)

    selfdir = os.path.dirname(target)

    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

    html = str(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))


class response(bytes):
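    '''
    A bytes subclass that carries the final URL of a fetched resource (after
    any redirects) in its newurl attribute.
    '''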

    def __new__(cls, *args):
        obj = super().__new__(cls, *args)
        obj.newurl = None
        return obj


def default_is_link_wanted(url, tag):
    raise NotImplementedError()


class RecursiveFetcher:
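    '''
    Recursively fetch a start URL along with the pages, images and stylesheets
    it links to, saving everything under options.dir. options must provide the
    attributes created by option_parser(); the optional hooks
    (preprocess_html, remove_tags, keep_only_tags, etc.) are looked up with
    getattr() and may be absent.
    '''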
    LINK_FILTER = tuple(re.compile(i, re.IGNORECASE) for i in
                ('.exe\\s*$', '.mp3\\s*$', '.ogg\\s*$', '^\\s*mailto:', '^\\s*$'))
    # ADBLOCK_FILTER = tuple(re.compile(i, re.IGNORECASE) for it in
    #                       (
    #
    #                        )
    #                       )
    CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
    default_timeout = socket.getdefaulttimeout()  # Needed here as it is used in __del__

    def __init__(self, options, log, image_map={}, css_map={}, job_info=None):
        bd = options.dir
        if not isinstance(bd, str):
            bd = bd.decode(filesystem_encoding)

        self.base_dir = os.path.abspath(os.path.expanduser(bd))
        if not os.path.exists(self.base_dir):
            os.makedirs(self.base_dir)
        self.log = log
        self.verbose = options.verbose
        self.timeout = options.timeout
        self.encoding = options.encoding
        self.browser = options.browser if hasattr(options, 'browser') else browser()
        self.max_recursions = options.max_recursions
        self.match_regexps  = [re.compile(i, re.IGNORECASE) for i in options.match_regexps]
        self.filter_regexps = [re.compile(i, re.IGNORECASE) for i in options.filter_regexps]
        self.max_files = options.max_files
        self.delay = options.delay
        self.last_fetch_at = 0.
        self.filemap = {}
        self.imagemap = image_map
        self.imagemap_lock = threading.RLock()
        self.stylemap = css_map
        self.image_url_processor = None
        self.stylemap_lock = threading.RLock()
        self.downloaded_paths = []
        self.current_dir = self.base_dir
        self.files = 0
        self.preprocess_regexps  = getattr(options, 'preprocess_regexps', [])
        self.remove_tags         = getattr(options, 'remove_tags', [])
        self.remove_tags_after   = getattr(options, 'remove_tags_after', None)
        self.remove_tags_before  = getattr(options, 'remove_tags_before', None)
        self.keep_only_tags      = getattr(options, 'keep_only_tags', [])
        self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
        self.preprocess_raw_html = getattr(options, 'preprocess_raw_html',
                lambda raw, url: raw)
        self.prepreprocess_html_ext = getattr(options, 'skip_ad_pages', lambda soup: None)
        self.postprocess_html_ext = getattr(options, 'postprocess_html', None)
        self.preprocess_image_ext = getattr(options, 'preprocess_image', None)
        self._is_link_wanted     = getattr(options, 'is_link_wanted',
                default_is_link_wanted)
        self.compress_news_images_max_size = getattr(options, 'compress_news_images_max_size', None)
        self.compress_news_images = getattr(options, 'compress_news_images', False)
        self.compress_news_images_auto_size = getattr(options, 'compress_news_images_auto_size', 16)
        self.scale_news_images = getattr(options, 'scale_news_images', None)
        self.download_stylesheets = not options.no_stylesheets
        self.show_progress = True
        self.failed_links = []
        self.job_info = job_info

    def get_soup(self, src, url=None):
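        '''
        Parse raw HTML into a BeautifulSoup tree, applying the configured
        preprocessing hooks (preprocess_regexps, preprocess_raw_html,
        skip_ad_pages, keep_only_tags, remove_tags_before/after, remove_tags
        and preprocess_html) along the way.
        '''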
        nmassage = []
        nmassage.extend(self.preprocess_regexps)
        # Remove HTML comments, as they can leave detritus behind when tags
        # are extracted (resulting in multiple nested comments)
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        for pat, repl in nmassage:
            usrc = pat.sub(repl, usrc)
        soup = BeautifulSoup(usrc)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
            for pat, repl in nmassage:
                replace = pat.sub(repl, replace)
            soup = BeautifulSoup(replace)

        if self.keep_only_tags:
            body = soup.new_tag('body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)

    def fetch_url(self, url):
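        '''
        Download url and return its body as a response object (bytes with a
        newurl attribute giving the final URL). file: URLs are read directly
        from disk; remote fetches honour self.delay and are retried once
        after a second on temporary DNS failures.
        '''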
        data = None
        self.log.debug('Fetching', url)
        st = time.monotonic()

        # Check for a URL pointing to the local filesystem and special case it
        # for efficiency and robustness. Bypasses delay checking as it does not
        # apply to local fetches. Ensures that unicode paths that are not
        # representable in the filesystem_encoding work.
        is_local = 0
        if url.startswith('file://'):
            is_local = 7
        elif url.startswith('file:'):
            is_local = 5
        if is_local > 0:
            url = url[is_local:]
            if iswindows and url.startswith('/'):
                url = url[1:]
            with open(url, 'rb') as f:
                data = response(f.read())
                data.newurl = 'file:'+url  # This is what mechanize does for
                # local URLs
            self.log.debug('Fetched %s in %.1f seconds' % (url, time.monotonic() - st))
            return data

        delta = time.monotonic() - self.last_fetch_at
        if delta < self.delay:
            time.sleep(self.delay - delta)
        url = canonicalize_url(url)
        open_func = getattr(self.browser, 'open_novisit', self.browser.open)
        try:
            with closing(open_func(url, timeout=self.timeout)) as f:
                data = response(f.read()+f.read())
                data.newurl = f.geturl()
        except URLError as err:
            if hasattr(err, 'code') and err.code in responses:
                raise FetchError(responses[err.code])
            is_temp = False
            reason = getattr(err, 'reason', None)
            if isinstance(reason, socket.gaierror):
                # see man gai_strerror() for details
                if getattr(reason, 'errno', None) in (socket.EAI_AGAIN, socket.EAI_NONAME):
                    is_temp = True
            if is_temp:  # Connection reset by peer or Name or service not known
                self.log.debug('Temporary error, retrying in 1 second')
                time.sleep(1)
                with closing(open_func(url, timeout=self.timeout)) as f:
                    data = response(f.read()+f.read())
                    data.newurl = f.geturl()
            else:
                raise err
        finally:
            self.last_fetch_at = time.monotonic()
        self.log.debug('Fetched %s in %f seconds' % (url, time.monotonic() - st))
        return data

    def start_fetch(self, url):
        soup = BeautifulSoup('<a href="'+url+'" />')
        res = self.process_links(soup, url, 0, into_dir='')
        self.log.debug(url, 'saved to', res)
        return res

    def is_link_ok(self, url):
        for i in self.__class__.LINK_FILTER:
            if i.search(url):
                return False
        return True

    def is_link_wanted(self, url, tag):
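        '''
        Return True if the link should be followed. The is_link_wanted
        callback from options takes precedence; when it is not implemented
        the filter_regexps/match_regexps options decide, and any other error
        in the callback rejects the link.
        '''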
        try:
            return self._is_link_wanted(url, tag)
        except NotImplementedError:
            pass
        except Exception:
            return False
        if self.filter_regexps:
            for f in self.filter_regexps:
                if f.search(url):
                    return False
        if self.match_regexps:
            for m in self.match_regexps:
                if m.search(url):
                    return True
            return False
        return True

    def process_stylesheets(self, soup, baseurl):
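        '''
        Download linked and @import-ed stylesheets into a "stylesheets"
        subfolder of the current directory and rewrite soup to point at the
        local copies, reusing previously downloaded URLs via self.stylemap.
        '''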
        diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
            try:
                mtype = tag['type']
            except KeyError:
                mtype = 'text/css' if tag.name.lower() == 'style' else ''
            if mtype.lower() != 'text/css':
                continue
            if tag.has_attr('href'):
                iurl = tag['href']
                if not urlsplit(iurl).scheme:
                    iurl = urljoin(baseurl, iurl, False)
                found_cached = False
                with self.stylemap_lock:
                    if iurl in self.stylemap:
                        tag['href'] = self.stylemap[iurl]
                        found_cached = True
                if found_cached:
                    continue
                try:
                    data = self.fetch_url(iurl)
                except Exception:
                    self.log.exception('Could not fetch stylesheet ', iurl)
                    continue
                stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                with self.stylemap_lock:
                    self.stylemap[iurl] = stylepath
                with open(stylepath, 'wb') as x:
                    x.write(data)
                tag['href'] = stylepath
            else:
                for ns in tag.findAll(text=True):
                    src = str(ns)
                    m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                    if m:
                        iurl = m.group(1)
                        if not urlsplit(iurl).scheme:
                            iurl = urljoin(baseurl, iurl, False)
                        found_cached = False
                        with self.stylemap_lock:
                            if iurl in self.stylemap:
                                ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                                found_cached = True
                        if found_cached:
                            continue
                        try:
                            data = self.fetch_url(iurl)
                        except Exception:
                            self.log.exception('Could not fetch stylesheet ', iurl)
                            continue
                        c += 1
                        stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                        with self.stylemap_lock:
                            self.stylemap[iurl] = stylepath
                        with open(stylepath, 'wb') as x:
                            x.write(data)
                        ns.replaceWith(src.replace(m.group(1), stylepath))

    def rescale_image(self, data):
        return rescale_image(data, self.scale_news_images, self.compress_news_images_max_size, self.compress_news_images_auto_size)

    def process_images(self, soup, baseurl):
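        '''
        Download every <img> into an "images" subfolder and rewrite its src to
        the local file. data: URIs are decoded inline, duplicate URLs are
        served from self.imagemap, unsupported formats are converted to
        PNG/JPEG and JPEGs are optionally recompressed.
        '''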
        diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        c = 0
        for tag in soup.findAll('img', src=True):
            iurl = tag['src']
            if iurl.startswith('data:'):
                try:
                    data = urlopen(iurl).read()
                except Exception:
                    self.log.exception('Failed to decode embedded image')
                    continue
            else:
                if callable(self.image_url_processor):
                    iurl = self.image_url_processor(baseurl, iurl)
                if not urlsplit(iurl).scheme:
                    iurl = urljoin(baseurl, iurl, False)
                found_in_cache = False
                with self.imagemap_lock:
                    if iurl in self.imagemap:
                        tag['src'] = self.imagemap[iurl]
                        found_in_cache = True
                if found_in_cache:
                    continue
                try:
                    data = self.fetch_url(iurl)
                    if data == b'GIF89a\x01':
                        # Skip empty GIF files as PIL errors on them anyway
                        continue
                except Exception:
                    self.log.exception('Could not fetch image ', iurl)
                    continue
            c += 1
            fname = ascii_filename('img'+str(c))
            data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
            if data is None:
                continue
            itype = what(None, data)
            if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
                # SVG image
                imgpath = os.path.join(diskpath, fname+'.svg')
                with self.imagemap_lock:
                    self.imagemap[iurl] = imgpath
                with open(imgpath, 'wb') as x:
                    x.write(data)
                tag['src'] = imgpath
            else:
                try:
                    # Ensure image is valid
                    img = image_from_data(data)
                    if itype not in {'png', 'jpg', 'jpeg'}:
                        itype = 'png' if itype == 'gif' else 'jpeg'
                        data = image_to_data(img, fmt=itype)
                    if self.compress_news_images and itype in {'jpg','jpeg'}:
                        try:
                            data = self.rescale_image(data)
                        except Exception:
                            self.log.exception('failed to compress image '+iurl)
                    # Moon+ apparently cannot handle .jpeg files
                    if itype == 'jpeg':
                        itype = 'jpg'
                    imgpath = os.path.join(diskpath, fname+'.'+itype)
                    with self.imagemap_lock:
                        self.imagemap[iurl] = imgpath
                    with open(imgpath, 'wb') as x:
                        x.write(data)
                    tag['src'] = imgpath
                except Exception:
                    traceback.print_exc()
                    continue

    def absurl(self, baseurl, tag, key, filter=True):
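        '''
        Return the absolute form of the URL in tag[key], or None if the link
        is empty, matches LINK_FILTER, or (when filter is True) is rejected by
        is_link_wanted().
        '''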
        iurl = tag[key]
        parts = urlsplit(iurl)
        if not parts.netloc and not parts.path and not parts.query:
            return None
        if not parts.scheme:
            iurl = urljoin(baseurl, iurl, False)
        if not self.is_link_ok(iurl):
            self.log.debug('Skipping invalid link:', iurl)
            return None
        if filter and not self.is_link_wanted(iurl, tag):
            self.log.debug('Filtered link: '+iurl)
            return None
        return iurl

    def normurl(self, url):
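        'Normalize a URL for use as a filemap key by dropping its fragment.'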
        parts = list(urlsplit(url))
        parts[4] = ''
        return urlunsplit(parts)

    def localize_link(self, tag, key, path):
        parts = urlsplit(tag[key])
        suffix = ('#'+parts.fragment) if parts.fragment else ''
        tag[key] = path+suffix

    def process_return_links(self, soup, baseurl):
        for tag in soup.findAll('a', href=True):
            iurl = self.absurl(baseurl, tag, 'href')
            if not iurl:
                continue
            nurl = self.normurl(iurl)
            if nurl in self.filemap:
                self.localize_link(tag, 'href', self.filemap[nurl])

    def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
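        '''
        Fetch every <a href> in soup, saving each page as an .xhtml file in a
        per link subfolder of into_dir and recursing until max_recursions is
        reached. Returns the path of the last page saved at this level (the
        empty string if nothing was saved).
        '''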
        res = ''
        diskpath = os.path.join(self.current_dir, into_dir)
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        prev_dir = self.current_dir
        try:
            self.current_dir = diskpath
            tags = list(soup.findAll('a', href=True))

            for c, tag in enumerate(tags):
                if self.show_progress:
                    print('.', end=' ')
                    sys.stdout.flush()
                iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0)
                if not iurl:
                    continue
                nurl = self.normurl(iurl)
                if nurl in self.filemap:
                    self.localize_link(tag, 'href', self.filemap[nurl])
                    continue
                if self.files > self.max_files:
                    return res
                linkdir = 'link'+str(c) if into_dir else ''
                linkdiskpath = os.path.join(diskpath, linkdir)
                if not os.path.exists(linkdiskpath):
                    os.mkdir(linkdiskpath)
                try:
                    self.current_dir = linkdiskpath
                    dsrc = self.fetch_url(iurl)
                    newbaseurl = dsrc.newurl
                    if len(dsrc) == 0 or \
                       len(re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0:
                        raise ValueError('No content at URL %r'%iurl)
                    if callable(self.encoding):
                        dsrc = self.encoding(dsrc)
                    elif self.encoding is not None:
                        dsrc = dsrc.decode(self.encoding, 'replace')
                    else:
                        dsrc = xml_to_unicode(dsrc, self.verbose)[0]

                    st = time.monotonic()
                    soup = self.get_soup(dsrc, url=iurl)
                    self.log.debug('Parsed %s in %.1f seconds' % (iurl, time.monotonic() - st))

                    base = soup.find('base', href=True)
                    if base is not None:
                        newbaseurl = base['href']
                    self.log.debug('Processing images...')
                    self.process_images(soup, newbaseurl)
                    if self.download_stylesheets:
                        self.process_stylesheets(soup, newbaseurl)

                    _fname = basename(iurl)
                    if not isinstance(_fname, str):
                        _fname = _fname.decode('latin1', 'replace')
                    _fname = _fname.replace('%', '').replace(os.sep, '')
                    _fname = ascii_filename(_fname)
                    _fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
                    res = os.path.join(linkdiskpath, _fname)
                    self.downloaded_paths.append(res)
                    self.filemap[nurl] = res
                    if recursion_level < self.max_recursions:
                        self.log.debug('Processing links...')
                        self.process_links(soup, newbaseurl, recursion_level+1)
                    else:
                        self.process_return_links(soup, newbaseurl)
                        self.log.debug('Recursion limit reached. Skipping links in', iurl)

                    if newbaseurl and not newbaseurl.startswith('/'):
                        for atag in soup.findAll('a', href=lambda x: x and x.startswith('/')):
                            atag['href'] = urljoin(newbaseurl, atag['href'], True)
                    if callable(self.postprocess_html_ext):
                        soup = self.postprocess_html_ext(soup,
                                c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
                                self.job_info)

                        if c==0 and recursion_level == 0:
                            self.called_first = True

                    save_soup(soup, res)
                    self.localize_link(tag, 'href', res)
                except Exception as err:
                    if isinstance(err, AbortArticle):
                        raise
                    self.failed_links.append((iurl, traceback.format_exc()))
                    self.log.exception('Could not fetch link', iurl)
                finally:
                    self.current_dir = diskpath
                    self.files += 1
        finally:
            self.current_dir = prev_dir
        if self.show_progress:
            print()
        return res


def option_parser(usage=_('%prog URL\n\nWhere URL is for example https://google.com')):
    parser = OptionParser(usage=usage)
    parser.add_option('-d', '--base-dir',
                      help=_('Base folder into which URL is saved. Default is %default'),
                      default='.', type='string', dest='dir')
    parser.add_option('-t', '--timeout',
                      help=_('Timeout in seconds to wait for a response from the server. Default: %default s'),
                      default=10.0, type='float', dest='timeout')
    parser.add_option('-r', '--max-recursions', default=1,
                      help=_('Maximum number of levels to recurse i.e. depth of links to follow. Default %default'),
                      type='int', dest='max_recursions')
    parser.add_option('-n', '--max-files', default=sys.maxsize, type='int', dest='max_files',
                      help=_('The maximum number of files to download. This only applies to files from <a href> tags. Default is %default'))
    parser.add_option('--delay', default=0, dest='delay', type='float',
                      help=_('Minimum interval in seconds between consecutive fetches. Default is %default s'))
    parser.add_option('--encoding', default=None,
                      help=_('The character encoding for the websites you are trying to download. The default is to try and guess the encoding.'))
    parser.add_option('--match-regexp', default=[], action='append', dest='match_regexps',
                      help=_('Only links that match this regular expression will be followed. '
                             'This option can be specified multiple times, in which case as long '
                             'as a link matches any one regexp, it will be followed. By default all '
                             'links are followed.'))
    parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
                      help=_('Any link that matches this regular expression will be ignored.'
                             ' This option can be specified multiple times, in which case as'
                             ' long as any regexp matches a link, it will be ignored. By'
                             ' default, no links are ignored. If both filter regexp and match'
                             ' regexp are specified, then filter regexp is applied first.'))
    parser.add_option('--dont-download-stylesheets', action='store_true', default=False,
                      help=_('Do not download CSS stylesheets.'), dest='no_stylesheets')
    parser.add_option('--verbose', help=_('Show detailed output information. Useful for debugging'),
                      default=False, action='store_true', dest='verbose')
    return parser


def create_fetcher(options, image_map={}, log=None):
    if log is None:
        log = Log(level=Log.DEBUG) if options.verbose else Log()
    return RecursiveFetcher(options, log, image_map=image_map)


def main(args=sys.argv):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        return 1

    fetcher = create_fetcher(options)
    fetcher.start_fetch(args[1])


if __name__ == '__main__':
    sys.exit(main())