1__license__   = 'GPL v3'
2__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
3'''
4Defines various abstract base classes that can be subclassed to create powerful news fetching recipes.
5'''
6__docformat__ = "restructuredtext en"
7
8
9import io
10import os
11import re
12import sys
13import time
14import traceback
15from collections import defaultdict
16from contextlib import closing
17from urllib.parse import urlparse, urlsplit
18
19from calibre import (
20    __appname__, as_unicode, browser, force_unicode, iswindows, preferred_encoding,
21    random_user_agent, strftime
22)
23from calibre.ebooks.BeautifulSoup import BeautifulSoup, CData, NavigableString, Tag
24from calibre.ebooks.metadata import MetaInformation
25from calibre.ebooks.metadata.opf2 import OPFCreator
26from calibre.ebooks.metadata.toc import TOC
27from calibre.ptempfile import PersistentTemporaryFile
28from calibre.utils.date import now as nowf
29from calibre.utils.icu import numeric_sort_key
30from calibre.utils.img import add_borders_to_image, image_to_data, save_cover_data_to
31from calibre.utils.localization import canonicalize_lang
32from calibre.utils.logging import ThreadSafeWrapper
33from calibre.utils.threadpool import NoResultsPending, ThreadPool, WorkRequest
34from calibre.web import Recipe
35from calibre.web.feeds import Feed, feed_from_xml, feeds_from_index, templates
36from calibre.web.fetch.simple import (
37    AbortArticle, RecursiveFetcher, option_parser as web2disk_option_parser
38)
39from calibre.web.fetch.utils import prepare_masthead_image
40from polyglot.builtins import string_or_bytes
41
42
43def classes(classes):
44    q = frozenset(classes.split(' '))
45    return dict(attrs={
46        'class': lambda x: x and frozenset(x.split()).intersection(q)})
47
48
49def prefixed_classes(classes):
50    q = frozenset(classes.split(' '))
51
52    def matcher(x):
53        if x:
54            for candidate in frozenset(x.split()):
55                for x in q:
56                    if candidate.startswith(x):
57                        return True
58        return False
59    return {'attrs': {'class': matcher}}
60
61
62class LoginFailed(ValueError):
63    pass
64
65
66class DownloadDenied(ValueError):
67    pass
68
69
70class BasicNewsRecipe(Recipe):
71    '''
72    Base class that contains logic needed in all recipes. By overriding
73    progressively more of the functionality in this class, you can make
74    progressively more customized/powerful recipes. For a tutorial introduction
75    to creating recipes, see :doc:`news`.
76    '''
77
78    #: The title to use for the e-book
79    title                  = _('Unknown News Source')
80
81    #: A couple of lines that describe the content this recipe downloads.
82    #: This will be used primarily in a GUI that presents a list of recipes.
83    description = ''
84
85    #: The author of this recipe
86    __author__             = __appname__
87
88    #: Minimum calibre version needed to use this recipe
89    requires_version = (0, 6, 0)
90
91    #: The language that the news is in. Must be an ISO-639 code either
92    #: two or three characters long
93    language               = 'und'
94
95    #: Maximum number of articles to download from each feed. This is primarily
96    #: useful for feeds that don't have article dates. For most feeds, you should
97    #: use :attr:`BasicNewsRecipe.oldest_article`
98    max_articles_per_feed  = 100
99
100    #: Oldest article to download from this news source. In days.
101    oldest_article         = 7.0
102
103    #: Number of levels of links to follow on article webpages
104    recursions             = 0
105
106    #: Delay between consecutive downloads in seconds. The argument may be a
107    #: floating point number to indicate a more precise time.
108    delay                  = 0
109
110    #: Publication type
111    #: Set to newspaper, magazine or blog. If set to None, no publication type
112    #: metadata will be written to the opf file.
113    publication_type = 'unknown'
114
115    #: Number of simultaneous downloads. Set to 1 if the server is picky.
116    #: Automatically reduced to 1 if :attr:`BasicNewsRecipe.delay` > 0
117    simultaneous_downloads = 5
118
119    #: Timeout for fetching files from server in seconds
120    timeout                = 120.0
121
122    #: The format string for the date shown on the first page.
123    #: By default: Day_Name, Day_Number Month_Name Year
124    timefmt                = ' [%a, %d %b %Y]'
125
126    #: List of feeds to download.
127    #: Can be either ``[url1, url2, ...]`` or ``[('title1', url1), ('title2', url2),...]``
128    feeds = None
129
130    #: Max number of characters in the short description
131    summary_length         = 500
132
133    #: Convenient flag to disable loading of stylesheets for websites
134    #: that have overly complex stylesheets unsuitable for conversion
135    #: to e-book formats.
136    #: If True stylesheets are not downloaded and processed
137    no_stylesheets         = False
138
139    #: Convenient flag to strip all JavaScript tags from the downloaded HTML
140    remove_javascript      = True
141
142    #: If True the GUI will ask the user for a username and password
143    #: to use while downloading.
144    #: If set to "optional" the use of a username and password becomes optional
145    needs_subscription     = False
146
147    #: If True the navigation bar is center aligned, otherwise it is left aligned
148    center_navbar = True
149
150    #: Specify an override encoding for sites that have an incorrect
151    #: charset specification. The most common being specifying ``latin1`` and
152    #: using ``cp1252``. If None, try to detect the encoding. If it is a
153    #: callable, the callable is called with two arguments: The recipe object
154    #: and the source to be decoded. It must return the decoded source.
155    encoding               = None
156
157    #: Normally we try to guess if a feed has full articles embedded in it
158    #: based on the length of the embedded content. If `None`, then the
159    #: default guessing is used. If `True` then the we always assume the feeds has
160    #: embedded content and if `False` we always assume the feed does not have
161    #: embedded content.
162    use_embedded_content   = None
163
164    #: Set to True and implement :meth:`get_obfuscated_article` to handle
165    #: websites that try to make it difficult to scrape content.
166    articles_are_obfuscated = False
167
168    #: Reverse the order of articles in each feed
169    reverse_article_order = False
170
171    #: Automatically extract all the text from downloaded article pages. Uses
172    #: the algorithms from the readability project. Setting this to True, means
173    #: that you do not have to worry about cleaning up the downloaded HTML
174    #: manually (though manual cleanup will always be superior).
175    auto_cleanup = False
176
177    #: Specify elements that the auto cleanup algorithm should never remove.
178    #: The syntax is a XPath expression. For example::
179    #:
180    #:   auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with
181    #:                                                  id="article-image"
182    #:   auto_cleanup_keep = '//*[@class="important"]' will keep all elements
183    #:                                               with class="important"
184    #:   auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
185    #:                     will keep all divs with id="article-image" and spans
186    #:                     with class="important"
187    #:
188    auto_cleanup_keep = None
189
190    #: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files.
191    #: It will be inserted into `<style>` tags, just before the closing
192    #: `</head>` tag thereby overriding all :term:`CSS` except that which is
193    #: declared using the style attribute on individual :term:`HTML` tags.
194    #: Note that if you want to programmatically generate the extra_css override
195    #: the :meth:`get_extra_css()` method instead.
196    #: For example::
197    #:
198    #:     extra_css = '.heading { font: serif x-large }'
199    #:
200    extra_css              = None
201
202    #: If True empty feeds are removed from the output.
203    #: This option has no effect if parse_index is overridden in
204    #: the sub class. It is meant only for recipes that return a list
205    #: of feeds using `feeds` or :meth:`get_feeds`. It is also used if you use
206    #: the ignore_duplicate_articles option.
207    remove_empty_feeds = False
208
209    #: List of regular expressions that determines which links to follow.
210    #: If empty, it is ignored. Used only if is_link_wanted is
211    #: not implemented. For example::
212    #:
213    #:     match_regexps = [r'page=[0-9]+']
214    #:
215    #: will match all URLs that have `page=some number` in them.
216    #:
217    #: Only one of :attr:`BasicNewsRecipe.match_regexps` or
218    #: :attr:`BasicNewsRecipe.filter_regexps` should be defined.
219    match_regexps         = []
220
221    #: List of regular expressions that determines which links to ignore.
222    #: If empty it is ignored. Used only if is_link_wanted is not
223    #: implemented. For example::
224    #:
225    #:     filter_regexps = [r'ads\.doubleclick\.net']
226    #:
227    #: will remove all URLs that have `ads.doubleclick.net` in them.
228    #:
229    #: Only one of :attr:`BasicNewsRecipe.match_regexps` or
230    #: :attr:`BasicNewsRecipe.filter_regexps` should be defined.
231    filter_regexps        = []
232
233    #: Recipe specific options to control the conversion of the downloaded
234    #: content into an e-book. These will override any user or plugin specified
235    #: values, so only use if absolutely necessary. For example::
236    #:
237    #:   conversion_options = {
238    #:     'base_font_size'   : 16,
239    #:     'linearize_tables' : True,
240    #:   }
241    #:
242    conversion_options = {}
243
244    #: List of tags to be removed. Specified tags are removed from downloaded HTML.
245    #: A tag is specified as a dictionary of the form::
246    #:
247    #:    {
248    #:     name      : 'tag name',   #e.g. 'div'
249    #:     attrs     : a dictionary, #e.g. {'class': 'advertisment'}
250    #:    }
251    #:
252    #: All keys are optional. For a full explanation of the search criteria, see
253    #: `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#searching-the-tree>`__
254    #: A common example::
255    #:
256    #:   remove_tags = [dict(name='div', class_='advert')]
257    #:
258    #: This will remove all `<div class="advert">` tags and all
259    #: their children from the downloaded :term:`HTML`.
260    remove_tags           = []
261
262    #: Remove all tags that occur after the specified tag.
263    #: For the format for specifying a tag see :attr:`BasicNewsRecipe.remove_tags`.
264    #: For example::
265    #:
266    #:     remove_tags_after = [dict(id='content')]
267    #:
268    #: will remove all
269    #: tags after the first element with `id="content"`.
270    remove_tags_after     = None
271
272    #: Remove all tags that occur before the specified tag.
273    #: For the format for specifying a tag see :attr:`BasicNewsRecipe.remove_tags`.
274    #: For example::
275    #:
276    #:     remove_tags_before = dict(id='content')
277    #:
278    #: will remove all
279    #: tags before the first element with `id="content"`.
280    remove_tags_before    = None
281
282    #: List of attributes to remove from all tags.
283    #: For example::
284    #:
285    #:   remove_attributes = ['style', 'font']
286    remove_attributes = []
287
288    #: Keep only the specified tags and their children.
289    #: For the format for specifying a tag see :attr:`BasicNewsRecipe.remove_tags`.
290    #: If this list is not empty, then the `<body>` tag will be emptied and re-filled with
291    #: the tags that match the entries in this list. For example::
292    #:
293    #:     keep_only_tags = [dict(id=['content', 'heading'])]
294    #:
295    #: will keep only tags that have an `id` attribute of `"content"` or `"heading"`.
296    keep_only_tags        = []
297
298    #: List of :term:`regexp` substitution rules to run on the downloaded :term:`HTML`.
299    #: Each element of the
300    #: list should be a two element tuple. The first element of the tuple should
301    #: be a compiled regular expression and the second a callable that takes
302    #: a single match object and returns a string to replace the match. For example::
303    #:
304    #:     preprocess_regexps = [
305    #:        (re.compile(r'<!--Article ends here-->.*</body>', re.DOTALL|re.IGNORECASE),
306    #:         lambda match: '</body>'),
307    #:     ]
308    #:
309    #: will remove everything from `<!--Article ends here-->` to `</body>`.
310    preprocess_regexps    = []
311
312    #: The CSS that is used to style the templates, i.e., the navigation bars and
313    #: the Tables of Contents. Rather than overriding this variable, you should
314    #: use `extra_css` in your recipe to customize look and feel.
315    template_css = '''
316            .article_date {
317                color: gray; font-family: monospace;
318            }
319
320            .article_description {
321                text-indent: 0pt;
322            }
323
324            a.article {
325                font-weight: bold; text-align:left;
326            }
327
328            a.feed {
329                font-weight: bold;
330            }
331
332            .calibre_navbar {
333                font-family:monospace;
334            }
335    '''
336
337    #: By default, calibre will use a default image for the masthead (Kindle only).
338    #: Override this in your recipe to provide a url to use as a masthead.
339    masthead_url = None
340
341    #: By default, the cover image returned by get_cover_url() will be used as
342    #: the cover for the periodical.  Overriding this in your recipe instructs
343    #: calibre to render the downloaded cover into a frame whose width and height
344    #: are expressed as a percentage of the downloaded cover.
345    #: cover_margins = (10, 15, '#ffffff') pads the cover with a white margin
346    #: 10px on the left and right, 15px on the top and bottom.
347    #: Color names are defined `here <https://www.imagemagick.org/script/color.php>`_.
348    #: Note that for some reason, white does not always work in Windows. Use
349    #: #ffffff instead
350    cover_margins = (0, 0, '#ffffff')
351
352    #: Set to a non empty string to disable this recipe.
353    #: The string will be used as the disabled message
354    recipe_disabled = None
355
356    #: Ignore duplicates of articles that are present in more than one section.
357    #: A duplicate article is an article that has the same title and/or URL.
358    #: To ignore articles with the same title, set this to::
359    #:
360    #:   ignore_duplicate_articles = {'title'}
361    #:
362    #: To use URLs instead, set it to::
363    #:
364    #:   ignore_duplicate_articles = {'url'}
365    #:
366    #: To match on title or URL, set it to::
367    #:
368    #:   ignore_duplicate_articles = {'title', 'url'}
369    ignore_duplicate_articles = None
370
371    # The following parameters control how the recipe attempts to minimize
372    # JPEG image sizes
373
374    #: Set this to False to ignore all scaling and compression parameters and
375    #: pass images through unmodified. If True and the other compression
376    #: parameters are left at their default values, JPEG images will be scaled to fit
377    #: in the screen dimensions set by the output profile and compressed to size at
378    #: most (w * h)/16 where w x h are the scaled image dimensions.
379    compress_news_images = False
380
381    #: The factor used when auto compressing JPEG images. If set to None,
382    #: auto compression is disabled. Otherwise, the images will be reduced in size to
383    #: (w * h)/compress_news_images_auto_size bytes if possible by reducing
384    #: the quality level, where w x h are the image dimensions in pixels.
385    #: The minimum JPEG quality will be 5/100 so it is possible this constraint
386    #: will not be met.  This parameter can be overridden by the parameter
387    #: compress_news_images_max_size which provides a fixed maximum size for images.
388    #: Note that if you enable scale_news_images_to_device then the image will
389    #: first be scaled and then its quality lowered until its size is less than
390    #: (w * h)/factor where w and h are now the *scaled* image dimensions. In
391    #: other words, this compression happens after scaling.
392    compress_news_images_auto_size = 16
393
394    #: Set JPEG quality so images do not exceed the size given (in KBytes).
395    #: If set, this parameter overrides auto compression via compress_news_images_auto_size.
396    #: The minimum JPEG quality will be 5/100 so it is possible this constraint
397    #: will not be met.
398    compress_news_images_max_size = None
399
400    #: Rescale images to fit in the device screen dimensions set by the output profile.
401    #: Ignored if no output profile is set.
402    scale_news_images_to_device = True
403
404    #: Maximum dimensions (w,h) to scale images to. If scale_news_images_to_device is True
405    #: this is set to the device screen dimensions set by the output profile unless
406    #: there is no profile set, in which case it is left at whatever value it has been
407    #: assigned (default None).
408    scale_news_images = None
409
410    #: If set to True then links in downloaded articles that point to other downloaded articles are
411    #: changed to point to the downloaded copy of the article rather than its original web URL. If you
412    #: set this to True, you might also need to implement :meth:`canonicalize_internal_url` to work
413    #: with the URL scheme of your particular website.
414    resolve_internal_links = False
415
416    #: Set to False if you dont want to use gzipped transfers. Note that some old servers flake out with gzip
417    handle_gzip = True
418
419    # See the built-in recipes for examples of these settings.
420
421    def short_title(self):
422        return force_unicode(self.title, preferred_encoding)
423
424    def is_link_wanted(self, url, tag):
425        '''
426        Return True if the link should be followed or False otherwise. By
427        default, raises NotImplementedError which causes the downloader to
428        ignore it.
429
430        :param url: The URL to be followed
431        :param tag: The tag from which the URL was derived
432        '''
433        raise NotImplementedError()
434
435    def get_extra_css(self):
436        '''
437        By default returns `self.extra_css`. Override if you want to programmatically generate the
438        extra_css.
439        '''
440        return self.extra_css
441
442    def get_cover_url(self):
443        '''
444        Return a :term:`URL` to the cover image for this issue or `None`.
445        By default it returns the value of the member `self.cover_url` which
446        is normally `None`. If you want your recipe to download a cover for the e-book
447        override this method in your subclass, or set the member variable `self.cover_url`
448        before this method is called.
449        '''
450        return getattr(self, 'cover_url', None)
451
452    def get_masthead_url(self):
453        '''
454        Return a :term:`URL` to the masthead image for this issue or `None`.
455        By default it returns the value of the member `self.masthead_url` which
456        is normally `None`. If you want your recipe to download a masthead for the e-book
457        override this method in your subclass, or set the member variable `self.masthead_url`
458        before this method is called.
459        Masthead images are used in Kindle MOBI files.
460        '''
461        return getattr(self, 'masthead_url', None)
462
463    def get_feeds(self):
464        '''
465        Return a list of :term:`RSS` feeds to fetch for this profile. Each element of the list
466        must be a 2-element tuple of the form (title, url). If title is None or an
467        empty string, the title from the feed is used. This method is useful if your recipe
468        needs to do some processing to figure out the list of feeds to download. If
469        so, override in your subclass.
470        '''
471        if not self.feeds:
472            raise NotImplementedError()
473        if self.test:
474            return self.feeds[:self.test[0]]
475        return self.feeds
476
477    @classmethod
478    def print_version(cls, url):
479        '''
480        Take a `url` pointing to the webpage with article content and return the
481        :term:`URL` pointing to the print version of the article. By default does
482        nothing. For example::
483
484            def print_version(self, url):
485                return url + '?&pagewanted=print'
486
487        '''
488        raise NotImplementedError()
489
490    @classmethod
491    def image_url_processor(cls, baseurl, url):
492        '''
493        Perform some processing on image urls (perhaps removing size restrictions for
494        dynamically generated images, etc.) and return the precessed URL.
495        '''
496        return url
497
498    def preprocess_image(self, img_data, image_url):
499        '''
500        Perform some processing on downloaded image data. This is called on the raw
501        data before any resizing is done. Must return the processed raw data. Return
502        None to skip the image.
503        '''
504        return img_data
505
506    def get_browser(self, *args, **kwargs):
507        '''
508        Return a browser instance used to fetch documents from the web. By default
509        it returns a `mechanize <https://mechanize.readthedocs.io/en/latest/>`_
510        browser instance that supports cookies, ignores robots.txt, handles
511        refreshes and has a mozilla firefox user agent.
512
513        If your recipe requires that you login first, override this method
514        in your subclass. For example, the following code is used in the New York
515        Times recipe to login for full access::
516
517            def get_browser(self):
518                br = BasicNewsRecipe.get_browser(self)
519                if self.username is not None and self.password is not None:
520                    br.open('https://www.nytimes.com/auth/login')
521                    br.select_form(name='login')
522                    br['USERID']   = self.username
523                    br['PASSWORD'] = self.password
524                    br.submit()
525                return br
526
527        '''
528        if 'user_agent' not in kwargs:
529            # More and more news sites are serving JPEG XR images to IE
530            ua = getattr(self, 'last_used_user_agent', None) or self.calibre_most_common_ua or random_user_agent(allow_ie=False)
531            kwargs['user_agent'] = self.last_used_user_agent = ua
532        self.log('Using user agent:', kwargs['user_agent'])
533        br = browser(*args, **kwargs)
534        br.addheaders += [('Accept', '*/*')]
535        if self.handle_gzip:
536            br.set_handle_gzip(True)
537        return br
538
539    def clone_browser(self, br):
540        '''
541        Clone the browser br. Cloned browsers are used for multi-threaded
542        downloads, since mechanize is not thread safe. The default cloning
543        routines should capture most browser customization, but if you do
544        something exotic in your recipe, you should override this method in
545        your recipe and clone manually.
546
547        Cloned browser instances use the same, thread-safe CookieJar by
548        default, unless you have customized cookie handling.
549        '''
550        if callable(getattr(br, 'clone_browser', None)):
551            return br.clone_browser()
552
553        # Uh-oh recipe using something exotic, call get_browser
554        return self.get_browser()
555
556    @property
557    def cloned_browser(self):
558        if hasattr(self.get_browser, 'is_base_class_implementation'):
559            # We are using the default get_browser, which means no need to
560            # clone
561            br = BasicNewsRecipe.get_browser(self)
562        else:
563            br = self.clone_browser(self.browser)
564        return br
565
566    def get_article_url(self, article):
567        '''
568        Override in a subclass to customize extraction of the :term:`URL` that points
569        to the content for each article. Return the
570        article URL. It is called with `article`, an object representing a parsed article
571        from a feed. See `feedparser <https://pythonhosted.org/feedparser/>`_.
572        By default it looks for the original link (for feeds syndicated via a
573        service like feedburner or pheedo) and if found,
574        returns that or else returns
575        `article.link <https://pythonhosted.org/feedparser/reference-entry-link.html>`_.
576        '''
577        for key in article.keys():
578            if key.endswith('_origlink'):
579                url = article[key]
580                if url and (url.startswith('http://') or url.startswith('https://')):
581                    return url
582        ans = article.get('link', None)
583        if not ans and getattr(article, 'links', None):
584            for item in article.links:
585                if item.get('rel', 'alternate') == 'alternate':
586                    ans = item['href']
587                    break
588        return ans
589
590    def skip_ad_pages(self, soup):
591        '''
592        This method is called with the source of each downloaded :term:`HTML` file, before
593        any of the cleanup attributes like remove_tags, keep_only_tags are
594        applied. Note that preprocess_regexps will have already been applied.
595        It is meant to allow the recipe to skip ad pages. If the soup represents
596        an ad page, return the HTML of the real page. Otherwise return
597        None.
598
599        `soup`: A `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`__
600        instance containing the downloaded :term:`HTML`.
601        '''
602        return None
603
604    def abort_article(self, msg=None):
605        ''' Call this method inside any of the preprocess methods to abort the
606        download for the current article. Useful to skip articles that contain
607        inappropriate content, such as pure video articles. '''
608        raise AbortArticle(msg or _('Article download aborted'))
609
610    def preprocess_raw_html(self, raw_html, url):
611        '''
612        This method is called with the source of each downloaded :term:`HTML` file, before
613        it is parsed into an object tree. raw_html is a unicode string
614        representing the raw HTML downloaded from the web. url is the URL from
615        which the HTML was downloaded.
616
617        Note that this method acts *before* preprocess_regexps.
618
619        This method must return the processed raw_html as a unicode object.
620        '''
621        return raw_html
622
623    def preprocess_raw_html_(self, raw_html, url):
624        raw_html = self.preprocess_raw_html(raw_html, url)
625        if self.auto_cleanup:
626            try:
627                raw_html = self.extract_readable_article(raw_html, url)
628            except:
629                self.log.exception('Auto cleanup of URL: %r failed'%url)
630
631        return raw_html
632
633    def preprocess_html(self, soup):
634        '''
635        This method is called with the source of each downloaded :term:`HTML` file, before
636        it is parsed for links and images. It is called after the cleanup as
637        specified by remove_tags etc.
638        It can be used to do arbitrarily powerful pre-processing on the :term:`HTML`.
639        It should return `soup` after processing it.
640
641        `soup`: A `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`__
642        instance containing the downloaded :term:`HTML`.
643        '''
644        return soup
645
646    def postprocess_html(self, soup, first_fetch):
647        '''
648        This method is called with the source of each downloaded :term:`HTML` file, after
649        it is parsed for links and images.
650        It can be used to do arbitrarily powerful post-processing on the :term:`HTML`.
651        It should return `soup` after processing it.
652
653        :param soup: A `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`__  instance containing the downloaded :term:`HTML`.
654        :param first_fetch: True if this is the first page of an article.
655
656        '''
657        return soup
658
659    def cleanup(self):
660        '''
661        Called after all articles have been download. Use it to do any cleanup like
662        logging out of subscription sites, etc.
663        '''
664        pass
665
666    def canonicalize_internal_url(self, url, is_link=True):
667        '''
668        Return a set of canonical representations of ``url``.  The default
669        implementation uses just the server hostname and path of the URL,
670        ignoring any query parameters, fragments, etc. The canonical
671        representations must be unique across all URLs for this news source. If
672        they are not, then internal links may be resolved incorrectly.
673
674        :param is_link: Is True if the URL is coming from an internal link in
675                        an HTML file. False if the URL is the URL used to
676                        download an article.
677        '''
678        try:
679            parts = urlparse(url)
680        except Exception:
681            self.log.error('Failed to parse url: %r, ignoring' % url)
682            return frozenset()
683        nl = parts.netloc
684        path = parts.path or ''
685        if isinstance(nl, bytes):
686            nl = nl.decode('utf-8', 'replace')
687        if isinstance(path, bytes):
688            path = path.decode('utf-8', 'replace')
689        return frozenset({(nl, path.rstrip('/'))})
690
691    def index_to_soup(self, url_or_raw, raw=False, as_tree=False, save_raw=None):
692        '''
693        Convenience method that takes an URL to the index page and returns
694        a `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc>`__
695        of it.
696
697        `url_or_raw`: Either a URL or the downloaded index page as a string
698        '''
699        if re.match((br'\w+://' if isinstance(url_or_raw, bytes) else r'\w+://'), url_or_raw):
700            # We may be called in a thread (in the skip_ad_pages method), so
701            # clone the browser to be safe. We cannot use self.cloned_browser
702            # as it may or may not actually clone the browser, depending on if
703            # the recipe implements get_browser() or not
704            br = self.clone_browser(self.browser)
705            open_func = getattr(br, 'open_novisit', br.open)
706            with closing(open_func(url_or_raw, timeout=self.timeout)) as f:
707                _raw = f.read()
708            if not _raw:
709                raise RuntimeError('Could not fetch index from %s'%url_or_raw)
710        else:
711            _raw = url_or_raw
712        if raw:
713            return _raw
714        if not isinstance(_raw, str) and self.encoding:
715            if callable(self.encoding):
716                _raw = self.encoding(_raw)
717            else:
718                _raw = _raw.decode(self.encoding, 'replace')
719        from calibre.ebooks.chardet import (
720            strip_encoding_declarations, xml_to_unicode
721        )
722        from calibre.utils.cleantext import clean_xml_chars
723        if isinstance(_raw, str):
724            _raw = strip_encoding_declarations(_raw)
725        else:
726            _raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0]
727        _raw = clean_xml_chars(_raw)
728        if save_raw:
729            with lopen(save_raw, 'wb') as f:
730                f.write(_raw.encode('utf-8'))
731        if as_tree:
732            from html5_parser import parse
733            return parse(_raw)
734        return BeautifulSoup(_raw)
735
736    def extract_readable_article(self, html, url):
737        '''
738        Extracts main article content from 'html', cleans up and returns as a (article_html, extracted_title) tuple.
739        Based on the original readability algorithm by Arc90.
740        '''
741        from lxml.html import document_fromstring, fragment_fromstring, tostring
742
743        from calibre.ebooks.readability import readability
744
745        doc = readability.Document(html, self.log, url=url,
746                keep_elements=self.auto_cleanup_keep)
747        article_html = doc.summary()
748        extracted_title = doc.title()
749
750        try:
751            frag = fragment_fromstring(article_html)
752        except:
753            doc = document_fromstring(article_html)
754            frag = doc.xpath('//body')[-1]
755        if frag.tag == 'html':
756            root = frag
757        elif frag.tag == 'body':
758            root = document_fromstring(
759                '<html><head><title>%s</title></head></html>' %
760                extracted_title)
761            root.append(frag)
762        else:
763            root = document_fromstring(
764                '<html><head><title>%s</title></head><body/></html>' %
765                extracted_title)
766            root.xpath('//body')[0].append(frag)
767
768        body = root.xpath('//body')[0]
769        has_title = False
770        for x in body.iterdescendants():
771            if x.text == extracted_title:
772                has_title = True
773        inline_titles = body.xpath('//h1|//h2')
774        if not has_title and not inline_titles:
775            heading = body.makeelement('h2')
776            heading.text = extracted_title
777            body.insert(0, heading)
778
779        raw_html = tostring(root, encoding='unicode')
780
781        return raw_html
782
783    def sort_index_by(self, index, weights):
784        '''
785        Convenience method to sort the titles in `index` according to `weights`.
786        `index` is sorted in place. Returns `index`.
787
788        `index`: A list of titles.
789
790        `weights`: A dictionary that maps weights to titles. If any titles
791        in index are not in weights, they are assumed to have a weight of 0.
792        '''
793        weights = defaultdict(lambda: 0, weights)
794        index.sort(key=lambda x: weights[x])
795        return index
796
797    def parse_index(self):
798        '''
799        This method should be implemented in recipes that parse a website
800        instead of feeds to generate a list of articles. Typical uses are for
801        news sources that have a "Print Edition" webpage that lists all the
802        articles in the current print edition. If this function is implemented,
803        it will be used in preference to :meth:`BasicNewsRecipe.parse_feeds`.
804
805        It must return a list. Each element of the list must be a 2-element tuple
806        of the form ``('feed title', list of articles)``.
807
808        Each list of articles must contain dictionaries of the form::
809
810            {
811            'title'       : article title,
812            'url'         : URL of print version,
813            'date'        : The publication date of the article as a string,
814            'description' : A summary of the article
815            'content'     : The full article (can be an empty string). Obsolete
816                            do not use, instead save the content to a temporary
817                            file and pass a file:///path/to/temp/file.html as
818                            the URL.
819            }
820
821        For an example, see the recipe for downloading `The Atlantic`.
822        In addition, you can add 'author' for the author of the article.
823
824        If you want to abort processing for some reason and have
825        calibre show the user a simple message instead of an error, call
826        :meth:`abort_recipe_processing`.
827        '''
828        raise NotImplementedError()
829
830    def abort_recipe_processing(self, msg):
831        '''
832        Causes the recipe download system to abort the download of this recipe,
833        displaying a simple feedback message to the user.
834        '''
835        from calibre.ebooks.conversion import ConversionUserFeedBack
836        raise ConversionUserFeedBack(_('Failed to download %s')%self.title,
837                msg)
838
839    def get_obfuscated_article(self, url):
840        '''
841        If you set `articles_are_obfuscated` this method is called with
842        every article URL. It should return the path to a file on the filesystem
843        that contains the article HTML. That file is processed by the recursive
844        HTML fetching engine, so it can contain links to pages/images on the web.
845
846        This method is typically useful for sites that try to make it difficult to
847        access article content automatically.
848        '''
849        raise NotImplementedError()
850
851    def add_toc_thumbnail(self, article, src):
852        '''
853        Call this from populate_article_metadata with the src attribute of an
854        <img> tag from the article that is appropriate for use as the thumbnail
855        representing the article in the Table of Contents. Whether the
856        thumbnail is actually used is device dependent (currently only used by
857        the Kindles). Note that the referenced image must be one that was
858        successfully downloaded, otherwise it will be ignored.
859        '''
860        if not src or not hasattr(article, 'toc_thumbnail'):
861            return
862
863        src = src.replace('\\', '/')
864        if re.search(r'feed_\d+/article_\d+/images/img', src, flags=re.I) is None:
865            self.log.warn('Ignoring invalid TOC thumbnail image: %r'%src)
866            return
867        article.toc_thumbnail = re.sub(r'^.*?feed', 'feed',
868                src, flags=re.IGNORECASE)
869
870    def populate_article_metadata(self, article, soup, first):
871        '''
872        Called when each HTML page belonging to article is downloaded.
873        Intended to be used to get article metadata like author/summary/etc.
874        from the parsed HTML (soup).
875
876        :param article: A object of class :class:`calibre.web.feeds.Article`.
877            If you change the summary, remember to also change the text_summary
878        :param soup: Parsed HTML belonging to this article
879        :param first: True iff the parsed HTML is the first page of the article.
880        '''
881        pass
882
883    def postprocess_book(self, oeb, opts, log):
884        '''
885        Run any needed post processing on the parsed downloaded e-book.
886
887        :param oeb: An OEBBook object
888        :param opts: Conversion options
889        '''
890        pass
891
892    def __init__(self, options, log, progress_reporter):
893        '''
894        Initialize the recipe.
895        :param options: Parsed commandline options
896        :param log:  Logging object
897        :param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
898        '''
899        self.log = ThreadSafeWrapper(log)
900        if not isinstance(self.title, str):
901            self.title = str(self.title, 'utf-8', 'replace')
902
903        self.debug = options.verbose > 1
904        self.output_dir = os.path.abspath(os.getcwd())
905        self.verbose = options.verbose
906        self.test = options.test
907        if self.test and not isinstance(self.test, tuple):
908            self.test = (2, 2)
909        self.username = options.username
910        self.password = options.password
911        self.lrf = options.lrf
912        self.output_profile = options.output_profile
913        self.touchscreen = getattr(self.output_profile, 'touchscreen', False)
914        if self.touchscreen:
915            self.template_css += self.output_profile.touchscreen_news_css
916
917        if self.test:
918            self.max_articles_per_feed = self.test[1]
919            self.simultaneous_downloads = min(4, self.simultaneous_downloads)
920
921        if self.debug:
922            self.verbose = True
923        self.report_progress = progress_reporter
924
925        if self.needs_subscription and (
926                self.username is None or self.password is None or (
927                    not self.username and not self.password)):
928            if self.needs_subscription != 'optional':
929                raise ValueError(_('The "%s" recipe needs a username and password.')%self.title)
930
931        self.browser = self.get_browser()
932        self.image_map, self.image_counter = {}, 1
933        self.css_map = {}
934
935        web2disk_cmdline = ['web2disk',
936            '--timeout', str(self.timeout),
937            '--max-recursions', str(self.recursions),
938            '--delay', str(self.delay),
939            ]
940
941        if self.verbose:
942            web2disk_cmdline.append('--verbose')
943
944        if self.no_stylesheets:
945            web2disk_cmdline.append('--dont-download-stylesheets')
946
947        for reg in self.match_regexps:
948            web2disk_cmdline.extend(['--match-regexp', reg])
949
950        for reg in self.filter_regexps:
951            web2disk_cmdline.extend(['--filter-regexp', reg])
952
953        if options.output_profile.short_name in ('default', 'tablet'):
954            self.scale_news_images_to_device = False
955        elif self.scale_news_images_to_device:
956            self.scale_news_images = options.output_profile.screen_size
957
958        self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
959        for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
960                      'skip_ad_pages', 'preprocess_html', 'remove_tags_after',
961                      'remove_tags_before', 'is_link_wanted',
962                      'compress_news_images', 'compress_news_images_max_size',
963                      'compress_news_images_auto_size', 'scale_news_images'):
964            setattr(self.web2disk_options, extra, getattr(self, extra))
965
966        self.web2disk_options.postprocess_html = self._postprocess_html
967        self.web2disk_options.preprocess_image = self.preprocess_image
968        self.web2disk_options.encoding = self.encoding
969        self.web2disk_options.preprocess_raw_html = self.preprocess_raw_html_
970
971        if self.delay > 0:
972            self.simultaneous_downloads = 1
973
974        self.navbar = templates.TouchscreenNavBarTemplate() if self.touchscreen else \
975                      templates.NavBarTemplate()
976        self.failed_downloads = []
977        self.partial_failures = []
978
979    def _postprocess_html(self, soup, first_fetch, job_info):
980        if self.no_stylesheets:
981            for link in soup.findAll('link'):
982                if (link.get('type') or 'text/css').lower() == 'text/css' and 'stylesheet' in (link.get('rel') or ('stylesheet',)):
983                    link.extract()
984            for style in soup.findAll('style'):
985                style.extract()
986        head = soup.find('head')
987        if not head:
988            head = soup.find('body')
989        if not head:
990            head = soup.find(True)
991        css = self.template_css + '\n\n' + (self.get_extra_css() or '')
992        style = soup.new_tag('style', type='text/css', title='override_css')
993        style.append(css)
994        head.append(style)
995        if first_fetch and job_info:
996            url, f, a, feed_len = job_info
997            body = soup.find('body')
998            if body is not None:
999                templ = self.navbar.generate(False, f, a, feed_len,
1000                                             not self.has_single_feed,
1001                                             url, __appname__,
1002                                             center=self.center_navbar,
1003                                             extra_css=self.get_extra_css() or '')
1004                elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
1005                body.insert(0, elem)
1006                # This is needed because otherwise inserting elements into
1007                # the soup breaks find()
1008                soup = BeautifulSoup(soup.decode_contents())
1009        if self.remove_javascript:
1010            for script in list(soup.findAll('script')):
1011                script.extract()
1012            for o in soup.findAll(onload=True):
1013                del o['onload']
1014
1015        for attr in self.remove_attributes:
1016            for x in soup.findAll(attrs={attr:True}):
1017                del x[attr]
1018        for bad_tag in list(soup.findAll(['base', 'iframe', 'canvas', 'embed',
1019            'command', 'datalist', 'video', 'audio', 'noscript', 'link', 'meta'])):
1020            # link tags can be used for preloading causing network activity in
1021            # calibre viewer. meta tags can do all sorts of crazy things,
1022            # including http-equiv refresh, viewport shenanigans, etc.
1023            bad_tag.extract()
1024        # srcset causes some viewers, like calibre's to load images from the
1025        # web, and it also possible causes iBooks on iOS to barf, see
1026        # https://bugs.launchpad.net/bugs/1713986
1027        for img in soup.findAll('img', srcset=True):
1028            del img['srcset']
1029
1030        ans = self.postprocess_html(soup, first_fetch)
1031
1032        # Nuke HTML5 tags
1033        for x in ans.findAll(['article', 'aside', 'header', 'footer', 'nav',
1034            'figcaption', 'figure', 'section']):
1035            x.name = 'div'
1036
1037        if job_info:
1038            url, f, a, feed_len = job_info
1039            try:
1040                article = self.feed_objects[f].articles[a]
1041            except:
1042                self.log.exception('Failed to get article object for postprocessing')
1043                pass
1044            else:
1045                self.populate_article_metadata(article, ans, first_fetch)
1046        return ans
1047
1048    def download(self):
1049        '''
1050        Download and pre-process all articles from the feeds in this recipe.
1051        This method should be called only once on a particular Recipe instance.
1052        Calling it more than once will lead to undefined behavior.
1053        :return: Path to index.html
1054        '''
1055        try:
1056            res = self.build_index()
1057            self.report_progress(1, _('Download finished'))
1058            if self.failed_downloads:
1059                self.log.warning(_('Failed to download the following articles:'))
1060                for feed, article, debug in self.failed_downloads:
1061                    self.log.warning(article.title, 'from', feed.title)
1062                    self.log.debug(article.url)
1063                    self.log.debug(debug)
1064            if self.partial_failures:
1065                self.log.warning(_('Failed to download parts of the following articles:'))
1066                for feed, atitle, aurl, debug in self.partial_failures:
1067                    self.log.warning(atitle + _(' from ') + feed)
1068                    self.log.debug(aurl)
1069                    self.log.warning(_('\tFailed links:'))
1070                    for l, tb in debug:
1071                        self.log.warning(l)
1072                        self.log.debug(tb)
1073            return res
1074        finally:
1075            self.cleanup()
1076
1077    @property
1078    def lang_for_html(self):
1079        try:
1080            lang = self.language.replace('_', '-').partition('-')[0].lower()
1081            if lang == 'und':
1082                lang = None
1083        except:
1084            lang = None
1085        return lang
1086
1087    def feeds2index(self, feeds):
1088        templ = (templates.TouchscreenIndexTemplate if self.touchscreen else
1089                templates.IndexTemplate)
1090        templ = templ(lang=self.lang_for_html)
1091        css = self.template_css + '\n\n' +(self.get_extra_css() or '')
1092        timefmt = self.timefmt
1093        return templ.generate(self.title, "mastheadImage.jpg", timefmt, feeds,
1094                              extra_css=css).render(doctype='xhtml')
1095
1096    @classmethod
1097    def description_limiter(cls, src):
1098        if not src:
1099            return ''
1100        src = force_unicode(src, 'utf-8')
1101        pos = cls.summary_length
1102        fuzz = 50
1103        si = src.find(';', pos)
1104        if si > 0 and si-pos > fuzz:
1105            si = -1
1106        gi = src.find('>', pos)
1107        if gi > 0 and gi-pos > fuzz:
1108            gi = -1
1109        npos = max(si, gi)
1110        if npos < 0:
1111            npos = pos
1112        ans = src[:npos+1]
1113        if len(ans) < len(src):
1114            from calibre.utils.cleantext import clean_xml_chars
1115
1116            # Truncating the string could cause a dangling UTF-16 half-surrogate, which will cause lxml to barf, clean it
1117            ans = clean_xml_chars(ans) + '\u2026'
1118        return ans
1119
1120    def feed2index(self, f, feeds):
1121        feed = feeds[f]
1122        if feed.image_url is not None:  # Download feed image
1123            imgdir = os.path.join(self.output_dir, 'images')
1124            if not os.path.isdir(imgdir):
1125                os.makedirs(imgdir)
1126
1127            if feed.image_url in self.image_map:
1128                feed.image_url = self.image_map[feed.image_url]
1129            else:
1130                bn = urlsplit(feed.image_url).path
1131                if bn:
1132                    bn = bn.rpartition('/')[-1]
1133                    if bn:
1134                        img = os.path.join(imgdir, 'feed_image_%d%s'%(self.image_counter, os.path.splitext(bn)))
1135                        try:
1136                            with open(img, 'wb') as fi, closing(self.browser.open(feed.image_url, timeout=self.timeout)) as r:
1137                                fi.write(r.read())
1138                            self.image_counter += 1
1139                            feed.image_url = img
1140                            self.image_map[feed.image_url] = img
1141                        except:
1142                            pass
1143            if isinstance(feed.image_url, bytes):
1144                feed.image_url = feed.image_url.decode(sys.getfilesystemencoding(), 'strict')
1145
1146        templ = (templates.TouchscreenFeedTemplate if self.touchscreen else
1147                    templates.FeedTemplate)
1148        templ = templ(lang=self.lang_for_html)
1149        css = self.template_css + '\n\n' +(self.get_extra_css() or '')
1150
1151        return templ.generate(f, feeds, self.description_limiter,
1152                              extra_css=css).render(doctype='xhtml')
1153
1154    def _fetch_article(self, url, dir_, f, a, num_of_feeds):
1155        br = self.browser
1156        if hasattr(self.get_browser, 'is_base_class_implementation'):
1157            # We are using the default get_browser, which means no need to
1158            # clone
1159            br = BasicNewsRecipe.get_browser(self)
1160        else:
1161            br = self.clone_browser(self.browser)
1162        self.web2disk_options.browser = br
1163        fetcher = RecursiveFetcher(self.web2disk_options, self.log,
1164                self.image_map, self.css_map,
1165                (url, f, a, num_of_feeds))
1166        fetcher.browser = br
1167        fetcher.base_dir = dir_
1168        fetcher.current_dir = dir_
1169        fetcher.show_progress = False
1170        fetcher.image_url_processor = self.image_url_processor
1171        res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
1172        if not res or not os.path.exists(res):
1173            msg = _('Could not fetch article.') + ' '
1174            if self.debug:
1175                msg += _('The debug traceback is available earlier in this log')
1176            else:
1177                msg += _('Run with -vv to see the reason')
1178            raise Exception(msg)
1179
1180        return res, path, failures
1181
1182    def fetch_article(self, url, dir, f, a, num_of_feeds):
1183        return self._fetch_article(url, dir, f, a, num_of_feeds)
1184
1185    def fetch_obfuscated_article(self, url, dir, f, a, num_of_feeds):
1186        path = os.path.abspath(self.get_obfuscated_article(url))
1187        url = ('file:'+path) if iswindows else ('file://'+path)
1188        return self._fetch_article(url, dir, f, a, num_of_feeds)
1189
1190    def fetch_embedded_article(self, article, dir, f, a, num_of_feeds):
1191        templ = templates.EmbeddedContent()
1192        raw = templ.generate(article).render('html')
1193        with PersistentTemporaryFile('_feeds2disk.html') as pt:
1194            pt.write(raw)
1195            url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
1196        return self._fetch_article(url, dir, f, a, num_of_feeds)
1197
1198    def remove_duplicate_articles(self, feeds):
1199        seen_keys = defaultdict(set)
1200        remove = []
1201        for f in feeds:
1202            for article in f:
1203                for key in self.ignore_duplicate_articles:
1204                    val = getattr(article, key)
1205                    seen = seen_keys[key]
1206                    if val:
1207                        if val in seen:
1208                            remove.append((f, article))
1209                        else:
1210                            seen.add(val)
1211
1212        for feed, article in remove:
1213            self.log.debug('Removing duplicate article: %s from section: %s'%(
1214                article.title, feed.title))
1215            feed.remove_article(article)
1216
1217        if self.remove_empty_feeds:
1218            feeds = [f for f in feeds if len(f) > 0]
1219        return feeds
1220
1221    def build_index(self):
1222        self.report_progress(0, _('Fetching feeds...'))
1223        feeds = None
1224        try:
1225            feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
1226                                     max_articles_per_feed=self.max_articles_per_feed,
1227                                     log=self.log)
1228            self.report_progress(0, _('Got feeds from index page'))
1229        except NotImplementedError:
1230            pass
1231
1232        if feeds is None:
1233            feeds = self.parse_feeds()
1234
1235        if not feeds:
1236            raise ValueError('No articles found, aborting')
1237
1238        if self.ignore_duplicate_articles is not None:
1239            feeds = self.remove_duplicate_articles(feeds)
1240
1241        self.report_progress(0, _('Trying to download cover...'))
1242        self.download_cover()
1243        self.report_progress(0, _('Generating masthead...'))
1244        self.resolve_masthead()
1245
1246        if self.test:
1247            feeds = feeds[:self.test[0]]
1248        self.has_single_feed = len(feeds) == 1
1249
1250        index = os.path.join(self.output_dir, 'index.html')
1251
1252        html = self.feeds2index(feeds)
1253        with open(index, 'wb') as fi:
1254            fi.write(html)
1255
1256        self.jobs = []
1257
1258        if self.reverse_article_order:
1259            for feed in feeds:
1260                if hasattr(feed, 'reverse'):
1261                    feed.reverse()
1262
1263        self.feed_objects = feeds
1264        for f, feed in enumerate(feeds):
1265            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
1266            if not os.path.isdir(feed_dir):
1267                os.makedirs(feed_dir)
1268
1269            for a, article in enumerate(feed):
1270                if a >= self.max_articles_per_feed:
1271                    break
1272                art_dir = os.path.join(feed_dir, 'article_%d'%a)
1273                if not os.path.isdir(art_dir):
1274                    os.makedirs(art_dir)
1275                try:
1276                    url = self.print_version(article.url)
1277                except NotImplementedError:
1278                    url = article.url
1279                except:
1280                    self.log.exception('Failed to find print version for: '+article.url)
1281                    url = None
1282                if not url:
1283                    continue
1284                func, arg = (self.fetch_embedded_article, article) \
1285                            if self.use_embedded_content or (self.use_embedded_content is None and feed.has_embedded_content()) \
1286                            else \
1287                            ((self.fetch_obfuscated_article if self.articles_are_obfuscated
1288                              else self.fetch_article), url)
1289                req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
1290                                      {}, (f, a), self.article_downloaded,
1291                                      self.error_in_article_download)
1292                req.feed = feed
1293                req.article = article
1294                req.feed_dir = feed_dir
1295                self.jobs.append(req)
1296
1297        self.jobs_done = 0
1298        tp = ThreadPool(self.simultaneous_downloads)
1299        for req in self.jobs:
1300            tp.putRequest(req, block=True, timeout=0)
1301
1302        self.report_progress(0, ngettext(
1303            'Starting download in a single thread...',
1304            'Starting download [{} threads]...', self.simultaneous_downloads).format(self.simultaneous_downloads))
1305        while True:
1306            try:
1307                tp.poll()
1308                time.sleep(0.1)
1309            except NoResultsPending:
1310                break
1311
1312        for f, feed in enumerate(feeds):
1313            html = self.feed2index(f,feeds)
1314            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
1315            with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
1316                fi.write(html)
1317        self.create_opf(feeds)
1318        self.report_progress(1, _('Feeds downloaded to %s')%index)
1319
1320        return index
1321
1322    def _download_cover(self):
1323        self.cover_path = None
1324        try:
1325            cu = self.get_cover_url()
1326        except Exception as err:
1327            self.log.error(_('Could not download cover: %s')%as_unicode(err))
1328            self.log.debug(traceback.format_exc())
1329        else:
1330            if not cu:
1331                return
1332            cdata = None
1333            if hasattr(cu, 'read'):
1334                cdata = cu.read()
1335                cu = getattr(cu, 'name', 'cover.jpg')
1336            elif os.access(cu, os.R_OK):
1337                with open(cu, 'rb') as f:
1338                    cdata = f.read()
1339            else:
1340                self.report_progress(1, _('Downloading cover from %s')%cu)
1341                with closing(self.browser.open(cu, timeout=self.timeout)) as r:
1342                    cdata = r.read()
1343            if not cdata:
1344                return
1345            ext = cu.split('/')[-1].rpartition('.')[-1].lower().strip()
1346            if ext == 'pdf':
1347                from calibre.ebooks.metadata.pdf import get_metadata
1348                stream = io.BytesIO(cdata)
1349                cdata = None
1350                mi = get_metadata(stream)
1351                if mi.cover_data and mi.cover_data[1]:
1352                    cdata = mi.cover_data[1]
1353            if not cdata:
1354                return
1355            if self.cover_margins[0] or self.cover_margins[1]:
1356                cdata = image_to_data(add_borders_to_image(cdata,
1357                            left=self.cover_margins[0],right=self.cover_margins[0],
1358                            top=self.cover_margins[1],bottom=self.cover_margins[1],
1359                            border_color=self.cover_margins[2]))
1360
1361            cpath = os.path.join(self.output_dir, 'cover.jpg')
1362            save_cover_data_to(cdata, cpath)
1363            self.cover_path = cpath
1364
1365    def download_cover(self):
1366        self.cover_path = None
1367        try:
1368            self._download_cover()
1369        except:
1370            self.log.exception('Failed to download cover')
1371            self.cover_path = None
1372
1373    def _download_masthead(self, mu):
1374        if hasattr(mu, 'rpartition'):
1375            ext = mu.rpartition('.')[-1]
1376            if '?' in ext:
1377                ext = ''
1378        else:
1379            ext = mu.name.rpartition('.')[-1]
1380        ext = ext.lower() if ext else 'jpg'
1381        mpath = os.path.join(self.output_dir, 'masthead_source.'+ext)
1382        outfile = os.path.join(self.output_dir, 'mastheadImage.jpg')
1383        if hasattr(mu, 'read'):
1384            with open(mpath, 'wb') as mfile:
1385                mfile.write(mu.read())
        elif os.access(mu, os.R_OK):
            with open(mpath, 'wb') as mfile, open(mu, 'rb') as src:
                mfile.write(src.read())
1389        else:
1390            with open(mpath, 'wb') as mfile, closing(self.browser.open(mu, timeout=self.timeout)) as r:
1391                mfile.write(r.read())
1392            self.report_progress(1, _('Masthead image downloaded'))
1393        self.prepare_masthead_image(mpath, outfile)
1394        self.masthead_path = outfile
1395        if os.path.exists(mpath):
1396            os.remove(mpath)
1397
1398    def download_masthead(self, url):
1399        try:
1400            self._download_masthead(url)
1401        except:
1402            self.log.exception("Failed to download supplied masthead_url")
1403
1404    def resolve_masthead(self):
1405        self.masthead_path = None
1406        try:
1407            murl = self.get_masthead_url()
1408        except:
1409            self.log.exception('Failed to get masthead url')
1410            murl = None
1411
1412        if murl is not None:
1413            # Try downloading the user-supplied masthead_url
1414            # Failure sets self.masthead_path to None
1415            self.download_masthead(murl)
1416        if self.masthead_path is None:
1417            self.log.info("Synthesizing mastheadImage")
1418            self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
1419            try:
1420                self.default_masthead_image(self.masthead_path)
1421            except:
1422                self.log.exception('Failed to generate default masthead image')
1423                self.masthead_path = None
1424
1425    def default_cover(self, cover_file):
1426        '''
1427        Create a generic cover for recipes that don't have a cover
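
        Recipes can override this to supply their own fallback cover. The
        method is given an open file object; write image data to it and return
        ``True`` on success, ``False`` on failure. A minimal sketch, assuming
        the recipe stores raw JPEG bytes in a hypothetical
        ``self.fallback_cover_data`` attribute::

            def default_cover(self, cover_file):
                try:
                    cover_file.write(self.fallback_cover_data)
                except Exception:
                    self.log.exception('Failed to write fallback cover')
                    return False
                return True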
1428        '''
1429        try:
1430            from calibre.ebooks.covers import create_cover
1431            title = self.title if isinstance(self.title, str) else \
1432                    self.title.decode(preferred_encoding, 'replace')
1433            date = strftime(self.timefmt).replace('[', '').replace(']', '')
1434            img_data = create_cover(title, [date])
1435            cover_file.write(img_data)
1436            cover_file.flush()
1437        except:
1438            self.log.exception('Failed to generate default cover')
1439            return False
1440        return True
1441
1442    def get_masthead_title(self):
1443        'Override in subclass to use something other than the recipe title'
1444        return self.title
1445
1446    MI_WIDTH = 600
1447    MI_HEIGHT = 60
1448
1449    def default_masthead_image(self, out_path):
1450        from calibre.ebooks import generate_masthead
1451        generate_masthead(self.get_masthead_title(), output_path=out_path,
1452                width=self.MI_WIDTH, height=self.MI_HEIGHT)
1453
1454    def prepare_masthead_image(self, path_to_image, out_path):
1455        prepare_masthead_image(path_to_image, out_path, self.MI_WIDTH, self.MI_HEIGHT)
1456
1457    def publication_date(self):
1458        return nowf()
1459
1460    def create_opf(self, feeds, dir=None):
1461        if dir is None:
1462            dir = self.output_dir
1463        title = self.short_title()
1464        if self.output_profile.periodical_date_in_title:
1465            title += strftime(self.timefmt)
1466        mi = MetaInformation(title, [__appname__])
1467        mi.publisher = __appname__
1468        mi.author_sort = __appname__
1469        if self.publication_type:
1470            mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
1471        mi.timestamp = nowf()
1472        article_titles, aseen = [], set()
1473        for f in feeds:
1474            for a in f:
1475                if a.title and a.title not in aseen:
1476                    aseen.add(a.title)
1477                    article_titles.append(force_unicode(a.title, 'utf-8'))
1478
1479        desc = self.description
1480        if not isinstance(desc, str):
1481            desc = desc.decode('utf-8', 'replace')
1482        mi.comments = (_('Articles in this issue:'
1483            ) + '\n\n' + '\n\n'.join(article_titles)) + '\n\n' + desc
1484
1485        language = canonicalize_lang(self.language)
1486        if language is not None:
1487            mi.language = language
1488        mi.pubdate = self.publication_date()
1489        opf_path = os.path.join(dir, 'index.opf')
1490        ncx_path = os.path.join(dir, 'index.ncx')
1491
1492        opf = OPFCreator(dir, mi)
1493        # Add mastheadImage entry to <guide> section
1494        mp = getattr(self, 'masthead_path', None)
1495        if mp is not None and os.access(mp, os.R_OK):
1496            from calibre.ebooks.metadata.opf2 import Guide
1497            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwd())
1498            ref.type = 'masthead'
1499            ref.title = 'Masthead Image'
1500            opf.guide.append(ref)
1501
1502        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
1503        manifest.append(os.path.join(dir, 'index.html'))
1504        manifest.append(os.path.join(dir, 'index.ncx'))
1505
1506        # Get cover
1507        cpath = getattr(self, 'cover_path', None)
1508        if cpath is None:
            with open(os.path.join(dir, 'cover.jpg'), 'wb') as pf:
                if self.default_cover(pf):
                    cpath = pf.name
1512        if cpath is not None and os.access(cpath, os.R_OK):
1513            opf.cover = cpath
1514            manifest.append(cpath)
1515
1516        # Get masthead
1517        mpath = getattr(self, 'masthead_path', None)
1518        if mpath is not None and os.access(mpath, os.R_OK):
1519            manifest.append(mpath)
1520
1521        opf.create_manifest_from_files_in(manifest)
1522        for mani in opf.manifest:
1523            if mani.path.endswith('.ncx'):
1524                mani.id = 'ncx'
1525            if mani.path.endswith('mastheadImage.jpg'):
1526                mani.id = 'masthead-image'
1527
1528        entries = ['index.html']
1529        toc = TOC(base_path=dir)
1530        self.play_order_counter = 0
1531        self.play_order_map = {}
1532
1533        self.article_url_map = aumap = defaultdict(set)
1534
1535        def feed_index(num, parent):
1536            f = feeds[num]
1537            for j, a in enumerate(f):
1538                if getattr(a, 'downloaded', False):
1539                    adir = 'feed_%d/article_%d/'%(num, j)
1540                    auth = a.author
1541                    if not auth:
1542                        auth = None
1543                    desc = a.text_summary
1544                    if not desc:
1545                        desc = None
1546                    else:
1547                        desc = self.description_limiter(desc)
1548                    tt = a.toc_thumbnail if a.toc_thumbnail else None
1549                    entries.append('%sindex.html'%adir)
1550                    po = self.play_order_map.get(entries[-1], None)
1551                    if po is None:
1552                        self.play_order_counter += 1
1553                        po = self.play_order_counter
1554                    arelpath = '%sindex.html'%adir
1555                    for curl in self.canonicalize_internal_url(a.orig_url, is_link=False):
1556                        aumap[curl].add(arelpath)
1557                    article_toc_entry = parent.add_item(arelpath, None,
1558                            a.title if a.title else _('Untitled article'),
1559                            play_order=po, author=auth,
1560                            description=desc, toc_thumbnail=tt)
1561                    for entry in a.internal_toc_entries:
1562                        anchor = entry.get('anchor')
1563                        if anchor:
1564                            self.play_order_counter += 1
1565                            po += 1
1566                            article_toc_entry.add_item(
1567                                arelpath, entry['anchor'], entry['title'] or _('Unknown section'),
1568                                play_order=po
1569                            )
1570                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
1571                    for sp in a.sub_pages:
1572                        prefix = os.path.commonprefix([opf_path, sp])
1573                        relp = sp[len(prefix):]
1574                        entries.append(relp.replace(os.sep, '/'))
1575                        last = sp
1576
1577                    if os.path.exists(last):
1578                        with open(last, 'rb') as fi:
1579                            src = fi.read().decode('utf-8')
1580                        soup = BeautifulSoup(src)
1581                        body = soup.find('body')
1582                        if body is not None:
                            prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
1584                            templ = self.navbar.generate(True, num, j, len(f),
1585                                            not self.has_single_feed,
1586                                            a.orig_url, __appname__, prefix=prefix,
1587                                            center=self.center_navbar)
1588                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
1589                            body.insert(len(body.contents), elem)
1590                            with open(last, 'wb') as fi:
1591                                fi.write(str(soup).encode('utf-8'))
1592        if len(feeds) == 0:
1593            raise Exception('All feeds are empty, aborting.')
1594
1595        if len(feeds) > 1:
1596            for i, f in enumerate(feeds):
1597                entries.append('feed_%d/index.html'%i)
1598                po = self.play_order_map.get(entries[-1], None)
1599                if po is None:
1600                    self.play_order_counter += 1
1601                    po = self.play_order_counter
1602                auth = getattr(f, 'author', None)
1603                if not auth:
1604                    auth = None
1605                desc = getattr(f, 'description', None)
1606                if not desc:
1607                    desc = None
1608                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
1609                    f.title, play_order=po, description=desc, author=auth))
1610
1611        else:
1612            entries.append('feed_%d/index.html'%0)
1613            feed_index(0, toc)
1614
1615        for i, p in enumerate(entries):
1616            entries[i] = os.path.join(dir, p.replace('/', os.sep))
1617        opf.create_spine(entries)
1618        opf.set_toc(toc)
1619
1620        with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file:
1621            opf.render(opf_file, ncx_file)
1622
1623    def article_downloaded(self, request, result):
1624        index = os.path.join(os.path.dirname(result[0]), 'index.html')
1625        if index != result[0]:
1626            if os.path.exists(index):
1627                os.remove(index)
1628            os.rename(result[0], index)
1629        a = request.requestID[1]
1630
1631        article = request.article
1632        self.log.debug('Downloaded article:', article.title, 'from', article.url)
1633        article.orig_url = article.url
1634        article.url = 'article_%d/index.html'%a
1635        article.downloaded = True
1636        article.sub_pages  = result[1][1:]
1637        self.jobs_done += 1
1638        self.report_progress(float(self.jobs_done)/len(self.jobs),
1639            _('Article downloaded: %s')%force_unicode(article.title))
1640        if result[2]:
1641            self.partial_failures.append((request.feed.title, article.title, article.url, result[2]))
1642
1643    def error_in_article_download(self, request, traceback):
1644        self.jobs_done += 1
1645        if traceback and re.search('^AbortArticle:', traceback, flags=re.M) is not None:
1646            self.log.warn('Aborted download of article:', request.article.title,
1647                          'from', request.article.url)
1648            self.report_progress(float(self.jobs_done)/len(self.jobs),
1649                _('Article download aborted: %s')%force_unicode(request.article.title))
1650        else:
            self.log.error('Failed to download article:', request.article.title,
                           'from', request.article.url)
1653            self.log.debug(traceback)
1654            self.log.debug('\n')
1655            self.report_progress(float(self.jobs_done)/len(self.jobs),
1656                    _('Article download failed: %s')%force_unicode(request.article.title))
1657            self.failed_downloads.append((request.feed, request.article, traceback))
1658
1659    def parse_feeds(self):
1660        '''
1661        Create a list of articles from the list of feeds returned by :meth:`BasicNewsRecipe.get_feeds`.
1662        Return a list of :class:`Feed` objects.
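
        A recipe can override this method to post-process the parsed feeds, for
        example to drop unwanted articles. A minimal sketch (the URL keyword is
        only illustrative)::

            def parse_feeds(self):
                feeds = BasicNewsRecipe.parse_feeds(self)
                for feed in feeds:
                    for article in feed.articles[:]:
                        if '/video/' in article.url:
                            feed.articles.remove(article)
                return feeds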
1663        '''
1664        feeds = self.get_feeds()
1665        parsed_feeds = []
1666        br = self.browser
1667        for obj in feeds:
1668            if isinstance(obj, string_or_bytes):
1669                title, url = None, obj
1670            else:
1671                title, url = obj
1672            if isinstance(title, bytes):
1673                title = title.decode('utf-8')
1674            if isinstance(url, bytes):
1675                url = url.decode('utf-8')
1676            if url.startswith('feed://'):
1677                url = 'http'+url[4:]
1678            self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url))
1679            try:
1680                purl = urlparse(url, allow_fragments=False)
1681                if purl.username or purl.password:
1682                    hostname = purl.hostname
1683                    if purl.port:
1684                        hostname += f':{purl.port}'
1685                    url = purl._replace(netloc=hostname).geturl()
1686                    if purl.username and purl.password:
1687                        br.add_password(url, purl.username, purl.password)
1688                with closing(br.open_novisit(url, timeout=self.timeout)) as f:
1689                    raw = f.read()
1690                parsed_feeds.append(feed_from_xml(
1691                    raw, title=title, log=self.log,
1692                    oldest_article=self.oldest_article,
1693                    max_articles_per_feed=self.max_articles_per_feed,
1694                    get_article_url=self.get_article_url
1695                ))
1696            except Exception as err:
1697                feed = Feed()
1698                msg = 'Failed feed: %s'%(title if title else url)
1699                feed.populate_from_preparsed_feed(msg, [])
1700                feed.description = as_unicode(err)
1701                parsed_feeds.append(feed)
1702                self.log.exception(msg)
1703            if self.delay > 0:
1704                time.sleep(self.delay)
1705
1706        remove = [fl for fl in parsed_feeds if len(fl) == 0 and self.remove_empty_feeds]
1707        for f in remove:
1708            parsed_feeds.remove(f)
1709
1710        return parsed_feeds
1711
1712    @classmethod
1713    def tag_to_string(self, tag, use_alt=True, normalize_whitespace=True):
1714        '''
1715        Convenience method to take a
1716        `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_
1717        :code:`Tag` and extract the text from it recursively, including any CDATA sections
1718        and alt tag attributes. Return a possibly empty Unicode string.
1719
        `tag`: `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_
        :code:`Tag`

        `use_alt`: If `True`, try to use the alt attribute for tags that don't
        have any textual content

        `normalize_whitespace`: If `True`, collapse runs of whitespace in the
        extracted text into single spaces
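
        For example, in a recipe method that has a parsed article page
        available as ``soup`` (an assumed variable), the headline text could
        be extracted with::

            headline = self.tag_to_string(soup.find('h1'))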
1725        '''
1726        if tag is None:
1727            return ''
1728        if isinstance(tag, string_or_bytes):
1729            return tag
        if callable(getattr(tag, 'xpath', None)) and not hasattr(tag, 'contents'):  # an lxml element
1731            from lxml.etree import tostring
1732            ans = tostring(tag, method='text', encoding='unicode', with_tail=False)
1733        else:
1734            strings = []
1735            for item in tag.contents:
1736                if isinstance(item, (NavigableString, CData)):
1737                    strings.append(item.string)
1738                elif isinstance(item, Tag):
1739                    res = self.tag_to_string(item)
1740                    if res:
1741                        strings.append(res)
1742                    elif use_alt:
1743                        try:
1744                            strings.append(item['alt'])
1745                        except KeyError:
1746                            pass
1747            ans = ''.join(strings)
1748        if normalize_whitespace:
1749            ans = re.sub(r'\s+', ' ', ans)
1750        return ans
1751
1752    @classmethod
1753    def soup(cls, raw):
1754        return BeautifulSoup(raw)
1755
1756    @classmethod
1757    def adeify_images(cls, soup):
1758        '''
        If your recipe, when converted to EPUB, has problems with images when
        viewed in Adobe Digital Editions, call this method from within
        :meth:`postprocess_html`.
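
        For example, a recipe could simply do::

            def postprocess_html(self, soup, first_fetch):
                return self.adeify_images(soup)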
1762        '''
1763        for item in soup.findAll('img'):
1764            for attrib in ['height','width','border','align','style']:
1765                try:
1766                    del item[attrib]
1767                except KeyError:
1768                    pass
1769            oldParent = item.parent
1770            myIndex = oldParent.contents.index(item)
1771            item.extract()
1772            divtag = soup.new_tag('div')
1773            brtag  = soup.new_tag('br')
1774            oldParent.insert(myIndex,divtag)
1775            divtag.append(item)
1776            divtag.append(brtag)
1777        return soup
1778
1779    def internal_postprocess_book(self, oeb, opts, log):
1780        if self.resolve_internal_links and self.article_url_map:
1781            seen = set()
1782            for item in oeb.spine:
1783                for a in item.data.xpath('//*[local-name()="a" and @href]'):
1784                    if a.get('rel') == 'calibre-downloaded-from':
1785                        continue
1786                    url = a.get('href')
1787                    for curl in self.canonicalize_internal_url(url):
1788                        articles = self.article_url_map.get(curl)
1789                        if articles:
1790                            arelpath = sorted(articles, key=numeric_sort_key)[0]
1791                            a.set('href', item.relhref(arelpath))
1792                            if url not in seen:
1793                                log.debug('Resolved internal URL: %s -> %s' % (url, arelpath))
1794                                seen.add(url)
1795
1796
1797class CustomIndexRecipe(BasicNewsRecipe):
1798
1799    def custom_index(self):
1800        '''
1801        Return the filesystem path to a custom HTML document that will serve as the index for
1802        this recipe. The index document will typically contain many `<a href="...">`
1803        tags that point to resources on the internet that should be downloaded.
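
        A minimal sketch, writing a hand-built index to a temporary file and
        returning its path (the URL is only a placeholder)::

            def custom_index(self):
                from calibre.ptempfile import PersistentTemporaryFile
                html = b'<html><body><h1>My Index</h1><a href="https://example.com/article">An article</a></body></html>'
                pt = PersistentTemporaryFile('_custom_index.html')
                pt.write(html)
                pt.close()
                return pt.name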
1804        '''
1805        raise NotImplementedError
1806
1807    def create_opf(self):
1808        mi = MetaInformation(self.title + strftime(self.timefmt), [__appname__])
1809        mi.publisher = __appname__
1810        mi.author_sort = __appname__
1811        mi = OPFCreator(self.output_dir, mi)
1812        mi.create_manifest_from_files_in([self.output_dir])
1813        mi.create_spine([os.path.join(self.output_dir, 'index.html')])
1814        with open(os.path.join(self.output_dir, 'index.opf'), 'wb') as opf_file:
1815            mi.render(opf_file)
1816
1817    def download(self):
1818        index = os.path.abspath(self.custom_index())
1819        url = 'file:'+index if iswindows else 'file://'+index
1820        self.web2disk_options.browser = self.clone_browser(self.browser)
1821        fetcher = RecursiveFetcher(self.web2disk_options, self.log)
1822        fetcher.base_dir = self.output_dir
1823        fetcher.current_dir = self.output_dir
1824        fetcher.show_progress = False
1825        res = fetcher.start_fetch(url)
1826        self.create_opf()
1827        return res
1828
1829
1830class AutomaticNewsRecipe(BasicNewsRecipe):
1831
1832    auto_cleanup = True
1833
1834
1835class CalibrePeriodical(BasicNewsRecipe):
1836
1837    #: Set this to the slug for the calibre periodical
1838    calibre_periodicals_slug = None
1839
1840    LOG_IN = 'https://news.calibre-ebook.com/accounts/login'
1841    needs_subscription = True
1842    __author__ = 'calibre Periodicals'
1843
1844    def get_browser(self):
1845        br = BasicNewsRecipe.get_browser(self)
1846        br.open(self.LOG_IN)
1847        br.select_form(name='login')
1848        br['username'] = self.username
1849        br['password'] = self.password
1850        raw = br.submit().read()
1851        if 'href="/my-account"' not in raw:
1852            raise LoginFailed(
1853                    _('Failed to log in, check your username and password for'
1854                    ' the calibre Periodicals service.'))
1855
1856        return br
1857    get_browser.is_base_class_implementation = True
1858
1859    def download(self):
1860        self.log('Fetching downloaded recipe')
1861        try:
            raw = self.browser.open_novisit(
                'https://news.calibre-ebook.com/subscribed_files/%s/0/temp.downloaded_recipe'
                % self.calibre_periodicals_slug).read()
1866        except Exception as e:
1867            if hasattr(e, 'getcode') and e.getcode() == 403:
1868                raise DownloadDenied(
1869                        _('You do not have permission to download this issue.'
1870                        ' Either your subscription has expired or you have'
1871                        ' exceeded the maximum allowed downloads for today.'))
1872            raise
1873        f = io.BytesIO(raw)
1874        from calibre.utils.zipfile import ZipFile
1875        zf = ZipFile(f)
1876        zf.extractall()
1877        zf.close()
1878        from glob import glob
1879
1880        from calibre.web.feeds.recipes import compile_recipe
1881        try:
            with open(glob('*.recipe')[0], 'rb') as rf:
                recipe = compile_recipe(rf.read())
            self.conversion_options = recipe.conversion_options
1885        except:
1886            self.log.exception('Failed to compile downloaded recipe')
1887        return os.path.abspath('index.html')
1888