1__license__ = 'GPL v3' 2__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' 3''' 4Defines various abstract base classes that can be subclassed to create powerful news fetching recipes. 5''' 6__docformat__ = "restructuredtext en" 7 8 9import io 10import os 11import re 12import sys 13import time 14import traceback 15from collections import defaultdict 16from contextlib import closing 17from urllib.parse import urlparse, urlsplit 18 19from calibre import ( 20 __appname__, as_unicode, browser, force_unicode, iswindows, preferred_encoding, 21 random_user_agent, strftime 22) 23from calibre.ebooks.BeautifulSoup import BeautifulSoup, CData, NavigableString, Tag 24from calibre.ebooks.metadata import MetaInformation 25from calibre.ebooks.metadata.opf2 import OPFCreator 26from calibre.ebooks.metadata.toc import TOC 27from calibre.ptempfile import PersistentTemporaryFile 28from calibre.utils.date import now as nowf 29from calibre.utils.icu import numeric_sort_key 30from calibre.utils.img import add_borders_to_image, image_to_data, save_cover_data_to 31from calibre.utils.localization import canonicalize_lang 32from calibre.utils.logging import ThreadSafeWrapper 33from calibre.utils.threadpool import NoResultsPending, ThreadPool, WorkRequest 34from calibre.web import Recipe 35from calibre.web.feeds import Feed, feed_from_xml, feeds_from_index, templates 36from calibre.web.fetch.simple import ( 37 AbortArticle, RecursiveFetcher, option_parser as web2disk_option_parser 38) 39from calibre.web.fetch.utils import prepare_masthead_image 40from polyglot.builtins import string_or_bytes 41 42 43def classes(classes): 44 q = frozenset(classes.split(' ')) 45 return dict(attrs={ 46 'class': lambda x: x and frozenset(x.split()).intersection(q)}) 47 48 49def prefixed_classes(classes): 50 q = frozenset(classes.split(' ')) 51 52 def matcher(x): 53 if x: 54 for candidate in frozenset(x.split()): 55 for x in q: 56 if candidate.startswith(x): 57 return True 58 return False 59 return {'attrs': {'class': matcher}} 60 61 62class LoginFailed(ValueError): 63 pass 64 65 66class DownloadDenied(ValueError): 67 pass 68 69 70class BasicNewsRecipe(Recipe): 71 ''' 72 Base class that contains logic needed in all recipes. By overriding 73 progressively more of the functionality in this class, you can make 74 progressively more customized/powerful recipes. For a tutorial introduction 75 to creating recipes, see :doc:`news`. 76 ''' 77 78 #: The title to use for the e-book 79 title = _('Unknown News Source') 80 81 #: A couple of lines that describe the content this recipe downloads. 82 #: This will be used primarily in a GUI that presents a list of recipes. 83 description = '' 84 85 #: The author of this recipe 86 __author__ = __appname__ 87 88 #: Minimum calibre version needed to use this recipe 89 requires_version = (0, 6, 0) 90 91 #: The language that the news is in. Must be an ISO-639 code either 92 #: two or three characters long 93 language = 'und' 94 95 #: Maximum number of articles to download from each feed. This is primarily 96 #: useful for feeds that don't have article dates. For most feeds, you should 97 #: use :attr:`BasicNewsRecipe.oldest_article` 98 max_articles_per_feed = 100 99 100 #: Oldest article to download from this news source. In days. 101 oldest_article = 7.0 102 103 #: Number of levels of links to follow on article webpages 104 recursions = 0 105 106 #: Delay between consecutive downloads in seconds. The argument may be a 107 #: floating point number to indicate a more precise time. 108 delay = 0 109 110 #: Publication type 111 #: Set to newspaper, magazine or blog. If set to None, no publication type 112 #: metadata will be written to the opf file. 113 publication_type = 'unknown' 114 115 #: Number of simultaneous downloads. Set to 1 if the server is picky. 116 #: Automatically reduced to 1 if :attr:`BasicNewsRecipe.delay` > 0 117 simultaneous_downloads = 5 118 119 #: Timeout for fetching files from server in seconds 120 timeout = 120.0 121 122 #: The format string for the date shown on the first page. 123 #: By default: Day_Name, Day_Number Month_Name Year 124 timefmt = ' [%a, %d %b %Y]' 125 126 #: List of feeds to download. 127 #: Can be either ``[url1, url2, ...]`` or ``[('title1', url1), ('title2', url2),...]`` 128 feeds = None 129 130 #: Max number of characters in the short description 131 summary_length = 500 132 133 #: Convenient flag to disable loading of stylesheets for websites 134 #: that have overly complex stylesheets unsuitable for conversion 135 #: to e-book formats. 136 #: If True stylesheets are not downloaded and processed 137 no_stylesheets = False 138 139 #: Convenient flag to strip all JavaScript tags from the downloaded HTML 140 remove_javascript = True 141 142 #: If True the GUI will ask the user for a username and password 143 #: to use while downloading. 144 #: If set to "optional" the use of a username and password becomes optional 145 needs_subscription = False 146 147 #: If True the navigation bar is center aligned, otherwise it is left aligned 148 center_navbar = True 149 150 #: Specify an override encoding for sites that have an incorrect 151 #: charset specification. The most common being specifying ``latin1`` and 152 #: using ``cp1252``. If None, try to detect the encoding. If it is a 153 #: callable, the callable is called with two arguments: The recipe object 154 #: and the source to be decoded. It must return the decoded source. 155 encoding = None 156 157 #: Normally we try to guess if a feed has full articles embedded in it 158 #: based on the length of the embedded content. If `None`, then the 159 #: default guessing is used. If `True` then the we always assume the feeds has 160 #: embedded content and if `False` we always assume the feed does not have 161 #: embedded content. 162 use_embedded_content = None 163 164 #: Set to True and implement :meth:`get_obfuscated_article` to handle 165 #: websites that try to make it difficult to scrape content. 166 articles_are_obfuscated = False 167 168 #: Reverse the order of articles in each feed 169 reverse_article_order = False 170 171 #: Automatically extract all the text from downloaded article pages. Uses 172 #: the algorithms from the readability project. Setting this to True, means 173 #: that you do not have to worry about cleaning up the downloaded HTML 174 #: manually (though manual cleanup will always be superior). 175 auto_cleanup = False 176 177 #: Specify elements that the auto cleanup algorithm should never remove. 178 #: The syntax is a XPath expression. For example:: 179 #: 180 #: auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with 181 #: id="article-image" 182 #: auto_cleanup_keep = '//*[@class="important"]' will keep all elements 183 #: with class="important" 184 #: auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]' 185 #: will keep all divs with id="article-image" and spans 186 #: with class="important" 187 #: 188 auto_cleanup_keep = None 189 190 #: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files. 191 #: It will be inserted into `<style>` tags, just before the closing 192 #: `</head>` tag thereby overriding all :term:`CSS` except that which is 193 #: declared using the style attribute on individual :term:`HTML` tags. 194 #: Note that if you want to programmatically generate the extra_css override 195 #: the :meth:`get_extra_css()` method instead. 196 #: For example:: 197 #: 198 #: extra_css = '.heading { font: serif x-large }' 199 #: 200 extra_css = None 201 202 #: If True empty feeds are removed from the output. 203 #: This option has no effect if parse_index is overridden in 204 #: the sub class. It is meant only for recipes that return a list 205 #: of feeds using `feeds` or :meth:`get_feeds`. It is also used if you use 206 #: the ignore_duplicate_articles option. 207 remove_empty_feeds = False 208 209 #: List of regular expressions that determines which links to follow. 210 #: If empty, it is ignored. Used only if is_link_wanted is 211 #: not implemented. For example:: 212 #: 213 #: match_regexps = [r'page=[0-9]+'] 214 #: 215 #: will match all URLs that have `page=some number` in them. 216 #: 217 #: Only one of :attr:`BasicNewsRecipe.match_regexps` or 218 #: :attr:`BasicNewsRecipe.filter_regexps` should be defined. 219 match_regexps = [] 220 221 #: List of regular expressions that determines which links to ignore. 222 #: If empty it is ignored. Used only if is_link_wanted is not 223 #: implemented. For example:: 224 #: 225 #: filter_regexps = [r'ads\.doubleclick\.net'] 226 #: 227 #: will remove all URLs that have `ads.doubleclick.net` in them. 228 #: 229 #: Only one of :attr:`BasicNewsRecipe.match_regexps` or 230 #: :attr:`BasicNewsRecipe.filter_regexps` should be defined. 231 filter_regexps = [] 232 233 #: Recipe specific options to control the conversion of the downloaded 234 #: content into an e-book. These will override any user or plugin specified 235 #: values, so only use if absolutely necessary. For example:: 236 #: 237 #: conversion_options = { 238 #: 'base_font_size' : 16, 239 #: 'linearize_tables' : True, 240 #: } 241 #: 242 conversion_options = {} 243 244 #: List of tags to be removed. Specified tags are removed from downloaded HTML. 245 #: A tag is specified as a dictionary of the form:: 246 #: 247 #: { 248 #: name : 'tag name', #e.g. 'div' 249 #: attrs : a dictionary, #e.g. {'class': 'advertisment'} 250 #: } 251 #: 252 #: All keys are optional. For a full explanation of the search criteria, see 253 #: `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#searching-the-tree>`__ 254 #: A common example:: 255 #: 256 #: remove_tags = [dict(name='div', class_='advert')] 257 #: 258 #: This will remove all `<div class="advert">` tags and all 259 #: their children from the downloaded :term:`HTML`. 260 remove_tags = [] 261 262 #: Remove all tags that occur after the specified tag. 263 #: For the format for specifying a tag see :attr:`BasicNewsRecipe.remove_tags`. 264 #: For example:: 265 #: 266 #: remove_tags_after = [dict(id='content')] 267 #: 268 #: will remove all 269 #: tags after the first element with `id="content"`. 270 remove_tags_after = None 271 272 #: Remove all tags that occur before the specified tag. 273 #: For the format for specifying a tag see :attr:`BasicNewsRecipe.remove_tags`. 274 #: For example:: 275 #: 276 #: remove_tags_before = dict(id='content') 277 #: 278 #: will remove all 279 #: tags before the first element with `id="content"`. 280 remove_tags_before = None 281 282 #: List of attributes to remove from all tags. 283 #: For example:: 284 #: 285 #: remove_attributes = ['style', 'font'] 286 remove_attributes = [] 287 288 #: Keep only the specified tags and their children. 289 #: For the format for specifying a tag see :attr:`BasicNewsRecipe.remove_tags`. 290 #: If this list is not empty, then the `<body>` tag will be emptied and re-filled with 291 #: the tags that match the entries in this list. For example:: 292 #: 293 #: keep_only_tags = [dict(id=['content', 'heading'])] 294 #: 295 #: will keep only tags that have an `id` attribute of `"content"` or `"heading"`. 296 keep_only_tags = [] 297 298 #: List of :term:`regexp` substitution rules to run on the downloaded :term:`HTML`. 299 #: Each element of the 300 #: list should be a two element tuple. The first element of the tuple should 301 #: be a compiled regular expression and the second a callable that takes 302 #: a single match object and returns a string to replace the match. For example:: 303 #: 304 #: preprocess_regexps = [ 305 #: (re.compile(r'<!--Article ends here-->.*</body>', re.DOTALL|re.IGNORECASE), 306 #: lambda match: '</body>'), 307 #: ] 308 #: 309 #: will remove everything from `<!--Article ends here-->` to `</body>`. 310 preprocess_regexps = [] 311 312 #: The CSS that is used to style the templates, i.e., the navigation bars and 313 #: the Tables of Contents. Rather than overriding this variable, you should 314 #: use `extra_css` in your recipe to customize look and feel. 315 template_css = ''' 316 .article_date { 317 color: gray; font-family: monospace; 318 } 319 320 .article_description { 321 text-indent: 0pt; 322 } 323 324 a.article { 325 font-weight: bold; text-align:left; 326 } 327 328 a.feed { 329 font-weight: bold; 330 } 331 332 .calibre_navbar { 333 font-family:monospace; 334 } 335 ''' 336 337 #: By default, calibre will use a default image for the masthead (Kindle only). 338 #: Override this in your recipe to provide a url to use as a masthead. 339 masthead_url = None 340 341 #: By default, the cover image returned by get_cover_url() will be used as 342 #: the cover for the periodical. Overriding this in your recipe instructs 343 #: calibre to render the downloaded cover into a frame whose width and height 344 #: are expressed as a percentage of the downloaded cover. 345 #: cover_margins = (10, 15, '#ffffff') pads the cover with a white margin 346 #: 10px on the left and right, 15px on the top and bottom. 347 #: Color names are defined `here <https://www.imagemagick.org/script/color.php>`_. 348 #: Note that for some reason, white does not always work in Windows. Use 349 #: #ffffff instead 350 cover_margins = (0, 0, '#ffffff') 351 352 #: Set to a non empty string to disable this recipe. 353 #: The string will be used as the disabled message 354 recipe_disabled = None 355 356 #: Ignore duplicates of articles that are present in more than one section. 357 #: A duplicate article is an article that has the same title and/or URL. 358 #: To ignore articles with the same title, set this to:: 359 #: 360 #: ignore_duplicate_articles = {'title'} 361 #: 362 #: To use URLs instead, set it to:: 363 #: 364 #: ignore_duplicate_articles = {'url'} 365 #: 366 #: To match on title or URL, set it to:: 367 #: 368 #: ignore_duplicate_articles = {'title', 'url'} 369 ignore_duplicate_articles = None 370 371 # The following parameters control how the recipe attempts to minimize 372 # JPEG image sizes 373 374 #: Set this to False to ignore all scaling and compression parameters and 375 #: pass images through unmodified. If True and the other compression 376 #: parameters are left at their default values, JPEG images will be scaled to fit 377 #: in the screen dimensions set by the output profile and compressed to size at 378 #: most (w * h)/16 where w x h are the scaled image dimensions. 379 compress_news_images = False 380 381 #: The factor used when auto compressing JPEG images. If set to None, 382 #: auto compression is disabled. Otherwise, the images will be reduced in size to 383 #: (w * h)/compress_news_images_auto_size bytes if possible by reducing 384 #: the quality level, where w x h are the image dimensions in pixels. 385 #: The minimum JPEG quality will be 5/100 so it is possible this constraint 386 #: will not be met. This parameter can be overridden by the parameter 387 #: compress_news_images_max_size which provides a fixed maximum size for images. 388 #: Note that if you enable scale_news_images_to_device then the image will 389 #: first be scaled and then its quality lowered until its size is less than 390 #: (w * h)/factor where w and h are now the *scaled* image dimensions. In 391 #: other words, this compression happens after scaling. 392 compress_news_images_auto_size = 16 393 394 #: Set JPEG quality so images do not exceed the size given (in KBytes). 395 #: If set, this parameter overrides auto compression via compress_news_images_auto_size. 396 #: The minimum JPEG quality will be 5/100 so it is possible this constraint 397 #: will not be met. 398 compress_news_images_max_size = None 399 400 #: Rescale images to fit in the device screen dimensions set by the output profile. 401 #: Ignored if no output profile is set. 402 scale_news_images_to_device = True 403 404 #: Maximum dimensions (w,h) to scale images to. If scale_news_images_to_device is True 405 #: this is set to the device screen dimensions set by the output profile unless 406 #: there is no profile set, in which case it is left at whatever value it has been 407 #: assigned (default None). 408 scale_news_images = None 409 410 #: If set to True then links in downloaded articles that point to other downloaded articles are 411 #: changed to point to the downloaded copy of the article rather than its original web URL. If you 412 #: set this to True, you might also need to implement :meth:`canonicalize_internal_url` to work 413 #: with the URL scheme of your particular website. 414 resolve_internal_links = False 415 416 #: Set to False if you dont want to use gzipped transfers. Note that some old servers flake out with gzip 417 handle_gzip = True 418 419 # See the built-in recipes for examples of these settings. 420 421 def short_title(self): 422 return force_unicode(self.title, preferred_encoding) 423 424 def is_link_wanted(self, url, tag): 425 ''' 426 Return True if the link should be followed or False otherwise. By 427 default, raises NotImplementedError which causes the downloader to 428 ignore it. 429 430 :param url: The URL to be followed 431 :param tag: The tag from which the URL was derived 432 ''' 433 raise NotImplementedError() 434 435 def get_extra_css(self): 436 ''' 437 By default returns `self.extra_css`. Override if you want to programmatically generate the 438 extra_css. 439 ''' 440 return self.extra_css 441 442 def get_cover_url(self): 443 ''' 444 Return a :term:`URL` to the cover image for this issue or `None`. 445 By default it returns the value of the member `self.cover_url` which 446 is normally `None`. If you want your recipe to download a cover for the e-book 447 override this method in your subclass, or set the member variable `self.cover_url` 448 before this method is called. 449 ''' 450 return getattr(self, 'cover_url', None) 451 452 def get_masthead_url(self): 453 ''' 454 Return a :term:`URL` to the masthead image for this issue or `None`. 455 By default it returns the value of the member `self.masthead_url` which 456 is normally `None`. If you want your recipe to download a masthead for the e-book 457 override this method in your subclass, or set the member variable `self.masthead_url` 458 before this method is called. 459 Masthead images are used in Kindle MOBI files. 460 ''' 461 return getattr(self, 'masthead_url', None) 462 463 def get_feeds(self): 464 ''' 465 Return a list of :term:`RSS` feeds to fetch for this profile. Each element of the list 466 must be a 2-element tuple of the form (title, url). If title is None or an 467 empty string, the title from the feed is used. This method is useful if your recipe 468 needs to do some processing to figure out the list of feeds to download. If 469 so, override in your subclass. 470 ''' 471 if not self.feeds: 472 raise NotImplementedError() 473 if self.test: 474 return self.feeds[:self.test[0]] 475 return self.feeds 476 477 @classmethod 478 def print_version(cls, url): 479 ''' 480 Take a `url` pointing to the webpage with article content and return the 481 :term:`URL` pointing to the print version of the article. By default does 482 nothing. For example:: 483 484 def print_version(self, url): 485 return url + '?&pagewanted=print' 486 487 ''' 488 raise NotImplementedError() 489 490 @classmethod 491 def image_url_processor(cls, baseurl, url): 492 ''' 493 Perform some processing on image urls (perhaps removing size restrictions for 494 dynamically generated images, etc.) and return the precessed URL. 495 ''' 496 return url 497 498 def preprocess_image(self, img_data, image_url): 499 ''' 500 Perform some processing on downloaded image data. This is called on the raw 501 data before any resizing is done. Must return the processed raw data. Return 502 None to skip the image. 503 ''' 504 return img_data 505 506 def get_browser(self, *args, **kwargs): 507 ''' 508 Return a browser instance used to fetch documents from the web. By default 509 it returns a `mechanize <https://mechanize.readthedocs.io/en/latest/>`_ 510 browser instance that supports cookies, ignores robots.txt, handles 511 refreshes and has a mozilla firefox user agent. 512 513 If your recipe requires that you login first, override this method 514 in your subclass. For example, the following code is used in the New York 515 Times recipe to login for full access:: 516 517 def get_browser(self): 518 br = BasicNewsRecipe.get_browser(self) 519 if self.username is not None and self.password is not None: 520 br.open('https://www.nytimes.com/auth/login') 521 br.select_form(name='login') 522 br['USERID'] = self.username 523 br['PASSWORD'] = self.password 524 br.submit() 525 return br 526 527 ''' 528 if 'user_agent' not in kwargs: 529 # More and more news sites are serving JPEG XR images to IE 530 ua = getattr(self, 'last_used_user_agent', None) or self.calibre_most_common_ua or random_user_agent(allow_ie=False) 531 kwargs['user_agent'] = self.last_used_user_agent = ua 532 self.log('Using user agent:', kwargs['user_agent']) 533 br = browser(*args, **kwargs) 534 br.addheaders += [('Accept', '*/*')] 535 if self.handle_gzip: 536 br.set_handle_gzip(True) 537 return br 538 539 def clone_browser(self, br): 540 ''' 541 Clone the browser br. Cloned browsers are used for multi-threaded 542 downloads, since mechanize is not thread safe. The default cloning 543 routines should capture most browser customization, but if you do 544 something exotic in your recipe, you should override this method in 545 your recipe and clone manually. 546 547 Cloned browser instances use the same, thread-safe CookieJar by 548 default, unless you have customized cookie handling. 549 ''' 550 if callable(getattr(br, 'clone_browser', None)): 551 return br.clone_browser() 552 553 # Uh-oh recipe using something exotic, call get_browser 554 return self.get_browser() 555 556 @property 557 def cloned_browser(self): 558 if hasattr(self.get_browser, 'is_base_class_implementation'): 559 # We are using the default get_browser, which means no need to 560 # clone 561 br = BasicNewsRecipe.get_browser(self) 562 else: 563 br = self.clone_browser(self.browser) 564 return br 565 566 def get_article_url(self, article): 567 ''' 568 Override in a subclass to customize extraction of the :term:`URL` that points 569 to the content for each article. Return the 570 article URL. It is called with `article`, an object representing a parsed article 571 from a feed. See `feedparser <https://pythonhosted.org/feedparser/>`_. 572 By default it looks for the original link (for feeds syndicated via a 573 service like feedburner or pheedo) and if found, 574 returns that or else returns 575 `article.link <https://pythonhosted.org/feedparser/reference-entry-link.html>`_. 576 ''' 577 for key in article.keys(): 578 if key.endswith('_origlink'): 579 url = article[key] 580 if url and (url.startswith('http://') or url.startswith('https://')): 581 return url 582 ans = article.get('link', None) 583 if not ans and getattr(article, 'links', None): 584 for item in article.links: 585 if item.get('rel', 'alternate') == 'alternate': 586 ans = item['href'] 587 break 588 return ans 589 590 def skip_ad_pages(self, soup): 591 ''' 592 This method is called with the source of each downloaded :term:`HTML` file, before 593 any of the cleanup attributes like remove_tags, keep_only_tags are 594 applied. Note that preprocess_regexps will have already been applied. 595 It is meant to allow the recipe to skip ad pages. If the soup represents 596 an ad page, return the HTML of the real page. Otherwise return 597 None. 598 599 `soup`: A `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`__ 600 instance containing the downloaded :term:`HTML`. 601 ''' 602 return None 603 604 def abort_article(self, msg=None): 605 ''' Call this method inside any of the preprocess methods to abort the 606 download for the current article. Useful to skip articles that contain 607 inappropriate content, such as pure video articles. ''' 608 raise AbortArticle(msg or _('Article download aborted')) 609 610 def preprocess_raw_html(self, raw_html, url): 611 ''' 612 This method is called with the source of each downloaded :term:`HTML` file, before 613 it is parsed into an object tree. raw_html is a unicode string 614 representing the raw HTML downloaded from the web. url is the URL from 615 which the HTML was downloaded. 616 617 Note that this method acts *before* preprocess_regexps. 618 619 This method must return the processed raw_html as a unicode object. 620 ''' 621 return raw_html 622 623 def preprocess_raw_html_(self, raw_html, url): 624 raw_html = self.preprocess_raw_html(raw_html, url) 625 if self.auto_cleanup: 626 try: 627 raw_html = self.extract_readable_article(raw_html, url) 628 except: 629 self.log.exception('Auto cleanup of URL: %r failed'%url) 630 631 return raw_html 632 633 def preprocess_html(self, soup): 634 ''' 635 This method is called with the source of each downloaded :term:`HTML` file, before 636 it is parsed for links and images. It is called after the cleanup as 637 specified by remove_tags etc. 638 It can be used to do arbitrarily powerful pre-processing on the :term:`HTML`. 639 It should return `soup` after processing it. 640 641 `soup`: A `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`__ 642 instance containing the downloaded :term:`HTML`. 643 ''' 644 return soup 645 646 def postprocess_html(self, soup, first_fetch): 647 ''' 648 This method is called with the source of each downloaded :term:`HTML` file, after 649 it is parsed for links and images. 650 It can be used to do arbitrarily powerful post-processing on the :term:`HTML`. 651 It should return `soup` after processing it. 652 653 :param soup: A `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`__ instance containing the downloaded :term:`HTML`. 654 :param first_fetch: True if this is the first page of an article. 655 656 ''' 657 return soup 658 659 def cleanup(self): 660 ''' 661 Called after all articles have been download. Use it to do any cleanup like 662 logging out of subscription sites, etc. 663 ''' 664 pass 665 666 def canonicalize_internal_url(self, url, is_link=True): 667 ''' 668 Return a set of canonical representations of ``url``. The default 669 implementation uses just the server hostname and path of the URL, 670 ignoring any query parameters, fragments, etc. The canonical 671 representations must be unique across all URLs for this news source. If 672 they are not, then internal links may be resolved incorrectly. 673 674 :param is_link: Is True if the URL is coming from an internal link in 675 an HTML file. False if the URL is the URL used to 676 download an article. 677 ''' 678 try: 679 parts = urlparse(url) 680 except Exception: 681 self.log.error('Failed to parse url: %r, ignoring' % url) 682 return frozenset() 683 nl = parts.netloc 684 path = parts.path or '' 685 if isinstance(nl, bytes): 686 nl = nl.decode('utf-8', 'replace') 687 if isinstance(path, bytes): 688 path = path.decode('utf-8', 'replace') 689 return frozenset({(nl, path.rstrip('/'))}) 690 691 def index_to_soup(self, url_or_raw, raw=False, as_tree=False, save_raw=None): 692 ''' 693 Convenience method that takes an URL to the index page and returns 694 a `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc>`__ 695 of it. 696 697 `url_or_raw`: Either a URL or the downloaded index page as a string 698 ''' 699 if re.match((br'\w+://' if isinstance(url_or_raw, bytes) else r'\w+://'), url_or_raw): 700 # We may be called in a thread (in the skip_ad_pages method), so 701 # clone the browser to be safe. We cannot use self.cloned_browser 702 # as it may or may not actually clone the browser, depending on if 703 # the recipe implements get_browser() or not 704 br = self.clone_browser(self.browser) 705 open_func = getattr(br, 'open_novisit', br.open) 706 with closing(open_func(url_or_raw, timeout=self.timeout)) as f: 707 _raw = f.read() 708 if not _raw: 709 raise RuntimeError('Could not fetch index from %s'%url_or_raw) 710 else: 711 _raw = url_or_raw 712 if raw: 713 return _raw 714 if not isinstance(_raw, str) and self.encoding: 715 if callable(self.encoding): 716 _raw = self.encoding(_raw) 717 else: 718 _raw = _raw.decode(self.encoding, 'replace') 719 from calibre.ebooks.chardet import ( 720 strip_encoding_declarations, xml_to_unicode 721 ) 722 from calibre.utils.cleantext import clean_xml_chars 723 if isinstance(_raw, str): 724 _raw = strip_encoding_declarations(_raw) 725 else: 726 _raw = xml_to_unicode(_raw, strip_encoding_pats=True, resolve_entities=True)[0] 727 _raw = clean_xml_chars(_raw) 728 if save_raw: 729 with lopen(save_raw, 'wb') as f: 730 f.write(_raw.encode('utf-8')) 731 if as_tree: 732 from html5_parser import parse 733 return parse(_raw) 734 return BeautifulSoup(_raw) 735 736 def extract_readable_article(self, html, url): 737 ''' 738 Extracts main article content from 'html', cleans up and returns as a (article_html, extracted_title) tuple. 739 Based on the original readability algorithm by Arc90. 740 ''' 741 from lxml.html import document_fromstring, fragment_fromstring, tostring 742 743 from calibre.ebooks.readability import readability 744 745 doc = readability.Document(html, self.log, url=url, 746 keep_elements=self.auto_cleanup_keep) 747 article_html = doc.summary() 748 extracted_title = doc.title() 749 750 try: 751 frag = fragment_fromstring(article_html) 752 except: 753 doc = document_fromstring(article_html) 754 frag = doc.xpath('//body')[-1] 755 if frag.tag == 'html': 756 root = frag 757 elif frag.tag == 'body': 758 root = document_fromstring( 759 '<html><head><title>%s</title></head></html>' % 760 extracted_title) 761 root.append(frag) 762 else: 763 root = document_fromstring( 764 '<html><head><title>%s</title></head><body/></html>' % 765 extracted_title) 766 root.xpath('//body')[0].append(frag) 767 768 body = root.xpath('//body')[0] 769 has_title = False 770 for x in body.iterdescendants(): 771 if x.text == extracted_title: 772 has_title = True 773 inline_titles = body.xpath('//h1|//h2') 774 if not has_title and not inline_titles: 775 heading = body.makeelement('h2') 776 heading.text = extracted_title 777 body.insert(0, heading) 778 779 raw_html = tostring(root, encoding='unicode') 780 781 return raw_html 782 783 def sort_index_by(self, index, weights): 784 ''' 785 Convenience method to sort the titles in `index` according to `weights`. 786 `index` is sorted in place. Returns `index`. 787 788 `index`: A list of titles. 789 790 `weights`: A dictionary that maps weights to titles. If any titles 791 in index are not in weights, they are assumed to have a weight of 0. 792 ''' 793 weights = defaultdict(lambda: 0, weights) 794 index.sort(key=lambda x: weights[x]) 795 return index 796 797 def parse_index(self): 798 ''' 799 This method should be implemented in recipes that parse a website 800 instead of feeds to generate a list of articles. Typical uses are for 801 news sources that have a "Print Edition" webpage that lists all the 802 articles in the current print edition. If this function is implemented, 803 it will be used in preference to :meth:`BasicNewsRecipe.parse_feeds`. 804 805 It must return a list. Each element of the list must be a 2-element tuple 806 of the form ``('feed title', list of articles)``. 807 808 Each list of articles must contain dictionaries of the form:: 809 810 { 811 'title' : article title, 812 'url' : URL of print version, 813 'date' : The publication date of the article as a string, 814 'description' : A summary of the article 815 'content' : The full article (can be an empty string). Obsolete 816 do not use, instead save the content to a temporary 817 file and pass a file:///path/to/temp/file.html as 818 the URL. 819 } 820 821 For an example, see the recipe for downloading `The Atlantic`. 822 In addition, you can add 'author' for the author of the article. 823 824 If you want to abort processing for some reason and have 825 calibre show the user a simple message instead of an error, call 826 :meth:`abort_recipe_processing`. 827 ''' 828 raise NotImplementedError() 829 830 def abort_recipe_processing(self, msg): 831 ''' 832 Causes the recipe download system to abort the download of this recipe, 833 displaying a simple feedback message to the user. 834 ''' 835 from calibre.ebooks.conversion import ConversionUserFeedBack 836 raise ConversionUserFeedBack(_('Failed to download %s')%self.title, 837 msg) 838 839 def get_obfuscated_article(self, url): 840 ''' 841 If you set `articles_are_obfuscated` this method is called with 842 every article URL. It should return the path to a file on the filesystem 843 that contains the article HTML. That file is processed by the recursive 844 HTML fetching engine, so it can contain links to pages/images on the web. 845 846 This method is typically useful for sites that try to make it difficult to 847 access article content automatically. 848 ''' 849 raise NotImplementedError() 850 851 def add_toc_thumbnail(self, article, src): 852 ''' 853 Call this from populate_article_metadata with the src attribute of an 854 <img> tag from the article that is appropriate for use as the thumbnail 855 representing the article in the Table of Contents. Whether the 856 thumbnail is actually used is device dependent (currently only used by 857 the Kindles). Note that the referenced image must be one that was 858 successfully downloaded, otherwise it will be ignored. 859 ''' 860 if not src or not hasattr(article, 'toc_thumbnail'): 861 return 862 863 src = src.replace('\\', '/') 864 if re.search(r'feed_\d+/article_\d+/images/img', src, flags=re.I) is None: 865 self.log.warn('Ignoring invalid TOC thumbnail image: %r'%src) 866 return 867 article.toc_thumbnail = re.sub(r'^.*?feed', 'feed', 868 src, flags=re.IGNORECASE) 869 870 def populate_article_metadata(self, article, soup, first): 871 ''' 872 Called when each HTML page belonging to article is downloaded. 873 Intended to be used to get article metadata like author/summary/etc. 874 from the parsed HTML (soup). 875 876 :param article: A object of class :class:`calibre.web.feeds.Article`. 877 If you change the summary, remember to also change the text_summary 878 :param soup: Parsed HTML belonging to this article 879 :param first: True iff the parsed HTML is the first page of the article. 880 ''' 881 pass 882 883 def postprocess_book(self, oeb, opts, log): 884 ''' 885 Run any needed post processing on the parsed downloaded e-book. 886 887 :param oeb: An OEBBook object 888 :param opts: Conversion options 889 ''' 890 pass 891 892 def __init__(self, options, log, progress_reporter): 893 ''' 894 Initialize the recipe. 895 :param options: Parsed commandline options 896 :param log: Logging object 897 :param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional. 898 ''' 899 self.log = ThreadSafeWrapper(log) 900 if not isinstance(self.title, str): 901 self.title = str(self.title, 'utf-8', 'replace') 902 903 self.debug = options.verbose > 1 904 self.output_dir = os.path.abspath(os.getcwd()) 905 self.verbose = options.verbose 906 self.test = options.test 907 if self.test and not isinstance(self.test, tuple): 908 self.test = (2, 2) 909 self.username = options.username 910 self.password = options.password 911 self.lrf = options.lrf 912 self.output_profile = options.output_profile 913 self.touchscreen = getattr(self.output_profile, 'touchscreen', False) 914 if self.touchscreen: 915 self.template_css += self.output_profile.touchscreen_news_css 916 917 if self.test: 918 self.max_articles_per_feed = self.test[1] 919 self.simultaneous_downloads = min(4, self.simultaneous_downloads) 920 921 if self.debug: 922 self.verbose = True 923 self.report_progress = progress_reporter 924 925 if self.needs_subscription and ( 926 self.username is None or self.password is None or ( 927 not self.username and not self.password)): 928 if self.needs_subscription != 'optional': 929 raise ValueError(_('The "%s" recipe needs a username and password.')%self.title) 930 931 self.browser = self.get_browser() 932 self.image_map, self.image_counter = {}, 1 933 self.css_map = {} 934 935 web2disk_cmdline = ['web2disk', 936 '--timeout', str(self.timeout), 937 '--max-recursions', str(self.recursions), 938 '--delay', str(self.delay), 939 ] 940 941 if self.verbose: 942 web2disk_cmdline.append('--verbose') 943 944 if self.no_stylesheets: 945 web2disk_cmdline.append('--dont-download-stylesheets') 946 947 for reg in self.match_regexps: 948 web2disk_cmdline.extend(['--match-regexp', reg]) 949 950 for reg in self.filter_regexps: 951 web2disk_cmdline.extend(['--filter-regexp', reg]) 952 953 if options.output_profile.short_name in ('default', 'tablet'): 954 self.scale_news_images_to_device = False 955 elif self.scale_news_images_to_device: 956 self.scale_news_images = options.output_profile.screen_size 957 958 self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0] 959 for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps', 960 'skip_ad_pages', 'preprocess_html', 'remove_tags_after', 961 'remove_tags_before', 'is_link_wanted', 962 'compress_news_images', 'compress_news_images_max_size', 963 'compress_news_images_auto_size', 'scale_news_images'): 964 setattr(self.web2disk_options, extra, getattr(self, extra)) 965 966 self.web2disk_options.postprocess_html = self._postprocess_html 967 self.web2disk_options.preprocess_image = self.preprocess_image 968 self.web2disk_options.encoding = self.encoding 969 self.web2disk_options.preprocess_raw_html = self.preprocess_raw_html_ 970 971 if self.delay > 0: 972 self.simultaneous_downloads = 1 973 974 self.navbar = templates.TouchscreenNavBarTemplate() if self.touchscreen else \ 975 templates.NavBarTemplate() 976 self.failed_downloads = [] 977 self.partial_failures = [] 978 979 def _postprocess_html(self, soup, first_fetch, job_info): 980 if self.no_stylesheets: 981 for link in soup.findAll('link'): 982 if (link.get('type') or 'text/css').lower() == 'text/css' and 'stylesheet' in (link.get('rel') or ('stylesheet',)): 983 link.extract() 984 for style in soup.findAll('style'): 985 style.extract() 986 head = soup.find('head') 987 if not head: 988 head = soup.find('body') 989 if not head: 990 head = soup.find(True) 991 css = self.template_css + '\n\n' + (self.get_extra_css() or '') 992 style = soup.new_tag('style', type='text/css', title='override_css') 993 style.append(css) 994 head.append(style) 995 if first_fetch and job_info: 996 url, f, a, feed_len = job_info 997 body = soup.find('body') 998 if body is not None: 999 templ = self.navbar.generate(False, f, a, feed_len, 1000 not self.has_single_feed, 1001 url, __appname__, 1002 center=self.center_navbar, 1003 extra_css=self.get_extra_css() or '') 1004 elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') 1005 body.insert(0, elem) 1006 # This is needed because otherwise inserting elements into 1007 # the soup breaks find() 1008 soup = BeautifulSoup(soup.decode_contents()) 1009 if self.remove_javascript: 1010 for script in list(soup.findAll('script')): 1011 script.extract() 1012 for o in soup.findAll(onload=True): 1013 del o['onload'] 1014 1015 for attr in self.remove_attributes: 1016 for x in soup.findAll(attrs={attr:True}): 1017 del x[attr] 1018 for bad_tag in list(soup.findAll(['base', 'iframe', 'canvas', 'embed', 1019 'command', 'datalist', 'video', 'audio', 'noscript', 'link', 'meta'])): 1020 # link tags can be used for preloading causing network activity in 1021 # calibre viewer. meta tags can do all sorts of crazy things, 1022 # including http-equiv refresh, viewport shenanigans, etc. 1023 bad_tag.extract() 1024 # srcset causes some viewers, like calibre's to load images from the 1025 # web, and it also possible causes iBooks on iOS to barf, see 1026 # https://bugs.launchpad.net/bugs/1713986 1027 for img in soup.findAll('img', srcset=True): 1028 del img['srcset'] 1029 1030 ans = self.postprocess_html(soup, first_fetch) 1031 1032 # Nuke HTML5 tags 1033 for x in ans.findAll(['article', 'aside', 'header', 'footer', 'nav', 1034 'figcaption', 'figure', 'section']): 1035 x.name = 'div' 1036 1037 if job_info: 1038 url, f, a, feed_len = job_info 1039 try: 1040 article = self.feed_objects[f].articles[a] 1041 except: 1042 self.log.exception('Failed to get article object for postprocessing') 1043 pass 1044 else: 1045 self.populate_article_metadata(article, ans, first_fetch) 1046 return ans 1047 1048 def download(self): 1049 ''' 1050 Download and pre-process all articles from the feeds in this recipe. 1051 This method should be called only once on a particular Recipe instance. 1052 Calling it more than once will lead to undefined behavior. 1053 :return: Path to index.html 1054 ''' 1055 try: 1056 res = self.build_index() 1057 self.report_progress(1, _('Download finished')) 1058 if self.failed_downloads: 1059 self.log.warning(_('Failed to download the following articles:')) 1060 for feed, article, debug in self.failed_downloads: 1061 self.log.warning(article.title, 'from', feed.title) 1062 self.log.debug(article.url) 1063 self.log.debug(debug) 1064 if self.partial_failures: 1065 self.log.warning(_('Failed to download parts of the following articles:')) 1066 for feed, atitle, aurl, debug in self.partial_failures: 1067 self.log.warning(atitle + _(' from ') + feed) 1068 self.log.debug(aurl) 1069 self.log.warning(_('\tFailed links:')) 1070 for l, tb in debug: 1071 self.log.warning(l) 1072 self.log.debug(tb) 1073 return res 1074 finally: 1075 self.cleanup() 1076 1077 @property 1078 def lang_for_html(self): 1079 try: 1080 lang = self.language.replace('_', '-').partition('-')[0].lower() 1081 if lang == 'und': 1082 lang = None 1083 except: 1084 lang = None 1085 return lang 1086 1087 def feeds2index(self, feeds): 1088 templ = (templates.TouchscreenIndexTemplate if self.touchscreen else 1089 templates.IndexTemplate) 1090 templ = templ(lang=self.lang_for_html) 1091 css = self.template_css + '\n\n' +(self.get_extra_css() or '') 1092 timefmt = self.timefmt 1093 return templ.generate(self.title, "mastheadImage.jpg", timefmt, feeds, 1094 extra_css=css).render(doctype='xhtml') 1095 1096 @classmethod 1097 def description_limiter(cls, src): 1098 if not src: 1099 return '' 1100 src = force_unicode(src, 'utf-8') 1101 pos = cls.summary_length 1102 fuzz = 50 1103 si = src.find(';', pos) 1104 if si > 0 and si-pos > fuzz: 1105 si = -1 1106 gi = src.find('>', pos) 1107 if gi > 0 and gi-pos > fuzz: 1108 gi = -1 1109 npos = max(si, gi) 1110 if npos < 0: 1111 npos = pos 1112 ans = src[:npos+1] 1113 if len(ans) < len(src): 1114 from calibre.utils.cleantext import clean_xml_chars 1115 1116 # Truncating the string could cause a dangling UTF-16 half-surrogate, which will cause lxml to barf, clean it 1117 ans = clean_xml_chars(ans) + '\u2026' 1118 return ans 1119 1120 def feed2index(self, f, feeds): 1121 feed = feeds[f] 1122 if feed.image_url is not None: # Download feed image 1123 imgdir = os.path.join(self.output_dir, 'images') 1124 if not os.path.isdir(imgdir): 1125 os.makedirs(imgdir) 1126 1127 if feed.image_url in self.image_map: 1128 feed.image_url = self.image_map[feed.image_url] 1129 else: 1130 bn = urlsplit(feed.image_url).path 1131 if bn: 1132 bn = bn.rpartition('/')[-1] 1133 if bn: 1134 img = os.path.join(imgdir, 'feed_image_%d%s'%(self.image_counter, os.path.splitext(bn))) 1135 try: 1136 with open(img, 'wb') as fi, closing(self.browser.open(feed.image_url, timeout=self.timeout)) as r: 1137 fi.write(r.read()) 1138 self.image_counter += 1 1139 feed.image_url = img 1140 self.image_map[feed.image_url] = img 1141 except: 1142 pass 1143 if isinstance(feed.image_url, bytes): 1144 feed.image_url = feed.image_url.decode(sys.getfilesystemencoding(), 'strict') 1145 1146 templ = (templates.TouchscreenFeedTemplate if self.touchscreen else 1147 templates.FeedTemplate) 1148 templ = templ(lang=self.lang_for_html) 1149 css = self.template_css + '\n\n' +(self.get_extra_css() or '') 1150 1151 return templ.generate(f, feeds, self.description_limiter, 1152 extra_css=css).render(doctype='xhtml') 1153 1154 def _fetch_article(self, url, dir_, f, a, num_of_feeds): 1155 br = self.browser 1156 if hasattr(self.get_browser, 'is_base_class_implementation'): 1157 # We are using the default get_browser, which means no need to 1158 # clone 1159 br = BasicNewsRecipe.get_browser(self) 1160 else: 1161 br = self.clone_browser(self.browser) 1162 self.web2disk_options.browser = br 1163 fetcher = RecursiveFetcher(self.web2disk_options, self.log, 1164 self.image_map, self.css_map, 1165 (url, f, a, num_of_feeds)) 1166 fetcher.browser = br 1167 fetcher.base_dir = dir_ 1168 fetcher.current_dir = dir_ 1169 fetcher.show_progress = False 1170 fetcher.image_url_processor = self.image_url_processor 1171 res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links 1172 if not res or not os.path.exists(res): 1173 msg = _('Could not fetch article.') + ' ' 1174 if self.debug: 1175 msg += _('The debug traceback is available earlier in this log') 1176 else: 1177 msg += _('Run with -vv to see the reason') 1178 raise Exception(msg) 1179 1180 return res, path, failures 1181 1182 def fetch_article(self, url, dir, f, a, num_of_feeds): 1183 return self._fetch_article(url, dir, f, a, num_of_feeds) 1184 1185 def fetch_obfuscated_article(self, url, dir, f, a, num_of_feeds): 1186 path = os.path.abspath(self.get_obfuscated_article(url)) 1187 url = ('file:'+path) if iswindows else ('file://'+path) 1188 return self._fetch_article(url, dir, f, a, num_of_feeds) 1189 1190 def fetch_embedded_article(self, article, dir, f, a, num_of_feeds): 1191 templ = templates.EmbeddedContent() 1192 raw = templ.generate(article).render('html') 1193 with PersistentTemporaryFile('_feeds2disk.html') as pt: 1194 pt.write(raw) 1195 url = ('file:'+pt.name) if iswindows else ('file://'+pt.name) 1196 return self._fetch_article(url, dir, f, a, num_of_feeds) 1197 1198 def remove_duplicate_articles(self, feeds): 1199 seen_keys = defaultdict(set) 1200 remove = [] 1201 for f in feeds: 1202 for article in f: 1203 for key in self.ignore_duplicate_articles: 1204 val = getattr(article, key) 1205 seen = seen_keys[key] 1206 if val: 1207 if val in seen: 1208 remove.append((f, article)) 1209 else: 1210 seen.add(val) 1211 1212 for feed, article in remove: 1213 self.log.debug('Removing duplicate article: %s from section: %s'%( 1214 article.title, feed.title)) 1215 feed.remove_article(article) 1216 1217 if self.remove_empty_feeds: 1218 feeds = [f for f in feeds if len(f) > 0] 1219 return feeds 1220 1221 def build_index(self): 1222 self.report_progress(0, _('Fetching feeds...')) 1223 feeds = None 1224 try: 1225 feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article, 1226 max_articles_per_feed=self.max_articles_per_feed, 1227 log=self.log) 1228 self.report_progress(0, _('Got feeds from index page')) 1229 except NotImplementedError: 1230 pass 1231 1232 if feeds is None: 1233 feeds = self.parse_feeds() 1234 1235 if not feeds: 1236 raise ValueError('No articles found, aborting') 1237 1238 if self.ignore_duplicate_articles is not None: 1239 feeds = self.remove_duplicate_articles(feeds) 1240 1241 self.report_progress(0, _('Trying to download cover...')) 1242 self.download_cover() 1243 self.report_progress(0, _('Generating masthead...')) 1244 self.resolve_masthead() 1245 1246 if self.test: 1247 feeds = feeds[:self.test[0]] 1248 self.has_single_feed = len(feeds) == 1 1249 1250 index = os.path.join(self.output_dir, 'index.html') 1251 1252 html = self.feeds2index(feeds) 1253 with open(index, 'wb') as fi: 1254 fi.write(html) 1255 1256 self.jobs = [] 1257 1258 if self.reverse_article_order: 1259 for feed in feeds: 1260 if hasattr(feed, 'reverse'): 1261 feed.reverse() 1262 1263 self.feed_objects = feeds 1264 for f, feed in enumerate(feeds): 1265 feed_dir = os.path.join(self.output_dir, 'feed_%d'%f) 1266 if not os.path.isdir(feed_dir): 1267 os.makedirs(feed_dir) 1268 1269 for a, article in enumerate(feed): 1270 if a >= self.max_articles_per_feed: 1271 break 1272 art_dir = os.path.join(feed_dir, 'article_%d'%a) 1273 if not os.path.isdir(art_dir): 1274 os.makedirs(art_dir) 1275 try: 1276 url = self.print_version(article.url) 1277 except NotImplementedError: 1278 url = article.url 1279 except: 1280 self.log.exception('Failed to find print version for: '+article.url) 1281 url = None 1282 if not url: 1283 continue 1284 func, arg = (self.fetch_embedded_article, article) \ 1285 if self.use_embedded_content or (self.use_embedded_content is None and feed.has_embedded_content()) \ 1286 else \ 1287 ((self.fetch_obfuscated_article if self.articles_are_obfuscated 1288 else self.fetch_article), url) 1289 req = WorkRequest(func, (arg, art_dir, f, a, len(feed)), 1290 {}, (f, a), self.article_downloaded, 1291 self.error_in_article_download) 1292 req.feed = feed 1293 req.article = article 1294 req.feed_dir = feed_dir 1295 self.jobs.append(req) 1296 1297 self.jobs_done = 0 1298 tp = ThreadPool(self.simultaneous_downloads) 1299 for req in self.jobs: 1300 tp.putRequest(req, block=True, timeout=0) 1301 1302 self.report_progress(0, ngettext( 1303 'Starting download in a single thread...', 1304 'Starting download [{} threads]...', self.simultaneous_downloads).format(self.simultaneous_downloads)) 1305 while True: 1306 try: 1307 tp.poll() 1308 time.sleep(0.1) 1309 except NoResultsPending: 1310 break 1311 1312 for f, feed in enumerate(feeds): 1313 html = self.feed2index(f,feeds) 1314 feed_dir = os.path.join(self.output_dir, 'feed_%d'%f) 1315 with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi: 1316 fi.write(html) 1317 self.create_opf(feeds) 1318 self.report_progress(1, _('Feeds downloaded to %s')%index) 1319 1320 return index 1321 1322 def _download_cover(self): 1323 self.cover_path = None 1324 try: 1325 cu = self.get_cover_url() 1326 except Exception as err: 1327 self.log.error(_('Could not download cover: %s')%as_unicode(err)) 1328 self.log.debug(traceback.format_exc()) 1329 else: 1330 if not cu: 1331 return 1332 cdata = None 1333 if hasattr(cu, 'read'): 1334 cdata = cu.read() 1335 cu = getattr(cu, 'name', 'cover.jpg') 1336 elif os.access(cu, os.R_OK): 1337 with open(cu, 'rb') as f: 1338 cdata = f.read() 1339 else: 1340 self.report_progress(1, _('Downloading cover from %s')%cu) 1341 with closing(self.browser.open(cu, timeout=self.timeout)) as r: 1342 cdata = r.read() 1343 if not cdata: 1344 return 1345 ext = cu.split('/')[-1].rpartition('.')[-1].lower().strip() 1346 if ext == 'pdf': 1347 from calibre.ebooks.metadata.pdf import get_metadata 1348 stream = io.BytesIO(cdata) 1349 cdata = None 1350 mi = get_metadata(stream) 1351 if mi.cover_data and mi.cover_data[1]: 1352 cdata = mi.cover_data[1] 1353 if not cdata: 1354 return 1355 if self.cover_margins[0] or self.cover_margins[1]: 1356 cdata = image_to_data(add_borders_to_image(cdata, 1357 left=self.cover_margins[0],right=self.cover_margins[0], 1358 top=self.cover_margins[1],bottom=self.cover_margins[1], 1359 border_color=self.cover_margins[2])) 1360 1361 cpath = os.path.join(self.output_dir, 'cover.jpg') 1362 save_cover_data_to(cdata, cpath) 1363 self.cover_path = cpath 1364 1365 def download_cover(self): 1366 self.cover_path = None 1367 try: 1368 self._download_cover() 1369 except: 1370 self.log.exception('Failed to download cover') 1371 self.cover_path = None 1372 1373 def _download_masthead(self, mu): 1374 if hasattr(mu, 'rpartition'): 1375 ext = mu.rpartition('.')[-1] 1376 if '?' in ext: 1377 ext = '' 1378 else: 1379 ext = mu.name.rpartition('.')[-1] 1380 ext = ext.lower() if ext else 'jpg' 1381 mpath = os.path.join(self.output_dir, 'masthead_source.'+ext) 1382 outfile = os.path.join(self.output_dir, 'mastheadImage.jpg') 1383 if hasattr(mu, 'read'): 1384 with open(mpath, 'wb') as mfile: 1385 mfile.write(mu.read()) 1386 elif os.access(mu, os.R_OK): 1387 with open(mpath, 'wb') as mfile: 1388 mfile.write(open(mu, 'rb').read()) 1389 else: 1390 with open(mpath, 'wb') as mfile, closing(self.browser.open(mu, timeout=self.timeout)) as r: 1391 mfile.write(r.read()) 1392 self.report_progress(1, _('Masthead image downloaded')) 1393 self.prepare_masthead_image(mpath, outfile) 1394 self.masthead_path = outfile 1395 if os.path.exists(mpath): 1396 os.remove(mpath) 1397 1398 def download_masthead(self, url): 1399 try: 1400 self._download_masthead(url) 1401 except: 1402 self.log.exception("Failed to download supplied masthead_url") 1403 1404 def resolve_masthead(self): 1405 self.masthead_path = None 1406 try: 1407 murl = self.get_masthead_url() 1408 except: 1409 self.log.exception('Failed to get masthead url') 1410 murl = None 1411 1412 if murl is not None: 1413 # Try downloading the user-supplied masthead_url 1414 # Failure sets self.masthead_path to None 1415 self.download_masthead(murl) 1416 if self.masthead_path is None: 1417 self.log.info("Synthesizing mastheadImage") 1418 self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg') 1419 try: 1420 self.default_masthead_image(self.masthead_path) 1421 except: 1422 self.log.exception('Failed to generate default masthead image') 1423 self.masthead_path = None 1424 1425 def default_cover(self, cover_file): 1426 ''' 1427 Create a generic cover for recipes that don't have a cover 1428 ''' 1429 try: 1430 from calibre.ebooks.covers import create_cover 1431 title = self.title if isinstance(self.title, str) else \ 1432 self.title.decode(preferred_encoding, 'replace') 1433 date = strftime(self.timefmt).replace('[', '').replace(']', '') 1434 img_data = create_cover(title, [date]) 1435 cover_file.write(img_data) 1436 cover_file.flush() 1437 except: 1438 self.log.exception('Failed to generate default cover') 1439 return False 1440 return True 1441 1442 def get_masthead_title(self): 1443 'Override in subclass to use something other than the recipe title' 1444 return self.title 1445 1446 MI_WIDTH = 600 1447 MI_HEIGHT = 60 1448 1449 def default_masthead_image(self, out_path): 1450 from calibre.ebooks import generate_masthead 1451 generate_masthead(self.get_masthead_title(), output_path=out_path, 1452 width=self.MI_WIDTH, height=self.MI_HEIGHT) 1453 1454 def prepare_masthead_image(self, path_to_image, out_path): 1455 prepare_masthead_image(path_to_image, out_path, self.MI_WIDTH, self.MI_HEIGHT) 1456 1457 def publication_date(self): 1458 return nowf() 1459 1460 def create_opf(self, feeds, dir=None): 1461 if dir is None: 1462 dir = self.output_dir 1463 title = self.short_title() 1464 if self.output_profile.periodical_date_in_title: 1465 title += strftime(self.timefmt) 1466 mi = MetaInformation(title, [__appname__]) 1467 mi.publisher = __appname__ 1468 mi.author_sort = __appname__ 1469 if self.publication_type: 1470 mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() 1471 mi.timestamp = nowf() 1472 article_titles, aseen = [], set() 1473 for f in feeds: 1474 for a in f: 1475 if a.title and a.title not in aseen: 1476 aseen.add(a.title) 1477 article_titles.append(force_unicode(a.title, 'utf-8')) 1478 1479 desc = self.description 1480 if not isinstance(desc, str): 1481 desc = desc.decode('utf-8', 'replace') 1482 mi.comments = (_('Articles in this issue:' 1483 ) + '\n\n' + '\n\n'.join(article_titles)) + '\n\n' + desc 1484 1485 language = canonicalize_lang(self.language) 1486 if language is not None: 1487 mi.language = language 1488 mi.pubdate = self.publication_date() 1489 opf_path = os.path.join(dir, 'index.opf') 1490 ncx_path = os.path.join(dir, 'index.ncx') 1491 1492 opf = OPFCreator(dir, mi) 1493 # Add mastheadImage entry to <guide> section 1494 mp = getattr(self, 'masthead_path', None) 1495 if mp is not None and os.access(mp, os.R_OK): 1496 from calibre.ebooks.metadata.opf2 import Guide 1497 ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwd()) 1498 ref.type = 'masthead' 1499 ref.title = 'Masthead Image' 1500 opf.guide.append(ref) 1501 1502 manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))] 1503 manifest.append(os.path.join(dir, 'index.html')) 1504 manifest.append(os.path.join(dir, 'index.ncx')) 1505 1506 # Get cover 1507 cpath = getattr(self, 'cover_path', None) 1508 if cpath is None: 1509 pf = open(os.path.join(dir, 'cover.jpg'), 'wb') 1510 if self.default_cover(pf): 1511 cpath = pf.name 1512 if cpath is not None and os.access(cpath, os.R_OK): 1513 opf.cover = cpath 1514 manifest.append(cpath) 1515 1516 # Get masthead 1517 mpath = getattr(self, 'masthead_path', None) 1518 if mpath is not None and os.access(mpath, os.R_OK): 1519 manifest.append(mpath) 1520 1521 opf.create_manifest_from_files_in(manifest) 1522 for mani in opf.manifest: 1523 if mani.path.endswith('.ncx'): 1524 mani.id = 'ncx' 1525 if mani.path.endswith('mastheadImage.jpg'): 1526 mani.id = 'masthead-image' 1527 1528 entries = ['index.html'] 1529 toc = TOC(base_path=dir) 1530 self.play_order_counter = 0 1531 self.play_order_map = {} 1532 1533 self.article_url_map = aumap = defaultdict(set) 1534 1535 def feed_index(num, parent): 1536 f = feeds[num] 1537 for j, a in enumerate(f): 1538 if getattr(a, 'downloaded', False): 1539 adir = 'feed_%d/article_%d/'%(num, j) 1540 auth = a.author 1541 if not auth: 1542 auth = None 1543 desc = a.text_summary 1544 if not desc: 1545 desc = None 1546 else: 1547 desc = self.description_limiter(desc) 1548 tt = a.toc_thumbnail if a.toc_thumbnail else None 1549 entries.append('%sindex.html'%adir) 1550 po = self.play_order_map.get(entries[-1], None) 1551 if po is None: 1552 self.play_order_counter += 1 1553 po = self.play_order_counter 1554 arelpath = '%sindex.html'%adir 1555 for curl in self.canonicalize_internal_url(a.orig_url, is_link=False): 1556 aumap[curl].add(arelpath) 1557 article_toc_entry = parent.add_item(arelpath, None, 1558 a.title if a.title else _('Untitled article'), 1559 play_order=po, author=auth, 1560 description=desc, toc_thumbnail=tt) 1561 for entry in a.internal_toc_entries: 1562 anchor = entry.get('anchor') 1563 if anchor: 1564 self.play_order_counter += 1 1565 po += 1 1566 article_toc_entry.add_item( 1567 arelpath, entry['anchor'], entry['title'] or _('Unknown section'), 1568 play_order=po 1569 ) 1570 last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) 1571 for sp in a.sub_pages: 1572 prefix = os.path.commonprefix([opf_path, sp]) 1573 relp = sp[len(prefix):] 1574 entries.append(relp.replace(os.sep, '/')) 1575 last = sp 1576 1577 if os.path.exists(last): 1578 with open(last, 'rb') as fi: 1579 src = fi.read().decode('utf-8') 1580 soup = BeautifulSoup(src) 1581 body = soup.find('body') 1582 if body is not None: 1583 prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last)))) 1584 templ = self.navbar.generate(True, num, j, len(f), 1585 not self.has_single_feed, 1586 a.orig_url, __appname__, prefix=prefix, 1587 center=self.center_navbar) 1588 elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') 1589 body.insert(len(body.contents), elem) 1590 with open(last, 'wb') as fi: 1591 fi.write(str(soup).encode('utf-8')) 1592 if len(feeds) == 0: 1593 raise Exception('All feeds are empty, aborting.') 1594 1595 if len(feeds) > 1: 1596 for i, f in enumerate(feeds): 1597 entries.append('feed_%d/index.html'%i) 1598 po = self.play_order_map.get(entries[-1], None) 1599 if po is None: 1600 self.play_order_counter += 1 1601 po = self.play_order_counter 1602 auth = getattr(f, 'author', None) 1603 if not auth: 1604 auth = None 1605 desc = getattr(f, 'description', None) 1606 if not desc: 1607 desc = None 1608 feed_index(i, toc.add_item('feed_%d/index.html'%i, None, 1609 f.title, play_order=po, description=desc, author=auth)) 1610 1611 else: 1612 entries.append('feed_%d/index.html'%0) 1613 feed_index(0, toc) 1614 1615 for i, p in enumerate(entries): 1616 entries[i] = os.path.join(dir, p.replace('/', os.sep)) 1617 opf.create_spine(entries) 1618 opf.set_toc(toc) 1619 1620 with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file: 1621 opf.render(opf_file, ncx_file) 1622 1623 def article_downloaded(self, request, result): 1624 index = os.path.join(os.path.dirname(result[0]), 'index.html') 1625 if index != result[0]: 1626 if os.path.exists(index): 1627 os.remove(index) 1628 os.rename(result[0], index) 1629 a = request.requestID[1] 1630 1631 article = request.article 1632 self.log.debug('Downloaded article:', article.title, 'from', article.url) 1633 article.orig_url = article.url 1634 article.url = 'article_%d/index.html'%a 1635 article.downloaded = True 1636 article.sub_pages = result[1][1:] 1637 self.jobs_done += 1 1638 self.report_progress(float(self.jobs_done)/len(self.jobs), 1639 _('Article downloaded: %s')%force_unicode(article.title)) 1640 if result[2]: 1641 self.partial_failures.append((request.feed.title, article.title, article.url, result[2])) 1642 1643 def error_in_article_download(self, request, traceback): 1644 self.jobs_done += 1 1645 if traceback and re.search('^AbortArticle:', traceback, flags=re.M) is not None: 1646 self.log.warn('Aborted download of article:', request.article.title, 1647 'from', request.article.url) 1648 self.report_progress(float(self.jobs_done)/len(self.jobs), 1649 _('Article download aborted: %s')%force_unicode(request.article.title)) 1650 else: 1651 self.log.error('Failed to download article:', request.article.title, 1652 'from', request.article.url) 1653 self.log.debug(traceback) 1654 self.log.debug('\n') 1655 self.report_progress(float(self.jobs_done)/len(self.jobs), 1656 _('Article download failed: %s')%force_unicode(request.article.title)) 1657 self.failed_downloads.append((request.feed, request.article, traceback)) 1658 1659 def parse_feeds(self): 1660 ''' 1661 Create a list of articles from the list of feeds returned by :meth:`BasicNewsRecipe.get_feeds`. 1662 Return a list of :class:`Feed` objects. 1663 ''' 1664 feeds = self.get_feeds() 1665 parsed_feeds = [] 1666 br = self.browser 1667 for obj in feeds: 1668 if isinstance(obj, string_or_bytes): 1669 title, url = None, obj 1670 else: 1671 title, url = obj 1672 if isinstance(title, bytes): 1673 title = title.decode('utf-8') 1674 if isinstance(url, bytes): 1675 url = url.decode('utf-8') 1676 if url.startswith('feed://'): 1677 url = 'http'+url[4:] 1678 self.report_progress(0, _('Fetching feed')+' %s...'%(title if title else url)) 1679 try: 1680 purl = urlparse(url, allow_fragments=False) 1681 if purl.username or purl.password: 1682 hostname = purl.hostname 1683 if purl.port: 1684 hostname += f':{purl.port}' 1685 url = purl._replace(netloc=hostname).geturl() 1686 if purl.username and purl.password: 1687 br.add_password(url, purl.username, purl.password) 1688 with closing(br.open_novisit(url, timeout=self.timeout)) as f: 1689 raw = f.read() 1690 parsed_feeds.append(feed_from_xml( 1691 raw, title=title, log=self.log, 1692 oldest_article=self.oldest_article, 1693 max_articles_per_feed=self.max_articles_per_feed, 1694 get_article_url=self.get_article_url 1695 )) 1696 except Exception as err: 1697 feed = Feed() 1698 msg = 'Failed feed: %s'%(title if title else url) 1699 feed.populate_from_preparsed_feed(msg, []) 1700 feed.description = as_unicode(err) 1701 parsed_feeds.append(feed) 1702 self.log.exception(msg) 1703 if self.delay > 0: 1704 time.sleep(self.delay) 1705 1706 remove = [fl for fl in parsed_feeds if len(fl) == 0 and self.remove_empty_feeds] 1707 for f in remove: 1708 parsed_feeds.remove(f) 1709 1710 return parsed_feeds 1711 1712 @classmethod 1713 def tag_to_string(self, tag, use_alt=True, normalize_whitespace=True): 1714 ''' 1715 Convenience method to take a 1716 `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_ 1717 :code:`Tag` and extract the text from it recursively, including any CDATA sections 1718 and alt tag attributes. Return a possibly empty Unicode string. 1719 1720 `use_alt`: If `True` try to use the alt attribute for tags that don't 1721 have any textual content 1722 1723 `tag`: `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`_ 1724 :code:`Tag` 1725 ''' 1726 if tag is None: 1727 return '' 1728 if isinstance(tag, string_or_bytes): 1729 return tag 1730 if callable(getattr(tag, 'xpath', None)) and not hasattr(tag, 'contents'): # a lxml tag 1731 from lxml.etree import tostring 1732 ans = tostring(tag, method='text', encoding='unicode', with_tail=False) 1733 else: 1734 strings = [] 1735 for item in tag.contents: 1736 if isinstance(item, (NavigableString, CData)): 1737 strings.append(item.string) 1738 elif isinstance(item, Tag): 1739 res = self.tag_to_string(item) 1740 if res: 1741 strings.append(res) 1742 elif use_alt: 1743 try: 1744 strings.append(item['alt']) 1745 except KeyError: 1746 pass 1747 ans = ''.join(strings) 1748 if normalize_whitespace: 1749 ans = re.sub(r'\s+', ' ', ans) 1750 return ans 1751 1752 @classmethod 1753 def soup(cls, raw): 1754 return BeautifulSoup(raw) 1755 1756 @classmethod 1757 def adeify_images(cls, soup): 1758 ''' 1759 If your recipe when converted to EPUB has problems with images when 1760 viewed in Adobe Digital Editions, call this method from within 1761 :meth:`postprocess_html`. 1762 ''' 1763 for item in soup.findAll('img'): 1764 for attrib in ['height','width','border','align','style']: 1765 try: 1766 del item[attrib] 1767 except KeyError: 1768 pass 1769 oldParent = item.parent 1770 myIndex = oldParent.contents.index(item) 1771 item.extract() 1772 divtag = soup.new_tag('div') 1773 brtag = soup.new_tag('br') 1774 oldParent.insert(myIndex,divtag) 1775 divtag.append(item) 1776 divtag.append(brtag) 1777 return soup 1778 1779 def internal_postprocess_book(self, oeb, opts, log): 1780 if self.resolve_internal_links and self.article_url_map: 1781 seen = set() 1782 for item in oeb.spine: 1783 for a in item.data.xpath('//*[local-name()="a" and @href]'): 1784 if a.get('rel') == 'calibre-downloaded-from': 1785 continue 1786 url = a.get('href') 1787 for curl in self.canonicalize_internal_url(url): 1788 articles = self.article_url_map.get(curl) 1789 if articles: 1790 arelpath = sorted(articles, key=numeric_sort_key)[0] 1791 a.set('href', item.relhref(arelpath)) 1792 if url not in seen: 1793 log.debug('Resolved internal URL: %s -> %s' % (url, arelpath)) 1794 seen.add(url) 1795 1796 1797class CustomIndexRecipe(BasicNewsRecipe): 1798 1799 def custom_index(self): 1800 ''' 1801 Return the filesystem path to a custom HTML document that will serve as the index for 1802 this recipe. The index document will typically contain many `<a href="...">` 1803 tags that point to resources on the internet that should be downloaded. 1804 ''' 1805 raise NotImplementedError 1806 1807 def create_opf(self): 1808 mi = MetaInformation(self.title + strftime(self.timefmt), [__appname__]) 1809 mi.publisher = __appname__ 1810 mi.author_sort = __appname__ 1811 mi = OPFCreator(self.output_dir, mi) 1812 mi.create_manifest_from_files_in([self.output_dir]) 1813 mi.create_spine([os.path.join(self.output_dir, 'index.html')]) 1814 with open(os.path.join(self.output_dir, 'index.opf'), 'wb') as opf_file: 1815 mi.render(opf_file) 1816 1817 def download(self): 1818 index = os.path.abspath(self.custom_index()) 1819 url = 'file:'+index if iswindows else 'file://'+index 1820 self.web2disk_options.browser = self.clone_browser(self.browser) 1821 fetcher = RecursiveFetcher(self.web2disk_options, self.log) 1822 fetcher.base_dir = self.output_dir 1823 fetcher.current_dir = self.output_dir 1824 fetcher.show_progress = False 1825 res = fetcher.start_fetch(url) 1826 self.create_opf() 1827 return res 1828 1829 1830class AutomaticNewsRecipe(BasicNewsRecipe): 1831 1832 auto_cleanup = True 1833 1834 1835class CalibrePeriodical(BasicNewsRecipe): 1836 1837 #: Set this to the slug for the calibre periodical 1838 calibre_periodicals_slug = None 1839 1840 LOG_IN = 'https://news.calibre-ebook.com/accounts/login' 1841 needs_subscription = True 1842 __author__ = 'calibre Periodicals' 1843 1844 def get_browser(self): 1845 br = BasicNewsRecipe.get_browser(self) 1846 br.open(self.LOG_IN) 1847 br.select_form(name='login') 1848 br['username'] = self.username 1849 br['password'] = self.password 1850 raw = br.submit().read() 1851 if 'href="/my-account"' not in raw: 1852 raise LoginFailed( 1853 _('Failed to log in, check your username and password for' 1854 ' the calibre Periodicals service.')) 1855 1856 return br 1857 get_browser.is_base_class_implementation = True 1858 1859 def download(self): 1860 self.log('Fetching downloaded recipe') 1861 try: 1862 raw = self.browser.open_novisit( 1863 'https://news.calibre-ebook.com/subscribed_files/%s/0/temp.downloaded_recipe' 1864 % self.calibre_periodicals_slug 1865 ).read() 1866 except Exception as e: 1867 if hasattr(e, 'getcode') and e.getcode() == 403: 1868 raise DownloadDenied( 1869 _('You do not have permission to download this issue.' 1870 ' Either your subscription has expired or you have' 1871 ' exceeded the maximum allowed downloads for today.')) 1872 raise 1873 f = io.BytesIO(raw) 1874 from calibre.utils.zipfile import ZipFile 1875 zf = ZipFile(f) 1876 zf.extractall() 1877 zf.close() 1878 from glob import glob 1879 1880 from calibre.web.feeds.recipes import compile_recipe 1881 try: 1882 recipe = compile_recipe(open(glob('*.recipe')[0], 1883 'rb').read()) 1884 self.conversion_options = recipe.conversion_options 1885 except: 1886 self.log.exception('Failed to compile downloaded recipe') 1887 return os.path.abspath('index.html') 1888