##
# Title: BBC News, Sport, and Blog Calibre Recipe
# Contact: mattst - jmstanfield@gmail.com
##
# License: GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Copyright: mattst - jmstanfield@gmail.com
##
# Written: November 2011
# Last Edited: 2011-11-19
##

__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = 'mattst - jmstanfield@gmail.com'


'''
BBC News, Sport, and Blog Calibre Recipe
'''

# Import the regular expressions module.
import re

# Import the BasicNewsRecipe class which this class extends.
from calibre.web.feeds.recipes import BasicNewsRecipe


class BBCBrasilRecipe(BasicNewsRecipe):

    #
    # **** IMPORTANT - USERS READ ME ****
    #
    # First select the feeds you want, then scroll down below the feeds list
    # and select the values you want for the other user preferences, like
    # oldest_article and such like.
    #
    #
    # Select the BBC RSS feeds which you want in your ebook.
    # Selected feeds have NO '#' at their start; de-selected feeds begin
    # with a '#'.
    #
    # Eg. ("News Home", "http://feeds.bbci.co.uk/... - include feed.
    # Eg. #("News Home", "http://feeds.bbci.co.uk/... - do not include feed.
    #
    # There are 68 feeds below which constitute the bulk of the available RSS
    # feeds on the BBC web site. These include 5 blogs by editors and
    # correspondents, 16 sports feeds, 15 'sub' regional feeds (Eg. North West
    # Wales, Scotland Business), and 7 Welsh language feeds.
    #
    # Some of the feeds are low volume (Eg. blogs), or very low volume
    # (Eg. Click), so if "oldest_article = 1.5" (only articles published in
    # the last 36 hours) you may get some 'empty feeds' which will not then
    # be included in the ebook.
    #
    # The 15 feeds currently selected below are simply my default ones.
    #
    # Note: With all 68 feeds selected, oldest_article set to 2,
    # max_articles_per_feed set to 100, and simultaneous_downloads set to 10,
    # the ebook creation took 29 minutes on my speedy 100 Mbps net connection
    # and fairly high-end desktop PC running Linux (Ubuntu Lucid Lynx).
    # More realistically, with 15 feeds selected, oldest_article set to 1.5,
    # max_articles_per_feed set to 100, and simultaneous_downloads set to 20,
    # it took 6 minutes. If that's too slow, increase 'simultaneous_downloads'.
    #
    # Select / de-select the feeds you want in your ebook.
    #
    feeds = [
        (u'Primeira P\xe1gina', u'http://www.bbc.co.uk/portuguese/index.xml'),
        (u'\xdaltimas Not\xedcias',
         u'http://www.bbc.co.uk/portuguese/ultimas_noticias/index.xml'),
        (u'Internacional',
         u'http://www.bbc.co.uk/portuguese/topicos/internacional/index.xml'),
        (u'Brasil', u'http://www.bbc.co.uk/portuguese/topicos/brasil/index.xml'),
        (u'Am\xe9rica Latina',
         u'http://www.bbc.co.uk/portuguese/topicos/america_latina/index.xml'),
        (u'Economia', u'http://www.bbc.co.uk/portuguese/topicos/economia/index.xml'),
        (u'Sa\xfade', u'http://www.bbc.co.uk/portuguese/topicos/saude/index.xml'),
        (u'Ci\xeancia e Tecnologia',
         u'http://www.bbc.co.uk/portuguese/topicos/ciencia_e_tecnologia/index.xml'),
        (u'Cultura', u'http://www.bbc.co.uk/portuguese/topicos/cultura/index.xml'),
        (u'V\xeddeos e Fotos',
         u'http://www.bbc.co.uk/portuguese/videos_e_fotos/index.xml'),
        (u'Especiais', u'http://www.bbc.co.uk/portuguese/especiais/index.xml')
    ]

    # **** SELECT YOUR USER PREFERENCES ****

    # Title to use for the ebook.
    #
    title = 'BBC Brasil'

    # A brief description for the ebook.
    #
    description = u'Not\xedcias do Brasil e do mundo pela British Broadcasting Corporation'

    # The max number of articles which may be downloaded from each feed.
    # I've never seen more than about 70 articles in a single feed in the
    # BBC feeds.
    #
    max_articles_per_feed = 100

    # The max age of articles which may be downloaded from each feed. This is
    # specified in days - note fractions of days are allowed, Eg. 2.5 (2 and a
    # half days). My default of 1.5 days is the last 36 hours, the point at
    # which I've decided 'news' becomes 'old news', but be warned this is not
    # so good for the blogs, technology, magazine, etc., and sports feeds.
    # You may wish to extend this to 2-5, but watch out: ebook creation time
    # will increase as well. Setting this to 30 will get everything (AFAICT)
    # as long as max_articles_per_feed remains set high (except for 'Click',
    # which is very low volume and whose oldest article is currently
    # 4th Feb 2011).
    #
    oldest_article = 1.5

    # Number of simultaneous downloads. 20 is consistently working fine on
    # the BBC News feeds with no problems. Speeds things up from the default
    # of 5. If you have a lot of feeds and/or have increased oldest_article
    # above 2, then you may wish to try increasing simultaneous_downloads to
    # 25-30 - or, of course, if you are in a hurry. [I've not tried beyond 20.]
    #
    simultaneous_downloads = 20

    # Timeout for fetching files from the server in seconds. The default of
    # 120 seconds seems somewhat excessive.
    #
    timeout = 30

    # The format string for the date shown on the ebook's first page.
    # List of all values: http://docs.python.org/library/time.html
    # Default in news.py has a leading space so that's mirrored here.
    # As with 'feeds', select/de-select by adding/removing the initial '#';
    # only one timefmt should be selected. Here's a few to choose from.
    #
    # [Fri, 14 Nov 2011] (Calibre default)
    timefmt = ' [%a, %d %b %Y]'
    # timefmt = ' [%a, %d %b %Y %H:%M]'       # [Fri, 14 Nov 2011 18:30]
    # timefmt = ' [%a, %d %b %Y %I:%M %p]'    # [Fri, 14 Nov 2011 06:30 PM]
    # timefmt = ' [%d %b %Y]'                 # [14 Nov 2011]
    # timefmt = ' [%d %b %Y %H:%M]'           # [14 Nov 2011 18:30]
    # timefmt = ' [%Y-%m-%d]'                 # [2011-11-14]
    # timefmt = ' [%Y-%m-%d-%H-%M]'           # [2011-11-14-18-30]

    #
    # **** IMPORTANT ****
    #
    # DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
    #
    # DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
    #
    # I MEAN IT, YES I DO, ABSOLUTELY, AT YOUR OWN RISK. :)
    #
    # **** IMPORTANT ****
    #

    # Author of this recipe.
    __author__ = 'Carlos Laviola'

    language = 'pt_BR'

    # Set tags.
    tags = 'news, sport, blog'

    # Set publisher and publication type.
    publisher = 'BBC'
    publication_type = 'newspaper'

    # Disable stylesheets from site.
    no_stylesheets = True

    # Specifies an override encoding for sites that have an incorrect charset
    # specified. The default of None auto-detects. Some other BBC recipes use
    # 'utf8', which works fine (so use that if necessary), but auto-detecting
    # with None is working fine, so stick with that for robustness.
    encoding = None

    # Sets whether a feed has full articles embedded in it. The BBC feeds do
    # not.
    use_embedded_content = False

    # Removes empty feeds - why keep them!?
    remove_empty_feeds = True

    # Create a custom title which fits nicely in the Kindle title list.
    # Requires "import time" above the class declaration, and replacing
    # title with custom_title in conversion_options (right column only).
    # Example of string below: "BBC News - 14 Nov 2011"
    #
    # custom_title = "BBC News - " + time.strftime('%d %b %Y')
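
    # A minimal sketch of the custom-title wiring described above, for
    # illustration only. It assumes the advanced conversion_options block
    # below has been uncommented; only the right-hand side of its 'title'
    # entry changes:
    #
    # import time  # at module level, above the class declaration
    # custom_title = "BBC News - " + time.strftime('%d %b %Y')
    # conversion_options = { 'title' : custom_title, ... }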

    '''
    # Conversion options for advanced users, but don't forget to comment out
    # the current conversion_options below. Avoid setting 'linearize_tables'
    # as that plays havoc with the 'old style' table based pages.
    #
    conversion_options = { 'title'       : title,
                           'comments'    : description,
                           'tags'        : tags,
                           'language'    : language,
                           'publisher'   : publisher,
                           'authors'     : publisher,
                           'smarten_punctuation' : True
                         }
    '''

    conversion_options = {'smarten_punctuation': True}

    # Specify extra CSS - overrides ALL other CSS (i.e. added last).
    extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
                .introduction, .first { font-weight: bold; } \
                .cross-head { font-weight: bold; font-size: 125%; } \
                .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
                .cap, .caption, .caption img, .caption span { display: block; text-align: center; margin: 5px auto; } \
                .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
                .correspondent-portrait img, .byline-lead-in, .name, .role, .bbc-role { display: block; \
                text-align: center; font-size: 80%; font-style: italic; margin: 1px auto; } \
                .story-date, .published, .datestamp { font-size: 80%; } \
                table { width: 100%; } \
                td img { display: block; margin: 5px auto; } \
                ul { padding-top: 10px; } \
                ol { padding-top: 10px; } \
                li { padding-top: 5px; padding-bottom: 5px; } \
                h1 { text-align: center; font-size: 175%; font-weight: bold; } \
                h2 { text-align: center; font-size: 150%; font-weight: bold; } \
                h3 { text-align: center; font-size: 125%; font-weight: bold; } \
                h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }'

    # Remove various tag attributes to improve the look of the ebook pages.
    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding',
                         'colspan', 'valign', 'vspace', 'hspace', 'alt',
                         'width', 'height']

    # Remove the (admittedly rarely used) line breaks, "<br />", which
    # sometimes cause a section of the ebook to start in an unsightly
    # fashion or, more frequently, a "<br />" will muck up the formatting of
    # a correspondent's byline. "<br />" and "<br clear/>" are far more
    # frequently used on the table formatted style of pages, and really
    # spoil the look of the ebook pages.
    preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
                          (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: '')]

    # Create regular expressions for tag keeping and removal to make the
    # matches more robust against minor changes and errors in the HTML, Eg.
    # double spaces, leading and trailing spaces, missing hyphens, and such
    # like. Python regular expression ('re' module) page:
    # http://docs.python.org/library/re.html

    # ***************************************
    # Regular expressions for keep_only_tags:
    # ***************************************

    # The BBC News HTML pages use variants of 'storybody' to denote the
    # section of an HTML page which contains the main text of the article.
    # Match storybody variants: 'storybody', 'story-body', 'story body',
    # 'storybody ', etc.
    storybody_reg_exp = '^.*story[_ -]*body.*$'

    # The BBC sport and 'newsbeat' (features) HTML pages use 'blq_content'
    # to hold the title and published date. This is one level above the
    # usual news pages, which have the title and date within 'story-body'.
    # This is annoying since 'blq_content' must also be kept, resulting in a
    # lot of extra things to be removed by remove_tags.
    blq_content_reg_exp = '^.*blq[_ -]*content.*$'

    # The BBC has an alternative page design structure, which I suspect is
    # an out-of-date design but which is still used in some articles, Eg.
    # 'Click' (technology), 'FastTrack' (travel), and in some sport pages.
    # These alternative pages are table based (which is why I think they are
    # an out-of-date design) and account for - I'm guesstimating - less than
    # 1% of all articles. They use a table class 'storycontent' to hold the
    # article and, like blq_content (above), have required lots of extra
    # removal by remove_tags.
    story_content_reg_exp = '^.*story[_ -]*content.*$'

    # Keep the sections of the HTML which match the list below. The HTML
    # page created by Calibre will fill <body> with those sections which are
    # matched. Note that blq_content_reg_exp must be listed before
    # storybody_reg_exp in keep_only_tags due to it being the parent of
    # storybody_reg_exp, that is to say the div class/id 'story-body' will
    # be inside div class/id 'blq_content' in the HTML (if 'blq_content' is
    # there at all). If they are the other way around in keep_only_tags then
    # blq_content_reg_exp will end up being discarded.
    keep_only_tags = [
        dict(name='table', attrs={'class': re.compile(story_content_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(blq_content_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile(blq_content_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(storybody_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile(storybody_reg_exp, re.IGNORECASE)})
    ]

    # ************************************
    # Regular expressions for remove_tags:
    # ************************************

    # Regular expression to remove share-help and variant tags. The
    # share-help class is used by the site for a variety of 'sharing' type
    # links, Eg. Facebook, delicious, twitter, email. Removed to avoid page
    # clutter.
    share_help_reg_exp = '^.*share[_ -]*help.*$'

    # Regular expression to remove embedded-hyper and variant tags. This
    # class is used to display links to other BBC News articles on the
    # same/similar subject.
    embedded_hyper_reg_exp = '^.*embed*ed[_ -]*hyper.*$'

    # Regular expression to remove hypertabs and variant tags. This class is
    # used to display a tab bar at the top of an article which allows the
    # user to switch to an article (viewed on the same page) providing
    # further info., 'in depth' analysis, an editorial, a correspondent's
    # blog entry, and such like. The ability to handle a tab bar of this
    # nature is currently beyond the scope of this recipe and possibly of
    # Calibre itself (not sure about that - TO DO - check!).
    hypertabs_reg_exp = '^.*hyper[_ -]*tabs.*$'

    # Regular expression to remove story-feature and variant tags, Eg.
    # 'story-feature', 'story-feature related narrow', 'story-feature wide',
    # 'story-feature narrow'.
    # This class is used to add additional info. boxes, or small lists,
    # outside of the main story. TO DO: Work out a way to incorporate these
    # neatly.
    story_feature_reg_exp = '^.*story[_ -]*feature.*$'

    # Regular expression to remove video and variant tags, Eg.
    # 'videoInStoryB', 'videoInStoryC'. This class is used to embed video.
    video_reg_exp = '^.*video.*$'

    # Regular expression to remove audio and variant tags, Eg.
    # 'audioInStoryD'. This class is used to embed audio.
    audio_reg_exp = '^.*audio.*$'

    # Regular expression to remove pictureGallery and variant tags, Eg.
    # 'pictureGallery'. This class is used to embed a photo slideshow. See
    # also 'slideshow' below.
    picture_gallery_reg_exp = '^.*picture.*$'

    # Regular expression to remove slideshow and variant tags, Eg.
    # 'dslideshow-enclosure'. This class is used to embed a slideshow (not
    # necessarily photo), but both 'slideshow' and 'pictureGallery' are used
    # for slideshows.
    slideshow_reg_exp = '^.*slide[_ -]*show.*$'

    # Regular expression to remove social-links and variant tags. This class
    # is used to display links to a BBC blogger's main page, used in various
    # columnists' blogs (Eg. Nick Robinson, Robert Peston).
    social_links_reg_exp = '^.*social[_ -]*links.*$'

    # Regular expression to remove quote and (multi) variant tags, Eg.
    # 'quote', 'endquote', 'quote-credit', 'quote-credit-title', etc. These
    # are usually removed by 'story-feature' removal (as they are usually
    # within them), but not always. The quotation removed is always (AFAICT)
    # in the article text as well, but a 2nd copy is placed in a quote tag
    # to draw attention to it. The quote class tags may or may not appear in
    # divs.
    quote_reg_exp = '^.*quote.*$'

    # Regular expression to remove hidden and variant tags, Eg. 'hidden'.
    # The purpose of these is unclear; they seem to be an internal link to a
    # section within the article, but the text of the link (Eg. 'Continue
    # reading the main story') never seems to be displayed anyway. Removed
    # to avoid clutter. The hidden class tags may or may not appear in divs.
    hidden_reg_exp = '^.*hidden.*$'

    # Regular expression to remove comment and variant tags, Eg.
    # 'comment-introduction'. Used on the site to display text about
    # registered users entering comments.
    comment_reg_exp = '^.*comment.*$'

    # Regular expression to remove form and variant tags, Eg. 'comment-form'.
    # Used on the site to allow registered BBC users to fill in forms,
    # typically for entering comments about an article.
    form_reg_exp = '^.*form.*$'

    # Extra things to remove due to the addition of 'blq_content' in
    # keep_only_tags.

    # <div class="story-actions"> Used on sports pages for 'email' and
    # 'print'.
    story_actions_reg_exp = '^.*story[_ -]*actions.*$'

    # <div class="bookmark-list"> Used on sports pages instead of
    # 'share-help' (for social networking links).
    bookmark_list_reg_exp = '^.*bookmark[_ -]*list.*$'

    # <div id="secondary-content" class="content-group">
    # NOTE: Do not remove class="content-group"; that is needed.
    # Used on sports pages to link to 'similar stories'.
    secondary_content_reg_exp = '^.*secondary[_ -]*content.*$'

    # <div id="featured-content" class="content-group">
    # NOTE: Do not remove class="content-group"; that is needed.
    # Used on sports pages to link to pages like 'tables', 'fixtures', etc.
    featured_content_reg_exp = '^.*featured[_ -]*content.*$'

    # <div id="navigation">
    # Used on sports pages to link to pages like 'tables', 'fixtures', etc.
    # Used sometimes instead of "featured-content" above.
    navigation_reg_exp = '^.*navigation.*$'

    # <a class="skip" href="#blq-container-inner">Skip to top</a>
    # Used on sports pages to link to the top of the page.
    skip_reg_exp = '^.*skip.*$'

    # Extra things to remove due to the addition of 'storycontent' in
    # keep_only_tags, which are the alternative table design based pages.
    # The purpose of some of these is not entirely clear from the pages
    # (which are a total mess!).

    # Remove mapping based tags, Eg. <map id="world_map">
    # The dynamic maps don't seem to work during ebook creation. TO DO:
    # Investigate.
    map_reg_exp = '^.*map.*$'

    # Remove social bookmarking variation, called 'socialBookMarks'.
    social_bookmarks_reg_exp = '^.*social[_ -]*bookmarks.*$'

    # Remove page navigation tools, like 'search', 'email', 'print', called
    # 'blq-mast'.
    blq_mast_reg_exp = '^.*blq[_ -]*mast.*$'

    # Remove 'sharesb'; I think this is a generic 'sharing' class. It seems
    # to appear alongside 'socialBookMarks' whenever that appears. I am
    # removing it as well under the assumption that it can appear alone too.
    sharesb_reg_exp = '^.*sharesb.*$'

    # Remove class 'o'. The worst named user created CSS class of all time.
    # The creator should immediately be fired. I've seen it used to hold
    # nothing at all but with 20 or so empty lines in it. Also to hold a
    # single link to another article. Whatever it was designed to do, it is
    # not wanted by this recipe. Exact match only.
    o_reg_exp = '^o$'

    # Remove 'promotopbg' and 'promobottombg', link lists. Have decided to
    # use two regular expressions to make removing this (and variants)
    # robust.
    promo_top_reg_exp = '^.*promotopbg.*$'
    promo_bottom_reg_exp = '^.*promobottombg.*$'

    # Remove 'nlp', which provides the heading for link lists. Requires an
    # exact match due to the risk of matching those letters in something
    # needed, unless I see a variation of 'nlp' used at a later date.
    nlp_reg_exp = '^nlp$'

    # Remove 'mva', which provides embedded floating content of various
    # types. Variant 'mvb' has also now been seen. Requires an exact match
    # of 'mva' or 'mvb' due to the risk of matching those letters in
    # something needed.
    mva_or_mvb_reg_exp = '^mv[ab]$'

    # Remove 'mvtb', which seems to be page navigation tools, like
    # 'blq-mast'.
    mvtb_reg_exp = '^mvtb$'

    # Remove 'blq-toplink', a class to provide a link to the top of the
    # page.
    blq_toplink_reg_exp = '^.*blq[_ -]*top[_ -]*link.*$'

    # Remove 'products and services' links, Eg. desktop tools, alerts, and
    # so on. Eg. class="servicev4 ukfs_services" - what a mess of a name.
    # Have decided to use two regular expressions to make removing this (and
    # variants) robust.
    prods_services_01_reg_exp = '^.*servicev4.*$'
    prods_services_02_reg_exp = '^.*ukfs[_ -]*services.*$'

    # Remove - what I think is - some kind of navigation tools helper class,
    # though I am not sure; it's called 'blq-rst blq-new-nav'. What I do
    # know is it pops up frequently and it is not wanted. Have decided to
    # use two regular expressions to make removing this (and variants)
    # robust.
    blq_misc_01_reg_exp = '^.*blq[_ -]*rst.*$'
    blq_misc_02_reg_exp = '^.*blq[_ -]*new[_ -]*nav.*$'

    # Remove 'puffbox' - this may only appear inside 'storyextra', so it may
    # not need removing - I have no clue what it does other than it contains
    # links. Whatever it is, it is not part of the article and is not wanted.
    puffbox_reg_exp = '^.*puffbox.*$'

    # Remove 'sibtbg' and 'sibtbgf' - some kind of table formatting classes.
    sibtbg_reg_exp = '^.*sibtbg.*$'

    # Remove 'storyextra' - links to relevant articles and external sites.
    storyextra_reg_exp = '^.*story[_ -]*extra.*$'

    remove_tags = [
        dict(name='div', attrs={'class': re.compile(story_feature_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(share_help_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(embedded_hyper_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(hypertabs_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(video_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(audio_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(picture_gallery_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(slideshow_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(quote_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(hidden_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(comment_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(story_actions_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(bookmark_list_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile(secondary_content_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile(featured_content_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile(navigation_reg_exp, re.IGNORECASE)}),
        dict(name='form', attrs={'id': re.compile(form_reg_exp, re.IGNORECASE)}),
        dict(attrs={'class': re.compile(quote_reg_exp, re.IGNORECASE)}),
        dict(attrs={'class': re.compile(hidden_reg_exp, re.IGNORECASE)}),
        dict(attrs={'class': re.compile(social_links_reg_exp, re.IGNORECASE)}),
        dict(attrs={'class': re.compile(comment_reg_exp, re.IGNORECASE)}),
        dict(attrs={'class': re.compile(skip_reg_exp, re.IGNORECASE)}),
        dict(name='map', attrs={'id': re.compile(map_reg_exp, re.IGNORECASE)}),
        dict(name='map', attrs={'name': re.compile(map_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile(social_bookmarks_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'id': re.compile(blq_mast_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(sharesb_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(o_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(promo_top_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(promo_bottom_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(nlp_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(mva_or_mvb_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(mvtb_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(blq_toplink_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(prods_services_01_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(prods_services_02_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(blq_misc_01_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(blq_misc_02_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': re.compile(puffbox_reg_exp, re.IGNORECASE)}),
        dict(attrs={'class': re.compile(sibtbg_reg_exp, re.IGNORECASE)}),
        dict(attrs={'class': re.compile(storyextra_reg_exp, re.IGNORECASE)}),
        dict(name='div', attrs={'class': 'tools-container'}),
        dict(name='div', attrs={'class': 'tools-container-end'}),
        dict(name='div', attrs={'class': 'g-block story-body contextual-links'}),
        dict(name='div', attrs={'class': ' g-w11 sidebar'})
    ]

    # Uses url to create and return the 'printer friendly' version of the
    # url: in other words, the 'print this page' address of the page.
    #
    # There are 3 types of urls used in the BBC site's rss feeds. There is
    # just 1 type for the standard news, while there are 2 used for sports
    # feed urls. Note: Sports urls are linked from regular news feeds (Eg.
    # 'News Home') when there is a major story of interest to 'everyone'. So
    # even if no BBC sports feeds are added to 'feeds', the logic of this
    # method is still needed to avoid blank / missing / empty articles which
    # have an index title and then no body.
    def print_version(self, url):

        # Handle sports page urls type 01:
        if (url.find("go/rss/-/sport1/") != -1):
            temp_url = url.replace("go/rss/-/", "")

        # Handle sports page urls type 02:
        elif (url.find("go/rss/int/news/-/sport1/") != -1):
            temp_url = url.replace("go/rss/int/news/-/", "")

        # Handle regular news page urls:
        else:
            temp_url = url.replace("go/rss/int/news/-/", "")

        # Always add "?print=true" to the end of the url.
        print_url = temp_url + "?print=true"

        return print_url
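
    # For illustration, a sketch of the rewrite print_version performs. The
    # article paths below are invented examples of the URL shapes, not real
    # BBC articles:
    #
    #   .../go/rss/-/sport1/hi/football/0000000.stm
    #       -> .../sport1/hi/football/0000000.stm?print=true
    #   .../go/rss/int/news/-/news/world-0000000
    #       -> .../news/world-0000000?print=true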

    # Remove articles in feeds based on a string in the article title or url.
    #
    # Code logic written by: Starson17 - posted in the "Recipes - Re-usable
    # code" thread, in the post titled "Remove articles from feed", see url:
    # http://www.mobileread.com/forums/showpost.php?p=1165462&postcount=6
    # Many thanks and all credit to Starson17.
    #
    # Starson17's code has obviously been altered to suit my requirements.
    def parse_feeds(self):

        # Call the parent's method.
        feeds = BasicNewsRecipe.parse_feeds(self)

        # Loop through all feeds.
        for feed in feeds:

            # Loop through all articles in the feed. Iterate over a copy
            # (feed.articles[:]) so that removals do not skip items.
            for article in feed.articles[:]:

                # Match key words and remove the article if there's a match.

                # Most BBC rss feed video-only 'articles' use upper case
                # 'VIDEO' as a title prefix. Just match upper case 'VIDEO',
                # so that articles like 'Video game banned' won't be matched
                # and removed.
                if 'VIDEO' in article.title:
                    feed.articles.remove(article)

                # Most BBC rss feed audio-only 'articles' use upper case
                # 'AUDIO' as a title prefix. Just match upper case 'AUDIO',
                # so that articles like 'Hi-Def audio...' won't be matched
                # and removed.
                elif 'AUDIO' in article.title:
                    feed.articles.remove(article)

                # Most BBC rss feed photo slideshow 'articles' use 'In
                # Pictures', 'In pictures', or 'in pictures' somewhere in
                # their title. Match any case of that phrase.
                elif 'IN PICTURES' in article.title.upper():
                    feed.articles.remove(article)

                # As above, but user contributed pictures. Match any case.
                elif 'YOUR PICTURES' in article.title.upper():
                    feed.articles.remove(article)

                # 'Sportsday Live' articles contain a constantly and
                # dynamically updated 'running commentary' during a live
                # sporting event. Match any case.
                elif 'SPORTSDAY LIVE' in article.title.upper():
                    feed.articles.remove(article)

                # Sometimes 'Sportsday Live' (above) becomes 'Live - Sport
                # Name'. These are matched below using 'Live - ', because
                # removing all articles with 'live' in their titles would
                # remove some articles that are in fact not live sports
                # pages. Match any case.
                elif 'LIVE - ' in article.title.upper():
                    feed.articles.remove(article)

                # 'Quiz of the week' is a Flash player weekly news quiz.
                # Match only the 'Quiz of the' part in anticipation of
                # monthly and yearly variants. Match any case.
                elif 'QUIZ OF THE' in article.title.upper():
                    feed.articles.remove(article)

                # Remove articles with 'scorecards' in the url. These are
                # BBC sports pages which just display a cricket scorecard.
                # The pages have a mass of table and css entries to display
                # the scorecards nicely. They could probably be made to work
                # with this recipe, but it might take a whole day of work to
                # sort out all the css - basically a formatting nightmare.
                elif 'scorecards' in article.url:
                    feed.articles.remove(article)

        return feeds

# End of class and file.
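
# A hedged tip, not part of the recipe itself: edits to this file can be
# test-built with calibre's standard command line tooling, which limits the
# run to a couple of feeds and articles. The file names below are just
# examples:
#
#   ebook-convert bbc_brasil.recipe bbc_brasil.epub --test -vv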