1##
2# Title:        BBC News, Sport, and Blog Calibre Recipe
3# Contact:      mattst - jmstanfield@gmail.com
4##
5# License:      GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
6# Copyright:    mattst - jmstanfield@gmail.com
7##
8# Written:      November 2011
9# Last Edited:  2011-11-19
10##
11
12__license__ = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
13__copyright__ = 'mattst - jmstanfield@gmail.com'
14
15
16'''
17BBC News, Sport, and Blog Calibre Recipe
18'''
19
20# Import the regular expressions module.
21import re
22
23# Import the BasicNewsRecipe class which this class extends.
24from calibre.web.feeds.recipes import BasicNewsRecipe
25
26
27class BBCBrasilRecipe(BasicNewsRecipe):
28
29    #
30    #    **** IMPORTANT USERS READ ME ****
31    #
32    #  First select the feeds you want then scroll down below the feeds list
33    #  and select the values you want for the other user preferences, like
34    #  oldest_article and such like.
35    #
36    #
37    #  Select the BBC rss feeds which you want in your ebook.
    #  Selected feeds have NO '#' at their start, de-selected feeds begin with a '#'.
39    #
40    #  Eg.  ("News Home", "http://feeds.bbci.co.uk/... - include feed.
41    #  Eg. #("News Home", "http://feeds.bbci.co.uk/... - do not include feed.
42    #
    # There are 11 feeds below, all from the BBC Brasil (Portuguese language)
    # site. The figure of 68 feeds (including 5 blogs by editors and
    # correspondents, 16 sports feeds, 15 'sub' regional feeds, Eg. North West
    # Wales, Scotland Business, and 7 Welsh language feeds) describes the BBC
    # News, Sport, and Blog recipe named in this file's header, not the list below.
47    #
48    # Some of the feeds are low volume (Eg. blogs), or very low volume (Eg. Click)
49    # so if "oldest_article = 1.5" (only articles published in the last 36 hours)
50    # you may get some 'empty feeds' which will not then be included in the ebook.
51    #
    # The 11 feeds currently selected below are simply the default ones.
53    #
54    # Note: With all 68 feeds selected, oldest_article set to 2,
55    # max_articles_per_feed set to 100, and simultaneous_downloads set to 10,
56    # the ebook creation took 29 minutes on my speedy 100 mbps net connection,
57    # fairly high-end desktop PC running Linux (Ubuntu Lucid-Lynx).
58    # More realistically with 15 feeds selected, oldest_article set to 1.5,
59    # max_articles_per_feed set to 100, and simultaneous_downloads set to 20,
60    # it took 6 minutes. If that's too slow increase 'simultaneous_downloads'.
61    #
62    # Select / de-select the feeds you want in your ebook.
63    #
64    feeds = [
65        (u'Primeira P\xe1gina', u'http://www.bbc.co.uk/portuguese/index.xml'),
66        (u'\xdaltimas Not\xedcias',
67         u'http://www.bbc.co.uk/portuguese/ultimas_noticias/index.xml'),
68        (u'Internacional',
69         u'http://www.bbc.co.uk/portuguese/topicos/internacional/index.xml'),
70        (u'Brasil', u'http://www.bbc.co.uk/portuguese/topicos/brasil/index.xml'),
71        (u'Am\xe9rica Latina',
72         u'http://www.bbc.co.uk/portuguese/topicos/america_latina/index.xml'),
73        (u'Economia', u'http://www.bbc.co.uk/portuguese/topicos/economia/index.xml'),
74        (u'Sa\xfade', u'http://www.bbc.co.uk/portuguese/topicos/saude/index.xml'),
75        (u'Ci\xeancia e Tecnologia',
76         u'http://www.bbc.co.uk/portuguese/topicos/ciencia_e_tecnologia/index.xml'),
77        (u'Cultura', u'http://www.bbc.co.uk/portuguese/topicos/cultura/index.xml'),
78        (u'V\xeddeos e Fotos',
79         u'http://www.bbc.co.uk/portuguese/videos_e_fotos/index.xml'),
80        (u'Especiais', u'http://www.bbc.co.uk/portuguese/especiais/index.xml')
81    ]
82
83    #    **** SELECT YOUR USER PREFERENCES ****
84
85    # Title to use for the ebook.
86    #
87    title = 'BBC Brasil'
88
89    # A brief description for the ebook.
90    #
91    description = u'Not\xedcias do Brasil e do mundo pela British Broadcasting Corporation'
92
93    # The max number of articles which may be downloaded from each feed.
94    # I've never seen more than about 70 articles in a single feed in the
95    # BBC feeds.
96    #
97    max_articles_per_feed = 100
98
99    # The max age of articles which may be downloaded from each feed. This is
100    # specified in days - note fractions of days are allowed, Eg. 2.5 (2 and a
101    # half days). My default of 1.5 days is the last 36 hours, the point at
102    # which I've decided 'news' becomes 'old news', but be warned this is not
103    # so good for the blogs, technology, magazine, etc., and sports feeds.
104    # You may wish to extend this to 2-5 but watch out ebook creation time will
105    # increase as well. Setting this to 30 will get everything (AFAICT) as long
106    # as max_articles_per_feed remains set high (except for 'Click' which is
107    # v. low volume and its currently oldest article is 4th Feb 2011).
108    #
109    oldest_article = 1.5
110
    # Number of simultaneous downloads. 20 is consistently working fine on the
    # BBC News feeds with no problems. Speeds things up from the default of 5.
113    # If you have a lot of feeds and/or have increased oldest_article above 2
114    # then you may wish to try increasing simultaneous_downloads to 25-30,
115    # Or, of course, if you are in a hurry. [I've not tried beyond 20.]
116    #
117    simultaneous_downloads = 20
118
119    # Timeout for fetching files from the server in seconds. The default of
120    # 120 seconds, seems somewhat excessive.
121    #
122    timeout = 30
123
124    # The format string for the date shown on the ebook's first page.
125    # List of all values: http://docs.python.org/library/time.html
126    # Default in news.py has a leading space so that's mirrored here.
127    # As with 'feeds' select/de-select by adding/removing the initial '#',
128    # only one timefmt should be selected, here's a few to choose from.
129    #
130    # [Fri, 14 Nov 2011] (Calibre default)
131    timefmt = ' [%a, %d %b %Y]'
132    # timefmt = ' [%a, %d %b %Y %H:%M]'       # [Fri, 14 Nov 2011 18:30]
133    # timefmt = ' [%a, %d %b %Y %I:%M %p]'    # [Fri, 14 Nov 2011 06:30 PM]
134    # timefmt = ' [%d %b %Y]'                 # [14 Nov 2011]
    # timefmt = ' [%d %b %Y %H:%M]'           # [14 Nov 2011 18:30]
136    # timefmt = ' [%Y-%m-%d]'                 # [2011-11-14]
137    # timefmt = ' [%Y-%m-%d-%H-%M]'           # [2011-11-14-18-30]
138
139    #
140    #    **** IMPORTANT ****
141    #
142    #    DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
143    #
144    #    DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
145    #
    #    I MEAN IT, YES I DO, ABSOLUTELY, AT YOUR OWN RISK. :)
147    #
148    #    **** IMPORTANT ****
149    #
150
151    # Author of this recipe.
152    __author__ = 'Carlos Laviola'
153
154    language = 'pt_BR'
155
156    # Set tags.
157    tags = 'news, sport, blog'
158
159    # Set publisher and publication type.
160    publisher = 'BBC'
161    publication_type = 'newspaper'
162
163    # Disable stylesheets from site.
164    no_stylesheets = True
165
166    # Specifies an override encoding for sites that have an incorrect charset
167    # specified. Default of 'None' says to auto-detect. Some other BBC recipes
168    # use 'utf8', which works fine (so use that if necessary) but auto-detecting
169    # with None is working fine, so stick with that for robustness.
170    encoding = None
171
172    # Sets whether a feed has full articles embedded in it. The BBC feeds do
173    # not.
174    use_embedded_content = False
175
176    # Removes empty feeds - why keep them!?
177    remove_empty_feeds = True
178
179    # Create a custom title which fits nicely in the Kindle title list.
180    # Requires "import time" above class declaration, and replacing
181    # title with custom_title in conversion_options (right column only).
182    # Example of string below: "BBC News - 14 Nov 2011"
183    #
184    # custom_title = "BBC News - " + time.strftime('%d %b %Y')
185
186    '''
187    # Conversion options for advanced users, but don't forget to comment out the
188    # current conversion_options below. Avoid setting 'linearize_tables' as that
189    # plays havoc with the 'old style' table based pages.
190    #
191    conversion_options = { 'title'       : title,
192                           'comments'    : description,
193                           'tags'        : tags,
194                           'language'    : language,
195                           'publisher'   : publisher,
196                           'authors'     : publisher,
197                           'smarten_punctuation' : True
198                         }
199    '''
200
201    conversion_options = {'smarten_punctuation': True}
202
203    # Specify extra CSS - overrides ALL other CSS (IE. Added last).
204    extra_css = 'body { font-family: verdana, helvetica, sans-serif; } \
205                 .introduction, .first { font-weight: bold; } \
206                 .cross-head { font-weight: bold; font-size: 125%; } \
207                 .cap, .caption { display: block; font-size: 80%; font-style: italic; } \
208                 .cap, .caption, .caption img, .caption span { display: block; text-align: center; margin: 5px auto; } \
209                 .byl, .byd, .byline img, .byline-name, .byline-title, .author-name, .author-position, \
210                    .correspondent-portrait img, .byline-lead-in, .name, .role, .bbc-role { display: block; \
211                    text-align: center; font-size: 80%; font-style: italic; margin: 1px auto; } \
212                 .story-date, .published, .datestamp { font-size: 80%; } \
213                 table { width: 100%; } \
214                 td img { display: block; margin: 5px auto; } \
215                 ul { padding-top: 10px; } \
216                 ol { padding-top: 10px; } \
217                 li { padding-top: 5px; padding-bottom: 5px; } \
218                 h1 { text-align: center; font-size: 175%; font-weight: bold; } \
219                 h2 { text-align: center; font-size: 150%; font-weight: bold; } \
220                 h3 { text-align: center; font-size: 125%; font-weight: bold; } \
221                 h4, h5, h6 { text-align: center; font-size: 100%; font-weight: bold; }'
222
223    # Remove various tag attributes to improve the look of the ebook pages.
224    remove_attributes = ['border', 'cellspacing', 'align', 'cellpadding', 'colspan',
225                         'valign', 'vspace', 'hspace', 'alt', 'width', 'height']
226
227    # Remove the (admittedly rarely used) line breaks, "<br />", which sometimes
228    # cause a section of the ebook to start in an unsightly fashion or, more
    # frequently, a "<br />" will muck up the formatting of a correspondent's byline.
230    # "<br />" and "<br clear/>" are far more frequently used on the table formatted
231    # style of pages, and really spoil the look of the ebook pages.
232    preprocess_regexps = [(re.compile(r'<br[ ]*/>', re.IGNORECASE), lambda m: ''),
233                          (re.compile(r'<br[ ]*clear.*/>', re.IGNORECASE), lambda m: '')]
234
235    # Create regular expressions for tag keeping and removal to make the matches more
236    # robust against minor changes and errors in the HTML, Eg. double spaces, leading
237    # and trailing spaces, missing hyphens, and such like.
238    # Python regular expression ('re' class) page:
239    # http://docs.python.org/library/re.html
240
241    # ***************************************
242    # Regular expressions for keep_only_tags:
243    # ***************************************
244
245    # The BBC News HTML pages use variants of 'storybody' to denote the section of a HTML
246    # page which contains the main text of the article. Match storybody variants: 'storybody',
247    # 'story-body', 'story body','storybody ', etc.
248    storybody_reg_exp = '^.*story[_ -]*body.*$'
249
250    # The BBC sport and 'newsbeat' (features) HTML pages use 'blq_content' to hold the title
251    # and published date. This is one level above the usual news pages which have the title
252    # and date within 'story-body'. This is annoying since 'blq_content' must also be kept,
253    # resulting in a lot of extra things to be removed by remove_tags.
254    blq_content_reg_exp = '^.*blq[_ -]*content.*$'
255
256    # The BBC has an alternative page design structure, which I suspect is an out-of-date
257    # design but which is still used in some articles, Eg. 'Click' (technology), 'FastTrack'
258    # (travel), and in some sport pages. These alternative pages are table based (which is
    # why I think they are an out-of-date design) and account for -I'm guesstimating- less
260    # than 1% of all articles. They use a table class 'storycontent' to hold the article
261    # and like blq_content (above) have required lots of extra removal by
262    # remove_tags.
263    story_content_reg_exp = '^.*story[_ -]*content.*$'
264
265    # Keep the sections of the HTML which match the list below. The HTML page created by
266    # Calibre will fill <body> with those sections which are matched. Note that the
267    # blq_content_reg_exp must be listed before storybody_reg_exp in keep_only_tags due to
268    # it being the parent of storybody_reg_exp, that is to say the div class/id 'story-body'
269    # will be inside div class/id 'blq_content' in the HTML (if 'blq_content' is there at
270    # all). If they are the other way around in keep_only_tags then blq_content_reg_exp
271    # will end up being discarded.
    # Each entry is a dict giving a tag name plus one attribute ('class' or 'id')
    # whose value must match the compiled, case-insensitive pattern.
    # Order of entries: table 'storycontent', then div 'blq_content' (by class
    # and by id), then div 'storybody' (by class and by id).
    keep_only_tags = [dict(name='table', attrs={'class': re.compile(story_content_reg_exp, re.IGNORECASE)}),
                      # 'blq_content' matched on the class attribute.
                      dict(name='div',   attrs={'class': re.compile(
                           blq_content_reg_exp, re.IGNORECASE)}),
                      # 'blq_content' matched on the id attribute.
                      dict(name='div',   attrs={'id': re.compile(
                           blq_content_reg_exp, re.IGNORECASE)}),
                      # 'storybody' variants matched on the class attribute.
                      dict(name='div',   attrs={'class': re.compile(
                           storybody_reg_exp, re.IGNORECASE)}),
                      # 'storybody' variants matched on the id attribute.
                      dict(name='div',   attrs={'id': re.compile(storybody_reg_exp, re.IGNORECASE)})]
280
281    # ************************************
282    # Regular expressions for remove_tags:
283    # ************************************
284
285    # Regular expression to remove share-help and variant tags. The share-help class
286    # is used by the site for a variety of 'sharing' type links, Eg. Facebook, delicious,
287    # twitter, email. Removed to avoid page clutter.
288    share_help_reg_exp = '^.*share[_ -]*help.*$'
289
290    # Regular expression to remove embedded-hyper and variant tags. This class is used to
291    # display links to other BBC News articles on the same/similar subject.
292    embedded_hyper_reg_exp = '^.*embed*ed[_ -]*hyper.*$'
293
294    # Regular expression to remove hypertabs and variant tags. This class is used to
295    # display a tab bar at the top of an article which allows the user to switch to
296    # an article (viewed on the same page) providing further info., 'in depth' analysis,
    # an editorial, a correspondent's blog entry, and such like. The ability to handle
298    # a tab bar of this nature is currently beyond the scope of this recipe and
299    # possibly of Calibre itself (not sure about that - TO DO - check!).
300    hypertabs_reg_exp = '^.*hyper[_ -]*tabs.*$'
301
302    # Regular expression to remove story-feature and variant tags. Eg. 'story-feature',
303    # 'story-feature related narrow', 'story-feature wide', 'story-feature narrow'.
304    # This class is used to add additional info. boxes, or small lists, outside of
305    # the main story. TO DO: Work out a way to incorporate these neatly.
306    story_feature_reg_exp = '^.*story[_ -]*feature.*$'
307
308    # Regular expression to remove video and variant tags, Eg. 'videoInStoryB',
309    # 'videoInStoryC'. This class is used to embed video.
310    video_reg_exp = '^.*video.*$'
311
312    # Regular expression to remove audio and variant tags, Eg. 'audioInStoryD'.
313    # This class is used to embed audio.
314    audio_reg_exp = '^.*audio.*$'
315
316    # Regular expression to remove pictureGallery and variant tags, Eg. 'pictureGallery'.
317    # This class is used to embed a photo slideshow. See also 'slideshow'
318    # below.
319    picture_gallery_reg_exp = '^.*picture.*$'
320
321    # Regular expression to remove slideshow and variant tags, Eg. 'dslideshow-enclosure'.
322    # This class is used to embed a slideshow (not necessarily photo) but both
323    # 'slideshow' and 'pictureGallery' are used for slideshows.
324    slideshow_reg_exp = '^.*slide[_ -]*show.*$'
325
326    # Regular expression to remove social-links and variant tags. This class is used to
327    # display links to a BBC bloggers main page, used in various columnist's blogs
328    # (Eg. Nick Robinson, Robert Preston).
329    social_links_reg_exp = '^.*social[_ -]*links.*$'
330
331    # Regular expression to remove quote and (multi) variant tags, Eg. 'quote',
332    # 'endquote', 'quote-credit', 'quote-credit-title', etc. These are usually
333    # removed by 'story-feature' removal (as they are usually within them), but
334    # not always. The quotation removed is always (AFAICT) in the article text
335    # as well but a 2nd copy is placed in a quote tag to draw attention to it.
336    # The quote class tags may or may not appear in div's.
337    quote_reg_exp = '^.*quote.*$'
338
339    # Regular expression to remove hidden and variant tags, Eg. 'hidden'.
340    # The purpose of these is unclear, they seem to be an internal link to a
341    # section within the article, but the text of the link (Eg. 'Continue reading
342    # the main story') never seems to be displayed anyway. Removed to avoid clutter.
343    # The hidden class tags may or may not appear in div's.
344    hidden_reg_exp = '^.*hidden.*$'
345
346    # Regular expression to remove comment and variant tags, Eg. 'comment-introduction'.
347    # Used on the site to display text about registered users entering
348    # comments.
349    comment_reg_exp = '^.*comment.*$'
350
351    # Regular expression to remove form and variant tags, Eg. 'comment-form'.
352    # Used on the site to allow registered BBC users to fill in forms, typically
353    # for entering comments about an article.
354    form_reg_exp = '^.*form.*$'
355
356    # Extra things to remove due to the addition of 'blq_content' in
357    # keep_only_tags.
358
359    # <div class="story-actions"> Used on sports pages for 'email' and 'print'.
360    story_actions_reg_exp = '^.*story[_ -]*actions.*$'
361
362    # <div class="bookmark-list"> Used on sports pages instead of 'share-help' (for
363    # social networking links).
364    bookmark_list_reg_exp = '^.*bookmark[_ -]*list.*$'
365
366    # <div id="secondary-content" class="content-group">
367    # NOTE: Don't remove class="content-group" that is needed.
368    # Used on sports pages to link to 'similar stories'.
369    secondary_content_reg_exp = '^.*secondary[_ -]*content.*$'
370
371    # <div id="featured-content" class="content-group">
372    # NOTE: Don't remove class="content-group" that is needed.
373    # Used on sports pages to link to pages like 'tables', 'fixtures', etc.
374    featured_content_reg_exp = '^.*featured[_ -]*content.*$'
375
376    # <div id="navigation">
377    # Used on sports pages to link to pages like 'tables', 'fixtures', etc.
378    # Used sometimes instead of "featured-content" above.
379    navigation_reg_exp = '^.*navigation.*$'
380
381    # <a class="skip" href="#blq-container-inner">Skip to top</a>
382    # Used on sports pages to link to the top of the page.
383    skip_reg_exp = '^.*skip.*$'
384
385    # Extra things to remove due to the addition of 'storycontent' in keep_only_tags,
    # which are the alternative table design based pages. The purpose of some of these
387    # is not entirely clear from the pages (which are a total mess!).
388
389    # Remove mapping based tags, Eg. <map id="world_map">
390    # The dynamic maps don't seem to work during ebook creation. TO DO:
391    # Investigate.
392    map_reg_exp = '^.*map.*$'
393
394    # Remove social bookmarking variation, called 'socialBookMarks'.
395    social_bookmarks_reg_exp = '^.*social[_ -]*bookmarks.*$'
396
397    # Remove page navigation tools, like 'search', 'email', 'print', called
398    # 'blq-mast'.
399    blq_mast_reg_exp = '^.*blq[_ -]*mast.*$'
400
401    # Remove 'sharesb', I think this is a generic 'sharing' class. It seems to appear
402    # alongside 'socialBookMarks' whenever that appears. I am removing it as well
403    # under the assumption that it can appear alone as well.
404    sharesb_reg_exp = '^.*sharesb.*$'
405
406    # Remove class 'o'. The worst named user created css class of all time. The creator
407    # should immediately be fired. I've seen it used to hold nothing at all but with
408    # 20 or so empty lines in it. Also to hold a single link to another article.
409    # Whatever it was designed to do it is not wanted by this recipe. Exact
410    # match only.
411    o_reg_exp = '^o$'
412
413    # Remove 'promotopbg' and 'promobottombg', link lists. Have decided to
414    # use two reg expressions to make removing this (and variants) robust.
415    promo_top_reg_exp = '^.*promotopbg.*$'
416    promo_bottom_reg_exp = '^.*promobottombg.*$'
417
418    # Remove 'nlp', provides heading for link lists. Requires an exact match due to
419    # risk of matching those letters in something needed, unless I see a variation
420    # of 'nlp' used at a later date.
421    nlp_reg_exp = '^nlp$'
422
423    # Remove 'mva', provides embedded floating content of various types. Variant 'mvb'
424    # has also now been seen. Requires an exact match of 'mva' or 'mvb' due to risk of
425    # matching those letters in something needed.
426    mva_or_mvb_reg_exp = '^mv[ab]$'
427
428    # Remove 'mvtb', seems to be page navigation tools, like 'blq-mast'.
429    mvtb_reg_exp = '^mvtb$'
430
431    # Remove 'blq-toplink', class to provide a link to the top of the page.
432    blq_toplink_reg_exp = '^.*blq[_ -]*top[_ -]*link.*$'
433
434    # Remove 'products and services' links, Eg. desktop tools, alerts, and so on.
435    # Eg. Class="servicev4 ukfs_services" - what a mess of a name. Have decided to
436    # use two reg expressions to make removing this (and variants) robust.
437    prods_services_01_reg_exp = '^.*servicev4.*$'
438    prods_services_02_reg_exp = '^.*ukfs[_ -]*services.*$'
439
440    # Remove -what I think is- some kind of navigation tools helper class, though I am
441    # not sure, it's called: 'blq-rst blq-new-nav'. What I do know is it pops up
442    # frequently and it is not wanted. Have decided to use two reg expressions to make
443    # removing this (and variants) robust.
444    blq_misc_01_reg_exp = '^.*blq[_ -]*rst.*$'
445    blq_misc_02_reg_exp = '^.*blq[_ -]*new[_ -]*nav.*$'
446
447    # Remove 'puffbox' - this may only appear inside 'storyextra', so it may not
448    # need removing - I have no clue what it does other than it contains links.
449    # Whatever it is - it is not part of the article and is not wanted.
450    puffbox_reg_exp = '^.*puffbox.*$'
451
452    # Remove 'sibtbg' and 'sibtbgf' - some kind of table formatting classes.
453    sibtbg_reg_exp = '^.*sibtbg.*$'
454
455    # Remove 'storyextra' - links to relevant articles and external sites.
456    storyextra_reg_exp = '^.*story[_ -]*extra.*$'
457
    # List of tag specifications to strip from each page. Every entry is a dict
    # with an optional tag 'name' and one attribute to match; patterns are
    # compiled case-insensitively, the last four entries match literal class
    # strings instead. Entries without a 'name' key do not restrict the tag name.
    # Note: 'quote', 'hidden' and 'comment' are listed twice, once restricted to
    # <div> and once without a tag name.
    #
    # --- <div> elements matched on their class attribute ---
    remove_tags = [dict(name='div',  attrs={'class': re.compile(story_feature_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        share_help_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        embedded_hyper_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        hypertabs_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        video_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        audio_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        picture_gallery_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        slideshow_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        quote_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        hidden_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        comment_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        story_actions_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        bookmark_list_reg_exp, re.IGNORECASE)}),
                   # --- <div> elements matched on their id attribute ---
                   dict(name='div',  attrs={'id': re.compile(
                        secondary_content_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'id': re.compile(
                        featured_content_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'id': re.compile(
                        navigation_reg_exp, re.IGNORECASE)}),
                   # --- <form> elements matched on their id attribute ---
                   dict(name='form', attrs={'id': re.compile(
                        form_reg_exp, re.IGNORECASE)}),
                   # --- No tag name given: matched on class attribute only ---
                   dict(attrs={'class': re.compile(
                        quote_reg_exp, re.IGNORECASE)}),
                   dict(attrs={'class': re.compile(
                        hidden_reg_exp, re.IGNORECASE)}),
                   dict(attrs={'class': re.compile(
                        social_links_reg_exp, re.IGNORECASE)}),
                   dict(attrs={'class': re.compile(
                        comment_reg_exp, re.IGNORECASE)}),
                   dict(attrs={'class': re.compile(
                        skip_reg_exp, re.IGNORECASE)}),
                   # --- <map> elements matched on id, then on name ---
                   dict(name='map', attrs={'id': re.compile(
                        map_reg_exp, re.IGNORECASE)}),
                   dict(name='map', attrs={'name': re.compile(
                        map_reg_exp, re.IGNORECASE)}),
                   # --- Extra removals for the table based ('storycontent') pages ---
                   dict(name='div', attrs={'id': re.compile(
                        social_bookmarks_reg_exp, re.IGNORECASE)}),
                   dict(name='div', attrs={'id': re.compile(
                        blq_mast_reg_exp, re.IGNORECASE)}),
                   dict(name='div', attrs={'class': re.compile(
                        sharesb_reg_exp, re.IGNORECASE)}),
                   # Exact match pattern ('^o$').
                   dict(name='div', attrs={
                       'class': re.compile(o_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        promo_top_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        promo_bottom_reg_exp, re.IGNORECASE)}),
                   # Exact match pattern ('^nlp$').
                   dict(name='div',  attrs={
                       'class': re.compile(nlp_reg_exp, re.IGNORECASE)}),
                   # Exact match patterns ('^mv[ab]$' and '^mvtb$').
                   dict(name='div',  attrs={'class': re.compile(
                        mva_or_mvb_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        mvtb_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        blq_toplink_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        prods_services_01_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        prods_services_02_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        blq_misc_01_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        blq_misc_02_reg_exp, re.IGNORECASE)}),
                   dict(name='div',  attrs={'class': re.compile(
                        puffbox_reg_exp, re.IGNORECASE)}),
                   # No tag name given: matched on class attribute only.
                   dict(attrs={'class': re.compile(
                        sibtbg_reg_exp, re.IGNORECASE)}),
                   dict(attrs={'class': re.compile(
                        storyextra_reg_exp, re.IGNORECASE)}),
                   # --- <div> elements matched on a literal class string ---
                   # NOTE(review): the last entry's class string begins with a
                   # space; presumably it mirrors the site's markup - confirm.
                   dict(name='div',  attrs={'class': 'tools-container'}),
                   dict(name='div',  attrs={'class': 'tools-container-end'}),
                   dict(name='div',  attrs={
                       'class': 'g-block story-body contextual-links'}),
                   dict(name='div',  attrs={'class': ' g-w11 sidebar'})
                   ]
545
546    # Uses url to create and return the 'printer friendly' version of the url.
547    # In other words the 'print this page' address of the page.
548    #
549    # There are 3 types of urls used in the BBC site's rss feeds. There is just
550    # 1 type for the standard news while there are 2 used for sports feed urls.
551    # Note: Sports urls are linked from regular news feeds (Eg. 'News Home') when
552    # there is a major story of interest to 'everyone'. So even if no BBC sports
553    # feeds are added to 'feeds' the logic of this method is still needed to avoid
554    # blank / missing / empty articles which have an index title and then no
555    # body.
556    def print_version(self, url):
557
558        # Handle sports page urls type 01:
559        if (url.find("go/rss/-/sport1/") != -1):
560            temp_url = url.replace("go/rss/-/", "")
561
562        # Handle sports page urls type 02:
563        elif (url.find("go/rss/int/news/-/sport1/") != -1):
564            temp_url = url.replace("go/rss/int/news/-/", "")
565
566        # Handle regular news page urls:
567        else:
568            temp_url = url.replace("go/rss/int/news/-/", "")
569
570        # Always add "?print=true" to the end of the url.
571        print_url = temp_url + "?print=true"
572
573        return print_url
574
575    # Remove articles in feeds based on a string in the article title or url.
576    #
577    # Code logic written by: Starson17 - posted in: "Recipes - Re-usable code"
578    # thread, in post with title: "Remove articles from feed", see url:
579    # http://www.mobileread.com/forums/showpost.php?p=1165462&postcount=6
580    # Many thanks and all credit to Starson17.
581    #
    # Starson17's code has obviously been altered to suit my requirements.
583    def parse_feeds(self):
584
585        # Call parent's method.
586        feeds = BasicNewsRecipe.parse_feeds(self)
587
588        # Loop through all feeds.
589        for feed in feeds:
590
591            # Loop through all articles in feed.
592            for article in feed.articles[:]:
593
594                # Match key words and remove article if there's a match.
595
596                # Most BBC rss feed video only 'articles' use upper case 'VIDEO'
597                # as a title prefix. Just match upper case 'VIDEO', so that
598                # articles like 'Video game banned' won't be matched and
599                # removed.
600                if 'VIDEO' in article.title:
601                    feed.articles.remove(article)
602
603                # Most BBC rss feed audio only 'articles' use upper case 'AUDIO'
604                # as a title prefix. Just match upper case 'AUDIO', so that
605                # articles like 'Hi-Def audio...' won't be matched and removed.
606                elif 'AUDIO' in article.title:
607                    feed.articles.remove(article)
608
609                # Most BBC rss feed photo slideshow 'articles' use 'In Pictures',
610                # 'In pictures', and 'in pictures', somewhere in their title.
611                # Match any case of that phrase.
612                elif 'IN PICTURES' in article.title.upper():
613                    feed.articles.remove(article)
614
615                # As above, but user contributed pictures. Match any case.
616                elif 'YOUR PICTURES' in article.title.upper():
617                    feed.articles.remove(article)
618
619                # 'Sportsday Live' are articles which contain a constantly and
620                # dynamically updated 'running commentary' during a live sporting
621                # event. Match any case.
622                elif 'SPORTSDAY LIVE' in article.title.upper():
623                    feed.articles.remove(article)
624
625                # Sometimes 'Sportsday Live' (above) becomes 'Live - Sport Name'.
626                # These are being matched below using 'Live - ' because removing all
627                # articles with 'live' in their titles would remove some articles
628                # that are in fact not live sports pages. Match any case.
629                elif 'LIVE - ' in article.title.upper():
630                    feed.articles.remove(article)
631
632                # 'Quiz of the week' is a Flash player weekly news quiz. Match only
633                # the 'Quiz of the' part in anticipation of monthly and yearly
634                # variants. Match any case.
635                elif 'QUIZ OF THE' in article.title.upper():
636                    feed.articles.remove(article)
637
638                # Remove articles with 'scorecards' in the url. These are BBC sports
639                # pages which just display a cricket scorecard. The pages have a mass
640                # of table and css entries to display the scorecards nicely. Probably
641                # could make them work with this recipe, but might take a whole day
642                # of work to sort out all the css - basically a formatting
643                # nightmare.
644                elif 'scorecards' in article.url:
645                    feed.articles.remove(article)
646
647        return feeds
648
649# End of class and file.
650