#!/usr/local/bin/python3.8

__license__ = 'GPL v3'

'''
www.canada.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):
    '''Calibre recipe for the CanWest "today's paper" sites.

    The same recipe body serves several CanWest papers; pick one by
    un-commenting the matching title/url_prefix/description triple below.
    '''

    # un-comment the following three lines for the Windsor Star
    title = u'Windsor Star'
    url_prefix = 'http://www.windsorstar.com'
    description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    # title = u'Ottawa Citizen'
    # url_prefix = 'http://www.ottawacitizen.com'
    # description = u'News from Ottawa, ON'
    #
    # un-comment the following three lines for the Montreal Gazette
    # title = u'Montreal Gazette'
    # url_prefix = 'http://www.montrealgazette.com'
    # description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
    .timestamp { font-size:xx-small; display: block; }
    #storyheader { font-size: medium; }
    #storyheader h1 { font-size: x-large; }
    #storyheader h2 { font-size: large; font-style: italic; }
    .byline { font-size:xx-small; }
    #photocaption { font-size: small; font-style: italic }
    #photocredit { font-size: xx-small; }'''
    keep_only_tags = [dict(name='div', attrs={'id': 'storyheader'}), dict(
        name='div', attrs={'id': 'storycontent'})]
    remove_tags = [{'class': 'comments'},
                   dict(name='div', attrs={'class': 'navbar'}), dict(
                       name='div', attrs={'class': 'morelinks'}),
                   dict(name='div', attrs={'class': 'viewmore'}), dict(
                       name='li', attrs={'class': 'email'}),
                   dict(name='div', attrs={'class': 'story_tool_hr'}), dict(
                       name='div', attrs={'class': 'clear'}),
                   dict(name='div', attrs={'class': 'story_tool'}), dict(
                       name='div', attrs={'class': 'copyright'}),
                   dict(name='div', attrs={'class': 'rule_grey_solid'}),
                   dict(name='li', attrs={'class': 'print'}), dict(name='li', attrs={'class': 'share'}), dict(name='ul', attrs={'class': 'bullet'})]

    def preprocess_html(self, soup):
        '''Strip empty id attributes from divs.

        Empty ids screw up the generated TOC for unknown reasons, so
        remove them before calibre processes the article HTML.
        '''
        divtags = soup.findAll('div', attrs={'id': ''})
        if divtags:
            for div in divtags:
                del div['id']
        return soup

    def parse_index(self):
        '''Build the section/article index from the "today's paper" page.

        Returns a list of (section_title, [article_dict, ...]) tuples as
        required by BasicNewsRecipe.parse_index.
        '''
        soup = self.index_to_soup(
            self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'      # current section; articles before the first header land here
        ans = ['News']    # section titles in page order

        # Find each instance of class="sectiontitle", class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            if 'section_title' in ''.join(divtag['class']):
                # div contains a section title; subsequent articles belong to it
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            title = self.tag_to_string(atag, False)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
            if key not in articles:
                articles[key] = []
            articles[key].append(dict(title=title, url=url, date=pubdate,
                                      description=description, author=author, content=''))

        # BUG FIX: the original looked up articles[key] (the *last* section
        # processed) for every entry, so all sections got the same article
        # list. Use the loop variable keyl so each section keeps its own.
        ans = [(keyl, articles[keyl]) for keyl in ans if keyl in articles]
        return ans