1#!/usr/local/bin/python3.8
2
3__license__ = 'GPL v3'
4
5'''
6www.canada.com
7'''
8
9from calibre.web.feeds.recipes import BasicNewsRecipe
10
11
class CanWestPaper(BasicNewsRecipe):
    """Recipe for a canada.com newspaper site, built from its "today's paper" index.

    The index page at ``url_prefix + '/news/todays-paper/index.html'`` is
    scanned for section headings and article teasers; each article page is
    then reduced to its ``storyheader`` and ``storycontent`` divs.
    """

    # Active configuration: Windsor Star. To build one of the other papers,
    # comment these three lines out and un-comment one of the groups below.
    title = u'Windsor Star'
    url_prefix = 'http://www.windsorstar.com'
    description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    # title = u'Ottawa Citizen'
    # url_prefix = 'http://www.ottawacitizen.com'
    # description = u'News from Ottawa, ON'
    #
    # un-comment the following three lines for the Montreal Gazette
    # title = u'Montreal Gazette'
    # url_prefix = 'http://www.montrealgazette.com'
    # description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp {  font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large;  font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''

    # Only the headline block and the story body are kept from article pages.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]

    # Page furniture (comments, toolbars, share/print links, rules) to drop.
    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    def preprocess_html(self, soup):
        """Remove empty ``id`` attributes from ``div`` tags and return the soup.

        Empty ids break the generated table of contents (for unknown
        reasons), so they are deleted in place before further processing.
        """
        for div in soup.findAll('div', attrs={'id': ''}):
            del div['id']
        return soup

    def parse_index(self):
        """Build the feed list from the "today's paper" index page.

        Returns a list of ``(section_title, articles)`` tuples in the order
        the sections were first seen. Each article is a dict with the keys
        ``title``, ``url``, ``date``, ``description``, ``author`` and
        ``content``. Sections that ended up with no articles are omitted.
        Articles found before any section heading go under ``'News'``.
        """
        soup = self.index_to_soup(
            self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'      # section that the next article will be filed under
        ans = ['News']    # section titles in first-seen order

        # Walk section headings (class "section_title02") and article teasers
        # (class "featurecontent") in document order.
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            # ''.join copes with the class attribute being either a plain
            # string or a list of class names.
            if 'section_title' in ''.join(divtag['class']):
                # div contains section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                # A repeated heading must not produce a duplicate section.
                if key not in ans:
                    ans.append(key)
                self.log("Section name %s" % key)
                continue

            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            title = self.tag_to_string(atag, False)
            pubdate = ''

            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)

            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)

            articles.setdefault(key, []).append(
                dict(title=title, url=url, date=pubdate,
                     description=description, author=author, content=''))

        # Pair every section with its OWN article list. (Indexing with the
        # scan variable `key` here would give each section the articles of
        # whichever section happened to be seen last.)
        return [(section, articles[section])
                for section in ans if section in articles]
106