1from collections import OrderedDict
2import re
3from calibre.web.feeds.news import BasicNewsRecipe
4
5
class HistoryToday(BasicNewsRecipe):
    '''Recipe for the History Today magazine (historytoday.com).

    Logs in with a subscriber account, scrapes the current issue's table of
    contents and downloads the printer-friendly version of each article.
    '''

    title = 'History Today'
    __author__ = 'Rick Shang'

    description = 'UK-based magazine, publishing articles and book reviews covering all types and periods of history.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'

    # Strip print-page chrome (logo, site name, breadcrumb) and the
    # share/tag tool boxes from the downloaded article pages.
    remove_tags = [dict(name='div', attrs={'class': ['print-logo', 'print-site_name', 'print-breadcrumb']}),
                   dict(name='div', attrs={'id': ['ht-tools', 'ht-tools2', 'ht-tags']})]
    no_javascript = True
    no_stylesheets = True

    needs_subscription = True

    def get_browser(self):
        '''Return a browser logged in to historytoday.com.

        If the site reports that the account's concurrent-session limit is
        exceeded, it presents a form listing active sessions; we select one
        (the 'sid' control's second item) and submit to terminate it.
        '''
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://www.historytoday.com/user/login')
            br.select_form(nr=1)
            br['name'] = self.username
            br['pass'] = self.password
            res = br.submit()
            raw = res.read()
            # read() yields bytes; decode so the substring test below works.
            if isinstance(raw, bytes):
                raw = raw.decode('utf-8', 'replace')
            if 'Session limit exceeded' in raw:
                # BUGFIX: this whole recovery sequence must run only inside
                # this branch (the original dedented it so it ran on every
                # login), and the original `sid.join(control)` called a
                # non-existent list.join method. The mechanize way to pick a
                # select-control option is to mark its item as selected.
                br.select_form(nr=1)
                br.find_control('sid').items[1].selected = True
                br.submit()
        return br

    def parse_index(self):
        '''Build the feed list for the current issue.

        Returns a list of (section_title, [article dicts]) tuples, where each
        article dict points at the printer-friendly page of the article.
        '''
        # Find the issue date shown on the front page and use it in the title.
        soup0 = self.index_to_soup('http://www.historytoday.com/')
        dates = self.tag_to_string(soup0.find(
            'div', attrs={'id': 'block-block-226'}).span)
        self.timefmt = u' [%s]' % dates

        # Go to the issue contents page and grab the cover image.
        soup = self.index_to_soup('http://www.historytoday.com/contents')
        cover = soup.find('div', attrs={
                          'id': 'content-area'}).find('img', attrs={'src': re.compile('.*cover.*')})['src']
        self.cover_url = cover
        self.log(self.cover_url)

        # Each magazine section is a "block-views-contents*" div in the
        # bottom content region; articles are "views-row*" divs within it.
        div = soup.find('div', attrs={'class': 'region region-content-bottom'})

        feeds = OrderedDict()
        for section in div.findAll('div', attrs={'id': re.compile(r"block\-views\-contents.*")}):
            section_title = self.tag_to_string(
                section.find('h2', attrs={'class': 'title'}))
            sectionbody = section.find('div', attrs={'class': 'view-content'})
            for article in sectionbody.findAll('div', attrs={'class': re.compile(r"views\-row.*")}):
                subarticle = article.findAll('div')
                # Need at least title div and description div.
                if len(subarticle) < 2:
                    continue
                title = self.tag_to_string(subarticle[0])
                originalurl = "http://www.historytoday.com" + \
                    subarticle[0].span.a['href'].strip()
                # Follow the article page to find its printer-friendly URL.
                originalpage = self.index_to_soup(originalurl)
                printurl = originalpage.find(
                    'div', attrs={'id': 'ht-tools'}).a['href'].strip()
                url = "http://www.historytoday.com" + printurl
                desc = self.tag_to_string(subarticle[1])
                feeds.setdefault(section_title, []).append(
                    {'title': title, 'url': url, 'description': desc, 'date': ''})

        return list(feeds.items())

    def cleanup(self):
        '''Log out so the subscriber session slot is released.'''
        self.browser.open('http://www.historytoday.com/logout')