from collections import OrderedDict
import re
from calibre.web.feeds.news import BasicNewsRecipe


class HistoryToday(BasicNewsRecipe):
    """Recipe for History Today (historytoday.com).

    Logs in with a subscriber account, locates the current issue via the
    site's contents page, and builds one feed per section. Articles are
    fetched through their printer-friendly URLs.
    """

    title = 'History Today'
    __author__ = 'Rick Shang'

    description = 'UK-based magazine, publishing articles and book reviews covering all types and periods of history.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'

    # Strip print-page chrome and the article tool/tag boxes.
    remove_tags = [dict(name='div', attrs={'class': ['print-logo', 'print-site_name', 'print-breadcrumb']}),
                   dict(name='div', attrs={'id': ['ht-tools', 'ht-tools2', 'ht-tags']})]
    no_javascript = True
    no_stylesheets = True

    needs_subscription = True

    def get_browser(self):
        """Return a logged-in browser.

        If the site reports that the session limit is exceeded, pick the
        second listed session in the follow-up form to release it, then
        resubmit.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://www.historytoday.com/user/login')
            br.select_form(nr=1)
            br['name'] = self.username
            br['pass'] = self.password
            res = br.submit()
            raw = res.read()
            # The response body is bytes under Python 3; decode before the
            # substring test to avoid a TypeError.
            if isinstance(raw, bytes):
                raw = raw.decode('utf-8', 'replace')
            if 'Session limit exceeded' in raw:
                br.select_form(nr=1)
                # The 'sid' control is a list control; choose its second
                # item by value. (The original `sid.join(control)` raised
                # AttributeError — lists have no join method.)
                control = br.find_control('sid').items[1]
                br['sid'] = [control.name]
                br.submit()
        return br

    def parse_index(self):
        """Build the feed list for the current issue.

        Returns a list of ``(section_title, articles)`` tuples where each
        article dict carries the printer-friendly URL.
        """
        # Find the issue date from the site front page and show it in the
        # book title.
        soup0 = self.index_to_soup('http://www.historytoday.com/')
        dates = self.tag_to_string(soup0.find(
            'div', attrs={'id': 'block-block-226'}).span)
        self.timefmt = u' [%s]' % dates

        # Go to the issue contents page and grab the cover image.
        soup = self.index_to_soup('http://www.historytoday.com/contents')
        cover = soup.find('div', attrs={
            'id': 'content-area'}).find('img', attrs={'src': re.compile('.*cover.*')})['src']
        self.cover_url = cover
        self.log(self.cover_url)

        # Go to the main body: one block per section of the issue.
        div = soup.find('div', attrs={'class': 'region region-content-bottom'})

        feeds = OrderedDict()
        section_title = ''
        for section in div.findAll('div', attrs={'id': re.compile(r"block\-views\-contents.*")}):
            section_title = self.tag_to_string(
                section.find('h2', attrs={'class': 'title'}))
            sectionbody = section.find('div', attrs={'class': 'view-content'})
            for article in sectionbody.findAll('div', attrs={'class': re.compile(r"views\-row.*")}):
                articles = []
                subarticle = article.findAll('div')
                # Rows without both a title div and a description div are
                # not articles (e.g. decorative rows); skip them.
                if len(subarticle) < 2:
                    continue
                title = self.tag_to_string(subarticle[0])
                originalurl = "http://www.historytoday.com" + \
                    subarticle[0].span.a['href'].strip()
                # Follow the article page once to discover its
                # printer-friendly URL, which is what we actually download.
                originalpage = self.index_to_soup(originalurl)
                printurl = originalpage.find(
                    'div', attrs={'id': 'ht-tools'}).a['href'].strip()
                url = "http://www.historytoday.com" + printurl
                desc = self.tag_to_string(subarticle[1])
                articles.append({'title': title, 'url': url,
                                 'description': desc, 'date': ''})

                if articles:
                    if section_title not in feeds:
                        feeds[section_title] = []
                    feeds[section_title] += articles

        ans = [(key, val) for key, val in feeds.items()]
        return ans

    def cleanup(self):
        # Log out so the subscriber session is released for other devices.
        self.browser.open('http://www.historytoday.com/logout')