#!/usr/local/bin/python3.8
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2011, Piotr Kontek, piotr.kontek@gmail.com \
 2013-2018, Tomasz Długosz, tomek3d@gmail.com'

from calibre.web.feeds.news import BasicNewsRecipe
import re
from lxml import html


class GN(BasicNewsRecipe):
    """Calibre recipe for the Polish Catholic weekly 'Gość Niedzielny'.

    Scrapes gosc.pl: finds the most recent issue, builds one feed per
    category column plus an editorial feed and a catch-all feed of
    unassigned articles, and fetches article fragments.
    """
    __author__ = 'Piotr Kontek, Tomasz Długosz'
    title = u'Gość Niedzielny'
    publisher = 'Wydawnictwo Kurii Metropolitalnej w Katowicach'
    description = 'Ogólnopolski tygodnik katolicki - fragmenty artykułów z aktualnego numeru'
    encoding = 'utf-8'
    no_stylesheets = True
    language = 'pl'
    remove_javascript = True
    ignore_duplicate_articles = {'url'}
    masthead_url = 'http://gosc.pl/static/images/base/logotypes/100x68/gosc-niedzielny.png'

    def find_last_issue(self):
        """Return the site-relative URL of the newest issue.

        Queries the issue-archive search page and takes the first result's
        link via XPath (results are ordered newest-first).
        """
        raw = self.index_to_soup(
            'http://gosc.pl/wyszukaj/wydania/3.Gosc-Niedzielny/', raw=True)
        doc = html.fromstring(raw)
        page = doc.xpath(
            '//div[@class="search-result release-result"]/div[1]/div[2]/h1//a/@href')

        return page[0]

    def parse_index(self):
        """Build the list of (section title, articles) feeds for the issue."""
        self.last_issue = self.find_last_issue()
        soup = self.index_to_soup('http://gosc.pl' + self.last_issue)
        # Cover image: 4th-from-last anchor in the issue header block wraps
        # the cover <img>.
        self.cover_url = 'http://www.gosc.pl' + \
            soup.find('div', attrs={'class': 'fl-w100 release-wp'}
                      ).findAll('a')[-4].contents[0]['src']
        feeds = []
        # editorial:
        a = soup.find('div', attrs={'class': 'release-wp-b'})
        art = a.find('a')
        articles = [
            {'title': self.tag_to_string(art),
             'url': 'http://www.gosc.pl' + art['href'],
             'description': self.tag_to_string(a.find('p', attrs={'class': 'b lead'}))
             }]
        feeds.append((u'Na dobry początek', articles))
        # columns: every category link without a <span> child is a section.
        for addr in soup.findAll('a', attrs={'href': re.compile('kategoria')}):
            if not addr.span:
                main_block = self.index_to_soup(
                    'http://www.gosc.pl' + addr['href'])
                articles = list(self.find_articles(main_block))
                if articles:
                    section = addr.contents[0]
                    feeds.append((section, articles))
        # not assigned content: walk the paginated "wszystko" listing.
        page = 1
        not_assigned = []
        while True:
            soup = self.index_to_soup(
                'http://gosc.pl' + self.last_issue.replace('przeglad', 'wszystko') + '/' + str(page))
            not_assigned.extend(self.find_articles(soup))
            page += 1
            pages = soup.find('span', attrs={'class': 'pgr_nrs'})
            # BUGFIX: single-page listings have no pager span; the original
            # crashed with AttributeError on pages.findAll. Treat a missing
            # pager (or an absent next-page number) as the last page.
            if pages is None or str(page) not in [self.tag_to_string(x) for x in pages.findAll('a')]:
                break

        feeds.append((u'Nieprzypisane', not_assigned))
        return feeds

    def find_articles(self, main_block):
        """Yield article dicts (title/url/date/description) from a listing page."""
        for a in main_block.findAll('div', attrs={'class': ['infoBox']}):
            art = a.find('a')
            yield {
                # Strip the ".TrescKomentarze"-style suffix from the href.
                'title': self.tag_to_string(art),
                'url': 'http://www.gosc.pl' + art['href'].split('.')[0],
                'date': self.tag_to_string(a.find('b', attrs={'class': 'time'})).replace('DODANE', ' '),
                'description': self.tag_to_string(a.find('div', attrs={'class': 'txt'}))
            }

    def append_page(self, soup, appendtag):
        """Fetch the remaining pages of a multi-page article and append their
        body text to *appendtag*."""
        chpage = appendtag.find(attrs={'class': 'pgr_nrs'})
        if chpage:
            for page in chpage.findAll('a'):
                soup2 = self.index_to_soup('http://gosc.pl' + page['href'])
                pagetext = soup2.find(attrs={'class': 'intextAd'})
                # BUGFIX: skip pages whose body container is missing instead
                # of inserting None (which raises in BeautifulSoup).
                if pagetext is None:
                    continue
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)

    def preprocess_html(self, soup):
        """Merge paginated article bodies and upgrade lightbox thumbnails to
        their full-size image URLs."""
        self.append_page(soup, soup.body)
        r = soup.find(attrs={'class': 'lightbox'})
        # Guard r.contents: an empty lightbox anchor would raise IndexError.
        if r and r.contents:
            r.contents[0]['src'] = r['href']
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Drop pager widgets and isolate image-caption blocks with clears."""
        for r in soup.findAll(attrs={'class': 'pgr'}):
            r.extract()
        for r in soup.findAll(attrs={'class': 'cm-i-a'}):
            r.replaceWith('<div style="clear:both"></div>' + r.prettify() + '<div style="clear:both"></div>')
        return soup

    keep_only_tags = [
        dict(name='div', attrs={'class': ['cf txt ', 'cf txt att-audio']})
    ]

    remove_tags = [
        dict(name='p', attrs={'class': 'l l-2 doc-source'}),
        dict(name='span', attrs={'class': 'wykop'}),
        dict(name='div',
             attrs={'class': ['doc_actions', 'cf', 'fr1_cl', 'txt__social-icons', 'txt__tags', 'fb-like fb_iframe_widget', 'jp-audio', 'jp-jplayer']}),
        dict(name='div', attrs={'id': 'vote'}),
        dict(name='a', attrs={'class': 'img_enlarge'})
    ]

    extra_css = '''
        h1 {font-size:150%}
        p.limiter {font-size:150%; font-weight: bold}
        p.gn_subtitle {font-weight: bold}
        span.cm-i-a {text-transform:uppercase;font-size:50%}
        span.cm-i-p {font-style:italic; font-size:70%;text-align:right}
        span.gn_brb {color: red; font-weight: bold}
        div.txt__lead {font-weight: bold; font-size:150%}
        '''