#!/usr/local/bin/python3.8

__license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'

import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment


class Esensja(BasicNewsRecipe):
    """Calibre recipe for Esensja (http://esensja.pl), a Polish pop-culture
    magazine published as monthly issues with a table-of-contents page.

    ``parse_index`` scrapes the newest issue's TOC into calibre feeds;
    ``append_page``/``preprocess_html`` stitch paginated articles together
    and normalise image boxes.
    """

    title = u'Esensja'
    __author__ = 'matek09 & fenuks'
    description = 'Magazyn kultury popularnej'
    encoding = 'utf-8'
    no_stylesheets = True
    language = 'pl'
    remove_javascript = True
    masthead_url = 'http://esensja.pl/img/wrss.gif'
    oldest_article = 1
    URL = 'http://esensja.pl'
    # Base URL of the current issue; replaced with the real issue URL in
    # parse_index() before any article is fetched.
    HREF = '0'
    remove_attributes = ['style', 'bgcolor', 'alt', 'color']
    keep_only_tags = [dict(attrs={'class': 'sekcja'}), ]
    remove_tags_after = dict(id='tekst')

    remove_tags = [
        dict(name='img', attrs={'src': ['../../../2000/01/img/tab_top.gif',
                                        '../../../2000/01/img/tab_bot.gif']}),
        dict(name='div', attrs={'class': 't-title2 nextpage'}),
        dict(attrs={'class': ['tekst_koniec', 'ref', 'wykop']}),
        dict(attrs={'itemprop': ['copyrightHolder', 'publisher']}),
        dict(id='komentarze'),
    ]

    extra_css = '''
    .t-title {font-size: x-large; font-weight: bold; text-align: left}
    .t-author {font-size: x-small; text-align: left}
    .t-title2 {font-size: x-small; font-style: italic; text-align: left}
    .text {font-size: small; text-align: left}
    .annot-ref {font-style: italic; text-align: left}
    '''

    # Strip title/alt attributes so hover tooltips do not leak into the text.
    preprocess_regexps = [
        (re.compile(r'alt="[^"]*"'), lambda match: ''),
        (re.compile(u'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''),
    ]

    def parse_index(self):
        """Build the feed list from the newest issue's table of contents.

        Returns a list of ``(section_title, articles)`` tuples as expected
        by calibre.  Multi-page articles get one extra entry per follow-up
        page ('c. d.' is Polish shorthand for "continued").
        """
        soup = self.index_to_soup('http://www.esensja.pl/magazyn/')
        a = soup.find('a', attrs={'href': re.compile('.*/index.html')})
        # The newest issue link looks like '<year>/<month>/index.html'.
        year = a['href'].split('/')[0]
        month = a['href'].split('/')[1]
        self.HREF = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/iso/'
        soup = self.index_to_soup(self.HREF + '01.html')
        self.cover_url = ('http://www.esensja.pl/magazyn/' + year + '/' +
                          month + '/img/ilustr/cover_b.jpg')
        feeds = []
        chapter = ''
        subchapter = ''
        articles = []
        intro = soup.find('div', attrs={'class': 'n-title'})

        def flush_section():
            # Push the articles collected so far into feeds under the
            # current "Chapter" or "Chapter - subchapter" heading.
            if articles:
                section = chapter
                if len(subchapter) > 0:
                    section += ' - ' + subchapter
                feeds.append((section, list(articles)))
                del articles[:]

        for tag in intro.findAllNext(attrs={'class': ['chapter', 'subchapter', 'n-title']}):
            # Section headers live in <td> cells; article titles are divs.
            # BUG FIX: was `tag.name in 'td'`, a substring test that also
            # matched tags named 't' or 'd'.
            if tag.name == 'td':
                flush_section()
                if ''.join(tag['class']) == 'chapter':
                    chapter = self.tag_to_string(tag).capitalize()
                    subchapter = ''
                else:
                    # BUG FIX: this assignment was duplicated in the original.
                    subchapter = self.tag_to_string(tag)
                continue

            finalurl = tag.a['href']
            if not finalurl.startswith('http'):
                finalurl = self.HREF + finalurl
            articles.append({'title': self.tag_to_string(tag.a),
                             'url': finalurl, 'date': '', 'description': ''})

            # Follow "next page" links, adding each continuation page as a
            # separate entry so nothing past page one is lost.
            a = self.index_to_soup(finalurl)
            i = 1
            while True:
                div = a.find('div', attrs={'class': 't-title2 nextpage'})
                if div is None:
                    break
                link = div.a['href']
                if not link.startswith('http'):
                    link = self.HREF + link
                a = self.index_to_soup(link)
                articles.append({'title': self.tag_to_string(tag.a) + ' c. d. ' + str(i),
                                 'url': link, 'date': '', 'description': ''})
                i = i + 1

        # BUG FIX: the original returned without flushing the last section,
        # silently dropping every article after the final header.
        flush_section()
        return feeds

    def append_page(self, soup, appendtag):
        """Fetch the remaining pages of a paginated article (driven by the
        'wiecej_xxx' pager) and append their text to *appendtag*, then strip
        pager markup, scripts and HTML comments."""
        r = appendtag.find(attrs={'class': 'wiecej_xxx'})
        if r:
            # Last 'tn-link' holds the total page count.
            nr = r.findAll(attrs={'class': 'tn-link'})[-1]
            try:
                nr = int(nr.a.string)
            except Exception:
                # Pager present but unparsable -> keep page one only.
                return
            baseurl = soup.find(attrs={'property': 'og:url'})[
                'content'] + '&strona={0}'
            for number in range(2, nr + 1):
                soup2 = self.index_to_soup(baseurl.format(number))
                pagetext = soup2.find(attrs={'class': 'tresc'})
                appendtag.insert(len(appendtag.contents), pagetext)
            for r in appendtag.findAll(attrs={'class': ['wiecej_xxx', 'tekst_koniec']}):
                r.extract()
            for r in appendtag.findAll('script'):
                r.extract()

        comments = appendtag.findAll(
            text=lambda text: isinstance(text, Comment))
        for comment in comments:
            comment.extract()

    def preprocess_html(self, soup):
        """Inline follow-up pages and rewrite 'img_box_right' image boxes to
        a bare absolute <img>, keeping the caption ('img_info') if present."""
        self.append_page(soup, soup.body)
        for tag in soup.findAll(attrs={'class': 'img_box_right'}):
            temp = tag.find('img')
            src = temp.get('src', '') if temp else ''
            for r in tag.findAll('a', recursive=False):
                r.extract()
            info = tag.find(attrs={'class': 'img_info'})
            text = str(tag)
            if not src:
                # Fall back to scraping the raw markup for a src attribute.
                # BUG FIX: the original kept the closing quote of the matched
                # attribute (`src="..."`[5:]), corrupting the rebuilt URL;
                # a capture group extracts only the path.
                m = re.search('src="([^"]*?)"', text)
                if m:
                    src = m.group(1).replace('//', '/')
            if src:
                tag.contents = []
                tag.insert(0, BeautifulSoup(
                    '<img src="{0}{1}" />'.format(self.URL, src)))
                if info:
                    tag.insert(len(tag.contents), info)
        return soup