#!/usr/local/bin/python3.8

__license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'

import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Comment


class Esensja(BasicNewsRecipe):
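    """Download the current issue of Esensja, a Polish popular-culture
    magazine, from http://esensja.pl and build it into an e-book."""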

    title = u'Esensja'
    __author__ = 'matek09 & fenuks'
    description = 'Magazyn kultury popularnej'
    encoding = 'utf-8'
    no_stylesheets = True
    language = 'pl'
    remove_javascript = True
    masthead_url = 'http://esensja.pl/img/wrss.gif'
    oldest_article = 1
    URL = 'http://esensja.pl'
    HREF = '0'
    remove_attributes = ['style', 'bgcolor', 'alt', 'color']
    keep_only_tags = [dict(attrs={'class': 'sekcja'})]
    # keep_only_tags.append(dict(name='div', attrs={'class': 'article'}))
    remove_tags_after = dict(id='tekst')

    remove_tags = [
        dict(name='img', attrs={'src': ['../../../2000/01/img/tab_top.gif',
                                        '../../../2000/01/img/tab_bot.gif']}),
        dict(name='div', attrs={'class': 't-title2 nextpage'}),
        # dict(attrs={'rel': 'lightbox[galeria]'})
        dict(attrs={'class': ['tekst_koniec', 'ref', 'wykop']}),
        dict(attrs={'itemprop': ['copyrightHolder', 'publisher']}),
        dict(id='komentarze'),
    ]

    extra_css = '''
        .t-title {font-size: x-large; font-weight: bold; text-align: left}
        .t-author {font-size: x-small; text-align: left}
        .t-title2 {font-size: x-small; font-style: italic; text-align: left}
        .text {font-size: small; text-align: left}
        .annot-ref {font-style: italic; text-align: left}
    '''

    # One pattern is enough to strip both title and alt attributes from
    # the raw HTML; the previous separate alt-only rule was redundant.
    preprocess_regexps = [
        (re.compile(r'(title|alt)="[^"]*?"', re.DOTALL), lambda match: ''),
    ]

    def parse_index(self):
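        """Build the feed list from the newest issue's table of contents.

        The issue's base URL is stored in self.HREF and reused later when
        resolving relative article links.
        """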
        soup = self.index_to_soup('http://www.esensja.pl/magazyn/')
        a = soup.find('a', attrs={'href': re.compile(r'.*/index\.html')})
        year = a['href'].split('/')[0]
        month = a['href'].split('/')[1]
        self.HREF = 'http://www.esensja.pl/magazyn/' + year + '/' + month + '/iso/'
        soup = self.index_to_soup(self.HREF + '01.html')
        self.cover_url = 'http://www.esensja.pl/magazyn/' + \
            year + '/' + month + '/img/ilustr/cover_b.jpg'
        feeds = []
        chapter = ''
        subchapter = ''
        articles = []
        intro = soup.find('div', attrs={'class': 'n-title'})
        '''
        introduction = {'title' : self.tag_to_string(intro.a),
                        'url' : self.HREF + intro.a['href'],
                        'date' : '',
                        'description' : ''}
        chapter = 'Wprowadzenie'
        articles.append(introduction)
        '''

        for tag in intro.findAllNext(attrs={'class': ['chapter', 'subchapter', 'n-title']}):
            if tag.name == 'td':
                if len(articles) > 0:
                    section = chapter
                    if len(subchapter) > 0:
                        section += ' - ' + subchapter
                    feeds.append((section, articles))
                    articles = []
                if ''.join(tag['class']) == 'chapter':
                    chapter = self.tag_to_string(tag).capitalize()
                    subchapter = ''
                else:
                    subchapter = self.tag_to_string(tag)
                continue

            finalurl = tag.a['href']
            if not finalurl.startswith('http'):
                finalurl = self.HREF + finalurl
            articles.append({'title': self.tag_to_string(tag.a),
                             'url': finalurl, 'date': '', 'description': ''})

            # Articles can span several pages; follow the 'nextpage' links
            # and add each continuation page as its own entry.
            a = self.index_to_soup(finalurl)
            i = 1

            while True:
                div = a.find('div', attrs={'class': 't-title2 nextpage'})
                if div is None:
                    break
                link = div.a['href']
                if not link.startswith('http'):
                    link = self.HREF + link
                a = self.index_to_soup(link)
                articles.append({'title': self.tag_to_string(tag.a) + ' c. d. ' + str(i),
                                 'url': link, 'date': '', 'description': ''})
                i = i + 1

        return feeds

    def append_page(self, soup, appendtag):
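        """Fetch the remaining pages of a multi-page article and append
        their text to appendtag, then drop pager leftovers, scripts and
        HTML comments."""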
        r = appendtag.find(attrs={'class': 'wiecej_xxx'})
        if r:
            nr = r.findAll(attrs={'class': 'tn-link'})[-1]
            try:
                nr = int(nr.a.string)
            except Exception:
                return
            baseurl = soup.find(attrs={'property': 'og:url'})['content'] + '&strona={0}'
            for number in range(2, nr + 1):
                soup2 = self.index_to_soup(baseurl.format(number))
                pagetext = soup2.find(attrs={'class': 'tresc'})
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)
            for r in appendtag.findAll(attrs={'class': ['wiecej_xxx', 'tekst_koniec']}):
                r.extract()
            for r in appendtag.findAll('script'):
                r.extract()

            comments = appendtag.findAll(
                text=lambda text: isinstance(text, Comment))
            for comment in comments:
                comment.extract()

    def preprocess_html(self, soup):
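        """Stitch paginated articles together and rewrite right-floated
        image boxes into a plain absolute <img> plus its caption."""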
        self.append_page(soup, soup.body)
        for tag in soup.findAll(attrs={'class': 'img_box_right'}):
            temp = tag.find('img')
            src = ''
            if temp:
                src = temp.get('src', '')
            for r in tag.findAll('a', recursive=False):
                r.extract()
            info = tag.find(attrs={'class': 'img_info'})
            text = str(tag)
            if not src:
                # Fall back to pulling the src attribute out of the raw
                # markup; capture only the value, without the quotes.
                m = re.search(r'src="([^"]*?)"', text)
                if m:
                    src = m.group(1).replace('//', '/')
            if src:
                tag.contents = []
                tag.insert(0, BeautifulSoup(
                    '<img src="{0}{1}" />'.format(self.URL, src)))
            if info:
                tag.insert(len(tag.contents), info)
        return soup