#!/usr/local/bin/python3.8
# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>

try:
    from http.cookiejar import Cookie
except ImportError:
    from cookielib import Cookie

import json
from html5_parser import parse
from lxml import etree

from calibre import replace_entities
from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.date import parse_only_date
from calibre.web.feeds.news import BasicNewsRecipe

# For past editions, set date to, for example, '2020-11-28'
edition_date = None


def E(parent, name, text='', **attrs):
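    '''Create a child element with the given tag name, text and attributes,
    append it to parent and return it.'''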
    ans = parent.makeelement(name, **attrs)
    ans.text = text
    parent.append(ans)
    return ans


def process_node(node, html_parent):
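    '''Recursively convert a node of the article's JSON content tree into
    lxml elements appended to html_parent. Tag nodes become child elements;
    text nodes are appended as text or tail, with HTML entities replaced.'''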
    ntype = node.get('type')
    if ntype == 'tag':
        c = html_parent.makeelement(node['name'])
        c.attrib.update({k: v or '' for k, v in node.get('attribs', {}).items()})
        html_parent.append(c)
        for nc in node.get('children', ()):
            process_node(nc, c)
    elif ntype == 'text':
        text = node.get('data')
        if text:
            text = replace_entities(text)
            if len(html_parent):
                t = html_parent[-1]
                t.tail = (t.tail or '') + text
            else:
                html_parent.text = (html_parent.text or '') + text


def load_article_from_json(raw, root):
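    '''Replace the contents of the <body> in root with an <article> built
    from the JSON payload embedded in the page: subheadline, headline,
    description, dateline, main image and body text.'''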
    data = json.loads(raw)['props']['pageProps']['content']
    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True))
    if isinstance(data, list):
        data = data[0]
    body = root.xpath('//body')[0]
    for child in tuple(body):
        body.remove(child)
    article = E(body, 'article')
    E(article, 'h4', data['subheadline'], style='color: red; margin: 0')
    E(article, 'h1', data['headline'], style='font-size: x-large')
    E(article, 'div', data['description'], style='font-style: italic')
    E(article, 'div', (data['datePublishedString'] or '') + ' | ' + (data['dateline'] or ''), style='color: gray; margin: 1em')
    images = data['image']
    if 'main' in images:
        div = E(article, 'div')
        try:
            E(div, 'img', src=images['main']['url']['canonical'])
        except Exception:
            pass
    text = data['text']
    for node in text:
        process_node(node, article)


def classes(classes):
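    '''Return a BeautifulSoup attrs query matching any element whose class
    attribute shares at least one of the given space-separated class names.'''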
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


def new_tag(soup, name, attrs=()):
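    '''Create a new tag, using soup.new_tag() where available (BeautifulSoup 4)
    and falling back to the Tag constructor (BeautifulSoup 3).'''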
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)


class NoArticles(Exception):
    pass


def process_url(url):
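    '''Make site-relative URLs absolute on economist.com.'''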
    if url.startswith('/'):
        url = 'https://www.economist.com' + url
    return url


class Economist(BasicNewsRecipe):

    title = 'The Economist'
    language = 'en'

    __author__ = 'Kovid Goyal'
    description = (
        'Global news and current affairs from a European'
        ' perspective. Best downloaded on Friday mornings (GMT)'
    )
    extra_css = '''
        .headline {font-size: x-large;}
        h2 { font-size: small;  }
        h1 { font-size: medium;  }
        em.Bold {font-weight:bold;font-style:normal;}
        em.Italic {font-style:italic;}
        p.xhead {font-weight:bold;}
        .pullquote {
            float: right;
            font-size: larger;
            font-weight: bold;
            font-style: italic;
            page-break-inside:avoid;
            border-bottom: 3px solid black;
            border-top: 3px solid black;
            width: 228px;
            margin: 0px 0px 10px 15px;
            padding: 7px 0px 9px;
        }
        .flytitle-and-title__flytitle {
            display: block;
            font-size: smaller;
            color: red;
        }
        '''
    oldest_article = 7.0
    resolve_internal_links = True
    remove_tags = [
        dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']),
        dict(attrs={'aria-label': 'Article Teaser'}),
        dict(attrs={
                'class': [
                    'dblClkTrk', 'ec-article-info', 'share_inline_header',
                    'related-items', 'main-content-container', 'ec-topic-widget',
                    'teaser', 'blog-post__bottom-panel-bottom', 'blog-post__comments-label',
                    'blog-post__foot-note', 'blog-post__sharebar', 'blog-post__bottom-panel',
                    'newsletter-form', 'share-links-header', 'teaser--wrapped', 'latest-updates-panel__container',
                    'latest-updates-panel__article-link', 'blog-post__section'
                ]
            }
        ),
        dict(attrs={
                'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
        classes(
            'share-links-header teaser--wrapped latest-updates-panel__container'
            ' latest-updates-panel__article-link blog-post__section newsletter-form blog-post__bottom-panel'
        )
    ]
    keep_only_tags = [dict(name='article', id=lambda x: not x)]
    no_stylesheets = True
    remove_attributes = ['data-reactid', 'width', 'height']
    # economist.com has started throttling: after about 60% of the issue has
    # been downloaded, connections are reset by peer (errno 104).
    delay = 1

    needs_subscription = False

    def __init__(self, *args, **kwargs):
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        if self.output_profile.short_name.startswith('kindle'):
            # Reduce image sizes to get the file size below Amazon's email
            # sending threshold
            self.web2disk_options.compress_news_images = True
            self.web2disk_options.compress_news_images_auto_size = 5
            self.log.warn('Kindle output profile in use, reducing image quality to keep file size below Amazon email threshold')

    def get_browser(self):
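        '''Return a browser with a cookie pre-set to indicate acceptance of
        the Economist's cookie policy.'''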
        br = BasicNewsRecipe.get_browser(self)
        # Add a cookie indicating we have accepted Economist's cookie
        # policy (needed when running from some European countries)
        ck = Cookie(
            version=0,
            name='notice_preferences',
            value='2:',
            port=None,
            port_specified=False,
            domain='.economist.com',
            domain_specified=False,
            domain_initial_dot=True,
            path='/',
            path_specified=False,
            secure=False,
            expires=None,
            discard=False,
            comment=None,
            comment_url=None,
            rest={'HttpOnly': None},
            rfc2109=False
        )
        br.cookiejar.set_cookie(ck)
        br.set_handle_gzip(True)
        return br

    def preprocess_raw_html(self, raw, url):
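        '''Rebuild the article from the embedded __NEXT_DATA__ JSON when
        present, unwrap lazy-loaded images from their <noscript> fallbacks
        and strip script, style, source and meta tags.'''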
        # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
        root = parse(raw)
        script = root.xpath('//script[@id="__NEXT_DATA__"]')
        if script:
            load_article_from_json(script[0].text, root)
        for div in root.xpath('//div[@class="lazy-image"]'):
            noscript = list(div.iter('noscript'))
            if noscript and noscript[0].text:
                img = list(parse(noscript[0].text).iter('img'))
                if img:
                    p = noscript[0].getparent()
                    idx = p.index(noscript[0])
                    p.insert(idx, p.makeelement('img', src=img[0].get('src')))
                    p.remove(noscript[0])
        for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
            x.getparent().remove(x)
        raw = etree.tostring(root, encoding='unicode')
        return raw

    def populate_article_metadata(self, article, soup, first):
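        '''Build the article summary from the flytitle and rubric text in
        the downloaded page.'''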
        els = soup.findAll(name=['span', 'p'],
                           attrs={'class': ['flytitle-and-title__title', 'blog-post__rubric']})
        result = []
        for el in els[0:2]:
            if el is not None and el.contents:
                for descendant in el.contents:
                    if isinstance(descendant, NavigableString):
                        result.append(type(u'')(descendant))
        article.summary = u'. '.join(result) + u'.'
        article.text_summary = clean_ascii_chars(article.summary)

    def publication_date(self):
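        '''Report the explicitly requested edition date, when one is set.'''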
        if edition_date:
            return parse_only_date(edition_date, as_utc=False)
        return BasicNewsRecipe.publication_date(self)

    def parse_index(self):
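        '''Fetch the index of the current weekly edition (or of a past one
        when edition_date is set) and build the list of sections and
        articles from it.'''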
        # return [('Articles', [{'title': 'test',
        #     'url': 'https://www.economist.com/briefing/2021/09/11/how-america-wasted-its-unipolar-moment'
        # }])]
        if edition_date:
            url = 'https://www.economist.com/weeklyedition/' + edition_date
            self.timefmt = ' [' + edition_date + ']'
        else:
            url = 'https://www.economist.com/printedition'
        raw = self.index_to_soup(url, raw=True)
        # with open('/t/raw.html', 'wb') as f:
        #     f.write(raw)
        soup = self.index_to_soup(raw)
        # nav = soup.find(attrs={'class': 'navigation__wrapper'})
        # if nav is not None:
        #     a = nav.find('a', href=lambda x: x and '/printedition/' in x)
        #     if a is not None:
        #         self.log('Following nav link to current edition', a['href'])
        #         soup = self.index_to_soup(process_url(a['href']))
        ans = self.economist_parse_index(soup)
        if not ans:
            raise NoArticles(
                'Could not find any articles. Either the economist.com '
                'server is having trouble (try again later) or the website '
                'format has changed and the recipe needs to be updated.'
            )
        return ans

    def economist_parse_index(self, soup):
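        '''Extract the cover image from the __NEXT_DATA__ JSON and walk the
        weekly edition sections, collecting (section name, articles) feeds.'''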
        script_tag = soup.find('script', id='__NEXT_DATA__')
        if script_tag is not None:
            data = json.loads(script_tag.string)
            self.cover_url = data['props']['pageProps']['content']['image']['main']['url']['canonical']
            self.log('Got cover:', self.cover_url)
        feeds = []
        for section in soup.findAll(**classes('layout-weekly-edition-section')):
            h2 = section.find('h2')
            secname = self.tag_to_string(h2)
            self.log(secname)
            articles = []
            for a in section.findAll('a', href=True, **classes('headline-link weekly-edition-wtw__link')):
                spans = a.findAll('span')
                if len(spans) == 2:
                    title = u'{}: {}'.format(*map(self.tag_to_string, spans))
                else:
                    title = self.tag_to_string(a)
                desc = ''
                desc_parent = a.findParent('div')
                if desc_parent is not None:
                    p = desc_parent.find(itemprop='description')
                    if p is not None:
                        desc = self.tag_to_string(p)
                articles.append({'title': title, 'url': process_url(a['href']), 'description': desc})
                self.log(' ', title, articles[-1]['url'], '\n   ', desc)
            if articles:
                feeds.append((secname, articles))
        return feeds

    def eco_find_image_tables(self, soup):
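        '''Yield tables that wrap a single image together with a caption in
        one or two <font> tags.'''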
        for x in soup.findAll('table', align=['right', 'center']):
            if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1:
                yield x

    def postprocess_html(self, soup, first):
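        '''Drop srcset attributes and convert image+caption tables into
        simple styled divs.'''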
        for img in soup.findAll('img', srcset=True):
            del img['srcset']
        for table in list(self.eco_find_image_tables(soup)):
            caption = table.find('font')
            img = table.find('img')
            div = new_tag(soup, 'div')
            div['style'] = 'text-align:left;font-size:70%'
            ns = NavigableString(self.tag_to_string(caption))
            div.insert(0, ns)
            div.insert(1, new_tag(soup, 'br'))
            del img['width']
            del img['height']
            img.extract()
            div.insert(2, img)
            table.replaceWith(div)
        return soup

    def canonicalize_internal_url(self, url, is_link=True):
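        '''Treat /print URLs as equivalent to their non-print versions.'''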
        if url.endswith('/print'):
            url = url.rpartition('/')[0]
        return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)