1#!/usr/local/bin/python3.8 2# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net> 3 4try: 5 from http.cookiejar import Cookie 6except ImportError: 7 from cookielib import Cookie 8 9import json 10from html5_parser import parse 11from lxml import etree 12 13from calibre import replace_entities 14from calibre.ebooks.BeautifulSoup import NavigableString, Tag 15from calibre.utils.cleantext import clean_ascii_chars 16from calibre.utils.date import parse_only_date 17from calibre.web.feeds.news import BasicNewsRecipe 18 19# For past editions, set date to, for example, '2020-11-28' 20edition_date = None 21 22 23def E(parent, name, text='', **attrs): 24 ans = parent.makeelement(name, **attrs) 25 ans.text = text 26 parent.append(ans) 27 return ans 28 29 30def process_node(node, html_parent): 31 ntype = node.get('type') 32 if ntype == 'tag': 33 c = html_parent.makeelement(node['name']) 34 c.attrib.update({k: v or '' for k, v in node.get('attribs', {}).items()}) 35 html_parent.append(c) 36 for nc in node.get('children', ()): 37 process_node(nc, c) 38 elif ntype == 'text': 39 text = node.get('data') 40 if text: 41 text = replace_entities(text) 42 if len(html_parent): 43 t = html_parent[-1] 44 t.tail = (t.tail or '') + text 45 else: 46 html_parent.text = (html_parent.text or '') + text 47 48 49def load_article_from_json(raw, root): 50 data = json.loads(raw)['props']['pageProps']['content'] 51 # open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True)) 52 if isinstance(data, list): 53 data = data[0] 54 body = root.xpath('//body')[0] 55 for child in tuple(body): 56 body.remove(child) 57 article = E(body, 'article') 58 E(article, 'h4', data['subheadline'], style='color: red; margin: 0') 59 E(article, 'h1', data['headline'], style='font-size: x-large') 60 E(article, 'div', data['description'], style='font-style: italic') 61 E(article, 'div', (data['datePublishedString'] or '') + ' | ' + (data['dateline'] or ''), style='color: gray; margin: 1em') 62 
images = data['image'] 63 if 'main' in images: 64 div = E(article, 'div') 65 try: 66 E(div, 'img', src=images['main']['url']['canonical']) 67 except Exception: 68 pass 69 text = data['text'] 70 for node in text: 71 process_node(node, article) 72 73 74def classes(classes): 75 q = frozenset(classes.split(' ')) 76 return dict(attrs={ 77 'class': lambda x: x and frozenset(x.split()).intersection(q)}) 78 79 80def new_tag(soup, name, attrs=()): 81 impl = getattr(soup, 'new_tag', None) 82 if impl is not None: 83 return impl(name, attrs=dict(attrs)) 84 return Tag(soup, name, attrs=attrs or None) 85 86 87class NoArticles(Exception): 88 pass 89 90 91def process_url(url): 92 if url.startswith('/'): 93 url = 'https://www.economist.com' + url 94 return url 95 96 97class Economist(BasicNewsRecipe): 98 99 title = 'The Economist' 100 language = 'en' 101 102 __author__ = "Kovid Goyal" 103 description = ( 104 'Global news and current affairs from a European' 105 ' perspective. Best downloaded on Friday mornings (GMT)' 106 ) 107 extra_css = ''' 108 .headline {font-size: x-large;} 109 h2 { font-size: small; } 110 h1 { font-size: medium; } 111 em.Bold {font-weight:bold;font-style:normal;} 112 em.Italic {font-style:italic;} 113 p.xhead {font-weight:bold;} 114 .pullquote { 115 float: right; 116 font-size: larger; 117 font-weight: bold; 118 font-style: italic; 119 page-break-inside:avoid; 120 border-bottom: 3px solid black; 121 border-top: 3px solid black; 122 width: 228px; 123 margin: 0px 0px 10px 15px; 124 padding: 7px 0px 9px; 125 } 126 .flytitle-and-title__flytitle { 127 display: block; 128 font-size: smaller; 129 color: red; 130 } 131 ''' 132 oldest_article = 7.0 133 resolve_internal_links = True 134 remove_tags = [ 135 dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']), 136 dict(attrs={'aria-label': "Article Teaser"}), 137 dict(attrs={ 138 'class': [ 139 'dblClkTrk', 'ec-article-info', 'share_inline_header', 140 'related-items', 
                'main-content-container', 'ec-topic-widget',
                'teaser', 'blog-post__bottom-panel-bottom', 'blog-post__comments-label',
                'blog-post__foot-note', 'blog-post__sharebar', 'blog-post__bottom-panel',
                'newsletter-form','share-links-header','teaser--wrapped', 'latest-updates-panel__container',
                'latest-updates-panel__article-link','blog-post__section'
            ]
        }
        ),
        dict(attrs={
            'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
        classes(
            'share-links-header teaser--wrapped latest-updates-panel__container'
            ' latest-updates-panel__article-link blog-post__section newsletter-form blog-post__bottom-panel'
        )
    ]
    # Keep only the <article> elements that carry no id attribute
    keep_only_tags = [dict(name='article', id=lambda x: not x)]
    no_stylesheets = True
    remove_attributes = ['data-reactid', 'width', 'height']
    # economist.com has started throttling after about 60% of the total has
    # downloaded with connection reset by peer (104) errors.
    delay = 1

    needs_subscription = False

    def __init__(self, *args, **kwargs):
        """Initialize the recipe; dial down image quality on Kindle output."""
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        if self.output_profile.short_name.startswith('kindle'):
            # Reduce image sizes to get file size below amazon's email
            # sending threshold
            self.web2disk_options.compress_news_images = True
            self.web2disk_options.compress_news_images_auto_size = 5
            self.log.warn('Kindle Output profile being used, reducing image quality to keep file size below amazon email threshold')

    def get_browser(self):
        """Return a browser preloaded with a cookie-consent cookie."""
        br = BasicNewsRecipe.get_browser(self)
        # Add a cookie indicating we have accepted Economist's cookie
        # policy (needed when running from some European countries)
        ck = Cookie(
            version=0,
            name='notice_preferences',
            value='2:',
            port=None,
            port_specified=False,
            domain='.economist.com',
            domain_specified=False,
            domain_initial_dot=True,
            path='/',
            path_specified=False,
            secure=False,
            expires=None,
            discard=False,
            comment=None,
            comment_url=None,
            rest={'HttpOnly': None},
            rfc2109=False
        )
        br.cookiejar.set_cookie(ck)
        br.set_handle_gzip(True)
        return br

    def preprocess_raw_html(self, raw, url):
        """Rebuild the article from embedded JSON and strip script/style noise.

        Returns the cleaned page re-serialized as a unicode HTML string.
        """
        # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
        root = parse(raw)
        script = root.xpath('//script[@id="__NEXT_DATA__"]')
        if script:
            load_article_from_json(script[0].text, root)
        # Promote the <noscript> fallback images of lazily-loaded pictures
        # to real <img> tags so they survive the download
        for div in root.xpath('//div[@class="lazy-image"]'):
            noscript = list(div.iter('noscript'))
            if noscript and noscript[0].text:
                img = list(parse(noscript[0].text).iter('img'))
                if img:
                    p = noscript[0].getparent()
                    idx = p.index(noscript[0])
                    p.insert(idx, p.makeelement('img', src=img[0].get('src')))
                    p.remove(noscript[0])
        for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
            x.getparent().remove(x)
        raw = etree.tostring(root, encoding='unicode')
        return raw

    def populate_article_metadata(self, article, soup, first):
        """Build the article summary from the flytitle/rubric text nodes."""
        els = soup.findAll(name=['span', 'p'],
                           attrs={'class': ['flytitle-and-title__title', 'blog-post__rubric']})
        result = []
        # Use at most the first two matches; collect only their direct
        # string children
        for el in els[0:2]:
            if el is not None and el.contents:
                for descendant in el.contents:
                    if isinstance(descendant, NavigableString):
                        result.append(type(u'')(descendant))
        article.summary = u'. '.join(result) + u'.'
        article.text_summary = clean_ascii_chars(article.summary)

    def publication_date(self):
        """Use the explicitly requested edition date when one is set."""
        if edition_date:
            return parse_only_date(edition_date, as_utc=False)
        return BasicNewsRecipe.publication_date(self)

    def parse_index(self):
        """Fetch the weekly/print edition index and return the feed list.

        Raises NoArticles when nothing could be parsed out of the page.
        """
        # return [('Articles', [{'title':'test',
        #     'url':'https://www.economist.com/briefing/2021/09/11/how-america-wasted-its-unipolar-moment'
        # }])]
        if edition_date:
            url = 'https://www.economist.com/weeklyedition/' + edition_date
            self.timefmt = ' [' + edition_date + ']'
        else:
            url = 'https://www.economist.com/printedition'
        raw = self.index_to_soup(url, raw=True)
        # with open('/t/raw.html', 'wb') as f:
        #     f.write(raw)
        soup = self.index_to_soup(raw)
        # nav = soup.find(attrs={'class':'navigation__wrapper'})
        # if nav is not None:
        #     a = nav.find('a', href=lambda x: x and '/printedition/' in x)
        #     if a is not None:
        #         self.log('Following nav link to current edition', a['href'])
        #         soup = self.index_to_soup(process_url(a['href']))
        ans = self.economist_parse_index(soup)
        if not ans:
            raise NoArticles(
                'Could not find any articles, either the '
                'economist.com server is having trouble and you should '
                'try later or the website format has changed and the '
                'recipe needs to be updated.'
            )
        return ans

    def economist_parse_index(self, soup):
        """Extract the cover image and the per-section article lists from the
        edition index page."""
        script_tag = soup.find("script", id="__NEXT_DATA__")
        if script_tag is not None:
            data = json.loads(script_tag.string)
            self.cover_url = data['props']['pageProps']['content']['image']['main']['url']['canonical']
            self.log('Got cover:', self.cover_url)
        feeds = []
        for section in soup.findAll(**classes('layout-weekly-edition-section')):
            h2 = section.find('h2')
            secname = self.tag_to_string(h2)
            self.log(secname)
            articles = []
            for a in section.findAll('a', href=True, **classes('headline-link weekly-edition-wtw__link')):
                spans = a.findAll('span')
                # Two spans means flytitle + headline; join them as one title
                if len(spans) == 2:
                    title = u'{}: {}'.format(*map(self.tag_to_string, spans))
                else:
                    title = self.tag_to_string(a)
                desc = ''
                desc_parent = a.findParent('div')
                if desc_parent is not None:
                    p = desc_parent.find(itemprop='description')
                    if p is not None:
                        desc = self.tag_to_string(p)
                articles.append({'title': title, 'url': process_url(a['href']), 'description': desc})
                self.log(' ', title, articles[-1]['url'], '\n   ', desc)
            if articles:
                feeds.append((secname, articles))
        return feeds

    def eco_find_image_tables(self, soup):
        """Yield legacy layout tables that wrap a single image plus caption."""
        for x in soup.findAll('table', align=['right', 'center']):
            if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1:
                yield x

    def postprocess_html(self, soup, first):
        """Drop srcset attributes and flatten image tables into captioned divs."""
        for img in soup.findAll('img', srcset=True):
            del img['srcset']
        for table in list(self.eco_find_image_tables(soup)):
            caption = table.find('font')
            img = table.find('img')
            div = new_tag(soup, 'div')
            div['style'] = 'text-align:left;font-size:70%'
            ns = NavigableString(self.tag_to_string(caption))
            div.insert(0, ns)
            div.insert(1, new_tag(soup, 'br'))
            del img['width']
            del img['height']
            img.extract()
            div.insert(2, img)
            table.replaceWith(div)
        return soup

    def canonicalize_internal_url(self, url, is_link=True):
        """Strip the trailing '/print' so print and web URLs compare equal."""
        if url.endswith('/print'):
            url = url.rpartition('/')[0]
        return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)