1#!/usr/local/bin/python3.8 2# vim:fileencoding=utf-8 3# 4# 11 Jan 2021 - L. Houpert - Major changes in the Mediapart recipe: 5# 1) Summary of the article are noow available 6# 2) Additional sections International, France, Economie and Culture have 7# been added through custom entries in the function my_parse_index. 8# 3) Fix the cover image so it doesnt disappear from the Kindle menu 9# ( cover image format is changed to .jpeg) 10# 14 Jan 2021 - Add Mediapart Logo url as masthead_url and change cover 11# by overlaying the date on top of the Mediapart cover 12from __future__ import unicode_literals 13 14__license__ = 'GPL v3' 15__copyright__ = '2021, Loïc Houpert <houpertloic at gmail .com>. Adapted from: 2016, Daniel Bonnery; 2009, Mathieu Godlewski; 2010-2012, Louis Gesbert' # noqa 16''' 17Mediapart 18''' 19 20import re 21from datetime import date, datetime, timezone, timedelta 22from calibre.web.feeds import feeds_from_index 23from calibre.web.feeds.news import BasicNewsRecipe 24 25 26def classes(classes): 27 q = frozenset(classes.split(' ')) 28 return dict( 29 attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)} 30 ) 31 32 33class Mediapart(BasicNewsRecipe): 34 title = 'Mediapart' 35 __author__ = 'Loïc Houpert' 36 description = 'Global news in French from news site Mediapart' 37 publication_type = 'newspaper' 38 language = 'fr' 39 needs_subscription = True 40 oldest_article = 2 41 42 use_embedded_content = False 43 no_stylesheets = True 44 45 keep_only_tags = [ 46 dict(name='h1'), 47 dict(name='div', **classes('author')), 48 classes('news__heading__top__intro news__body__center__article') 49 ] 50 remove_tags = [ 51 classes('login-subscribe print-source_url'), 52 dict(name='svg'), 53 ] 54 conversion_options = {'smarten_punctuation': True} 55 56 masthead_url = "https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart_masthead.png" 57 # cover_url = 'https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart.jpeg' 58 59 # -- 60 61 # Get date in french time zone format 62 today = datetime.now(timezone.utc) + timedelta(hours=1) 63 oldest_article_date = today - timedelta(days=oldest_article) 64 65 feeds = [ 66 ('La Une', 'http://www.mediapart.fr/articles/feed'), 67 ] 68 69 # The feed at 'http://www.mediapart.fr/articles/feed' only displayed the 10 70 # last elements so the articles are indexed on specific pages 71 # in the function my_parse_index. In this function the article are parsed 72 # using the funtion get_articles and the dict values dict_article_sources 73 74 def parse_feeds(self): 75 feeds = super(Mediapart, self).parse_feeds() 76 feeds += feeds_from_index(self.my_parse_index(feeds)) 77 return feeds 78 79 def my_parse_index(self, la_une): 80 81 dict_article_sources = [ 82 { 83 'type': 'Brèves', 84 'webpage': 'https://www.mediapart.fr/journal/fil-dactualites', 85 'separador': { 86 'page': 'ul', 87 'thread': 'li' 88 } 89 }, 90 { 91 'type': 'International', 92 'webpage': 'https://www.mediapart.fr/journal/international', 93 'separador': { 94 'page': 'div', 95 'thread': 'div' 96 } 97 }, 98 { 99 'type': 'France', 100 'webpage': 'https://www.mediapart.fr/journal/france', 101 'separador': { 102 'page': 'div', 103 'thread': 'div' 104 } 105 }, 106 { 107 'type': 'Économie', 108 'webpage': 'https://www.mediapart.fr/journal/economie', 109 'separador': { 110 'page': 'div', 111 'thread': 'div' 112 } 113 }, 114 { 115 'type': 'Culture', 116 'webpage': 'https://www.mediapart.fr/journal/culture-idees', 117 'separador': { 118 'page': 'div', 119 'thread': 'div' 120 } 121 }, 122 ] 123 124 def get_articles( 125 type_of_article, webpage, separador_page='ul', separador_thread='li' 126 ): 127 128 specific_articles = [] 129 130 webpage_article = [] 131 soup = self.index_to_soup(webpage) 132 page = soup.find('main', {'class': 'global-wrapper'}) 133 fils = page.find(separador_page, {'class': 'post-list universe-journal'}) 134 135 all_articles = fils.findAll(separador_thread) 136 for article in all_articles: 137 try: 138 title = article.find('h3', recursive=False) 139 if title is None or ''.join(title['class']) == 'title-specific': 140 # print(f"[BAD title entry] Print value of title:\n {title}") 141 continue 142 # print(f"\n[OK title entry] Print value of title:\n {title}\n") 143 144 try: 145 article_mot_cle = article.find( 146 'a', { 147 'href': re.compile(r'.*\/mot-cle\/.*') 148 } 149 ).renderContents().decode('utf-8') 150 except Exception: 151 article_mot_cle = '' 152 153 try: 154 article_type = article.find( 155 'a', { 156 'href': re.compile(r'.*\/type-darticles\/.*') 157 } 158 ).renderContents().decode('utf-8') 159 except Exception: 160 article_type = '' 161 162 for s in title('span'): 163 s.replaceWith(s.renderContents().decode('utf-8') + "\n") 164 url = title.find('a', href=True)['href'] 165 166 date = article.find('time', datetime=True)['datetime'] 167 article_date = datetime.strptime(date, '%Y-%m-%d') 168 # Add French timezone to date of the article for date check 169 article_date = article_date.replace(tzinfo=timezone.utc) + timedelta(hours=1) 170 if article_date < self.oldest_article_date: 171 print("article_date < self.oldest_article_date\n") 172 continue 173 174 # print("-------- Recent article added to the list ------- \n") 175 all_authors = article.findAll( 176 'a', {'class': re.compile(r'\bjournalist\b')} 177 ) 178 authors = [self.tag_to_string(a) for a in all_authors] 179 # print(f"Authors in tag <a>: {authors}") 180 181 # If not link to the author profile is available the 182 # html separador is a span tag 183 if not all_authors: 184 try: 185 all_authors = article.findAll( 186 'span', {'class': re.compile(r'\bjournalist\b')} 187 ) 188 authors = [self.tag_to_string(a) for a in all_authors] 189 # print(f"Authors in tag <span>: {authors}") 190 except: 191 authors = 'unknown' 192 193 description = article.find('p').renderContents().decode('utf-8') 194 # print(f" <p> in article : {self.tag_to_string(description).strip()} ") 195 196 summary = { 197 'title': self.tag_to_string(title).strip(), 198 'description': description, 199 'date': article_date.strftime("%a, %d %b, %Y %H:%M"), 200 'author': ', '.join(authors), 201 'article_type': article_type, 202 'mot_cle': article_mot_cle.capitalize(), 203 'url': 'https://www.mediapart.fr' + url, 204 } 205 206 webpage_article.append(summary) 207 except Exception: 208 pass 209 210 specific_articles += [(type_of_article, 211 webpage_article)] if webpage_article else [] 212 return specific_articles 213 214 articles = [] 215 216 for category in dict_article_sources: 217 articles += get_articles( 218 category['type'], category['webpage'], category['separador']['page'], 219 category['separador']['thread'] 220 ) 221 222 return articles 223 224 # non-locale specific date parse (strptime("%d %b %Y",s) would work with 225 # french locale) 226 def parse_french_date(self, date_str): 227 date_arr = date_str.lower().split() 228 return date( 229 day=int(date_arr[0]), 230 year=int(date_arr[2]), 231 month=[ 232 None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 233 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre' 234 ].index(date_arr[1]) 235 ) 236 237 def get_browser(self): 238 # -- Handle login 239 240 def is_form_login(form): 241 return "id" in form.attrs and form.attrs['id'] == "logFormEl" 242 243 br = BasicNewsRecipe.get_browser(self) 244 if self.username is not None and self.password is not None: 245 br.open('https://www.mediapart.fr/login') 246 br.select_form(predicate=is_form_login) 247 br['name'] = self.username 248 br['password'] = self.password 249 br.submit() 250 return br 251 252 def default_cover(self, cover_file): 253 ''' 254 Create a generic cover for recipes that don't have a cover 255 ''' 256 from PyQt5.Qt import QImage, QPainter, QPen, Qt, QFont, QRect 257 from calibre.gui2 import ensure_app, load_builtin_fonts, pixmap_to_data 258 259 def init_environment(): 260 ensure_app() 261 load_builtin_fonts() 262 263 def create_cover_mediapart(date): 264 ' Create a cover for mediapart adding the date on Mediapart Cover' 265 init_environment() 266 # Get data 267 image_url = 'https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart.jpeg' 268 data = self.index_to_soup(image_url, raw=True) 269 # Get date and hour corresponding to french time zone 270 today = datetime.now(timezone.utc) + timedelta(hours=1) 271 wkd = today.weekday() 272 french_weekday={0:'Mon',1:'Mar',2:'Mer',3:'Jeu',4:'Ven',5:'Sam',6:'Dim'} 273 day = french_weekday[wkd]+'.' 274 date = day + ' ' + today.strftime('%d %b. %Y') 275 edition = today.strftime('Édition de %Hh') 276 277 # Get Cover data 278 img = QImage() 279 img.loadFromData(data) 280 281 # Overlay date on cover 282 p = QPainter(img) 283 pen = QPen(Qt.black) 284 pen.setWidth(6) 285 p.setPen(pen) 286 font = QFont() 287 font.setFamily('Times') 288 font.setPointSize(78) 289 p.setFont(font) 290 r = QRect(0, 600, 744,100) 291 p.drawText(r, Qt.AlignmentFlag.AlignJustify | Qt.AlignmentFlag.AlignVCenter | Qt.AlignmentFlag.AlignCenter, date) 292 p.end() 293 294 # Overlay edition information on cover 295 p = QPainter(img) 296 pen = QPen(Qt.black) 297 pen.setWidth(4) 298 p.setPen(pen) 299 font = QFont() 300 font.setFamily('Times') 301 font.setItalic(True) 302 font.setPointSize(66) 303 p.setFont(font) 304 # Add date 305 r = QRect(0, 720, 744,100) 306 p.drawText(r, Qt.AlignmentFlag.AlignJustify | Qt.AlignmentFlag.AlignVCenter | Qt.AlignmentFlag.AlignCenter, edition) 307 p.end() 308 return pixmap_to_data(img) 309 310 try: 311 today=datetime.today() 312 date = today.strftime('%d %b %Y') 313 img_data = create_cover_mediapart(date) 314 cover_file.write(img_data) 315 cover_file.flush() 316 except Exception: 317 self.log.exception('Failed to generate default cover') 318 return False 319 return True 320