1#!/usr/local/bin/python3.8
2# vim:fileencoding=utf-8
3#
# 11 Jan 2021 - L. Houpert - Major changes in the Mediapart recipe:
#   1) Summaries of the articles are now available
#   2) Additional sections International, France, Economie and Culture have
# been added through custom entries in the function my_parse_index.
#   3) Fix the cover image so it doesn't disappear from the Kindle menu
# (cover image format is changed to .jpeg)
10# 14 Jan 2021 - Add Mediapart Logo url as masthead_url and change cover
11#   by overlaying the date on top of the Mediapart cover
12from __future__ import unicode_literals
13
14__license__ = 'GPL v3'
15__copyright__ = '2021, Loïc Houpert <houpertloic at gmail .com>. Adapted from: 2016, Daniel Bonnery; 2009, Mathieu Godlewski; 2010-2012, Louis Gesbert'  # noqa
16'''
17Mediapart
18'''
19
20import re
21from datetime import date, datetime, timezone, timedelta
22from calibre.web.feeds import feeds_from_index
23from calibre.web.feeds.news import BasicNewsRecipe
24
25
def classes(classes):
    """Build a tag-matching keyword dict keyed on CSS class names.

    ``classes`` is a single-space separated string of class names. The
    returned dict has one key, ``attrs``, whose ``class`` entry is a predicate
    that is truthy when the tested class string shares at least one name with
    the requested ones.
    """
    wanted = frozenset(classes.split(' '))

    def matches(value):
        # A missing or empty class attribute is passed back unchanged, which
        # is falsy and therefore never matches.
        if not value:
            return value
        return wanted.intersection(value.split())

    return {'attrs': {'class': matches}}
31
32
class Mediapart(BasicNewsRecipe):
    """Calibre news recipe for the French news site Mediapart.

    Articles come from two places: the RSS feed listed in ``feeds`` and the
    section listing pages scraped by ``my_parse_index``. A subscription is
    required; ``get_browser`` logs in with the configured credentials.
    """
    title = 'Mediapart'
    __author__ = 'Loïc Houpert'
    description = 'Global news in French from news site Mediapart'
    publication_type = 'newspaper'
    language = 'fr'
    needs_subscription = True
    oldest_article = 2

    use_embedded_content = False
    no_stylesheets = True

    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', **classes('author')),
        classes('news__heading__top__intro news__body__center__article')
    ]
    remove_tags = [
        classes('login-subscribe print-source_url'),
        dict(name='svg'),
    ]
    conversion_options = {'smarten_punctuation': True}

    masthead_url = "https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart_masthead.png"
    # No static cover_url: the cover is generated in default_cover() by
    # overlaying the date on the base Mediapart cover image.

    # --

    # Current time shifted by one hour from UTC, used as "French time".
    # NOTE(review): a fixed +1h offset ignores daylight saving time and the
    # value stays tagged as UTC; it is only compared with article dates that
    # are built the same way in _parse_article().
    today = datetime.now(timezone.utc) + timedelta(hours=1)
    oldest_article_date = today - timedelta(days=oldest_article)

    feeds = [
        ('La Une', 'http://www.mediapart.fr/articles/feed'),
    ]

    # The feed at 'http://www.mediapart.fr/articles/feed' only displays the
    # 10 most recent elements, so the remaining articles are indexed from
    # specific section pages in my_parse_index(). Each section page is
    # scraped by _get_articles() using the entries of dict_article_sources.

    def parse_feeds(self):
        """Return the RSS feeds followed by the scraped section feeds."""
        feeds = super(Mediapart, self).parse_feeds()
        feeds += feeds_from_index(self.my_parse_index(feeds))
        return feeds

    def my_parse_index(self, la_une):
        """Scrape the section listing pages into an index.

        ``la_une`` receives the feeds already parsed from RSS; it is not used
        by this method.

        Returns a list of ``(section title, [article dict, ...])`` tuples,
        one per section that yielded at least one article.
        """
        # 'page' is the tag name of the listing container and 'thread' the
        # tag name of each entry searched for inside that container.
        dict_article_sources = [
            {
                'type': 'Brèves',
                'webpage': 'https://www.mediapart.fr/journal/fil-dactualites',
                'separador': {
                    'page': 'ul',
                    'thread': 'li'
                }
            },
            {
                'type': 'International',
                'webpage': 'https://www.mediapart.fr/journal/international',
                'separador': {
                    'page': 'div',
                    'thread': 'div'
                }
            },
            {
                'type': 'France',
                'webpage': 'https://www.mediapart.fr/journal/france',
                'separador': {
                    'page': 'div',
                    'thread': 'div'
                }
            },
            {
                'type': 'Économie',
                'webpage': 'https://www.mediapart.fr/journal/economie',
                'separador': {
                    'page': 'div',
                    'thread': 'div'
                }
            },
            {
                'type': 'Culture',
                'webpage': 'https://www.mediapart.fr/journal/culture-idees',
                'separador': {
                    'page': 'div',
                    'thread': 'div'
                }
            },
        ]

        articles = []
        for category in dict_article_sources:
            articles += self._get_articles(
                category['type'], category['webpage'],
                category['separador']['page'],
                category['separador']['thread']
            )
        return articles

    def _get_articles(
        self, type_of_article, webpage, separador_page='ul',
        separador_thread='li'
    ):
        """Scrape one section page.

        Returns ``[(type_of_article, articles)]`` when at least one article
        was kept, otherwise an empty list.
        """
        soup = self.index_to_soup(webpage)
        page = soup.find('main', {'class': 'global-wrapper'})
        fils = None
        if page is not None:
            fils = page.find(
                separador_page, {'class': 'post-list universe-journal'}
            )
        if fils is None:
            # A layout change on one section must not abort the whole
            # download: report it and carry on with the other sections.
            self.log.warn(
                'Mediapart: article list not found on %s, section skipped'
                % webpage
            )
            return []

        webpage_article = []
        for article in fils.findAll(separador_thread):
            try:
                summary = self._parse_article(article)
            except Exception as err:
                # Best effort: one malformed entry only costs that entry,
                # but it is reported instead of being silently dropped.
                self.log.warn(
                    'Mediapart: skipping unparsable entry on %s: %r'
                    % (webpage, err)
                )
                continue
            if summary is not None:
                webpage_article.append(summary)

        return [(type_of_article, webpage_article)] if webpage_article else []

    @staticmethod
    def _link_text(article, href_pattern):
        """Return the content of the first link matching ``href_pattern``.

        Returns an empty string when the entry has no such link.
        """
        link = article.find('a', {'href': re.compile(href_pattern)})
        if link is None:
            return ''
        return link.renderContents().decode('utf-8')

    def _parse_article(self, article):
        """Turn one listing entry into an article dict.

        Returns ``None`` when the entry is not an article (no direct ``h3``
        child with a class, or a 'title-specific' heading) or when it is
        older than ``oldest_article_date``. Entries missing the link, the
        ``time`` tag or the summary paragraph raise, and the caller skips
        them.
        """
        from urllib.parse import urljoin

        title = article.find('h3', recursive=False)
        title_classes = title.get('class') if title is not None else None
        if title_classes is None or ''.join(title_classes) == 'title-specific':
            return None

        article_mot_cle = self._link_text(article, r'.*\/mot-cle\/.*')
        article_type = self._link_text(article, r'.*\/type-darticles\/.*')

        # Flatten the spans inside the title so tag_to_string() below sees
        # their text separated by a newline.
        for s in title('span'):
            s.replaceWith(s.renderContents().decode('utf-8') + "\n")
        url = title.find('a', href=True)['href']

        date_str = article.find('time', datetime=True)['datetime']
        article_date = datetime.strptime(date_str, '%Y-%m-%d')
        # Same +1h-from-UTC convention as oldest_article_date, so the two
        # values can be compared.
        article_date = article_date.replace(tzinfo=timezone.utc) + timedelta(hours=1)
        if article_date < self.oldest_article_date:
            self.log.debug('Mediapart: skipping old article %s' % url)
            return None

        # Authors are links to a profile when one exists; otherwise the same
        # class is carried by a span tag.
        journalist_class = re.compile(r'\bjournalist\b')
        authors = []
        for tag_name in ('a', 'span'):
            found = article.findAll(tag_name, {'class': journalist_class})
            if found:
                authors = [self.tag_to_string(a) for a in found]
                break

        description = article.find('p').renderContents().decode('utf-8')

        return {
            'title': self.tag_to_string(title).strip(),
            'description': description,
            'date': article_date.strftime("%a, %d %b, %Y %H:%M"),
            'author': ', '.join(authors),
            'article_type': article_type,
            'mot_cle': article_mot_cle.capitalize(),
            # urljoin gives the same result as plain concatenation for
            # root-relative links and leaves absolute links intact.
            'url': urljoin('https://www.mediapart.fr', url),
        }

    # non-locale specific date parse (strptime("%d %b %Y",s) would work with
    # french locale)
    def parse_french_date(self, date_str):
        """Parse a ``'<day> <French month name> <year>'`` string into a date.

        The month name is matched case-insensitively against the accented
        French names. An unknown month or a non-numeric day or year raises
        ``ValueError``.
        """
        french_months = [
            None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
            'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'
        ]
        date_arr = date_str.lower().split()
        return date(
            day=int(date_arr[0]),
            year=int(date_arr[2]),
            # The leading None makes the list index equal the month number.
            month=french_months.index(date_arr[1])
        )

    def get_browser(self):
        """Return a browser, logged in when credentials are configured."""

        def is_form_login(form):
            return "id" in form.attrs and form.attrs['id'] == "logFormEl"

        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.mediapart.fr/login')
            br.select_form(predicate=is_form_login)
            br['name'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def default_cover(self, cover_file):
        '''
        Create the Mediapart cover: the base cover image with the current
        date and the edition hour drawn on top of it.

        The image data is written to ``cover_file``. Returns True on success
        and False, after logging the error, on any failure.
        '''
        from PyQt5.Qt import QImage, QPainter, QPen, Qt, QFont, QRect
        from calibre.gui2 import ensure_app, load_builtin_fonts, pixmap_to_data

        def draw_centered_text(img, text, top, point_size, pen_width, italic=False):
            ' Draw text in a 744x100 band starting at y=top of img '
            p = QPainter(img)
            pen = QPen(Qt.black)
            pen.setWidth(pen_width)
            p.setPen(pen)
            font = QFont()
            font.setFamily('Times')
            if italic:
                font.setItalic(True)
            font.setPointSize(point_size)
            p.setFont(font)
            r = QRect(0, top, 744, 100)
            p.drawText(r, Qt.AlignmentFlag.AlignJustify | Qt.AlignmentFlag.AlignVCenter | Qt.AlignmentFlag.AlignCenter, text)
            p.end()

        def create_cover_mediapart():
            ' Create a cover for mediapart adding the date on Mediapart Cover'
            ensure_app()
            load_builtin_fonts()
            # Get data
            image_url = 'https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart.jpeg'
            data = self.index_to_soup(image_url, raw=True)
            # Date and hour, shifted one hour from UTC like the class-level
            # "today" value.
            today = datetime.now(timezone.utc) + timedelta(hours=1)
            french_weekday = {
                0: 'Lun', 1: 'Mar', 2: 'Mer', 3: 'Jeu', 4: 'Ven', 5: 'Sam',
                6: 'Dim'
            }
            # NOTE(review): %b is locale dependent, so the month may not be
            # rendered in French.
            date_label = french_weekday[today.weekday()] + '. ' + today.strftime('%d %b. %Y')
            # Keep the accented literal out of the strftime format string.
            edition = 'Édition de %sh' % today.strftime('%H')

            # Get Cover data
            img = QImage()
            if not img.loadFromData(data):
                # Never paint on, or write out, an image that failed to load.
                raise ValueError('Could not decode the Mediapart cover image')

            # Overlay date on cover
            draw_centered_text(img, date_label, 600, 78, 6)
            # Overlay edition information on cover
            draw_centered_text(img, edition, 720, 66, 4, italic=True)
            return pixmap_to_data(img)

        try:
            img_data = create_cover_mediapart()
            cover_file.write(img_data)
            cover_file.flush()
        except Exception:
            self.log.exception('Failed to generate default cover')
            return False
        return True
320