# -*- mode: python -*-
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2008-2018, Darko Miletic <darko.miletic at gmail.com>'
'''
pagina12.com.ar
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile


class Pagina12(BasicNewsRecipe):
    """Calibre news recipe for pagina12.com.ar.

    Articles are collected from the site's RSS feeds listed in ``feeds``.
    Each article page is downloaded by ``get_obfuscated_article`` (with
    retries) into a temporary file, and the cover is looked up on the
    printed-edition page by ``get_cover_url``.
    """

    title = 'Pagina - 12'
    __author__ = 'Darko Miletic'
    description = 'Noticias de Argentina y el resto del mundo'
    publisher = 'La Pagina S.A.'
    category = 'news, politics, Argentina'
    oldest_article = 2
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'es_AR'
    remove_empty_feeds = True
    publication_type = 'newspaper'
    auto_cleanup = False
    delay = 1
    simultaneous_downloads = 1
    # Also passed as the per-request timeout in get_obfuscated_article().
    timeout = 8
    ignore_duplicate_articles = {'url'}
    # Routes every article download through get_obfuscated_article().
    articles_are_obfuscated = True
    # References to the temporary files created by get_obfuscated_article().
    # NOTE(review): this is a class-level list, so it is shared by every
    # instance of the recipe.
    temp_files = []
    # Maximum number of download attempts per article.
    fetch_retries = 10
    extra_css = """
        body{font-family: "Open Sans", sans-serif}
        .article-date{font-size: small; margin-bottom: 1em;}
        .article-title{font-size: x-large; font-weight: bold; display: block; margin-bottom: 1em; margin-top: 1em;}
        .article-main-media{display: block; margin-bottom: 1em;}
        .article-summary{margin-top:1em; margin-bottom: 1em; display:block}
        .article-author{font-family: "Archivo Narrow",Helvetica,sans-serif; color: gray; font-size: small; margin-top:1em; margin-bottom: 1em}
        img{margin-top:1em; margin-bottom: 1em; display:block}
        .article-text p:first-letter{display: inline; font-size: xx-large; font-weight: bold}
        .article-prefix{font-family: "Archivo Narrow",Helvetica,sans-serif; font-size: small; text-transform: uppercase;}
    """

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    remove_tags = [
        dict(name=['meta', 'link']),
        dict(attrs={'class': 'article-main-media-social show-for-medium'}),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class': [
            'article-info',
            'article-titles',
            'article-main-media-header',
            'article-main-media',
            'article-text',
        ]}),
    ]

    feeds = [
        (u'Diario de hoy', u'https://www.pagina12.com.ar/rss/edicion-impresa'),
        (u'El Pais', u'https://www.pagina12.com.ar/rss/secciones/el-pais/notas'),
        (u'Economia', u'https://www.pagina12.com.ar/rss/secciones/economia/notas'),
        (u'Sociedad', u'https://www.pagina12.com.ar/rss/secciones/sociedad/notas'),
        (u'El Mundo', u'https://www.pagina12.com.ar/rss/secciones/el-mundo/notas'),
        (u'Deportes', u'https://www.pagina12.com.ar/rss/secciones/deportes/notas'),
        (u'Cultura', u'https://www.pagina12.com.ar/rss/secciones/cultura/notas'),
        (u'Universidad', u'https://www.pagina12.com.ar/rss/secciones/universidad/notas'),
        (u'Ciencia', u'https://www.pagina12.com.ar/rss/secciones/ciencia/notas'),
        (u'Psicologia', u'https://www.pagina12.com.ar/rss/secciones/psicologia/notas'),
        (u'Ajedrez', u'https://www.pagina12.com.ar/rss/secciones/ajedrez/notas'),
        (u'La Ventana', u'https://www.pagina12.com.ar/rss/secciones/la-ventana/notas'),
        (u'Dialogos', u'https://www.pagina12.com.ar/rss/secciones/dialogos/notas'),
        (u'Hoy', u'https://www.pagina12.com.ar/rss/secciones/hoy/notas'),
        (u'Plastica', u'https://www.pagina12.com.ar/rss/secciones/plastica/notas'),
        (u'Cartas de Lectores', u'https://www.pagina12.com.ar/rss/secciones/cartas-de-lectores/notas'),
        (u'Espectaculos', u'https://www.pagina12.com.ar/rss/suplementos/cultura-y-espectaculos/notas'),
        (u'Radar', u'https://www.pagina12.com.ar/rss/suplementos/radar/notas'),
        (u'Radar libros', u'https://www.pagina12.com.ar/rss/suplementos/radar-libros/notas'),
        (u'Cash', u'https://www.pagina12.com.ar/rss/suplementos/cash/notas'),
        (u'Turismo', u'https://www.pagina12.com.ar/rss/suplementos/turismo/notas'),
        (u'Libero', u'https://www.pagina12.com.ar/rss/suplementos/libero/notas'),
        (u'NO', u'https://www.pagina12.com.ar/rss/suplementos/no/notas'),
        (u'Las 12', u'https://www.pagina12.com.ar/rss/suplementos/las12/notas'),
        (u'Soy', u'https://www.pagina12.com.ar/rss/suplementos/soy/notas'),
        (u'M2', u'https://www.pagina12.com.ar/rss/suplementos/m2/notas'),
        (u'Rosario 12', u'https://www.pagina12.com.ar/rss/suplementos/rosario12/notas'),
    ]

    def get_cover_url(self):
        """Return the cover image URL from the printed-edition page.

        The page URL is built from a date formatted as ``%d-%m-%Y`` by
        calibre's ``strftime``. Inside the first ``div`` whose class list
        contains ``printed-edition-cover``, the first ``img`` whose
        ``data-src`` attribute points at the site's 700px-wide image style
        is returned.

        Returns:
            The image URL, or ``None`` when the cover ``div`` or a matching
            image is not found.
        """
        cover_prefix = 'https://images.pagina12.com.ar/styles/width700/public/'
        lurl = strftime('https://www.pagina12.com.ar/edicion-impresa/%d-%m-%Y')
        soup = self.index_to_soup(lurl)
        mydiv = soup.find(
            'div',
            {'class': lambda x: x and 'printed-edition-cover' in x.split()}
        )
        if not mydiv:
            return None
        for image in mydiv.findAll('img'):
            # Not every <img> carries data-src; skip those instead of
            # failing with KeyError.
            data_src = image.get('data-src')
            if data_src and data_src.startswith(cover_prefix):
                return data_src
        return None

    def get_obfuscated_article(self, url):
        """Download ``url`` into a persistent temporary file.

        The download is attempted up to ``fetch_retries`` times, each with
        a timeout of ``timeout`` seconds. On success the page is written
        to a temporary file with the ``_fa.html`` suffix, the file object
        is recorded in ``temp_files`` and its path is returned.

        Args:
            url: Address of the article page to download.

        Returns:
            Path of the temporary file holding the page, or ``None`` when
            every download attempt failed.

        Raises:
            Any error raised while writing the temporary file; such a
            failure is local and is not retried.
        """
        html = None
        for _attempt in range(self.fetch_retries):
            try:
                response = self.browser.open(url, timeout=self.timeout)
                html = response.read()
                break
            except Exception:
                # Catch Exception rather than everything, so that
                # KeyboardInterrupt/SystemExit can still stop the run.
                self.log.info("Retrying download...")
        if html is None:
            return None

        tfile = PersistentTemporaryFile('_fa.html')
        try:
            tfile.write(html)
        finally:
            # Close the handle even when the write fails.
            tfile.close()
        self.temp_files.append(tfile)
        return tfile.name