#!/usr/local/bin/python3.8


__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Contains the logic for parsing feeds.
'''
import time, traceback, copy, re

from calibre.utils.logging import default_log
from calibre import entity_to_unicode, strftime, force_unicode
from calibre.utils.date import dt_factory, utcnow, local_tz
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
from polyglot.builtins import string_or_bytes


class Article:
    '''
    A single article belonging to a feed.

    Holds the cleaned up title, the URL, author, summary (both the raw
    version and a text-only version), the content and the publication
    date (as UTC and as local time).
    '''

    def __init__(self, id, title, url, author, summary, published, content):
        '''
        :param id: Identifier of the article, stored unchanged.
        :param title: Title as str or bytes. Anything else (or an empty
                      value) is replaced by the translated 'Unknown'.
        :param url: URL of the article, may be None.
        :param author: Author as str or bytes (bytes are decoded as UTF-8).
        :param summary: Summary as str or bytes, may contain markup.
        :param published: Publication date, handed to ``dt_factory`` with
                          ``assume_utc=True``. The callers in this module
                          pass time tuples.
        :param content: The content of the article, may be None.
        '''
        from lxml import html
        self.downloaded = False
        self.id = id
        if not title or not isinstance(title, string_or_bytes):
            title = _('Unknown')
        title = force_unicode(title, 'utf-8')
        self._title = clean_xml_chars(title).strip()
        try:
            # Replace entities such as &amp; with the characters they stand for
            self._title = re.sub(r'&(\S+?);',
                    entity_to_unicode, self._title)
        except Exception:
            # Best effort only: keep the title with its entities unresolved
            pass
        self._title = clean_ascii_chars(self._title)
        self.url = url
        self.toc_thumbnail = None
        self.internal_toc_entries = ()
        if author and not isinstance(author, str):
            author = author.decode('utf-8', 'replace')
        if summary and not isinstance(summary, str):
            summary = summary.decode('utf-8', 'replace')
        summary = clean_xml_chars(summary) if summary else summary
        # self.summary keeps any markup, self.text_summary is text only
        self.summary = summary
        if summary and '<' in summary:
            try:
                s = html.fragment_fromstring(summary, create_parent=True)
                summary = html.tostring(s, method='text', encoding='unicode')
            except Exception:
                print('Failed to process article summary, deleting:')
                print(summary.encode('utf-8'))
                traceback.print_exc()
                summary = ''
        self.text_summary = clean_ascii_chars(summary)
        self.author = author
        self.content = content
        self.date = published
        self.utctime = dt_factory(self.date, assume_utc=True, as_utc=True)
        self.localtime = self.utctime.astimezone(local_tz)
        self._formatted_date = None

    @property
    def formatted_date(self):
        '''
        The date of the article as a display string. Computed lazily from
        the local time unless a string was explicitly assigned.
        '''
        if self._formatted_date is None:
            self._formatted_date = strftime(" [%a, %d %b %H:%M]",
                    t=self.localtime.timetuple())
        return self._formatted_date

    @formatted_date.setter
    def formatted_date(self, val):
        # Values that are not str are silently ignored
        if isinstance(val, str):
            self._formatted_date = val

    @property
    def title(self):
        '''The title of the article, always returned as str.'''
        t = self._title
        if not isinstance(t, str) and hasattr(t, 'decode'):
            t = t.decode('utf-8', 'replace')
        return t

    @title.setter
    def title(self, val):
        self._title = clean_ascii_chars(val)

    def __repr__(self):
        # The summary can be None (for example when the feed item has no
        # summary), so fall back to an empty string before slicing.
        summary = self.summary or ''
        return \
('''\
Title       : %s
URL         : %s
Author      : %s
Summary     : %s
Date        : %s
TOC thumb   : %s
Has content : %s
'''%(self.title, self.url, self.author, summary[:20]+'...',
     self.localtime.strftime('%a, %d %b, %Y %H:%M'), self.toc_thumbnail,
     bool(self.content)))

    def __str__(self):
        return repr(self)

    def is_same_as(self, other_article):
        '''
        Return True if other_article refers to the same article as this one.

        Articles are compared by URL if this article has one, otherwise by
        content. Titles are deliberately not compared.
        '''
        if self.url:
            return self.url == getattr(other_article, 'url', False)
        return self.content == getattr(other_article, 'content', False)


class Feed:
    '''
    An ordered collection of :class:`Article` objects. Iterating over a
    feed yields its articles and ``len()`` returns the number of articles.
    '''

    def __init__(self, get_article_url=lambda item: item.get('link', None),
            log=default_log):
        '''
        Parse a feed into articles.

        :param get_article_url: Callable that is passed a feed item and
                                returns the URL of the article.
        :param log: Logger used for warnings and debug output.
        '''
        self.logger = log
        self.get_article_url = get_article_url

    def populate_from_feed(self, feed, title=None, oldest_article=7,
                           max_articles_per_feed=100):
        '''
        Populate this feed from a parsed feed object.

        :param feed: Object with an ``entries`` attribute (the items) and a
                     ``feed`` attribute (a mapping with the feed metadata).
        :param title: If specified, used instead of the title from the feed.
        :param oldest_article: Maximum age, in days, of an article.
        :param max_articles_per_feed: Stop once this many articles have
                                      been accepted.
        '''
        entries = feed.entries
        feed = feed.feed
        self.title = feed.get('title', _('Unknown section')) if not title else title
        self.description = feed.get('description', '')
        image = feed.get('image', {})
        self.image_url = image.get('href', None)
        self.image_width = image.get('width', 88)
        self.image_height = image.get('height', 31)
        self.image_alt = image.get('title', '')

        self.articles = []
        self.id_counter = 0
        self.added_articles = []

        self.oldest_article = oldest_article

        for item in entries:
            if len(self.articles) >= max_articles_per_feed:
                break
            self.parse_article(item)

    def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
                           max_articles_per_feed=100):
        '''
        Populate this feed from a list of already parsed articles.

        :param title: Title of the feed.
        :param articles: Iterable of mappings. The keys read are ``id``,
                         ``timestamp``, ``title``, ``url``, ``description``,
                         ``content``, ``author`` and ``date``.
        :param oldest_article: Maximum age, in days, of an article.
        :param max_articles_per_feed: Stop once this many articles have
                                      been accepted.
        '''
        self.title = str(title if title else _('Unknown feed'))
        self.description = ''
        self.image_url = None
        self.articles = []
        self.added_articles = []

        self.oldest_article = oldest_article
        self.id_counter = 0

        for item in articles:
            if len(self.articles) >= max_articles_per_feed:
                break
            self.id_counter += 1
            article_id = item.get('id', None)
            if not article_id:
                article_id = 'internal id#%s'%self.id_counter
            if article_id in self.added_articles:
                # Skip only the duplicate. Returning here would silently
                # drop all the articles that come after it.
                continue
            self.added_articles.append(article_id)
            published = time.gmtime(item.get('timestamp', time.time()))
            title = item.get('title', _('Untitled article'))
            link = item.get('url', None)
            description = item.get('description', '')
            content = item.get('content', '')
            author = item.get('author', '')
            article = Article(article_id, title, link, author, description, published, content)
            delta = utcnow() - article.utctime
            if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
                self.articles.append(article)
            else:
                t = strftime('%a, %d %b, %Y %H:%M', article.localtime.timetuple())
                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%
                        (title, t, self.title))
            # A str 'date' overrides the automatically formatted date
            d = item.get('date', '')
            article.formatted_date = d

    def parse_article(self, item):
        '''
        Create an :class:`Article` from a single feed item and append it to
        this feed, unless its id was already seen, it has neither a link
        nor content, or it is older than ``self.oldest_article`` days.
        '''
        self.id_counter += 1
        article_id = item.get('id', None)
        if not article_id:
            article_id = 'internal id#%s'%self.id_counter
        if article_id in self.added_articles:
            return
        published = None
        # Prefer the already parsed dates
        for date_field in ('date_parsed', 'published_parsed',
                'updated_parsed'):
            published = item.get(date_field, None)
            if published is not None:
                break
        if not published:
            # Fall back to parsing the raw date strings
            from dateutil.parser import parse
            for date_field in ('date', 'published', 'updated'):
                try:
                    published = parse(item[date_field]).timetuple()
                except Exception:
                    continue
                break
        if not published:
            # No usable date at all, treat the article as published now
            published = time.gmtime()
        self.added_articles.append(article_id)

        title = item.get('title', _('Untitled article'))
        if title.startswith('<'):
            title = re.sub(r'<.+?>', '', title)
        try:
            link = self.get_article_url(item)
        except Exception:
            self.logger.warning('Failed to get link for %s'%title)
            self.logger.debug(traceback.format_exc())
            link = None

        description = item.get('summary', None)
        author = item.get('author', None)

        content = [i.value for i in item.get('content', []) if i.value]
        content = [i if isinstance(i, str) else i.decode('utf-8', 'replace')
                for i in content]
        content = '\n'.join(content)
        if not content.strip():
            content = None
        if not link and not content:
            return
        article = Article(article_id, title, link, author, description, published, content)
        delta = utcnow() - article.utctime
        if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
            self.articles.append(article)
        else:
            try:
                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%
                                  (title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
            except UnicodeDecodeError:
                if not isinstance(title, str):
                    title = title.decode('utf-8', 'replace')
                self.logger.debug('Skipping article %s as it is too old'%title)

    def reverse(self):
        '''Reverse the order of the articles in place.'''
        self.articles.reverse()

    def __iter__(self):
        return iter(self.articles)

    def __len__(self):
        return len(self.articles)

    def __repr__(self):
        res = [('%20s\n'%'').replace(' ', '_')+repr(art) for art in self]

        return '\n'+'\n'.join(res)+'\n'

    def __str__(self):
        return repr(self)

    def has_embedded_content(self):
        '''
        Return True if the average length of the articles (the longer of
        content and summary for each article) is greater than 2000.
        '''
        length = 0
        for a in self:
            if a.content or a.summary:
                length += max(len(a.content if a.content else ''),
                              len(a.summary if a.summary else ''))

        return length > 2000 * len(self)

    def has_article(self, article):
        '''Return True if an article that is the same as article is present.'''
        for a in self:
            if a.is_same_as(article):
                return True
        return False

    def find(self, article):
        '''
        Return the index of the first article that is the same as article
        (see :meth:`Article.is_same_as`) or -1 if there is none.
        '''
        for i, a in enumerate(self):
            if a.is_same_as(article):
                return i
        return -1

    def remove(self, article):
        '''
        Remove the first article that is the same as article (see
        :meth:`Article.is_same_as`). Does nothing if there is none.
        '''
        # Feed has no index() method, find() is the lookup that returns -1
        # when nothing matches.
        i = self.find(article)
        if i > -1:
            self.articles[i:i+1] = []

    def remove_article(self, article):
        '''
        Remove article using list.remove() semantics. Does nothing if it is
        not present.
        '''
        try:
            self.articles.remove(article)
        except ValueError:
            pass


class FeedCollection(list):
    '''
    A list of non-empty feeds from which articles that duplicate an article
    seen earlier have been removed. The removed duplicates are remembered
    in ``self.duplicates`` and can be added back, as links to the first
    occurrence, with :meth:`restore_duplicates`.
    '''

    def __init__(self, feeds):
        list.__init__(self, [f for f in feeds if len(f.articles) > 0])
        found_articles = set()
        duplicates = set()

        def in_set(s, a):
            # Return the member of s that is the same article as a, if any
            for x in s:
                if a.is_same_as(x):
                    return x
            return None

        print('#feeds', len(self))
        print(list(map(len, self)))
        for f in self:
            dups = []
            for a in f:
                first = in_set(found_articles, a)
                if first is not None:
                    dups.append(a)
                    # Remember the first occurrence and the feed that the
                    # duplicate was found in
                    duplicates.add((first, f))
                else:
                    found_articles.add(a)
            # Remove only after iterating, to avoid mutating f.articles
            # while looping over it
            for x in dups:
                f.articles.remove(x)

        self.duplicates = duplicates
        print(len(duplicates))
        print(list(map(len, self)))
        # raise

    def find_article(self, article):
        '''
        Return ``(feed_index, article_index)`` for article, compared by
        identity. Returns None if the article is not in any feed.
        '''
        for j, f in enumerate(self):
            for i, a in enumerate(f):
                if a is article:
                    return (j, i)

    def restore_duplicates(self):
        '''
        For every recorded duplicate, append a copy of the first occurrence
        to the feed the duplicate was removed from, with the URL of the
        copy pointing to the location of the first occurrence.
        '''
        temp = []
        for article, feed in self.duplicates:
            art = copy.deepcopy(article)
            j, i = self.find_article(article)
            art.url = '../feed_%d/article_%d/index.html'%(j, i)
            temp.append((feed, art))
        # Append in a second pass so that the indices computed above are
        # not affected by the articles being added
        for feed, art in temp:
            feed.articles.append(art)


def feed_from_xml(raw_xml, title=None, oldest_article=7,
                  max_articles_per_feed=100,
                  get_article_url=lambda item: item.get('link', None),
                  log=default_log):
    '''
    Parse raw feed XML into a :class:`Feed`.

    :param raw_xml: The feed as bytes (the entity fix below uses a bytes
                    pattern).
    :param title: If specified, used instead of the title from the feed.
    :param oldest_article: Maximum age, in days, of an article.
    :param max_articles_per_feed: Maximum number of articles to accept.
    :param get_article_url: Callable that is passed a feed item and returns
                            the URL of the article.
    :param log: Logger passed on to the :class:`Feed`.
    :return: The populated :class:`Feed`.
    '''
    from calibre.web.feeds.feedparser import parse
    # Handle unclosed escaped entities. They trip up feedparser and HBR for one
    # generates them
    raw_xml = re.sub(br'(&amp;#\d+)([^0-9;])', br'\1;\2', raw_xml)
    feed = parse(raw_xml)
    pfeed = Feed(get_article_url=get_article_url, log=log)
    pfeed.populate_from_feed(feed, title=title,
                            oldest_article=oldest_article,
                            max_articles_per_feed=max_articles_per_feed)
    return pfeed


def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
        log=default_log):
    '''
    @param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
                  It is iterated as (title, articles) pairs.
    @param oldest_article: Maximum age, in days, of an article.
    @param max_articles_per_feed: Maximum number of articles per feed.
    @param log: Logger passed on to every L{Feed}.
    @return: A list of L{Feed} objects.
    @rtype: list
    '''
    feeds = []
    for title, articles in index:
        pfeed = Feed(log=log)
        pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
                                       max_articles_per_feed=max_articles_per_feed)
        feeds.append(pfeed)
    return feeds