#!/usr/bin/env python3


__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Contains the logic for parsing feeds.
'''
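# Minimal usage sketch (assuming raw_bytes holds a fetched RSS/Atom document):
#   feed = feed_from_xml(raw_bytes, title='My feed')
#   for article in feed:
#       print(article.title, article.url)
# feeds_from_index() builds the same Feed objects from a recipe's
# pre-parsed index instead of raw XML.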
import time, traceback, copy, re

from calibre.utils.logging import default_log
from calibre import entity_to_unicode, strftime, force_unicode
from calibre.utils.date import dt_factory, utcnow, local_tz
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
from polyglot.builtins import string_or_bytes


class Article:

    def __init__(self, id, title, url, author, summary, published, content):
        from lxml import html
        self.downloaded = False
        self.id = id
        if not title or not isinstance(title, string_or_bytes):
            title = _('Unknown')
        title = force_unicode(title, 'utf-8')
        self._title = clean_xml_chars(title).strip()
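        # Unescape HTML entities in the title; failure here is non-fatal,
        # the title is simply left as-is.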
        try:
            self._title = re.sub(r'&(\S+?);',
                entity_to_unicode, self._title)
        except Exception:
            pass
        self._title = clean_ascii_chars(self._title)
        self.url = url
        self.toc_thumbnail = None
        self.internal_toc_entries = ()
        if author and not isinstance(author, str):
            author = author.decode('utf-8', 'replace')
        if summary and not isinstance(summary, str):
            summary = summary.decode('utf-8', 'replace')
        summary = clean_xml_chars(summary) if summary else summary
        self.summary = summary
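        # If the summary contains markup, reduce it to plain text to produce
        # text_summary below.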
        if summary and '<' in summary:
            try:
                s = html.fragment_fromstring(summary, create_parent=True)
                summary = html.tostring(s, method='text', encoding='unicode')
            except Exception:
                print('Failed to process article summary, deleting:')
                print(summary)
                traceback.print_exc()
                summary = ''
        self.text_summary = clean_ascii_chars(summary)
        self.author = author
        self.content = content
        self.date = published
        self.utctime = dt_factory(self.date, assume_utc=True, as_utc=True)
        self.localtime = self.utctime.astimezone(local_tz)
        self._formatted_date = None

    @property
    def formatted_date(self):
        if self._formatted_date is None:
            self._formatted_date = strftime(" [%a, %d %b %H:%M]",
                    t=self.localtime.timetuple())
        return self._formatted_date

    @formatted_date.setter
    def formatted_date(self, val):
        if isinstance(val, str):
            self._formatted_date = val

    @property
    def title(self):
        t = self._title
        if not isinstance(t, str) and hasattr(t, 'decode'):
            t = t.decode('utf-8', 'replace')
        return t

    @title.setter
    def title(self, val):
        self._title = clean_ascii_chars(val)

    def __repr__(self):
        return \
('''\
Title       : %s
URL         : %s
Author      : %s
Summary     : %s
Date        : %s
TOC thumb   : %s
Has content : %s
'''%(self.title, self.url, self.author, (self.summary or '')[:20]+'...',
     self.localtime.strftime('%a, %d %b, %Y %H:%M'), self.toc_thumbnail,
     bool(self.content)))

    def __str__(self):
        return repr(self)

    def is_same_as(self, other_article):
        # if self.title != getattr(other_article, 'title', False):
        #    return False
        if self.url:
            return self.url == getattr(other_article, 'url', False)
        return self.content == getattr(other_article, 'content', False)


class Feed:

    def __init__(self, get_article_url=lambda item: item.get('link', None),
            log=default_log):
        '''
        Parse a feed into articles.
        '''
        self.logger = log
        self.get_article_url = get_article_url

    def populate_from_feed(self, feed, title=None, oldest_article=7,
                           max_articles_per_feed=100):
        entries = feed.entries
        feed = feed.feed
        self.title        = feed.get('title', _('Unknown section')) if not title else title
        self.description  = feed.get('description', '')
        image             = feed.get('image', {})
        self.image_url    = image.get('href', None)
        self.image_width  = image.get('width', 88)
        self.image_height = image.get('height', 31)
        self.image_alt    = image.get('title', '')

        self.articles = []
        self.id_counter = 0
        self.added_articles = []

        self.oldest_article = oldest_article

        for item in entries:
            if len(self.articles) >= max_articles_per_feed:
                break
            self.parse_article(item)

    def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
                           max_articles_per_feed=100):
        self.title      = str(title if title else _('Unknown feed'))
        self.description = ''
        self.image_url  = None
        self.articles   = []
        self.added_articles = []

        self.oldest_article = oldest_article
        self.id_counter = 0

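        # Mirror parse_article(): skip duplicate ids and drop articles
        # older than oldest_article days.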
        for item in articles:
            if len(self.articles) >= max_articles_per_feed:
                break
            self.id_counter += 1
            id = item.get('id', None)
            if not id:
                id = 'internal id#%s'%self.id_counter
            if id in self.added_articles:
                continue
            self.added_articles.append(id)
            published   = time.gmtime(item.get('timestamp', time.time()))
            title       = item.get('title', _('Untitled article'))
            link        = item.get('url', None)
            description = item.get('description', '')
            content     = item.get('content', '')
            author      = item.get('author', '')
            article = Article(id, title, link, author, description, published, content)
            delta = utcnow() - article.utctime
            if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
                self.articles.append(article)
            else:
                t = strftime('%a, %d %b, %Y %H:%M', article.localtime.timetuple())
                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%
                        (title, t, self.title))
            d = item.get('date', '')
            article.formatted_date = d

    def parse_article(self, item):
        self.id_counter += 1
        id = item.get('id', None)
        if not id:
            id = 'internal id#%s'%self.id_counter
        if id in self.added_articles:
            return
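        # Prefer feedparser's pre-parsed date fields; otherwise fall back to
        # parsing the raw date strings, and finally to the current time.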
        published = None
        for date_field in ('date_parsed', 'published_parsed',
                           'updated_parsed'):
            published = item.get(date_field, None)
            if published is not None:
                break
        if not published:
            from dateutil.parser import parse
            for date_field in ('date', 'published', 'updated'):
                try:
                    published = parse(item[date_field]).timetuple()
                except Exception:
                    continue
                break
        if not published:
            published = time.gmtime()
        self.added_articles.append(id)

        title = item.get('title', _('Untitled article'))
        if title.startswith('<'):
            title = re.sub(r'<.+?>', '', title)
        try:
            link = self.get_article_url(item)
        except Exception:
            self.logger.warning('Failed to get link for %s'%title)
            self.logger.debug(traceback.format_exc())
            link = None

        description = item.get('summary', None)
        author = item.get('author', None)

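        # Full content entries, when present, take precedence over the
        # summary; normalize everything to unicode before joining.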
        content = [i.value for i in item.get('content', []) if i.value]
        content = [i if isinstance(i, str) else i.decode('utf-8', 'replace')
                for i in content]
        content = '\n'.join(content)
        if not content.strip():
            content = None
        if not link and not content:
            return
        article = Article(id, title, link, author, description, published, content)
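        # oldest_article is measured in days; keep the article only if its
        # age in seconds falls within that window.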
        delta = utcnow() - article.utctime
        if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
            self.articles.append(article)
        else:
            try:
                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%
                                  (title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
            except UnicodeDecodeError:
                if not isinstance(title, str):
                    title = title.decode('utf-8', 'replace')
                self.logger.debug('Skipping article %s as it is too old'%title)

    def reverse(self):
        self.articles.reverse()

    def __iter__(self):
        return iter(self.articles)

    def __len__(self):
        return len(self.articles)

    def __repr__(self):
        res = [('%20s\n'%'').replace(' ', '_')+repr(art) for art in self]
        return '\n'+'\n'.join(res)+'\n'

    def __str__(self):
        return repr(self)

    def has_embedded_content(self):
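        # Heuristic: assume the feed embeds full articles when the average
        # content/summary length across articles exceeds 2000 characters.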
        length = 0
        for a in self:
            if a.content or a.summary:
                length += max(len(a.content if a.content else ''),
                              len(a.summary if a.summary else ''))

        return length > 2000 * len(self)

    def has_article(self, article):
        for a in self:
            if a.is_same_as(article):
                return True
        return False

    def find(self, article):
        for i, a in enumerate(self):
            if a.is_same_as(article):
                return i
        return -1

    def remove(self, article):
        i = self.find(article)
        if i > -1:
            self.articles[i:i+1] = []

    def remove_article(self, article):
        try:
            self.articles.remove(article)
        except ValueError:
            pass


class FeedCollection(list):
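    '''
    A list of feeds from which cross-feed duplicate articles have been
    removed. The duplicates are remembered so that restore_duplicates() can
    re-add them as stubs pointing at the retained copy.
    '''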

    def __init__(self, feeds):
        list.__init__(self, [f for f in feeds if len(f.articles) > 0])
        found_articles = set()
        duplicates = set()

        def in_set(s, a):
            for x in s:
                if a.is_same_as(x):
                    return x
            return None

        print('#feeds', len(self))
        print(list(map(len, self)))
        for f in self:
            dups = []
            for a in f:
                first = in_set(found_articles, a)
                if first is not None:
                    dups.append(a)
                    duplicates.add((first, f))
                else:
                    found_articles.add(a)
            for x in dups:
                f.articles.remove(x)

        self.duplicates = duplicates
        print(len(duplicates))
        print(list(map(len, self)))

    def find_article(self, article):
        for j, f in enumerate(self):
            for i, a in enumerate(f):
                if a is article:
                    return (j, i)

    def restore_duplicates(self):
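        # Re-attach each removed duplicate to its feed as a copy whose URL
        # points at the retained copy of the article.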
        temp = []
        for article, feed in self.duplicates:
            art = copy.deepcopy(article)
            j, i = self.find_article(article)
            art.url = '../feed_%d/article_%d/index.html'%(j, i)
            temp.append((feed, art))
        for feed, art in temp:
            feed.articles.append(art)


def feed_from_xml(raw_xml, title=None, oldest_article=7,
                  max_articles_per_feed=100,
                  get_article_url=lambda item: item.get('link', None),
                  log=default_log):
    from calibre.web.feeds.feedparser import parse
    # Handle unclosed escaped entities; they trip up feedparser, and HBR,
    # for one, generates them.
    raw_xml = re.sub(br'(&amp;#\d+)([^0-9;])', br'\1;\2', raw_xml)
    feed = parse(raw_xml)
    pfeed = Feed(get_article_url=get_article_url, log=log)
    pfeed.populate_from_feed(feed, title=title,
                             oldest_article=oldest_article,
                             max_articles_per_feed=max_articles_per_feed)
    return pfeed


def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
        log=default_log):
    '''
    @param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
    @return: A list of L{Feed} objects.
    @rtype: list
    '''
    feeds = []
    for title, articles in index:
        pfeed = Feed(log=log)
        pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
                                           max_articles_per_feed=max_articles_per_feed)
        feeds.append(pfeed)
    return feeds