# -*- coding: utf-8 -*-
#
# gPodder - A media aggregator and podcast client
# Copyright (c) 2005-2018 The gPodder Team
#
# gPodder is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# gPodder is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

#
# Generic feed fetching module for aggregators
# Thomas Perl <thp@gpodder.org>; 2009-06-11
#
import logging
import urllib.parse
from email.utils import mktime_tz
from html.parser import HTMLParser
from io import StringIO
from urllib.error import HTTPError

import podcastparser

from gpodder import util, youtube

logger = logging.getLogger(__name__)


class ExceptionWithData(Exception):
    """Base exception with additional payload."""

    def __init__(self, data):
        super().__init__()
        self.data = data

    def __str__(self):
        return '%s: %s' % (self.__class__.__name__, str(self.data))


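# Example (illustrative): a subclass's payload ends up in the error
# message, e.g. str(UnknownStatusCode(418)) == 'UnknownStatusCode: 418'.
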
# Temporary errors
class BadRequest(Exception): pass


class InternalServerError(Exception): pass


class WifiLogin(ExceptionWithData): pass


# Fatal errors
class Unsubscribe(Exception): pass


class NotFound(Exception): pass


class InvalidFeed(Exception): pass


class UnknownStatusCode(ExceptionWithData): pass


# Authentication error
class AuthenticationRequired(Exception):
    def __init__(self, msg, url=None):
        super().__init__(msg)
        self.url = url


# Successful status codes
UPDATED_FEED, NEW_LOCATION, NOT_MODIFIED = range(3)


class Result:
    def __init__(self, status, feed=None):
        self.status = status
        self.feed = feed


class FeedAutodiscovery(HTMLParser):
    """Parse an HTML page and try to discover a feed link in it."""

    def __init__(self, base):
        super().__init__()
        self._base = base
        self._resolved_url = None

    def handle_starttag(self, tag, attrs):
        if tag == 'link':
            attrs = dict(attrs)

            is_feed = attrs.get('type', '') in Fetcher.FEED_TYPES
            is_youtube = 'youtube.com' in self._base
            is_alternate = attrs.get('rel', '') == 'alternate'
            is_canonical = attrs.get('rel', '') == 'canonical'
            url = attrs.get('href', None)
            if url is not None:
                # Resolve relative links against the page URL
                url = urllib.parse.urljoin(self._base, url)

            if is_feed and is_alternate and url:
                logger.info('Feed autodiscovery: %s', url)
                self._resolved_url = url
            elif is_youtube and is_canonical and url:
                url = youtube.parse_youtube_url(url)
                logger.info('Feed autodiscovery: %s', url)
                self._resolved_url = url


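# Example (illustrative sketch; the HTML snippet and URLs are made up):
#
#   ad = FeedAutodiscovery('http://example.com/podcast')
#   ad.feed('<link rel="alternate" type="application/rss+xml" href="/feed.xml">')
#   # ad._resolved_url == 'http://example.com/feed.xml'
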
class Fetcher:
    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    FEED_TYPES = ('application/rss+xml',
                  'application/atom+xml',
                  'application/rdf+xml',
                  'application/xml',
                  'text/xml')

    def _resolve_url(self, url):
        """Provide additional ways of resolving a URL

        Subclasses can override this method to provide more
        ways of resolving a given URL to a feed URL. If the
        Fetcher is in "autodiscovery" mode, it will try this
        method as a last resort for coming up with a feed URL.
        """
        return None

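    # Example override (sketch; the URL scheme and mapping below are
    # hypothetical):
    #
    #   class MyFetcher(Fetcher):
    #       def _resolve_url(self, url):
    #           if url.startswith('example://'):
    #               return 'http://example.com/feeds/%s.xml' % url[len('example://'):]
    #           return None
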
    def _normalize_status(self, status):
        # Based on Mark Pilgrim's "Atom aggregator behaviour" article
        if status in (200, 301, 302, 304, 400, 401, 403, 404, 410, 500):
            return status
        elif 200 <= status < 300:
            return 200
        elif 300 <= status < 400:
            return 302
        elif 400 <= status < 500:
            return 400
        elif 500 <= status < 600:
            return 500
        else:
            return status

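    # Example: with the normalization above, a 307 Temporary Redirect is
    # handled like a 302 and a 503 Service Unavailable like a 500.
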
    def _check_statuscode(self, response, feed):
        status = self._normalize_status(response.getcode())

        if status == 200:
            return Result(UPDATED_FEED, feed)
        elif status == 301:
            return Result(NEW_LOCATION, feed)
        elif status == 302:
            return Result(UPDATED_FEED, feed)
        elif status == 304:
            return Result(NOT_MODIFIED, feed)

        if status == 400:
            raise BadRequest('bad request')
        elif status == 401:
            raise AuthenticationRequired('authentication required', feed)
        elif status == 403:
            raise Unsubscribe('forbidden')
        elif status == 404:
            raise NotFound('not found')
        elif status == 410:
            raise Unsubscribe('resource is gone')
        elif status == 500:
            raise InternalServerError('internal server error')
        else:
            raise UnknownStatusCode(status)

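    # Example: a 502 Bad Gateway normalizes to 500 and raises
    # InternalServerError (a temporary error), while a 410 Gone raises
    # Unsubscribe (a fatal error).
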
    def _parse_feed(self, url, etag, modified, autodiscovery=True, max_episodes=0):
        headers = {}
        if modified is not None:
            headers['If-Modified-Since'] = modified
        if etag is not None:
            headers['If-None-Match'] = etag

        if url.startswith('file://'):
            is_local = True
            url = url[len('file://'):]
            stream = open(url)
        else:
            is_local = False
            try:
                stream = util.urlopen(url, headers)
            except HTTPError as e:
                return self._check_statuscode(e, e.geturl())

        data = stream
        if autodiscovery and not is_local and stream.headers.get('content-type', '').startswith('text/html'):
            # Not very robust attempt to detect encoding: http://stackoverflow.com/a/1495675/1072626
            charset = stream.headers.get_param('charset')
            if charset is None:
                charset = 'utf-8'  # utf-8 appears hard-coded elsewhere in this codebase

            # We use StringIO in case the stream needs to be read again
            data = StringIO(stream.read().decode(charset))
            ad = FeedAutodiscovery(url)

            # First, try to find a feed link in the HTML page itself
            ad.feed(data.getvalue())
            if ad._resolved_url and ad._resolved_url != url:
                try:
                    self._parse_feed(ad._resolved_url, None, None, autodiscovery=False)
                    return Result(NEW_LOCATION, ad._resolved_url)
                except Exception:
                    logger.warning('Feed autodiscovery failed', exc_info=True)

            # Second, try to resolve the URL
            new_url = self._resolve_url(url)
            if new_url and new_url != url:
                return Result(NEW_LOCATION, new_url)

            # Reset the stream so podcastparser can give it a go
            data.seek(0)

        try:
            feed = podcastparser.parse(url, data, max_episodes)
            feed['url'] = url
        except ValueError as e:
            raise InvalidFeed('Could not parse feed: {msg}'.format(msg=e))

        if is_local:
            feed['headers'] = {}
            return Result(UPDATED_FEED, feed)
        else:
            feed['headers'] = stream.headers
            return self._check_statuscode(stream, feed)

    def fetch(self, url, etag=None, modified=None, max_episodes=0):
        # Pass max_episodes by keyword so it is not mistaken for the
        # "autodiscovery" parameter of _parse_feed()
        return self._parse_feed(url, etag, modified, max_episodes=max_episodes)
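

if __name__ == '__main__':
    # Minimal manual test (sketch): fetch a feed URL given on the command
    # line and print the result. Assumes gPodder's modules are importable
    # and that the URL points at a valid feed.
    import sys

    fetcher = Fetcher()
    result = fetcher.fetch(sys.argv[1])
    if result.status == UPDATED_FEED:
        print(result.feed.get('title', '(no title)'))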