# -*- coding: utf-8 -*-
#
# gPodder - A media aggregator and podcast client
# Copyright (c) 2005-2018 The gPodder Team
#
# gPodder is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# gPodder is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

#
# Generic feed fetching module for aggregators
# Thomas Perl <thp@gpodder.org>; 2009-06-11
#

import logging
import urllib.parse
# This module already requires Python 3 (urllib.parse, html.parser),
# so the old Python-2 rfc822/StringIO fallback has been dropped.
from email.utils import mktime_tz  # re-exported; kept for external users
from html.parser import HTMLParser
from io import StringIO
from urllib.error import HTTPError

import podcastparser

from gpodder import util, youtube

logger = logging.getLogger(__name__)


class ExceptionWithData(Exception):
    """Base exception with additional payload"""

    def __init__(self, data):
        Exception.__init__(self)
        self.data = data

    def __str__(self):
        return '%s: %s' % (self.__class__.__name__, str(self.data))


# Temporary errors
class BadRequest(Exception):
    """The server rejected the request as malformed (HTTP 4xx family)."""
    pass


class InternalServerError(Exception):
    """The server failed to process the request (HTTP 5xx family)."""
    pass


class WifiLogin(ExceptionWithData):
    """A captive portal intercepted the request; data carries details."""
    pass


# Fatal errors
class Unsubscribe(Exception):
    """The feed is gone or forbidden; the caller should unsubscribe."""
    pass


class NotFound(Exception):
    """The feed URL returned HTTP 404."""
    pass


class InvalidFeed(Exception):
    """The response could not be parsed as a podcast feed."""
    pass


class UnknownStatusCode(ExceptionWithData):
    """An HTTP status code outside the handled ranges; data is the code."""
    pass


# Authentication error
class AuthenticationRequired(Exception):
    """The feed requires HTTP authentication (status 401).

    url, if given, points at the resource that demanded credentials.
    """

    def __init__(self, msg, url=None):
        super().__init__(msg)
        self.url = url


# Successful status codes
UPDATED_FEED, NEW_LOCATION, NOT_MODIFIED = list(range(3))


class Result:
    """Outcome of a fetch: one of the status constants plus a payload.

    feed is a parsed feed dict for UPDATED_FEED, a URL string for
    NEW_LOCATION, or None/unused for NOT_MODIFIED.
    """

    def __init__(self, status, feed=None):
        self.status = status
        self.feed = feed


class FeedAutodiscovery(HTMLParser):
    """Scan an HTML page for a <link> tag pointing at the actual feed.

    After feeding the page through, _resolved_url holds the discovered
    feed URL (or None if nothing suitable was found).
    """

    def __init__(self, base):
        HTMLParser.__init__(self)
        self._base = base  # page URL, used to resolve relative hrefs
        self._resolved_url = None

    def handle_starttag(self, tag, attrs):
        if tag == 'link':
            attrs = dict(attrs)

            is_feed = attrs.get('type', '') in Fetcher.FEED_TYPES
            is_youtube = 'youtube.com' in self._base
            is_alternate = attrs.get('rel', '') == 'alternate'
            is_canonical = attrs.get('rel', '') == 'canonical'
            url = attrs.get('href', None)
            url = urllib.parse.urljoin(self._base, url)

            if is_feed and is_alternate and url:
                # Standard RSS/Atom autodiscovery via rel="alternate"
                logger.info('Feed autodiscovery: %s', url)
                self._resolved_url = url
            elif is_youtube and is_canonical and url:
                # YouTube pages expose the channel via rel="canonical";
                # translate that into a feed URL
                url = youtube.parse_youtube_url(url)
                logger.info('Feed autodiscovery: %s', url)
                self._resolved_url = url


class Fetcher(object):
    """Fetch and parse a podcast feed, following autodiscovery and
    resolving alternate locations.

    Entry point is fetch(); everything else is implementation detail
    that subclasses may hook into (notably _resolve_url()).
    """

    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    FEED_TYPES = ('application/rss+xml',
                  'application/atom+xml',
                  'application/rdf+xml',
                  'application/xml',
                  'text/xml')

    def _resolve_url(self, url):
        """Provide additional ways of resolving an URL

        Subclasses can override this method to provide more
        ways of resolving a given URL to a feed URL. If the
        Fetcher is in "autodiscovery" mode, it will try this
        method as a last resort for coming up with a feed URL.
        """
        return None

    def _normalize_status(self, status):
        """Collapse an HTTP status code onto one representative per class.

        Based on Mark Pilgrim's "Atom aggregator behaviour" article.
        """
        if status in (200, 301, 302, 304, 400, 401, 403, 404, 410, 500):
            return status
        elif status >= 200 and status < 300:
            return 200
        elif status >= 300 and status < 400:
            return 302
        elif status >= 400 and status < 500:
            return 400
        elif status >= 500 and status < 600:
            return 500
        else:
            return status

    def _check_statuscode(self, response, feed):
        """Map the response status onto a Result or raise the matching error.

        response is anything with a getcode() method (an HTTP response
        or an HTTPError); feed is passed through into the Result.
        """
        status = self._normalize_status(response.getcode())

        if status == 200:
            return Result(UPDATED_FEED, feed)
        elif status == 301:
            return Result(NEW_LOCATION, feed)
        elif status == 302:
            return Result(UPDATED_FEED, feed)
        elif status == 304:
            return Result(NOT_MODIFIED, feed)

        if status == 400:
            raise BadRequest('bad request')
        elif status == 401:
            # NOTE(review): 'feed' is a URL string when called from the
            # HTTPError path, which matches AuthenticationRequired's url
            # parameter; on the success path it would be the feed dict,
            # but urlopen() raises for 401 so that case should not occur.
            raise AuthenticationRequired('authentication required', feed)
        elif status == 403:
            raise Unsubscribe('forbidden')
        elif status == 404:
            raise NotFound('not found')
        elif status == 410:
            raise Unsubscribe('resource is gone')
        elif status == 500:
            raise InternalServerError('internal server error')
        else:
            raise UnknownStatusCode(status)

    def _parse_feed(self, url, etag, modified, autodiscovery=True, max_episodes=0):
        """Download and parse the feed at url.

        etag/modified feed the conditional-request headers; when
        autodiscovery is true and the server returns an HTML page, try
        to discover the real feed URL inside it. max_episodes limits
        how many episodes podcastparser keeps (0 = unlimited).

        Returns a Result or raises one of this module's exceptions.
        """
        headers = {}
        if modified is not None:
            headers['If-Modified-Since'] = modified
        if etag is not None:
            headers['If-None-Match'] = etag

        if url.startswith('file://'):
            is_local = True
            url = url[len('file://'):]
            # Read the whole file into a seekable buffer so the file
            # handle is closed promptly (the old code leaked it)
            with open(url) as local_file:
                stream = StringIO(local_file.read())
        else:
            is_local = False
            try:
                stream = util.urlopen(url, headers)
            except HTTPError as e:
                # HTTPError doubles as a response object; map its status
                return self._check_statuscode(e, e.geturl())

        data = stream
        if autodiscovery and not is_local and stream.headers.get('content-type', '').startswith('text/html'):
            # Not very robust attempt to detect encoding:
            # http://stackoverflow.com/a/1495675/1072626
            charset = stream.headers.get_param('charset')
            if charset is None:
                charset = 'utf-8'  # utf-8 appears hard-coded elsewhere in this codebase

            # We use StringIO in case the stream needs to be read again
            data = StringIO(stream.read().decode(charset))
            ad = FeedAutodiscovery(url)

            ad.feed(data.getvalue())
            if ad._resolved_url and ad._resolved_url != url:
                try:
                    # Sanity-check that the discovered URL actually parses
                    # before telling the caller to move there
                    self._parse_feed(ad._resolved_url, None, None, False)
                    return Result(NEW_LOCATION, ad._resolved_url)
                except Exception as e:
                    # logger.warn is deprecated; use warning()
                    logger.warning('Feed autodiscovery failed', exc_info=True)

            # Second, try to resolve the URL
            new_url = self._resolve_url(url)
            if new_url and new_url != url:
                return Result(NEW_LOCATION, new_url)

            # Reset the stream so podcastparser can give it a go
            data.seek(0)

        try:
            # Forward max_episodes so the limit is actually applied
            # (previously it was accepted but silently ignored)
            feed = podcastparser.parse(url, data, max_episodes)
            feed['url'] = url
        except ValueError as e:
            raise InvalidFeed('Could not parse feed: {msg}'.format(msg=e))

        if is_local:
            feed['headers'] = {}
            return Result(UPDATED_FEED, feed)
        else:
            feed['headers'] = stream.headers
            return self._check_statuscode(stream, feed)

    def fetch(self, url, etag=None, modified=None, max_episodes=0):
        """Fetch the feed at url, honoring etag/modified caching headers.

        max_episodes is forwarded by keyword: the old positional call
        landed it in _parse_feed's 'autodiscovery' parameter slot, so a
        nonzero episode limit silently toggled autodiscovery instead.
        """
        return self._parse_feed(url, etag, modified,
                                max_episodes=max_episodes)