# -*- coding: utf-8 -*-
# Manage Youtube subscriptions using youtube-dl (https://github.com/ytdl-org/youtube-dl)
# Requirements: youtube-dl module (pip install youtube_dl)
# (c) 2019-08-17 Eric Le Lay <elelay.fr:contact>
# Released under the same license terms as gPodder itself.

import logging
import os
import re
import sys
import time

import youtube_dl
from youtube_dl.utils import DownloadError, ExtractorError, sanitize_url

import gpodder
from gpodder import download, feedcore, model, registry, youtube
from gpodder.util import mimetype_from_extension, remove_html_tags

_ = gpodder.gettext


logger = logging.getLogger(__name__)


__title__ = 'Youtube-dl'
__description__ = _('Manage Youtube subscriptions using youtube-dl (pip install youtube_dl)')
__only_for__ = 'gtk, cli'
__authors__ = 'Eric Le Lay <elelay.fr:contact>'
__doc__ = 'https://gpodder.github.io/docs/extensions/youtubedl.html'

DefaultConfig = {
    # youtube-dl downloads and parses each video page to get information about it, which is very slow.
    # Set to False to fall back to the fast but limited (only 15 episodes) gpodder code
    'manage_channel': True,
    # If for some reason youtube-dl download doesn't work for you, you can fall back to gpodder code.
    # Set to False to fall back to default gpodder code (less available formats).
    'manage_downloads': True,
}


# youtube feed still preprocessed by youtube.py (compat)
CHANNEL_RE = re.compile(r'''https://www.youtube.com/feeds/videos.xml\?channel_id=(.+)''')
USER_RE = re.compile(r'''https://www.youtube.com/feeds/videos.xml\?user=(.+)''')
PLAYLIST_RE = re.compile(r'''https://www.youtube.com/feeds/videos.xml\?playlist_id=(.+)''')


def youtube_parsedate(s):
    """Parse a string into a unix timestamp

    Only strings provided by Youtube-dl API are
    parsed with this function (20170920).

    :param s: date formatted as ``%Y%m%d``, or a falsy value
    :return: the timestamp computed by time.mktime(), or 0 when s is falsy
    """
    if s:
        return time.mktime(time.strptime(s, "%Y%m%d"))
    return 0


def video_guid(video_id):
    """
    generate same guid as youtube

    :param video_id: the video id
    :return str: ``yt:video:<video_id>``
    """
    return 'yt:video:{}'.format(video_id)


class YoutubeCustomDownload(download.CustomDownload):
    """
    Represents the download of a single episode using youtube-dl.

    Actual youtube-dl interaction via gPodderYoutubeDL.
    """
    def __init__(self, ytdl, url, episode):
        """
        :param ytdl: object whose fetch_video() performs the download (a gPodderYoutubeDL)
        :param url: url handed to fetch_video()
        :param episode: episode whose total_time is updated once the duration is known
        """
        self._ytdl = ytdl
        self._url = url
        self._reporthook = None
        # bytes accounted for by 'finished' events received so far
        self._prev_dl_bytes = 0
        self._episode = episode

    def retrieve_resume(self, tempname, reporthook=None):
        """
        called by download.DownloadTask to perform the download.

        :param tempname: path of the file to download to
        :param reporthook: optional callable invoked with 3 positional
                           arguments by _my_hook() to report progress
        :return: a (headers, url) tuple; headers holds a 'content-type' entry
                 when a mime type could be derived from the extension reported
                 by youtube-dl
        """
        self._reporthook = reporthook
        # outtmpl: use given tempname by DownloadTask
        # (escape % and $ because outtmpl used as a string template by youtube-dl)
        outtmpl = tempname.replace('%', '%%').replace('$', '$$')
        res = self._ytdl.fetch_video(self._url, outtmpl, self._my_hook)
        if outtmpl != tempname:
            if 'ext' in res and os.path.isfile(outtmpl + '.{}'.format(res['ext'])):
                os.rename(outtmpl + '.{}'.format(res['ext']), tempname)
            else:
                os.rename(outtmpl, tempname)
        if 'duration' in res and res['duration']:
            self._episode.total_time = res['duration']
        headers = {}
        # youtube-dl doesn't return a content-type but an extension
        if 'ext' in res:
            dot_ext = '.{}'.format(res['ext'])
            # See #673 when merging multiple formats, the extension is appended to the tempname
            # by YoutubeDL resulting in empty .partial file + .partial.mp4 exists
            # and #796 .mkv is chosen by ytdl sometimes
            tempstat = os.stat(tempname)
            if not tempstat.st_size:
                for try_ext in (dot_ext, ".mp4", ".m4a", ".webm", ".mkv"):
                    tempname_with_ext = tempname + try_ext
                    if os.path.isfile(tempname_with_ext):
                        logger.debug('Youtubedl downloaded to "%s" instead of "%s", moving',
                                     os.path.basename(tempname_with_ext),
                                     os.path.basename(tempname))
                        os.remove(tempname)
                        os.rename(tempname_with_ext, tempname)
                        dot_ext = try_ext
                        break
            ext_filetype = mimetype_from_extension(dot_ext)
            if ext_filetype:
                headers['content-type'] = ext_filetype
        return headers, res.get('url', self._url)

    def _my_hook(self, d):
        """
        Progress hook given to youtube-dl: forwards progress to the reporthook.

        Byte counts of 'finished' events are accumulated in _prev_dl_bytes so
        that the progress reported for a later file starts from that total.

        :param d: progress dict; only 'status' is required, the byte counters
                  are optional and default to 0 when missing
        """
        status = d['status']
        if status == 'downloading':
            if self._reporthook:
                # the counters are optional in the hook dict: never index them directly
                dl_bytes = d.get('downloaded_bytes') or 0
                total_bytes = d.get('total_bytes') or d.get('total_bytes_estimate') or 0
                self._reporthook(self._prev_dl_bytes + dl_bytes,
                                 1,
                                 self._prev_dl_bytes + total_bytes)
        elif status == 'finished':
            # 'downloaded_bytes' may be missing from a 'finished' event
            # (e.g. when only 'total_bytes' is reported): fall back to it
            dl_bytes = d.get('downloaded_bytes') or d.get('total_bytes') or 0
            self._prev_dl_bytes += dl_bytes
            if self._reporthook:
                self._reporthook(self._prev_dl_bytes, 1, self._prev_dl_bytes)
        elif status == 'error':
            logger.error('download hook error: %r', d)
        else:
            logger.debug('unknown download hook status: %r', d)


class YoutubeFeed(model.Feed):
    """
    Represents the youtube feed for model.PodcastChannel
    """
    def __init__(self, url, cover_url, description, max_episodes, ie_result, downloader):
        """
        :param url: feed url, used as last resort title
        :param cover_url: value returned by get_cover_url()
        :param description: value returned by get_description()
        :param max_episodes: maximum number of entries to keep, falsy for no limit
        :param ie_result: youtube-dl result dict; its 'entries' are filtered in place
        :param downloader: object whose refresh_entries() fetches the details of entries
        """
        self._url = url
        self._cover_url = cover_url
        self._description = description
        self._max_episodes = max_episodes
        ie_result['entries'] = self._process_entries(ie_result.get('entries', []))
        self._ie_result = ie_result
        self._downloader = downloader

    def _process_entries(self, entries):
        """
        Keep youtube video entries, dropping duplicates, up to max_episodes.

        Each kept entry gets a 'guid' key.

        :param entries: iterable of entry dicts (possibly a generator)
        :return list: the kept entries, in iteration order
        """
        filtered_entries = []
        seen_guids = set()
        for e in entries:  # consumes the generator!
            if e.get('_type', 'video') in ('url', 'url_transparent') and e.get('ie_key') == 'Youtube':
                guid = video_guid(e['id'])
                e['guid'] = guid
                if guid in seen_guids:
                    logger.debug('dropping already seen entry %s title="%s"', guid, e.get('title'))
                else:
                    filtered_entries.append(e)
                    seen_guids.add(guid)
            else:
                logger.debug('dropping entry not youtube video %r', e)
            # A falsy max_episodes means "no limit" (see get_new_episodes): without
            # this guard, 0 kept entries would equal a limit of 0 and enumeration
            # would stop as soon as the first entry is dropped.
            if self._max_episodes and len(filtered_entries) >= self._max_episodes:
                # entries is a generator: stopping now prevents it from downloading more pages
                logger.debug('stopping entry enumeration')
                break
        return filtered_entries

    def get_title(self):
        """ :return str: title, id or url of the feed, suffixed with ' (Youtube)' """
        return '{} (Youtube)'.format(self._ie_result.get('title') or self._ie_result.get('id') or self._url)

    def get_link(self):
        """ :return: the 'webpage_url' of the youtube-dl result, None if absent """
        return self._ie_result.get('webpage_url')

    def get_description(self):
        """ :return: the description given to the constructor """
        return self._description

    def get_cover_url(self):
        """ :return: the cover url given to the constructor """
        return self._cover_url

    def get_http_etag(self):
        """ :return str: optional -- last HTTP etag header, for conditional request next time """
        # youtube-dl doesn't provide it!
        return None

    def get_http_last_modified(self):
        """ :return str: optional -- last HTTP Last-Modified header, for conditional request next time """
        # youtube-dl doesn't provide it!
        return None

    def get_new_episodes(self, channel, existing_guids):
        """
        Fetch details of the entries not in existing_guids and save them as episodes.

        :param channel: object whose episode_factory() builds an episode from a dict
        :param existing_guids: container of already known guids
        :return: an (episodes, all_seen_guids) tuple: the newly saved episodes
                 and the set of guids of all entries considered
        """
        # entries are already sorted by decreasing date
        # trim guids to max episodes
        entries = [e for i, e in enumerate(self._ie_result['entries'])
                   if not self._max_episodes or i < self._max_episodes]
        all_seen_guids = set(e['guid'] for e in entries)
        # only fetch new ones from youtube since they are so slow to get
        new_entries = [e for e in entries if e['guid'] not in existing_guids]
        logger.debug('%i/%i new entries', len(new_entries), len(all_seen_guids))
        self._ie_result['entries'] = new_entries
        self._downloader.refresh_entries(self._ie_result)
        # episodes from entries
        episodes = []
        for en in self._ie_result['entries']:
            guid = video_guid(en['id'])
            description = remove_html_tags(en.get('description') or _('No description available'))
            html_description = self.nice_html_description(en, description)
            if en.get('ext'):
                mime_type = mimetype_from_extension('.{}'.format(en['ext']))
            else:
                mime_type = 'application/octet-stream'
            if en.get('filesize'):
                filesize = int(en['filesize'] or 0)
            else:
                # no global size: add up the sizes of the formats to merge
                # ('or []' also covers a key present with a None value)
                filesize = sum(int(f.get('filesize') or 0)
                               for f in en.get('requested_formats') or [])
            ep = {
                'title': en.get('title', guid),
                'link': en.get('webpage_url'),
                'description': description,
                'description_html': html_description,
                'url': en.get('webpage_url'),
                'file_size': filesize,
                'mime_type': mime_type,
                'guid': guid,
                'published': youtube_parsedate(en.get('upload_date', None)),
                'total_time': int(en.get('duration') or 0),
            }
            episode = channel.episode_factory(ep)
            episode.save()
            episodes.append(episode)
        return episodes, all_seen_guids

    def get_next_page(self, channel, max_episodes):
        """
        Paginated feed support (RFC 5005).
        If the feed is paged, return the next feed page.
        Returned page will in turn be asked for the next page, until None is returned.
        :return feedcore.Result: the next feed's page,
                                 as a fully parsed Feed or None
        """
        # this feed is never paged
        return None

    @staticmethod
    def nice_html_description(en, description):
        """
        basic html formatting + hyperlink highlighting + video thumbnail

        :param en: entry dict; its 'thumbnail', if any, is added as an image
        :param description: text of the description
        :return str: html fragment
        """
        description = re.sub(r'''https?://[^\s]+''',
                             r'''<a href="\g<0>">\g<0></a>''',
                             description)
        description = description.replace('\n', '<br>')
        html = """<style type="text/css">
        body > img { float: left; max-width: 30vw; margin: 0 1em 1em 0; }
        </style>
        """
        img = en.get('thumbnail')
        if img:
            html += '<img src="{}">'.format(img)
        html += '<p>{}</p>'.format(description)
        return html


class gPodderYoutubeDL(download.CustomDownloader):
    """
    Feed handler and custom downloader backed by youtube-dl.
    """
    def __init__(self, gpodder_config, my_config, force=False):
        """
        :param gpodder_config: gPodder configuration (its youtube section is read by add_format)
        :param my_config: configuration of this extension (manage_channel, manage_downloads)
        :param force: force using this downloader even if config says don't manage downloads
        """
        self.gpodder_config = gpodder_config
        self.my_config = my_config
        self.force = force
        # cachedir is not much used in youtube-dl, but set it anyway
        cachedir = os.path.join(gpodder.home, 'youtube-dl')
        os.makedirs(cachedir, exist_ok=True)
        # options shared by every YoutubeDL instance created by this object
        self._ydl_opts = {
            'cachedir': cachedir,
            'no_color': True,  # prevent escape codes in desktop notifications on errors
        }
        if gpodder.verbose:
            self._ydl_opts['verbose'] = True
        else:
            self._ydl_opts['quiet'] = True
        # #686 on windows without a console, sys.stdout is None, causing exceptions
        # when adding podcasts.
        # See https://docs.python.org/3/library/sys.html#sys.__stderr__ Note
        if not sys.stdout:
            logger.debug('no stdout, setting YoutubeDL logger')
            self._ydl_opts['logger'] = logger

    def add_format(self, gpodder_config, opts, fallback=None):
        """ construct youtube-dl -f argument from configured format.

        :param gpodder_config: gPodder configuration, its youtube section is used
        :param opts: youtube-dl options dict, its 'format' key is set in place
        :param fallback: optional format appended as last alternative
        """
        # You can set a custom format or custom formats by editing the config for key
        # `youtube.preferred_fmt_ids`
        #
        # It takes a list of format strings separated by comma: bestaudio, 18
        # they are translated to youtube dl format bestaudio/18, meaning preferably
        # the best audio quality (audio-only) and MP4 360p if it's not available.
        #
        # See https://github.com/ytdl-org/youtube-dl#format-selection for details
        # about youtube-dl format specification.
        fmt_ids = youtube.get_fmt_ids(gpodder_config.youtube, False)
        opts['format'] = '/'.join(str(fmt) for fmt in fmt_ids)
        if fallback:
            opts['format'] += '/' + fallback
        logger.debug('format=%s', opts['format'])

    def fetch_video(self, url, tempname, reporthook):
        """
        Download a video with youtube-dl.

        :param url: url of the video
        :param tempname: youtube-dl output template
        :param reporthook: youtube-dl progress hook
        :return: the value returned by YoutubeDL.extract_info()
        """
        opts = {
            'outtmpl': tempname,
            'nopart': True,  # don't append .part (already .partial)
            'retries': 3,  # retry a few times
            'progress_hooks': [reporthook]  # to notify UI
        }
        opts.update(self._ydl_opts)
        self.add_format(self.gpodder_config, opts)
        with youtube_dl.YoutubeDL(opts) as ydl:
            return ydl.extract_info(url, download=True)

    def refresh_entries(self, ie_result):
        """
        Fetch the metadata of each entry of ie_result, without downloading videos.

        ie_result['entries'] is replaced by the entries which could be processed;
        entries raising a DownloadError are logged and skipped.

        :param ie_result: youtube-dl result dict, modified in place
        """
        # only interested in video metadata
        opts = {
            'skip_download': True,  # don't download the video
            'youtube_include_dash_manifest': False,  # don't download the DASH manifest
        }
        self.add_format(self.gpodder_config, opts, fallback='18')
        opts.update(self._ydl_opts)
        new_entries = []
        # refresh videos one by one to catch single videos blocked by youtube
        for e in ie_result.get('entries', []):
            tmp = {k: v for k, v in ie_result.items() if k != 'entries'}
            tmp['entries'] = [e]
            try:
                with youtube_dl.YoutubeDL(opts) as ydl:
                    ydl.process_ie_result(tmp, download=False)
                    new_entries.extend(tmp.get('entries') or [])
            except DownloadError as ex:
                # exc_info is optional on DownloadError: don't index None
                exc_info = ex.exc_info
                if exc_info and exc_info[0] is ExtractorError:
                    # for instance "This video contains content from xyz, who has blocked it on copyright grounds"
                    logger.warning('Skipping %s: %s', e.get('title', ''), exc_info[1])
                    continue
                logger.exception('Skipping %r: %s', tmp, exc_info)
        ie_result['entries'] = new_entries

    def refresh(self, url, channel_url, max_episodes):
        """
        Fetch a channel or playlist contents.

        Doesn't yet fetch video entry information, so we only get the video id and title.

        :param url: url of the channel or playlist, given to youtube-dl
        :param channel_url: url used to get the cover and the description
        :param max_episodes: maximum number of entries to keep in the feed
        :return feedcore.Result: an UPDATED_FEED result wrapping a YoutubeFeed
        :raises Exception: if youtube-dl returns an unsupported result type
        """
        # Duplicate a bit of the YoutubeDL machinery here because we only
        # want to parse the channel/playlist first, not to fetch video entries.
        # We call YoutubeDL.extract_info(process=False), so we
        # have to call extract_info again ourselves when we get a result of type 'url'.
        def extract_type(ie_result):
            result_type = ie_result.get('_type', 'video')
            if result_type not in ('url', 'playlist', 'multi_video'):
                raise Exception('Unsuported result_type: {}'.format(result_type))
            has_playlist = result_type in ('playlist', 'multi_video')
            return result_type, has_playlist

        opts = {
            'youtube_include_dash_manifest': False,  # only interested in video title and id
        }
        opts.update(self._ydl_opts)
        with youtube_dl.YoutubeDL(opts) as ydl:
            ie_result = ydl.extract_info(url, download=False, process=False)
            result_type, has_playlist = extract_type(ie_result)
            while not has_playlist:
                if result_type in ('url', 'url_transparent'):
                    ie_result['url'] = sanitize_url(ie_result['url'])
                if result_type == 'url':
                    logger.debug("extract_info(%s) to get the video list", ie_result['url'])
                    # We have to add extra_info to the results because it may be
                    # contained in a playlist
                    ie_result = ydl.extract_info(ie_result['url'],
                                                 download=False,
                                                 process=False,
                                                 ie_key=ie_result.get('ie_key'))
                result_type, has_playlist = extract_type(ie_result)
        cover_url = youtube.get_cover(channel_url)  # youtube-dl doesn't provide the cover url!
        description = youtube.get_channel_desc(channel_url)  # youtube-dl doesn't provide the description!
        return feedcore.Result(feedcore.UPDATED_FEED,
                               YoutubeFeed(url, cover_url, description, max_episodes, ie_result, self))

    def fetch_channel(self, channel, max_episodes=0):
        """
        called by model.gPodderFetcher to get a custom feed.

        :param channel: channel whose url is matched against the youtube feed patterns
        :param max_episodes: maximum number of entries to keep, 0 for no limit
        :returns feedcore.Result: a YoutubeFeed or None if channel is not a youtube channel or playlist
        """
        if not self.my_config.manage_channel:
            return None
        url = None
        m = CHANNEL_RE.match(channel.url)
        if m:
            url = 'https://www.youtube.com/channel/{}/videos'.format(m.group(1))
        else:
            m = USER_RE.match(channel.url)
            if m:
                url = 'https://www.youtube.com/user/{}/videos'.format(m.group(1))
            else:
                m = PLAYLIST_RE.match(channel.url)
                if m:
                    url = 'https://www.youtube.com/playlist?list={}'.format(m.group(1))
        if url:
            logger.info('Youtube-dl Handling %s => %s', channel.url, url)
            return self.refresh(url, channel.url, max_episodes)
        return None

    def custom_downloader(self, unused_config, episode):
        """
        called from registry.custom_downloader.resolve

        :param unused_config: ignored
        :param episode: episode whose url, then link, is checked for a youtube watch url
        :return: a YoutubeCustomDownload, or None if downloads are not managed
                 or neither url nor link is a youtube watch url
        """
        if not self.force and not self.my_config.manage_downloads:
            return None
        for candidate in (episode.url, episode.link):
            # skip unset values: re.match() raises TypeError on None
            if candidate and re.match(r'''https://www.youtube.com/watch\?v=.+''', candidate):
                return YoutubeCustomDownload(self, candidate, episode)
        return None


class gPodderExtension:
    """
    Extension entry point: registers gPodderYoutubeDL as feed handler and custom downloader.
    """
    def __init__(self, container):
        self.container = container
        self.ytdl = None
        # set by on_ui_object_available() once the 'gpodder-gtk' object is available
        self.gpodder = None

    def on_load(self):
        """ create the downloader and register it """
        self.ytdl = gPodderYoutubeDL(self.container.manager.core.config, self.container.config)
        logger.info('Registering youtube-dl.')
        registry.feed_handler.register(self.ytdl.fetch_channel)
        registry.custom_downloader.register(self.ytdl.custom_downloader)

    def on_unload(self):
        """ unregister the downloader; unregistration errors (ValueError) are ignored """
        logger.info('Unregistering youtube-dl.')
        if self.ytdl is None:
            # on_load() didn't create the downloader: nothing to unregister
            return
        try:
            registry.feed_handler.unregister(self.ytdl.fetch_channel)
        except ValueError:
            pass
        try:
            registry.custom_downloader.unregister(self.ytdl.custom_downloader)
        except ValueError:
            pass
        self.ytdl = None

    def on_ui_object_available(self, name, ui_object):
        """ keep a reference to the 'gpodder-gtk' ui object, used by download_episodes() """
        if name == 'gpodder-gtk':
            self.gpodder = ui_object

    def on_episodes_context_menu(self, episodes):
        """
        :param episodes: the selected episodes
        :return: a list with one (label, callback) menu entry when downloads
                 are not managed by this extension and at least one episode is
                 not downloaded, None otherwise
        """
        if not self.container.config.manage_downloads \
                and not all(e.was_downloaded(and_exists=True) for e in episodes):
            return [(_("Download with Youtube-DL"), self.download_episodes)]
        return None

    def download_episodes(self, episodes):
        """ download the given episodes with youtube-dl, whatever manage_downloads says """
        if self.gpodder is None:
            logger.warning('Cannot download with youtube-dl: gpodder-gtk is not available')
            return
        # create a new gPodderYoutubeDL to force using it even if manage_downloads is False
        downloader = gPodderYoutubeDL(self.container.manager.core.config, self.container.config, force=True)
        self.gpodder.download_episode_list(episodes, downloader=downloader)