1# -*- coding: utf-8 -*-
2# Manage Youtube subscriptions using youtube-dl (https://github.com/ytdl-org/youtube-dl)
3# Requirements: youtube-dl module (pip install youtube_dl)
4# (c) 2019-08-17 Eric Le Lay <elelay.fr:contact>
5# Released under the same license terms as gPodder itself.
6
7import logging
8import os
9import re
10import sys
11import time
12
13import youtube_dl
14from youtube_dl.utils import DownloadError, ExtractorError, sanitize_url
15
16import gpodder
17from gpodder import download, feedcore, model, registry, youtube
18from gpodder.util import mimetype_from_extension, remove_html_tags
19
# Translation helper: wraps the user-visible strings of this module
# (extension description, context menu label, default episode description).
_ = gpodder.gettext


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Extension metadata.
# NOTE(review): presumably read by gPodder's extension manager -- confirm
# the exact meaning of each key against the gPodder extension documentation.
__title__ = 'Youtube-dl'
__description__ = _('Manage Youtube subscriptions using youtube-dl (pip install youtube_dl)')
__only_for__ = 'gtk, cli'
__authors__ = 'Eric Le Lay <elelay.fr:contact>'
__doc__ = 'https://gpodder.github.io/docs/extensions/youtubedl.html'
31
# Default values for this extension's settings. The same keys are read back
# in this module through the extension config object
# (``manage_channel`` and ``manage_downloads``).
DefaultConfig = {
    # youtube-dl downloads and parses each video page to get information about it, which is very slow.
    # Set to False to fall back to the fast but limited (only 15 episodes) gpodder code
    'manage_channel': True,
    # If for some reason youtube-dl download doesn't work for you, you can fall back to gpodder code.
    # Set to False to fall back to default gpodder code (less available formats).
    'manage_downloads': True,
}
40
41
# youtube feed still preprocessed by youtube.py (compat)
# Each pattern captures the identifier that follows the query parameter.
# Dots are escaped so that only the literal host name and path are accepted
# (an unescaped "." would match any character).
CHANNEL_RE = re.compile(r'''https://www\.youtube\.com/feeds/videos\.xml\?channel_id=(.+)''')
USER_RE = re.compile(r'''https://www\.youtube\.com/feeds/videos\.xml\?user=(.+)''')
PLAYLIST_RE = re.compile(r'''https://www\.youtube\.com/feeds/videos\.xml\?playlist_id=(.+)''')
46
47
def youtube_parsedate(s):
    """Convert a youtube-dl date string into a unix timestamp.

    Only strings provided by the Youtube-dl API are handled: they look
    like ``YYYYMMDD`` (for instance 20170920). The date is converted with
    ``time.mktime``, i.e. interpreted in local time.

    :param s: the date string; may be empty or None
    :return: seconds since the epoch, or 0 when no date was given
    """
    if not s:
        return 0
    parsed = time.strptime(s, "%Y%m%d")
    return time.mktime(parsed)
57
58
def video_guid(video_id):
    """
    Build the episode guid for a video: the id prefixed with ``yt:video:``
    (same guid as youtube generates).
    """
    prefix = 'yt:video:'
    return prefix + format(video_id)
64
65
class YoutubeCustomDownload(download.CustomDownload):
    """
    Represents the download of a single episode using youtube-dl.

    Actual youtube-dl interaction via gPodderYoutubeDL.
    """
    def __init__(self, ytdl, url, episode):
        """
        :param ytdl: the gPodderYoutubeDL performing the actual download
        :param str url: URL of the video to download
        :param episode: episode being downloaded; its total_time is updated
                        when youtube-dl reports a duration
        """
        self._ytdl = ytdl
        self._url = url
        self._reporthook = None
        # Bytes of the files already finished: several 'finished' events can
        # be received for one episode, so progress is accumulated.
        self._prev_dl_bytes = 0
        self._episode = episode

    def retrieve_resume(self, tempname, reporthook=None):
        """
        called by download.DownloadTask to perform the download.

        :param str tempname: path of the temporary file to download to
        :param reporthook: optional callable(downloaded, block_size, total)
                           notified of the download progress
        :return: tuple (headers, url): headers is a dict holding a
                 'content-type' entry when it can be derived from the
                 extension reported by youtube-dl; url is the one reported
                 by youtube-dl or, failing that, the requested one
        """
        self._reporthook = reporthook
        # outtmpl: use given tempname by DownloadTask
        # (escape % and $ because outtmpl used as a string template by youtube-dl)
        outtmpl = tempname.replace('%', '%%').replace('$', '$$')
        res = self._ytdl.fetch_video(self._url, outtmpl, self._my_hook)
        if outtmpl != tempname:
            # the escaped name was used on disk: move the file back to tempname
            if 'ext' in res and os.path.isfile(outtmpl + '.{}'.format(res['ext'])):
                os.rename(outtmpl + '.{}'.format(res['ext']), tempname)
            else:
                os.rename(outtmpl, tempname)
        if 'duration' in res and res['duration']:
            self._episode.total_time = res['duration']
        headers = {}
        # youtube-dl doesn't return a content-type but an extension
        if 'ext' in res:
            dot_ext = '.{}'.format(res['ext'])
            # See #673 when merging multiple formats, the extension is appended to the tempname
            # by YoutubeDL resulting in empty .partial file + .partial.mp4 exists
            # and #796 .mkv is chosen by ytdl sometimes
            tempstat = os.stat(tempname)
            if not tempstat.st_size:
                for try_ext in (dot_ext, ".mp4", ".m4a", ".webm", ".mkv"):
                    tempname_with_ext = tempname + try_ext
                    if os.path.isfile(tempname_with_ext):
                        logger.debug('Youtubedl downloaded to "%s" instead of "%s", moving',
                                     os.path.basename(tempname_with_ext),
                                     os.path.basename(tempname))
                        os.remove(tempname)
                        os.rename(tempname_with_ext, tempname)
                        dot_ext = try_ext
                        break
            ext_filetype = mimetype_from_extension(dot_ext)
            if ext_filetype:
                headers['content-type'] = ext_filetype
        return headers, res.get('url', self._url)

    def _my_hook(self, d):
        """
        youtube-dl progress hook: forwards the progress to the reporthook.

        Besides 'status', the byte counters are optional in the dictionary
        handed over by youtube-dl, so every counter is read with a fallback
        instead of raising KeyError from inside the download.

        :param dict d: progress dictionary provided by youtube-dl
        """
        status = d['status']
        if status == 'downloading':
            if self._reporthook:
                dl_bytes = d.get('downloaded_bytes') or 0
                total_bytes = d.get('total_bytes') or d.get('total_bytes_estimate') or 0
                self._reporthook(self._prev_dl_bytes + dl_bytes,
                                 1,
                                 self._prev_dl_bytes + total_bytes)
        elif status == 'finished':
            # a 'finished' event may only carry total_bytes
            dl_bytes = d.get('downloaded_bytes') or d.get('total_bytes') or 0
            self._prev_dl_bytes += dl_bytes
            if self._reporthook:
                self._reporthook(self._prev_dl_bytes, 1, self._prev_dl_bytes)
        elif status == 'error':
            logger.error('download hook error: %r', d)
        else:
            logger.debug('unknown download hook status: %r', d)
136
137
class YoutubeFeed(model.Feed):
    """
    Represents the youtube feed for model.PodcastChannel
    """
    def __init__(self, url, cover_url, description, max_episodes, ie_result, downloader):
        """
        :param str url: URL given to youtube-dl to list the videos
        :param cover_url: cover URL of the channel
        :param description: description of the channel
        :param int max_episodes: maximum number of episodes to keep;
                                 a falsy value means no limit
        :param dict ie_result: youtube-dl result for the channel or playlist;
                               its 'entries' are filtered and stored back
        :param downloader: object providing refresh_entries(ie_result),
                           used to fetch the details of the new videos
        """
        self._url = url
        self._cover_url = cover_url
        self._description = description
        self._max_episodes = max_episodes
        ie_result['entries'] = self._process_entries(ie_result.get('entries', []))
        self._ie_result = ie_result
        self._downloader = downloader

    def _process_entries(self, entries):
        """
        Keep the entries that are Youtube videos, without duplicates.

        Each kept entry gets a 'guid' key. Enumeration stops once
        max_episodes entries are kept; a falsy max_episodes means no limit,
        like in get_new_episodes.

        :param entries: iterable of youtube-dl entries (may be a generator)
        :return list: the kept entries
        """
        filtered_entries = []
        seen_guids = set()
        for e in entries:  # consumes the generator!
            if e.get('_type', 'video') in ('url', 'url_transparent') and e.get('ie_key') == 'Youtube':
                guid = video_guid(e['id'])
                e['guid'] = guid
                if guid in seen_guids:
                    logger.debug('dropping already seen entry %s title="%s"', guid, e.get('title'))
                else:
                    filtered_entries.append(e)
                    seen_guids.add(guid)
            else:
                logger.debug('dropping entry not youtube video %r', e)
            # Only enforce the limit when there is one: with max_episodes == 0
            # a plain equality test stops as soon as the first entry is dropped.
            if self._max_episodes and len(filtered_entries) >= self._max_episodes:
                # entries is a generator: stopping now prevents it to download more pages
                logger.debug('stopping entry enumeration')
                break
        return filtered_entries

    def get_title(self):
        """ :return str: title of the feed, falling back to its id then its URL """
        return '{} (Youtube)'.format(self._ie_result.get('title') or self._ie_result.get('id') or self._url)

    def get_link(self):
        """ :return: the 'webpage_url' reported by youtube-dl, if any """
        return self._ie_result.get('webpage_url')

    def get_description(self):
        """ :return: the description given at construction """
        return self._description

    def get_cover_url(self):
        """ :return: the cover URL given at construction """
        return self._cover_url

    def get_http_etag(self):
        """ :return str: optional -- last HTTP etag header, for conditional request next time """
        # youtube-dl doesn't provide it!
        return None

    def get_http_last_modified(self):
        """ :return str: optional -- last HTTP Last-Modified header, for conditional request next time """
        # youtube-dl doesn't provide it!
        return None

    def get_new_episodes(self, channel, existing_guids):
        """
        Create and save the episodes that are not known yet.

        :param channel: channel whose episode_factory builds the episodes
        :param existing_guids: guids of the already known episodes
        :return: tuple (episodes, all_seen_guids): the newly created episodes
                 and the set of guids of all the entries of the feed
                 (trimmed to max_episodes)
        """
        # entries are already sorted by decreasing date
        # trim guids to max episodes
        entries = [e for i, e in enumerate(self._ie_result['entries'])
                   if not self._max_episodes or i < self._max_episodes]
        all_seen_guids = {e['guid'] for e in entries}
        # only fetch new ones from youtube since they are so slow to get
        new_entries = [e for e in entries if e['guid'] not in existing_guids]
        logger.debug('%i/%i new entries', len(new_entries), len(all_seen_guids))
        self._ie_result['entries'] = new_entries
        self._downloader.refresh_entries(self._ie_result)
        # episodes from entries
        episodes = []
        for en in self._ie_result['entries']:
            guid = video_guid(en['id'])
            description = remove_html_tags(en.get('description') or _('No description available'))
            html_description = self.nice_html_description(en, description)
            if en.get('ext'):
                mime_type = mimetype_from_extension('.{}'.format(en['ext']))
            else:
                mime_type = 'application/octet-stream'
            if en.get('filesize'):
                filesize = int(en['filesize'])
            else:
                # no global size: sum the sizes of the requested formats
                # (the key may be present with a None value)
                filesize = sum(int(f.get('filesize') or 0)
                               for f in en.get('requested_formats') or [])
            ep = {
                # fall back to the guid when the title is missing or empty
                'title': en.get('title') or guid,
                'link': en.get('webpage_url'),
                'description': description,
                'description_html': html_description,
                'url': en.get('webpage_url'),
                'file_size': filesize,
                'mime_type': mime_type,
                'guid': guid,
                'published': youtube_parsedate(en.get('upload_date', None)),
                'total_time': int(en.get('duration') or 0),
            }
            episode = channel.episode_factory(ep)
            episode.save()
            episodes.append(episode)
        return episodes, all_seen_guids

    def get_next_page(self, channel, max_episodes):
        """
        Paginated feed support (RFC 5005).
        If the feed is paged, return the next feed page.
        Returned page will in turn be asked for the next page, until None is returned.
        :return feedcore.Result: the next feed's page,
                                 as a fully parsed Feed or None
        """
        # this feed is never paged
        return None

    @staticmethod
    def nice_html_description(en, description):
        """
        basic html formatting + hyperlink highlighting + video thumbnail

        :param dict en: youtube-dl entry, read for its 'thumbnail'
        :param str description: plain text description of the video
        :return str: html fragment with a style block, the thumbnail (if any)
                     and the description
        """
        # NOTE(review): description and thumbnail are inserted without html
        # escaping; this relies on the caller having cleaned the description
        # (remove_html_tags) -- confirm that it is sufficient.
        description = re.sub(r'''https?://[^\s]+''',
                             r'''<a href="\g<0>">\g<0></a>''',
                             description)
        description = description.replace('\n', '<br>')
        html = """<style type="text/css">
        body > img { float: left; max-width: 30vw; margin: 0 1em 1em 0; }
        </style>
        """
        img = en.get('thumbnail')
        if img:
            html += '<img src="{}">'.format(img)
        html += '<p>{}</p>'.format(description)
        return html
264
265
class gPodderYoutubeDL(download.CustomDownloader):
    """
    Feed handler and custom downloader backed by youtube-dl.

    fetch_channel and custom_downloader are the entry points registered
    into gPodder's registry by the extension.
    """
    # URL of a single youtube video page
    _WATCH_URL_RE = re.compile(r'''https://www\.youtube\.com/watch\?v=.+''')

    def __init__(self, gpodder_config, my_config, force=False):
        """
        :param gpodder_config: gPodder configuration (read for the youtube formats)
        :param my_config: configuration of this extension
                          (manage_channel, manage_downloads)
        :param force: force using this downloader even if config says don't manage downloads
        """
        self.gpodder_config = gpodder_config
        self.my_config = my_config
        self.force = force
        # cachedir is not much used in youtube-dl, but set it anyway
        cachedir = os.path.join(gpodder.home, 'youtube-dl')
        os.makedirs(cachedir, exist_ok=True)
        # options shared by every YoutubeDL instance created by this object
        self._ydl_opts = {
            'cachedir': cachedir,
            'no_color': True,  # prevent escape codes in desktop notifications on errors
        }
        if gpodder.verbose:
            self._ydl_opts['verbose'] = True
        else:
            self._ydl_opts['quiet'] = True
        # #686 on windows without a console, sys.stdout is None, causing exceptions
        # when adding podcasts.
        # See https://docs.python.org/3/library/sys.html#sys.__stderr__ Note
        if not sys.stdout:
            logger.debug('no stdout, setting YoutubeDL logger')
            self._ydl_opts['logger'] = logger

    def add_format(self, gpodder_config, opts, fallback=None):
        """
        construct youtube-dl -f argument from configured format.

        :param gpodder_config: gPodder configuration holding the youtube settings
        :param dict opts: youtube-dl options; its 'format' key is set, unless
                          there is neither a configured format nor a fallback
        :param str fallback: optional format appended as last alternative
        """
        # You can set a custom format or custom formats by editing the config for key
        # `youtube.preferred_fmt_ids`
        #
        # It takes a list of format strings separated by comma: bestaudio, 18
        # they are translated to youtube dl format bestaudio/18, meaning preferably
        # the best audio quality (audio-only) and MP4 360p if it's not available.
        #
        # See https://github.com/ytdl-org/youtube-dl#format-selection for details
        # about youtube-dl format specification.
        fmt_ids = youtube.get_fmt_ids(gpodder_config.youtube, False)
        alternatives = [str(fmt) for fmt in fmt_ids or []]
        if fallback:
            alternatives.append(fallback)
        # Drop empty alternatives so that the selector never starts with '/'
        # (e.g. '/18' when no format is configured) and is never an empty
        # string; without any alternative, youtube-dl's default applies.
        alternatives = [alt for alt in alternatives if alt]
        if alternatives:
            opts['format'] = '/'.join(alternatives)
        logger.debug('format=%s', opts.get('format'))

    def fetch_video(self, url, tempname, reporthook):
        """
        Download a single video.

        :param str url: URL of the video
        :param str tempname: youtube-dl output template (outtmpl) for the file
        :param reporthook: youtube-dl progress hook, called with a dictionary
        :return: what YoutubeDL.extract_info returns for the video
        """
        opts = {
            'outtmpl': tempname,
            'nopart': True,  # don't append .part (already .partial)
            'retries': 3,  # retry a few times
            'progress_hooks': [reporthook]  # to notify UI
        }
        opts.update(self._ydl_opts)
        self.add_format(self.gpodder_config, opts)
        with youtube_dl.YoutubeDL(opts) as ydl:
            return ydl.extract_info(url, download=True)

    def refresh_entries(self, ie_result):
        """
        Fetch the metadata of every entry of ie_result, in place.

        Entries that youtube-dl fails to process are skipped (and logged):
        ie_result['entries'] only holds the successfully refreshed ones.

        :param dict ie_result: youtube-dl playlist result
        """
        # only interested in video metadata
        opts = {
            'skip_download': True,  # don't download the video
            'youtube_include_dash_manifest': False,  # don't download the DASH manifest
        }
        self.add_format(self.gpodder_config, opts, fallback='18')
        opts.update(self._ydl_opts)
        new_entries = []
        # refresh videos one by one to catch single videos blocked by youtube
        for e in ie_result.get('entries', []):
            tmp = {k: v for k, v in ie_result.items() if k != 'entries'}
            tmp['entries'] = [e]
            try:
                with youtube_dl.YoutubeDL(opts) as ydl:
                    ydl.process_ie_result(tmp, download=False)
                    new_entries.extend(tmp.get('entries'))
            except DownloadError as ex:
                # exc_info may be missing: don't fail while reporting the failure
                exc_info = getattr(ex, 'exc_info', None) or (None, None, None)
                exc_type = exc_info[0]
                if isinstance(exc_type, type) and issubclass(exc_type, ExtractorError):
                    # for instance "This video contains content from xyz, who has blocked it on copyright grounds"
                    logger.warning('Skipping %s: %s', e.get('title', ''), exc_info[1])
                    continue
                logger.exception('Skipping %r: %s', tmp, exc_info)
        ie_result['entries'] = new_entries

    def refresh(self, url, channel_url, max_episodes):
        """
        Fetch a channel or playlist contents.

        Doesn't yet fetch video entry information, so we only get the video id and title.

        :param str url: youtube URL of the channel, user or playlist
        :param str channel_url: URL of the gPodder channel, used to get
                                the cover and the description
        :param int max_episodes: maximum number of episodes of the feed
        :return feedcore.Result: an UPDATED_FEED result holding a YoutubeFeed
        :raises Exception: when youtube-dl returns an unsupported result type
        """
        # Duplicate a bit of the YoutubeDL machinery here because we only
        # want to parse the channel/playlist first, not to fetch video entries.
        # We call YoutubeDL.extract_info(process=False), so we
        # have to call extract_info again ourselves when we get a result of type 'url'.
        def extract_type(ie_result):
            result_type = ie_result.get('_type', 'video')
            if result_type not in ('url', 'playlist', 'multi_video'):
                raise Exception('Unsuported result_type: {}'.format(result_type))
            has_playlist = result_type in ('playlist', 'multi_video')
            return result_type, has_playlist

        opts = {
            'youtube_include_dash_manifest': False,  # only interested in video title and id
        }
        opts.update(self._ydl_opts)
        with youtube_dl.YoutubeDL(opts) as ydl:
            ie_result = ydl.extract_info(url, download=False, process=False)
            result_type, has_playlist = extract_type(ie_result)
            while not has_playlist:
                if result_type in ('url', 'url_transparent'):
                    ie_result['url'] = sanitize_url(ie_result['url'])
                if result_type == 'url':
                    logger.debug("extract_info(%s) to get the video list", ie_result['url'])
                    # We have to add extra_info to the results because it may be
                    # contained in a playlist
                    ie_result = ydl.extract_info(ie_result['url'],
                                                 download=False,
                                                 process=False,
                                                 ie_key=ie_result.get('ie_key'))
                result_type, has_playlist = extract_type(ie_result)
        cover_url = youtube.get_cover(channel_url)  # youtube-dl doesn't provide the cover url!
        description = youtube.get_channel_desc(channel_url)  # youtube-dl doesn't provide the description!
        feed = YoutubeFeed(url, cover_url, description, max_episodes, ie_result, self)
        return feedcore.Result(feedcore.UPDATED_FEED, feed)

    def fetch_channel(self, channel, max_episodes=0):
        """
        called by model.gPodderFetcher to get a custom feed.
        :returns feedcore.Result: a YoutubeFeed or None if channel is not a youtube channel or playlist
        """
        if not self.my_config.manage_channel:
            return None
        url = None
        m = CHANNEL_RE.match(channel.url)
        if m:
            url = 'https://www.youtube.com/channel/{}/videos'.format(m.group(1))
        else:
            m = USER_RE.match(channel.url)
            if m:
                url = 'https://www.youtube.com/user/{}/videos'.format(m.group(1))
            else:
                m = PLAYLIST_RE.match(channel.url)
                if m:
                    url = 'https://www.youtube.com/playlist?list={}'.format(m.group(1))
        if url:
            logger.info('Youtube-dl Handling %s => %s', channel.url, url)
            return self.refresh(url, channel.url, max_episodes)
        return None

    def custom_downloader(self, unused_config, episode):
        """
        called from registry.custom_downloader.resolve

        :param unused_config: ignored
        :param episode: episode to download; its url, then its link,
                        are checked against the youtube video URL pattern
        :return: a YoutubeCustomDownload, or None when downloads are not
                 managed by this extension or the episode is not a youtube video
        """
        if not self.force and not self.my_config.manage_downloads:
            return None
        # url or link may be empty: don't feed None to the regular expression
        url = episode.url
        if url and self._WATCH_URL_RE.match(url):
            return YoutubeCustomDownload(self, url, episode)
        link = episode.link
        if link and self._WATCH_URL_RE.match(link):
            return YoutubeCustomDownload(self, link, episode)
        return None
422
423
class gPodderExtension:
    """
    Extension entry point: registers the youtube-dl feed handler and
    downloader, and provides a context menu entry to download episodes
    with youtube-dl when downloads are not managed by it.
    """
    def __init__(self, container):
        """
        :param container: extension container, giving access to the
                          configuration of this extension and of gPodder
        """
        self.container = container
        # created by on_load, dropped by on_unload
        self.ytdl = None
        # set by on_ui_object_available; declared here so that it can be
        # tested instead of raising AttributeError
        self.gpodder = None

    def on_load(self):
        """ Create the youtube-dl handler and register it. """
        self.ytdl = gPodderYoutubeDL(self.container.manager.core.config, self.container.config)
        logger.info('Registering youtube-dl.')
        registry.feed_handler.register(self.ytdl.fetch_channel)
        registry.custom_downloader.register(self.ytdl.custom_downloader)

    def on_unload(self):
        """ Unregister the youtube-dl handler, if one was registered. """
        if self.ytdl is None:
            # on_load never ran (or on_unload already did): nothing to unregister
            return
        logger.info('Unregistering youtube-dl.')
        try:
            registry.feed_handler.unregister(self.ytdl.fetch_channel)
        except ValueError:
            pass
        try:
            registry.custom_downloader.unregister(self.ytdl.custom_downloader)
        except ValueError:
            pass
        self.ytdl = None

    def on_ui_object_available(self, name, ui_object):
        """ Remember the 'gpodder-gtk' UI object, needed by download_episodes. """
        if name == 'gpodder-gtk':
            self.gpodder = ui_object

    def on_episodes_context_menu(self, episodes):
        """
        :param episodes: the selected episodes
        :return: a list holding one (label, callback) menu entry when
                 downloads are not managed by youtube-dl and at least one
                 episode is not downloaded; None otherwise
        """
        if not self.container.config.manage_downloads \
                and not all(e.was_downloaded(and_exists=True) for e in episodes):
            return [(_("Download with Youtube-DL"), self.download_episodes)]
        return None

    def download_episodes(self, episodes):
        """
        Download the episodes with youtube-dl through the UI object.

        :param episodes: the episodes to download
        """
        if self.gpodder is None:
            logger.warning('Cannot download with youtube-dl: no gPodder UI available')
            return
        # create a new gPodderYoutubeDL to force using it even if manage_downloads is False
        downloader = gPodderYoutubeDL(self.container.manager.core.config, self.container.config, force=True)
        self.gpodder.download_episode_list(episodes, downloader=downloader)
460