1from __future__ import unicode_literals
2
3import re
4
5from .common import InfoExtractor
6from ..compat import compat_parse_qs
7from ..utils import (
8    determine_ext,
9    ExtractorError,
10    get_element_by_class,
11    int_or_none,
12    lowercase_escape,
13    try_get,
14    update_url_query,
15)
16
17
class GoogleDriveIE(InfoExtractor):
    """Extractor for videos hosted on Google Drive / Google Docs.

    Stream formats and metadata are read from the ``get_video_info``
    endpoint, the source file is probed through the ``uc?export=download``
    endpoint, and captions are listed through the ``timedtext`` endpoint.
    """

    _VALID_URL = r'''(?x)
                        https?://
                            (?:
                                (?:docs|drive)\.google\.com/
                                (?:
                                    (?:uc|open)\?.*?id=|
                                    file/d/
                                )|
                                video\.google\.com/get_player\?.*?docid=
                            )
                            (?P<id>[a-zA-Z0-9_-]{28,})
                    '''
    _TESTS = [{
        'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
        'md5': '5c602afbbf2c1db91831f5d82f678554',
        'info_dict': {
            'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
            'ext': 'mp4',
            'title': 'Big Buck Bunny.mp4',
            'duration': 45,
        }
    }, {
        # video can't be watched anonymously due to view count limit reached,
        # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
        'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
        'only_matching': True,
    }, {
        # video id is longer than 28 characters
        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }]
    # Maps the format id found in fmt_stream_map to a file extension.
    _FORMATS_EXT = {
        '5': 'flv',
        '6': 'flv',
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '34': 'flv',
        '35': 'flv',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
        '59': 'mp4',
    }
    _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
    # XML tag holding the entries of each caption type in the listing.
    _CAPTIONS_ENTRY_TAG = {
        'subtitles': 'track',
        'automatic_captions': 'target',
    }
    # Caption state of the most recently requested video. The class-level
    # values are defaults only: _download_subtitles_xml() rebinds all three on
    # the instance and never mutates the shared list in place.
    _caption_formats_ext = []
    _captions_xml = None
    _captions_xml_key = None

    @staticmethod
    def _extract_url(webpage):
        """Return a canonical Drive file URL for an embedded player iframe.

        Returns None when ``webpage`` contains no matching ``<iframe>``.
        """
        mobj = re.search(
            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
            webpage)
        if mobj:
            return 'https://drive.google.com/file/d/%s' % mobj.group('id')

    def _has_captions_xml(self):
        """Tell whether a caption listing is currently loaded.

        Identity checks are used on purpose: an ElementTree element without
        children is falsy, so a plain truth test would mistake an empty but
        valid listing for a failed download.
        """
        captions_xml = self._captions_xml
        return captions_xml is not None and captions_xml is not False

    def _download_subtitles_xml(self, video_id, subtitles_id, hl):
        """Fetch the caption listing for a video and cache it on the instance.

        The cache is keyed on the request parameters so that a listing
        fetched for one video is never reused for another one handled by the
        same extractor instance. A request that produced no listing is
        retried on the next call. Failures are non-fatal.
        """
        request_key = (video_id, subtitles_id, hl)
        if self._captions_xml_key == request_key and self._has_captions_xml():
            return
        self._captions_xml_key = request_key
        # Start from a fresh per-instance list so format codes neither leak
        # between videos nor pile up in the class-level default.
        self._caption_formats_ext = []
        self._captions_xml = self._download_xml(
            self._BASE_URL_CAPTIONS, video_id, query={
                'id': video_id,
                'vid': subtitles_id,
                'hl': hl,
                'v': video_id,
                'type': 'list',
                'tlangs': '1',
                'fmts': '1',
                'vssids': '1',
            }, note='Downloading subtitles XML',
            errnote='Unable to download subtitles XML', fatal=False)
        if not self._has_captions_xml():
            return
        self._caption_formats_ext = [
            f.attrib['fmt_code']
            for f in self._captions_xml.findall('format')
            if f.attrib.get('fmt_code') and not f.attrib.get('default')]

    def _load_captions_xml(self, video_id, subtitles_id, hl):
        """Make sure the caption listing is loaded; return True on success."""
        if not subtitles_id or not hl:
            return False
        self._download_subtitles_xml(video_id, subtitles_id, hl)
        return self._has_captions_xml()

    def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
                              origin_lang_code=None):
        """Build the captions dict for one caption type.

        ``caption_type`` is a key of _CAPTIONS_ENTRY_TAG. When
        ``origin_lang_code`` is given, every entry is requested as a
        translation (``tlang``) of that language; otherwise the entry's own
        language is requested directly.

        Returns a dict mapping language code to a list of ``url``/``ext``
        dicts, or None when ``subtitles_id`` or ``caption_type`` is missing.
        The caption listing must already be loaded.
        """
        if not subtitles_id or not caption_type:
            return
        captions = {}
        for caption_entry in self._captions_xml.findall(
                self._CAPTIONS_ENTRY_TAG[caption_type]):
            caption_lang_code = caption_entry.attrib.get('lang_code')
            if not caption_lang_code:
                continue
            caption_format_data = []
            for caption_format in self._caption_formats_ext:
                query = {
                    'vid': subtitles_id,
                    'v': video_id,
                    'fmt': caption_format,
                    'lang': (caption_lang_code if origin_lang_code is None
                             else origin_lang_code),
                    'type': 'track',
                    'name': '',
                    'kind': '',
                }
                if origin_lang_code is not None:
                    query.update({'tlang': caption_lang_code})
                caption_format_data.append({
                    'url': update_url_query(self._BASE_URL_CAPTIONS, query),
                    'ext': caption_format,
                })
            captions[caption_lang_code] = caption_format_data
        return captions

    def _get_subtitles(self, video_id, subtitles_id, hl):
        """Return the subtitles dict, or None if no listing is available."""
        if not self._load_captions_xml(video_id, subtitles_id, hl):
            return
        return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')

    def _get_automatic_captions(self, video_id, subtitles_id, hl):
        """Return translated captions based on the first listed track.

        Returns None if no listing is available or if it contains no track
        with a language code to translate from.
        """
        if not self._load_captions_xml(video_id, subtitles_id, hl):
            return
        track = self._captions_xml.find('track')
        if track is None:
            return
        origin_lang_code = track.attrib.get('lang_code')
        if not origin_lang_code:
            return
        return self._get_captions_by_type(
            video_id, subtitles_id, 'automatic_captions', origin_lang_code)

    def _extract_stream_formats(self, fmt_list, fmt_stream_map):
        """Build format dicts from the fmt_list / fmt_stream_map values.

        ``fmt_list`` holds comma separated ``<id>/<width>x<height>`` items
        and ``fmt_stream_map`` comma separated ``<id>|<url>`` items; either
        may be None or empty.
        """
        resolutions = {}
        for fmt in (fmt_list or '').split(','):
            mobj = re.search(
                r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
            if mobj:
                resolutions[mobj.group('format_id')] = (
                    int(mobj.group('width')), int(mobj.group('height')))

        formats = []
        for fmt_stream in (fmt_stream_map or '').split(','):
            fmt_stream_split = fmt_stream.split('|')
            if len(fmt_stream_split) < 2:
                continue
            format_id, format_url = fmt_stream_split[:2]
            f = {
                'url': lowercase_escape(format_url),
                'format_id': format_id,
            }
            # A format id missing from the table must not abort the whole
            # extraction with a KeyError; the extension is simply left unset
            # (presumably the caller can still derive it from the URL -
            # TODO confirm).
            ext = self._FORMATS_EXT.get(format_id)
            if ext:
                f['ext'] = ext
            resolution = resolutions.get(format_id)
            if resolution:
                f.update({
                    'width': resolution[0],
                    'height': resolution[1],
                })
            formats.append(f)
        return formats

    def _request_source_file(self, video_id, source_url, kind):
        """Open ``source_url`` without failing hard; ``kind`` labels the log."""
        return self._request_webpage(
            source_url, video_id, note='Requesting %s file' % kind,
            errnote='Unable to request %s file' % kind, fatal=False)

    def _extract_source_format(self, url, video_id, title):
        """Return the format dict of the downloadable source file, or None.

        When the first response carries no Content-Disposition header it is
        read as a confirmation page, and the request is repeated with the
        confirmation code found in it.
        """
        source_url = update_url_query(
            'https://drive.google.com/uc', {
                'id': video_id,
                'export': 'download',
            })
        urlh = self._request_source_file(video_id, source_url, 'source')
        if not urlh:
            return None

        if not urlh.headers.get('Content-Disposition'):
            confirmation_webpage = self._webpage_read_content(
                urlh, url, video_id, note='Downloading confirmation page',
                errnote='Unable to confirm download', fatal=False)
            if not confirmation_webpage:
                return None
            confirm = self._search_regex(
                r'confirm=([^&"\']+)', confirmation_webpage,
                'confirmation code', default=None)
            if not confirm:
                self.report_warning(
                    get_element_by_class('uc-error-subcaption', confirmation_webpage)
                    or get_element_by_class('uc-error-caption', confirmation_webpage)
                    or 'unable to extract confirmation code')
                return None
            confirmed_source_url = update_url_query(source_url, {
                'confirm': confirm,
            })
            urlh = self._request_source_file(
                video_id, confirmed_source_url, 'confirmed source')
            if not urlh or not urlh.headers.get('Content-Disposition'):
                return None

        return {
            # Use redirect URLs as download URLs in order to calculate
            # correct cookies in _calc_cookies.
            # Using original URLs may result in redirect loop due to
            # google.com's cookies mistakenly used for googleusercontent.com
            # redirect URLs (see #23919).
            'url': urlh.geturl(),
            'ext': determine_ext(title, 'mp4').lower(),
            'format_id': 'source',
            'quality': 1,
        }

    def _real_extract(self, url):
        """Extract metadata, formats and captions for a Drive video URL.

        Raises ExtractorError with the reason reported by the service when
        neither a title nor any format could be obtained.
        """
        video_id = self._match_id(url)
        video_info = compat_parse_qs(self._download_webpage(
            'https://drive.google.com/get_video_info',
            video_id, query={'docid': video_id}))

        def get_value(key):
            # Every parsed query value is a list; only the first item is used.
            return try_get(video_info, lambda x: x[key][0])

        reason = get_value('reason')
        title = get_value('title')
        if not title and reason:
            raise ExtractorError(reason, expected=True)

        formats = self._extract_stream_formats(
            get_value('fmt_list'), get_value('fmt_stream_map'))

        source_format = self._extract_source_format(url, video_id, title)
        if source_format:
            formats.append(source_format)

        if not formats and reason:
            raise ExtractorError(reason, expected=True)

        self._sort_formats(formats)

        hl = get_value('hl')
        subtitles_id = None
        ttsurl = get_value('ttsurl')
        if ttsurl:
            # the video Id for subtitles will be the last value in the ttsurl
            # query string
            subtitles_id = ttsurl.encode('utf-8').decode(
                'unicode_escape').split('=')[-1]

        return {
            'id': video_id,
            'title': title,
            'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id,
            'duration': int_or_none(get_value('length_seconds')),
            'formats': formats,
            'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
            'automatic_captions': self.extract_automatic_captions(
                video_id, subtitles_id, hl),
        }