# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
    clean_html,
    extract_attributes,
    ExtractorError,
    get_elements_by_class,
    int_or_none,
    js_to_json,
    smuggle_url,
    unescapeHTML,
)


def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Return all matches for elements in `html` with the given tag/attribute.

    Each returned `re.Match` exposes the named groups `tag`, `attribute`,
    `value` and `content` (the element's inner HTML).  `tag`, `attribute`
    and `value` all default to "match anything".  Set `escape_value=False`
    to treat `value` as a raw regex fragment instead of a literal string.
    """
    if tag is None:
        tag = '[a-zA-Z0-9:._-]+'
    if attribute is None:
        attribute = ''
    else:
        attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute)
    if value is None:
        value = ''
    else:
        value = re.escape(value) if escape_value else value
        value = '=[\'"]?(?P<value>%s)[\'"]?' % value

    # The non-greedy attribute runs on either side of %s%s let the target
    # attribute appear anywhere inside the opening tag; \1 backreferences
    # the opening tag name so only a matching closing tag ends the element.
    return list(re.finditer(r'''(?xs)
        <(?P<tag>%s)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         %s%s
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (tag, attribute, value), html))


def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
    """Like _get_elements_by_tag_and_attrib, but return only the first match
    (or None when nothing matches)."""
    retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
    return retval[0] if retval else None


class DubokuIE(InfoExtractor):
    IE_NAME = 'duboku'
    IE_DESC = 'www.duboku.co'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
    _TESTS = [{
        'url': 'https://www.duboku.co/vodplay/1575-1-1.html',
        'info_dict': {
            'id': '1575-1-1',
            'ext': 'ts',
            'series': '白色月光',
            'title': 'contains:白色月光',
            'season_number': 1,
            'episode_number': 1,
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }, {
        'url': 'https://www.duboku.co/vodplay/1588-1-1.html',
        'info_dict': {
            'id': '1588-1-1',
            'ext': 'ts',
            'series': '亲爱的自己',
            'title': 'contains:预告片',
            'season_number': 1,
            'episode_number': 1,
        },
        'params': {
            'skip_download': 'm3u8 download',
        },
    }]

    _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'

    def _real_extract(self, url):
        video_id = self._match_id(url)
        # The id looks like "<series>-<season>-<episode>", but _VALID_URL
        # only guarantees a single dash; pad missing components with None
        # instead of crashing (indexing parts[2] used to raise IndexError
        # on ids such as "1575-1").
        parts = video_id.split('-')
        series_id = parts[0]
        season_id = parts[1] if len(parts) > 1 else None
        episode_id = parts[2] if len(parts) > 2 else None

        webpage_url = 'https://www.duboku.co/vodplay/%s.html' % video_id
        webpage_html = self._download_webpage(webpage_url, video_id)

        # extract video url

        player_data = self._search_regex(
            self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
        player_data = self._parse_json(player_data, video_id, js_to_json)

        # extract title: scan the "title" elements for the anchor whose href
        # points back at this series id; that anchor is the series name and
        # the whole element is the episode title

        temp = get_elements_by_class('title', webpage_html)
        series_title = None
        title = None
        for html in temp:
            mobj = re.search(r'<a\s+.*>(.*)</a>', html)
            if mobj:
                href = extract_attributes(mobj.group(0)).get('href')
                if href:
                    mobj1 = re.search(r'/(\d+)\.html', href)
                    if mobj1 and mobj1.group(1) == series_id:
                        series_title = clean_html(mobj.group(0))
                        # collapse all runs of whitespace to single spaces
                        series_title = re.sub(r'\s+', ' ', series_title)
                        title = clean_html(html)
                        title = re.sub(r'\s+', ' ', title)
                        break

        data_url = player_data.get('url')
        if not data_url:
            raise ExtractorError('Cannot find url in player_data')
        data_from = player_data.get('from')

        # if it is an embedded iframe, maybe it's an external source
        if data_from == 'iframe':
            # use _type url_transparent to retain the meaningful details
            # of the video.
            return {
                '_type': 'url_transparent',
                'url': smuggle_url(data_url, {'http_headers': {'Referer': webpage_url}}),
                'id': video_id,
                'title': title,
                'series': series_title,
                'season_number': int_or_none(season_id),
                'season_id': season_id,
                'episode_number': int_or_none(episode_id),
                'episode_id': episode_id,
            }

        formats = self._extract_m3u8_formats(data_url, video_id, 'mp4')

        return {
            'id': video_id,
            'title': title,
            'series': series_title,
            'season_number': int_or_none(season_id),
            'season_id': season_id,
            'episode_number': int_or_none(episode_id),
            'episode_id': episode_id,
            'formats': formats,
            'http_headers': {'Referer': 'https://www.duboku.co/static/player/videojs.html'}
        }


class DubokuPlaylistIE(InfoExtractor):
    IE_NAME = 'duboku:list'
    IE_DESC = 'www.duboku.co entire series'

    _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*'
    _TESTS = [{
        'url': 'https://www.duboku.co/voddetail/1575.html',
        'info_dict': {
            'id': 'startswith:1575',
            'title': '白色月光',
        },
        'playlist_count': 12,
    }, {
        'url': 'https://www.duboku.co/voddetail/1554.html',
        'info_dict': {
            'id': 'startswith:1554',
            'title': '以家人之名',
        },
        'playlist_mincount': 30,
    }, {
        'url': 'https://www.duboku.co/voddetail/1554.html#playlist2',
        'info_dict': {
            'id': '1554#playlist2',
            'title': '以家人之名',
        },
        'playlist_mincount': 27,
    }]

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        series_id = mobj.group('id')
        # a "#playlistN" fragment selects one specific playlist on the page
        fragment = compat_urlparse.urlparse(url).fragment

        webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id
        webpage_html = self._download_webpage(webpage_url, series_id)

        # extract title: prefer the <h1 class="title">, then the keywords
        # meta tag, then the page <title> as a last resort

        title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
        title = unescapeHTML(title.group('content')) if title else None
        if not title:
            title = self._html_search_meta('keywords', webpage_html)
        if not title:
            title = _get_element_by_tag_and_attrib(webpage_html, 'title')
            title = unescapeHTML(title.group('content')) if title else None

        # extract playlists: every element with id="playlist<N>" holds a list
        # of episode anchors

        playlists = {}
        for div in _get_elements_by_tag_and_attrib(
                webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
            playlist_id = div.group('value')
            playlist = []
            for a in _get_elements_by_tag_and_attrib(
                    div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False):
                playlist.append({
                    'href': unescapeHTML(a.group('value')),
                    'title': unescapeHTML(a.group('content'))
                })
            playlists[playlist_id] = playlist

        # select the specified playlist if url fragment exists; otherwise
        # fall back to the first playlist found on the page
        playlist = None
        playlist_id = None
        if fragment:
            playlist = playlists.get(fragment)
            playlist_id = fragment
        else:
            first = next(iter(playlists.items()), None)
            if first:
                (playlist_id, playlist) = first
        if not playlist:
            raise ExtractorError(
                ('Cannot find %s' % fragment) if fragment else 'Cannot extract playlist')

        # return url results
        return self.playlist_result([
            self.url_result(
                compat_urlparse.urljoin('https://www.duboku.co', x['href']),
                ie=DubokuIE.ie_key(),
                video_title=x.get('title'))
            for x in playlist], series_id + '#' + playlist_id, title)