#!/usr/bin/env python
from ..common import *
from urllib import parse, error
import random
from time import sleep
import datetime
import hashlib
import base64
import logging
import re
from xml.dom.minidom import parseString

__all__ = ['icourses_download', 'icourses_playlist_download']


def icourses_download(url, output_dir='.', **kwargs):
    """Download (or just describe) a single icourses.cn video.

    :param url: page URL of the video; ``showResDetail.action`` URLs are
        rewritten to the equivalent ``changeforVideo.action`` URL first.
    :param output_dir: directory the file is written to.
    :param kwargs: only ``info_only`` is read here; when truthy the video
        information is printed and nothing is downloaded.
    :raises Exception: for course index pages (which need the playlist
        entry point), for unparsable ``showResDetail.action`` URLs, and
        when the media size cannot be fetched after five attempts.
    """
    if 'showResDetail.action' in url:
        hit = re.search(r'id=(\d+)&courseId=(\d+)', url)
        if hit is None:
            raise Exception('Cannot find resource id and course id in url')
        url = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}'.format(hit.group(1), hit.group(2))
    if re.match(r'http://www.icourses.cn/coursestatic/course_(\d+).html', url):
        raise Exception('You can download it with -l flag')

    icourses_parser = ICousesExactor(url=url)
    icourses_parser.basic_extract()
    title = icourses_parser.title

    size = None
    for _attempt in range(5):
        try:
            # use this url only for size
            size_url = icourses_parser.generate_url(0)
            _, type_, size = url_info(size_url, headers=fake_headers)
        except error.HTTPError:
            logging.warning('Failed to fetch the video file! Retrying...')
            sleep(random.randint(2, 5))  # Prevent from blockage
        else:
            print_info(site_info, title, type_, size)
            break

    if size is None:
        raise Exception("Failed")

    # .get(): callers that omit info_only get a download, not a KeyError.
    if not kwargs.get('info_only'):
        real_url = icourses_parser.update_url(0)
        # NOTE(review): a header dict with a Referer used to be built here but
        # was never handed to the downloader; the dead locals were removed and
        # the request headers sent on the wire are unchanged.
        download_urls_icourses(real_url, title, 'flv', total_size=size, output_dir=output_dir,
                               max_size=15728640, dyn_callback=icourses_parser.update_url)
    return


def get_course_title(url, course_type, page=None):
    """Return the course title scraped from a course page.

    :param url: page URL, fetched only when ``page`` is None.
    :param course_type: ``'shared_old'``, ``'shared_new'``, or anything else
        for the default (public course) markup.
    :param page: already-decoded page text, if the caller has it.
    :raises AttributeError: if the title markup is not found in the page.
    """
    if page is None:
        # Fetch once, then try both encodings on the same bytes.
        raw_page = get_content(url, decoded=False)
        try:
            # shard course page could be gbk but with charset="utf-8"
            page = raw_page.decode('gbk')
        except UnicodeDecodeError:
            page = raw_page.decode('utf8')

    if course_type == 'shared_old':
        patt = r'<div\s+class="top_left_til">(.+?)<\/div>'
    elif course_type == 'shared_new':
        patt = r'<h1>(.+?)<\/h1>'
    else:
        patt = r'<div\s+class="con">(.+?)<\/div>'

    return re.search(patt, page).group(1)


def public_course_playlist(url, page=None):
    """Return ``[(absolute_url, title), ...]`` for the links found in a public course page.

    :param url: page URL, fetched only when ``page`` is None.
    :param page: page text, if the caller already has it.
    """
    host = 'http://www.icourses.cn/'
    patt = r'<a href="(.+?)"\s*title="(.+?)".+?>(?:.|\n)+?</a>'

    if page is None:
        page = get_content(url)
    return [(host + href, title) for href, title in re.findall(patt, page)]


def public_course_get_title(url, page=None):
    """Build the ``<course>_第N讲_<lecture title>`` title of a public course video.

    The lecture number N is read from the page and used (minus one) as the
    index into the playlist scraped from the same page.
    """
    patt = r'<div\s*class="kcslbut">.+?第(\d+)讲'

    if page is None:
        page = get_content(url)
    seq_num = int(re.search(patt, page).group(1)) - 1
    course_main_title = get_course_title(url, 'public', page)
    lecture_title = public_course_playlist(url, page)[seq_num][1]
    return '{}_第{}讲_{}'.format(course_main_title, seq_num + 1, lecture_title)


def icourses_playlist_download(url, output_dir='.', **kwargs):
    """Download every video of a course.

    Handles public courses (``viewVCourse``), both shared-course page layouts
    (``coursestatic``) and the ``viewCharacterDetail.action`` /
    ``changeforVideo.action`` pages.

    :raises Exception: when no video can be found for the given URL.
    """
    page_type_patt = r'showSectionNode\(this,(\d+),(\d+)\)'
    resid_courseid_patt = r'changeforvideo\(\'(\d+)\',\'(\d+)\',\'(\d+)\'\)'
    ep = 'http://www.icourses.cn/jpk/viewCharacterDetail.action?sectionId={}&courseId={}'
    change_for_video_ip = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}'
    video_list = []

    if 'viewVCourse' in url:
        for video in public_course_playlist(url):
            icourses_download(video[0], output_dir=output_dir, **kwargs)
        return
    elif 'coursestatic' in url:
        course_page = get_content(url)
        page_navi_vars = re.search(page_type_patt, course_page)

        if page_navi_vars is None:  # type 2 shared course
            video_list = icourses_playlist_new(url, course_page)
        else:  # type 1 shared course
            sec_page = get_content(ep.format(page_navi_vars.group(2), page_navi_vars.group(1)))
            video_list = re.findall(resid_courseid_patt, sec_page)
    elif 'viewCharacterDetail.action' in url or 'changeforVideo.action' in url:
        page = get_content(url)
        video_list = re.findall(resid_courseid_patt, page)

    if not video_list:
        raise Exception('Unknown url pattern')

    for video in video_list:
        video_url = change_for_video_ip.format(video[0], video[1])
        sleep(random.randint(0, 5))  # Prevent from blockage
        icourses_download(video_url, output_dir=output_dir, **kwargs)


def icourses_playlist_new(url, page=None):
    """Return the playlist of a "type 2" shared course.

    Walks the chapter entries of the course page, follows the same endpoints
    the page's JavaScript handlers use, and returns the playlist of the first
    mp4 resource found (see ``get_playlist``).

    :raises Exception: when no chapter or section yields a video.
    """
    # The chapter and section handlers in the js code hit the same endpoint.
    def fetch_detail(course_id, chap_id, mod):
        ep = 'http://www.icourses.cn/jpk/viewCharacterDetail2.action?courseId={}&characId={}&mod={}'
        return post_content(ep.format(course_id, chap_id, mod), post_data={})

    def show_sec(course_id, chap_id):
        ep = 'http://www.icourses.cn/jpk/getSectionNode.action?courseId={}&characId={}&mod=2'
        return post_content(ep.format(course_id, chap_id), post_data={})

    if page is None:
        page = get_content(url)

    chap_patt = r'<h3>.+?id="parent_row_(\d+)".+?onclick="(\w+)\((.+)\)"'
    to_chap_patt = r'this,(\d+),(\d+),(\d)'
    show_sec_patt = r'this,(\d+),(\d+)'
    res_patt = r'res_showResDetail\(\'(\d+)\',\'.+?\',\'\d+\',\'mp4\',\'(\d+)\'\)'
    sec_patt = r'ajaxtosection\(this,(\d+),(\d+),(\d+)\)'

    for _row_id, handler, handler_args in re.findall(chap_patt, page):
        if handler == 'ajaxtocharac':
            hit = re.search(to_chap_patt, handler_args)
            if hit is None:  # unexpected handler arguments: try the next chapter
                continue
            chap_page = fetch_detail(hit.group(1), hit.group(2), hit.group(3))
            resources = re.findall(res_patt, chap_page)
            if resources:
                return get_playlist(resources[0][0], resources[0][1])
        elif handler == 'showSectionNode2':
            hit = re.search(show_sec_patt, handler_args)
            if hit is None:
                continue
            node_page = show_sec(hit.group(1), hit.group(2))
            for sec in re.findall(sec_patt, node_page):
                sec_page = fetch_detail(sec[0], sec[1], sec[2])
                resources = re.findall(res_patt, sec_page)
                if resources:
                    return get_playlist(resources[0][0], resources[0][1])
    raise Exception("No video found in this playlist")


def get_playlist(res_id, course_id):
    """Return the ``changeforvideo(...)`` entries of a video page.

    Each entry is a 4-tuple: the three numeric arguments of the
    ``changeforvideo`` call followed by the link's ``title`` attribute.
    """
    ep = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}'
    req = get_content(ep.format(res_id, course_id))

    patt = r'<a.+?changeforvideo\(\'(\d+)\',\'(\d+)\',\'(\d+)\'\).+?title=\"(.+?)\"'
    return re.findall(patt, req)


class ICousesExactor(object):
    """Extracts title, API metadata and signed media URLs for one video page."""

    PLAYER_BASE_VER = '150606-1'
    ENCRYPT_MOD_VER = '151020'
    ENCRYPT_SALT = '3DAPmXsZ4o'  # It took really long time to find this...

    def __init__(self, url):
        """Fetch the page at ``url`` and initialise empty extraction state."""
        self.url = url
        self.title = ''
        self.flashvars = ''
        self.api_data = {}
        self.media_url = ''
        self.common_args = {}
        self.enc_mode = True
        self.page = get_content(self.url)

    def get_title(self):
        """Set ``self.title`` from the already-fetched page."""
        if 'viewVCourse' in self.url:
            self.title = public_course_get_title(self.url, self.page)
            return
        title_a_patt = r'<div class="con"> <a.*?>(.*?)</a>'
        title_b_patt = r'<div class="con"> <a.*?/a>((.|\n)*?)</div>'
        title_a = match1(self.page, title_a_patt).strip()
        title_b = match1(self.page, title_b_patt).strip()
        title = title_a + title_b
        title = re.sub('( +|\n|\t|\r| )', '', unescape_html(title).replace(' ', ''))
        self.title = title

    def get_flashvars(self):
        """Parse the page's ``flashvars`` object into ``self.flashvars``.

        :raises Exception: when the ``flashvars`` assignment is not in the page.
        """
        patt = r'var flashvars\s*=\s*(\{(?:.|\n)+?\});'
        hit = re.search(patt, self.page)
        if hit is None:
            raise Exception('Cannot find flashvars')
        flashvar_str = hit.group(1)

        uuid = re.search(r'uuid\s*:\s*\"?(\w+)\"?', flashvar_str).group(1)
        other = re.search(r'other\s*:\s*"(.*?)"', flashvar_str).group(1)
        isvc = re.search(r'IService\s*:\s*\'(.+?)\'', flashvar_str).group(1)

        player_time_patt = r'MPlayer.swf\?v\=(\d+)'
        player_time = re.search(player_time_patt, self.page).group(1)

        self.flashvars = dict(IService=isvc, uuid=uuid, other=other, v=player_time)

    def api_req(self, url):
        """Fetch the XML API response and store its metadata in ``self.api_data``.

        Each ``<metadata name=...>`` element becomes one ``name -> text`` entry.

        :raises Exception: when the ``result`` element's status is not ``success``.
        """
        xml_str = get_content(url)
        dom = parseString(xml_str)
        status = dom.getElementsByTagName('result')[0].getAttribute('status')
        if status != 'success':
            raise Exception('API returned fail')

        api_res = {}
        for m in dom.getElementsByTagName('metadata'):
            api_res[m.getAttribute('name')] = m.firstChild.nodeValue
        self.api_data = api_res

    def basic_extract(self):
        """Populate title, flashvars and API metadata for the page."""
        self.get_title()
        self.get_flashvars()
        api_req_url = '{}?{}'.format(self.flashvars['IService'], parse.urlencode(self.flashvars))
        self.api_req(api_req_url)

    def do_extract(self, received=0):
        """Run ``basic_extract`` and return a media URL starting at ``received``."""
        self.basic_extract()
        return self.generate_url(received)

    def update_url(self, received):
        """Return a media URL for the data following ``received`` bytes.

        Uses the ``media_url`` / ``common_args`` / ``enc_mode`` state left by
        the last ``generate_url`` call; in encrypted mode a fresh signature is
        requested for every URL.
        """
        args = self.common_args.copy()
        play_type = 'seek' if received else 'play'
        received = received if received else -1
        args['ls'] = play_type
        args['start'] = received + 1
        args['lt'] = self.get_date_str()
        if self.enc_mode:
            ssl_ts, sign = self.get_sign(self.media_url)
            extra_args = dict(h=sign, r=ssl_ts, p=self.__class__.ENCRYPT_MOD_VER)
            args.update(extra_args)
        return '{}?{}'.format(self.media_url, parse.urlencode(args))

    @classmethod
    def get_date_str(cls):
        """Return the current local time as unpadded ``month-day/hour:minute:second``."""
        # Built by hand: the '%-m' style no-padding flags of strftime are a
        # platform extension and are not available everywhere.
        now = datetime.datetime.now()
        return '{}-{}/{}:{}:{}'.format(now.month, now.day, now.hour, now.minute, now.second)

    def generate_url(self, received):
        """Resolve the media host, rebuild the shared query args, and return a media URL.

        Requires ``self.api_data`` to be populated (see ``basic_extract``).
        """
        media_host = self.get_media_host(self.api_data['host'])
        media_url = media_host + self.api_data['url']
        self.media_url = media_url

        common_args = dict(lv=self.__class__.PLAYER_BASE_VER)
        h = self.api_data.get('h')
        r = self.api_data.get('p', self.__class__.ENCRYPT_MOD_VER)

        if self.api_data['ssl'] != 'true':
            self.enc_mode = False
            common_args.update(dict(h=h, r=r))
        else:
            self.enc_mode = True
            common_args['p'] = self.__class__.ENCRYPT_MOD_VER
        self.common_args = common_args
        return self.update_url(received)

    def get_sign(self, media_url):
        """Return ``(timestamp, signature)`` for ``media_url``.

        The timestamp is derived from the media host's ``/ssl/ssl.shtml``
        response; the signature is the URL-safe base64 of the MD5 of
        salt + URL path + timestamp, with ``=`` padding stripped.
        """
        media_host = parse.urlparse(media_url).netloc
        ran = random.randint(0, 9999999)
        ssl_callback = get_content('http://{}/ssl/ssl.shtml?r={}'.format(media_host, ran)).split(',')
        ssl_ts = int(datetime.datetime.strptime(ssl_callback[1], "%b %d %H:%M:%S %Y").timestamp() + int(ssl_callback[0]))
        sign_this = self.__class__.ENCRYPT_SALT + parse.urlparse(media_url).path + str(ssl_ts)
        arg_h = base64.b64encode(hashlib.md5(bytes(sign_this, 'utf-8')).digest(), altchars=b'-_')
        return ssl_ts, arg_h.decode('utf-8').strip('=')

    def get_media_host(self, ori_host):
        """Return the host reported by ``<ori_host>/ssl/host.shtml`` plus ``ori_host``'s path."""
        res = get_content(ori_host + '/ssl/host.shtml').strip()
        path = parse.urlparse(ori_host).path
        return ''.join([res, path])


def download_urls_icourses(url, title, ext, total_size, output_dir='.', headers=None, **kwargs):
    """Download one icourses media file to ``<output_dir>/<title>.<ext>``.

    Skips the download when the target exists and ``force`` is not set.
    Extra keyword arguments are forwarded to ``url_save_icourses``.
    """
    if dry_run or player:
        log.wtf('Non standard protocol')

    title = get_filename(title)

    filename = '%s.%s' % (title, ext)
    filepath = os.path.join(output_dir, filename)
    if not force and os.path.exists(filepath):
        print('Skipping {}: file already exists\n'.format(filepath))
        return
    bar = SimpleProgressBar(total_size, 1)
    print('Downloading %s ...' % tr(filename))
    url_save_icourses(url, filepath, bar, total_size, headers=headers, **kwargs)
    bar.done()

    print()


def url_save_icourses(url, filepath, bar, total_size, dyn_callback=None, is_part=False, max_size=0, headers=None):
    """Stream ``url`` into ``filepath``, optionally in ``max_size``-byte segments.

    Data is written to ``filepath + '.download'`` and renamed on completion.
    Unless ``force`` is set, an existing partial file is resumed.

    :param bar: progress bar; a ``DummyProgressBar`` is used when None.
    :param total_size: expected size of the complete file in bytes.
    :param dyn_callback: ``callable(received) -> url`` giving the URL for the
        data following ``received`` bytes; used when resuming and whenever a
        segment of ``max_size`` bytes has been read.
    :param max_size: segment size in bytes; 0 disables segment switching.
    :param headers: HTTP headers for every request (none when None).
    """
    def dyn_update_url(received):
        if callable(dyn_callback):
            logging.debug('Calling callback %s for new URL from %s' % (dyn_callback.__name__, received))
            return dyn_callback(received)

    if bar is None:
        bar = DummyProgressBar()
    if os.path.exists(filepath):
        if not force:
            if not is_part:
                bar.done()
                print('Skipping %s: file already exists' % tr(os.path.basename(filepath)))
            else:
                filesize = os.path.getsize(filepath)
                bar.update_received(filesize)
            return
        else:
            if not is_part:
                bar.done()
                print('Overwriting %s' % os.path.basename(filepath), '...')
    else:
        target_dir = os.path.dirname(filepath)
        # A bare file name has no directory part; the target may also need
        # several directory levels created.
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    temp_filepath = filepath + '.download'
    received = 0
    if not force:
        open_mode = 'ab'

        if os.path.exists(temp_filepath):
            tempfile_size = os.path.getsize(temp_filepath)
            received += tempfile_size
            bar.update_received(tempfile_size)
    else:
        open_mode = 'wb'

    if received:
        url = dyn_update_url(received)

    if headers is None:
        headers = {}
    response = urlopen_with_retry(request.Request(url, headers=headers))
    # Do not update content-length here.
    # Only the 1st segment's content-length is the content-length of the file.
    # For other segments, content-length is the standard one, 15 * 1024 * 1024

    update_bs = 256 * 1024
    try:
        with open(temp_filepath, open_mode) as output:
            before_this_uri = received
            # received - before_this_uri is size of the buf we get from one uri
            while True:
                # calc the block size to read -- The server can fail to send an EOF
                left_bytes = total_size - received
                to_read = left_bytes if left_bytes <= update_bs else update_bs
                buffer = response.read(to_read)
                if not buffer:
                    logging.debug('Got EOF from server')
                    break
                output.write(buffer)
                received += len(buffer)
                bar.update_received(len(buffer))
                if received >= total_size:
                    break
                if max_size and (received - before_this_uri) >= max_size:
                    url = dyn_update_url(received)
                    before_this_uri = received
                    response.close()  # done with the previous segment
                    response = urlopen_with_retry(request.Request(url, headers=headers))
    finally:
        response.close()

    assert received == os.path.getsize(temp_filepath), '%s == %s' % (received, os.path.getsize(temp_filepath))

    if os.access(filepath, os.W_OK):
        os.remove(filepath)  # on Windows rename could fail if destination filepath exists
    os.rename(temp_filepath, filepath)


site_info = 'icourses.cn'
download = icourses_download
download_playlist = icourses_playlist_download