1#!/usr/bin/env python
2from ..common import *
3from urllib import parse, error
4import random
5from time import sleep
6import datetime
7import hashlib
8import base64
9import logging
10import re
11from xml.dom.minidom import parseString
12
# Names exported by a star-import of this module: the single-video and the
# playlist entry points.
__all__ = ['icourses_download', 'icourses_playlist_download']
14
15
def icourses_download(url, output_dir='.', **kwargs):
    """Download one icourses.cn video, or only print its information.

    Args:
        url: Video page URL. A ``showResDetail.action`` URL is rewritten to
            the equivalent ``changeforVideo.action`` URL before use.
        output_dir: Directory the video file is written to.
        **kwargs: Only ``info_only`` is read; when truthy, nothing is
            downloaded after the info line is printed. A missing key is
            treated as false.

    Raises:
        ValueError: A ``showResDetail.action`` URL carries no
            ``id=...&courseId=...`` pair.
        Exception: The URL is a course index page (playlist mode is needed),
            or the file size could not be obtained after five attempts.
    """
    if 'showResDetail.action' in url:
        hit = re.search(r'id=(\d+)&courseId=(\d+)', url)
        if hit is None:
            raise ValueError('Cannot find id and courseId in showResDetail.action url: {}'.format(url))
        url = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}'.format(hit.group(1), hit.group(2))
    # Dots are escaped so only the literal host/extension match.
    if re.match(r'http://www\.icourses\.cn/coursestatic/course_(\d+)\.html', url):
        raise Exception('You can download it with -l flag')

    icourses_parser = ICousesExactor(url=url)
    icourses_parser.basic_extract()
    title = icourses_parser.title

    size = None
    for _attempt in range(5):
        try:
            # use this url only for size
            size_url = icourses_parser.generate_url(0)
            _, type_, size = url_info(size_url, headers=fake_headers)
        except error.HTTPError:
            logging.warning('Failed to fetch the video file! Retrying...')
            sleep(random.randint(2, 5))  # Prevent from blockage
        else:
            print_info(site_info, title, type_, size)
            break

    if size is None:
        raise Exception("Failed")

    if not kwargs.get('info_only'):
        real_url = icourses_parser.update_url(0)
        # The Referer header used to be built and then dropped; it is now
        # actually handed to the downloader.
        headers = fake_headers.copy()
        headers['Referer'] = url
        download_urls_icourses(
            real_url, title, 'flv',
            total_size=size,
            output_dir=output_dir,
            headers=headers,
            max_size=15728640,  # 15 MiB per signed URL, see url_save_icourses
            dyn_callback=icourses_parser.update_url,
        )
    return
47
48
def get_course_title(url, course_type, page=None):
    """Return the course title scraped from a course page.

    Args:
        url: Page URL; only fetched when ``page`` is not supplied.
        course_type: ``'shared_old'`` or ``'shared_new'`` select the shared
            course layouts; any other value selects the ``<div class="con">``
            layout (callers in this file pass ``'public'``).
        page: Already-downloaded page text, or ``None`` to download it.

    Raises:
        ValueError: No title matching the chosen layout is present.
    """
    if page is None:
        # Download once and decode the same bytes twice if needed: a shared
        # course page could be gbk but declare charset="utf-8".
        raw_page = get_content(url, decoded=False)
        try:
            page = raw_page.decode('gbk')
        except UnicodeDecodeError:
            page = raw_page.decode('utf8')

    if course_type == 'shared_old':
        patt = r'<div\s+class="top_left_til">(.+?)<\/div>'
    elif course_type == 'shared_new':
        patt = r'<h1>(.+?)<\/h1>'
    else:
        patt = r'<div\s+class="con">(.+?)<\/div>'

    hit = re.search(patt, page)
    if hit is None:
        raise ValueError('Cannot find course title ({}) in {}'.format(course_type, url))
    return hit.group(1)
65
66
def public_course_playlist(url, page=None):
    """List the lectures linked from a public course page.

    Args:
        url: Page URL; only fetched when ``page`` is ``None``.
        page: Already-downloaded page text, or ``None``.

    Returns:
        A list of ``(absolute_url, title)`` tuples, one per ``<a>`` element
        that carries both ``href`` and ``title``, in page order.
    """
    if page is None:
        page = get_content(url)

    link_patt = r'<a href="(.+?)"\s*title="(.+?)".+?>(?:.|\n)+?</a>'
    site_root = 'http://www.icourses.cn/'

    lectures = []
    for href, lecture_title in re.findall(link_patt, page):
        # The captured href is appended to the site root as-is.
        lectures.append((site_root + href, lecture_title))
    return lectures
75
76
def public_course_get_title(url, page=None):
    """Build the display title of one lecture of a public course.

    The lecture number is read from the ``kcslbut`` block of the page, the
    course name from ``get_course_title`` and the lecture name from the
    matching entry of ``public_course_playlist``.

    Args:
        url: Page URL; only fetched when ``page`` is ``None``.
        page: Already-downloaded page text, or ``None``.

    Returns:
        ``'<course>_第<n>讲_<lecture title>'``.
    """
    if page is None:
        page = get_content(url)

    number_hit = re.search(r'<div\s*class="kcslbut">.+?第(\d+)讲', page)
    lecture_no = int(number_hit.group(1))  # 1-based as shown on the page
    course_name = get_course_title(url, 'public', page)
    lecture_name = public_course_playlist(url, page)[lecture_no - 1][1]
    return '{}_第{}讲_{}'.format(course_name, lecture_no, lecture_name)
85
86
def icourses_playlist_download(url, output_dir='.', **kwargs):
    """Download every video of an icourses.cn course.

    Three kinds of URL are recognised:

    * ``viewVCourse`` pages: every lecture linked from the page is
      downloaded directly.
    * ``coursestatic`` pages: the video list comes either from the section
      page referenced by ``showSectionNode(...)`` or, when that call is
      absent, from ``icourses_playlist_new``.
    * ``viewCharacterDetail.action`` / ``changeforVideo.action`` pages: the
      video list is read from the page itself.

    Args:
        url: Course or section page URL.
        output_dir: Directory forwarded to ``icourses_download``.
        **kwargs: Forwarded unchanged to ``icourses_download``.

    Raises:
        Exception: No video list could be obtained for ``url``.
    """
    video_call_patt = r'changeforvideo\(\'(\d+)\',\'(\d+)\',\'(\d+)\'\)'
    video_page_fmt = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}'

    if 'viewVCourse' in url:
        for lecture_url, _lecture_title in public_course_playlist(url):
            icourses_download(lecture_url, output_dir=output_dir, **kwargs)
        return

    videos = []
    if 'coursestatic' in url:
        course_page = get_content(url)
        navi = re.search(r'showSectionNode\(this,(\d+),(\d+)\)', course_page)
        if navi is not None:  # type 1 shared course
            section_fmt = 'http://www.icourses.cn/jpk/viewCharacterDetail.action?sectionId={}&courseId={}'
            section_page = get_content(section_fmt.format(navi.group(2), navi.group(1)))
            videos = re.findall(video_call_patt, section_page)
        else:  # type 2 shared course
            videos = icourses_playlist_new(url, course_page)
    elif 'viewCharacterDetail.action' in url or 'changeforVideo.action' in url:
        videos = re.findall(video_call_patt, get_content(url))

    if not videos:
        raise Exception('Unknown url pattern')

    for entry in videos:
        # entry[0] is the resource id, entry[1] the course id.
        target = video_page_fmt.format(entry[0], entry[1])
        sleep(random.Random().randint(0, 5))  # Prevent from blockage
        icourses_download(target, output_dir=output_dir, **kwargs)
119
120
def icourses_playlist_new(url, page=None):
    """Find the video list of a "type 2" shared course.

    The chapter headers (``<h3>`` elements with a ``parent_row_<id>`` id and
    an ``onclick`` handler) are walked in page order. For the first chapter
    or section whose detail page references an mp4 resource, the playlist of
    that resource is returned.

    Args:
        url: Course page URL; only fetched when ``page`` is ``None``.
        page: Already-downloaded course page text, or ``None``.

    Returns:
        Whatever ``get_playlist`` returns for the first mp4 resource found.

    Raises:
        Exception: No chapter or section references an mp4 resource.
    """
    # The chapter and section handlers in the site's js code hit the same
    # endpoint with the same arguments, so one helper serves both.
    def fetch_detail(course_id, node_id, mod):
        ep = 'http://www.icourses.cn/jpk/viewCharacterDetail2.action?courseId={}&characId={}&mod={}'
        return post_content(ep.format(course_id, node_id, mod), post_data={})

    def fetch_sections(course_id, chap_id):
        ep = 'http://www.icourses.cn/jpk/getSectionNode.action?courseId={}&characId={}&mod=2'
        return post_content(ep.format(course_id, chap_id), post_data={})

    if page is None:
        page = get_content(url)

    chap_patt = r'<h3>.+?id="parent_row_(\d+)".+?onclick="(\w+)\((.+)\)"'
    to_chap_patt = r'this,(\d+),(\d+),(\d)'
    show_sec_patt = r'this,(\d+),(\d+)'
    to_sec_patt = r'ajaxtosection\(this,(\d+),(\d+),(\d+)\)'
    res_patt = r'res_showResDetail\(\'(\d+)\',\'.+?\',\'\d+\',\'mp4\',\'(\d+)\'\)'

    for _row_id, handler, handler_args in re.findall(chap_patt, page):
        if handler == 'ajaxtocharac':
            hit = re.search(to_chap_patt, handler_args)
            if hit is None:  # malformed handler arguments: try the next chapter
                continue
            videos = re.findall(res_patt, fetch_detail(hit.group(1), hit.group(2), hit.group(3)))
            if videos:
                return get_playlist(videos[0][0], videos[0][1])
        elif handler == 'showSectionNode2':
            hit = re.search(show_sec_patt, handler_args)
            if hit is None:
                continue
            sections_page = fetch_sections(hit.group(1), hit.group(2))
            for sec_args in re.findall(to_sec_patt, sections_page):
                videos = re.findall(res_patt, fetch_detail(sec_args[0], sec_args[1], sec_args[2]))
                if videos:
                    return get_playlist(videos[0][0], videos[0][1])
    raise Exception("No video found in this playlist")
167
168
def get_playlist(res_id, course_id):
    """Return the video entries listed on a ``changeforVideo.action`` page.

    Args:
        res_id: Resource id placed in the ``resId`` query argument.
        course_id: Course id placed in the ``courseId`` query argument.

    Returns:
        A list of 4-tuples: the three arguments of each
        ``changeforvideo(...)`` call followed by the link's ``title``.
    """
    page_url = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}'.format(res_id, course_id)
    listing = get_content(page_url)
    entry_patt = r'<a.+?changeforvideo\(\'(\d+)\',\'(\d+)\',\'(\d+)\'\).+?title=\"(.+?)\"'
    return re.findall(entry_patt, listing)
175
176
class ICousesExactor(object):
    """Scrape one icourses.cn video page and build its media URLs.

    Usage in this file: construct with the page URL, call
    ``basic_extract()``, then ``generate_url()`` for a first URL and
    ``update_url()`` for each further URL.
    """

    PLAYER_BASE_VER = '150606-1'  # sent as the ``lv`` query argument
    ENCRYPT_MOD_VER = '151020'    # sent as ``p``; also the fallback for ``r``
    ENCRYPT_SALT = '3DAPmXsZ4o'  # It took really long time to find this...

    def __init__(self, url):
        """Remember ``url`` and download its page (network access)."""
        self.url = url
        self.title = ''
        self.flashvars = {}
        self.api_data = {}
        self.media_url = ''
        self.common_args = {}
        self.enc_mode = True
        self.page = get_content(self.url)

    def get_title(self):
        """Fill ``self.title`` from the downloaded page."""
        if 'viewVCourse' in self.url:
            self.title = public_course_get_title(self.url, self.page)
            return
        # The title is split over the link text and the text following the
        # link inside the same <div class="con">.
        title_a_patt = r'<div class="con"> <a.*?>(.*?)</a>'
        title_b_patt = r'<div class="con"> <a.*?/a>((.|\n)*?)</div>'
        title_a = match1(self.page, title_a_patt).strip()
        title_b = match1(self.page, title_b_patt).strip()
        title = title_a + title_b
        # Drop every kind of whitespace, including literal "&nbsp;" text.
        title = re.sub('( +|\n|\t|\r|&nbsp;)', '', unescape_html(title).replace(' ', ''))
        self.title = title

    def get_flashvars(self):
        """Fill ``self.flashvars`` from the page's ``var flashvars = {...};``.

        Raises:
            Exception: The page has no ``flashvars`` assignment.
        """
        patt = r'var flashvars\s*=\s*(\{(?:.|\n)+?\});'
        hit = re.search(patt, self.page)
        if hit is None:
            raise Exception('Cannot find flashvars')
        flashvar_str = hit.group(1)

        uuid = re.search(r'uuid\s*:\s*\"?(\w+)\"?', flashvar_str).group(1)
        other = re.search(r'other\s*:\s*"(.*?)"', flashvar_str).group(1)
        isvc = re.search(r'IService\s*:\s*\'(.+?)\'', flashvar_str).group(1)

        # The number after "MPlayer.swf?v=" is sent back as ``v``.
        player_time_patt = r'MPlayer.swf\?v\=(\d+)'
        player_time = re.search(player_time_patt, self.page).group(1)

        self.flashvars = dict(IService=isvc, uuid=uuid, other=other, v=player_time)

    def api_req(self, url):
        """Fetch the XML at ``url`` and store its metadata in ``self.api_data``.

        Each ``<metadata name="...">`` element becomes one entry; an empty
        element maps to ``''``.

        Raises:
            Exception: The ``status`` of the first ``<result>`` element is
                not ``'success'``.
        """
        xml_str = get_content(url)
        dom = parseString(xml_str)
        status = dom.getElementsByTagName('result')[0].getAttribute('status')
        if status != 'success':
            raise Exception('API returned fail')

        api_res = {}
        for node in dom.getElementsByTagName('metadata'):
            first = node.firstChild
            # An empty <metadata/> has no child node at all.
            api_res[node.getAttribute('name')] = first.nodeValue if first is not None else ''
        self.api_data = api_res

    def basic_extract(self):
        """Run title, flashvars and API extraction in that order."""
        self.get_title()
        self.get_flashvars()
        api_req_url = '{}?{}'.format(self.flashvars['IService'], parse.urlencode(self.flashvars))
        self.api_req(api_req_url)

    def do_extract(self, received=0):
        """Run ``basic_extract()`` and return ``generate_url(received)``."""
        self.basic_extract()
        return self.generate_url(received)

    def update_url(self, received):
        """Return a media URL for a request continuing after ``received``.

        ``generate_url()`` must have run before, since ``self.media_url`` and
        ``self.common_args`` are read here.

        Args:
            received: Count already downloaded; ``0`` (or any falsy value)
                asks for playback from the start (``ls=play``, ``start=0``),
                anything else for a seek (``ls=seek``,
                ``start=received + 1``).
        """
        args = self.common_args.copy()
        args['ls'] = 'seek' if received else 'play'
        args['start'] = received + 1 if received else 0
        args['lt'] = self.get_date_str()
        if self.enc_mode:
            # Signed mode: every URL needs a fresh timestamp and signature.
            ssl_ts, sign = self.get_sign(self.media_url)
            args.update(h=sign, r=ssl_ts, p=self.ENCRYPT_MOD_VER)
        return '{}?{}'.format(self.media_url, parse.urlencode(args))

    @classmethod
    def get_date_str(cls):
        """Return the local time as ``month-day/hour:minute:second``.

        No component is zero padded. The string is formatted from the
        ``datetime`` fields directly rather than with the platform-specific
        ``%-m`` style strftime flags, so the result is the same everywhere.
        """
        now = datetime.datetime.now()
        return '{}-{}/{}:{}:{}'.format(now.month, now.day, now.hour, now.minute, now.second)

    def generate_url(self, received):
        """Resolve the media host, set up the query arguments, return a URL.

        Reads ``host``, ``url``, ``ssl`` and optionally ``h`` / ``p`` from
        ``self.api_data`` and stores ``self.media_url``, ``self.enc_mode``
        and ``self.common_args`` for later ``update_url()`` calls.
        """
        media_host = self.get_media_host(self.api_data['host'])
        self.media_url = media_host + self.api_data['url']

        common_args = dict(lv=self.PLAYER_BASE_VER)
        h = self.api_data.get('h')
        # NOTE(review): the ``r`` argument is filled from the API's ``p``
        # entry; kept as in the original, confirm this is intended.
        r = self.api_data.get('p', self.ENCRYPT_MOD_VER)

        if self.api_data['ssl'] != 'true':
            self.enc_mode = False
            common_args.update(dict(h=h, r=r))
        else:
            self.enc_mode = True
            common_args['p'] = self.ENCRYPT_MOD_VER
        self.common_args = common_args
        return self.update_url(received)

    def get_sign(self, media_url):
        """Return ``(timestamp, signature)`` for a signed media request.

        ``/ssl/ssl.shtml`` on the media host is expected to answer with
        ``"<seconds>,<date>"``; the date (format ``%b %d %H:%M:%S %Y``) is
        converted to a timestamp and the seconds are added to it. The
        signature is the URL-safe base64 MD5 digest of salt + URL path +
        timestamp, with the ``=`` padding stripped.
        """
        parsed = parse.urlparse(media_url)
        ran = random.randint(0, 9999999)  # cache-busting query argument
        ssl_callback = get_content('http://{}/ssl/ssl.shtml?r={}'.format(parsed.netloc, ran)).split(',')
        ssl_ts = int(datetime.datetime.strptime(ssl_callback[1], "%b %d %H:%M:%S %Y").timestamp() + int(ssl_callback[0]))
        sign_this = self.ENCRYPT_SALT + parsed.path + str(ssl_ts)
        arg_h = base64.b64encode(hashlib.md5(sign_this.encode('utf-8')).digest(), altchars=b'-_')
        return ssl_ts, arg_h.decode('utf-8').strip('=')

    def get_media_host(self, ori_host):
        """Return the host answered by ``<ori_host>/ssl/host.shtml`` followed
        by the path component of ``ori_host``."""
        res = get_content(ori_host + '/ssl/host.shtml').strip()
        path = parse.urlparse(ori_host).path
        return ''.join([res, path])
300
301
def download_urls_icourses(url, title, ext, total_size, output_dir='.', headers=None, **kwargs):
    """Download one icourses.cn video to ``<output_dir>/<title>.<ext>``.

    Args:
        url: First media URL.
        title: Video title; passed through ``get_filename`` before use.
        ext: File extension without the dot.
        total_size: Expected file size, used for the progress bar and by
            ``url_save_icourses``.
        output_dir: Target directory.
        headers: Optional HTTP headers forwarded to ``url_save_icourses``.
        **kwargs: Forwarded unchanged to ``url_save_icourses``.
    """
    if dry_run or player:
        # This site cannot be handed to a player or listed in a dry run.
        log.wtf('Non standard protocol')

    filename = '%s.%s' % (get_filename(title), ext)
    filepath = os.path.join(output_dir, filename)

    already_there = not force and os.path.exists(filepath)
    if already_there:
        print('Skipping {}: file already exists\n'.format(filepath))
        return

    progress = SimpleProgressBar(total_size, 1)
    print('Downloading %s ...' % tr(filename))
    url_save_icourses(url, filepath, progress, total_size, headers=headers, **kwargs)
    progress.done()

    print()
319
320
def url_save_icourses(url, filepath, bar, total_size, dyn_callback=None, is_part=False, max_size=0, headers=None):
    """Stream a media file to ``filepath``, switching URLs while downloading.

    Data is written to ``filepath + '.download'`` and renamed to
    ``filepath`` at the end. Unless ``force`` is set, an existing temporary
    file is appended to and a fresh URL for the resume position is requested
    from ``dyn_callback``.

    Args:
        url: First URL to read from.
        filepath: Final path of the downloaded file.
        bar: Progress bar, or ``None`` to use a ``DummyProgressBar``.
        total_size: Size of the complete file; reading stops once this many
            bytes have been received.
        dyn_callback: Callable taking the number of bytes received so far
            and returning the URL to continue from. It is needed whenever a
            download is resumed or ``max_size`` is exceeded; without it the
            next URL is ``None``.
        is_part: When true and ``filepath`` already exists (without
            ``force``), its size is added to the progress bar instead of a
            "Skipping" message being printed.
        max_size: Number of bytes after which a new URL is requested;
            ``0`` disables URL switching.
        headers: HTTP headers for every request; defaults to none.
    """
    def dyn_update_url(received):
        if callable(dyn_callback):
            logging.debug('Calling callback %s for new URL from %s' % (dyn_callback.__name__, received))
            return dyn_callback(received)

    if bar is None:
        bar = DummyProgressBar()
    if os.path.exists(filepath):
        if not force:
            if not is_part:
                bar.done()
                print('Skipping %s: file already exists' % tr(os.path.basename(filepath)))
            else:
                filesize = os.path.getsize(filepath)
                bar.update_received(filesize)
            return
        else:
            if not is_part:
                bar.done()
                print('Overwriting %s' % os.path.basename(filepath), '...')
    else:
        # A bare filename has an empty dirname (nothing to create), and the
        # target directory may be missing several levels deep.
        target_dir = os.path.dirname(filepath)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    temp_filepath = filepath + '.download'
    received = 0
    if not force:
        open_mode = 'ab'

        if os.path.exists(temp_filepath):
            # Resume: count what is already on disk.
            tempfile_size = os.path.getsize(temp_filepath)
            received += tempfile_size
            bar.update_received(tempfile_size)
    else:
        open_mode = 'wb'

    if received:
        url = dyn_update_url(received)

    if headers is None:
        headers = {}
    response = urlopen_with_retry(request.Request(url, headers=headers))
    # Do not update content-length here.
    # Only the 1st segment's content-length is the content-length of the file.
    # For other segments, content-length is the standard one, 15 * 1024 * 1024

    with open(temp_filepath, open_mode) as output:
        # received - before_this_uri is size of the buf we get from one uri
        before_this_uri = received
        while True:
            update_bs = 256 * 1024
            left_bytes = total_size - received
            # calc the block size to read -- The server can fail to send an EOF
            to_read = left_bytes if left_bytes <= update_bs else update_bs
            buffer = response.read(to_read)
            if not buffer:
                logging.debug('Got EOF from server')
                break
            output.write(buffer)
            received += len(buffer)
            bar.update_received(len(buffer))
            if received >= total_size:
                break
            if max_size and (received - before_this_uri) >= max_size:
                # This URL has served its share; continue from a new one.
                url = dyn_update_url(received)
                before_this_uri = received
                response = urlopen_with_retry(request.Request(url, headers=headers))

    assert received == os.path.getsize(temp_filepath), '%s == %s' % (received, os.path.getsize(temp_filepath))

    if os.access(filepath, os.W_OK):
        os.remove(filepath)  # on Windows rename could fail if destination filepath exists
    os.rename(temp_filepath, filepath)
393
# Site name passed to print_info() by icourses_download, followed by generic
# aliases for this module's two entry points (presumably the names a caller
# outside this file looks up -- confirm against the dispatcher).
site_info = 'icourses.cn'
download = icourses_download
download_playlist = icourses_playlist_download
397