1# -*- coding: utf-8 -*-
2from __future__ import absolute_import
3import io
4import logging
5import os
6import zipfile
7import re
8import copy
9
10try:
11    from urlparse import urljoin
12except ImportError:
13    from urllib.parse import urljoin
14
15import rarfile
16from babelfish import language_converters
17from subzero.language import Language
18from guessit import guessit
19from requests import Session
20from six import text_type
21from random import randint
22
23from subliminal.providers import ParserBeautifulSoup
24from subliminal_patch.providers import Provider
25from subliminal.subtitle import (
26    SUBTITLE_EXTENSIONS,
27    fix_line_ending
28    )
29from subliminal_patch.subtitle import (
30    Subtitle,
31    guess_matches
32)
33from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST
34from subliminal.video import Episode, Movie
35
36logger = logging.getLogger(__name__)
37
38language_converters.register('zimuku = subliminal_patch.converters.zimuku:zimukuConverter')
39
40supported_languages = list(language_converters['zimuku'].to_zimuku.keys())
41
42class ZimukuSubtitle(Subtitle):
43    """Zimuku Subtitle."""
44
45    provider_name = "zimuku"
46
47    def __init__(self, language, page_link, version, session, year):
48        super(ZimukuSubtitle, self).__init__(language, page_link=page_link)
49        self.version = version
50        self.release_info = version
51        self.hearing_impaired = False
52        self.encoding = "utf-8"
53        self.session = session
54        self.year = year
55
56    @property
57    def id(self):
58        return self.page_link
59
60    def get_matches(self, video):
61        matches = set()
62
63        if video.year == self.year:
64            matches.add('year')
65
66        # episode
67        if isinstance(video, Episode):
68            info = guessit(self.version, {"type": "episode"})
69            # other properties
70            matches |= guess_matches(video, info)
71
72            # add year to matches if video doesn't have a year but series, season and episode are matched
73            if not video.year and all(item in matches for item in ['series', 'season', 'episode']):
74                matches |= {'year'}
75        # movie
76        elif isinstance(video, Movie):
77            # other properties
78            matches |= guess_matches(video, guessit(self.version, {"type": "movie"}))
79
80        return matches
81
82
83class ZimukuProvider(Provider):
84    """Zimuku Provider."""
85
86    languages = {Language(*l) for l in supported_languages}
87    video_types = (Episode, Movie)
88    logger.info(str(supported_languages))
89
90    server_url = "http://zimuku.org"
91    search_url = "/search?q={}"
92    download_url = "http://zimuku.org/"
93
94    subtitle_class = ZimukuSubtitle
95
96    def __init__(self):
97        self.session = None
98
99    def initialize(self):
100        self.session = Session()
101        self.session.headers["User-Agent"] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)]
102
103    def terminate(self):
104        self.session.close()
105
106    def _parse_episode_page(self, link, year):
107        r = self.session.get(link)
108        bs_obj = ParserBeautifulSoup(
109            r.content.decode("utf-8", "ignore"), ["html.parser"]
110        )
111        subs_body = bs_obj.find("div", class_="subs box clearfix").find("tbody")
112        subs = []
113        for sub in subs_body.find_all("tr"):
114            a = sub.find("a")
115            name = _extract_name(a.text)
116            name = os.path.splitext(name)[
117                0
118            ]  # remove ext because it can be an archive type
119
120            language = Language("eng")
121            for img in sub.find("td", class_="tac lang").find_all("img"):
122                if (
123                    "china" in img.attrs["src"]
124                    and "hongkong" in img.attrs["src"]
125                    ):
126                    language = Language("zho").add(Language('zho', 'TW', None))
127                    logger.debug("language:"+str(language))
128                elif (
129                    "china" in img.attrs["src"]
130                    or "jollyroger" in img.attrs["src"]
131                ):
132                    language = Language("zho")
133                elif "hongkong" in img.attrs["src"]:
134                    language =  Language('zho', 'TW', None)
135                    break
136            sub_page_link = urljoin(self.server_url, a.attrs["href"])
137            backup_session = copy.deepcopy(self.session)
138            backup_session.headers["Referer"] = link
139
140            subs.append(
141                self.subtitle_class(language, sub_page_link, name, backup_session, year)
142            )
143
144        return subs
145
146    def query(self, keyword, season=None, episode=None, year=None):
147        params = keyword
148        if season:
149            params += ".S{season:02d}".format(season=season)
150        elif year:
151            params += " {:4d}".format(year)
152
153        logger.debug("Searching subtitles %r", params)
154        subtitles = []
155        search_link = self.server_url + text_type(self.search_url).format(params)
156
157        r = self.session.get(search_link, timeout=30)
158        r.raise_for_status()
159
160        if not r.content:
161            logger.debug("No data returned from provider")
162            return []
163
164        html = r.content.decode("utf-8", "ignore")
165        # parse window location
166        pattern = r"url\s*=\s*'([^']*)'\s*\+\s*url"
167        parts = re.findall(pattern, html)
168        redirect_url = search_link
169        while parts:
170            parts.reverse()
171            redirect_url = urljoin(self.server_url, "".join(parts))
172            r = self.session.get(redirect_url, timeout=30)
173            html = r.content.decode("utf-8", "ignore")
174            parts = re.findall(pattern, html)
175        logger.debug("search url located: " + redirect_url)
176
177        soup = ParserBeautifulSoup(
178            r.content.decode("utf-8", "ignore"), ["lxml", "html.parser"]
179        )
180
181        # non-shooter result page
182        if soup.find("div", {"class": "item"}):
183            logger.debug("enter a non-shooter page")
184            for item in soup.find_all("div", {"class": "item"}):
185                title_a = item.find("p", class_="tt clearfix").find("a")
186                subs_year = year
187                if season:
188                    # episode year in zimuku is the season's year not show's year
189                    actual_subs_year = re.findall(r"\d{4}", title_a.text) or None
190                    if actual_subs_year:
191                        subs_year = int(actual_subs_year[0]) - season + 1
192                    title = title_a.text
193                    season_cn1 = re.search("第(.*)季", title)
194                    if not season_cn1:
195                        season_cn1 = "一"
196                    else:
197                        season_cn1 = season_cn1.group(1).strip()
198                    season_cn2 = num_to_cn(str(season))
199                    if season_cn1 != season_cn2:
200                        continue
201                episode_link = self.server_url + title_a.attrs["href"]
202                new_subs = self._parse_episode_page(episode_link, subs_year)
203                subtitles += new_subs
204
205        # NOTE: shooter result pages are ignored due to the existence of zimuku provider
206
207        return subtitles
208
209    def list_subtitles(self, video, languages):
210        if isinstance(video, Episode):
211            titles = [video.series] + video.alternative_series
212        elif isinstance(video, Movie):
213            titles = [video.title] + video.alternative_titles
214        else:
215            titles = []
216
217        subtitles = []
218        # query for subtitles with the show_id
219        for title in titles:
220            if isinstance(video, Episode):
221                subtitles += [
222                    s
223                    for s in self.query(
224                        title,
225                        season=video.season,
226                        episode=video.episode,
227                        year=video.year,
228                    )
229                    if s.language in languages
230                ]
231            elif isinstance(video, Movie):
232                subtitles += [
233                    s
234                    for s in self.query(title, year=video.year)
235                    if s.language in languages
236                ]
237
238        return subtitles
239
240    def download_subtitle(self, subtitle):
241        def _get_archive_dowload_link(session, sub_page_link):
242            r = session.get(sub_page_link)
243            bs_obj = ParserBeautifulSoup(
244                r.content.decode("utf-8", "ignore"), ["html.parser"]
245            )
246            down_page_link = bs_obj.find("a", {"id": "down1"}).attrs["href"]
247            down_page_link = urljoin(sub_page_link, down_page_link)
248            r = session.get(down_page_link)
249            bs_obj = ParserBeautifulSoup(
250                r.content.decode("utf-8", "ignore"), ["html.parser"]
251            )
252            download_link = bs_obj.find("a", {"rel": "nofollow"})
253            download_link = download_link.attrs["href"]
254            download_link = urljoin(sub_page_link, download_link)
255            return download_link
256
257        # download the subtitle
258        logger.info("Downloading subtitle %r", subtitle)
259        self.session = subtitle.session
260        download_link = _get_archive_dowload_link(self.session, subtitle.page_link)
261        r = self.session.get(download_link, headers={'Referer': subtitle.page_link}, timeout=30)
262        r.raise_for_status()
263        try:
264            filename = r.headers["Content-Disposition"]
265        except KeyError:
266            logger.debug("Unable to parse subtitles filename. Dropping this subtitles.")
267            return
268
269        if not r.content:
270            logger.debug("Unable to download subtitle. No data returned from provider")
271            return
272
273        archive_stream = io.BytesIO(r.content)
274        archive = None
275        if rarfile.is_rarfile(archive_stream):
276            logger.debug("Identified rar archive")
277            if ".rar" not in filename:
278                logger.debug(
279                    ".rar should be in the downloaded file name: {}".format(filename)
280                )
281                return
282            archive = rarfile.RarFile(archive_stream)
283            subtitle_content = _get_subtitle_from_archive(archive)
284        elif zipfile.is_zipfile(archive_stream):
285            logger.debug("Identified zip archive")
286            if ".zip" not in filename:
287                logger.debug(
288                    ".zip should be in the downloaded file name: {}".format(filename)
289                )
290                return
291            archive = zipfile.ZipFile(archive_stream)
292            subtitle_content = _get_subtitle_from_archive(archive)
293        else:
294            is_sub = ""
295            for sub_ext in SUBTITLE_EXTENSIONS:
296                if sub_ext in filename:
297                    is_sub = sub_ext
298                    break
299            if not is_sub:
300                logger.debug(
301                    "unknown subtitle ext int downloaded file name: {}".format(filename)
302                )
303                return
304            logger.debug("Identified {} file".format(is_sub))
305            subtitle_content = r.content
306
307        if subtitle_content:
308            subtitle.content = fix_line_ending(subtitle_content)
309        else:
310            logger.debug("Could not extract subtitle from %r", archive)
311
312
313def _get_subtitle_from_archive(archive):
314    extract_subname, max_score = "", -1
315
316    for subname in archive.namelist():
317        # discard hidden files
318        if os.path.split(subname)[-1].startswith("."):
319            continue
320
321        # discard non-subtitle files
322        if not subname.lower().endswith(SUBTITLE_EXTENSIONS):
323            continue
324
325        # prefer ass/ssa/srt subtitles with double languages or simplified/traditional chinese
326        score = ("ass" in subname or "ssa" in subname or "srt" in subname) * 1
327        if "简体" in subname or "chs" in subname or ".gb." in subname:
328            score += 2
329        if "繁体" in subname or "cht" in subname or ".big5." in subname:
330            score += 2
331        if "chs.eng" in subname or "chs&eng" in subname or "cht.eng" in subname or "cht&eng" in subname:
332            score += 2
333        if "中英" in subname or "简英" in subname or "繁英" in subname or "双语" in subname or "简体&英文" in subname or "繁体&英文" in subname:
334            score += 4
335        logger.debug("subtitle {}, score: {}".format(subname, score))
336        if score > max_score:
337            max_score = score
338            extract_subname = subname
339
340    return archive.read(extract_subname) if max_score != -1 else None
341
342
343def _extract_name(name):
344    """ filter out Chinese characters from subtitle names """
345    name, suffix = os.path.splitext(name)
346    c_pattern = "[\u4e00-\u9fff]"
347    e_pattern = "[a-zA-Z]"
348    c_indices = [m.start(0) for m in re.finditer(c_pattern, name)]
349    e_indices = [m.start(0) for m in re.finditer(e_pattern, name)]
350
351    target, discard = e_indices, c_indices
352
353    if len(target) == 0:
354        return ""
355
356    first_target, last_target = target[0], target[-1]
357    first_discard = discard[0] if discard else -1
358    last_discard = discard[-1] if discard else -1
359    if last_discard < first_target:
360        new_name = name[first_target:]
361    elif last_target < first_discard:
362        new_name = name[:first_discard]
363    else:
364        # try to find maximum continous part
365        result, start, end = [0, 1], -1, 0
366        while end < len(name):
367            while end not in e_indices and end < len(name):
368                end += 1
369            if end == len(name):
370                break
371            start = end
372            while end not in c_indices and end < len(name):
373                end += 1
374            if end - start > result[1] - result[0]:
375                result = [start, end]
376            start = end
377            end += 1
378        new_name = name[result[0] : result[1]]
379    new_name = new_name.strip() + suffix
380    return new_name
381
382
383def num_to_cn(number):
384    """ convert numbers(1-99) to Chinese """
385    assert number.isdigit() and 1 <= int(number) <= 99
386
387    trans_map = {n: c for n, c in zip(("123456789"), ("一二三四五六七八九"))}
388
389    if len(number) == 1:
390        return trans_map[number]
391    else:
392        part1 = "十" if number[0] == "1" else trans_map[number[0]] + "十"
393        part2 = trans_map[number[1]] if number[1] != "0" else ""
394        return part1 + part2
395