1# -*- coding: utf-8 -*- 2from __future__ import absolute_import 3import io 4import logging 5import os 6import zipfile 7import re 8import copy 9 10try: 11 from urlparse import urljoin 12except ImportError: 13 from urllib.parse import urljoin 14 15import rarfile 16from babelfish import language_converters 17from subzero.language import Language 18from guessit import guessit 19from requests import Session 20from six import text_type 21from random import randint 22 23from subliminal.providers import ParserBeautifulSoup 24from subliminal_patch.providers import Provider 25from subliminal.subtitle import ( 26 SUBTITLE_EXTENSIONS, 27 fix_line_ending 28 ) 29from subliminal_patch.subtitle import ( 30 Subtitle, 31 guess_matches 32) 33from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST 34from subliminal.video import Episode, Movie 35 36logger = logging.getLogger(__name__) 37 38language_converters.register('zimuku = subliminal_patch.converters.zimuku:zimukuConverter') 39 40supported_languages = list(language_converters['zimuku'].to_zimuku.keys()) 41 42class ZimukuSubtitle(Subtitle): 43 """Zimuku Subtitle.""" 44 45 provider_name = "zimuku" 46 47 def __init__(self, language, page_link, version, session, year): 48 super(ZimukuSubtitle, self).__init__(language, page_link=page_link) 49 self.version = version 50 self.release_info = version 51 self.hearing_impaired = False 52 self.encoding = "utf-8" 53 self.session = session 54 self.year = year 55 56 @property 57 def id(self): 58 return self.page_link 59 60 def get_matches(self, video): 61 matches = set() 62 63 if video.year == self.year: 64 matches.add('year') 65 66 # episode 67 if isinstance(video, Episode): 68 info = guessit(self.version, {"type": "episode"}) 69 # other properties 70 matches |= guess_matches(video, info) 71 72 # add year to matches if video doesn't have a year but series, season and episode are matched 73 if not video.year and all(item in matches for item in ['series', 'season', 'episode']): 74 matches |= {'year'} 75 # movie 76 elif isinstance(video, Movie): 77 # other properties 78 matches |= guess_matches(video, guessit(self.version, {"type": "movie"})) 79 80 return matches 81 82 83class ZimukuProvider(Provider): 84 """Zimuku Provider.""" 85 86 languages = {Language(*l) for l in supported_languages} 87 video_types = (Episode, Movie) 88 logger.info(str(supported_languages)) 89 90 server_url = "http://zimuku.org" 91 search_url = "/search?q={}" 92 download_url = "http://zimuku.org/" 93 94 subtitle_class = ZimukuSubtitle 95 96 def __init__(self): 97 self.session = None 98 99 def initialize(self): 100 self.session = Session() 101 self.session.headers["User-Agent"] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)] 102 103 def terminate(self): 104 self.session.close() 105 106 def _parse_episode_page(self, link, year): 107 r = self.session.get(link) 108 bs_obj = ParserBeautifulSoup( 109 r.content.decode("utf-8", "ignore"), ["html.parser"] 110 ) 111 subs_body = bs_obj.find("div", class_="subs box clearfix").find("tbody") 112 subs = [] 113 for sub in subs_body.find_all("tr"): 114 a = sub.find("a") 115 name = _extract_name(a.text) 116 name = os.path.splitext(name)[ 117 0 118 ] # remove ext because it can be an archive type 119 120 language = Language("eng") 121 for img in sub.find("td", class_="tac lang").find_all("img"): 122 if ( 123 "china" in img.attrs["src"] 124 and "hongkong" in img.attrs["src"] 125 ): 126 language = Language("zho").add(Language('zho', 'TW', None)) 127 logger.debug("language:"+str(language)) 128 elif ( 129 "china" in img.attrs["src"] 130 or "jollyroger" in img.attrs["src"] 131 ): 132 language = Language("zho") 133 elif "hongkong" in img.attrs["src"]: 134 language = Language('zho', 'TW', None) 135 break 136 sub_page_link = urljoin(self.server_url, a.attrs["href"]) 137 backup_session = copy.deepcopy(self.session) 138 backup_session.headers["Referer"] = link 139 140 subs.append( 141 self.subtitle_class(language, sub_page_link, name, backup_session, year) 142 ) 143 144 return subs 145 146 def query(self, keyword, season=None, episode=None, year=None): 147 params = keyword 148 if season: 149 params += ".S{season:02d}".format(season=season) 150 elif year: 151 params += " {:4d}".format(year) 152 153 logger.debug("Searching subtitles %r", params) 154 subtitles = [] 155 search_link = self.server_url + text_type(self.search_url).format(params) 156 157 r = self.session.get(search_link, timeout=30) 158 r.raise_for_status() 159 160 if not r.content: 161 logger.debug("No data returned from provider") 162 return [] 163 164 html = r.content.decode("utf-8", "ignore") 165 # parse window location 166 pattern = r"url\s*=\s*'([^']*)'\s*\+\s*url" 167 parts = re.findall(pattern, html) 168 redirect_url = search_link 169 while parts: 170 parts.reverse() 171 redirect_url = urljoin(self.server_url, "".join(parts)) 172 r = self.session.get(redirect_url, timeout=30) 173 html = r.content.decode("utf-8", "ignore") 174 parts = re.findall(pattern, html) 175 logger.debug("search url located: " + redirect_url) 176 177 soup = ParserBeautifulSoup( 178 r.content.decode("utf-8", "ignore"), ["lxml", "html.parser"] 179 ) 180 181 # non-shooter result page 182 if soup.find("div", {"class": "item"}): 183 logger.debug("enter a non-shooter page") 184 for item in soup.find_all("div", {"class": "item"}): 185 title_a = item.find("p", class_="tt clearfix").find("a") 186 subs_year = year 187 if season: 188 # episode year in zimuku is the season's year not show's year 189 actual_subs_year = re.findall(r"\d{4}", title_a.text) or None 190 if actual_subs_year: 191 subs_year = int(actual_subs_year[0]) - season + 1 192 title = title_a.text 193 season_cn1 = re.search("第(.*)季", title) 194 if not season_cn1: 195 season_cn1 = "一" 196 else: 197 season_cn1 = season_cn1.group(1).strip() 198 season_cn2 = num_to_cn(str(season)) 199 if season_cn1 != season_cn2: 200 continue 201 episode_link = self.server_url + title_a.attrs["href"] 202 new_subs = self._parse_episode_page(episode_link, subs_year) 203 subtitles += new_subs 204 205 # NOTE: shooter result pages are ignored due to the existence of zimuku provider 206 207 return subtitles 208 209 def list_subtitles(self, video, languages): 210 if isinstance(video, Episode): 211 titles = [video.series] + video.alternative_series 212 elif isinstance(video, Movie): 213 titles = [video.title] + video.alternative_titles 214 else: 215 titles = [] 216 217 subtitles = [] 218 # query for subtitles with the show_id 219 for title in titles: 220 if isinstance(video, Episode): 221 subtitles += [ 222 s 223 for s in self.query( 224 title, 225 season=video.season, 226 episode=video.episode, 227 year=video.year, 228 ) 229 if s.language in languages 230 ] 231 elif isinstance(video, Movie): 232 subtitles += [ 233 s 234 for s in self.query(title, year=video.year) 235 if s.language in languages 236 ] 237 238 return subtitles 239 240 def download_subtitle(self, subtitle): 241 def _get_archive_dowload_link(session, sub_page_link): 242 r = session.get(sub_page_link) 243 bs_obj = ParserBeautifulSoup( 244 r.content.decode("utf-8", "ignore"), ["html.parser"] 245 ) 246 down_page_link = bs_obj.find("a", {"id": "down1"}).attrs["href"] 247 down_page_link = urljoin(sub_page_link, down_page_link) 248 r = session.get(down_page_link) 249 bs_obj = ParserBeautifulSoup( 250 r.content.decode("utf-8", "ignore"), ["html.parser"] 251 ) 252 download_link = bs_obj.find("a", {"rel": "nofollow"}) 253 download_link = download_link.attrs["href"] 254 download_link = urljoin(sub_page_link, download_link) 255 return download_link 256 257 # download the subtitle 258 logger.info("Downloading subtitle %r", subtitle) 259 self.session = subtitle.session 260 download_link = _get_archive_dowload_link(self.session, subtitle.page_link) 261 r = self.session.get(download_link, headers={'Referer': subtitle.page_link}, timeout=30) 262 r.raise_for_status() 263 try: 264 filename = r.headers["Content-Disposition"] 265 except KeyError: 266 logger.debug("Unable to parse subtitles filename. Dropping this subtitles.") 267 return 268 269 if not r.content: 270 logger.debug("Unable to download subtitle. No data returned from provider") 271 return 272 273 archive_stream = io.BytesIO(r.content) 274 archive = None 275 if rarfile.is_rarfile(archive_stream): 276 logger.debug("Identified rar archive") 277 if ".rar" not in filename: 278 logger.debug( 279 ".rar should be in the downloaded file name: {}".format(filename) 280 ) 281 return 282 archive = rarfile.RarFile(archive_stream) 283 subtitle_content = _get_subtitle_from_archive(archive) 284 elif zipfile.is_zipfile(archive_stream): 285 logger.debug("Identified zip archive") 286 if ".zip" not in filename: 287 logger.debug( 288 ".zip should be in the downloaded file name: {}".format(filename) 289 ) 290 return 291 archive = zipfile.ZipFile(archive_stream) 292 subtitle_content = _get_subtitle_from_archive(archive) 293 else: 294 is_sub = "" 295 for sub_ext in SUBTITLE_EXTENSIONS: 296 if sub_ext in filename: 297 is_sub = sub_ext 298 break 299 if not is_sub: 300 logger.debug( 301 "unknown subtitle ext int downloaded file name: {}".format(filename) 302 ) 303 return 304 logger.debug("Identified {} file".format(is_sub)) 305 subtitle_content = r.content 306 307 if subtitle_content: 308 subtitle.content = fix_line_ending(subtitle_content) 309 else: 310 logger.debug("Could not extract subtitle from %r", archive) 311 312 313def _get_subtitle_from_archive(archive): 314 extract_subname, max_score = "", -1 315 316 for subname in archive.namelist(): 317 # discard hidden files 318 if os.path.split(subname)[-1].startswith("."): 319 continue 320 321 # discard non-subtitle files 322 if not subname.lower().endswith(SUBTITLE_EXTENSIONS): 323 continue 324 325 # prefer ass/ssa/srt subtitles with double languages or simplified/traditional chinese 326 score = ("ass" in subname or "ssa" in subname or "srt" in subname) * 1 327 if "简体" in subname or "chs" in subname or ".gb." in subname: 328 score += 2 329 if "繁体" in subname or "cht" in subname or ".big5." in subname: 330 score += 2 331 if "chs.eng" in subname or "chs&eng" in subname or "cht.eng" in subname or "cht&eng" in subname: 332 score += 2 333 if "中英" in subname or "简英" in subname or "繁英" in subname or "双语" in subname or "简体&英文" in subname or "繁体&英文" in subname: 334 score += 4 335 logger.debug("subtitle {}, score: {}".format(subname, score)) 336 if score > max_score: 337 max_score = score 338 extract_subname = subname 339 340 return archive.read(extract_subname) if max_score != -1 else None 341 342 343def _extract_name(name): 344 """ filter out Chinese characters from subtitle names """ 345 name, suffix = os.path.splitext(name) 346 c_pattern = "[\u4e00-\u9fff]" 347 e_pattern = "[a-zA-Z]" 348 c_indices = [m.start(0) for m in re.finditer(c_pattern, name)] 349 e_indices = [m.start(0) for m in re.finditer(e_pattern, name)] 350 351 target, discard = e_indices, c_indices 352 353 if len(target) == 0: 354 return "" 355 356 first_target, last_target = target[0], target[-1] 357 first_discard = discard[0] if discard else -1 358 last_discard = discard[-1] if discard else -1 359 if last_discard < first_target: 360 new_name = name[first_target:] 361 elif last_target < first_discard: 362 new_name = name[:first_discard] 363 else: 364 # try to find maximum continous part 365 result, start, end = [0, 1], -1, 0 366 while end < len(name): 367 while end not in e_indices and end < len(name): 368 end += 1 369 if end == len(name): 370 break 371 start = end 372 while end not in c_indices and end < len(name): 373 end += 1 374 if end - start > result[1] - result[0]: 375 result = [start, end] 376 start = end 377 end += 1 378 new_name = name[result[0] : result[1]] 379 new_name = new_name.strip() + suffix 380 return new_name 381 382 383def num_to_cn(number): 384 """ convert numbers(1-99) to Chinese """ 385 assert number.isdigit() and 1 <= int(number) <= 99 386 387 trans_map = {n: c for n, c in zip(("123456789"), ("一二三四五六七八九"))} 388 389 if len(number) == 1: 390 return trans_map[number] 391 else: 392 part1 = "十" if number[0] == "1" else trans_map[number[0]] + "十" 393 part2 = trans_map[number[1]] if number[1] != "0" else "" 394 return part1 + part2 395