1# coding=utf-8 2 3 4from __future__ import absolute_import 5import logging 6import traceback 7 8import re 9import types 10 11import chardet 12import pysrt 13import pysubs2 14from bs4 import UnicodeDammit 15from pysubs2 import SSAStyle 16from pysubs2.subrip import parse_tags, MAX_REPRESENTABLE_TIME 17from pysubs2.time import ms_to_times 18from subzero.modification import SubtitleModifications 19from subzero.language import Language 20from subliminal import Subtitle as Subtitle_ 21from subliminal.subtitle import Episode, Movie, sanitize_release_group, get_equivalent_release_groups 22from subliminal_patch.utils import sanitize 23from ftfy import fix_text 24from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE 25from six import text_type 26 27BOMS = ( 28 (BOM_UTF8, "UTF-8"), 29 (BOM_UTF32_BE, "UTF-32-BE"), 30 (BOM_UTF32_LE, "UTF-32-LE"), 31 (BOM_UTF16_BE, "UTF-16-BE"), 32 (BOM_UTF16_LE, "UTF-16-LE"), 33) 34 35logger = logging.getLogger(__name__) 36 37 38ftfy_defaults = { 39 "uncurl_quotes": False, 40 "fix_character_width": False, 41} 42 43 44class Subtitle(Subtitle_): 45 storage_path = None 46 release_info = None 47 matches = {} 48 hash_verifiable = False 49 hearing_impaired_verifiable = False 50 mods = None 51 plex_media_fps = None 52 skip_wrong_fps = False 53 wrong_fps = False 54 wrong_series = False 55 wrong_season_ep = False 56 is_pack = False 57 asked_for_release_group = None 58 asked_for_episode = None 59 uploader = None # string - uploader username 60 61 pack_data = None 62 _guessed_encoding = None 63 _is_valid = False 64 65 def __init__(self, language, hearing_impaired=False, page_link=None, encoding=None, mods=None): 66 # set subtitle language to hi if it's hearing_impaired 67 if hearing_impaired: 68 language = Language.rebuild(language, hi=True) 69 70 super(Subtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link, 71 encoding=encoding) 72 self.mods = mods 73 self._is_valid = False 74 75 def __repr__(self): 76 return '<%s %r [%s:%s]>' % ( 77 self.__class__.__name__, self.page_link, self.language, self._guessed_encoding) 78 79 @property 80 def text(self): 81 """Content as string 82 83 If :attr:`encoding` is None, the encoding is guessed with :meth:`guess_encoding` 84 85 """ 86 if not self.content: 87 return 88 89 if not isinstance(self.content, text_type): 90 return self.content.decode(self.get_encoding(), errors='replace') 91 92 return self.content 93 94 @property 95 def numeric_id(self): 96 raise NotImplemented 97 98 def get_fps(self): 99 """ 100 :return: frames per second or None if not supported 101 :rtype: float 102 """ 103 return None 104 105 def make_picklable(self): 106 """ 107 some subtitle instances might have unpicklable objects stored; clean them up here 108 :return: self 109 """ 110 return self 111 112 def get_encoding(self): 113 return self.guess_encoding() 114 115 def set_encoding(self, encoding): 116 ge = self.get_encoding() 117 if encoding == ge: 118 return 119 120 unicontent = self.text 121 logger.debug("Changing encoding: to %s, from %s", encoding, ge) 122 self.content = unicontent.encode(encoding) 123 self._guessed_encoding = encoding 124 125 def normalize(self): 126 """ 127 Set encoding to UTF-8 and normalize line endings 128 :return: 129 """ 130 self.set_encoding("utf-8") 131 132 # normalize line endings 133 self.content = self.content.replace(b"\r\n", b"\n").replace(b'\r', b'\n') 134 135 def _check_bom(self, data): 136 return [encoding for bom, encoding in BOMS if data.startswith(bom)] 137 138 def guess_encoding(self): 139 """Guess encoding using the language, falling back on chardet. 140 141 :return: the guessed encoding. 142 :rtype: str 143 144 """ 145 if self._guessed_encoding: 146 return self._guessed_encoding 147 148 if self.encoding: 149 # check provider encoding and use it only if it is valid 150 try: 151 self.content.decode(self.encoding) 152 self._guessed_encoding = self.encoding 153 return self._guessed_encoding 154 except: 155 # provider specified encoding is invalid, fallback to guessing 156 pass 157 158 logger.info('Guessing encoding for language %s', self.language) 159 160 encodings = ['utf-8'] 161 162 # check UTF BOMs 163 bom_encodings = self._check_bom(self.content) 164 if bom_encodings: 165 encodings = list(set(enc.lower() for enc in bom_encodings + encodings)) 166 167 # add language-specific encodings 168 # http://scratchpad.wikia.com/wiki/Character_Encoding_Recommendation_for_Languages 169 170 if self.language.alpha3 == 'zho': 171 encodings.extend(['cp936', 'gb2312', 'gbk', 'hz', 'iso2022_jp_2', 'cp950', 'big5hkscs', 'big5', 172 'gb18030', 'utf-16']) 173 elif self.language.alpha3 == 'jpn': 174 encodings.extend(['shift-jis', 'cp932', 'euc_jp', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 175 'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext', ]) 176 elif self.language.alpha3 == 'tha': 177 encodings.extend(['tis-620', 'cp874']) 178 179 # arabian/farsi 180 elif self.language.alpha3 in ('ara', 'fas', 'per'): 181 encodings.extend(['windows-1256', 'utf-16', 'utf-16le', 'ascii', 'iso-8859-6']) 182 elif self.language.alpha3 == 'heb': 183 encodings.extend(['windows-1255', 'iso-8859-8']) 184 elif self.language.alpha3 == 'tur': 185 encodings.extend(['windows-1254', 'iso-8859-9', 'iso-8859-3']) 186 187 # Greek 188 elif self.language.alpha3 in ('grc', 'gre', 'ell'): 189 encodings.extend(['windows-1253', 'cp1253', 'cp737', 'iso8859-7', 'cp875', 'cp869', 'iso2022_jp_2', 190 'mac_greek']) 191 192 # Polish, Czech, Slovak, Hungarian, Slovene, Bosnian, Croatian, Serbian (Latin script), 193 # Romanian and Albanian 194 elif self.language.alpha3 in ('pol', 'cze', 'ces', 'slk', 'slo', 'slv', 'hun', 'bos', 'hbs', 'hrv', 'rsb', 195 'ron', 'rum', 'sqi', 'alb'): 196 197 encodings.extend(['windows-1250', 'iso-8859-2']) 198 199 # Eastern European Group 1 200 if self.language.alpha3 == "slv": 201 encodings.append('iso-8859-4') 202 203 # Albanian 204 elif self.language.alpha3 in ("sqi", "alb"): 205 encodings.extend(['windows-1252', 'iso-8859-15', 'iso-8859-1', 'iso-8859-9']) 206 207 # Bulgarian, Serbian and Macedonian, Ukranian and Russian 208 elif self.language.alpha3 in ('bul', 'srp', 'mkd', 'mac', 'rus', 'ukr'): 209 # Eastern European Group 2 210 if self.language.alpha3 in ('bul', 'mkd', 'mac', 'rus', 'ukr'): 211 encodings.extend(['windows-1251', 'iso-8859-5']) 212 213 elif self.language.alpha3 == 'srp': 214 if self.language.script == "Latn": 215 encodings.extend(['windows-1250', 'iso-8859-2']) 216 elif self.language.script == "Cyrl": 217 encodings.extend(['windows-1251', 'iso-8859-5']) 218 else: 219 encodings.extend(['windows-1250', 'windows-1251', 'iso-8859-2', 'iso-8859-5']) 220 221 else: 222 # Western European (windows-1252) / Northern European 223 encodings.extend(['windows-1252', 'iso-8859-15', 'iso-8859-9', 'iso-8859-4', 'iso-8859-1']) 224 225 # try to decode 226 logger.debug('Trying encodings %r', encodings) 227 for encoding in encodings: 228 try: 229 self.content.decode(encoding) 230 231 except UnicodeDecodeError: 232 pass 233 else: 234 logger.info('Guessed encoding %s', encoding) 235 self._guessed_encoding = encoding 236 return encoding 237 238 logger.warning('Could not guess encoding from language') 239 240 # fallback on chardet 241 encoding = chardet.detect(self.content)['encoding'] 242 logger.info('Chardet found encoding %s', encoding) 243 244 if not encoding: 245 # fallback on bs4 246 logger.info('Falling back to bs4 detection') 247 a = UnicodeDammit(self.content) 248 249 logger.info("bs4 detected encoding: %s", a.original_encoding) 250 251 if a.original_encoding: 252 self._guessed_encoding = a.original_encoding 253 return a.original_encoding 254 raise ValueError(u"Couldn't guess the proper encoding for %s", self) 255 256 self._guessed_encoding = encoding 257 return encoding 258 259 def is_valid(self): 260 """Check if a :attr:`text` is a valid SubRip format. 261 262 :return: whether or not the subtitle is valid. 263 :rtype: bool 264 265 """ 266 if self._is_valid: 267 return True 268 269 text = self.text 270 if not text: 271 return False 272 273 # valid srt 274 try: 275 pysrt.from_string(text, error_handling=pysrt.ERROR_RAISE) 276 except Exception: 277 logger.error("PySRT-parsing failed, trying pysubs2") 278 else: 279 self._is_valid = True 280 return True 281 282 # something else, try to return srt 283 try: 284 logger.debug("Trying parsing with PySubs2") 285 try: 286 # in case of microdvd, try parsing the fps from the subtitle 287 subs = pysubs2.SSAFile.from_string(text) 288 if subs.format == "microdvd": 289 logger.info("Got FPS from MicroDVD subtitle: %s", subs.fps) 290 else: 291 logger.info("Got format: %s", subs.format) 292 except pysubs2.UnknownFPSError: 293 # if parsing failed, use frame rate from provider 294 sub_fps = self.get_fps() 295 if not isinstance(sub_fps, float) or sub_fps < 10.0: 296 # or use our media file's fps as a fallback 297 sub_fps = self.plex_media_fps 298 logger.info("No FPS info in subtitle. Using our own media FPS for the MicroDVD subtitle: %s", 299 self.plex_media_fps) 300 subs = pysubs2.SSAFile.from_string(text, fps=sub_fps) 301 302 unicontent = self.pysubs2_to_unicode(subs) 303 self.content = unicontent.encode(self.get_encoding()) 304 except: 305 logger.exception("Couldn't convert subtitle %s to .srt format: %s", self, traceback.format_exc()) 306 return False 307 308 self._is_valid = True 309 return True 310 311 @classmethod 312 def pysubs2_to_unicode(cls, sub, format="srt"): 313 """ 314 this is a modified version of pysubs2.SubripFormat.to_file with special handling for drawing tags in ASS 315 :param sub: 316 :param format: 317 :return: 318 """ 319 def ms_to_timestamp(ms, mssep=","): 320 """Convert ms to 'HH:MM:SS,mmm'""" 321 # XXX throw on overflow/underflow? 322 if ms < 0: ms = 0 323 if ms > MAX_REPRESENTABLE_TIME: ms = MAX_REPRESENTABLE_TIME 324 h, m, s, ms = ms_to_times(ms) 325 return "%02d:%02d:%02d%s%03d" % (h, m, s, mssep, ms) 326 327 def prepare_text(text, style): 328 body = [] 329 for fragment, sty in parse_tags(text, style, sub.styles): 330 fragment = fragment.replace(r"\h", u" ") 331 fragment = fragment.replace(r"\n", u"\n") 332 fragment = fragment.replace(r"\N", u"\n") 333 if sty.drawing: 334 raise pysubs2.ContentNotUsable 335 336 if format == "srt": 337 if sty.italic: 338 fragment = u"<i>%s</i>" % fragment 339 if sty.underline: 340 fragment = u"<u>%s</u>" % fragment 341 if sty.strikeout: 342 fragment = u"<s>%s</s>" % fragment 343 elif format == "vtt": 344 if sty.bold: 345 fragment = u"<b>%s</b>" % fragment 346 if sty.italic: 347 fragment = u"<i>%s</i>" % fragment 348 if sty.underline: 349 fragment = u"<u>%s</u>" % fragment 350 351 body.append(fragment) 352 353 return re.sub(u"\n+", u"\n", u"".join(body).strip()) 354 355 visible_lines = (line for line in sub if not line.is_comment) 356 357 out = [] 358 mssep = "," 359 360 if format == "vtt": 361 out.append("WEBVTT\n\n") 362 mssep = "." 363 364 for i, line in enumerate(visible_lines, 1): 365 start = ms_to_timestamp(line.start, mssep=mssep) 366 end = ms_to_timestamp(line.end, mssep=mssep) 367 try: 368 text = prepare_text(line.text, sub.styles.get(line.style, SSAStyle.DEFAULT_STYLE)) 369 except pysubs2.ContentNotUsable: 370 continue 371 372 out.append(u"%d\n" % i) 373 out.append(u"%s --> %s\n" % (start, end)) 374 out.append(u"%s%s" % (text, "\n\n")) 375 376 return u"".join(out) 377 378 def get_modified_content(self, format="srt", debug=False): 379 """ 380 :return: string 381 """ 382 if not self.mods: 383 return fix_text(self.content.decode(encoding=self.get_encoding()), **ftfy_defaults).encode( 384 encoding=self.get_encoding()) 385 386 submods = SubtitleModifications(debug=debug) 387 if submods.load(content=self.text, language=self.language): 388 logger.info("Applying mods: %s", self.mods) 389 submods.modify(*self.mods) 390 self.mods = submods.mods_used 391 392 content = fix_text(self.pysubs2_to_unicode(submods.f, format=format), **ftfy_defaults)\ 393 .encode(encoding=self.get_encoding()) 394 submods.f = None 395 del submods 396 return content 397 return None 398 399 400class ModifiedSubtitle(Subtitle): 401 id = None 402 403 404MERGED_FORMATS = { 405 "TV": ("HDTV", "SDTV", "AHDTV", "Ultra HDTV"), 406 "Air": ("SATRip", "DVB", "PPV", "Digital TV"), 407 "Disk-HD": ("HD-DVD", "Blu-ray", "Ultra HD Blu-ray"), 408 "Disk-SD": ("DVD", "VHS"), 409 "Web": ("Web",), 410} 411 412MERGED_FORMATS_REV = dict((v.lower(), k.lower()) for k in MERGED_FORMATS for v in MERGED_FORMATS[k]) 413 414def _has_match(video, guess, key) -> bool: 415 value = getattr(video, key) 416 guess_value = guess.get(key) 417 418 # To avoid extra debug calls 419 if guess_value is None or value is None: 420 return False 421 422 if isinstance(guess_value, list): 423 matched = any(value == item for item in guess_value) 424 else: 425 matched = value == guess_value 426 427 logger.debug("%s matched? %s (%s -> %s)", key, matched, value, guess_value) 428 429 return matched 430 431 432 433def guess_matches(video, guess, partial=False): 434 """Get matches between a `video` and a `guess`. 435 436 If a guess is `partial`, the absence information won't be counted as a match. 437 438 Patch: add multiple release group and formats handling 439 440 :param video: the video. 441 :type video: :class:`~subliminal.video.Video` 442 :param guess: the guess. 443 :type guess: dict 444 :param bool partial: whether or not the guess is partial. 445 :return: matches between the `video` and the `guess`. 446 :rtype: set 447 448 """ 449 matches = set() 450 if isinstance(video, Episode): 451 # series 452 if video.series and 'title' in guess: 453 titles = guess["title"] 454 if not isinstance(titles, list): 455 titles = [titles] 456 457 for title in titles: 458 if sanitize(title) in (sanitize(name) for name in [video.series] + video.alternative_series): 459 matches.add('series') 460 461 # title 462 if video.title and 'episode_title' in guess and sanitize(guess['episode_title']) == sanitize(video.title): 463 matches.add('title') 464 465 # season 466 if video.season and 'season' in guess and guess['season'] == video.season: 467 matches.add('season') 468 469 # episode 470 # Currently we only have single-ep support (guessit returns a multi-ep as a list with int values) 471 # Most providers only support single-ep, so make sure it contains only 1 episode 472 # In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number) 473 if video.episode and 'episode' in guess: 474 episode_guess = guess['episode'] 475 episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess 476 if episode == video.episode: 477 matches.add('episode') 478 479 # year 480 if video.year and 'year' in guess and guess['year'] == video.year: 481 matches.add('year') 482 483 # count "no year" as an information 484 if not partial and video.original_series and 'year' not in guess: 485 matches.add('year') 486 487 elif isinstance(video, Movie): 488 # year 489 if video.year and 'year' in guess and guess['year'] == video.year: 490 matches.add('year') 491 # title 492 if video.title and 'title' in guess and sanitize(guess['title']) in ( 493 sanitize(name) for name in [video.title] + video.alternative_titles): 494 matches.add('title') 495 496 # release_group 497 if 'release_group' in guess: 498 release_groups = guess["release_group"] 499 if not isinstance(release_groups, list): 500 release_groups = [release_groups] 501 502 if video.release_group: 503 for release_group in release_groups: 504 if (sanitize_release_group(release_group) in 505 get_equivalent_release_groups(sanitize_release_group(video.release_group))): 506 matches.add('release_group') 507 break 508 # source 509 if 'source' in guess: 510 formats = guess["source"] 511 if not isinstance(formats, list): 512 formats = [formats] 513 514 if video.source: 515 video_format = video.source.lower() 516 _video_gen_format = MERGED_FORMATS_REV.get(video_format) 517 matched = False 518 for frmt in formats: 519 _guess_gen_frmt = MERGED_FORMATS_REV.get(frmt.lower()) 520 # We don't want to match a singleton 521 if _guess_gen_frmt is None: # If the source is not in MERGED_FORMATS 522 _guess_gen_frmt = guess["source"] 523 524 if _guess_gen_frmt == _video_gen_format: 525 matched = True 526 matches.add('source') 527 break 528 529 logger.debug("Source match found? %s: %s -> %s", matched, video.source, formats) 530 531 if "release_group" in matches and "source" not in matches: 532 logger.info("Release group matched but source didn't. Removing release group match.") 533 matches.remove("release_group") 534 535 guess.update({"resolution": guess.get("screen_size")}) 536 537 # Solve match keys for potential lists 538 for key in ("video_codec", "audio_codec", "edition", "streaming_service", "resolution"): 539 if _has_match(video, guess, key): 540 matches.add(key) 541 542 # Add streaming service match for non-web sources 543 if video.source and video.source != "Web": 544 matches.add("streaming_service") 545 546 # As edition tags are rare, add edition match if the video doesn't have an edition 547 if not video.edition: 548 matches.add("edition") 549 550 return matches 551