1# coding=utf-8
2
3
4from __future__ import absolute_import
5import logging
6import traceback
7
8import re
9import types
10
11import chardet
12import pysrt
13import pysubs2
14from bs4 import UnicodeDammit
15from pysubs2 import SSAStyle
16from pysubs2.subrip import parse_tags, MAX_REPRESENTABLE_TIME
17from pysubs2.time import ms_to_times
18from subzero.modification import SubtitleModifications
19from subzero.language import Language
20from subliminal import Subtitle as Subtitle_
21from subliminal.subtitle import Episode, Movie, sanitize_release_group, get_equivalent_release_groups
22from subliminal_patch.utils import sanitize
23from ftfy import fix_text
24from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
25from six import text_type
26
# Byte-order marks paired with the name of the encoding each one announces.
# The UTF-32 marks are listed ahead of the UTF-16 ones.
BOMS = (
    (BOM_UTF8, "UTF-8"),
    (BOM_UTF32_BE, "UTF-32-BE"), (BOM_UTF32_LE, "UTF-32-LE"),
    (BOM_UTF16_BE, "UTF-16-BE"), (BOM_UTF16_LE, "UTF-16-LE"),
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Keyword arguments handed to every ftfy ``fix_text`` call in this module.
ftfy_defaults = dict(
    uncurl_quotes=False,
    fix_character_width=False,
)
42
43
class Subtitle(Subtitle_):
    """Subtitle that can guess its encoding, validate/convert itself to SubRip and apply mods.

    The class-level attributes below are defaults; providers and callers are expected to
    override them per instance.
    """
    storage_path = None
    release_info = None
    # NOTE(review): this dict is created once on the class, so any instance that mutates it
    # in place (instead of assigning its own value) shares it with every other instance.
    matches = {}
    hash_verifiable = False
    hearing_impaired_verifiable = False
    mods = None
    plex_media_fps = None
    skip_wrong_fps = False
    wrong_fps = False
    wrong_series = False
    wrong_season_ep = False
    is_pack = False
    asked_for_release_group = None
    asked_for_episode = None
    uploader = None  # string - uploader username

    pack_data = None
    _guessed_encoding = None  # cache filled by guess_encoding() / set_encoding()
    _is_valid = False  # cache filled by is_valid()

    def __init__(self, language, hearing_impaired=False, page_link=None, encoding=None, mods=None):
        """
        :param language: language of the subtitle; rebuilt with ``hi=True`` when `hearing_impaired` is set.
        :param bool hearing_impaired: whether the subtitle is for the hearing impaired.
        :param page_link: forwarded to the base class.
        :param encoding: encoding announced by the provider (forwarded to the base class).
        :param mods: modifications to apply in :meth:`get_modified_content`.
        """
        # set subtitle language to hi if it's hearing_impaired
        if hearing_impaired:
            language = Language.rebuild(language, hi=True)

        super(Subtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link,
                                       encoding=encoding)
        self.mods = mods
        self._is_valid = False

    def __repr__(self):
        return '<%s %r [%s:%s]>' % (
            self.__class__.__name__, self.page_link, self.language, self._guessed_encoding)

    @property
    def text(self):
        """Content as string

        Bytes content is decoded with the encoding returned by :meth:`get_encoding`
        (undecodable bytes are replaced); text content is returned unchanged.

        :return: the decoded content, or None when there is no content.
        """
        if not self.content:
            return

        if not isinstance(self.content, text_type):
            return self.content.decode(self.get_encoding(), errors='replace')

        return self.content

    @property
    def numeric_id(self):
        """Numeric identifier of the subtitle; not provided by this base class.

        :raise NotImplementedError: always, unless overridden by a subclass.
        """
        # `NotImplemented` is a comparison sentinel, not an exception: raising it fails with TypeError
        raise NotImplementedError

    def get_fps(self):
        """
        :return: frames per second or None if not supported
        :rtype: float
        """
        return None

    def make_picklable(self):
        """
        some subtitle instances might have unpicklable objects stored; clean them up here
        :return: self
        """
        return self

    def get_encoding(self):
        """Return the encoding of :attr:`content` as determined by :meth:`guess_encoding`."""
        return self.guess_encoding()

    def set_encoding(self, encoding):
        """Re-encode :attr:`content` to `encoding` unless that already is the current encoding.

        :param str encoding: target encoding name.
        """
        ge = self.get_encoding()
        if encoding == ge:
            return

        unicontent = self.text
        logger.debug("Changing encoding: to %s, from %s", encoding, ge)
        self.content = unicontent.encode(encoding)
        self._guessed_encoding = encoding

    def normalize(self):
        """
        Set encoding to UTF-8 and normalize line endings
        :return:
        """
        self.set_encoding("utf-8")

        # normalize line endings
        self.content = self.content.replace(b"\r\n", b"\n").replace(b'\r', b'\n')

    def _check_bom(self, data):
        """Return the encoding name of every BOMS entry whose byte-order mark `data` starts with.

        The result follows the order of BOMS, and may hold more than one name because the
        UTF-16-LE mark is a prefix of the UTF-32-LE mark.
        """
        return [encoding for bom, encoding in BOMS if data.startswith(bom)]

    def guess_encoding(self):
        """Guess encoding using the language, falling back on chardet.

        The result is cached; the provider supplied :attr:`encoding` is used when the content
        decodes with it. Otherwise BOM-announced encodings, UTF-8 and language specific
        encodings are tried in that order, then chardet, then bs4's UnicodeDammit.

        :return: the guessed encoding.
        :rtype: str
        :raise ValueError: if no detection method yields an encoding.

        """
        if self._guessed_encoding:
            return self._guessed_encoding

        if self.encoding:
            # check provider encoding and use it only if it is valid
            try:
                self.content.decode(self.encoding)
            except Exception:
                # provider specified encoding is invalid (or unknown), fallback to guessing
                logger.debug('Provider encoding %s is not usable, guessing instead', self.encoding)
            else:
                self._guessed_encoding = self.encoding
                return self._guessed_encoding

        logger.info('Guessing encoding for language %s', self.language)

        encodings = ['utf-8']

        # check UTF BOMs
        bom_encodings = self._check_bom(self.content)
        if bom_encodings:
            # Keep the BOM candidates first and in BOMS order (UTF-32 ahead of UTF-16) while
            # dropping duplicates; an unordered set could try utf-16-le on UTF-32-LE data.
            ordered = []
            for enc in bom_encodings + encodings:
                enc = enc.lower()
                if enc not in ordered:
                    ordered.append(enc)
            encodings = ordered

        # add language-specific encodings
        # http://scratchpad.wikia.com/wiki/Character_Encoding_Recommendation_for_Languages

        if self.language.alpha3 == 'zho':
            encodings.extend(['cp936', 'gb2312', 'gbk', 'hz', 'iso2022_jp_2', 'cp950', 'big5hkscs', 'big5',
                              'gb18030', 'utf-16'])
        elif self.language.alpha3 == 'jpn':
            encodings.extend(['shift-jis', 'cp932', 'euc_jp', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2',
                              'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext', ])
        elif self.language.alpha3 == 'tha':
            encodings.extend(['tis-620', 'cp874'])

        # arabian/farsi
        elif self.language.alpha3 in ('ara', 'fas', 'per'):
            encodings.extend(['windows-1256', 'utf-16', 'utf-16le', 'ascii', 'iso-8859-6'])
        elif self.language.alpha3 == 'heb':
            encodings.extend(['windows-1255', 'iso-8859-8'])
        elif self.language.alpha3 == 'tur':
            encodings.extend(['windows-1254', 'iso-8859-9', 'iso-8859-3'])

        # Greek
        elif self.language.alpha3 in ('grc', 'gre', 'ell'):
            encodings.extend(['windows-1253', 'cp1253', 'cp737', 'iso8859-7', 'cp875', 'cp869', 'iso2022_jp_2',
                              'mac_greek'])

        # Polish, Czech, Slovak, Hungarian, Slovene, Bosnian, Croatian, Serbian (Latin script),
        # Romanian and Albanian
        elif self.language.alpha3 in ('pol', 'cze', 'ces', 'slk', 'slo', 'slv', 'hun', 'bos', 'hbs', 'hrv', 'rsb',
                                      'ron', 'rum', 'sqi', 'alb'):

            encodings.extend(['windows-1250', 'iso-8859-2'])

            # Eastern European Group 1
            if self.language.alpha3 == "slv":
                encodings.append('iso-8859-4')

            # Albanian
            elif self.language.alpha3 in ("sqi", "alb"):
                encodings.extend(['windows-1252', 'iso-8859-15', 'iso-8859-1', 'iso-8859-9'])

        # Bulgarian, Serbian and Macedonian, Ukranian and Russian
        elif self.language.alpha3 in ('bul', 'srp', 'mkd', 'mac', 'rus', 'ukr'):
            # Eastern European Group 2
            if self.language.alpha3 in ('bul', 'mkd', 'mac', 'rus', 'ukr'):
                encodings.extend(['windows-1251', 'iso-8859-5'])

            elif self.language.alpha3 == 'srp':
                if self.language.script == "Latn":
                    encodings.extend(['windows-1250', 'iso-8859-2'])
                elif self.language.script == "Cyrl":
                    encodings.extend(['windows-1251', 'iso-8859-5'])
                else:
                    encodings.extend(['windows-1250', 'windows-1251', 'iso-8859-2', 'iso-8859-5'])

        else:
            # Western European (windows-1252) / Northern European
            encodings.extend(['windows-1252', 'iso-8859-15', 'iso-8859-9', 'iso-8859-4', 'iso-8859-1'])

        # try to decode
        logger.debug('Trying encodings %r', encodings)
        for encoding in encodings:
            try:
                self.content.decode(encoding)

            except (UnicodeDecodeError, LookupError):
                # LookupError: the codec name is not known to this interpreter; try the next one
                pass
            else:
                logger.info('Guessed encoding %s', encoding)
                self._guessed_encoding = encoding
                return encoding

        logger.warning('Could not guess encoding from language')

        # fallback on chardet
        encoding = chardet.detect(self.content)['encoding']
        logger.info('Chardet found encoding %s', encoding)

        if not encoding:
            # fallback on bs4
            logger.info('Falling back to bs4 detection')
            a = UnicodeDammit(self.content)

            logger.info("bs4 detected encoding: %s", a.original_encoding)

            if a.original_encoding:
                self._guessed_encoding = a.original_encoding
                return a.original_encoding
            # interpolate explicitly: ValueError does not apply %-formatting to its arguments
            raise ValueError(u"Couldn't guess the proper encoding for %s" % (self,))

        self._guessed_encoding = encoding
        return encoding

    def is_valid(self):
        """Check if a :attr:`text` is a valid SubRip format.

        When strict SubRip parsing fails, the text is parsed with pysubs2 and, on success,
        :attr:`content` is replaced by its SubRip rendering. A positive result is cached.

        :return: whether or not the subtitle is valid.
        :rtype: bool

        """
        if self._is_valid:
            return True

        text = self.text
        if not text:
            return False

        # valid srt
        try:
            pysrt.from_string(text, error_handling=pysrt.ERROR_RAISE)
        except Exception:
            logger.error("PySRT-parsing failed, trying pysubs2")
        else:
            self._is_valid = True
            return True

        # something else, try to return srt
        try:
            logger.debug("Trying parsing with PySubs2")
            try:
                # in case of microdvd, try parsing the fps from the subtitle
                subs = pysubs2.SSAFile.from_string(text)
                if subs.format == "microdvd":
                    logger.info("Got FPS from MicroDVD subtitle: %s", subs.fps)
                else:
                    logger.info("Got format: %s", subs.format)
            except pysubs2.UnknownFPSError:
                # if parsing failed, use frame rate from provider
                sub_fps = self.get_fps()
                if not isinstance(sub_fps, float) or sub_fps < 10.0:
                    # or use our media file's fps as a fallback
                    sub_fps = self.plex_media_fps
                    logger.info("No FPS info in subtitle. Using our own media FPS for the MicroDVD subtitle: %s",
                                self.plex_media_fps)
                subs = pysubs2.SSAFile.from_string(text, fps=sub_fps)

            unicontent = self.pysubs2_to_unicode(subs)
            self.content = unicontent.encode(self.get_encoding())
        except Exception:
            # any conversion problem means "not valid"; interpreter-exit exceptions still propagate
            logger.exception("Couldn't convert subtitle %s to .srt format: %s", self, traceback.format_exc())
            return False

        self._is_valid = True
        return True

    @classmethod
    def pysubs2_to_unicode(cls, sub, format="srt"):
        """
        this is a modified version of pysubs2.SubripFormat.to_file with special handling for drawing tags in ASS

        Comment lines and lines containing drawings are left out; the remaining cues are
        numbered consecutively starting at 1.

        :param sub: the parsed pysubs2 subtitle file to render.
        :param format: "srt" (default) or "vtt"; selects the header, the millisecond separator and
            which inline style tags are written.
        :return: the rendered subtitle as text.
        """
        def ms_to_timestamp(ms, mssep=","):
            """Convert ms to 'HH:MM:SS,mmm'"""
            # XXX throw on overflow/underflow?
            ms = max(0, min(ms, MAX_REPRESENTABLE_TIME))
            h, m, s, ms = ms_to_times(ms)
            return "%02d:%02d:%02d%s%03d" % (h, m, s, mssep, ms)

        def prepare_text(text, style):
            """Render one cue's text with inline tags; raises ContentNotUsable for drawings."""
            body = []
            for fragment, sty in parse_tags(text, style, sub.styles):
                if sty.drawing:
                    raise pysubs2.ContentNotUsable

                fragment = fragment.replace(r"\h", u" ")
                fragment = fragment.replace(r"\n", u"\n")
                fragment = fragment.replace(r"\N", u"\n")

                if format == "srt":
                    if sty.italic:
                        fragment = u"<i>%s</i>" % fragment
                    if sty.underline:
                        fragment = u"<u>%s</u>" % fragment
                    if sty.strikeout:
                        fragment = u"<s>%s</s>" % fragment
                elif format == "vtt":
                    if sty.bold:
                        fragment = u"<b>%s</b>" % fragment
                    if sty.italic:
                        fragment = u"<i>%s</i>" % fragment
                    if sty.underline:
                        fragment = u"<u>%s</u>" % fragment

                body.append(fragment)

            # collapse runs of newlines so a cue never contains a blank line
            return re.sub(u"\n+", u"\n", u"".join(body).strip())

        visible_lines = (line for line in sub if not line.is_comment)

        out = []
        mssep = ","

        if format == "vtt":
            out.append(u"WEBVTT\n\n")
            mssep = "."

        # Count only the cues that are actually written, so skipped drawing lines
        # do not leave gaps in the cue numbering.
        lineno = 1
        for line in visible_lines:
            try:
                text = prepare_text(line.text, sub.styles.get(line.style, SSAStyle.DEFAULT_STYLE))
            except pysubs2.ContentNotUsable:
                continue

            start = ms_to_timestamp(line.start, mssep=mssep)
            end = ms_to_timestamp(line.end, mssep=mssep)

            out.append(u"%d\n" % lineno)
            out.append(u"%s --> %s\n" % (start, end))
            out.append(u"%s%s" % (text, "\n\n"))
            lineno += 1

        return u"".join(out)

    def get_modified_content(self, format="srt", debug=False):
        """Return the content with text fixes and, if any are set, :attr:`mods` applied.

        Without mods the content is only passed through ftfy. With mods the content is loaded
        into SubtitleModifications, modified, rendered in `format` and passed through ftfy;
        :attr:`mods` is then replaced by the mods that were actually used.

        :param format: output format handed to :meth:`pysubs2_to_unicode` when mods are applied.
        :param debug: forwarded to SubtitleModifications.
        :return: the resulting content encoded with :meth:`get_encoding`, or None when the
            content could not be loaded for modification.
        """
        if not self.mods:
            return fix_text(self.content.decode(encoding=self.get_encoding()), **ftfy_defaults).encode(
                encoding=self.get_encoding())

        submods = SubtitleModifications(debug=debug)
        if not submods.load(content=self.text, language=self.language):
            return None

        logger.info("Applying mods: %s", self.mods)
        submods.modify(*self.mods)
        self.mods = submods.mods_used

        content = fix_text(self.pysubs2_to_unicode(submods.f, format=format), **ftfy_defaults)\
            .encode(encoding=self.get_encoding())
        # drop the reference to the parsed file held by the modification object
        submods.f = None
        return content
398
399
class ModifiedSubtitle(Subtitle):
    """:class:`Subtitle` subclass that declares an ``id`` attribute, ``None`` by default."""

    id = None
402
403
# Source names grouped under a broader category name.
MERGED_FORMATS = {
    "TV": (
        "HDTV",
        "SDTV",
        "AHDTV",
        "Ultra HDTV",
    ),
    "Air": (
        "SATRip",
        "DVB",
        "PPV",
        "Digital TV",
    ),
    "Disk-HD": (
        "HD-DVD",
        "Blu-ray",
        "Ultra HD Blu-ray",
    ),
    "Disk-SD": (
        "DVD",
        "VHS",
    ),
    "Web": ("Web",),
}

# Reverse lookup: lower-cased source name -> lower-cased category name.
MERGED_FORMATS_REV = {
    source.lower(): category.lower()
    for category, sources in MERGED_FORMATS.items()
    for source in sources
}
413
def _has_match(video, guess, key) -> bool:
    """Tell whether the `key` attribute of `video` agrees with the `key` entry of `guess`.

    A list entry in `guess` matches when any of its items equals the video's value.
    Nothing is logged and False is returned when either side is None.

    :param video: object exposing `key` as an attribute.
    :param guess: mapping that may hold `key`.
    :param key: name of the attribute / entry to compare.
    :return: whether the two values match.
    """
    expected = getattr(video, key)
    guessed = guess.get(key)

    # Bail out before logging when there is nothing to compare
    if expected is None or guessed is None:
        return False

    matched = (
        any(expected == candidate for candidate in guessed)
        if isinstance(guessed, list)
        else expected == guessed
    )
    logger.debug("%s matched? %s (%s -> %s)", key, matched, expected, guessed)
    return matched
430
431
432
def guess_matches(video, guess, partial=False):
    """Get matches between a `video` and a `guess`.

    If a guess is `partial`, the absence information won't be counted as a match.

    Patch: add multiple release group and formats handling

    Side effect: `guess` receives a ``resolution`` entry copied from its ``screen_size`` entry.

    :param video: the video.
    :type video: :class:`~subliminal.video.Video`
    :param guess: the guess.
    :type guess: dict
    :param bool partial: whether or not the guess is partial.
    :return: matches between the `video` and the `guess`.
    :rtype: set

    """
    matches = set()
    if isinstance(video, Episode):
        # series
        if video.series and 'title' in guess:
            titles = guess["title"]
            if not isinstance(titles, list):
                titles = [titles]

            series_names = [sanitize(name) for name in [video.series] + video.alternative_series]
            if any(sanitize(title) in series_names for title in titles):
                matches.add('series')

        # title
        if video.title and 'episode_title' in guess and sanitize(guess['episode_title']) == sanitize(video.title):
            matches.add('title')

        # season
        if video.season and 'season' in guess and guess['season'] == video.season:
            matches.add('season')

        # episode
        # Currently we only have single-ep support (guessit returns a multi-ep as a list with int values)
        # Most providers only support single-ep, so make sure it contains only 1 episode
        # In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number)
        if video.episode and 'episode' in guess:
            episode_guess = guess['episode']
            episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess
            if episode == video.episode:
                matches.add('episode')

        # year
        if video.year and 'year' in guess and guess['year'] == video.year:
            matches.add('year')

        # count "no year" as an information
        if not partial and video.original_series and 'year' not in guess:
            matches.add('year')

    elif isinstance(video, Movie):
        # year
        if video.year and 'year' in guess and guess['year'] == video.year:
            matches.add('year')

        # title (the guess may hold several titles, same as for series above)
        if video.title and 'title' in guess:
            titles = guess["title"]
            if not isinstance(titles, list):
                titles = [titles]

            movie_titles = [sanitize(name) for name in [video.title] + video.alternative_titles]
            if any(sanitize(title) in movie_titles for title in titles):
                matches.add('title')

    # release_group
    if 'release_group' in guess:
        release_groups = guess["release_group"]
        if not isinstance(release_groups, list):
            release_groups = [release_groups]

        if video.release_group:
            # the video side does not change between iterations, so compute it once
            equivalent_groups = get_equivalent_release_groups(sanitize_release_group(video.release_group))
            for release_group in release_groups:
                if sanitize_release_group(release_group) in equivalent_groups:
                    matches.add('release_group')
                    break

    # source
    if 'source' in guess:
        formats = guess["source"]
        if not isinstance(formats, list):
            formats = [formats]

        if video.source:
            video_format = video.source.lower()
            _video_gen_format = MERGED_FORMATS_REV.get(video_format)
            matched = False
            for frmt in formats:
                guess_format = frmt.lower()
                _guess_gen_frmt = MERGED_FORMATS_REV.get(guess_format)

                if _guess_gen_frmt is None:
                    # The guessed source is not in MERGED_FORMATS: two unknown sources must not
                    # match merely because both map to None, so compare their names instead.
                    same_source = _video_gen_format is None and guess_format == video_format
                else:
                    same_source = _guess_gen_frmt == _video_gen_format

                if same_source:
                    matched = True
                    matches.add('source')
                    break

            logger.debug("Source match found? %s: %s -> %s", matched, video.source, formats)

        if "release_group" in matches and "source" not in matches:
            logger.info("Release group matched but source didn't. Removing release group match.")
            matches.remove("release_group")

    # expose the guessed screen size under the attribute name used by the video
    guess.update({"resolution": guess.get("screen_size")})

    # Solve match keys for potential lists
    for key in ("video_codec", "audio_codec", "edition", "streaming_service", "resolution"):
        if _has_match(video, guess, key):
            matches.add(key)

    # Add streaming service match for non-web sources
    if video.source and video.source != "Web":
        matches.add("streaming_service")

    # As edition tags are rare, add edition match if the video doesn't have an edition
    if not video.edition:
        matches.add("edition")

    return matches
551