1import logging
2import os
3import re
4import sys
5import time
6
7from loguru import logger
8
9from flexget import plugin
10from flexget.event import event
11from flexget.utils import qualities
12from flexget.utils.parsers.generic import ParseWarning, default_ignore_prefixes, name_to_re
13from flexget.utils.tools import ReList
14
15from .parser_common import MovieParseResult, SeriesParseResult
16
17# rebulk (that underlies guessit) will use the 'regex' module rather than 're' if installed.
18# For consistency, prevent that unless env variable is explicitly already enabling it.
19os.environ.setdefault('REGEX_DISABLED', 'true')  # isort:skip
20from guessit.api import GuessItApi, GuessitException  # isort:skip
21from guessit.rules import rebulk_builder  # isort:skip
22from rebulk import Rebulk  # isort:skip
23from rebulk.pattern import RePattern  # isort:skip
24
25
# Only let warnings and above through from the guessit and rebulk stdlib loggers.
logging.getLogger('guessit').setLevel(logging.WARNING)
logging.getLogger('rebulk').setLevel(logging.WARNING)

# Rebind the module-level loguru logger so its records carry this plugin's name.
logger = logger.bind(name='parser_guessit')
30
31
def _id_regexps_function(input_string, context):
    """Collect the spans of all ``id_regexps`` matches found in ``input_string``.

    :param input_string: The string being parsed.
    :param context: Mapping holding the patterns under the ``id_regexps`` key.
    :return: A list with one ``span`` per match, in pattern order.
    """
    return [
        found.span
        for pattern in context.get('id_regexps')
        for found in RePattern(pattern, children=True).matches(input_string, context)
    ]
38
39
# Functional pattern named 'regexpId'; it is disabled whenever the parsing context
# carries no (or empty) 'id_regexps'.
_id_regexps = Rebulk().functional(
    _id_regexps_function,
    disabled=lambda context: not context.get('id_regexps'),
    name='regexpId',
)
43
44
def rules_builder(config):
    """Build guessit's rules for ``config`` and add the ``regexpId`` pattern to them.

    :param config: Configuration forwarded unchanged to ``rebulk_builder``.
    :return: The object returned by ``rebulk_builder`` after ``_id_regexps`` was registered.
    """
    guessit_rules = rebulk_builder(config)
    guessit_rules.rebulk(_id_regexps)
    return guessit_rules
49
50
# Module-wide guessit API instance, configured once at import time so that its rules
# are produced by rules_builder (which registers the 'regexpId' pattern).
guessit_api = GuessItApi()
guessit_api.configure(rules_builder=rules_builder, force=True, options={})
53
54
def normalize_component(data):
    """Return ``data`` as a list of lower-cased strings with every ``-`` removed.

    :param data: ``None``, a single string, or a list of strings.
    :return: ``[]`` for ``None``; otherwise a list with one normalized entry per input value.
    """
    if data is None:
        return []
    values = data if isinstance(data, list) else [data]
    return [value.lower().replace('-', '') for value in values]
62
63
# Clock used to time parsing for the debug log output.
# ``time.process_time`` has been available since Python 3.3, so no fallback is needed; the
# previous fallback, ``time.clock``, was removed in Python 3.8 and could not have worked anyway.
preferred_clock = time.process_time
68
69
class ParserGuessit:
    """FlexGet movie and series parser built on top of guessit."""

    # guessit ``source`` values that are renamed before being used in the FlexGet quality
    # string. Sources missing from this map are used as reported by guessit.
    SOURCE_MAP = {
        'Camera': 'cam',
        'HD Camera': 'cam',
        'HD Telesync': 'telesync',
        'Pay-per-view': 'ppv',
        'Digital TV': 'dvb',
        'Video on Demand': 'vod',
        'Analog HDTV': 'ahdtv',
        'Ultra HDTV': 'uhdtv',
        'HD Telecine': 'hdtc',
        'Web': 'web-dl',
    }

    @staticmethod
    def _guessit_options(options):
        """Build the option dict handed to guessit from FlexGet parser options.

        The caller's ``options`` mapping is not modified; a new dict is returned. Keys given
        in ``options`` override the defaults below, and the guessit-specific keys derived
        here override both.

        :param dict options: Parser keyword arguments as received by the ``parse_*`` methods.
        :return: A new dict with the defaults, ``options`` and the derived keys merged.
        """
        settings = {
            'name_only': True,
            'allowed_languages': ['en', 'fr'],
            'allowed_countries': ['us', 'uk', 'gb'],
            'single_value': True,
        }
        settings.update(options)
        settings['episode_prefer_number'] = options.get('identified_by') != 'ep'
        if options.get('allow_groups'):
            settings['expected_group'] = options['allow_groups']
        # Translate FlexGet's option spelling into the one guessit uses.
        if 'date_yearfirst' in options:
            settings['date_year_first'] = options['date_yearfirst']
        if 'date_dayfirst' in options:
            settings['date_day_first'] = options['date_dayfirst']
        elif settings.get('date_year_first'):
            # See https://github.com/guessit-io/guessit/issues/329
            # https://github.com/guessit-io/guessit/pull/333
            # They made changes that break backward compatibility, so we have to do this hackery
            settings['date_day_first'] = True
        return settings

    @staticmethod
    def _proper_count(guessit_result):
        """Calculate a FlexGet style proper_count from a guessit result.

        The result is the sum of a version term (``0`` when no version was found, ``-1`` for
        versions ``<= 0``, otherwise ``version - 1``) and guessit's own ``proper_count``,
        minus 5 when ``other`` contains "fast subtitled".
        """
        version = guessit_result.get('version')
        if version is None:
            version_term = 0
        elif version <= 0:
            version_term = -1
        else:
            version_term = version - 1
        proper_count = guessit_result.get('proper_count', 0)
        fastsub = 'fast subtitled' in normalize_component(guessit_result.values_list.get('other'))
        return version_term + proper_count - (5 if fastsub else 0)

    def _source(self, guessit_result):
        """Return the source component(s) of a guessit result as a list of normalized strings."""
        other = normalize_component(guessit_result.values_list.get('other'))
        guessit_source = guessit_result.get('source')
        source = self.SOURCE_MAP.get(guessit_source, guessit_source)
        # special case
        if source == 'web-dl' and 'rip' in other:
            source = 'webrip'

        source = normalize_component(source)

        # Some source-like markers are reported by guessit under ``other``.
        if 'preair' in other:
            source.append('preair')
        if 'screener' in other:
            if 'bluray' in source:
                source.append('bdscr')
            else:
                source.append('dvdscr')
        if 'region 5' in other or 'region c' in other:
            source.append('r5')

        return source

    def _quality(self, guessit_result):
        """Generate a FlexGet Quality from a guessit result.

        Resolution, source, codec and audio components are collected, joined into a single
        space separated string and passed to ``qualities.Quality``.

        :raises ParseWarning: If a component is neither a list nor a string.
        """
        resolution = normalize_component(guessit_result.values_list.get('screen_size'))
        other = normalize_component(guessit_result.values_list.get('other'))
        if not resolution and 'high resolution' in other:
            resolution.append('hr')

        source = self._source(guessit_result)

        codec = normalize_component(guessit_result.values_list.get('video_codec'))
        if '10bit' in normalize_component(guessit_result.values_list.get('color_depth')):
            codec.append('10bit')

        audio = normalize_component(guessit_result.values_list.get('audio_codec'))
        audio_profile = normalize_component(guessit_result.values_list.get('audio_profile'))
        audio_channels = normalize_component(guessit_result.values_list.get('audio_channels'))
        # unlike the other components, audio can be a bit iffy with multiple codecs, so we limit it to one
        if 'dts' in audio and any(hd in audio_profile for hd in ['hd', 'master audio']):
            audio = ['dtshd']
        elif '5.1' in audio_channels and 'dolby digital plus' in audio:
            audio = ['dd+5.1']
        elif '5.1' in audio_channels and 'dolby digital' in audio:
            audio = ['dd5.1']

        # Make sure everything are strings (guessit will return lists when there are multiples)
        flattened_qualities = []
        for component in (resolution, source, codec, audio):
            if isinstance(component, list):
                flattened_qualities.append(' '.join(component))
            elif isinstance(component, str):
                flattened_qualities.append(component)
            else:
                raise ParseWarning(
                    self,
                    'Guessit quality returned type {}: {}. Expected str or list.'.format(
                        type(component), component
                    ),
                )

        return qualities.Quality(' '.join(flattened_qualities))

    # movie_parser API
    def parse_movie(self, data, **kwargs):
        """Parse ``data`` as a movie release name.

        :param str data: The string to parse.
        :param kwargs: Parser options; they are merged into the guessit options.
        :return: A ``MovieParseResult``; it is marked invalid when guessit found no title.
        """
        logger.debug('Parsing movie: `{}` [options: {}]', data, kwargs)
        start = preferred_clock()
        guessit_options = self._guessit_options(kwargs)
        guessit_options['type'] = 'movie'
        # NOTE(review): unlike parse_series, a GuessitException is not caught here - confirm
        # whether callers rely on it propagating.
        guess_result = guessit_api.guessit(data, options=guessit_options)
        parsed = MovieParseResult(
            data=data,
            name=guess_result.get('title'),
            year=guess_result.get('year'),
            proper_count=self._proper_count(guess_result),
            quality=self._quality(guess_result),
            release_group=guess_result.get('release_group'),
            valid=bool(
                guess_result.get('title')
            ),  # It's not valid if it didn't find a name, which sometimes happens
        )
        logger.debug('Parsing result: {} (in {} ms)', parsed, (preferred_clock() - start) * 1000)
        return parsed

    # series_parser API
    def parse_series(self, data, **kwargs):
        """Parse ``data`` as a series release name.

        Options read here (all optional): ``name``, ``alternate_names``, ``id_regexps``,
        ``strict_name``, ``identified_by`` (default ``'auto'``), ``allow_seasonless``
        (default ``True``), ``allow_groups``, ``prefer_specials`` and ``assume_special``.
        All keyword arguments are also merged into the options passed to guessit.

        :param str data: The string to parse.
        :return: A ``SeriesParseResult``. It is marked invalid (rather than an exception being
            raised) when guessit fails or when any of the checks below rejects the parse.
        """
        logger.debug('Parsing series: `{}` [options: {}]', data, kwargs)
        guessit_options = self._guessit_options(kwargs)
        valid = True
        if kwargs.get('name'):
            expected_titles = [kwargs['name']]
            if kwargs.get('alternate_names'):
                expected_titles.extend(kwargs['alternate_names'])
            # apostrophe support
            expected_titles = [
                title.replace('\'', '(?:\'|\\\'|\\\\\'|-|)?') for title in expected_titles
            ]
            guessit_options['expected_title'] = ['re:' + title for title in expected_titles]
        if kwargs.get('id_regexps'):
            guessit_options['id_regexps'] = kwargs.get('id_regexps')
        start = preferred_clock()
        # If no series name is provided, we don't tell guessit what kind of match we are looking for
        # This prevents guessit from determining that too general of matches are series
        parse_type = 'episode' if kwargs.get('name') else None
        if parse_type:
            guessit_options['type'] = parse_type

        try:
            guess_result = guessit_api.guessit(data, options=guessit_options)
        except GuessitException:
            logger.warning('Parsing {} with guessit failed. Most likely a unicode error.', data)
            return SeriesParseResult(data=data, valid=False)

        if guess_result.get('type') != 'episode':
            valid = False

        name = kwargs.get('name')
        country = guess_result.get('country')
        if not name:
            # No expected name: take whatever title guessit found.
            name = guess_result.get('title', '')
            if not name:
                valid = False
            elif country and hasattr(country, 'alpha2'):
                name += ' (%s)' % country.alpha2
        elif guess_result.matches['title']:
            # Make sure the name match is up to FlexGet standards
            # Check there is no unmatched cruft before the matched name
            title_match = guess_result.matches['title'][0]
            title_start = title_match.start
            title_end = title_match.end
            if title_start != 0:
                # End of the closest match that finishes before the title (0 if there is none).
                pre_title = max(
                    (
                        match[0].end
                        for match in guess_result.matches.values()
                        if match[0].end <= title_start
                    ),
                    default=0,
                )
                # Walk backwards from the title: whitespace, '.' and '_' are skipped, any other
                # non-alphanumeric character ends the scan, an alphanumeric one rejects the parse.
                for char in reversed(data[pre_title:title_start]):
                    if char.isalnum():
                        return SeriesParseResult(data=data, valid=False)
                    if char.isspace() or char in '._':
                        continue
                    break
            # Check the name doesn't end mid-word (guessit might put the border before or after the space after title)
            if (
                data[title_end - 1].isalnum() and len(data) <= title_end
            ) or not self._is_valid_name(data, guessit_options=guessit_options):
                valid = False
            # If we are in exact mode, make sure there is nothing after the title
            if kwargs.get('strict_name'):
                # Start of the first identifier-like match located after the title.
                post_title = sys.maxsize
                for match_type, matches in guess_result.matches.items():
                    if match_type not in ['season', 'episode', 'date', 'regexpId']:
                        continue
                    first_match = matches[0]
                    if first_match.start < title_end:
                        continue
                    post_title = min(post_title, first_match.start)
                    if first_match.parent:
                        post_title = min(post_title, first_match.parent.start)
                if any(char.isalnum() for char in data[title_end:post_title]):
                    valid = False
        else:
            # A name was expected but guessit matched no title at all.
            valid = False
        season = guess_result.get('season')
        episode = guess_result.get('episode')
        if episode is None and 'part' in guess_result:
            episode = guess_result['part']
        if isinstance(episode, list):
            # guessit >=2.1.4 returns a list for multi-packs, but we just want the first one and the number of eps
            episode = episode[0]
        date = guess_result.get('date')
        quality = self._quality(guess_result)
        proper_count = self._proper_count(guess_result)
        group = guess_result.get('release_group')
        # Validate group with from_group
        if not self._is_valid_groups(group, guessit_options.get('allow_groups', [])):
            valid = False
        # Validate country, TODO: LEGACY
        if country and name.endswith(')'):
            p_start = name.rfind('(')
            if p_start != -1:
                # NOTE(review): re.escape only alters the text when it contains regex
                # metacharacters - confirm whether it is still needed for this comparison.
                parenthetical = re.escape(name[p_start + 1 : -1])
                if parenthetical and parenthetical.lower() != str(country).lower():
                    valid = False
        # Check the full list of 'episode_details' for special,
        # since things like 'pilot' and 'unaired' can also show up there
        special = any(
            v.lower() == 'special' for v in guess_result.values_list.get('episode_details', [])
        )
        if 'episode' not in guess_result.values_list:
            episodes = len(guess_result.values_list.get('part', []))
        else:
            episodes = len(guess_result.values_list['episode'])
        # More than three episodes in one release is rejected.
        if episodes > 3:
            valid = False
        # Pick the identifier: the first of date, ep, id, sequence that is both permitted by
        # ``identified_by`` and present in the guessit result wins.
        identified_by = kwargs.get('identified_by', 'auto')
        identifier_type, identifier = None, None
        if identified_by in ['date', 'auto']:
            if date:
                identifier_type = 'date'
                identifier = date
        if not identifier_type and identified_by in ['ep', 'auto']:
            if episode is not None:
                if season is None and kwargs.get('allow_seasonless', True):
                    if 'part' in guess_result:
                        season = 1
                    else:
                        # Assume season 1 when the raw episode text contains a letter other
                        # than 'v'.
                        episode_raw = guess_result.matches['episode'][0].initiator.raw
                        if episode_raw and any(
                            c.isalpha() and c.lower() != 'v' for c in episode_raw
                        ):
                            season = 1
                if season is not None:
                    identifier_type = 'ep'
                    identifier = (season, episode)

        if not identifier_type and identified_by in ['id', 'auto']:
            if guess_result.matches['regexpId']:
                identifier_type = 'id'
                identifier = '-'.join(match.value for match in guess_result.matches['regexpId'])
        if not identifier_type and identified_by in ['sequence', 'auto']:
            if episode is not None:
                identifier_type = 'sequence'
                identifier = episode
        if (not identifier_type or guessit_options.get('prefer_specials')) and (
            special or guessit_options.get('assume_special')
        ):
            identifier_type = 'special'
            identifier = guess_result.get('episode_title', 'special')
        if not identifier_type:
            valid = False
        # TODO: Legacy - Complete == invalid
        # NOTE(review): this reads get('other') while the other checks use values_list; with
        # 'single_value' enabled this may only see one of several values - confirm.
        if 'complete' in normalize_component(guess_result.get('other')):
            valid = False

        parsed = SeriesParseResult(
            data=data,
            name=name,
            episodes=episodes,
            identified_by=identified_by,
            id=identifier,
            id_type=identifier_type,
            quality=quality,
            proper_count=proper_count,
            special=special,
            group=group,
            valid=valid,
        )

        logger.debug('Parsing result: {} (in {} ms)', parsed, (preferred_clock() - start) * 1000)
        return parsed

    # TODO: The following functions are sort of legacy. No idea if they should be changed.
    def _is_valid_name(self, data, guessit_options):
        """Check that one of the series name regexps matches ``data``.

        Uses ``name_regexps`` from ``guessit_options`` when given; otherwise regexps are
        generated from ``name`` and ``alternate_names``. A missing or ``None`` value for
        ``name_regexps`` / ``alternate_names`` is treated as empty.

        :return: ``True`` when no ``name`` option is set or when a regexp matched with a
            non-zero end position, ``False`` otherwise.
        """
        if not guessit_options.get('name'):
            return True
        # name end position
        name_end = 0

        # regexp name matching
        re_from_name = False
        name_regexps = ReList(guessit_options.get('name_regexps') or [])
        if not name_regexps:
            # if we don't have name_regexps, generate one from the name
            alternate_names = guessit_options.get('alternate_names') or []
            name_regexps = ReList(
                name_to_re(name, default_ignore_prefixes, None)
                for name in [guessit_options['name'], *alternate_names]
            )
            # With auto regex generation, the first regex group captures the name
            re_from_name = True
        # try all specified regexps on this data
        for name_re in name_regexps:
            match = re.search(name_re, data)
            if match:
                match_end = match.end(1 if re_from_name else 0)
                # Always pick the longest matching regex
                if match_end > name_end:
                    name_end = match_end
                logger.debug('NAME SUCCESS: {} matched to {}', name_re.pattern, data)
        if not name_end:
            # leave this invalid
            logger.debug(
                'FAIL: name regexps {} do not match {}',
                [regexp.pattern for regexp in name_regexps],
                data,
            )
            return False
        return True

    def _is_valid_groups(self, group, allow_groups):
        """Check ``group`` against ``allow_groups``, ignoring case.

        :param group: Release group as a string, a list of strings, or a falsy value.
        :param allow_groups: Accepted group names; a falsy value accepts everything.
        :return: ``True`` when no restriction is set or when (one of) the group(s) is allowed.
        """
        if not allow_groups:
            return True
        if not group:
            return False
        normalized_allow_groups = [x.lower() for x in allow_groups]
        # TODO: special case for guessit with expected_group parameter
        if isinstance(group, list):
            return any(g.lower() in normalized_allow_groups for g in group)

        return group.lower() in normalized_allow_groups
430
@event('plugin.register')
def register_plugin():
    """Register ``ParserGuessit`` as ``parser_guessit`` for the movie and series parser interfaces."""
    parser_interfaces = ['movie_parser', 'series_parser']
    plugin.register(ParserGuessit, 'parser_guessit', interfaces=parser_interfaces, api_ver=2)
436