1import logging
2import os
3import re
4import sys
5import time
7from loguru import logger
9from flexget import plugin
10from flexget.event import event
11from flexget.utils import qualities
12from flexget.utils.parsers.generic import ParseWarning, default_ignore_prefixes, name_to_re
13from flexget.utils.tools import ReList
15from .parser_common import MovieParseResult, SeriesParseResult
17# rebulk (that underlies guessit) will use the 'regex' module rather than 're' if installed.
18# For consistency, prevent that unless env variable is explicitly already enabling it.
19os.environ.setdefault('REGEX_DISABLED', 'true')  # isort:skip
20from guessit.api import GuessItApi, GuessitException  # isort:skip
21from guessit.rules import rebulk_builder  # isort:skip
22from rebulk import Rebulk  # isort:skip
23from rebulk.pattern import RePattern  # isort:skip
26logger = logger.bind(name='parser_guessit')
32def _id_regexps_function(input_string, context):
33    ret = []
34    for regexp in context.get('id_regexps'):
35        for match in RePattern(regexp, children=True).matches(input_string, context):
36            ret.append(match.span)
37    return ret
# Functional rebulk pattern publishing the id-regexp matches under the name
# 'regexpId'. It is disabled whenever the parsing context has no 'id_regexps'.
_id_regexps = Rebulk().functional(
    _id_regexps_function, name='regexpId', disabled=lambda context: not context.get('id_regexps')
)
def rules_builder(config):
    """Return the rebulk object built by ``rebulk_builder(config)`` with the id-regexp pattern added."""
    rules = rebulk_builder(config)
    rules.rebulk(_id_regexps)
    return rules
# Module-wide guessit API instance shared by every parse call. It is configured
# once, at import time, so that its rules come from rules_builder above.
guessit_api = GuessItApi()
guessit_api.configure(force=True, rules_builder=rules_builder, options={})
def normalize_component(data):
    """Return ``data`` as a list of lower-cased strings with every '-' removed.

    ``None`` yields an empty list, a list is normalized item by item, and any
    other value is treated as a single string and wrapped in a one-item list.
    """
    if data is None:
        return []
    items = data if isinstance(data, list) else [data]
    return [item.lower().replace('-', '') for item in items]
# Clock used to time the parse calls. time.process_time exists on every
# Python 3 release since 3.3 and the former fallback, time.clock, was removed
# in Python 3.8, so the name is bound directly with no fallback.
preferred_clock = time.process_time
class ParserGuessit:
    """Parser plugin that delegates release-name parsing to guessit.

    It is registered under the name 'parser_guessit' for the 'movie_parser'
    and 'series_parser' interfaces.
    """

    # guessit 'source' value -> source name used when building a FlexGet quality.
    # A source that is not listed here is used exactly as guessit reported it.
    SOURCE_MAP = {
        # camera / telesync / telecine captures
        'Camera': 'cam',
        'HD Camera': 'cam',
        'HD Telesync': 'telesync',
        'HD Telecine': 'hdtc',
        # broadcast and on-demand sources
        'Pay-per-view': 'ppv',
        'Video on Demand': 'vod',
        'Digital TV': 'dvb',
        'Analog HDTV': 'ahdtv',
        'Ultra HDTV': 'uhdtv',
        # web downloads
        'Web': 'web-dl',
    }
84    @staticmethod
85    def _guessit_options(options):
86        settings = {
87            'name_only': True,
88            'allowed_languages': ['en', 'fr'],
89            'allowed_countries': ['us', 'uk', 'gb'],
90            'single_value': True,
91        }
92        options['episode_prefer_number'] = not options.get('identified_by') == 'ep'
93        if options.get('allow_groups'):
94            options['expected_group'] = options['allow_groups']
95        if 'date_yearfirst' in options:
96            options['date_year_first'] = options['date_yearfirst']
97        if 'date_dayfirst' in options:
98            options['date_day_first'] = options['date_dayfirst']
99        else:
100            # See https://github.com/guessit-io/guessit/issues/329
101            # https://github.com/guessit-io/guessit/pull/333
102            # They made changes that break backward compatibility, so we have to make do this hackery
103            if options.get('date_year_first'):
104                options['date_day_first'] = True
105        settings.update(options)
106        return settings
108    @staticmethod
109    def _proper_count(guessit_result):
110        """Calculate a FlexGet style proper_count from a guessit result."""
111        version = guessit_result.get('version')
112        if version is None:
113            version = 0
114        elif version <= 0:
115            version = -1
116        else:
117            version -= 1
118        proper_count = guessit_result.get('proper_count', 0)
119        fastsub = 'fast subtitled' in normalize_component(guessit_result.values_list.get('other'))
120        return version + proper_count - (5 if fastsub else 0)
122    def _source(self, guessit_result):
123        other = normalize_component(guessit_result.values_list.get('other'))
124        source = self.SOURCE_MAP.get(guessit_result.get('source'), guessit_result.get('source'))
125        # special case
126        if source == 'web-dl' and 'rip' in other:
127            source = 'webrip'
129        source = normalize_component(source)
131        if 'preair' in other:
132            source.append('preair')
133        if 'screener' in other:
134            if 'bluray' in source:
135                source.append('bdscr')
136            else:
137                source.append('dvdscr')
138        if 'region 5' in other or 'region c' in other:
139            source.append('r5')
141        return source
143    def _quality(self, guessit_result):
144        """Generate a FlexGet Quality from a guessit result."""
145        resolution = normalize_component(guessit_result.values_list.get('screen_size'))
146        other = normalize_component(guessit_result.values_list.get('other'))
147        if not resolution and 'high resolution' in other:
148            resolution.append('hr')
150        source = self._source(guessit_result)
152        codec = normalize_component(guessit_result.values_list.get('video_codec'))
153        if '10bit' in normalize_component(guessit_result.values_list.get('color_depth')):
154            codec.append('10bit')
156        audio = normalize_component(guessit_result.values_list.get('audio_codec'))
157        audio_profile = normalize_component(guessit_result.values_list.get('audio_profile'))
158        audio_channels = normalize_component(guessit_result.values_list.get('audio_channels'))
159        # unlike the other components, audio can be a bit iffy with multiple codecs, so we limit it to one
160        if 'dts' in audio and any(hd in audio_profile for hd in ['hd', 'master audio']):
161            audio = ['dtshd']
162        elif '5.1' in audio_channels and 'dolby digital plus' in audio:
163            audio = ['dd+5.1']
164        elif '5.1' in audio_channels and 'dolby digital' in audio:
165            audio = ['dd5.1']
167        # Make sure everything are strings (guessit will return lists when there are multiples)
168        flattened_qualities = []
169        for component in (resolution, source, codec, audio):
170            if isinstance(component, list):
171                flattened_qualities.append(' '.join(component))
172            elif isinstance(component, str):
173                flattened_qualities.append(component)
174            else:
175                raise ParseWarning(
176                    self,
177                    'Guessit quality returned type {}: {}. Expected str or list.'.format(
178                        type(component), component
179                    ),
180                )
182        return qualities.Quality(' '.join(flattened_qualities))
184    # movie_parser API
185    def parse_movie(self, data, **kwargs):
186        logger.debug('Parsing movie: `{}` [options: {}]', data, kwargs)
187        start = preferred_clock()
188        guessit_options = self._guessit_options(kwargs)
189        guessit_options['type'] = 'movie'
190        guess_result = guessit_api.guessit(data, options=guessit_options)
191        # NOTE: Guessit expects str on PY3 and unicode on PY2 hence the use of future.utils.native
192        parsed = MovieParseResult(
193            data=data,
194            name=guess_result.get('title'),
195            year=guess_result.get('year'),
196            proper_count=self._proper_count(guess_result),
197            quality=self._quality(guess_result),
198            release_group=guess_result.get('release_group'),
199            valid=bool(
200                guess_result.get('title')
201            ),  # It's not valid if it didn't find a name, which sometimes happens
202        )
203        logger.debug('Parsing result: {} (in {} ms)', parsed, (preferred_clock() - start) * 1000)
204        return parsed
206    # series_parser API
207    def parse_series(self, data, **kwargs):
208        logger.debug('Parsing series: `{}` [options: {}]', data, kwargs)
209        guessit_options = self._guessit_options(kwargs)
210        valid = True
211        if kwargs.get('name'):
212            expected_titles = [kwargs['name']]
213            if kwargs.get('alternate_names'):
214                expected_titles.extend(kwargs['alternate_names'])
215            # apostrophe support
216            expected_titles = [
217                title.replace('\'', '(?:\'|\\\'|\\\\\'|-|)?') for title in expected_titles
218            ]
219            guessit_options['expected_title'] = ['re:' + title for title in expected_titles]
220        if kwargs.get('id_regexps'):
221            guessit_options['id_regexps'] = kwargs.get('id_regexps')
222        start = preferred_clock()
223        # If no series name is provided, we don't tell guessit what kind of match we are looking for
224        # This prevents guessit from determining that too general of matches are series
225        parse_type = 'episode' if kwargs.get('name') else None
226        if parse_type:
227            guessit_options['type'] = parse_type
229        # NOTE: Guessit expects str on PY3 and unicode on PY2 hence the use of future.utils.native
230        try:
231            guess_result = guessit_api.guessit(data, options=guessit_options)
232        except GuessitException:
233            logger.warning('Parsing {} with guessit failed. Most likely a unicode error.', data)
234            return SeriesParseResult(data=data, valid=False)
236        if guess_result.get('type') != 'episode':
237            valid = False
239        name = kwargs.get('name')
240        country = guess_result.get('country')
241        if not name:
242            name = guess_result.get('title', '')
243            if not name:
244                valid = False
245            elif country and hasattr(country, 'alpha2'):
246                name += ' (%s)' % country.alpha2
247        elif guess_result.matches['title']:
248            # Make sure the name match is up to FlexGet standards
249            # Check there is no unmatched cruft before the matched name
250            title_start = guess_result.matches['title'][0].start
251            title_end = guess_result.matches['title'][0].end
252            if title_start != 0:
253                try:
254                    pre_title = max(
255                        (
256                            match[0].end
257                            for match in guess_result.matches.values()
258                            if match[0].end <= title_start
259                        )
260                    )
261                except ValueError:
262                    pre_title = 0
263                for char in reversed(data[pre_title:title_start]):
264                    if char.isalnum() or char.isdigit():
265                        return SeriesParseResult(data=data, valid=False)
266                    if char.isspace() or char in '._':
267                        continue
268                    else:
269                        break
270            # Check the name doesn't end mid-word (guessit might put the border before or after the space after title)
271            if (
272                data[title_end - 1].isalnum()
273                and len(data) <= title_end
274                or not self._is_valid_name(data, guessit_options=guessit_options)
275            ):
276                valid = False
277            # If we are in exact mode, make sure there is nothing after the title
278            if kwargs.get('strict_name'):
279                post_title = sys.maxsize
280                for match_type, matches in guess_result.matches.items():
281                    if match_type in ['season', 'episode', 'date', 'regexpId']:
282                        if matches[0].start < title_end:
283                            continue
284                        post_title = min(post_title, matches[0].start)
285                        if matches[0].parent:
286                            post_title = min(post_title, matches[0].parent.start)
287                for char in data[title_end:post_title]:
288                    if char.isalnum() or char.isdigit():
289                        valid = False
290        else:
291            valid = False
292        season = guess_result.get('season')
293        episode = guess_result.get('episode')
294        if episode is None and 'part' in guess_result:
295            episode = guess_result['part']
296        if isinstance(episode, list):
297            # guessit >=2.1.4 returns a list for multi-packs, but we just want the first one and the number of eps
298            episode = episode[0]
299        date = guess_result.get('date')
300        quality = self._quality(guess_result)
301        proper_count = self._proper_count(guess_result)
302        group = guess_result.get('release_group')
303        # Validate group with from_group
304        if not self._is_valid_groups(group, guessit_options.get('allow_groups', [])):
305            valid = False
306        # Validate country, TODO: LEGACY
307        if country and name.endswith(')'):
308            p_start = name.rfind('(')
309            if p_start != -1:
310                parenthetical = re.escape(name[p_start + 1 : -1])
311                if parenthetical and parenthetical.lower() != str(country).lower():
312                    valid = False
313        # Check the full list of 'episode_details' for special,
314        # since things like 'pilot' and 'unaired' can also show up there
315        special = any(
316            v.lower() == 'special' for v in guess_result.values_list.get('episode_details', [])
317        )
318        if 'episode' not in guess_result.values_list:
319            episodes = len(guess_result.values_list.get('part', []))
320        else:
321            episodes = len(guess_result.values_list['episode'])
322        if episodes > 3:
323            valid = False
324        identified_by = kwargs.get('identified_by', 'auto')
325        identifier_type, identifier = None, None
326        if identified_by in ['date', 'auto']:
327            if date:
328                identifier_type = 'date'
329                identifier = date
330        if not identifier_type and identified_by in ['ep', 'auto']:
331            if episode is not None:
332                if season is None and kwargs.get('allow_seasonless', True):
333                    if 'part' in guess_result:
334                        season = 1
335                    else:
336                        episode_raw = guess_result.matches['episode'][0].initiator.raw
337                        if episode_raw and any(
338                            c.isalpha() and c.lower() != 'v' for c in episode_raw
339                        ):
340                            season = 1
341                if season is not None:
342                    identifier_type = 'ep'
343                    identifier = (season, episode)
345        if not identifier_type and identified_by in ['id', 'auto']:
346            if guess_result.matches['regexpId']:
347                identifier_type = 'id'
348                identifier = '-'.join(match.value for match in guess_result.matches['regexpId'])
349        if not identifier_type and identified_by in ['sequence', 'auto']:
350            if episode is not None:
351                identifier_type = 'sequence'
352                identifier = episode
353        if (not identifier_type or guessit_options.get('prefer_specials')) and (
354            special or guessit_options.get('assume_special')
355        ):
356            identifier_type = 'special'
357            identifier = guess_result.get('episode_title', 'special')
358        if not identifier_type:
359            valid = False
360        # TODO: Legacy - Complete == invalid
361        if 'complete' in normalize_component(guess_result.get('other')):
362            valid = False
364        parsed = SeriesParseResult(
365            data=data,
366            name=name,
367            episodes=episodes,
368            identified_by=identified_by,
369            id=identifier,
370            id_type=identifier_type,
371            quality=quality,
372            proper_count=proper_count,
373            special=special,
374            group=group,
375            valid=valid,
376        )
378        logger.debug('Parsing result: {} (in {} ms)', parsed, (preferred_clock() - start) * 1000)
379        return parsed
381    # TODO: The following functions are sort of legacy. No idea if they should be changed.
382    def _is_valid_name(self, data, guessit_options):
383        if not guessit_options.get('name'):
384            return True
385        # name end position
386        name_end = 0
388        # regexp name matching
389        re_from_name = False
390        name_regexps = ReList(guessit_options.get('name_regexps', []))
391        if not name_regexps:
392            # if we don't have name_regexps, generate one from the name
393            name_regexps = ReList(
394                name_to_re(name, default_ignore_prefixes, None)
395                for name in [guessit_options['name']] + guessit_options.get('alternate_names', [])
396            )
397            # With auto regex generation, the first regex group captures the name
398            re_from_name = True
399        # try all specified regexps on this data
400        for name_re in name_regexps:
401            match = re.search(name_re, data)
402            if match:
403                match_end = match.end(1 if re_from_name else 0)
404                # Always pick the longest matching regex
405                if match_end > name_end:
406                    name_end = match_end
407                logger.debug('NAME SUCCESS: {} matched to {}', name_re.pattern, data)
408        if not name_end:
409            # leave this invalid
410            logger.debug(
411                'FAIL: name regexps {} do not match {}',
412                [regexp.pattern for regexp in name_regexps],
413                data,
414            )
415            return False
416        return True
418    def _is_valid_groups(self, group, allow_groups):
419        if not allow_groups:
420            return True
421        if not group:
422            return False
423        normalized_allow_groups = [x.lower() for x in allow_groups]
424        # TODO: special case for guessit with expected_group parameter
425        if isinstance(group, list):
426            return any(g.lower() in normalized_allow_groups for g in group)
428        return group.lower() in normalized_allow_groups
# Hooked to the 'plugin.register' event so the parser is actually registered;
# without the hook nothing in this module ever calls register_plugin.
@event('plugin.register')
def register_plugin():
    """Register ``ParserGuessit`` as 'parser_guessit' for the movie and series parser interfaces."""
    plugin.register(
        ParserGuessit, 'parser_guessit', interfaces=['movie_parser', 'series_parser'], api_ver=2
    )