import logging
import os
import re
import sys
import time

from loguru import logger

from flexget import plugin
from flexget.event import event
from flexget.utils import qualities
from flexget.utils.parsers.generic import ParseWarning, default_ignore_prefixes, name_to_re
from flexget.utils.tools import ReList

from .parser_common import MovieParseResult, SeriesParseResult

# rebulk (that underlies guessit) will use the 'regex' module rather than 're' if installed.
# For consistency, prevent that unless env variable is explicitly already enabling it.
os.environ.setdefault('REGEX_DISABLED', 'true')  # isort:skip
from guessit.api import GuessItApi, GuessitException  # isort:skip
from guessit.rules import rebulk_builder  # isort:skip
from rebulk import Rebulk  # isort:skip
from rebulk.pattern import RePattern  # isort:skip


logger = logger.bind(name='parser_guessit')

logging.getLogger('rebulk').setLevel(logging.WARNING)
logging.getLogger('guessit').setLevel(logging.WARNING)


def _id_regexps_function(input_string, context):
    """Return the spans in `input_string` matched by any regexp in ``context['id_regexps']``.

    Used as a rebulk functional pattern. A missing/empty ``id_regexps`` entry yields no spans.
    """
    spans = []
    for regexp in context.get('id_regexps') or ():
        pattern = RePattern(regexp, children=True)
        spans.extend(match.span for match in pattern.matches(input_string, context))
    return spans


# Extra rebulk rule producing 'regexpId' matches; only active when the context carries 'id_regexps'.
_id_regexps = Rebulk().functional(
    _id_regexps_function, name='regexpId', disabled=lambda context: not context.get('id_regexps')
)


def rules_builder(config):
    """Build guessit's default rebulk rules for `config` and append the 'regexpId' rule."""
    rebulk = rebulk_builder(config)
    rebulk.rebulk(_id_regexps)
    return rebulk


guessit_api = GuessItApi()
guessit_api.configure(options={}, rules_builder=rules_builder, force=True)


def normalize_component(data):
    """Return `data` as a list of lowercased strings with '-' removed.

    :param data: None, a single string, or a list of strings.
    :return: An empty list for None, otherwise a (new) list of normalized strings.
    """
    if data is None:
        return []
    if isinstance(data, list):
        return [d.lower().replace('-', '') for d in data]

    return [data.lower().replace('-', '')]


# `time.clock` was removed in Python 3.8 and `time.process_time` is always available on Python 3,
# so no fallback is needed.
preferred_clock = time.process_time


class ParserGuessit:
    """FlexGet movie/series parser backed by guessit."""

    # Maps guessit 'source' values to the source names used when building a FlexGet quality string.
    SOURCE_MAP = {
        'Camera': 'cam',
        'HD Camera': 'cam',
        'HD Telesync': 'telesync',
        'Pay-per-view': 'ppv',
        'Digital TV': 'dvb',
        'Video on Demand': 'vod',
        'Analog HDTV': 'ahdtv',
        'Ultra HDTV': 'uhdtv',
        'HD Telecine': 'hdtc',
        'Web': 'web-dl',
    }

    @staticmethod
    def _guessit_options(options):
        """Translate FlexGet parser options into the options dict handed to guessit.

        :param options: Mapping of FlexGet parser options. It is not modified.
        :return: A new dict containing the default guessit settings overlaid with `options`
            and the derived guessit-specific keys.
        """
        settings = {
            'name_only': True,
            'allowed_languages': ['en', 'fr'],
            'allowed_countries': ['us', 'uk', 'gb'],
            'single_value': True,
        }
        # Work on a copy so the caller's dict does not grow guessit-only keys as a side effect.
        options = dict(options)
        options['episode_prefer_number'] = options.get('identified_by') != 'ep'
        if options.get('allow_groups'):
            options['expected_group'] = options['allow_groups']
        if 'date_yearfirst' in options:
            options['date_year_first'] = options['date_yearfirst']
        if 'date_dayfirst' in options:
            options['date_day_first'] = options['date_dayfirst']
        else:
            # See https://github.com/guessit-io/guessit/issues/329
            # https://github.com/guessit-io/guessit/pull/333
            # They made changes that break backward compatibility, so we have to do this hackery
            if options.get('date_year_first'):
                options['date_day_first'] = True
        settings.update(options)
        return settings

    @staticmethod
    def _proper_count(guessit_result):
        """Calculate a FlexGet style proper_count from a guessit result."""
        version = guessit_result.get('version')
        if version is None:
            version = 0
        elif version <= 0:
            version = -1
        else:
            version -= 1
        proper_count = guessit_result.get('proper_count', 0)
        fastsub = 'fast subtitled' in normalize_component(guessit_result.values_list.get('other'))
        # Fast-subbed releases are penalized by 5.
        return version + proper_count - (5 if fastsub else 0)

    def _source(self, guessit_result):
        """Return the list of normalized source components derived from a guessit result."""
        other = normalize_component(guessit_result.values_list.get('other'))
        source = self.SOURCE_MAP.get(guessit_result.get('source'), guessit_result.get('source'))
        # special case
        if source == 'web-dl' and 'rip' in other:
            source = 'webrip'

        source = normalize_component(source)

        if 'preair' in other:
            source.append('preair')
        if 'screener' in other:
            if 'bluray' in source:
                source.append('bdscr')
            else:
                source.append('dvdscr')
        if 'region 5' in other or 'region c' in other:
            source.append('r5')

        return source

    def _quality(self, guessit_result):
        """Generate a FlexGet Quality from a guessit result."""
        resolution = normalize_component(guessit_result.values_list.get('screen_size'))
        other = normalize_component(guessit_result.values_list.get('other'))
        if not resolution and 'high resolution' in other:
            resolution.append('hr')

        source = self._source(guessit_result)

        codec = normalize_component(guessit_result.values_list.get('video_codec'))
        if '10bit' in normalize_component(guessit_result.values_list.get('color_depth')):
            codec.append('10bit')

        audio = normalize_component(guessit_result.values_list.get('audio_codec'))
        audio_profile = normalize_component(guessit_result.values_list.get('audio_profile'))
        audio_channels = normalize_component(guessit_result.values_list.get('audio_channels'))
        # unlike the other components, audio can be a bit iffy with multiple codecs, so we limit it to one
        if 'dts' in audio and any(hd in audio_profile for hd in ['hd', 'master audio']):
            audio = ['dtshd']
        elif '5.1' in audio_channels and 'dolby digital plus' in audio:
            audio = ['dd+5.1']
        elif '5.1' in audio_channels and 'dolby digital' in audio:
            audio = ['dd5.1']

        # Make sure everything are strings (guessit will return lists when there are multiples)
        flattened_qualities = []
        for component in (resolution, source, codec, audio):
            if isinstance(component, list):
                flattened_qualities.append(' '.join(component))
            elif isinstance(component, str):
                flattened_qualities.append(component)
            else:
                raise ParseWarning(
                    self,
                    'Guessit quality returned type {}: {}. Expected str or list.'.format(
                        type(component), component
                    ),
                )

        return qualities.Quality(' '.join(flattened_qualities))

    # movie_parser API
    def parse_movie(self, data, **kwargs):
        """Parse `data` as a movie title.

        :param data: The string to parse.
        :param kwargs: Parser options, passed through `_guessit_options`.
        :return: A `MovieParseResult`; it is marked invalid when guessit finds no title.
        """
        logger.debug('Parsing movie: `{}` [options: {}]', data, kwargs)
        start = preferred_clock()
        guessit_options = self._guessit_options(kwargs)
        guessit_options['type'] = 'movie'
        guess_result = guessit_api.guessit(data, options=guessit_options)
        parsed = MovieParseResult(
            data=data,
            name=guess_result.get('title'),
            year=guess_result.get('year'),
            proper_count=self._proper_count(guess_result),
            quality=self._quality(guess_result),
            release_group=guess_result.get('release_group'),
            valid=bool(
                guess_result.get('title')
            ),  # It's not valid if it didn't find a name, which sometimes happens
        )
        logger.debug('Parsing result: {} (in {} ms)', parsed, (preferred_clock() - start) * 1000)
        return parsed

    # series_parser API
    def parse_series(self, data, **kwargs):
        """Parse `data` as a series episode.

        Options read directly from `kwargs` here: ``name``, ``alternate_names``, ``id_regexps``,
        ``strict_name``, ``identified_by`` (default ``'auto'``) and ``allow_seasonless``
        (default True). All of `kwargs` is also forwarded to guessit via `_guessit_options`.

        :param data: The string to parse.
        :return: A `SeriesParseResult`. `valid` is False when guessit fails, the title does not
            line up with the expected name, no identifier can be determined, or one of the
            other checks below rejects the result.
        """
        logger.debug('Parsing series: `{}` [options: {}]', data, kwargs)
        guessit_options = self._guessit_options(kwargs)
        valid = True
        if kwargs.get('name'):
            expected_titles = [kwargs['name']]
            if kwargs.get('alternate_names'):
                expected_titles.extend(kwargs['alternate_names'])
            # apostrophe support
            expected_titles = [
                title.replace('\'', '(?:\'|\\\'|\\\\\'|-|)?') for title in expected_titles
            ]
            guessit_options['expected_title'] = ['re:' + title for title in expected_titles]
        if kwargs.get('id_regexps'):
            guessit_options['id_regexps'] = kwargs.get('id_regexps')
        start = preferred_clock()
        # If no series name is provided, we don't tell guessit what kind of match we are looking for
        # This prevents guessit from determining that too general of matches are series
        parse_type = 'episode' if kwargs.get('name') else None
        if parse_type:
            guessit_options['type'] = parse_type

        try:
            guess_result = guessit_api.guessit(data, options=guessit_options)
        except GuessitException:
            logger.warning('Parsing {} with guessit failed. Most likely a unicode error.', data)
            return SeriesParseResult(data=data, valid=False)

        if guess_result.get('type') != 'episode':
            valid = False

        name = kwargs.get('name')
        country = guess_result.get('country')
        if not name:
            name = guess_result.get('title', '')
            if not name:
                valid = False
            elif country and hasattr(country, 'alpha2'):
                name += ' (%s)' % country.alpha2
        elif guess_result.matches['title']:
            # Make sure the name match is up to FlexGet standards
            # Check there is no unmatched cruft before the matched name
            title_start = guess_result.matches['title'][0].start
            title_end = guess_result.matches['title'][0].end
            if title_start != 0:
                try:
                    pre_title = max(
                        (
                            match[0].end
                            for match in guess_result.matches.values()
                            if match[0].end <= title_start
                        )
                    )
                except ValueError:
                    # No match ends before the title, so scan from the start of the string
                    pre_title = 0
                for char in reversed(data[pre_title:title_start]):
                    # str.isalnum() already covers digits
                    if char.isalnum():
                        return SeriesParseResult(data=data, valid=False)
                    if char.isspace() or char in '._':
                        continue
                    else:
                        break
            # Check the name doesn't end mid-word (guessit might put the border before or after the space after title)
            if (
                data[title_end - 1].isalnum()
                and len(data) <= title_end
                or not self._is_valid_name(data, guessit_options=guessit_options)
            ):
                valid = False
            # If we are in exact mode, make sure there is nothing after the title
            if kwargs.get('strict_name'):
                post_title = sys.maxsize
                for match_type, matches in guess_result.matches.items():
                    if match_type in ['season', 'episode', 'date', 'regexpId']:
                        if matches[0].start < title_end:
                            continue
                        post_title = min(post_title, matches[0].start)
                        if matches[0].parent:
                            post_title = min(post_title, matches[0].parent.start)
                # Any alphanumeric character between the title and the identifier is cruft
                if any(char.isalnum() for char in data[title_end:post_title]):
                    valid = False
        else:
            valid = False
        season = guess_result.get('season')
        episode = guess_result.get('episode')
        if episode is None and 'part' in guess_result:
            episode = guess_result['part']
        if isinstance(episode, list):
            # guessit >=2.1.4 returns a list for multi-packs, but we just want the first one and the number of eps
            episode = episode[0]
        date = guess_result.get('date')
        quality = self._quality(guess_result)
        proper_count = self._proper_count(guess_result)
        group = guess_result.get('release_group')
        # Validate group with from_group
        if not self._is_valid_groups(group, guessit_options.get('allow_groups', [])):
            valid = False
        # Validate country, TODO: LEGACY
        if country and name.endswith(')'):
            p_start = name.rfind('(')
            if p_start != -1:
                parenthetical = re.escape(name[p_start + 1 : -1])
                if parenthetical and parenthetical.lower() != str(country).lower():
                    valid = False
        # Check the full list of 'episode_details' for special,
        # since things like 'pilot' and 'unaired' can also show up there
        special = any(
            v.lower() == 'special' for v in guess_result.values_list.get('episode_details', [])
        )
        if 'episode' not in guess_result.values_list:
            episodes = len(guess_result.values_list.get('part', []))
        else:
            episodes = len(guess_result.values_list['episode'])
        if episodes > 3:
            valid = False
        identified_by = kwargs.get('identified_by', 'auto')
        identifier_type, identifier = None, None
        # The identifier types are tried in order: date, ep, id, sequence, then special
        if identified_by in ['date', 'auto']:
            if date:
                identifier_type = 'date'
                identifier = date
        if not identifier_type and identified_by in ['ep', 'auto']:
            if episode is not None:
                if season is None and kwargs.get('allow_seasonless', True):
                    if 'part' in guess_result:
                        season = 1
                    else:
                        episode_raw = guess_result.matches['episode'][0].initiator.raw
                        # A letter other than 'v' in the raw episode text makes it season 1
                        if episode_raw and any(
                            c.isalpha() and c.lower() != 'v' for c in episode_raw
                        ):
                            season = 1
                if season is not None:
                    identifier_type = 'ep'
                    identifier = (season, episode)

        if not identifier_type and identified_by in ['id', 'auto']:
            if guess_result.matches['regexpId']:
                identifier_type = 'id'
                identifier = '-'.join(match.value for match in guess_result.matches['regexpId'])
        if not identifier_type and identified_by in ['sequence', 'auto']:
            if episode is not None:
                identifier_type = 'sequence'
                identifier = episode
        if (not identifier_type or guessit_options.get('prefer_specials')) and (
            special or guessit_options.get('assume_special')
        ):
            identifier_type = 'special'
            identifier = guess_result.get('episode_title', 'special')
        if not identifier_type:
            valid = False
        # TODO: Legacy - Complete == invalid
        if 'complete' in normalize_component(guess_result.get('other')):
            valid = False

        parsed = SeriesParseResult(
            data=data,
            name=name,
            episodes=episodes,
            identified_by=identified_by,
            id=identifier,
            id_type=identifier_type,
            quality=quality,
            proper_count=proper_count,
            special=special,
            group=group,
            valid=valid,
        )

        logger.debug('Parsing result: {} (in {} ms)', parsed, (preferred_clock() - start) * 1000)
        return parsed

    # TODO: The following functions are sort of legacy. No idea if they should be changed.
    def _is_valid_name(self, data, guessit_options):
        """Check that `data` matches the expected series name.

        Uses ``guessit_options['name_regexps']`` when given, otherwise regexps generated from
        ``name`` and ``alternate_names``.

        :return: True when no ``name`` option is set or at least one regexp matches with a
            non-zero end position, False otherwise.
        """
        if not guessit_options.get('name'):
            return True
        # name end position
        name_end = 0

        # regexp name matching
        re_from_name = False
        name_regexps = ReList(guessit_options.get('name_regexps', []))
        if not name_regexps:
            # if we don't have name_regexps, generate one from the name
            # (tolerate alternate_names being None or a non-list sequence)
            alternate_names = list(guessit_options.get('alternate_names') or [])
            name_regexps = ReList(
                name_to_re(name, default_ignore_prefixes, None)
                for name in [guessit_options['name']] + alternate_names
            )
            # With auto regex generation, the first regex group captures the name
            re_from_name = True
        # try all specified regexps on this data
        for name_re in name_regexps:
            match = re.search(name_re, data)
            if match:
                match_end = match.end(1 if re_from_name else 0)
                # Always pick the longest matching regex
                if match_end > name_end:
                    name_end = match_end
                logger.debug('NAME SUCCESS: {} matched to {}', name_re.pattern, data)
        if not name_end:
            # leave this invalid
            logger.debug(
                'FAIL: name regexps {} do not match {}',
                [regexp.pattern for regexp in name_regexps],
                data,
            )
            return False
        return True

    def _is_valid_groups(self, group, allow_groups):
        """Return True if `group` is acceptable given `allow_groups` (compared case-insensitively).

        An empty `allow_groups` accepts everything; otherwise a missing `group` is rejected.
        `group` may be a single string or a list of strings (any member may match).
        """
        if not allow_groups:
            return True
        if not group:
            return False
        normalized_allow_groups = [x.lower() for x in allow_groups]
        # TODO: special case for guessit with expected_group parameter
        if isinstance(group, list):
            return any(g.lower() in normalized_allow_groups for g in group)

        return group.lower() in normalized_allow_groups


@event('plugin.register')
def register_plugin():
    """Register `ParserGuessit` as a movie and series parser plugin."""
    plugin.register(
        ParserGuessit, 'parser_guessit', interfaces=['movie_parser', 'series_parser'], api_ver=2
    )