import difflib
import html
import json
import random
import re

from bs4.element import Tag
from loguru import logger

from flexget import plugin
from flexget.utils.requests import Session, TimedLimiter
from flexget.utils.soup import get_soup
from flexget.utils.tools import str_to_int

logger = logger.bind(name='imdb.utils')
# IMDb delivers a version of the page which is unparsable to unknown (and some known) user agents, such as requests'
# Spoof the old urllib user agent to keep results consistent
requests = Session()
requests.headers.update({'User-Agent': 'Python-urllib/2.6'})
# requests.headers.update({'User-Agent': random.choice(USERAGENTS)})

# this makes most of the titles to be returned in english translation, but not all of them
requests.headers.update({'Accept-Language': 'en-US,en;q=0.8'})
requests.headers.update(
    {'X-Forwarded-For': '24.110.%d.%d' % (random.randint(0, 254), random.randint(0, 254))}
)

# give imdb a little break between requests (see: http://flexget.com/ticket/129#comment:1)
requests.add_domain_limiter(TimedLimiter('imdb.com', '3 seconds'))


def is_imdb_url(url):
    """Tests the url to see if it's for imdb.com.

    Returns a truthy match object on success, None (falsy) otherwise.
    """
    if not isinstance(url, str):
        return
    # Probably should use urlparse.
    return re.match(r'https?://[^/]*imdb\.com/', url)


def is_valid_imdb_title_id(value):
    """
    Return True if `value` is a valid IMDB ID for titles (movies, series, etc).

    :raises TypeError: if `value` is not a string.
    """
    if not isinstance(value, str):
        raise TypeError("is_valid_imdb_title_id expects a string but got {0}".format(type(value)))
    # IMDB IDs for titles have 'tt' followed by 7 or 8 digits.
    # fullmatch so trailing garbage (e.g. 'tt1234567x') does not validate.
    return re.fullmatch(r'tt\d{7,8}', value) is not None


def is_valid_imdb_person_id(value):
    """
    Return True if `value` is a valid IMDB ID for a person.

    :raises TypeError: if `value` is not a string.
    """
    if not isinstance(value, str):
        raise TypeError("is_valid_imdb_person_id expects a string but got {0}".format(type(value)))
    # An IMDB ID for a person is formed by 'nm' followed by 7 or 8 digits.
    # fullmatch so trailing garbage does not validate.
    return re.fullmatch(r'nm\d{7,8}', value) is not None


def extract_id(url):
    """Return IMDb ID of the given URL. Return None if not valid or if URL is not a string."""
    if not isinstance(url, str):
        return
    m = re.search(r'((?:nm|tt)\d{7,8})', url)
    if m:
        return m.group(1)


def make_url(imdb_id):
    """Return IMDb URL of the given ID"""
    return 'https://www.imdb.com/title/%s/' % imdb_id


class ImdbSearch:
    """Fuzzy title search against imdb.com/find with weighted ratio matching."""

    def __init__(self):
        # de-prioritize aka matches a bit
        self.aka_weight = 0.95
        # prioritize first
        self.first_weight = 1.1
        self.min_match = 0.7
        self.min_diff = 0.01
        self.debug = False

        self.max_results = 50

    def ireplace(self, text, old, new, count=0):
        """Case insensitive string replace"""
        pattern = re.compile(re.escape(old), re.I)
        return re.sub(pattern, new, text, count)

    def smart_match(self, raw_name, single_match=True):
        """Accepts messy name, cleans it and uses information available to make smartest and best match"""
        parser = plugin.get('parsing', 'imdb_search').parse_movie(raw_name)
        name = parser.name
        year = parser.year
        if not name:
            logger.critical('Failed to parse name from {}', raw_name)
            return None
        logger.debug('smart_match name={} year={}', name, str(year))
        return self.best_match(name, year, single_match)

    def best_match(self, name, year=None, single_match=True):
        """Return single movie that best matches name criteria or None"""
        movies = self.search(name)

        if not movies:
            logger.debug('search did not return any movies')
            return None

        # remove all movies below min_match, and different year
        for movie in movies[:]:
            if year and movie.get('year'):
                if movie['year'] != year:
                    logger.debug(
                        'best_match removing {} - {} (wrong year: {})',
                        movie['name'],
                        movie['url'],
                        str(movie['year']),
                    )
                    movies.remove(movie)
                    continue
            if movie['match'] < self.min_match:
                logger.debug('best_match removing {} (min_match)', movie['name'])
                movies.remove(movie)
                continue

        if not movies:
            logger.debug('FAILURE: no movies remain')
            return None

        # if only one remains ..
        if len(movies) == 1:
            logger.debug('SUCCESS: only one movie remains')
            return movies[0]

        # check min difference between best two hits
        diff = movies[0]['match'] - movies[1]['match']
        if diff < self.min_diff:
            logger.debug(
                'unable to determine correct movie, min_diff too small (`{}` <-?-> `{}`)',
                movies[0],
                movies[1],
            )
            for m in movies:
                logger.debug('remain: {} (match: {}) {}', m['name'], m['match'], m['url'])
            return None
        else:
            return movies[0] if single_match else movies

    def search(self, name):
        """Return array of movie details (dict).

        Each dict has keys: match, name, imdb_id, url, year (maybe), thumbnail.
        Always returns a list (possibly empty), never None.
        """
        logger.debug('Searching: {}', name)
        url = 'https://www.imdb.com/find'
        # This may include Shorts and TV series in the results
        params = {'q': name, 's': 'tt'}

        logger.debug('Search query: {}', repr(url))
        page = requests.get(url, params=params)
        actual_url = page.url

        movies = []
        soup = get_soup(page.text)
        # in case we got redirected to movie page (perfect match)
        re_m = re.match(r'.*\.imdb\.com/title/tt\d+/', actual_url)
        if re_m:
            actual_url = re_m.group(0)
            imdb_id = extract_id(actual_url)
            movie_parse = ImdbParser()
            movie_parse.parse(imdb_id, soup=soup)
            logger.debug('Perfect hit. Search got redirected to {}', actual_url)
            movie = {
                'match': 1.0,
                'name': movie_parse.name,
                'imdb_id': imdb_id,
                'url': make_url(imdb_id),
                'year': movie_parse.year,
            }
            movies.append(movie)
            return movies

        section_table = soup.find('table', 'findList')
        if not section_table:
            logger.debug('results table not found')
            # keep the return type consistent with the docstring ("array")
            return movies

        rows = section_table.find_all('tr')
        if not rows:
            logger.debug('Titles section does not have links')
        for count, row in enumerate(rows):
            # Title search gives a lot of results, only check the first ones
            # (>= so at most max_results rows are processed)
            if count >= self.max_results:
                break

            result_text = row.find('td', 'result_text')
            if not result_text:
                # not a result row (e.g. layout change); skip instead of crashing
                continue
            movie = {}
            additional = re.findall(r'\((.*?)\)', result_text.text)
            if len(additional) > 0:
                if re.match(r'^\d{4}$', additional[-1]):
                    movie['year'] = str_to_int(additional[-1])
                elif len(additional) > 1:
                    movie['year'] = str_to_int(additional[-2])
                    if additional[-1] not in ['TV Movie', 'Video']:
                        logger.debug('skipping {}', result_text.text)
                        continue
            primary_photo = row.find('td', 'primary_photo')
            # guard each step; a missing thumbnail should not abort the search
            photo_link = primary_photo.find('a') if primary_photo else None
            photo_img = photo_link.find('img') if photo_link else None
            movie['thumbnail'] = photo_img.get('src') if photo_img else None

            link = result_text.find_next('a')
            movie['name'] = link.text
            movie['imdb_id'] = extract_id(link.get('href'))
            movie['url'] = make_url(movie['imdb_id'])
            logger.debug('processing name: {} url: {}', movie['name'], movie['url'])

            # calc & set best matching ratio
            seq = difflib.SequenceMatcher(lambda x: x == ' ', movie['name'].title(), name.title())
            ratio = seq.ratio()

            # check if some of the akas have better ratio
            for aka in link.parent.find_all('i'):
                aka = aka.next.string
                match = re.search(r'".*"', aka)
                if not match:
                    logger.debug('aka `{}` is invalid', aka)
                    continue
                aka = match.group(0).replace('"', '')
                logger.trace('processing aka {}', aka)
                seq = difflib.SequenceMatcher(lambda x: x == ' ', aka.title(), name.title())
                aka_ratio = seq.ratio()
                if aka_ratio > ratio:
                    ratio = aka_ratio * self.aka_weight
                    logger.debug(
                        '- aka `{}` matches better to `{}` ratio {} (weighted to {})',
                        aka,
                        name,
                        aka_ratio,
                        ratio,
                    )

            # prioritize items by position
            position_ratio = (self.first_weight - 1) / (count + 1) + 1
            logger.debug(
                '- prioritizing based on position {} `{}`: {}', count, movie['url'], position_ratio
            )
            ratio *= position_ratio

            # store ratio
            movie['match'] = ratio
            movies.append(movie)

        movies.sort(key=lambda x: x['match'], reverse=True)
        return movies


class ImdbParser:
    """Quick-hack to parse relevant imdb details"""

    def __init__(self):
        self.genres = []
        self.languages = []
        self.actors = {}
        self.directors = {}
        self.writers = {}
        self.score = 0.0
        self.votes = 0
        self.meta_score = 0
        self.year = 0
        self.plot_outline = None
        self.name = None
        self.original_name = None
        self.url = None
        self.imdb_id = None
        self.photo = None
        self.mpaa_rating = ''
        self.plot_keywords = []

    def __str__(self):
        return '<ImdbParser(name=%s,imdb_id=%s)>' % (self.name, self.imdb_id)

    def parse(self, imdb_id, soup=None):
        """Fetch (unless `soup` is given) and scrape the IMDb title page,
        populating this parser's attributes.

        :param imdb_id: an IMDb title id or any string containing one
        :param soup: optional pre-fetched BeautifulSoup of the title page
        :raises plugin.PluginError: when the page layout is unrecognized
        """
        self.imdb_id = extract_id(imdb_id)
        url = make_url(self.imdb_id)
        self.url = url

        if not soup:
            page = requests.get(url)
            soup = get_soup(page.text)

        title_wrapper = soup.find('div', attrs={'class': 'title_wrapper'})
        if not title_wrapper:
            # New layout, transitional
            title_wrapper = soup.find(
                'div', {"class": re.compile("^TitleBlock__TitleContainer.?")}
            )

        if not title_wrapper:
            raise plugin.PluginError(
                'IMDB parser needs updating, imdb format changed. Please report on Github.'
            )

        # Structured data blob carried by the page; most fields come from here.
        # Guard the lookup so a missing tag raises PluginError, not AttributeError.
        ld_json_script = soup.find('script', {'type': 'application/ld+json'})
        if not ld_json_script:
            raise plugin.PluginError(
                'IMDB parser needs updating, imdb format changed. Please report on Github.'
            )
        data = json.loads(ld_json_script.string)

        if not data:
            raise plugin.PluginError(
                'IMDB parser needs updating, imdb format changed. Please report on Github.'
            )

        # Parse stuff from the title-overview section
        name_elem = html.unescape(data['name'])
        if name_elem:
            self.name = name_elem.strip()
        else:
            logger.error('Possible IMDB parser needs updating, Please report on Github.')
            raise plugin.PluginError(
                'Unable to set imdb_name for %s from %s' % (self.imdb_id, self.url)
            )

        year = soup.find('span', attrs={'id': 'titleYear'})
        if not year:
            # Test new layout
            year = title_wrapper.find(
                'span', {"class": re.compile("^TitleBlockMetaData__ListItemText.?")}
            )

        if year:
            m = re.search(r'([0-9]{4})', year.text)
            if m:
                self.year = int(m.group(1))

        if not self.year:
            logger.debug('No year found for {}', self.imdb_id)

        mpaa_rating_elem = data.get('contentRating')
        if mpaa_rating_elem:
            self.mpaa_rating = mpaa_rating_elem
        else:
            logger.debug('No rating found for {}', self.imdb_id)

        photo_elem = data.get('image')
        if photo_elem:
            self.photo = photo_elem
        else:
            logger.debug('No photo found for {}', self.imdb_id)

        strip_pre_text = False
        original_name_elem = title_wrapper.find('div', {'class': 'originalTitle'})
        if not original_name_elem:
            # Test new layout
            strip_pre_text = True
            original_name_elem = title_wrapper.find(
                'div', {"class": re.compile("^OriginalTitle.?")}
            )

        if original_name_elem:
            self.name = title_wrapper.find('h1').contents[0].strip()
            self.original_name = original_name_elem.contents[0].strip().strip('"')
            if strip_pre_text:
                # Strip the leading "Original title:"-style prefix.
                # Guard: the regex may not match, in which case keep as-is.
                striped_text = re.search(r"([^\:]*)\:? (.*)", self.original_name)
                if striped_text and len(striped_text.groups()) == 2:
                    self.original_name = striped_text.group(2)

        if not original_name_elem:
            logger.debug('No original title found for {}', self.imdb_id)

        # 'aggregateRating' may be present but null in the JSON-LD, hence `or {}`
        votes_elem = (data.get('aggregateRating') or {}).get('ratingCount')
        if votes_elem:
            self.votes = str_to_int(votes_elem) if not isinstance(votes_elem, int) else votes_elem
        else:
            logger.debug('No votes found for {}', self.imdb_id)

        score_elem = (data.get('aggregateRating') or {}).get('ratingValue')
        if score_elem:
            self.score = float(score_elem)
        else:
            logger.debug('No score found for {}', self.imdb_id)

        meta_score_elem = soup.find(attrs={'class': 'metacriticScore'})
        if not meta_score_elem:
            # Test new layout
            meta_score_elem = soup.find('span', attrs={'class': 'score-meta'})

        if meta_score_elem:
            self.meta_score = str_to_int(meta_score_elem.text)
        else:
            logger.debug('No Metacritic score found for {}', self.imdb_id)

        # get director(s)
        directors = data.get('director', [])
        if not isinstance(directors, list):
            directors = [directors]

        for director in directors:
            if director['@type'] != 'Person':
                continue
            director_id = extract_id(director['url'])
            director_name = director['name']
            self.directors[director_id] = director_name

        # get writer(s)
        writers = data.get('creator', [])
        if not isinstance(writers, list):
            writers = [writers]

        for writer in writers:
            if writer['@type'] != 'Person':
                continue
            writer_id = extract_id(writer['url'])
            writer_name = writer['name']
            self.writers[writer_id] = writer_name

        # Details section
        title_details = soup.find('div', attrs={'id': 'titleDetails'})
        if not title_details:
            # Test new layout
            title_details = soup.find('div', attrs={'data-testid': 'title-details-section'})

        if title_details:
            # get languages
            for link in title_details.find_all(
                'a', href=re.compile(r'^/search/title\?title_type=feature&primary_language=')
            ):
                lang = link.text.strip().lower()
                if lang not in self.languages:
                    self.languages.append(lang.strip())

        # Storyline section
        storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
        if storyline:
            plot_elem = storyline.find('p')
            if plot_elem:
                # Remove the "Written By" part.
                if plot_elem.em:
                    plot_elem.em.replace_with('')
                self.plot_outline = plot_elem.text.strip()
            else:
                logger.debug('No storyline found for {}', self.imdb_id)

            # Guard: the keywords <h4> heading may be absent
            keywords_heading = storyline.find('h4')
            keyword_elem = keywords_heading.parent if keywords_heading else None
            if keyword_elem:
                # The last "a" tag is a link to the full list
                self.plot_keywords = [
                    keyword_link.text.strip() for keyword_link in keyword_elem.find_all("a")[:-1]
                ]
        else:
            # Test new layout
            storyline = soup.find('div', attrs={'data-testid': 'storyline-plot-summary'})
            if storyline:
                self.plot_outline = storyline.text

            keyword_elem = soup.find('div', attrs={'data-testid': 'storyline-plot-keywords'})
            if keyword_elem:
                # The last "a" tag is a link to the full list
                self.plot_keywords = [
                    keyword_link.text.strip() for keyword_link in keyword_elem.find_all("a")[:-1]
                ]

        genres = data.get('genre', [])
        if not isinstance(genres, list):
            genres = [genres]

        self.genres = [g.strip().lower() for g in genres]

        # Cast section
        cast = soup.find('table', attrs={'class': 'cast_list'})
        if cast:
            for actor in cast.select('tr > td:nth-of-type(2) > a'):
                actor_id = extract_id(actor['href'])
                actor_name = actor.text.strip()
                # tag instead of name
                if isinstance(actor_name, Tag):
                    actor_name = None
                self.actors[actor_id] = actor_name
        else:
            # Test new layout
            cast = soup.find_all('a', attrs={'data-testid': 'title-cast-item__actor'})
            if cast:
                for actor in cast:
                    actor_id = extract_id(actor['href'])
                    actor_name = actor.text.strip()
                    # tag instead of name
                    if isinstance(actor_name, Tag):
                        actor_name = None
                    self.actors[actor_id] = actor_name