1import difflib
2import html
3import json
4import random
5import re
7from bs4.element import Tag
8from loguru import logger
10from flexget import plugin
11from flexget.utils.requests import Session, TimedLimiter
12from flexget.utils.soup import get_soup
13from flexget.utils.tools import str_to_int
logger = logger.bind(name='imdb.utils')

# IMDb serves a variant of the page that cannot be parsed to unknown (and some known) user agents,
# such as the default one sent by requests.
# Spoof the old urllib user agent to keep results consistent.
requests = Session()
requests.headers.update({'User-Agent': 'Python-urllib/2.6'})
# requests.headers.update({'User-Agent': random.choice(USERAGENTS)})

# This makes most, but not all, of the titles come back in their English translation.
requests.headers.update({'Accept-Language': 'en-US,en;q=0.8'})
# Attach a randomised X-Forwarded-For address (24.110.x.y) to every request of this session.
requests.headers.update(
    {'X-Forwarded-For': '24.110.%d.%d' % (random.randint(0, 254), random.randint(0, 254))}
)

# give imdb a little break between requests (see: http://flexget.com/ticket/129#comment:1)
requests.add_domain_limiter(TimedLimiter('imdb.com', '3 seconds'))
def is_imdb_url(url):
    """Test whether `url` points at imdb.com or one of its subdomains.

    :param url: Value to test. Anything that is not a string is rejected.
    :return: A truthy match object when the URL is an http(s) URL whose host is imdb.com or a
        subdomain of it (e.g. www.imdb.com) followed by a slash, otherwise None.
    """
    if not isinstance(url, str):
        return None
    # Whatever precedes 'imdb.com' in the host part must end with '.' (a subdomain) or '@'
    # (userinfo); otherwise look-alike hosts such as 'notimdb.com' would be accepted.
    return re.match(r'https?://(?:[^/]*[.@])?imdb\.com/', url)
def is_valid_imdb_title_id(value):
    """
    Return True if `value` is a valid IMDB ID for titles (movies, series, etc).

    :param value: Candidate ID string.
    :return: True when the whole string is 'tt' followed by 7 or 8 digits, False otherwise.
    :raises TypeError: If `value` is not a string.
    """
    if not isinstance(value, str):
        raise TypeError("is_valid_imdb_title_id expects a string but got {0}".format(type(value)))
    # fullmatch rather than match: a valid-looking prefix followed by anything else (a ninth
    # digit, a path, ...) is not a valid ID.
    return re.fullmatch(r'tt\d{7,8}', value) is not None
def is_valid_imdb_person_id(value):
    """
    Return True if `value` is a valid IMDB ID for a person.

    :param value: Candidate ID string.
    :return: True when the whole string is 'nm' followed by 7 or 8 digits, False otherwise.
    :raises TypeError: If `value` is not a string.
    """
    if not isinstance(value, str):
        raise TypeError("is_valid_imdb_person_id expects a string but got {0}".format(type(value)))
    # An IMDB ID for a person is formed by 'nm' followed by 7 or 8 digits. fullmatch makes sure
    # nothing else trails the ID.
    return re.fullmatch(r'nm\d{7,8}', value) is not None
def extract_id(url):
    """Pull the first IMDb title ('tt') or person ('nm') ID out of `url`.

    Returns None when `url` is not a string or does not contain such an ID.
    """
    if isinstance(url, str):
        found = re.search(r'((?:nm|tt)\d{7,8})', url)
        if found is not None:
            return found.group(1)
    return None
def make_url(imdb_id):
    """Build the IMDb title page URL for `imdb_id`.

    The ID is interpolated as-is; no validation is performed.
    """
    title_url_template = 'https://www.imdb.com/title/%s/'
    return title_url_template % imdb_id
class ImdbSearch:
    """Search IMDb for titles by name and pick the result that matches best.

    The scoring knobs are plain instance attributes, so they can be adjusted after construction.
    """

    def __init__(self):
        # de-prioritize aka matches a bit
        self.aka_weight = 0.95
        # prioritize first
        self.first_weight = 1.1
        # best_match() drops results whose match ratio is below this value
        self.min_match = 0.7
        # best_match() gives up when the two best ratios are closer together than this
        self.min_diff = 0.01
        self.debug = False

        # search() examines at most this many rows of the result table
        self.max_results = 50

    def ireplace(self, text, old, new, count=0):
        """Case insensitive string replace

        :param text: String to search in.
        :param old: Literal text to look for (escaped, so regex metacharacters are not special).
        :param new: Replacement, handed to the regex engine as a replacement template.
        :param count: Maximum number of replacements; 0 replaces every occurrence.
        :return: The resulting string.
        """
        pattern = re.compile(re.escape(old), re.I)
        # count is passed by keyword; passing it positionally to re.sub is deprecated
        return pattern.sub(new, text, count=count)

    def smart_match(self, raw_name, single_match=True):
        """Accepts messy name, cleans it and uses information available to make smartest and best match

        :param raw_name: Unparsed name; it is run through the 'parsing' plugin's movie parser.
        :param single_match: Passed through to best_match().
        :return: Whatever best_match() returns, or None when no name could be parsed.
        """
        parser = plugin.get('parsing', 'imdb_search').parse_movie(raw_name)
        name = parser.name
        year = parser.year
        if not name:
            logger.critical('Failed to parse name from {}', raw_name)
            return None
        logger.debug('smart_match name={} year={}', name, str(year))
        return self.best_match(name, year, single_match)

    def best_match(self, name, year=None, single_match=True):
        """Return single movie that best matches name criteria or None

        :param name: Title to search for.
        :param year: When given, results that carry a different year are discarded.
        :param single_match: When False and several candidates remain, the whole sorted list of
            remaining candidates is returned instead of only the best one.
        :return: A movie dict as built by search(), a list of them (see `single_match`), or None
            when nothing suitable remains or the two best candidates are too close to call.
        """
        movies = self.search(name)

        if not movies:
            logger.debug('search did not return any movies')
            return None

        # keep only movies that reach min_match and do not carry a different year
        candidates = []
        for movie in movies:
            if year and movie.get('year') and movie['year'] != year:
                logger.debug(
                    'best_match removing {} - {} (wrong year: {})',
                    movie['name'],
                    movie['url'],
                    str(movie['year']),
                )
                continue
            if movie['match'] < self.min_match:
                logger.debug('best_match removing {} (min_match)', movie['name'])
                continue
            candidates.append(movie)

        if not candidates:
            logger.debug('FAILURE: no movies remain')
            return None

        # if only one remains ..
        if len(candidates) == 1:
            logger.debug('SUCCESS: only one movie remains')
            return candidates[0]

        # check min difference between best two hits
        diff = candidates[0]['match'] - candidates[1]['match']
        if diff < self.min_diff:
            logger.debug(
                'unable to determine correct movie, min_diff too small (`{}` <-?-> `{}`)',
                candidates[0],
                candidates[1],
            )
            for m in candidates:
                logger.debug('remain: {} (match: {}) {}', m['name'], m['match'], m['url'])
            return None
        return candidates[0] if single_match else candidates

    def search(self, name):
        """Return array of movie details (dict)

        Queries the IMDb title search and scores every examined result against `name`.

        :param name: Title to search for.
        :return: List of dicts sorted by descending 'match' ratio. Each dict has the keys 'name',
            'imdb_id', 'url' and 'match'; 'year' and 'thumbnail' are present when they could be
            read from the page. The list is empty when the page has no results table.
        """
        logger.debug('Searching: {}', name)
        url = 'https://www.imdb.com/find'
        # This may include Shorts and TV series in the results
        params = {'q': name, 's': 'tt'}

        logger.debug('Search query: {}', repr(url))
        page = requests.get(url, params=params)
        actual_url = page.url

        movies = []
        soup = get_soup(page.text)
        # in case we got redirected to movie page (perfect match)
        re_m = re.match(r'.*\.imdb\.com/title/tt\d+/', actual_url)
        if re_m:
            actual_url = re_m.group(0)
            imdb_id = extract_id(actual_url)
            movie_parse = ImdbParser()
            movie_parse.parse(imdb_id, soup=soup)
            logger.debug('Perfect hit. Search got redirected to {}', actual_url)
            movie = {
                'match': 1.0,
                'name': movie_parse.name,
                'imdb_id': imdb_id,
                'url': make_url(imdb_id),
                'year': movie_parse.year,
            }
            movies.append(movie)
            return movies

        section_table = soup.find('table', 'findList')
        if not section_table:
            logger.debug('results table not found')
            # always hand back a list so callers can iterate the result unconditionally
            return movies

        rows = section_table.find_all('tr')
        if not rows:
            logger.debug('Titles section does not have links')
        searched_title = name.title()
        for count, row in enumerate(rows):
            # Title search gives a lot of results, only check the first ones
            if count >= self.max_results:
                break

            result_text = row.find('td', 'result_text')
            if result_text is None:
                logger.debug('skipping row without result text')
                continue
            movie = {}
            additional = re.findall(r'\((.*?)\)', result_text.text)
            if additional:
                if re.match(r'^\d{4}$', additional[-1]):
                    movie['year'] = str_to_int(additional[-1])
                elif len(additional) > 1:
                    movie['year'] = str_to_int(additional[-2])
                    if additional[-1] not in ['TV Movie', 'Video']:
                        logger.debug('skipping {}', result_text.text)
                        continue

            # the thumbnail is optional: any missing link in the td > a > img chain yields None
            primary_photo = row.find('td', 'primary_photo')
            photo_link = primary_photo.find('a') if primary_photo is not None else None
            image = photo_link.find('img') if photo_link is not None else None
            movie['thumbnail'] = image.get('src') if image is not None else None

            link = result_text.find_next('a')
            imdb_id = extract_id(link.get('href')) if link is not None else None
            if not imdb_id:
                # without an ID neither a usable URL nor a lookup is possible
                logger.debug('skipping {} (no IMDb link)', result_text.text)
                continue
            movie['name'] = link.text
            movie['imdb_id'] = imdb_id
            movie['url'] = make_url(imdb_id)
            logger.debug('processing name: {} url: {}', movie['name'], movie['url'])

            # calc & set best matching ratio
            seq = difflib.SequenceMatcher(lambda x: x == ' ', movie['name'].title(), searched_title)
            ratio = seq.ratio()

            # check if some of the akas have better ratio
            for aka_elem in link.parent.find_all('i'):
                aka_node = aka_elem.next
                aka = aka_node.string if aka_node is not None else None
                match = re.search(r'".*"', aka) if aka else None
                if not match:
                    logger.debug('aka `{}` is invalid', aka)
                    continue
                aka = match.group(0).replace('"', '')
                logger.trace('processing aka {}', aka)
                seq = difflib.SequenceMatcher(lambda x: x == ' ', aka.title(), searched_title)
                aka_ratio = seq.ratio()
                # Compare the *weighted* aka ratio: an aka that is only marginally better than
                # the primary title must not end up lowering the score.
                weighted_ratio = aka_ratio * self.aka_weight
                if weighted_ratio > ratio:
                    ratio = weighted_ratio
                    logger.debug(
                        '- aka `{}` matches better to `{}` ratio {} (weighted to {})',
                        aka,
                        name,
                        aka_ratio,
                        ratio,
                    )

            # prioritize items by position
            position_ratio = (self.first_weight - 1) / (count + 1) + 1
            logger.debug(
                '- prioritizing based on position {} `{}`: {}', count, movie['url'], position_ratio
            )
            ratio *= position_ratio

            # store ratio
            movie['match'] = ratio
            movies.append(movie)

        movies.sort(key=lambda x: x['match'], reverse=True)
        return movies
class ImdbParser:
    """Quick-hack to parse relevant imdb details

    Create an instance and call parse(); the results are exposed as instance attributes.
    """

    def __init__(self):
        self.genres = []
        self.languages = []
        # actors, directors and writers map IMDb person ID -> name
        self.actors = {}
        self.directors = {}
        self.writers = {}
        self.score = 0.0
        self.votes = 0
        self.meta_score = 0
        self.year = 0
        self.plot_outline = None
        self.name = None
        self.original_name = None
        self.url = None
        self.imdb_id = None
        self.photo = None
        self.mpaa_rating = ''
        self.plot_keywords = []

    def __str__(self):
        return '<ImdbParser(name=%s,imdb_id=%s)>' % (self.name, self.imdb_id)

    @staticmethod
    def _first_text(elem):
        """Return the stripped first child of `elem` if it is a text node, otherwise None."""
        if elem is None or not elem.contents:
            return None
        first = elem.contents[0]
        return first.strip() if isinstance(first, str) else None

    @staticmethod
    def _people(entries):
        """Map person ID -> name for every 'Person' entry of a JSON-LD person field."""
        if not entries:
            return {}
        if not isinstance(entries, list):
            entries = [entries]
        return {
            extract_id(person['url']): person['name']
            for person in entries
            if person.get('@type') == 'Person'
        }

    def parse(self, imdb_id, soup=None):
        """Fill this instance's attributes from the IMDb page of a title.

        :param imdb_id: IMDb ID, or any string containing one (it is run through extract_id).
        :param soup: Already parsed title page. When omitted the page is downloaded.
        :raises plugin.PluginError: When the page lacks the expected title container, a usable
            JSON-LD block or a title name.
        """
        self.imdb_id = extract_id(imdb_id)
        url = make_url(self.imdb_id)
        self.url = url

        if not soup:
            page = requests.get(url)
            soup = get_soup(page.text)

        title_wrapper = soup.find('div', attrs={'class': 'title_wrapper'})
        if not title_wrapper:
            # New layout, transitional
            title_wrapper = soup.find(
                'div', {"class": re.compile("^TitleBlock__TitleContainer.?")}
            )

        if not title_wrapper:
            raise plugin.PluginError(
                'IMDB parser needs updating, imdb format changed. Please report on Github.'
            )

        # A missing, empty or malformed JSON-LD block must surface as the same PluginError as any
        # other layout change, not as an AttributeError/ValueError.
        ld_json = soup.find('script', {'type': 'application/ld+json'})
        data = None
        if ld_json is not None and ld_json.string:
            try:
                data = json.loads(ld_json.string)
            except ValueError:
                data = None

        if not data:
            raise plugin.PluginError(
                'IMDB parser needs updating, imdb format changed. Please report on Github.'
            )

        # Parse stuff from the title-overview section
        name_elem = html.unescape(data.get('name') or '')
        if name_elem:
            self.name = name_elem.strip()
        else:
            logger.error('Possible IMDB parser needs updating, Please report on Github.')
            raise plugin.PluginError(
                'Unable to set imdb_name for %s from %s' % (self.imdb_id, self.url)
            )

        year = soup.find('span', attrs={'id': 'titleYear'})
        if not year:
            # Test new layout
            year = title_wrapper.find(
                'span', {"class": re.compile("^TitleBlockMetaData__ListItemText.?")}
            )

        if year:
            m = re.search(r'([0-9]{4})', year.text)
            if m:
                self.year = int(m.group(1))

        if not self.year:
            logger.debug('No year found for {}', self.imdb_id)

        mpaa_rating_elem = data.get('contentRating')
        if mpaa_rating_elem:
            self.mpaa_rating = mpaa_rating_elem
        else:
            logger.debug('No rating found for {}', self.imdb_id)

        photo_elem = data.get('image')
        if photo_elem:
            self.photo = photo_elem
        else:
            logger.debug('No photo found for {}', self.imdb_id)

        strip_pre_text = False
        original_name_elem = title_wrapper.find('div', {'class': 'originalTitle'})
        if not original_name_elem:
            # Test new layout
            strip_pre_text = True
            original_name_elem = title_wrapper.find(
                'div', {"class": re.compile("^OriginalTitle.?")}
            )

        if original_name_elem:
            # When an original title is shown, the heading carries the displayed name.
            heading_text = self._first_text(title_wrapper.find('h1'))
            if heading_text:
                self.name = heading_text
            original_name = self._first_text(original_name_elem)
            if original_name is not None:
                original_name = original_name.strip('"')
                if strip_pre_text:
                    # The new layout prefixes the title with a label; keep only what follows it.
                    stripped_text = re.search(r"([^\:]*)\:? (.*)", original_name)
                    if stripped_text:
                        original_name = stripped_text.group(2)
                self.original_name = original_name
        else:
            logger.debug('No original title found for {}', self.imdb_id)

        # 'or {}' also covers an explicit JSON null
        rating = data.get('aggregateRating') or {}

        votes_elem = rating.get('ratingCount')
        if votes_elem:
            self.votes = str_to_int(votes_elem) if not isinstance(votes_elem, int) else votes_elem
        else:
            logger.debug('No votes found for {}', self.imdb_id)

        score_elem = rating.get('ratingValue')
        if score_elem:
            self.score = float(score_elem)
        else:
            logger.debug('No score found for {}', self.imdb_id)

        meta_score_elem = soup.find(attrs={'class': 'metacriticScore'})
        if not meta_score_elem:
            # Test new layout
            meta_score_elem = soup.find('span', attrs={'class': 'score-meta'})

        if meta_score_elem:
            self.meta_score = str_to_int(meta_score_elem.text)
        else:
            logger.debug('No Metacritic score found for {}', self.imdb_id)

        # get director(s)
        self.directors.update(self._people(data.get('director')))

        # get writer(s)
        self.writers.update(self._people(data.get('creator')))

        # Details section
        title_details = soup.find('div', attrs={'id': 'titleDetails'})
        if not title_details:
            # Test new layout
            title_details = soup.find('div', attrs={'data-testid': 'title-details-section'})

        if title_details:
            # get languages
            language_href = re.compile(r'^/search/title\?title_type=feature&primary_language=')
            for link in title_details.find_all('a', href=language_href):
                lang = link.text.strip().lower()
                if lang not in self.languages:
                    self.languages.append(lang)

        # Storyline section
        storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
        if storyline:
            plot_elem = storyline.find('p')
            if plot_elem:
                # Remove the "Written By" part.
                if plot_elem.em:
                    plot_elem.em.replace_with('')
                self.plot_outline = plot_elem.text.strip()
            else:
                logger.debug('No storyline found for {}', self.imdb_id)

            keyword_heading = storyline.find('h4')
            keyword_elem = keyword_heading.parent if keyword_heading is not None else None
        else:
            # Test new layout
            storyline = soup.find('div', attrs={'data-testid': 'storyline-plot-summary'})
            if storyline:
                self.plot_outline = storyline.text

            keyword_elem = soup.find('div', attrs={'data-testid': 'storyline-plot-keywords'})

        if keyword_elem:
            # The last "a" tag is a link to the full list
            self.plot_keywords = [
                keyword_link.text.strip() for keyword_link in keyword_elem.find_all("a")[:-1]
            ]

        genres = data.get('genre') or []
        if not isinstance(genres, list):
            genres = [genres]

        self.genres = [g.strip().lower() for g in genres]

        # Cast section
        cast_table = soup.find('table', attrs={'class': 'cast_list'})
        if cast_table:
            actor_links = cast_table.select('tr > td:nth-of-type(2) > a')
        else:
            # Test new layout
            actor_links = soup.find_all('a', attrs={'data-testid': 'title-cast-item__actor'})

        for actor in actor_links:
            self.actors[extract_id(actor.get('href'))] = actor.text.strip()