1import difflib
2import html
3import json
4import random
5import re
6
7from bs4.element import Tag
8from loguru import logger
9
10from flexget import plugin
11from flexget.utils.requests import Session, TimedLimiter
12from flexget.utils.soup import get_soup
13from flexget.utils.tools import str_to_int
14
# Module-scoped logger for the IMDb helpers.
logger = logger.bind(name='imdb.utils')

# One shared HTTP session carries every request this module makes to IMDb.
requests = Session()
requests.headers.update(
    {
        # IMDb delivers a version of the page which is unparsable to unknown (and some known) user agents,
        # such as the default one of requests, so spoof the old urllib user agent to keep results consistent.
        # Alternative left disabled: random.choice(USERAGENTS) as the User-Agent.
        'User-Agent': 'Python-urllib/2.6',
        # Makes most of the titles come back in their English translation, but not all of them.
        'Accept-Language': 'en-US,en;q=0.8',
        # Last two octets are randomized once, at import time.
        'X-Forwarded-For': '24.110.%d.%d' % (random.randint(0, 254), random.randint(0, 254)),
    }
)

# give imdb a little break between requests (see: http://flexget.com/ticket/129#comment:1)
requests.add_domain_limiter(TimedLimiter('imdb.com', '3 seconds'))
30
31
def is_imdb_url(url):
    """Test whether `url` is an http(s) URL on imdb.com or one of its subdomains.

    :param url: Value to test; anything that is not a string is rejected.
    :return: The ``re.Match`` object (truthy) when the URL is for imdb.com, otherwise ``None``.
    """
    if not isinstance(url, str):
        return None
    # The host must be imdb.com itself or end in ".imdb.com" (optionally after userinfo and "@"), so that
    # look-alike hosts such as "notimdb.com" are not mistaken for IMDb.
    return re.match(r'https?://(?:[^/]*[.@])?imdb\.com/', url)
38
39
def is_valid_imdb_title_id(value):
    """
    Return True if `value` is a valid IMDB ID for titles (movies, series, etc).

    :param value: Candidate ID; must be a string.
    :return: True only when the whole string is 'tt' followed by 7 or 8 digits.
    :raises TypeError: If `value` is not a string.
    """
    if not isinstance(value, str):
        raise TypeError("is_valid_imdb_title_id expects a string but got {0}".format(type(value)))
    # IMDB IDs for titles have 'tt' followed by 7 or 8 digits. fullmatch (rather than match) makes sure
    # trailing characters such as extra digits or path fragments invalidate the ID.
    return re.fullmatch(r'tt\d{7,8}', value) is not None
48
49
def is_valid_imdb_person_id(value):
    """
    Return True if `value` is a valid IMDB ID for a person.

    :param value: Candidate ID; must be a string.
    :return: True only when the whole string is 'nm' followed by 7 or 8 digits.
    :raises TypeError: If `value` is not a string.
    """
    if not isinstance(value, str):
        raise TypeError("is_valid_imdb_person_id expects a string but got {0}".format(type(value)))
    # An IMDB ID for a person is formed by 'nm' followed by 7 or 8 digits. fullmatch (rather than match)
    # makes sure trailing characters invalidate the ID.
    return re.fullmatch(r'nm\d{7,8}', value) is not None
58
59
def extract_id(url):
    """Pull the first IMDb ID ('nm' or 'tt' plus 7-8 digits) out of `url`.

    Gives back None when `url` is not a string or contains no such ID.
    """
    if isinstance(url, str):
        found = re.search(r'((?:nm|tt)\d{7,8})', url)
        if found is not None:
            return found.group(1)
    return None
67
68
def make_url(imdb_id):
    """Build the IMDb title page URL for `imdb_id`."""
    url_template = 'https://www.imdb.com/title/%s/'
    return url_template % imdb_id
72
73
class ImdbSearch:
    """Query IMDb's title search and rank the hits by how closely they match a requested name."""

    def __init__(self):
        # de-prioritize aka matches a bit
        self.aka_weight = 0.95
        # prioritize first
        self.first_weight = 1.1
        # hits scoring below this are dropped by best_match
        self.min_match = 0.7
        # minimum score gap required between the two best hits for best_match to pick a winner
        self.min_diff = 0.01
        self.debug = False

        # number of result rows that search() will look at
        self.max_results = 50

    def ireplace(self, text, old, new, count=0):
        """Case insensitive string replace.

        :param text: String to search in.
        :param old: Literal substring to look for, ignoring case.
        :param new: Literal replacement text.
        :param count: Maximum number of replacements; 0 replaces every occurrence.
        :return: `text` with the occurrences of `old` replaced by `new`.
        """
        pattern = re.compile(re.escape(old), re.I)
        # A callable replacement keeps `new` literal: a plain string would be treated as a regex template,
        # so backslashes in it would be expanded or rejected as bad escapes.
        return pattern.sub(lambda _match: new, text, count=count)

    def smart_match(self, raw_name, single_match=True):
        """Accepts messy name, cleans it and uses information available to make smartest and best match

        The name and year are taken from the 'parsing' plugin's `parse_movie` result and handed to
        :meth:`best_match`.

        :param raw_name: Unprocessed name to parse.
        :param single_match: Passed through to :meth:`best_match`.
        :return: Whatever :meth:`best_match` returns, or None when no name could be parsed.
        """
        parser = plugin.get('parsing', 'imdb_search').parse_movie(raw_name)
        name = parser.name
        year = parser.year
        if not name:
            logger.critical('Failed to parse name from {}', raw_name)
            return None
        logger.debug('smart_match name={} year={}', name, str(year))
        return self.best_match(name, year, single_match)

    def best_match(self, name, year=None, single_match=True):
        """Return single movie that best matches name criteria or None

        :param name: Movie name to search for.
        :param year: When given, hits that carry a different year are discarded.
        :param single_match: When False, the whole filtered hit list is returned instead of only the best
            hit (unless exactly one hit remains, which is always returned on its own).
        :return: A movie dict, a list of movie dicts, or None when nothing matched or the two best hits
            score closer than `min_diff` to each other.
        """
        movies = self.search(name)

        if not movies:
            logger.debug('search did not return any movies')
            return None

        # remove all movies below min_match, and different year
        remaining = []
        for movie in movies:
            if year and movie.get('year') and movie['year'] != year:
                logger.debug(
                    'best_match removing {} - {} (wrong year: {})',
                    movie['name'],
                    movie['url'],
                    str(movie['year']),
                )
                continue
            if movie['match'] < self.min_match:
                logger.debug('best_match removing {} (min_match)', movie['name'])
                continue
            remaining.append(movie)

        if not remaining:
            logger.debug('FAILURE: no movies remain')
            return None

        # if only one remains ..
        if len(remaining) == 1:
            logger.debug('SUCCESS: only one movie remains')
            return remaining[0]

        # check min difference between best two hits
        diff = remaining[0]['match'] - remaining[1]['match']
        if diff < self.min_diff:
            logger.debug(
                'unable to determine correct movie, min_diff too small (`{}` <-?-> `{}`)',
                remaining[0],
                remaining[1],
            )
            for m in remaining:
                logger.debug('remain: {} (match: {}) {}', m['name'], m['match'], m['url'])
            return None
        return remaining[0] if single_match else remaining

    def search(self, name):
        """Return array of movie details (dict)

        Each dict has 'match', 'name', 'imdb_id' and 'url'; hits from the result list also carry
        'thumbnail' and, when a year could be read from the row, 'year'. The list is sorted by 'match',
        best first, and is empty when the page has no results table.

        :param name: Movie name to search for.
        """
        logger.debug('Searching: {}', name)
        url = 'https://www.imdb.com/find'
        # This may include Shorts and TV series in the results
        params = {'q': name, 's': 'tt'}

        logger.debug('Search query: {}', repr(url))
        page = requests.get(url, params=params)
        actual_url = page.url

        soup = get_soup(page.text)
        # in case we got redirected to movie page (perfect match)
        re_m = re.match(r'.*\.imdb\.com/title/tt\d+/', actual_url)
        if re_m:
            actual_url = re_m.group(0)
            imdb_id = extract_id(actual_url)
            movie_parse = ImdbParser()
            movie_parse.parse(imdb_id, soup=soup)
            logger.debug('Perfect hit. Search got redirected to {}', actual_url)
            return [
                {
                    'match': 1.0,
                    'name': movie_parse.name,
                    'imdb_id': imdb_id,
                    'url': make_url(imdb_id),
                    'year': movie_parse.year,
                }
            ]

        section_table = soup.find('table', 'findList')
        if not section_table:
            logger.debug('results table not found')
            # Keep the documented "array" contract so callers can iterate the result safely.
            return []

        rows = section_table.find_all('tr')
        if not rows:
            logger.debug('Titles section does not have links')

        movies = []
        wanted = name.title()
        # Title search gives a lot of results, only check the first ones
        for count, row in enumerate(rows[: self.max_results]):
            result_text = row.find('td', 'result_text')
            if result_text is None:
                logger.debug('skipping row without a result_text cell')
                continue

            movie = {}
            additional = re.findall(r'\((.*?)\)', result_text.text)
            if additional:
                if re.match(r'^\d{4}$', additional[-1]):
                    movie['year'] = str_to_int(additional[-1])
                elif len(additional) > 1:
                    # The last parenthesised part is a type marker; the one before it is taken as the year.
                    movie['year'] = str_to_int(additional[-2])
                    if additional[-1] not in ['TV Movie', 'Video']:
                        logger.debug('skipping {}', result_text.text)
                        continue

            # Any piece of the thumbnail markup may be absent; fall back to None instead of crashing.
            photo_cell = row.find('td', 'primary_photo')
            photo_link = photo_cell.find('a') if photo_cell is not None else None
            photo_img = photo_link.find('img') if photo_link is not None else None
            movie['thumbnail'] = photo_img.get('src') if photo_img is not None else None

            link = result_text.find_next('a')
            if link is None:
                logger.debug('skipping row without a title link')
                continue
            movie['name'] = link.text
            movie['imdb_id'] = extract_id(link.get('href'))
            movie['url'] = make_url(movie['imdb_id'])
            logger.debug('processing name: {} url: {}', movie['name'], movie['url'])

            # calc & set best matching ratio
            seq = difflib.SequenceMatcher(lambda x: x == ' ', movie['name'].title(), wanted)
            ratio = seq.ratio()

            # check if some of the akas have better ratio
            for aka in link.parent.find_all('i'):
                aka = aka.next.string
                if not aka:
                    logger.debug('aka `{}` is invalid', aka)
                    continue
                match = re.search(r'".*"', aka)
                if not match:
                    logger.debug('aka `{}` is invalid', aka)
                    continue
                aka = match.group(0).replace('"', '')
                logger.trace('processing aka {}', aka)
                seq = difflib.SequenceMatcher(lambda x: x == ' ', aka.title(), wanted)
                aka_ratio = seq.ratio()
                # Compare the weighted score: comparing the raw aka ratio and then storing the weighted
                # one could replace a score with a lower one.
                weighted_ratio = aka_ratio * self.aka_weight
                if weighted_ratio > ratio:
                    ratio = weighted_ratio
                    logger.debug(
                        '- aka `{}` matches better to `{}` ratio {} (weighted to {})',
                        aka,
                        name,
                        aka_ratio,
                        ratio,
                    )

            # prioritize items by position
            position_ratio = (self.first_weight - 1) / (count + 1) + 1
            logger.debug(
                '- prioritizing based on position {} `{}`: {}', count, movie['url'], position_ratio
            )
            ratio *= position_ratio

            # store ratio
            movie['match'] = ratio
            movies.append(movie)

        movies.sort(key=lambda x: x['match'], reverse=True)
        return movies
252
253
class ImdbParser:
    """Quick-hack to parse relevant imdb details"""

    def __init__(self):
        self.genres = []
        self.languages = []
        # The three people dicts map IMDb person id -> name.
        self.actors = {}
        self.directors = {}
        self.writers = {}
        self.score = 0.0
        self.votes = 0
        self.meta_score = 0
        self.year = 0
        self.plot_outline = None
        self.name = None
        self.original_name = None
        self.url = None
        self.imdb_id = None
        self.photo = None
        self.mpaa_rating = ''
        self.plot_keywords = []

    def __str__(self):
        return '<ImdbParser(name=%s,imdb_id=%s)>' % (self.name, self.imdb_id)

    @staticmethod
    def _as_list(value):
        """Wrap a lone ld+json value in a list; lists are returned unchanged."""
        return value if isinstance(value, list) else [value]

    @staticmethod
    def _strip_title_label(text):
        """Return the part of `text` after its leading label, or `text` itself when there is none to split."""
        # NOTE(review): the new layout presumably prefixes the title with a label ending in ": " - confirm.
        stripped = re.search(r"([^\:]*)\:? (.*)", text)
        return stripped.group(2) if stripped else text

    @staticmethod
    def _people_from(entries):
        """Map IMDb person id -> name for the 'Person' entries of an ld+json credit field."""
        people = {}
        for person in ImdbParser._as_list(entries):
            # Credits may also list non-person entries; those are skipped.
            if person.get('@type') != 'Person':
                continue
            people[extract_id(person['url'])] = person['name']
        return people

    def _add_actors(self, links):
        """Record every actor anchor in `links` as IMDb person id -> stripped link text."""
        for actor in links:
            self.actors[extract_id(actor['href'])] = actor.text.strip()

    def parse(self, imdb_id, soup=None):
        """Fill this object's attributes from an IMDb title page.

        :param imdb_id: IMDb title ID, or any string containing one (it is run through `extract_id`).
        :param soup: Already parsed title page; when omitted the page is downloaded.
        :raises plugin.PluginError: If the title container, the ld+json metadata or the title name cannot
            be found on the page.
        """
        self.imdb_id = extract_id(imdb_id)
        url = make_url(self.imdb_id)
        self.url = url

        if not soup:
            page = requests.get(url)
            soup = get_soup(page.text)

        title_wrapper = soup.find('div', attrs={'class': 'title_wrapper'})
        if not title_wrapper:
            # New layout, transitional
            title_wrapper = soup.find(
                'div', {"class": re.compile("^TitleBlock__TitleContainer.?")}
            )

        if not title_wrapper:
            raise plugin.PluginError(
                'IMDB parser needs updating, imdb format changed. Please report on Github.'
            )

        # A missing or empty ld+json script is reported the same way as empty metadata.
        ld_json = soup.find('script', {'type': 'application/ld+json'})
        data = json.loads(ld_json.string) if ld_json is not None and ld_json.string else None

        if not data:
            raise plugin.PluginError(
                'IMDB parser needs updating, imdb format changed. Please report on Github.'
            )

        # Parse stuff from the title-overview section
        raw_name = data.get('name')
        name_elem = html.unescape(raw_name) if raw_name else None
        if name_elem:
            self.name = name_elem.strip()
        else:
            logger.error('Possible IMDB parser needs updating, Please report on Github.')
            raise plugin.PluginError(
                'Unable to set imdb_name for %s from %s' % (self.imdb_id, self.url)
            )

        year = soup.find('span', attrs={'id': 'titleYear'})
        if not year:
            # Test new layout
            year = title_wrapper.find(
                'span', {"class": re.compile("^TitleBlockMetaData__ListItemText.?")}
            )

        if year:
            m = re.search(r'([0-9]{4})', year.text)
            if m:
                self.year = int(m.group(1))

        if not self.year:
            logger.debug('No year found for {}', self.imdb_id)

        mpaa_rating_elem = data.get('contentRating')
        if mpaa_rating_elem:
            self.mpaa_rating = mpaa_rating_elem
        else:
            logger.debug('No rating found for {}', self.imdb_id)

        photo_elem = data.get('image')
        if photo_elem:
            self.photo = photo_elem
        else:
            logger.debug('No photo found for {}', self.imdb_id)

        strip_pre_text = False
        original_name_elem = title_wrapper.find('div', {'class': 'originalTitle'})
        if not original_name_elem:
            # Test new layout
            strip_pre_text = True
            original_name_elem = title_wrapper.find(
                'div', {"class": re.compile("^OriginalTitle.?")}
            )

        if original_name_elem:
            # When an original title is shown, the page heading replaces the ld+json name.
            self.name = title_wrapper.find('h1').contents[0].strip()
            self.original_name = original_name_elem.contents[0].strip().strip('"')
            if strip_pre_text:
                self.original_name = self._strip_title_label(self.original_name)
        else:
            logger.debug('No original title found for {}', self.imdb_id)

        aggregate_rating = data.get('aggregateRating', {})

        votes_elem = aggregate_rating.get('ratingCount')
        if votes_elem:
            self.votes = str_to_int(votes_elem) if not isinstance(votes_elem, int) else votes_elem
        else:
            logger.debug('No votes found for {}', self.imdb_id)

        score_elem = aggregate_rating.get('ratingValue')
        if score_elem:
            self.score = float(score_elem)
        else:
            logger.debug('No score found for {}', self.imdb_id)

        meta_score_elem = soup.find(attrs={'class': 'metacriticScore'})
        if not meta_score_elem:
            # Test new layout
            meta_score_elem = soup.find('span', attrs={'class': 'score-meta'})

        if meta_score_elem:
            self.meta_score = str_to_int(meta_score_elem.text)
        else:
            logger.debug('No Metacritic score found for {}', self.imdb_id)

        # get director(s)
        self.directors.update(self._people_from(data.get('director', [])))

        # get writer(s)
        self.writers.update(self._people_from(data.get('creator', [])))

        # Details section
        title_details = soup.find('div', attrs={'id': 'titleDetails'})
        if not title_details:
            # Test new layout
            title_details = soup.find('div', attrs={'data-testid': 'title-details-section'})

        if title_details:
            # get languages
            language_href = re.compile(r'^/search/title\?title_type=feature&primary_language=')
            for link in title_details.find_all('a', href=language_href):
                lang = link.text.strip().lower()
                if lang not in self.languages:
                    self.languages.append(lang)

        # Storyline section
        storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
        if storyline:
            plot_elem = storyline.find('p')
            if plot_elem:
                # Remove the "Written By" part.
                if plot_elem.em:
                    plot_elem.em.replace_with('')
                self.plot_outline = plot_elem.text.strip()
            else:
                logger.debug('No storyline found for {}', self.imdb_id)

            # The keyword links are looked up in the parent of the first h4, which may be absent.
            heading = storyline.find('h4')
            keyword_elem = heading.parent if heading is not None else None
            if keyword_elem:
                # The last "a" tag is a link to the full list
                self.plot_keywords = [
                    keyword.text.strip() for keyword in keyword_elem.find_all("a")[:-1]
                ]
        else:
            # Test new layout
            storyline = soup.find('div', attrs={'data-testid': 'storyline-plot-summary'})
            if storyline:
                self.plot_outline = storyline.text

            keyword_elem = soup.find('div', attrs={'data-testid': 'storyline-plot-keywords'})
            if keyword_elem:
                # As in the old layout, the last "a" tag is left out.
                self.plot_keywords = [
                    keyword.text.strip() for keyword in keyword_elem.find_all("a")[:-1]
                ]

        self.genres = [g.strip().lower() for g in self._as_list(data.get('genre', []))]

        # Cast section
        cast = soup.find('table', attrs={'class': 'cast_list'})
        if cast:
            actor_links = cast.select('tr > td:nth-of-type(2) > a')
        else:
            # Test new layout
            actor_links = soup.find_all('a', attrs={'data-testid': 'title-cast-item__actor'})
        self._add_actors(actor_links)
482