1#!/usr/local/bin/python3.8
2# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
3from __future__ import absolute_import, division, print_function, unicode_literals
4
5__license__   = 'GPL v3'
6__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
7__docformat__ = 'restructuredtext en'
8
9import re, threading
10from functools import total_ordering
11
12from calibre import browser, random_user_agent
13from calibre.customize import Plugin
14from calibre.ebooks.metadata import check_isbn
15from calibre.ebooks.metadata.author_mapper import cap_author_token
16from calibre.utils.localization import canonicalize_lang, get_lang
17from polyglot.builtins import iteritems, cmp
18
19
def create_log(ostream=None):
    '''
    Build a thread-safe log at DEBUG level that writes to a single stream.

    :param ostream: Handed unchanged to ``FileStream``; how the default
                    ``None`` is treated is decided by ``FileStream``.
    :return: A ``ThreadSafeLog`` whose only output is that ``FileStream``
    '''
    from calibre.utils.logging import FileStream, ThreadSafeLog
    logger = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
    stream_output = FileStream(ostream)
    logger.outputs = [stream_output]
    return logger
25
26
27# Comparing Metadata objects for relevance {{{
# Leading words that are dropped before two titles are compared
words = ("the", "a", "an", "of", "and")
# One of the words above at the very start, plus the whitespace after it
prefix_pat = re.compile(r'^(%s)\s+' % '|'.join(words))
# Everything from the first "(" up to a ")" that ends the string
trailing_paren_pat = re.compile(r'\(.*\)$')
# Any run of whitespace
whitespace_pat = re.compile(r'\s+')
32
33
def cleanup_title(s):
    '''
    Normalize a title so that two titles can be compared for equality.

    A false value is replaced by the translated string 'Unknown'. The title
    is then stripped and lower-cased, a leading stop word (see ``words``) is
    replaced by a space, a trailing parenthesised part is removed and runs of
    whitespace are collapsed to a single space.

    :param s: The title, may be None or empty
    :return: The normalized title, without leading/trailing whitespace
    '''
    title = (s or _('Unknown')).strip().lower()
    substitutions = (
        (prefix_pat, ' '),
        (trailing_paren_pat, ''),
        (whitespace_pat, ' '),
    )
    for pattern, replacement in substitutions:
        title = pattern.sub(replacement, title)
    return title.strip()
42
43
@total_ordering
class InternalMetadataCompareKeyGen:

    '''
    Sort key that ranks Metadata objects by how relevant they are to a search
    query. It is only meaningful for results that come from the same
    metadata source, never across different sources.

    Sorting the keys in ascending order yields the results in order of
    decreasing relevance (every criterion scores 1 when satisfied, 2
    otherwise).

    The criteria, most significant first:

        * The result shares at least one identifier with the query
        * The source has a (reliable) cached cover URL for the result
        * All fields the source can provide are filled in
        * The result is in the current user interface language
        * The title is an exact match (after :func:`cleanup_title`)
        * The result has the longer comments, provided the two lengths differ
          by more than 10% of their average
        * The relevance reported by the metadata source's own search engine
          (``source_relevance``)
    '''

    def __init__(self, mi, source_plugin, title, authors, identifiers):
        '''
        :param mi: The Metadata object (a search result) to build a key for
        :param source_plugin: The plugin that produced ``mi``
        :param title: The title that was searched for, can be None
        :param authors: The authors that were searched for (not used by the
                        default algorithm)
        :param identifiers: The identifiers that were searched for
        '''
        known_ids = mi.get_identifiers()
        shares_identifier = any(
            known_ids.get(key) == val for key, val in iteritems(identifiers))
        same_identifier = 1 if shares_identifier else 2

        # test_fields() returns None when no touched field is missing
        if source_plugin.test_fields(mi) is None:
            all_fields = 1
        else:
            all_fields = 2

        if title and cleanup_title(title) == cleanup_title(mi.title):
            exact_title = 1
        else:
            exact_title = 2

        # A missing or undetermined ('und') language is not penalized
        language = 1
        if mi.language:
            result_lang = canonicalize_lang(mi.language)
            is_determined = result_lang != 'und'
            if is_determined and result_lang != canonicalize_lang(get_lang()):
                language = 2

        # Only ask for the cached URL when the source says it can be trusted
        cover_is_cached = (
            source_plugin.cached_cover_url_is_reliable and
            source_plugin.get_cached_cover_url(mi.identifiers) is not None)
        has_cover = 1 if cover_is_cached else 2

        self.base = (same_identifier, has_cover, all_fields, language, exact_title)
        self.comments_len = len((mi.comments or '').strip())
        self.extra = getattr(mi, 'source_relevance', 0)

    def compare_to_other(self, other):
        '''
        Three way comparison: negative if this key sorts before ``other``
        (is more relevant), positive if it sorts after, zero if they tie.
        '''
        by_base = cmp(self.base, other.base)
        if by_base != 0:
            return by_base
        mine, theirs = self.comments_len, other.comments_len
        if mine and theirs:
            # 10% of the average of the two lengths
            threshold = (mine + theirs) / 20
            difference = theirs - mine
            if abs(difference) > threshold:
                # The result with the longer comments sorts first
                return -1 if difference < 0 else 1
        return cmp(self.extra, other.extra)

    # All rich comparisons are defined in terms of compare_to_other()

    def __eq__(self, other):
        return self.compare_to_other(other) == 0

    def __ne__(self, other):
        return self.compare_to_other(other) != 0

    def __lt__(self, other):
        return self.compare_to_other(other) < 0

    def __le__(self, other):
        return self.compare_to_other(other) <= 0

    def __gt__(self, other):
        return self.compare_to_other(other) > 0

    def __ge__(self, other):
        return self.compare_to_other(other) >= 0
122
123# }}}
124
125
def get_cached_cover_urls(mi):
    '''
    Yield ``(plugin, url)`` for every 'identify' metadata plugin that returns
    a true cached cover URL for the identifiers of ``mi``.
    '''
    from calibre.customize.ui import metadata_plugins
    for plugin in list(metadata_plugins(['identify'])):
        cover_url = plugin.get_cached_cover_url(mi.identifiers)
        if cover_url:
            yield (plugin, cover_url)
133
134
def dump_caches():
    '''
    Return a dict mapping the name of every 'identify' metadata plugin to
    the value of its ``dump_caches()`` method.
    '''
    from calibre.customize.ui import metadata_plugins
    dumps = {}
    for plugin in metadata_plugins(['identify']):
        dumps[plugin.name] = plugin.dump_caches()
    return dumps
138
139
def load_caches(dump):
    '''
    Feed each 'identify' metadata plugin the entry stored under its name in
    ``dump``. Plugins with no entry, or an empty one, are skipped.

    :param dump: A dict keyed by plugin name, as built by :func:`dump_caches`
    '''
    from calibre.customize.ui import metadata_plugins
    for plugin in list(metadata_plugins(['identify'])):
        plugin_cache = dump.get(plugin.name, None)
        if not plugin_cache:
            continue
        plugin.load_caches(plugin_cache)
147
148
def fixauthors(authors):
    '''
    Apply ``cap_author_token`` to every whitespace separated token of every
    author name and join the tokens back with single spaces.

    :param authors: An iterable of author names. A false value (None, empty
                    list, ...) is returned unchanged.
    :return: A new list of fixed author names
    '''
    if not authors:
        return authors
    return [' '.join(map(cap_author_token, name.split())) for name in authors]
156
157
def fixcase(x):
    '''
    Return ``x`` run through calibre's ``titlecase``. A false value (None or
    the empty string) is returned unchanged.
    '''
    if not x:
        return x
    from calibre.utils.titlecase import titlecase
    return titlecase(x)
163
164
class Option:
    '''
    Description of a single configurable setting of a metadata source
    plugin.
    '''

    __slots__ = ['type', 'default', 'label', 'desc', 'name', 'choices']

    def __init__(self, name, type_, default, label, desc, choices=None):
        '''
        :param name: The name of this option. Must be a valid python identifier
        :param type_: The type of this option, one of ('number', 'string',
                        'bool', 'choices')
        :param default: The default value for this option
        :param label: A short (few words) description of this option
        :param desc: A longer description of this option
        :param choices: A dict of possible values, used only if type='choices'.
        dict is of the form {key:human readable label, ...}. Any other
        non-empty iterable is converted to a dict that maps every value to
        itself.
        '''
        self.name = name
        self.type = type_
        self.default = default
        self.label = label
        self.desc = desc
        if choices and not isinstance(choices, dict):
            choices = {value: value for value in choices}
        self.choices = choices
184
185
class Source(Plugin):

    '''
    Base class for metadata source plugins.

    It provides per-plugin preferences, a lazily created browser, thread-safe
    caches (ISBN -> identifier and identifier -> cover URL), helpers for
    building search queries, and the :meth:`identify` / :meth:`download_cover`
    API that concrete plugins re-implement. The class attributes below
    describe the plugin and are meant to be overridden by subclasses.
    '''

    type = _('Metadata source')
    author = 'Kovid Goyal'

    supported_platforms = ['windows', 'osx', 'linux']

    #: Set of capabilities supported by this plugin.
    #: Useful capabilities are: 'identify', 'cover'
    capabilities = frozenset()

    #: List of metadata fields that can potentially be downloaded by this
    #: plugin during the identify phase
    touched_fields = frozenset()

    #: Set this to True if your plugin returns HTML formatted comments
    has_html_comments = False

    #: Setting this to True means that the browser object will indicate
    #: that it supports gzip transfer encoding. This can speedup downloads
    #: but make sure that the source actually supports gzip transfer encoding
    #: correctly first
    supports_gzip_transfer_encoding = False

    #: Set this to True to ignore HTTPS certificate errors when connecting
    #: to this source.
    ignore_ssl_errors = False

    #: Cached cover URLs can sometimes be unreliable (i.e. the download could
    #: fail or the returned image could be bogus). If that is often the case
    #: with this source, set to False
    cached_cover_url_is_reliable = True

    #: A list of :class:`Option` objects. They will be used to automatically
    #: construct the configuration widget for this plugin
    options = ()

    #: A string that is displayed at the top of the config widget for this
    #: plugin
    config_help_message = None

    #: If True this source can return multiple covers for a given query
    can_get_multiple_covers = False

    #: If set to True covers downloaded by this plugin are automatically trimmed.
    auto_trim_covers = False

    #: If set to True, and this source returns multiple results for a query,
    #: some of which have ISBNs and some of which do not, the results without
    #: ISBNs will be ignored
    prefer_results_with_isbn = True
237
238    def __init__(self, *args, **kwargs):
239        Plugin.__init__(self, *args, **kwargs)
240        self.running_a_test = False  # Set to True when using identify_test()
241        self._isbn_to_identifier_cache = {}
242        self._identifier_to_cover_url_cache = {}
243        self.cache_lock = threading.RLock()
244        self._config_obj = None
245        self._browser = None
246        self.prefs.defaults['ignore_fields'] = []
247        for opt in self.options:
248            self.prefs.defaults[opt.name] = opt.default
249
250    # Configuration {{{
251
252    def is_configured(self):
253        '''
254        Return False if your plugin needs to be configured before it can be
255        used. For example, it might need a username/password/API key.
256        '''
257        return True
258
259    def is_customizable(self):
260        return True
261
262    def customization_help(self):
263        return 'This plugin can only be customized using the GUI'
264
265    def config_widget(self):
266        from calibre.gui2.metadata.config import ConfigWidget
267        return ConfigWidget(self)
268
269    def save_settings(self, config_widget):
270        config_widget.commit()
271
272    @property
273    def prefs(self):
274        if self._config_obj is None:
275            from calibre.utils.config import JSONConfig
276            self._config_obj = JSONConfig('metadata_sources/%s.json'%self.name)
277        return self._config_obj
278    # }}}
279
280    # Browser {{{
281
282    @property
283    def user_agent(self):
284        # Pass in an index to random_user_agent() to test with a particular
285        # user agent
286        return random_user_agent()
287
288    @property
289    def browser(self):
290        if self._browser is None:
291            self._browser = browser(user_agent=self.user_agent, verify_ssl_certificates=not self.ignore_ssl_errors)
292            if self.supports_gzip_transfer_encoding:
293                self._browser.set_handle_gzip(True)
294        return self._browser.clone_browser()
295
296    # }}}
297
298    # Caching {{{
299
300    def get_related_isbns(self, id_):
301        with self.cache_lock:
302            for isbn, q in iteritems(self._isbn_to_identifier_cache):
303                if q == id_:
304                    yield isbn
305
306    def cache_isbn_to_identifier(self, isbn, identifier):
307        with self.cache_lock:
308            self._isbn_to_identifier_cache[isbn] = identifier
309
310    def cached_isbn_to_identifier(self, isbn):
311        with self.cache_lock:
312            return self._isbn_to_identifier_cache.get(isbn, None)
313
314    def cache_identifier_to_cover_url(self, id_, url):
315        with self.cache_lock:
316            self._identifier_to_cover_url_cache[id_] = url
317
318    def cached_identifier_to_cover_url(self, id_):
319        with self.cache_lock:
320            return self._identifier_to_cover_url_cache.get(id_, None)
321
322    def dump_caches(self):
323        with self.cache_lock:
324            return {'isbn_to_identifier':self._isbn_to_identifier_cache.copy(),
325                    'identifier_to_cover':self._identifier_to_cover_url_cache.copy()}
326
327    def load_caches(self, dump):
328        with self.cache_lock:
329            self._isbn_to_identifier_cache.update(dump['isbn_to_identifier'])
330            self._identifier_to_cover_url_cache.update(dump['identifier_to_cover'])
331
332    # }}}
333
334    # Utility functions {{{
335
336    def get_author_tokens(self, authors, only_first_author=True):
337        '''
338        Take a list of authors and return a list of tokens useful for an
339        AND search query. This function tries to return tokens in
340        first name middle names last name order, by assuming that if a comma is
341        in the author name, the name is in lastname, other names form.
342        '''
343
344        if authors:
345            # Leave ' in there for Irish names
346            remove_pat = re.compile(r'[!@#$%^&*()()「」{}`~"\s\[\]/]')
347            replace_pat = re.compile(r'[-+.:;,,。;:]')
348            if only_first_author:
349                authors = authors[:1]
350            for au in authors:
351                has_comma = ',' in au
352                au = replace_pat.sub(' ', au)
353                parts = au.split()
354                if has_comma:
355                    # au probably in ln, fn form
356                    parts = parts[1:] + parts[:1]
357                for tok in parts:
358                    tok = remove_pat.sub('', tok).strip()
359                    if len(tok) > 2 and tok.lower() not in ('von', 'van',
360                            _('Unknown').lower()):
361                        yield tok
362
363    def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False):
364        '''
365        Take a title and return a list of tokens useful for an AND search query.
366        Excludes connectives(optionally) and punctuation.
367        '''
368        if title:
369            # strip sub-titles
370            if strip_subtitle:
371                subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)')
372                if len(subtitle.sub('', title)) > 1:
373                    title = subtitle.sub('', title)
374
375            title_patterns = [(re.compile(pat, re.IGNORECASE), repl) for pat, repl in
376            [
377                # Remove things like: (2010) (Omnibus) etc.
378                (r'(?i)[({\[](\d{4}|omnibus|anthology|hardcover|audiobook|audio\scd|paperback|turtleback|mass\s*market|edition|ed\.)[\])}]', ''),
379                # Remove any strings that contain the substring edition inside
380                # parentheses
381                (r'(?i)[({\[].*?(edition|ed.).*?[\]})]', ''),
382                # Remove commas used a separators in numbers
383                (r'(\d+),(\d+)', r'\1\2'),
384                # Remove hyphens only if they have whitespace before them
385                (r'(\s-)', ' '),
386                # Replace other special chars with a space
387                (r'''[:,;!@$%^&*(){}.`~"\s\[\]/]《》「」“”''', ' '),
388            ]]
389
390            for pat, repl in title_patterns:
391                title = pat.sub(repl, title)
392
393            tokens = title.split()
394            for token in tokens:
395                token = token.strip().strip('"').strip("'")
396                if token and (not strip_joiners or token.lower() not in ('a',
397                    'and', 'the', '&')):
398                    yield token
399
400    def split_jobs(self, jobs, num):
401        'Split a list of jobs into at most num groups, as evenly as possible'
402        groups = [[] for i in range(num)]
403        jobs = list(jobs)
404        while jobs:
405            for gr in groups:
406                try:
407                    job = jobs.pop()
408                except IndexError:
409                    break
410                gr.append(job)
411        return [g for g in groups if g]
412
413    def test_fields(self, mi):
414        '''
415        Return the first field from self.touched_fields that is null on the
416        mi object
417        '''
418        for key in self.touched_fields:
419            if key.startswith('identifier:'):
420                key = key.partition(':')[-1]
421                if not mi.has_identifier(key):
422                    return 'identifier: ' + key
423            elif mi.is_null(key):
424                return key
425
426    def clean_downloaded_metadata(self, mi):
427        '''
428        Call this method in your plugin's identify method to normalize metadata
429        before putting the Metadata object into result_queue. You can of
430        course, use a custom algorithm suited to your metadata source.
431        '''
432        docase = mi.language == 'eng' or mi.is_null('language')
433        if docase and mi.title:
434            mi.title = fixcase(mi.title)
435        mi.authors = fixauthors(mi.authors)
436        if mi.tags and docase:
437            mi.tags = list(map(fixcase, mi.tags))
438        mi.isbn = check_isbn(mi.isbn)
439
440    def download_multiple_covers(self, title, authors, urls, get_best_cover, timeout, result_queue, abort, log, prefs_name='max_covers'):
441        if not urls:
442            log('No images found for, title: %r and authors: %r'%(title, authors))
443            return
444        from threading import Thread
445        import time
446        if prefs_name:
447            urls = urls[:self.prefs[prefs_name]]
448        if get_best_cover:
449            urls = urls[:1]
450        log('Downloading %d covers'%len(urls))
451        workers = [Thread(target=self.download_image, args=(u, timeout, log, result_queue)) for u in urls]
452        for w in workers:
453            w.daemon = True
454            w.start()
455        alive = True
456        start_time = time.time()
457        while alive and not abort.is_set() and time.time() - start_time < timeout:
458            alive = False
459            for w in workers:
460                if w.is_alive():
461                    alive = True
462                    break
463            abort.wait(0.1)
464
465    def download_image(self, url, timeout, log, result_queue):
466        try:
467            ans = self.browser.open_novisit(url, timeout=timeout).read()
468            result_queue.put((self, ans))
469            log('Downloaded cover from: %s'%url)
470        except Exception:
471            self.log.exception('Failed to download cover from: %r'%url)
472
473    # }}}
474
475    # Metadata API {{{
476    def get_book_url(self, identifiers):
477        '''
478        Return a 3-tuple or None. The 3-tuple is of the form:
479        (identifier_type, identifier_value, URL).
480        The URL is the URL for the book identified by identifiers at this
481        source. identifier_type, identifier_value specify the identifier
482        corresponding to the URL.
483        This URL must be browsable to by a human using a browser. It is meant
484        to provide a clickable link for the user to easily visit the books page
485        at this source.
486        If no URL is found, return None. This method must be quick, and
487        consistent, so only implement it if it is possible to construct the URL
488        from a known scheme given identifiers.
489        '''
490        return None
491
492    def get_book_url_name(self, idtype, idval, url):
493        '''
494        Return a human readable name from the return value of get_book_url().
495        '''
496        return self.name
497
498    def get_book_urls(self, identifiers):
499        '''
500        Override this method if you would like to return multiple urls for this book.
501        Return a list of 3-tuples. By default this method simply calls :func:`get_book_url`.
502        '''
503        data = self.get_book_url(identifiers)
504        if data is None:
505            return ()
506        return (data,)
507
508    def get_cached_cover_url(self, identifiers):
509        '''
510        Return cached cover URL for the book identified by
511        the identifiers dict or None if no such URL exists.
512
513        Note that this method must only return validated URLs, i.e. not URLS
514        that could result in a generic cover image or a not found error.
515        '''
516        return None
517
518    def id_from_url(self, url):
519        '''
520        Parse a URL and return a tuple of the form:
521        (identifier_type, identifier_value).
522        If the URL does not match the pattern for the metadata source,
523        return None.
524        '''
525        return None
526
527    def identify_results_keygen(self, title=None, authors=None,
528            identifiers={}):
529        '''
530        Return a function that is used to generate a key that can sort Metadata
531        objects by their relevance given a search query (title, authors,
532        identifiers).
533
534        These keys are used to sort the results of a call to :meth:`identify`.
535
536        For details on the default algorithm see
537        :class:`InternalMetadataCompareKeyGen`. Re-implement this function in
538        your plugin if the default algorithm is not suitable.
539        '''
540        def keygen(mi):
541            return InternalMetadataCompareKeyGen(mi, self, title, authors,
542                identifiers)
543        return keygen
544
545    def identify(self, log, result_queue, abort, title=None, authors=None,
546            identifiers={}, timeout=30):
547        '''
548        Identify a book by its Title/Author/ISBN/etc.
549
550        If identifiers(s) are specified and no match is found and this metadata
551        source does not store all related identifiers (for example, all ISBNs
552        of a book), this method should retry with just the title and author
553        (assuming they were specified).
554
555        If this metadata source also provides covers, the URL to the cover
556        should be cached so that a subsequent call to the get covers API with
557        the same ISBN/special identifier does not need to get the cover URL
558        again. Use the caching API for this.
559
560        Every Metadata object put into result_queue by this method must have a
561        `source_relevance` attribute that is an integer indicating the order in
562        which the results were returned by the metadata source for this query.
563        This integer will be used by :meth:`compare_identify_results`. If the
564        order is unimportant, set it to zero for every result.
565
566        Make sure that any cover/ISBN mapping information is cached before the
567        Metadata object is put into result_queue.
568
569        :param log: A log object, use it to output debugging information/errors
570        :param result_queue: A result Queue, results should be put into it.
571                            Each result is a Metadata object
572        :param abort: If abort.is_set() returns True, abort further processing
573                      and return as soon as possible
574        :param title: The title of the book, can be None
575        :param authors: A list of authors of the book, can be None
576        :param identifiers: A dictionary of other identifiers, most commonly
577                            {'isbn':'1234...'}
578        :param timeout: Timeout in seconds, no network request should hang for
579                        longer than timeout.
580        :return: None if no errors occurred, otherwise a unicode representation
581                 of the error suitable for showing to the user
582
583        '''
584        return None
585
586    def download_cover(self, log, result_queue, abort,
587            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
588        '''
589        Download a cover and put it into result_queue. The parameters all have
590        the same meaning as for :meth:`identify`. Put (self, cover_data) into
591        result_queue.
592
593        This method should use cached cover URLs for efficiency whenever
594        possible. When cached data is not present, most plugins simply call
595        identify and use its results.
596
597        If the parameter get_best_cover is True and this plugin can get
598        multiple covers, it should only get the "best" one.
599        '''
600        pass
601
602    # }}}
603