1#!/usr/local/bin/python3.8 2# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai 3from __future__ import absolute_import, division, print_function, unicode_literals 4 5__license__ = 'GPL v3' 6__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>' 7__docformat__ = 'restructuredtext en' 8 9import re, threading 10from functools import total_ordering 11 12from calibre import browser, random_user_agent 13from calibre.customize import Plugin 14from calibre.ebooks.metadata import check_isbn 15from calibre.ebooks.metadata.author_mapper import cap_author_token 16from calibre.utils.localization import canonicalize_lang, get_lang 17from polyglot.builtins import iteritems, cmp 18 19 20def create_log(ostream=None): 21 from calibre.utils.logging import ThreadSafeLog, FileStream 22 log = ThreadSafeLog(level=ThreadSafeLog.DEBUG) 23 log.outputs = [FileStream(ostream)] 24 return log 25 26 27# Comparing Metadata objects for relevance {{{ 28words = ("the", "a", "an", "of", "and") 29prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words))) 30trailing_paren_pat = re.compile(r'\(.*\)$') 31whitespace_pat = re.compile(r'\s+') 32 33 34def cleanup_title(s): 35 if not s: 36 s = _('Unknown') 37 s = s.strip().lower() 38 s = prefix_pat.sub(' ', s) 39 s = trailing_paren_pat.sub('', s) 40 s = whitespace_pat.sub(' ', s) 41 return s.strip() 42 43 44@total_ordering 45class InternalMetadataCompareKeyGen: 46 47 ''' 48 Generate a sort key for comparison of the relevance of Metadata objects, 49 given a search query. This is used only to compare results from the same 50 metadata source, not across different sources. 51 52 The sort key ensures that an ascending order sort is a sort by order of 53 decreasing relevance. 54 55 The algorithm is: 56 57 * Prefer results that have at least one identifier the same as for the query 58 * Prefer results with a cached cover URL 59 * Prefer results with all available fields filled in 60 * Prefer results with the same language as the current user interface language 61 * Prefer results that are an exact title match to the query 62 * Prefer results with longer comments (greater than 10% longer) 63 * Use the relevance of the result as reported by the metadata source's search 64 engine 65 ''' 66 67 def __init__(self, mi, source_plugin, title, authors, identifiers): 68 same_identifier = 2 69 idents = mi.get_identifiers() 70 for k, v in iteritems(identifiers): 71 if idents.get(k) == v: 72 same_identifier = 1 73 break 74 75 all_fields = 1 if source_plugin.test_fields(mi) is None else 2 76 77 exact_title = 1 if title and \ 78 cleanup_title(title) == cleanup_title(mi.title) else 2 79 80 language = 1 81 if mi.language: 82 mil = canonicalize_lang(mi.language) 83 if mil != 'und' and mil != canonicalize_lang(get_lang()): 84 language = 2 85 86 has_cover = 2 if (not source_plugin.cached_cover_url_is_reliable or 87 source_plugin.get_cached_cover_url(mi.identifiers) is None) else 1 88 89 self.base = (same_identifier, has_cover, all_fields, language, exact_title) 90 self.comments_len = len((mi.comments or '').strip()) 91 self.extra = getattr(mi, 'source_relevance', 0) 92 93 def compare_to_other(self, other): 94 a = cmp(self.base, other.base) 95 if a != 0: 96 return a 97 cx, cy = self.comments_len, other.comments_len 98 if cx and cy: 99 t = (cx + cy) / 20 100 delta = cy - cx 101 if abs(delta) > t: 102 return -1 if delta < 0 else 1 103 return cmp(self.extra, other.extra) 104 105 def __eq__(self, other): 106 return self.compare_to_other(other) == 0 107 108 def __ne__(self, other): 109 return self.compare_to_other(other) != 0 110 111 def __lt__(self, other): 112 return self.compare_to_other(other) < 0 113 114 def __le__(self, other): 115 return self.compare_to_other(other) <= 0 116 117 def __gt__(self, other): 118 return self.compare_to_other(other) > 0 119 120 def __ge__(self, other): 121 return self.compare_to_other(other) >= 0 122 123# }}} 124 125 126def get_cached_cover_urls(mi): 127 from calibre.customize.ui import metadata_plugins 128 plugins = list(metadata_plugins(['identify'])) 129 for p in plugins: 130 url = p.get_cached_cover_url(mi.identifiers) 131 if url: 132 yield (p, url) 133 134 135def dump_caches(): 136 from calibre.customize.ui import metadata_plugins 137 return {p.name:p.dump_caches() for p in metadata_plugins(['identify'])} 138 139 140def load_caches(dump): 141 from calibre.customize.ui import metadata_plugins 142 plugins = list(metadata_plugins(['identify'])) 143 for p in plugins: 144 cache = dump.get(p.name, None) 145 if cache: 146 p.load_caches(cache) 147 148 149def fixauthors(authors): 150 if not authors: 151 return authors 152 ans = [] 153 for x in authors: 154 ans.append(' '.join(map(cap_author_token, x.split()))) 155 return ans 156 157 158def fixcase(x): 159 if x: 160 from calibre.utils.titlecase import titlecase 161 x = titlecase(x) 162 return x 163 164 165class Option: 166 __slots__ = ['type', 'default', 'label', 'desc', 'name', 'choices'] 167 168 def __init__(self, name, type_, default, label, desc, choices=None): 169 ''' 170 :param name: The name of this option. Must be a valid python identifier 171 :param type_: The type of this option, one of ('number', 'string', 172 'bool', 'choices') 173 :param default: The default value for this option 174 :param label: A short (few words) description of this option 175 :param desc: A longer description of this option 176 :param choices: A dict of possible values, used only if type='choices'. 177 dict is of the form {key:human readable label, ...} 178 ''' 179 self.name, self.type, self.default, self.label, self.desc = (name, 180 type_, default, label, desc) 181 if choices and not isinstance(choices, dict): 182 choices = dict([(x, x) for x in choices]) 183 self.choices = choices 184 185 186class Source(Plugin): 187 188 type = _('Metadata source') 189 author = 'Kovid Goyal' 190 191 supported_platforms = ['windows', 'osx', 'linux'] 192 193 #: Set of capabilities supported by this plugin. 194 #: Useful capabilities are: 'identify', 'cover' 195 capabilities = frozenset() 196 197 #: List of metadata fields that can potentially be download by this plugin 198 #: during the identify phase 199 touched_fields = frozenset() 200 201 #: Set this to True if your plugin returns HTML formatted comments 202 has_html_comments = False 203 204 #: Setting this to True means that the browser object will indicate 205 #: that it supports gzip transfer encoding. This can speedup downloads 206 #: but make sure that the source actually supports gzip transfer encoding 207 #: correctly first 208 supports_gzip_transfer_encoding = False 209 210 #: Set this to True to ignore HTTPS certificate errors when connecting 211 #: to this source. 212 ignore_ssl_errors = False 213 214 #: Cached cover URLs can sometimes be unreliable (i.e. the download could 215 #: fail or the returned image could be bogus). If that is often the case 216 #: with this source, set to False 217 cached_cover_url_is_reliable = True 218 219 #: A list of :class:`Option` objects. They will be used to automatically 220 #: construct the configuration widget for this plugin 221 options = () 222 223 #: A string that is displayed at the top of the config widget for this 224 #: plugin 225 config_help_message = None 226 227 #: If True this source can return multiple covers for a given query 228 can_get_multiple_covers = False 229 230 #: If set to True covers downloaded by this plugin are automatically trimmed. 231 auto_trim_covers = False 232 233 #: If set to True, and this source returns multiple results for a query, 234 #: some of which have ISBNs and some of which do not, the results without 235 #: ISBNs will be ignored 236 prefer_results_with_isbn = True 237 238 def __init__(self, *args, **kwargs): 239 Plugin.__init__(self, *args, **kwargs) 240 self.running_a_test = False # Set to True when using identify_test() 241 self._isbn_to_identifier_cache = {} 242 self._identifier_to_cover_url_cache = {} 243 self.cache_lock = threading.RLock() 244 self._config_obj = None 245 self._browser = None 246 self.prefs.defaults['ignore_fields'] = [] 247 for opt in self.options: 248 self.prefs.defaults[opt.name] = opt.default 249 250 # Configuration {{{ 251 252 def is_configured(self): 253 ''' 254 Return False if your plugin needs to be configured before it can be 255 used. For example, it might need a username/password/API key. 256 ''' 257 return True 258 259 def is_customizable(self): 260 return True 261 262 def customization_help(self): 263 return 'This plugin can only be customized using the GUI' 264 265 def config_widget(self): 266 from calibre.gui2.metadata.config import ConfigWidget 267 return ConfigWidget(self) 268 269 def save_settings(self, config_widget): 270 config_widget.commit() 271 272 @property 273 def prefs(self): 274 if self._config_obj is None: 275 from calibre.utils.config import JSONConfig 276 self._config_obj = JSONConfig('metadata_sources/%s.json'%self.name) 277 return self._config_obj 278 # }}} 279 280 # Browser {{{ 281 282 @property 283 def user_agent(self): 284 # Pass in an index to random_user_agent() to test with a particular 285 # user agent 286 return random_user_agent() 287 288 @property 289 def browser(self): 290 if self._browser is None: 291 self._browser = browser(user_agent=self.user_agent, verify_ssl_certificates=not self.ignore_ssl_errors) 292 if self.supports_gzip_transfer_encoding: 293 self._browser.set_handle_gzip(True) 294 return self._browser.clone_browser() 295 296 # }}} 297 298 # Caching {{{ 299 300 def get_related_isbns(self, id_): 301 with self.cache_lock: 302 for isbn, q in iteritems(self._isbn_to_identifier_cache): 303 if q == id_: 304 yield isbn 305 306 def cache_isbn_to_identifier(self, isbn, identifier): 307 with self.cache_lock: 308 self._isbn_to_identifier_cache[isbn] = identifier 309 310 def cached_isbn_to_identifier(self, isbn): 311 with self.cache_lock: 312 return self._isbn_to_identifier_cache.get(isbn, None) 313 314 def cache_identifier_to_cover_url(self, id_, url): 315 with self.cache_lock: 316 self._identifier_to_cover_url_cache[id_] = url 317 318 def cached_identifier_to_cover_url(self, id_): 319 with self.cache_lock: 320 return self._identifier_to_cover_url_cache.get(id_, None) 321 322 def dump_caches(self): 323 with self.cache_lock: 324 return {'isbn_to_identifier':self._isbn_to_identifier_cache.copy(), 325 'identifier_to_cover':self._identifier_to_cover_url_cache.copy()} 326 327 def load_caches(self, dump): 328 with self.cache_lock: 329 self._isbn_to_identifier_cache.update(dump['isbn_to_identifier']) 330 self._identifier_to_cover_url_cache.update(dump['identifier_to_cover']) 331 332 # }}} 333 334 # Utility functions {{{ 335 336 def get_author_tokens(self, authors, only_first_author=True): 337 ''' 338 Take a list of authors and return a list of tokens useful for an 339 AND search query. This function tries to return tokens in 340 first name middle names last name order, by assuming that if a comma is 341 in the author name, the name is in lastname, other names form. 342 ''' 343 344 if authors: 345 # Leave ' in there for Irish names 346 remove_pat = re.compile(r'[!@#$%^&*()()「」{}`~"\s\[\]/]') 347 replace_pat = re.compile(r'[-+.:;,,。;:]') 348 if only_first_author: 349 authors = authors[:1] 350 for au in authors: 351 has_comma = ',' in au 352 au = replace_pat.sub(' ', au) 353 parts = au.split() 354 if has_comma: 355 # au probably in ln, fn form 356 parts = parts[1:] + parts[:1] 357 for tok in parts: 358 tok = remove_pat.sub('', tok).strip() 359 if len(tok) > 2 and tok.lower() not in ('von', 'van', 360 _('Unknown').lower()): 361 yield tok 362 363 def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False): 364 ''' 365 Take a title and return a list of tokens useful for an AND search query. 366 Excludes connectives(optionally) and punctuation. 367 ''' 368 if title: 369 # strip sub-titles 370 if strip_subtitle: 371 subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)') 372 if len(subtitle.sub('', title)) > 1: 373 title = subtitle.sub('', title) 374 375 title_patterns = [(re.compile(pat, re.IGNORECASE), repl) for pat, repl in 376 [ 377 # Remove things like: (2010) (Omnibus) etc. 378 (r'(?i)[({\[](\d{4}|omnibus|anthology|hardcover|audiobook|audio\scd|paperback|turtleback|mass\s*market|edition|ed\.)[\])}]', ''), 379 # Remove any strings that contain the substring edition inside 380 # parentheses 381 (r'(?i)[({\[].*?(edition|ed.).*?[\]})]', ''), 382 # Remove commas used a separators in numbers 383 (r'(\d+),(\d+)', r'\1\2'), 384 # Remove hyphens only if they have whitespace before them 385 (r'(\s-)', ' '), 386 # Replace other special chars with a space 387 (r'''[:,;!@$%^&*(){}.`~"\s\[\]/]《》「」“”''', ' '), 388 ]] 389 390 for pat, repl in title_patterns: 391 title = pat.sub(repl, title) 392 393 tokens = title.split() 394 for token in tokens: 395 token = token.strip().strip('"').strip("'") 396 if token and (not strip_joiners or token.lower() not in ('a', 397 'and', 'the', '&')): 398 yield token 399 400 def split_jobs(self, jobs, num): 401 'Split a list of jobs into at most num groups, as evenly as possible' 402 groups = [[] for i in range(num)] 403 jobs = list(jobs) 404 while jobs: 405 for gr in groups: 406 try: 407 job = jobs.pop() 408 except IndexError: 409 break 410 gr.append(job) 411 return [g for g in groups if g] 412 413 def test_fields(self, mi): 414 ''' 415 Return the first field from self.touched_fields that is null on the 416 mi object 417 ''' 418 for key in self.touched_fields: 419 if key.startswith('identifier:'): 420 key = key.partition(':')[-1] 421 if not mi.has_identifier(key): 422 return 'identifier: ' + key 423 elif mi.is_null(key): 424 return key 425 426 def clean_downloaded_metadata(self, mi): 427 ''' 428 Call this method in your plugin's identify method to normalize metadata 429 before putting the Metadata object into result_queue. You can of 430 course, use a custom algorithm suited to your metadata source. 431 ''' 432 docase = mi.language == 'eng' or mi.is_null('language') 433 if docase and mi.title: 434 mi.title = fixcase(mi.title) 435 mi.authors = fixauthors(mi.authors) 436 if mi.tags and docase: 437 mi.tags = list(map(fixcase, mi.tags)) 438 mi.isbn = check_isbn(mi.isbn) 439 440 def download_multiple_covers(self, title, authors, urls, get_best_cover, timeout, result_queue, abort, log, prefs_name='max_covers'): 441 if not urls: 442 log('No images found for, title: %r and authors: %r'%(title, authors)) 443 return 444 from threading import Thread 445 import time 446 if prefs_name: 447 urls = urls[:self.prefs[prefs_name]] 448 if get_best_cover: 449 urls = urls[:1] 450 log('Downloading %d covers'%len(urls)) 451 workers = [Thread(target=self.download_image, args=(u, timeout, log, result_queue)) for u in urls] 452 for w in workers: 453 w.daemon = True 454 w.start() 455 alive = True 456 start_time = time.time() 457 while alive and not abort.is_set() and time.time() - start_time < timeout: 458 alive = False 459 for w in workers: 460 if w.is_alive(): 461 alive = True 462 break 463 abort.wait(0.1) 464 465 def download_image(self, url, timeout, log, result_queue): 466 try: 467 ans = self.browser.open_novisit(url, timeout=timeout).read() 468 result_queue.put((self, ans)) 469 log('Downloaded cover from: %s'%url) 470 except Exception: 471 self.log.exception('Failed to download cover from: %r'%url) 472 473 # }}} 474 475 # Metadata API {{{ 476 def get_book_url(self, identifiers): 477 ''' 478 Return a 3-tuple or None. The 3-tuple is of the form: 479 (identifier_type, identifier_value, URL). 480 The URL is the URL for the book identified by identifiers at this 481 source. identifier_type, identifier_value specify the identifier 482 corresponding to the URL. 483 This URL must be browsable to by a human using a browser. It is meant 484 to provide a clickable link for the user to easily visit the books page 485 at this source. 486 If no URL is found, return None. This method must be quick, and 487 consistent, so only implement it if it is possible to construct the URL 488 from a known scheme given identifiers. 489 ''' 490 return None 491 492 def get_book_url_name(self, idtype, idval, url): 493 ''' 494 Return a human readable name from the return value of get_book_url(). 495 ''' 496 return self.name 497 498 def get_book_urls(self, identifiers): 499 ''' 500 Override this method if you would like to return multiple urls for this book. 501 Return a list of 3-tuples. By default this method simply calls :func:`get_book_url`. 502 ''' 503 data = self.get_book_url(identifiers) 504 if data is None: 505 return () 506 return (data,) 507 508 def get_cached_cover_url(self, identifiers): 509 ''' 510 Return cached cover URL for the book identified by 511 the identifiers dict or None if no such URL exists. 512 513 Note that this method must only return validated URLs, i.e. not URLS 514 that could result in a generic cover image or a not found error. 515 ''' 516 return None 517 518 def id_from_url(self, url): 519 ''' 520 Parse a URL and return a tuple of the form: 521 (identifier_type, identifier_value). 522 If the URL does not match the pattern for the metadata source, 523 return None. 524 ''' 525 return None 526 527 def identify_results_keygen(self, title=None, authors=None, 528 identifiers={}): 529 ''' 530 Return a function that is used to generate a key that can sort Metadata 531 objects by their relevance given a search query (title, authors, 532 identifiers). 533 534 These keys are used to sort the results of a call to :meth:`identify`. 535 536 For details on the default algorithm see 537 :class:`InternalMetadataCompareKeyGen`. Re-implement this function in 538 your plugin if the default algorithm is not suitable. 539 ''' 540 def keygen(mi): 541 return InternalMetadataCompareKeyGen(mi, self, title, authors, 542 identifiers) 543 return keygen 544 545 def identify(self, log, result_queue, abort, title=None, authors=None, 546 identifiers={}, timeout=30): 547 ''' 548 Identify a book by its Title/Author/ISBN/etc. 549 550 If identifiers(s) are specified and no match is found and this metadata 551 source does not store all related identifiers (for example, all ISBNs 552 of a book), this method should retry with just the title and author 553 (assuming they were specified). 554 555 If this metadata source also provides covers, the URL to the cover 556 should be cached so that a subsequent call to the get covers API with 557 the same ISBN/special identifier does not need to get the cover URL 558 again. Use the caching API for this. 559 560 Every Metadata object put into result_queue by this method must have a 561 `source_relevance` attribute that is an integer indicating the order in 562 which the results were returned by the metadata source for this query. 563 This integer will be used by :meth:`compare_identify_results`. If the 564 order is unimportant, set it to zero for every result. 565 566 Make sure that any cover/ISBN mapping information is cached before the 567 Metadata object is put into result_queue. 568 569 :param log: A log object, use it to output debugging information/errors 570 :param result_queue: A result Queue, results should be put into it. 571 Each result is a Metadata object 572 :param abort: If abort.is_set() returns True, abort further processing 573 and return as soon as possible 574 :param title: The title of the book, can be None 575 :param authors: A list of authors of the book, can be None 576 :param identifiers: A dictionary of other identifiers, most commonly 577 {'isbn':'1234...'} 578 :param timeout: Timeout in seconds, no network request should hang for 579 longer than timeout. 580 :return: None if no errors occurred, otherwise a unicode representation 581 of the error suitable for showing to the user 582 583 ''' 584 return None 585 586 def download_cover(self, log, result_queue, abort, 587 title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): 588 ''' 589 Download a cover and put it into result_queue. The parameters all have 590 the same meaning as for :meth:`identify`. Put (self, cover_data) into 591 result_queue. 592 593 This method should use cached cover URLs for efficiency whenever 594 possible. When cached data is not present, most plugins simply call 595 identify and use its results. 596 597 If the parameter get_best_cover is True and this plugin can get 598 multiple covers, it should only get the "best" one. 599 ''' 600 pass 601 602 # }}} 603