#!/usr/local/bin/python3.8
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals

import re
import socket
import time
from functools import partial
try:
    from queue import Empty, Queue
except ImportError:
    from Queue import Empty, Queue
from threading import Thread
try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse

from calibre import as_unicode, browser, random_user_agent, xml_replace_entities
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase
from calibre.utils.localization import canonicalize_lang
from calibre.utils.random_ua import accept_header_for_ua
from calibre.ebooks.oeb.base import urlquote


def iri_quote_plus(url):
    ans = urlquote(url)
    if isinstance(ans, bytes):
        ans = ans.decode('utf-8')
    return ans.replace('%20', '+')


def user_agent_is_ok(ua):
    return 'Mobile/' not in ua and 'Mobile ' not in ua


class CaptchaError(Exception):
    pass


class SearchFailed(ValueError):
    pass


def parse_html(raw):
    try:
        from html5_parser import parse
    except ImportError:
        # Old versions of calibre
        import html5lib
        return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
    else:
        return parse(raw)


def parse_details_page(url, log, timeout, browser, domain):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    from lxml.html import tostring
    log('Getting details from:', url)
    try:
        raw = browser.open_novisit(url, timeout=timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            log.error('URL malformed: %r' % url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Details page timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % url
            log.exception(msg)
        return

    oraw = raw
    if 'amazon.com.br' in url:
        # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
        raw = raw.decode('utf-8')
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
                         resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        raise ValueError('URL malformed: %r' % url)
    if '>Could not find the requested document in the cache.<' in raw:
        raise ValueError('No cached entry for %s found' % url)

    try:
        root = parse_html(clean_ascii_chars(raw))
    except Exception:
        msg = 'Failed to parse amazon details page: %r' % url
        log.exception(msg)
        return
    if domain == 'jp':
        for a in root.xpath('//a[@href]'):
            if 'black-curtain-redirect.html' in a.get('href'):
                url = a.get('href')
                if url:
                    if url.startswith('/'):
                        url = 'https://amazon.co.jp' + a.get('href')
                    log('Black curtain redirect found, following')
                    return parse_details_page(url, log, timeout, browser, domain)

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r' % url
        msg += tostring(errmsg[0], method='text', encoding='unicode').strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector


def parse_asin(root, log, url):
    try:
        link = root.xpath('//link[@rel="canonical" and @href]')
        for l in link:
            return l.get('href').rpartition('/')[-1]
    except Exception:
        log.exception('Error parsing ASIN for url: %r' % url)


class Worker(Thread):  # Get details {{{

    '''
    Get book details from Amazon's book page in a separate thread.
    '''

    def __init__(self, url, result_queue, browser, log, relevance, domain,
                 plugin, timeout=20, testing=False, preparsed_root=None,
                 cover_url_processor=None, filter_result=None):
        Thread.__init__(self)
        self.cover_url_processor = cover_url_processor
        self.preparsed_root = preparsed_root
        self.daemon = True
        self.testing = testing
        self.url, self.result_queue = url, result_queue
        self.log, self.timeout = log, timeout
        self.filter_result = filter_result or (lambda x, log: True)
        self.relevance, self.plugin = relevance, plugin
        self.browser = browser
        self.cover_url = self.amazon_id = self.isbn = None
        self.domain = domain
        from lxml.html import tostring
        self.tostring = tostring

        months = {  # {{{
            'de': {
                1: ['jän', 'januar'],
                2: ['februar'],
                3: ['märz'],
                5: ['mai'],
                6: ['juni'],
                7: ['juli'],
                10: ['okt', 'oktober'],
                12: ['dez', 'dezember']
            },
            'it': {
                1: ['gennaio', 'enn'],
                2: ['febbraio', 'febbr'],
                3: ['marzo'],
                4: ['aprile'],
                5: ['maggio', 'magg'],
                6: ['giugno'],
                7: ['luglio'],
                8: ['agosto', 'ag'],
                9: ['settembre', 'sett'],
                10: ['ottobre', 'ott'],
                11: ['novembre'],
                12: ['dicembre', 'dic'],
            },
            'fr': {
                1: ['janv'],
                2: ['févr'],
                3: ['mars'],
                4: ['avril'],
                5: ['mai'],
                6: ['juin'],
                7: ['juil'],
                8: ['août'],
                9: ['sept'],
                12: ['déc'],
            },
            'br': {
                1: ['janeiro'],
                2: ['fevereiro'],
                3: ['março'],
                4: ['abril'],
                5: ['maio'],
                6: ['junho'],
                7: ['julho'],
                8: ['agosto'],
                9: ['setembro'],
                10: ['outubro'],
                11: ['novembro'],
                12: ['dezembro'],
            },
            'es': {
                1: ['enero'],
                2: ['febrero'],
                3: ['marzo'],
                4: ['abril'],
                5: ['mayo'],
                6: ['junio'],
                7: ['julio'],
                8: ['agosto'],
                9: ['septiembre', 'setiembre'],
                10: ['octubre'],
                11: ['noviembre'],
                12: ['diciembre'],
            },
            'se': {
                1: ['januari'],
                2: ['februari'],
                3: ['mars'],
                4: ['april'],
                5: ['maj'],
                6: ['juni'],
                7: ['juli'],
                8: ['augusti'],
                9: ['september'],
                10: ['oktober'],
                11: ['november'],
                12: ['december'],
            },
            'jp': {
                1: ['1月'],
                2: ['2月'],
                3: ['3月'],
                4: ['4月'],
                5: ['5月'],
                6: ['6月'],
                7: ['7月'],
                8: ['8月'],
                9: ['9月'],
                10: ['10月'],
                11: ['11月'],
                12: ['12月'],
            },
            'nl': {
                1: ['januari'], 2: ['februari'], 3: ['maart'], 5: ['mei'], 6: ['juni'], 7: ['juli'], 8: ['augustus'], 10: ['oktober'],
            }

        }  # }}}

        self.english_months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        self.months = months.get(self.domain, {})

        self.pd_xpath = '''
            //h2[text()="Product Details" or \
                text()="Produktinformation" or \
                text()="Dettagli prodotto" or \
                text()="Product details" or \
                text()="Détails sur le produit" or \
                text()="Detalles del producto" or \
                text()="Detalhes do produto" or \
                text()="Productgegevens" or \
                text()="基本信息" or \
                starts-with(text(), "登録情報")]/../div[@class="content"]
            '''
        # "Editor:" is for Spanish
        self.publisher_xpath = '''
            descendant::*[starts-with(text(), "Publisher:") or \
                starts-with(text(), "Verlag:") or \
                starts-with(text(), "Editore:") or \
                starts-with(text(), "Editeur") or \
                starts-with(text(), "Editor:") or \
                starts-with(text(), "Editora:") or \
                starts-with(text(), "Uitgever:") or \
                starts-with(text(), "Utgivare:") or \
                starts-with(text(), "出版社:")]
            '''
        self.pubdate_xpath = '''
            descendant::*[starts-with(text(), "Publication Date:") or \
                starts-with(text(), "Audible.com Release Date:")]
            '''
        self.publisher_names = {'Publisher', 'Uitgever', 'Verlag', 'Utgivare',
                                'Editore', 'Editeur', 'Editor', 'Editora', '出版社'}

        self.language_xpath = '''
            descendant::*[
                starts-with(text(), "Language:") \
                or text() = "Language" \
                or text() = "Sprache:" \
                or text() = "Lingua:" \
                or text() = "Idioma:" \
                or starts-with(text(), "Langue") \
                or starts-with(text(), "言語") \
                or starts-with(text(), "Språk") \
                or starts-with(text(), "语种")
                ]
            '''
        self.language_names = {'Language', 'Sprache', 'Språk',
                               'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'}

        self.tags_xpath = '''
            descendant::h2[
                text() = "Look for Similar Items by Category" or
                text() = "Ähnliche Artikel finden" or
                text() = "Buscar productos similares por categoría" or
                text() = "Ricerca articoli simili per categoria" or
                text() = "Rechercher des articles similaires par rubrique" or
                text() = "Procure por items similares por categoria" or
                text() = "関連商品を探す"
            ]/../descendant::ul/li
        '''

        self.ratings_pat = re.compile(
            r'([0-9.,]+) ?(out of|von|van|su|étoiles sur|つ星のうち|de un máximo de|de|av) '
            r'([\d\.]+)( (stars|Sternen|stelle|estrellas|estrelas|sterren|stjärnor)){0,1}'
        )
        self.ratings_pat_cn = re.compile('([0-9.]+) 颗星,最多 5 颗星')
        self.ratings_pat_jp = re.compile(r'\d+つ星のうち([\d\.]+)')

        lm = {
            'eng': ('English', 'Englisch', 'Engels', 'Engelska'),
            'fra': ('French', 'Français'),
            'ita': ('Italian', 'Italiano'),
            'deu': ('German', 'Deutsch'),
            'spa': ('Spanish', 'Espa\xf1ol', 'Espaniol'),
            'jpn': ('Japanese', '日本語'),
            'por': ('Portuguese', 'Português'),
            'nld': ('Dutch', 'Nederlands',),
            'chs': ('Chinese', '中文', '简体中文'),
            'swe': ('Swedish', 'Svenska'),
        }
        self.lang_map = {}
        for code, names in lm.items():
            for name in names:
                self.lang_map[name] = code

        self.series_pat = re.compile(
            r'''
            \|\s*               # Prefix
            (Series)\s*:\s*     # Series declaration
            (?P<series>.+?)\s+  # The series name
            \((Book)\s*         # Book declaration
            (?P<index>[0-9.]+)  # Series index
            \s*\)
            ''', re.X)
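        # series_pat is aimed at strings like (hypothetical example):
        #   "Paperback | Series: The Craft Sequence (Book 1)"
        # from which it extracts series="The Craft Sequence" and index=1.0.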

    def delocalize_datestr(self, raw):
        if self.domain == 'cn':
            return raw.replace('年', '-').replace('月', '-').replace('日', '')
        if not self.months:
            return raw
        ans = raw.lower()
        for i, vals in self.months.items():
            for x in vals:
                ans = ans.replace(x, self.english_months[i])
        ans = ans.replace(' de ', ' ')
        return ans

    def run(self):
        try:
            self.get_details()
        except:
            self.log.exception('get_details failed for url: %r' % self.url)

    def get_details(self):
        if self.preparsed_root is None:
            raw, root, selector = parse_details_page(
                self.url, self.log, self.timeout, self.browser, self.domain)
        else:
            raw, root, selector = self.preparsed_root

        from css_selectors import Select
        self.selector = Select(root)
        self.parse_details(raw, root)

    def parse_details(self, raw, root):
        asin = parse_asin(root, self.log, self.url)
        if not asin and root.xpath('//form[@action="/errors/validateCaptcha"]'):
            raise CaptchaError(
                'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
        if self.testing:
            import tempfile
            import uuid
            with tempfile.NamedTemporaryFile(prefix=(asin or type('')(uuid.uuid4())) + '_',
                                             suffix='.html', delete=False) as f:
                f.write(raw)
            print('Downloaded html for', asin, 'saved in', f.name)

        try:
            title = self.parse_title(root)
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            title = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            authors = []

        if not title or not authors or not asin:
            self.log.error(
                'Could not find title/authors/asin for %r' % self.url)
            self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title,
                                                               authors))
            return

        mi = Metadata(title, authors)
        idtype = 'amazon' if self.domain == 'com' else 'amazon_' + self.domain
        mi.set_identifier(idtype, asin)
        self.amazon_id = asin

        try:
            mi.rating = self.parse_rating(root)
        except:
            self.log.exception('Error parsing ratings for url: %r' % self.url)

        try:
            mi.comments = self.parse_comments(root, raw)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)

        try:
            series, series_index = self.parse_series(root)
            if series:
                mi.series, mi.series_index = series, series_index
            elif self.testing:
                mi.series, mi.series_index = 'Dummy series for testing', 1
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)

        try:
            mi.tags = self.parse_tags(root)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            self.cover_url = self.parse_cover(root, raw)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
        if self.cover_url_processor is not None and self.cover_url and self.cover_url.startswith('/'):
            self.cover_url = self.cover_url_processor(self.cover_url)
        mi.has_cover = bool(self.cover_url)

        detail_bullets = root.xpath('//*[@data-feature-name="detailBullets"]')
        non_hero = tuple(self.selector(
            'div#bookDetails_container_div div#nonHeroSection')) or tuple(self.selector(
                '#productDetails_techSpec_sections'))
        if detail_bullets:
            self.parse_detail_bullets(root, mi, detail_bullets[0])
        elif non_hero:
            try:
                self.parse_new_details(root, mi, non_hero[0])
            except:
                self.log.exception(
                    'Failed to parse new-style book details section')

        else:
            pd = root.xpath(self.pd_xpath)
            if pd:
                pd = pd[0]

                try:
                    isbn = self.parse_isbn(pd)
                    if isbn:
                        self.isbn = mi.isbn = isbn
                except:
                    self.log.exception(
                        'Error parsing ISBN for url: %r' % self.url)

                try:
                    mi.publisher = self.parse_publisher(pd)
                except:
                    self.log.exception(
                        'Error parsing publisher for url: %r' % self.url)

                try:
                    mi.pubdate = self.parse_pubdate(pd)
                except:
                    self.log.exception(
                        'Error parsing publish date for url: %r' % self.url)

                try:
                    lang = self.parse_language(pd)
                    if lang:
                        mi.language = lang
                except:
                    self.log.exception(
                        'Error parsing language for url: %r' % self.url)

            else:
                self.log.warning(
                    'Failed to find product description for url: %r' % self.url)

        mi.source_relevance = self.relevance

        if self.amazon_id:
            if self.isbn:
                self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(self.amazon_id,
                                                          self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)

        if self.filter_result(mi, self.log):
            self.result_queue.put(mi)

    def totext(self, elem, only_printable=False):
        res = self.tostring(elem, encoding='unicode', method='text')
        if only_printable:
            filtered_characters = list(s for s in res if s.isprintable())
            res = ''.join(filtered_characters).strip()
        return res

    def parse_title(self, root):

        def sanitize_title(title):
            ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
            if not ans:
                ans = title.rpartition('[')[0].strip()
            return ans

        h1 = root.xpath('//h1[@id="title"]')
        if h1:
            h1 = h1[0]
            for child in h1.xpath('./*[contains(@class, "a-color-secondary")]'):
                h1.remove(child)
            return sanitize_title(self.totext(h1))
        tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')
        if not tdiv:
            span = root.xpath('//*[@id="ebooksTitle"]')
            if span:
                return sanitize_title(self.totext(span[0]))
            raise ValueError('No title block found')
        tdiv = tdiv[0]
        actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
        if actual_title:
            title = self.tostring(actual_title[0], encoding='unicode',
                                  method='text').strip()
        else:
            title = self.tostring(tdiv, encoding='unicode',
                                  method='text').strip()
        return sanitize_title(title)

    def parse_authors(self, root):
        for sel in (
                '#byline .author .contributorNameID',
                '#byline .author a.a-link-normal',
                '#bylineInfo .author .contributorNameID',
                '#bylineInfo .author a.a-link-normal',
                '#bylineInfo #bylineContributor',
        ):
            matches = tuple(self.selector(sel))
            if matches:
                authors = [self.totext(x) for x in matches]
                return [a for a in authors if a]

        x = '//h1[contains(@class, "parseasinTitle")]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
        aname = root.xpath(x)
        if not aname:
            aname = root.xpath('''
            //h1[contains(@class, "parseasinTitle")]/following-sibling::*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]
            ''')
        for x in aname:
            x.tail = ''
        authors = [self.tostring(x, encoding='unicode', method='text').strip() for x
                   in aname]
        authors = [a for a in authors if a]
        return authors

    def parse_rating(self, root):
        for x in root.xpath('//div[@id="cpsims-feature" or @id="purchase-sims-feature" or @id="rhf"]'):
            # Remove the similar books section as it can cause spurious
            # ratings matches
            x.getparent().remove(x)

        rating_paths = (
            '//div[@data-feature-name="averageCustomerReviews" or @id="averageCustomerReviews"]',
            '//div[@class="jumpBar"]/descendant::span[contains(@class,"asinReviewsSummary")]',
            '//div[@class="buying"]/descendant::span[contains(@class,"asinReviewsSummary")]',
            '//span[@class="crAvgStars"]/descendant::span[contains(@class,"asinReviewsSummary")]'
        )
        ratings = None
        for p in rating_paths:
            ratings = root.xpath(p)
            if ratings:
                break

        def parse_ratings_text(text):
            try:
                m = self.ratings_pat.match(text)
                return float(m.group(1).replace(',', '.')) / float(m.group(3)) * 5
            except Exception:
                pass

        if ratings:
            ratings = ratings[0]
            for elem in ratings.xpath('descendant::*[@title]'):
                t = elem.get('title').strip()
                if self.domain == 'cn':
                    m = self.ratings_pat_cn.match(t)
                    if m is not None:
                        return float(m.group(1))
                elif self.domain == 'jp':
                    m = self.ratings_pat_jp.match(t)
                    if m is not None:
                        return float(m.group(1))
                else:
                    ans = parse_ratings_text(t)
                    if ans is not None:
                        return ans
            for elem in ratings.xpath('descendant::span[@class="a-icon-alt"]'):
                t = self.tostring(
                    elem, encoding='unicode', method='text', with_tail=False).strip()
                ans = parse_ratings_text(t)
                if ans is not None:
                    return ans

    def _render_comments(self, desc):
        from calibre.library.comments import sanitize_comments_html

        for c in desc.xpath('descendant::noscript'):
            c.getparent().remove(c)
        for c in desc.xpath('descendant::*[@class="seeAll" or'
                            ' @class="emptyClear" or @id="collapsePS" or'
                            ' @id="expandPS"]'):
            c.getparent().remove(c)
        for b in desc.xpath('descendant::b[@style]'):
            # Bing highlights search results
            s = b.get('style', '')
            if 'color' in s:
                b.tag = 'span'
                del b.attrib['style']

        for a in desc.xpath('descendant::a[@href]'):
            del a.attrib['href']
            a.tag = 'span'
        desc = self.tostring(desc, method='html', encoding='unicode').strip()
        desc = xml_replace_entities(desc, 'utf-8')

        # Encoding bug in Amazon data: U+FFFD (the replacement char) sometimes
        # appears in place of an apostrophe
        desc = desc.replace('\ufffd', "'")
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Collapse whitespace
        # desc = re.sub('\n+', '\n', desc)
        # desc = re.sub(' +', ' ', desc)
        # Remove the notice about text referring to out of print editions
        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        return sanitize_comments_html(desc)

    def parse_comments(self, root, raw):
        try:
            from urllib.parse import unquote
        except ImportError:
            from urllib import unquote
        ans = ''
        ns = tuple(self.selector('#bookDescription_feature_div noscript'))
        if ns:
            ns = ns[0]
            if len(ns) == 0 and ns.text:
                import html5lib
                # html5lib parsed noscript as CDATA
                ns = html5lib.parseFragment(
                    '<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
            else:
                ns.tag = 'div'
            ans = self._render_comments(ns)
        else:
            desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
            if desc:
                ans = self._render_comments(desc[0])
            else:
                ns = tuple(self.selector('#bookDescription_feature_div .a-expander-content'))
                if ns:
                    ans = self._render_comments(ns[0])

        desc = root.xpath(
            '//div[@id="productDescription"]/*[@class="content"]')
        if desc:
            ans += self._render_comments(desc[0])
        else:
            # Idiot chickens from amazon strike again. This data is now stored
            # in a JS variable inside a script tag, URL encoded.
            m = re.search(br'var\s+iframeContent\s*=\s*"([^"]+)"', raw)
            if m is not None:
                try:
                    text = unquote(m.group(1)).decode('utf-8')
                    nr = parse_html(text)
                    desc = nr.xpath(
                        '//div[@id="productDescription"]/*[@class="content"]')
                    if desc:
                        ans += self._render_comments(desc[0])
                except Exception as e:
                    self.log.warn(
                        'Parsing of obfuscated product description failed with error: %s' % as_unicode(e))
            else:
                desc = root.xpath('//div[@id="productDescription_fullView"]')
                if desc:
                    ans += self._render_comments(desc[0])

        return ans

    def parse_series(self, root):
        ans = (None, None)

        # This is found on the paperback/hardback pages for books on amazon.com
        series = root.xpath('//div[@data-feature-name="seriesTitle"]')
        if series:
            series = series[0]
            spans = series.xpath('./span')
            if spans:
                raw = self.tostring(
                    spans[0], encoding='unicode', method='text', with_tail=False).strip()
                m = re.search(r'\s+([0-9.]+)$', raw.strip())
                if m is not None:
                    series_index = float(m.group(1))
                    s = series.xpath('./a[@id="series-page-link"]')
                    if s:
                        series = self.tostring(
                            s[0], encoding='unicode', method='text', with_tail=False).strip()
                        if series:
                            ans = (series, series_index)
        else:
            series = root.xpath('//div[@id="seriesBulletWidget_feature_div"]')
            if series:
                a = series[0].xpath('descendant::a')
                if a:
                    raw = self.tostring(a[0], encoding='unicode', method='text', with_tail=False)
                    m = re.search(r'(?:Book|Libro)\s+(?P<index>[0-9.]+)\s+(?:of|de)\s+([0-9.]+)\s*:\s*(?P<series>.+)', raw.strip())
                    if m is not None:
                        ans = (m.group('series').strip(), float(m.group('index')))

        # This is found on Kindle edition pages on amazon.com
        if ans == (None, None):
            for span in root.xpath('//div[@id="aboutEbooksSection"]//li/span'):
                text = (span.text or '').strip()
                m = re.match(r'Book\s+([0-9.]+)', text)
                if m is not None:
                    series_index = float(m.group(1))
                    a = span.xpath('./a[@href]')
                    if a:
                        series = self.tostring(
                            a[0], encoding='unicode', method='text', with_tail=False).strip()
                        if series:
                            ans = (series, series_index)
        # This is found on newer Kindle edition pages on amazon.com
        if ans == (None, None):
            for b in root.xpath('//div[@id="reviewFeatureGroup"]/span/b'):
                text = (b.text or '').strip()
                m = re.match(r'Book\s+([0-9.]+)', text)
                if m is not None:
                    series_index = float(m.group(1))
                    a = b.getparent().xpath('./a[@href]')
                    if a:
                        series = self.tostring(
                            a[0], encoding='unicode', method='text', with_tail=False).partition('(')[0].strip()
                        if series:
                            ans = series, series_index

        if ans == (None, None):
            desc = root.xpath('//div[@id="ps-content"]/div[@class="buying"]')
            if desc:
                raw = self.tostring(desc[0], method='text', encoding='unicode')
                raw = re.sub(r'\s+', ' ', raw)
                match = self.series_pat.search(raw)
                if match is not None:
                    s, i = match.group('series'), float(match.group('index'))
                    if s:
                        ans = (s, i)
        if ans[0]:
            ans = (re.sub(r'\s+Series$', '', ans[0]).strip(), ans[1])
            ans = (re.sub(r'\(.+?\s+Series\)$', '', ans[0]).strip(), ans[1])
        return ans

    def parse_tags(self, root):
        ans = []
        exclude_tokens = {'kindle', 'a-z'}
        exclude = {'special features', 'by authors',
                   'authors & illustrators', 'books', 'new; used & rental textbooks'}
        seen = set()
        for li in root.xpath(self.tags_xpath):
            for i, a in enumerate(li.iterdescendants('a')):
                if i > 0:
                    # we ignore the first category since it is almost always
                    # too broad
                    raw = (a.text or '').strip().replace(',', ';')
                    lraw = icu_lower(raw)
                    tokens = frozenset(lraw.split())
                    if raw and lraw not in exclude and not tokens.intersection(exclude_tokens) and lraw not in seen:
                        ans.append(raw)
                        seen.add(lraw)
        return ans

    def parse_cover(self, root, raw=b""):
        # Look for the image URL in javascript, using the first image in the
        # image gallery as the cover
        import json
        imgpat = re.compile(r"""'imageGalleryData'\s*:\s*(\[\s*{.+])""")
        for script in root.xpath('//script'):
            m = imgpat.search(script.text or '')
            if m is not None:
                try:
                    return json.loads(m.group(1))[0]['mainUrl']
                except Exception:
                    continue

        def clean_img_src(src):
            parts = src.split('/')
            if len(parts) > 3:
                bn = parts[-1]
                sparts = bn.split('_')
                if len(sparts) > 2:
                    bn = re.sub(r'\.\.jpg$', '.jpg', (sparts[0] + sparts[-1]))
                    return ('/'.join(parts[:-1])) + '/' + bn

        imgpat2 = re.compile(r'var imageSrc = "([^"]+)"')
        for script in root.xpath('//script'):
            m = imgpat2.search(script.text or '')
            if m is not None:
                src = m.group(1)
                url = clean_img_src(src)
                if url:
                    return url

        imgs = root.xpath(
            '//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]')
        if not imgs:
            imgs = (
                root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]') or
                root.xpath('//div[@id="main-image-container" or @id="ebooks-main-image-container"]//img[@src]') or
                root.xpath(
                    '//div[@id="mainImageContainer"]//img[@data-a-dynamic-image]')
            )
            for img in imgs:
                try:
                    idata = json.loads(img.get('data-a-dynamic-image'))
                except Exception:
                    imgs = ()
                else:
                    mwidth = 0
                    try:
                        url = None
                        for iurl, (width, height) in idata.items():
                            if width > mwidth:
                                mwidth = width
                                url = iurl
                        return url
                    except Exception:
                        pass

        for img in imgs:
            src = img.get('src')
            if 'data:' in src:
                continue
            if 'loading-' in src:
                js_img = re.search(br'"largeImage":"(https?://[^"]+)",', raw)
                if js_img:
                    src = js_img.group(1).decode('utf-8')
            if ('/no-image-avail' not in src and 'loading-' not in src and '/no-img-sm' not in src):
                self.log('Found image: %s' % src)
                url = clean_img_src(src)
                if url:
                    return url
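
    # Note on clean_img_src() above: it strips the size modifiers from an
    # Amazon image filename so the full-size image is fetched; e.g. a
    # hypothetical '.../images/I/51xyz._SX331_BO1,204,203,200_.jpg' becomes
    # '.../images/I/51xyz.jpg'.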

    def parse_detail_bullets(self, root, mi, container):
        ul = next(self.selector('.detail-bullet-list', root=container))
        for span in self.selector('.a-list-item', root=ul):
            cells = span.xpath('./span')
            if len(cells) >= 2:
                self.parse_detail_cells(mi, cells[0], cells[1])

    def parse_new_details(self, root, mi, non_hero):
        table = non_hero.xpath('descendant::table')[0]
        for tr in table.xpath('descendant::tr'):
            cells = tr.xpath('descendant::*[local-name()="td" or local-name()="th"]')
            if len(cells) == 2:
                self.parse_detail_cells(mi, cells[0], cells[1])

    def parse_detail_cells(self, mi, c1, c2):
        name = self.totext(c1, only_printable=True).strip().strip(':').strip()
        val = self.totext(c2).strip()
        if not val:
            return
        if name in self.language_names:
            ans = self.lang_map.get(val, None)
            if not ans:
                ans = canonicalize_lang(val)
            if ans:
                mi.language = ans
        elif name in self.publisher_names:
            pub = val.partition(';')[0].partition('(')[0].strip()
            if pub:
                mi.publisher = pub
            date = val.rpartition('(')[-1].replace(')', '').strip()
            try:
                from calibre.utils.date import parse_only_date
                date = self.delocalize_datestr(date)
                mi.pubdate = parse_only_date(date, assume_utc=True)
            except:
                self.log.exception('Failed to parse pubdate: %s' % val)
        elif name in {'ISBN', 'ISBN-10', 'ISBN-13'}:
            ans = check_isbn(val)
            if ans:
                self.isbn = mi.isbn = ans
        elif name in {'Publication date'}:
            from calibre.utils.date import parse_only_date
            date = self.delocalize_datestr(val)
            mi.pubdate = parse_only_date(date, assume_utc=True)

    def parse_isbn(self, pd):
        items = pd.xpath(
            'descendant::*[starts-with(text(), "ISBN")]')
        if not items:
            items = pd.xpath(
                'descendant::b[contains(text(), "ISBN:")]')
        for x in reversed(items):
            if x.tail:
                ans = check_isbn(x.tail.strip())
                if ans:
                    return ans

    def parse_publisher(self, pd):
        for x in reversed(pd.xpath(self.publisher_xpath)):
            if x.tail:
                ans = x.tail.partition(';')[0]
                return ans.partition('(')[0].strip()

    def parse_pubdate(self, pd):
        from calibre.utils.date import parse_only_date
        for x in reversed(pd.xpath(self.pubdate_xpath)):
            if x.tail:
                date = x.tail.strip()
                date = self.delocalize_datestr(date)
                try:
                    return parse_only_date(date, assume_utc=True)
                except Exception:
                    pass
        for x in reversed(pd.xpath(self.publisher_xpath)):
            if x.tail:
                ans = x.tail
                date = ans.rpartition('(')[-1].replace(')', '').strip()
                date = self.delocalize_datestr(date)
                try:
                    return parse_only_date(date, assume_utc=True)
                except Exception:
                    pass

    def parse_language(self, pd):
        for x in reversed(pd.xpath(self.language_xpath)):
            if x.tail:
                raw = x.tail.strip().partition(',')[0].strip()
                ans = self.lang_map.get(raw, None)
                if ans:
                    return ans
                ans = canonicalize_lang(raw)
                if ans:
                    return ans
# }}}


class Amazon(Source):

    name = 'Amazon.com'
    version = (1, 2, 22)
    minimum_calibre_version = (2, 82, 0)
    description = _('Downloads metadata and covers from Amazon')

    capabilities = frozenset(('identify', 'cover'))
    touched_fields = frozenset(('title', 'authors', 'identifier:amazon',
                                'rating', 'comments', 'publisher', 'pubdate',
                                'languages', 'series', 'tags'))
    has_html_comments = True
    supports_gzip_transfer_encoding = True
    prefer_results_with_isbn = False

    AMAZON_DOMAINS = {
        'com': _('US'),
        'fr': _('France'),
        'de': _('Germany'),
        'uk': _('UK'),
        'au': _('Australia'),
        'it': _('Italy'),
        'jp': _('Japan'),
        'es': _('Spain'),
        'br': _('Brazil'),
        'nl': _('Netherlands'),
        'cn': _('China'),
        'ca': _('Canada'),
        'se': _('Sweden'),
    }

    SERVERS = {
        'auto': _('Choose server automatically'),
        'amazon': _('Amazon servers'),
        'bing': _('Bing search cache'),
        'google': _('Google search cache'),
        'wayback': _('Wayback machine cache (slow)'),
    }

    options = (
        Option('domain', 'choices', 'com', _('Amazon country website to use:'),
               _('Metadata from Amazon will be fetched using this '
                 'country\'s Amazon website.'), choices=AMAZON_DOMAINS),
        Option('server', 'choices', 'auto', _('Server to get data from:'),
               _(
                   'Amazon has started blocking attempts to download'
                   ' metadata from its servers. To get around this problem,'
                   ' calibre can fetch the Amazon data from many different'
                   ' places where it is cached. Choose the source you prefer.'
               ), choices=SERVERS),
        Option('use_mobi_asin', 'bool', False, _('Use the MOBI-ASIN for metadata search'),
               _(
                   'Enable this option to search for metadata with an'
                   ' ASIN identifier from the MOBI file at the current country website,'
                   ' unless any other Amazon ID is available. Note that if the'
                   ' MOBI file came from a different Amazon country store, you could get'
                   ' incorrect results.'
               )),
    )

    def __init__(self, *args, **kwargs):
        Source.__init__(self, *args, **kwargs)
        self.set_amazon_id_touched_fields()

    def test_fields(self, mi):
        '''
        Return the first field from self.touched_fields that is null on the
        mi object.
        '''
        for key in self.touched_fields:
            if key.startswith('identifier:'):
                key = key.partition(':')[-1]
                if key == 'amazon':
                    if self.domain != 'com':
                        key += '_' + self.domain
                if not mi.has_identifier(key):
                    return 'identifier: ' + key
            elif mi.is_null(key):
                return key

    @property
    def browser(self):
        br = self._browser
        if br is None:
            # Seed with a value user_agent_is_ok() rejects, so the loop below
            # always picks a fresh, non-mobile user agent
            ua = 'Mobile '
            while not user_agent_is_ok(ua):
                ua = random_user_agent(allow_ie=False)
            # ua = 'Mozilla/5.0 (Linux; Android 8.0.0; VTR-L29; rv:63.0) Gecko/20100101 Firefox/63.0'
            self._browser = br = browser(user_agent=ua)
            br.set_handle_gzip(True)
            if self.use_search_engine:
                br.addheaders += [
                    ('Accept', accept_header_for_ua(ua)),
                    ('Upgrade-insecure-requests', '1'),
                ]
            else:
                br.addheaders += [
                    ('Accept', accept_header_for_ua(ua)),
                    ('Upgrade-insecure-requests', '1'),
                    ('Referer', self.referrer_for_domain()),
                ]
        return br

    def save_settings(self, *args, **kwargs):
        Source.save_settings(self, *args, **kwargs)
        self.set_amazon_id_touched_fields()

    def set_amazon_id_touched_fields(self):
        ident_name = "identifier:amazon"
        if self.domain != 'com':
            ident_name += '_' + self.domain
        tf = [x for x in self.touched_fields if not
              x.startswith('identifier:amazon')] + [ident_name]
        self.touched_fields = frozenset(tf)

    def get_domain_and_asin(self, identifiers, extra_domains=()):
        identifiers = {k.lower(): v for k, v in identifiers.items()}
        for key, val in identifiers.items():
            if key in ('amazon', 'asin'):
                return 'com', val
            if key.startswith('amazon_'):
                domain = key.partition('_')[-1]
                if domain and (domain in self.AMAZON_DOMAINS or domain in extra_domains):
                    return domain, val
        if self.prefs['use_mobi_asin']:
            val = identifiers.get('mobi-asin')
            if val is not None:
                return self.domain, val
        return None, None
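
    # For example (hypothetical identifiers dicts):
    #   {'amazon': 'B00ABC1234'}     -> ('com', 'B00ABC1234')
    #   {'amazon_uk': 'B00ABC1234'}  -> ('uk', 'B00ABC1234')
    #   {'mobi-asin': 'B00ABC1234'}  -> (self.domain, 'B00ABC1234'), but only
    #   when the use_mobi_asin option is enabled; otherwise (None, None)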

    def referrer_for_domain(self, domain=None):
        domain = domain or self.domain
        return {
            'uk': 'https://www.amazon.co.uk/',
            'au': 'https://www.amazon.com.au/',
            'br': 'https://www.amazon.com.br/',
            'jp': 'https://www.amazon.co.jp/',
        }.get(domain, 'https://www.amazon.%s/' % domain)

    def _get_book_url(self, identifiers):  # {{{
        domain, asin = self.get_domain_and_asin(
            identifiers, extra_domains=('in', 'au', 'ca'))
        if domain and asin:
            url = None
            r = self.referrer_for_domain(domain)
            if r is not None:
                url = r + 'dp/' + asin
            if url:
                idtype = 'amazon' if domain == 'com' else 'amazon_' + domain
                return domain, idtype, asin, url

    def get_book_url(self, identifiers):
        ans = self._get_book_url(identifiers)
        if ans is not None:
            return ans[1:]

    def get_book_url_name(self, idtype, idval, url):
        if idtype == 'amazon':
            return self.name
        return 'A' + idtype.replace('_', '.')[1:]
    # }}}

    @property
    def domain(self):
        x = getattr(self, 'testing_domain', None)
        if x is not None:
            return x
        domain = self.prefs['domain']
        if domain not in self.AMAZON_DOMAINS:
            domain = 'com'

        return domain

    @property
    def server(self):
        x = getattr(self, 'testing_server', None)
        if x is not None:
            return x
        server = self.prefs['server']
        if server not in self.SERVERS:
            server = 'auto'
        return server

    @property
    def use_search_engine(self):
        return self.server != 'amazon'

    def clean_downloaded_metadata(self, mi):
        docase = (
            mi.language == 'eng' or
            (mi.is_null('language') and self.domain in {'com', 'uk', 'au'})
        )
        if mi.title and docase:
            # Remove series information from title
            m = re.search(r'\S+\s+(\(.+?\s+Book\s+\d+\))$', mi.title)
            if m is not None:
                mi.title = mi.title.replace(m.group(1), '').strip()
            mi.title = fixcase(mi.title)
        mi.authors = fixauthors(mi.authors)
        if mi.tags and docase:
            mi.tags = list(map(fixcase, mi.tags))
        mi.isbn = check_isbn(mi.isbn)
        if mi.series and docase:
            mi.series = fixcase(mi.series)
        if mi.title and mi.series:
            for pat in (r':\s*Book\s+\d+\s+of\s+%s$', r'\(%s\)$', r':\s*%s\s+Book\s+\d+$'):
                pat = pat % re.escape(mi.series)
                q = re.sub(pat, '', mi.title, flags=re.I).strip()
                if q and q != mi.title:
                    mi.title = q
                    break

    def get_website_domain(self, domain):
        return {'uk': 'co.uk', 'jp': 'co.jp', 'br': 'com.br', 'au': 'com.au'}.get(domain, domain)

    def create_query(self, log, title=None, authors=None, identifiers={},  # {{{
                     domain=None, for_amazon=True):
        try:
            from urllib.parse import urlencode, unquote_plus
        except ImportError:
            from urllib import urlencode, unquote_plus
        if domain is None:
            domain = self.domain

        idomain, asin = self.get_domain_and_asin(identifiers)
        if idomain is not None:
            domain = idomain

        # See the amazon detailed search page to get all options
        terms = []
        q = {'search-alias': 'aps',
             'unfiltered': '1',
             }

        if domain == 'com':
            q['sort'] = 'relevanceexprank'
        else:
            q['sort'] = 'relevancerank'

        isbn = check_isbn(identifiers.get('isbn', None))

        if asin is not None:
            q['field-keywords'] = asin
            terms.append(asin)
        elif isbn is not None:
            q['field-isbn'] = isbn
            if len(isbn) == 13:
                terms.extend('({} OR {}-{})'.format(isbn, isbn[:3], isbn[3:]).split())
            else:
                terms.append(isbn)
        else:
            # Only return book results
            q['search-alias'] = {'br': 'digital-text',
                                 'nl': 'aps'}.get(domain, 'stripbooks')
            if title:
                title_tokens = list(self.get_title_tokens(title))
                if title_tokens:
                    q['field-title'] = ' '.join(title_tokens)
                    terms.extend(title_tokens)
            if authors:
                author_tokens = list(self.get_author_tokens(authors,
                                                            only_first_author=True))
                if author_tokens:
                    q['field-author'] = ' '.join(author_tokens)
                    terms.extend(author_tokens)
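
        # For an ISBN-13 such as 9780552140287 the terms above become
        # ['(9780552140287', 'OR', '978-0552140287)'], so that a search
        # engine query matches both hyphenated and unhyphenated forms.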

        if not ('field-keywords' in q or 'field-isbn' in q or
                ('field-title' in q)):
            # Insufficient metadata to make an identify query
            return None, None

        if not for_amazon:
            return terms, domain

        if domain == 'nl':
            q['__mk_nl_NL'] = 'ÅMÅŽÕÑ'
            if 'field-keywords' not in q:
                q['field-keywords'] = ''
            for f in 'field-isbn field-title field-author'.split():
                q['field-keywords'] += ' ' + q.pop(f, '')
            q['field-keywords'] = q['field-keywords'].strip()

        encoded_q = dict([(x.encode('utf-8', 'ignore'), y.encode(
            'utf-8', 'ignore')) for x, y in q.items()])
        url_query = urlencode(encoded_q)
        # amazon's servers want IRIs with unicode characters, not percent-escaped ones
        parts = []
        for x in url_query.split(b'&' if isinstance(url_query, bytes) else '&'):
            k, v = x.split(b'=' if isinstance(x, bytes) else '=', 1)
            parts.append('{}={}'.format(iri_quote_plus(unquote_plus(k)), iri_quote_plus(unquote_plus(v))))
        url_query = '&'.join(parts)
        url = 'https://www.amazon.%s/s/?' % self.get_website_domain(
            domain) + url_query
        return url, domain

    # }}}

    def get_cached_cover_url(self, identifiers):  # {{{
        url = None
        domain, asin = self.get_domain_and_asin(identifiers)
        if asin is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                asin = self.cached_isbn_to_identifier(isbn)
        if asin is not None:
            url = self.cached_identifier_to_cover_url(asin)

        return url
    # }}}

    def parse_results_page(self, root, domain):  # {{{
        from lxml.html import tostring

        matches = []

        def title_ok(title):
            title = title.lower()
            bad = ['bulk pack', '[audiobook]', '[audio cd]',
                   '(a book companion)', '( slipcase with door )', ': free sampler']
            if self.domain == 'com':
                bad.extend(['(%s edition)' % x for x in ('spanish', 'german')])
            for x in bad:
                if x in title:
                    return False
            if title and title[0] in '[{' and re.search(r'\(\s*author\s*\)', title) is not None:
                # Bad entries in the catalog
                return False
            return True

        for query in (
                '//div[contains(@class, "s-result-list")]//h2/a[@href]',
                '//div[contains(@class, "s-result-list")]//div[@data-index]//h5//a[@href]',
                r'//li[starts-with(@id, "result_")]//a[@href and contains(@class, "s-access-detail-page")]',
        ):
            result_links = root.xpath(query)
            if result_links:
                break
        for a in result_links:
            title = tostring(a, method='text', encoding='unicode')
            if title_ok(title):
                url = a.get('href')
                if url.startswith('/'):
                    url = 'https://www.amazon.%s%s' % (
                        self.get_website_domain(domain), url)
                matches.append(url)

        if not matches:
            # Previous generation of results page markup
            for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
                links = div.xpath(r'descendant::a[@class="title" and @href]')
                if not links:
                    # New amazon markup
                    links = div.xpath('descendant::h3/a[@href]')
                for a in links:
                    title = tostring(a, method='text', encoding='unicode')
                    if title_ok(title):
                        url = a.get('href')
                        if url.startswith('/'):
                            url = 'https://www.amazon.%s%s' % (
                                self.get_website_domain(domain), url)
                        matches.append(url)
                        break

        if not matches:
            # This can happen for some user agents that Amazon thinks are
            # mobile/less capable
            for td in root.xpath(
                    r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'):
                for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'):
                    title = tostring(a, method='text', encoding='unicode')
                    if title_ok(title):
                        url = a.get('href')
                        if url.startswith('/'):
                            url = 'https://www.amazon.%s%s' % (
                                self.get_website_domain(domain), url)
                        matches.append(url)
                        break
        if not matches and root.xpath('//form[@action="/errors/validateCaptcha"]'):
            raise CaptchaError('Amazon returned a CAPTCHA page. Recently Amazon has begun using statistical'
                               ' profiling to block access to its website. As such this metadata plugin is'
                               ' unlikely to ever work reliably.')

        # Keep only the top 3 matches as the matches are sorted by relevance by
        # Amazon so lower matches are not likely to be very relevant
        return matches[:3]
    # }}}

    def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout):  # {{{
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.chardet import xml_to_unicode
        matches = []
        query, domain = self.create_query(log, title=title, authors=authors,
                                          identifiers=identifiers)
        if query is None:
            log.error('Insufficient metadata to construct query')
            raise SearchFailed()
        try:
            raw = br.open_novisit(query, timeout=timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                log.error('Query malformed: %r' % query)
                raise SearchFailed()
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = _('Amazon timed out. Try again later.')
                log.error(msg)
            else:
                msg = 'Failed to make identify query: %r' % query
                log.exception(msg)
            raise SearchFailed()

        raw = clean_ascii_chars(xml_to_unicode(raw,
                                               strip_encoding_pats=True, resolve_entities=True)[0])

        if testing:
            import tempfile
            with tempfile.NamedTemporaryFile(prefix='amazon_results_',
                                             suffix='.html', delete=False) as f:
                f.write(raw.encode('utf-8'))
            print('Downloaded html for results page saved in', f.name)

        matches = []
        found = '<title>404 - ' not in raw

        if found:
            try:
                root = parse_html(raw)
            except Exception:
                msg = 'Failed to parse amazon page for query: %r' % query
                log.exception(msg)
                raise SearchFailed()

            matches = self.parse_results_page(root, domain)

        return matches, query, domain, None
    # }}}

    def search_search_engine(self, br, testing, log, abort, title, authors, identifiers, timeout, override_server=None):  # {{{
        from calibre.ebooks.metadata.sources.update import search_engines_module
        terms, domain = self.create_query(log, title=title, authors=authors,
                                          identifiers=identifiers, for_amazon=False)
        site = self.referrer_for_domain(
            domain)[len('https://'):].partition('/')[0]
        matches = []
        se = search_engines_module()
        server = override_server or self.server
        if server in ('bing',):
            urlproc, sfunc = se.bing_url_processor, se.bing_search
        elif server in ('auto', 'google'):
            urlproc, sfunc = se.google_url_processor, se.google_search
        elif server == 'wayback':
            urlproc, sfunc = se.wayback_url_processor, se.ddg_search
        results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
        br.set_current_header('Referer', qurl)
        for result in results:
            if abort.is_set():
                return matches, terms, domain, None

            purl = urlparse(result.url)
            if '/dp/' in purl.path and site in purl.netloc:
                url = result.cached_url
                if url is None:
                    url = se.wayback_machine_cached_url(
                        result.url, br, timeout=timeout)
                if url is None:
                    log('Failed to find cached page for:', result.url)
                    continue
                if url not in matches:
                    matches.append(url)
                if len(matches) >= 3:
                    break
            else:
                log('Skipping non-book result:', result)
        if not matches:
            log('No search engine results for terms:', ' '.join(terms))
            if urlproc is se.google_url_processor:
                # Google does not cache adult titles
                log('Trying the bing search engine instead')
                return self.search_search_engine(br, testing, log, abort, title, authors, identifiers, timeout, 'bing')
        return matches, terms, domain, urlproc
    # }}}

    def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
                 identifiers={}, timeout=60):
        '''
        Note this method will retry without identifiers automatically if no
        match is found with identifiers.
        '''

        testing = getattr(self, 'running_a_test', False)

        udata = self._get_book_url(identifiers)
        br = self.browser
        log('User-agent:', br.current_user_agent())
        log('Server:', self.server)
        if testing:
            print('User-agent:', br.current_user_agent())
        if udata is not None and not self.use_search_engine:
            # Try to directly get details page instead of running a search.
            # Cannot use search engine as the directly constructed URL is
            # usually redirected to a full URL by amazon, and is therefore
            # not cached
            domain, idtype, asin, durl = udata
            if durl is not None:
                preparsed_root = parse_details_page(
                    durl, log, timeout, br, domain)
                if preparsed_root is not None:
                    qasin = parse_asin(preparsed_root[1], log, durl)
                    if qasin == asin:
                        w = Worker(durl, result_queue, br, log, 0, domain,
                                   self, testing=testing, preparsed_root=preparsed_root, timeout=timeout)
                        try:
                            w.get_details()
                            return
                        except Exception:
                            log.exception(
                                'get_details failed for url: %r' % durl)
        func = self.search_search_engine if self.use_search_engine else self.search_amazon
        try:
            matches, query, domain, cover_url_processor = func(
                br, testing, log, abort, title, authors, identifiers, timeout)
        except SearchFailed:
            return

        if abort.is_set():
            return

        if not matches:
            if identifiers and title and authors:
                log('No matches found with identifiers, retrying using only'
                    ' title and authors. Query: %r' % query)
                time.sleep(1)
                return self.identify(log, result_queue, abort, title=title,
                                     authors=authors, timeout=timeout)
            log.error('No matches found with query: %r' % query)
            return

        workers = [Worker(
            url, result_queue, br, log, i, domain, self, testing=testing, timeout=timeout,
            cover_url_processor=cover_url_processor, filter_result=partial(
                self.filter_result, title, authors, identifiers)) for i, url in enumerate(matches)]

        for w in workers:
            # Don't send all requests at the same time
            time.sleep(1)
            w.start()
            if abort.is_set():
                return

        while not abort.is_set():
            a_worker_is_alive = False
            for w in workers:
                w.join(0.2)
                if abort.is_set():
                    break
                if w.is_alive():
                    a_worker_is_alive = True
            if not a_worker_is_alive:
                break

        return None
    # }}}

    def filter_result(self, title, authors, identifiers, mi, log):  # {{{
        if not self.use_search_engine:
            return True
        if title is not None:

            def tokenize_title(x):
                return icu_lower(x).replace("'", '').replace('"', '').rstrip(':')

            tokens = {tokenize_title(x) for x in title.split() if len(x) > 3}
            if tokens:
                result_tokens = {tokenize_title(x) for x in mi.title.split()}
                if not tokens.intersection(result_tokens):
                    log('Ignoring result:', mi.title, 'as its title does not match')
                    return False
        if authors:
            author_tokens = set()
            for author in authors:
                author_tokens |= {icu_lower(x) for x in author.split() if len(x) > 2}
            result_tokens = set()
            for author in mi.authors:
                result_tokens |= {icu_lower(x) for x in author.split() if len(x) > 2}
            if author_tokens and not author_tokens.intersection(result_tokens):
                log('Ignoring result:', mi.title, 'by', ' & '.join(mi.authors), 'as its author does not match')
                return False
        return True
    # }}}

    def download_cover(self, log, result_queue, abort,  # {{{
                       title=None, authors=None, identifiers={}, timeout=60, get_best_cover=False):
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log, rq, abort, title=title, authors=authors,
                          identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return
        log('Downloading cover from:', cached_url)
        br = self.browser
        if self.use_search_engine:
            br = br.clone_browser()
            br.set_current_header('Referer', self.referrer_for_domain(self.domain))
        try:
            time.sleep(1)
            cdata = br.open_novisit(
                cached_url, timeout=timeout).read()
            result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)
    # }}}


def manual_tests(domain, **kw):  # {{{
    # To run these tests use:
    # calibre-debug -c "from calibre.ebooks.metadata.sources.amazon import *; manual_tests('com')"
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
            isbn_test, title_test, authors_test, comments_test, series_test)
    all_tests = {}
    all_tests['com'] = [  # {{{
        (  # Paperback with series
            {'identifiers': {'amazon': '1423146786'}},
            [title_test('The Heroes of Olympus, Book Five The Blood of Olympus',
                        exact=True), series_test('The Heroes of Olympus', 5)]
        ),

        (  # Kindle edition with series
            {'identifiers': {'amazon': 'B0085UEQDO'}},
            [title_test('Three Parts Dead', exact=True),
             series_test('Craft Sequence', 1)]
        ),

        (  # + in title and uses id="main-image" for cover
            {'identifiers': {'amazon': '1933988770'}},
            [title_test(
                'C++ Concurrency in Action: Practical Multithreading', exact=True)]
        ),

        (  # Different comments markup, using Book Description section
            {'identifiers': {'amazon': '0982514506'}},
            [title_test(
                "Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy",
                exact=True),
             comments_test('Jelena'), comments_test('Ashinji'),
             ]
        ),

        (  # # in title
            {'title': 'Expert C# 2008 Business Objects',
             'authors': ['Lhotka']},
            [title_test('Expert C#'),
             authors_test(['Rockford Lhotka'])
             ]
        ),

        (  # No specific problems
            {'identifiers': {'isbn': '0743273567'}},
            [title_test('the great gatsby: the only authorized edition', exact=True),
             authors_test(['Francis Scott Fitzgerald'])]
        ),

    ]

    # }}}

    all_tests['de'] = [  # {{{
        (  # umlaut in title/authors
            {'title': 'Flüsternde Wälder',
             'authors': ['Nicola Förg']},
            [title_test('Flüsternde Wälder'),
             authors_test(['Nicola Förg'], subset=True)
             ]
        ),

        (
            {'identifiers': {'isbn': '9783453314979'}},
            [title_test('Die letzten Wächter: Roman',
                        exact=False), authors_test(['Sergej Lukianenko'])
             ]
        ),

        (
            {'identifiers': {'isbn': '3548283519'}},
            [title_test('Wer Wind Sät: Der Fünfte Fall Für Bodenstein Und Kirchhoff',
                        exact=False), authors_test(['Nele Neuhaus'])
             ]
        ),
    ]  # }}}

    all_tests['it'] = [  # {{{
        (
            {'identifiers': {'isbn': '8838922195'}},
            [title_test('La briscola in cinque',
                        exact=True), authors_test(['Marco Malvaldi'])
             ]
        ),
    ]  # }}}

    all_tests['fr'] = [  # {{{
        (
            {'identifiers': {'amazon_fr': 'B07L7ST4RS'}},
            [title_test('Le secret de Lola', exact=True),
             authors_test(['Amélie BRIZIO'])
             ]
        ),
        (
            {'identifiers': {'isbn': '2221116798'}},
            [title_test('L\'étrange voyage de Monsieur Daldry',
                        exact=True), authors_test(['Marc Levy'])
             ]
        ),
    ]  # }}}

    all_tests['es'] = [  # {{{
        (
            {'identifiers': {'isbn': '8483460831'}},
            [title_test('Tiempos Interesantes',
                        exact=False), authors_test(['Terry Pratchett'])
             ]
        ),
    ]  # }}}

    all_tests['se'] = [  # {{{
        (
            {'identifiers': {'isbn': '9780552140287'}},
            [title_test('Men At Arms: A Discworld Novel: 14',
                        exact=False), authors_test(['Terry Pratchett'])
             ]
        ),
    ]  # }}}

    all_tests['jp'] = [  # {{{
        (  # Adult filtering test
            {'identifiers': {'isbn': '4799500066'}},
            [title_test('Bitch Trap'), ]
        ),

        (  # isbn -> title, authors
            {'identifiers': {'isbn': '9784101302720'}},
            [title_test('精霊の守り人',
                        exact=True), authors_test(['上橋 菜穂子'])
             ]
        ),
        (  # title, authors -> isbn (will use Shift_JIS encoding in query.)
            {'title': '考えない練習',
             'authors': ['小池 龍之介']},
            [isbn_test('9784093881067'), ]
        ),
    ]  # }}}

    all_tests['br'] = [  # {{{
        (
            {'title': 'Guerra dos Tronos'},
            [title_test('A Guerra dos Tronos - As Crônicas de Gelo e Fogo',
                        exact=True), authors_test(['George R. R. Martin'])
             ]
        ),
    ]  # }}}

    all_tests['nl'] = [  # {{{
        (
            {'title': 'Freakonomics'},
            [title_test('Freakonomics',
                        exact=True), authors_test(['Steven Levitt & Stephen Dubner & R. Kuitenbrouwer & O. Brenninkmeijer & A. van Den Berg'])
             ]
        ),
    ]  # }}}

    all_tests['cn'] = [  # {{{
        (
            {'identifiers': {'isbn': '9787115369512'}},
            [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),
             authors_test(['[美]sam Williams', '邓楠,李凡希'])]
        ),
        (
            {'title': '爱上Raspberry Pi'},
            [title_test('爱上Raspberry Pi',
                        exact=True), authors_test(['Matt Richardson', 'Shawn Wallace', '李凡希'])
             ]
        ),
    ]  # }}}

    all_tests['ca'] = [  # {{{
        (  # Paperback with series
            {'identifiers': {'isbn': '9781623808747'}},
            [title_test('Parting Shot', exact=True),
             authors_test(['Mary Calmes'])]
        ),
        (  # # in title
            {'title': 'Expert C# 2008 Business Objects',
             'authors': ['Lhotka']},
            [title_test('Expert C# 2008 Business Objects'),
             authors_test(['Rockford Lhotka'])]
        ),
        (  # noscript description
            {'identifiers': {'amazon_ca': '162380874X'}},
            [title_test('Parting Shot', exact=True), authors_test(['Mary Calmes'])
             ]
        ),
    ]  # }}}

    def do_test(domain, start=0, stop=None, server='auto'):
        tests = all_tests[domain]
        if stop is None:
            stop = len(tests)
        tests = tests[start:stop]
        test_identify_plugin(Amazon.name, tests, modify_plugin=lambda p: (
            setattr(p, 'testing_domain', domain),
            setattr(p, 'touched_fields', p.touched_fields - {'tags'}),
            setattr(p, 'testing_server', server),
        ))

    do_test(domain, **kw)
# }}}
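

if __name__ == '__main__':
    # Convenience entry point (an assumption, not part of calibre's plugin
    # API): running this file with calibre-debug, e.g.
    #   calibre-debug amazon.py
    # is equivalent to the calibre-debug -c invocation shown in
    # manual_tests() above.
    manual_tests('com')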