1#!/usr/local/bin/python3.8
2# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
3# License: GPLv3 Copyright: 2011, Kovid Goyal <kovid at kovidgoyal.net>
4from __future__ import absolute_import, division, print_function, unicode_literals
5
6import re
7import socket
8import time
9from functools import partial
10try:
11    from queue import Empty, Queue
12except ImportError:
13    from Queue import Empty, Queue
14from threading import Thread
15try:
16    from urllib.parse import urlparse
17except ImportError:
18    from urlparse import urlparse
19
20from calibre import as_unicode, browser, random_user_agent, xml_replace_entities
21from calibre.ebooks.metadata import check_isbn
22from calibre.ebooks.metadata.book.base import Metadata
23from calibre.ebooks.metadata.sources.base import Option, Source, fixauthors, fixcase
24from calibre.utils.localization import canonicalize_lang
25from calibre.utils.random_ua import accept_header_for_ua
26from calibre.ebooks.oeb.base import urlquote
27
28
29def iri_quote_plus(url):
30    ans = urlquote(url)
31    if isinstance(ans, bytes):
32        ans = ans.decode('utf-8')
33    return ans.replace('%20', '+')
34
35
36def user_agent_is_ok(ua):
37    return 'Mobile/' not in ua and 'Mobile ' not in ua
38
39
40class CaptchaError(Exception):
41    pass
42
43
44class SearchFailed(ValueError):
45    pass
46
47
48def parse_html(raw):
49    try:
50        from html5_parser import parse
51    except ImportError:
52        # Old versions of calibre
53        import html5lib
54        return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
55    else:
56        return parse(raw)
57
58
59def parse_details_page(url, log, timeout, browser, domain):
60    from calibre.utils.cleantext import clean_ascii_chars
61    from calibre.ebooks.chardet import xml_to_unicode
62    from lxml.html import tostring
63    log('Getting details from:', url)
64    try:
65        raw = browser.open_novisit(url, timeout=timeout).read().strip()
66    except Exception as e:
67        if callable(getattr(e, 'getcode', None)) and \
68                e.getcode() == 404:
69            log.error('URL malformed: %r' % url)
70            return
71        attr = getattr(e, 'args', [None])
72        attr = attr if attr else [None]
73        if isinstance(attr[0], socket.timeout):
74            msg = 'Details page timed out. Try again later.'
75            log.error(msg)
76        else:
77            msg = 'Failed to make details query: %r' % url
78            log.exception(msg)
79        return
80
81    oraw = raw
82    if 'amazon.com.br' in url:
83        # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
84        raw = raw.decode('utf-8')
85    raw = xml_to_unicode(raw, strip_encoding_pats=True,
86                         resolve_entities=True)[0]
87    if '<title>404 - ' in raw:
88        raise ValueError('URL malformed: %r' % url)
89    if '>Could not find the requested document in the cache.<' in raw:
90        raise ValueError('No cached entry for %s found' % url)
91
92    try:
93        root = parse_html(clean_ascii_chars(raw))
94    except Exception:
95        msg = 'Failed to parse amazon details page: %r' % url
96        log.exception(msg)
97        return
98    if domain == 'jp':
99        for a in root.xpath('//a[@href]'):
100            if 'black-curtain-redirect.html' in a.get('href'):
101                url = a.get('href')
102                if url:
103                    if url.startswith('/'):
104                        url = 'https://amazon.co.jp' + a.get('href')
105                    log('Black curtain redirect found, following')
106                    return parse_details_page(url, log, timeout, browser, domain)
107
108    errmsg = root.xpath('//*[@id="errorMessage"]')
109    if errmsg:
110        msg = 'Failed to parse amazon details page: %r' % url
111        msg += tostring(errmsg, method='text', encoding='unicode').strip()
112        log.error(msg)
113        return
114
115    from css_selectors import Select
116    selector = Select(root)
117    return oraw, root, selector
118
119
120def parse_asin(root, log, url):
121    try:
122        link = root.xpath('//link[@rel="canonical" and @href]')
123        for l in link:
124            return l.get('href').rpartition('/')[-1]
125    except Exception:
126        log.exception('Error parsing ASIN for url: %r' % url)
127
128
129class Worker(Thread):  # Get details {{{
130
131    '''
132    Get book details from amazons book page in a separate thread
133    '''
134
135    def __init__(self, url, result_queue, browser, log, relevance, domain,
136                 plugin, timeout=20, testing=False, preparsed_root=None,
137                 cover_url_processor=None, filter_result=None):
138        Thread.__init__(self)
139        self.cover_url_processor = cover_url_processor
140        self.preparsed_root = preparsed_root
141        self.daemon = True
142        self.testing = testing
143        self.url, self.result_queue = url, result_queue
144        self.log, self.timeout = log, timeout
145        self.filter_result = filter_result or (lambda x, log: True)
146        self.relevance, self.plugin = relevance, plugin
147        self.browser = browser
148        self.cover_url = self.amazon_id = self.isbn = None
149        self.domain = domain
150        from lxml.html import tostring
151        self.tostring = tostring
152
153        months = {  # {{{
154            'de': {
155                1: ['jän', 'januar'],
156                2: ['februar'],
157                3: ['märz'],
158                5: ['mai'],
159                6: ['juni'],
160                7: ['juli'],
161                10: ['okt', 'oktober'],
162                12: ['dez', 'dezember']
163            },
164            'it': {
165                1: ['gennaio', 'enn'],
166                2: ['febbraio', 'febbr'],
167                3: ['marzo'],
168                4: ['aprile'],
169                5: ['maggio', 'magg'],
170                6: ['giugno'],
171                7: ['luglio'],
172                8: ['agosto', 'ag'],
173                9: ['settembre', 'sett'],
174                10: ['ottobre', 'ott'],
175                11: ['novembre'],
176                12: ['dicembre', 'dic'],
177            },
178            'fr': {
179                1: ['janv'],
180                2: ['févr'],
181                3: ['mars'],
182                4: ['avril'],
183                5: ['mai'],
184                6: ['juin'],
185                7: ['juil'],
186                8: ['août'],
187                9: ['sept'],
188                12: ['déc'],
189            },
190            'br': {
191                1: ['janeiro'],
192                2: ['fevereiro'],
193                3: ['março'],
194                4: ['abril'],
195                5: ['maio'],
196                6: ['junho'],
197                7: ['julho'],
198                8: ['agosto'],
199                9: ['setembro'],
200                10: ['outubro'],
201                11: ['novembro'],
202                12: ['dezembro'],
203            },
204            'es': {
205                1: ['enero'],
206                2: ['febrero'],
207                3: ['marzo'],
208                4: ['abril'],
209                5: ['mayo'],
210                6: ['junio'],
211                7: ['julio'],
212                8: ['agosto'],
213                9: ['septiembre', 'setiembre'],
214                10: ['octubre'],
215                11: ['noviembre'],
216                12: ['diciembre'],
217            },
218            'se': {
219                1: ['januari'],
220                2: ['februari'],
221                3: ['mars'],
222                4: ['april'],
223                5: ['maj'],
224                6: ['juni'],
225                7: ['juli'],
226                8: ['augusti'],
227                9: ['september'],
228                10: ['oktober'],
229                11: ['november'],
230                12: ['december'],
231            },
232            'jp': {
233                1: ['1月'],
234                2: ['2月'],
235                3: ['3月'],
236                4: ['4月'],
237                5: ['5月'],
238                6: ['6月'],
239                7: ['7月'],
240                8: ['8月'],
241                9: ['9月'],
242                10: ['10月'],
243                11: ['11月'],
244                12: ['12月'],
245            },
246            'nl': {
247                1: ['januari'], 2: ['februari'], 3: ['maart'], 5: ['mei'], 6: ['juni'], 7: ['juli'], 8: ['augustus'], 10: ['oktober'],
248            }
249
250        }  # }}}
251
252        self.english_months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
253                               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
254        self.months = months.get(self.domain, {})
255
256        self.pd_xpath = '''
257            //h2[text()="Product Details" or \
258                 text()="Produktinformation" or \
259                 text()="Dettagli prodotto" or \
260                 text()="Product details" or \
261                 text()="Détails sur le produit" or \
262                 text()="Detalles del producto" or \
263                 text()="Detalhes do produto" or \
264                 text()="Productgegevens" or \
265                 text()="基本信息" or \
266                 starts-with(text(), "登録情報")]/../div[@class="content"]
267            '''
268        # Editor: is for Spanish
269        self.publisher_xpath = '''
270            descendant::*[starts-with(text(), "Publisher:") or \
271                    starts-with(text(), "Verlag:") or \
272                    starts-with(text(), "Editore:") or \
273                    starts-with(text(), "Editeur") or \
274                    starts-with(text(), "Editor:") or \
275                    starts-with(text(), "Editora:") or \
276                    starts-with(text(), "Uitgever:") or \
277                    starts-with(text(), "Utgivare:") or \
278                    starts-with(text(), "出版社:")]
279            '''
280        self.pubdate_xpath = '''
281            descendant::*[starts-with(text(), "Publication Date:") or \
282                    starts-with(text(), "Audible.com Release Date:")]
283        '''
284        self.publisher_names = {'Publisher', 'Uitgever', 'Verlag', 'Utgivare',
285                                'Editore', 'Editeur', 'Editor', 'Editora', '出版社'}
286
287        self.language_xpath =    '''
288            descendant::*[
289                starts-with(text(), "Language:") \
290                or text() = "Language" \
291                or text() = "Sprache:" \
292                or text() = "Lingua:" \
293                or text() = "Idioma:" \
294                or starts-with(text(), "Langue") \
295                or starts-with(text(), "言語") \
296                or starts-with(text(), "Språk") \
297                or starts-with(text(), "语种")
298                ]
299            '''
300        self.language_names = {'Language', 'Sprache', 'Språk',
301                               'Lingua', 'Idioma', 'Langue', '言語', 'Taal', '语种'}
302
303        self.tags_xpath = '''
304            descendant::h2[
305                text() = "Look for Similar Items by Category" or
306                text() = "Ähnliche Artikel finden" or
307                text() = "Buscar productos similares por categoría" or
308                text() = "Ricerca articoli simili per categoria" or
309                text() = "Rechercher des articles similaires par rubrique" or
310                text() = "Procure por items similares por categoria" or
311                text() = "関連商品を探す"
312            ]/../descendant::ul/li
313        '''
314
315        self.ratings_pat = re.compile(
316            r'([0-9.,]+) ?(out of|von|van|su|étoiles sur|つ星のうち|de un máximo de|de|av) '
317            r'([\d\.]+)( (stars|Sternen|stelle|estrellas|estrelas|sterren|stjärnor)){0,1}'
318        )
319        self.ratings_pat_cn = re.compile('([0-9.]+) 颗星,最多 5 颗星')
320        self.ratings_pat_jp = re.compile(r'\d+つ星のうち([\d\.]+)')
321
322        lm = {
323            'eng': ('English', 'Englisch', 'Engels', 'Engelska'),
324            'fra': ('French', 'Français'),
325            'ita': ('Italian', 'Italiano'),
326            'deu': ('German', 'Deutsch'),
327            'spa': ('Spanish', 'Espa\xf1ol', 'Espaniol'),
328            'jpn': ('Japanese', '日本語'),
329            'por': ('Portuguese', 'Português'),
330            'nld': ('Dutch', 'Nederlands',),
331            'chs': ('Chinese', '中文', '简体中文'),
332            'swe': ('Swedish', 'Svenska'),
333        }
334        self.lang_map = {}
335        for code, names in lm.items():
336            for name in names:
337                self.lang_map[name] = code
338
339        self.series_pat = re.compile(
340            r'''
341                \|\s*              # Prefix
342                (Series)\s*:\s*    # Series declaration
343                (?P<series>.+?)\s+  # The series name
344                \((Book)\s*    # Book declaration
345                (?P<index>[0-9.]+) # Series index
346                \s*\)
347                ''', re.X)
348
349    def delocalize_datestr(self, raw):
350        if self.domain == 'cn':
351            return raw.replace('年', '-').replace('月', '-').replace('日', '')
352        if not self.months:
353            return raw
354        ans = raw.lower()
355        for i, vals in self.months.items():
356            for x in vals:
357                ans = ans.replace(x, self.english_months[i])
358        ans = ans.replace(' de ', ' ')
359        return ans
360
361    def run(self):
362        try:
363            self.get_details()
364        except:
365            self.log.exception('get_details failed for url: %r' % self.url)
366
367    def get_details(self):
368        if self.preparsed_root is None:
369            raw, root, selector = parse_details_page(
370                self.url, self.log, self.timeout, self.browser, self.domain)
371        else:
372            raw, root, selector = self.preparsed_root
373
374        from css_selectors import Select
375        self.selector = Select(root)
376        self.parse_details(raw, root)
377
378    def parse_details(self, raw, root):
379        asin = parse_asin(root, self.log, self.url)
380        if not asin and root.xpath('//form[@action="/errors/validateCaptcha"]'):
381            raise CaptchaError(
382                'Amazon returned a CAPTCHA page, probably because you downloaded too many books. Wait for some time and try again.')
383        if self.testing:
384            import tempfile
385            import uuid
386            with tempfile.NamedTemporaryFile(prefix=(asin or type('')(uuid.uuid4())) + '_',
387                                             suffix='.html', delete=False) as f:
388                f.write(raw)
389            print('Downloaded html for', asin, 'saved in', f.name)
390
391        try:
392            title = self.parse_title(root)
393        except:
394            self.log.exception('Error parsing title for url: %r' % self.url)
395            title = None
396
397        try:
398            authors = self.parse_authors(root)
399        except:
400            self.log.exception('Error parsing authors for url: %r' % self.url)
401            authors = []
402
403        if not title or not authors or not asin:
404            self.log.error(
405                'Could not find title/authors/asin for %r' % self.url)
406            self.log.error('ASIN: %r Title: %r Authors: %r' % (asin, title,
407                                                               authors))
408            return
409
410        mi = Metadata(title, authors)
411        idtype = 'amazon' if self.domain == 'com' else 'amazon_' + self.domain
412        mi.set_identifier(idtype, asin)
413        self.amazon_id = asin
414
415        try:
416            mi.rating = self.parse_rating(root)
417        except:
418            self.log.exception('Error parsing ratings for url: %r' % self.url)
419
420        try:
421            mi.comments = self.parse_comments(root, raw)
422        except:
423            self.log.exception('Error parsing comments for url: %r' % self.url)
424
425        try:
426            series, series_index = self.parse_series(root)
427            if series:
428                mi.series, mi.series_index = series, series_index
429            elif self.testing:
430                mi.series, mi.series_index = 'Dummy series for testing', 1
431        except:
432            self.log.exception('Error parsing series for url: %r' % self.url)
433
434        try:
435            mi.tags = self.parse_tags(root)
436        except:
437            self.log.exception('Error parsing tags for url: %r' % self.url)
438
439        try:
440            self.cover_url = self.parse_cover(root, raw)
441        except:
442            self.log.exception('Error parsing cover for url: %r' % self.url)
443        if self.cover_url_processor is not None and self.cover_url and self.cover_url.startswith('/'):
444            self.cover_url = self.cover_url_processor(self.cover_url)
445        mi.has_cover = bool(self.cover_url)
446
447        detail_bullets = root.xpath('//*[@data-feature-name="detailBullets"]')
448        non_hero = tuple(self.selector(
449            'div#bookDetails_container_div div#nonHeroSection')) or tuple(self.selector(
450                '#productDetails_techSpec_sections'))
451        if detail_bullets:
452            self.parse_detail_bullets(root, mi, detail_bullets[0])
453        elif non_hero:
454            try:
455                self.parse_new_details(root, mi, non_hero[0])
456            except:
457                self.log.exception(
458                    'Failed to parse new-style book details section')
459
460        else:
461            pd = root.xpath(self.pd_xpath)
462            if pd:
463                pd = pd[0]
464
465                try:
466                    isbn = self.parse_isbn(pd)
467                    if isbn:
468                        self.isbn = mi.isbn = isbn
469                except:
470                    self.log.exception(
471                        'Error parsing ISBN for url: %r' % self.url)
472
473                try:
474                    mi.publisher = self.parse_publisher(pd)
475                except:
476                    self.log.exception(
477                        'Error parsing publisher for url: %r' % self.url)
478
479                try:
480                    mi.pubdate = self.parse_pubdate(pd)
481                except:
482                    self.log.exception(
483                        'Error parsing publish date for url: %r' % self.url)
484
485                try:
486                    lang = self.parse_language(pd)
487                    if lang:
488                        mi.language = lang
489                except:
490                    self.log.exception(
491                        'Error parsing language for url: %r' % self.url)
492
493            else:
494                self.log.warning(
495                    'Failed to find product description for url: %r' % self.url)
496
497        mi.source_relevance = self.relevance
498
499        if self.amazon_id:
500            if self.isbn:
501                self.plugin.cache_isbn_to_identifier(self.isbn, self.amazon_id)
502            if self.cover_url:
503                self.plugin.cache_identifier_to_cover_url(self.amazon_id,
504                                                          self.cover_url)
505
506        self.plugin.clean_downloaded_metadata(mi)
507
508        if self.filter_result(mi, self.log):
509            self.result_queue.put(mi)
510
511    def totext(self, elem, only_printable=False):
512        res = self.tostring(elem, encoding='unicode', method='text')
513        if only_printable:
514            filtered_characters = list(s for s in res if s.isprintable())
515            res = ''.join(filtered_characters).strip()
516        return res
517
518    def parse_title(self, root):
519
520        def sanitize_title(title):
521            ans = re.sub(r'[(\[].*[)\]]', '', title).strip()
522            if not ans:
523                ans = title.rpartition('[')[0].strip()
524            return ans
525
526        h1 = root.xpath('//h1[@id="title"]')
527        if h1:
528            h1 = h1[0]
529            for child in h1.xpath('./*[contains(@class, "a-color-secondary")]'):
530                h1.remove(child)
531            return sanitize_title(self.totext(h1))
532        tdiv = root.xpath('//h1[contains(@class, "parseasinTitle")]')
533        if not tdiv:
534            span = root.xpath('//*[@id="ebooksTitle"]')
535            if span:
536                return sanitize_title(self.totext(span[0]))
537            raise ValueError('No title block found')
538        tdiv = tdiv[0]
539        actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
540        if actual_title:
541            title = self.tostring(actual_title[0], encoding='unicode',
542                                  method='text').strip()
543        else:
544            title = self.tostring(tdiv, encoding='unicode',
545                                  method='text').strip()
546        return sanitize_title(title)
547
548    def parse_authors(self, root):
549        for sel in (
550                '#byline .author .contributorNameID',
551                '#byline .author a.a-link-normal',
552                '#bylineInfo .author .contributorNameID',
553                '#bylineInfo .author a.a-link-normal',
554                '#bylineInfo #bylineContributor',
555        ):
556            matches = tuple(self.selector(sel))
557            if matches:
558                authors = [self.totext(x) for x in matches]
559                return [a for a in authors if a]
560
561        x = '//h1[contains(@class, "parseasinTitle")]/following-sibling::span/*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]'
562        aname = root.xpath(x)
563        if not aname:
564            aname = root.xpath('''
565            //h1[contains(@class, "parseasinTitle")]/following-sibling::*[(name()="a" and @href) or (name()="span" and @class="contributorNameTrigger")]
566                    ''')
567        for x in aname:
568            x.tail = ''
569        authors = [self.tostring(x, encoding='unicode', method='text').strip() for x
570                   in aname]
571        authors = [a for a in authors if a]
572        return authors
573
574    def parse_rating(self, root):
575        for x in root.xpath('//div[@id="cpsims-feature" or @id="purchase-sims-feature" or @id="rhf"]'):
576            # Remove the similar books section as it can cause spurious
577            # ratings matches
578            x.getparent().remove(x)
579
580        rating_paths = (
581            '//div[@data-feature-name="averageCustomerReviews" or @id="averageCustomerReviews"]',
582            '//div[@class="jumpBar"]/descendant::span[contains(@class,"asinReviewsSummary")]',
583            '//div[@class="buying"]/descendant::span[contains(@class,"asinReviewsSummary")]',
584            '//span[@class="crAvgStars"]/descendant::span[contains(@class,"asinReviewsSummary")]'
585        )
586        ratings = None
587        for p in rating_paths:
588            ratings = root.xpath(p)
589            if ratings:
590                break
591
592        def parse_ratings_text(text):
593            try:
594                m = self.ratings_pat.match(text)
595                return float(m.group(1).replace(',', '.')) / float(m.group(3)) * 5
596            except Exception:
597                pass
598
599        if ratings:
600            ratings = ratings[0]
601            for elem in ratings.xpath('descendant::*[@title]'):
602                t = elem.get('title').strip()
603                if self.domain == 'cn':
604                    m = self.ratings_pat_cn.match(t)
605                    if m is not None:
606                        return float(m.group(1))
607                elif self.domain == 'jp':
608                    m = self.ratings_pat_jp.match(t)
609                    if m is not None:
610                        return float(m.group(1))
611                else:
612                    ans = parse_ratings_text(t)
613                    if ans is not None:
614                        return ans
615            for elem in ratings.xpath('descendant::span[@class="a-icon-alt"]'):
616                t = self.tostring(
617                    elem, encoding='unicode', method='text', with_tail=False).strip()
618                ans = parse_ratings_text(t)
619                if ans is not None:
620                    return ans
621
622    def _render_comments(self, desc):
623        from calibre.library.comments import sanitize_comments_html
624
625        for c in desc.xpath('descendant::noscript'):
626            c.getparent().remove(c)
627        for c in desc.xpath('descendant::*[@class="seeAll" or'
628                            ' @class="emptyClear" or @id="collapsePS" or'
629                            ' @id="expandPS"]'):
630            c.getparent().remove(c)
631        for b in desc.xpath('descendant::b[@style]'):
632            # Bing highlights search results
633            s = b.get('style', '')
634            if 'color' in s:
635                b.tag = 'span'
636                del b.attrib['style']
637
638        for a in desc.xpath('descendant::a[@href]'):
639            del a.attrib['href']
640            a.tag = 'span'
641        desc = self.tostring(desc, method='html', encoding='unicode').strip()
642        desc = xml_replace_entities(desc, 'utf-8')
643
644        # Encoding bug in Amazon data U+fffd (replacement char)
645        # in some examples it is present in place of '
646        desc = desc.replace('\ufffd', "'")
647        # remove all attributes from tags
648        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
649        # Collapse whitespace
650        # desc = re.sub('\n+', '\n', desc)
651        # desc = re.sub(' +', ' ', desc)
652        # Remove the notice about text referring to out of print editions
653        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
654        # Remove comments
655        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
656        return sanitize_comments_html(desc)
657
658    def parse_comments(self, root, raw):
659        try:
660            from urllib.parse import unquote
661        except ImportError:
662            from urllib import unquote
663        ans = ''
664        ns = tuple(self.selector('#bookDescription_feature_div noscript'))
665        if ns:
666            ns = ns[0]
667            if len(ns) == 0 and ns.text:
668                import html5lib
669                # html5lib parsed noscript as CDATA
670                ns = html5lib.parseFragment(
671                    '<div>%s</div>' % (ns.text), treebuilder='lxml', namespaceHTMLElements=False)[0]
672            else:
673                ns.tag = 'div'
674            ans = self._render_comments(ns)
675        else:
676            desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
677            if desc:
678                ans = self._render_comments(desc[0])
679            else:
680                ns = tuple(self.selector('#bookDescription_feature_div .a-expander-content'))
681                if ns:
682                    ans = self._render_comments(ns[0])
683
684        desc = root.xpath(
685            '//div[@id="productDescription"]/*[@class="content"]')
686        if desc:
687            ans += self._render_comments(desc[0])
688        else:
689            # Idiot chickens from amazon strike again. This data is now stored
690            # in a JS variable inside a script tag URL encoded.
691            m = re.search(br'var\s+iframeContent\s*=\s*"([^"]+)"', raw)
692            if m is not None:
693                try:
694                    text = unquote(m.group(1)).decode('utf-8')
695                    nr = parse_html(text)
696                    desc = nr.xpath(
697                        '//div[@id="productDescription"]/*[@class="content"]')
698                    if desc:
699                        ans += self._render_comments(desc[0])
700                except Exception as e:
701                    self.log.warn(
702                        'Parsing of obfuscated product description failed with error: %s' % as_unicode(e))
703            else:
704                desc = root.xpath('//div[@id="productDescription_fullView"]')
705                if desc:
706                    ans += self._render_comments(desc[0])
707
708        return ans
709
710    def parse_series(self, root):
711        ans = (None, None)
712
713        # This is found on the paperback/hardback pages for books on amazon.com
714        series = root.xpath('//div[@data-feature-name="seriesTitle"]')
715        if series:
716            series = series[0]
717            spans = series.xpath('./span')
718            if spans:
719                raw = self.tostring(
720                    spans[0], encoding='unicode', method='text', with_tail=False).strip()
721                m = re.search(r'\s+([0-9.]+)$', raw.strip())
722                if m is not None:
723                    series_index = float(m.group(1))
724                    s = series.xpath('./a[@id="series-page-link"]')
725                    if s:
726                        series = self.tostring(
727                            s[0], encoding='unicode', method='text', with_tail=False).strip()
728                        if series:
729                            ans = (series, series_index)
730        else:
731            series = root.xpath('//div[@id="seriesBulletWidget_feature_div"]')
732            if series:
733                a = series[0].xpath('descendant::a')
734                if a:
735                    raw = self.tostring(a[0], encoding='unicode', method='text', with_tail=False)
736                    m = re.search(r'(?:Book|Libro)\s+(?P<index>[0-9.]+)\s+(?:of|de)\s+([0-9.]+)\s*:\s*(?P<series>.+)', raw.strip())
737                    if m is not None:
738                        ans = (m.group('series').strip(), float(m.group('index')))
739
740        # This is found on Kindle edition pages on amazon.com
741        if ans == (None, None):
742            for span in root.xpath('//div[@id="aboutEbooksSection"]//li/span'):
743                text = (span.text or '').strip()
744                m = re.match(r'Book\s+([0-9.]+)', text)
745                if m is not None:
746                    series_index = float(m.group(1))
747                    a = span.xpath('./a[@href]')
748                    if a:
749                        series = self.tostring(
750                            a[0], encoding='unicode', method='text', with_tail=False).strip()
751                        if series:
752                            ans = (series, series_index)
753        # This is found on newer Kindle edition pages on amazon.com
754        if ans == (None, None):
755            for b in root.xpath('//div[@id="reviewFeatureGroup"]/span/b'):
756                text = (b.text or '').strip()
757                m = re.match(r'Book\s+([0-9.]+)', text)
758                if m is not None:
759                    series_index = float(m.group(1))
760                    a = b.getparent().xpath('./a[@href]')
761                    if a:
762                        series = self.tostring(
763                            a[0], encoding='unicode', method='text', with_tail=False).partition('(')[0].strip()
764                        if series:
765                            ans = series, series_index
766
767        if ans == (None, None):
768            desc = root.xpath('//div[@id="ps-content"]/div[@class="buying"]')
769            if desc:
770                raw = self.tostring(desc[0], method='text', encoding='unicode')
771                raw = re.sub(r'\s+', ' ', raw)
772                match = self.series_pat.search(raw)
773                if match is not None:
774                    s, i = match.group('series'), float(match.group('index'))
775                    if s:
776                        ans = (s, i)
777        if ans[0]:
778            ans = (re.sub(r'\s+Series$', '', ans[0]).strip(), ans[1])
779            ans = (re.sub(r'\(.+?\s+Series\)$', '', ans[0]).strip(), ans[1])
780        return ans
781
782    def parse_tags(self, root):
783        ans = []
784        exclude_tokens = {'kindle', 'a-z'}
785        exclude = {'special features', 'by authors',
786                   'authors & illustrators', 'books', 'new; used & rental textbooks'}
787        seen = set()
788        for li in root.xpath(self.tags_xpath):
789            for i, a in enumerate(li.iterdescendants('a')):
790                if i > 0:
791                    # we ignore the first category since it is almost always
792                    # too broad
793                    raw = (a.text or '').strip().replace(',', ';')
794                    lraw = icu_lower(raw)
795                    tokens = frozenset(lraw.split())
796                    if raw and lraw not in exclude and not tokens.intersection(exclude_tokens) and lraw not in seen:
797                        ans.append(raw)
798                        seen.add(lraw)
799        return ans
800
801    def parse_cover(self, root, raw=b""):
802        # Look for the image URL in javascript, using the first image in the
803        # image gallery as the cover
804        import json
805        imgpat = re.compile(r"""'imageGalleryData'\s*:\s*(\[\s*{.+])""")
806        for script in root.xpath('//script'):
807            m = imgpat.search(script.text or '')
808            if m is not None:
809                try:
810                    return json.loads(m.group(1))[0]['mainUrl']
811                except Exception:
812                    continue
813
814        def clean_img_src(src):
815            parts = src.split('/')
816            if len(parts) > 3:
817                bn = parts[-1]
818                sparts = bn.split('_')
819                if len(sparts) > 2:
820                    bn = re.sub(r'\.\.jpg$', '.jpg', (sparts[0] + sparts[-1]))
821                    return ('/'.join(parts[:-1])) + '/' + bn
822
823        imgpat2 = re.compile(r'var imageSrc = "([^"]+)"')
824        for script in root.xpath('//script'):
825            m = imgpat2.search(script.text or '')
826            if m is not None:
827                src = m.group(1)
828                url = clean_img_src(src)
829                if url:
830                    return url
831
832        imgs = root.xpath(
833            '//img[(@id="prodImage" or @id="original-main-image" or @id="main-image" or @id="main-image-nonjs") and @src]')
834        if not imgs:
835            imgs = (
836                root.xpath('//div[@class="main-image-inner-wrapper"]/img[@src]') or
837                root.xpath('//div[@id="main-image-container" or @id="ebooks-main-image-container"]//img[@src]') or
838                root.xpath(
839                    '//div[@id="mainImageContainer"]//img[@data-a-dynamic-image]')
840            )
841            for img in imgs:
842                try:
843                    idata = json.loads(img.get('data-a-dynamic-image'))
844                except Exception:
845                    imgs = ()
846                else:
847                    mwidth = 0
848                    try:
849                        url = None
850                        for iurl, (width, height) in idata.items():
851                            if width > mwidth:
852                                mwidth = width
853                                url = iurl
854                        return url
855                    except Exception:
856                        pass
857
858        for img in imgs:
859            src = img.get('src')
860            if 'data:' in src:
861                continue
862            if 'loading-' in src:
863                js_img = re.search(br'"largeImage":"(https?://[^"]+)",', raw)
864                if js_img:
865                    src = js_img.group(1).decode('utf-8')
866            if ('/no-image-avail' not in src and 'loading-' not in src and '/no-img-sm' not in src):
867                self.log('Found image: %s' % src)
868                url = clean_img_src(src)
869                if url:
870                    return url
871
872    def parse_detail_bullets(self, root, mi, container):
873        ul = next(self.selector('.detail-bullet-list', root=container))
874        for span in self.selector('.a-list-item', root=ul):
875            cells = span.xpath('./span')
876            if len(cells) >= 2:
877                self.parse_detail_cells(mi, cells[0], cells[1])
878
879    def parse_new_details(self, root, mi, non_hero):
880        table = non_hero.xpath('descendant::table')[0]
881        for tr in table.xpath('descendant::tr'):
882            cells = tr.xpath('descendant::*[local-name()="td" or local-name()="th"]')
883            if len(cells) == 2:
884                self.parse_detail_cells(mi, cells[0], cells[1])
885
886    def parse_detail_cells(self, mi, c1, c2):
887        name = self.totext(c1, only_printable=True).strip().strip(':').strip()
888        val = self.totext(c2).strip()
889        if not val:
890            return
891        if name in self.language_names:
892            ans = self.lang_map.get(val, None)
893            if not ans:
894                ans = canonicalize_lang(val)
895            if ans:
896                mi.language = ans
897        elif name in self.publisher_names:
898            pub = val.partition(';')[0].partition('(')[0].strip()
899            if pub:
900                mi.publisher = pub
901            date = val.rpartition('(')[-1].replace(')', '').strip()
902            try:
903                from calibre.utils.date import parse_only_date
904                date = self.delocalize_datestr(date)
905                mi.pubdate = parse_only_date(date, assume_utc=True)
906            except:
907                self.log.exception('Failed to parse pubdate: %s' % val)
908        elif name in {'ISBN', 'ISBN-10', 'ISBN-13'}:
909            ans = check_isbn(val)
910            if ans:
911                self.isbn = mi.isbn = ans
912        elif name in {'Publication date'}:
913            from calibre.utils.date import parse_only_date
914            date = self.delocalize_datestr(val)
915            mi.pubdate = parse_only_date(date, assume_utc=True)
916
917    def parse_isbn(self, pd):
918        items = pd.xpath(
919            'descendant::*[starts-with(text(), "ISBN")]')
920        if not items:
921            items = pd.xpath(
922                'descendant::b[contains(text(), "ISBN:")]')
923        for x in reversed(items):
924            if x.tail:
925                ans = check_isbn(x.tail.strip())
926                if ans:
927                    return ans
928
929    def parse_publisher(self, pd):
930        for x in reversed(pd.xpath(self.publisher_xpath)):
931            if x.tail:
932                ans = x.tail.partition(';')[0]
933                return ans.partition('(')[0].strip()
934
935    def parse_pubdate(self, pd):
936        from calibre.utils.date import parse_only_date
937        for x in reversed(pd.xpath(self.pubdate_xpath)):
938            if x.tail:
939                date = x.tail.strip()
940                date = self.delocalize_datestr(date)
941                try:
942                    return parse_only_date(date, assume_utc=True)
943                except Exception:
944                    pass
945        for x in reversed(pd.xpath(self.publisher_xpath)):
946            if x.tail:
947                ans = x.tail
948                date = ans.rpartition('(')[-1].replace(')', '').strip()
949                date = self.delocalize_datestr(date)
950                try:
951                    return parse_only_date(date, assume_utc=True)
952                except Exception:
953                    pass
954
955    def parse_language(self, pd):
956        for x in reversed(pd.xpath(self.language_xpath)):
957            if x.tail:
958                raw = x.tail.strip().partition(',')[0].strip()
959                ans = self.lang_map.get(raw, None)
960                if ans:
961                    return ans
962                ans = canonicalize_lang(ans)
963                if ans:
964                    return ans
965# }}}
966
967
968class Amazon(Source):
969
970    name = 'Amazon.com'
971    version = (1, 2, 22)
972    minimum_calibre_version = (2, 82, 0)
973    description = _('Downloads metadata and covers from Amazon')
974
975    capabilities = frozenset(('identify', 'cover'))
976    touched_fields = frozenset(('title', 'authors', 'identifier:amazon',
977        'rating', 'comments', 'publisher', 'pubdate',
978        'languages', 'series', 'tags'))
979    has_html_comments = True
980    supports_gzip_transfer_encoding = True
981    prefer_results_with_isbn = False
982
983    AMAZON_DOMAINS = {
984        'com': _('US'),
985        'fr': _('France'),
986        'de': _('Germany'),
987        'uk': _('UK'),
988        'au': _('Australia'),
989        'it': _('Italy'),
990        'jp': _('Japan'),
991        'es': _('Spain'),
992        'br': _('Brazil'),
993        'nl': _('Netherlands'),
994        'cn': _('China'),
995        'ca': _('Canada'),
996        'se': _('Sweden'),
997    }
998
999    SERVERS = {
1000        'auto': _('Choose server automatically'),
1001        'amazon': _('Amazon servers'),
1002        'bing': _('Bing search cache'),
1003        'google': _('Google search cache'),
1004        'wayback': _('Wayback machine cache (slow)'),
1005    }
1006
1007    options = (
1008        Option('domain', 'choices', 'com', _('Amazon country website to use:'),
1009               _('Metadata from Amazon will be fetched using this '
1010                 'country\'s Amazon website.'), choices=AMAZON_DOMAINS),
1011        Option('server', 'choices', 'auto', _('Server to get data from:'),
1012               _(
1013                   'Amazon has started blocking attempts to download'
1014                   ' metadata from its servers. To get around this problem,'
1015                   ' calibre can fetch the Amazon data from many different'
1016                   ' places where it is cached. Choose the source you prefer.'
1017               ), choices=SERVERS),
1018        Option('use_mobi_asin', 'bool', False, _('Use the MOBI-ASIN for metadata search'),
1019               _(
1020                   'Enable this option to search for metadata with an'
1021                   ' ASIN identifier from the MOBI file at the current country website,'
1022                   ' unless any other amazon id is available. Note that if the'
1023                   ' MOBI file came from a different Amazon country store, you could get'
1024                   ' incorrect results.'
1025               )),
1026    )
1027
1028    def __init__(self, *args, **kwargs):
1029        Source.__init__(self, *args, **kwargs)
1030        self.set_amazon_id_touched_fields()
1031
1032    def test_fields(self, mi):
1033        '''
1034        Return the first field from self.touched_fields that is null on the
1035        mi object
1036        '''
1037        for key in self.touched_fields:
1038            if key.startswith('identifier:'):
1039                key = key.partition(':')[-1]
1040                if key == 'amazon':
1041                    if self.domain != 'com':
1042                        key += '_' + self.domain
1043                if not mi.has_identifier(key):
1044                    return 'identifier: ' + key
1045            elif mi.is_null(key):
1046                return key
1047
1048    @property
1049    def browser(self):
1050        br = self._browser
1051        if br is None:
1052            ua = 'Mobile '
1053            while not user_agent_is_ok(ua):
1054                ua = random_user_agent(allow_ie=False)
1055            # ua = 'Mozilla/5.0 (Linux; Android 8.0.0; VTR-L29; rv:63.0) Gecko/20100101 Firefox/63.0'
1056            self._browser = br = browser(user_agent=ua)
1057            br.set_handle_gzip(True)
1058            if self.use_search_engine:
1059                br.addheaders += [
1060                    ('Accept', accept_header_for_ua(ua)),
1061                    ('Upgrade-insecure-requests', '1'),
1062                ]
1063            else:
1064                br.addheaders += [
1065                    ('Accept', accept_header_for_ua(ua)),
1066                    ('Upgrade-insecure-requests', '1'),
1067                    ('Referer', self.referrer_for_domain()),
1068                ]
1069        return br
1070
1071    def save_settings(self, *args, **kwargs):
1072        Source.save_settings(self, *args, **kwargs)
1073        self.set_amazon_id_touched_fields()
1074
1075    def set_amazon_id_touched_fields(self):
1076        ident_name = "identifier:amazon"
1077        if self.domain != 'com':
1078            ident_name += '_' + self.domain
1079        tf = [x for x in self.touched_fields if not
1080              x.startswith('identifier:amazon')] + [ident_name]
1081        self.touched_fields = frozenset(tf)
1082
1083    def get_domain_and_asin(self, identifiers, extra_domains=()):
1084        identifiers = {k.lower(): v for k, v in identifiers.items()}
1085        for key, val in identifiers.items():
1086            if key in ('amazon', 'asin'):
1087                return 'com', val
1088            if key.startswith('amazon_'):
1089                domain = key.partition('_')[-1]
1090                if domain and (domain in self.AMAZON_DOMAINS or domain in extra_domains):
1091                    return domain, val
1092        if self.prefs['use_mobi_asin']:
1093            val = identifiers.get('mobi-asin')
1094            if val is not None:
1095                return self.domain, val
1096        return None, None
1097
1098    def referrer_for_domain(self, domain=None):
1099        domain = domain or self.domain
1100        return {
1101            'uk':  'https://www.amazon.co.uk/',
1102            'au':  'https://www.amazon.com.au/',
1103            'br':  'https://www.amazon.com.br/',
1104            'jp':  'https://www.amazon.co.jp/',
1105        }.get(domain, 'https://www.amazon.%s/' % domain)
1106
1107    def _get_book_url(self, identifiers):  # {{{
1108        domain, asin = self.get_domain_and_asin(
1109            identifiers, extra_domains=('in', 'au', 'ca'))
1110        if domain and asin:
1111            url = None
1112            r = self.referrer_for_domain(domain)
1113            if r is not None:
1114                url = r + 'dp/' + asin
1115            if url:
1116                idtype = 'amazon' if domain == 'com' else 'amazon_' + domain
1117                return domain, idtype, asin, url
1118
1119    def get_book_url(self, identifiers):
1120        ans = self._get_book_url(identifiers)
1121        if ans is not None:
1122            return ans[1:]
1123
1124    def get_book_url_name(self, idtype, idval, url):
1125        if idtype == 'amazon':
1126            return self.name
1127        return 'A' + idtype.replace('_', '.')[1:]
1128    # }}}
1129
1130    @property
1131    def domain(self):
1132        x = getattr(self, 'testing_domain', None)
1133        if x is not None:
1134            return x
1135        domain = self.prefs['domain']
1136        if domain not in self.AMAZON_DOMAINS:
1137            domain = 'com'
1138
1139        return domain
1140
1141    @property
1142    def server(self):
1143        x = getattr(self, 'testing_server', None)
1144        if x is not None:
1145            return x
1146        server = self.prefs['server']
1147        if server not in self.SERVERS:
1148            server = 'auto'
1149        return server
1150
1151    @property
1152    def use_search_engine(self):
1153        return self.server != 'amazon'
1154
1155    def clean_downloaded_metadata(self, mi):
1156        docase = (
1157            mi.language == 'eng' or
1158            (mi.is_null('language') and self.domain in {'com', 'uk', 'au'})
1159        )
1160        if mi.title and docase:
1161            # Remove series information from title
1162            m = re.search(r'\S+\s+(\(.+?\s+Book\s+\d+\))$', mi.title)
1163            if m is not None:
1164                mi.title = mi.title.replace(m.group(1), '').strip()
1165            mi.title = fixcase(mi.title)
1166        mi.authors = fixauthors(mi.authors)
1167        if mi.tags and docase:
1168            mi.tags = list(map(fixcase, mi.tags))
1169        mi.isbn = check_isbn(mi.isbn)
1170        if mi.series and docase:
1171            mi.series = fixcase(mi.series)
1172        if mi.title and mi.series:
1173            for pat in (r':\s*Book\s+\d+\s+of\s+%s$', r'\(%s\)$', r':\s*%s\s+Book\s+\d+$'):
1174                pat = pat % re.escape(mi.series)
1175                q = re.sub(pat, '', mi.title, flags=re.I).strip()
1176                if q and q != mi.title:
1177                    mi.title = q
1178                    break
1179
1180    def get_website_domain(self, domain):
1181        return {'uk': 'co.uk', 'jp': 'co.jp', 'br': 'com.br', 'au': 'com.au'}.get(domain, domain)
1182
1183    def create_query(self, log, title=None, authors=None, identifiers={},  # {{{
1184                     domain=None, for_amazon=True):
1185        try:
1186            from urllib.parse import urlencode, unquote_plus
1187        except ImportError:
1188            from urllib import urlencode, unquote_plus
1189        if domain is None:
1190            domain = self.domain
1191
1192        idomain, asin = self.get_domain_and_asin(identifiers)
1193        if idomain is not None:
1194            domain = idomain
1195
1196        # See the amazon detailed search page to get all options
1197        terms = []
1198        q = {'search-alias': 'aps',
1199             'unfiltered': '1',
1200             }
1201
1202        if domain == 'com':
1203            q['sort'] = 'relevanceexprank'
1204        else:
1205            q['sort'] = 'relevancerank'
1206
1207        isbn = check_isbn(identifiers.get('isbn', None))
1208
1209        if asin is not None:
1210            q['field-keywords'] = asin
1211            terms.append(asin)
1212        elif isbn is not None:
1213            q['field-isbn'] = isbn
1214            if len(isbn) == 13:
1215                terms.extend('({} OR {}-{})'.format(isbn, isbn[:3], isbn[3:]).split())
1216            else:
1217                terms.append(isbn)
1218        else:
1219            # Only return book results
1220            q['search-alias'] = {'br': 'digital-text',
1221                                 'nl': 'aps'}.get(domain, 'stripbooks')
1222            if title:
1223                title_tokens = list(self.get_title_tokens(title))
1224                if title_tokens:
1225                    q['field-title'] = ' '.join(title_tokens)
1226                    terms.extend(title_tokens)
1227            if authors:
1228                author_tokens = list(self.get_author_tokens(authors,
1229                                                       only_first_author=True))
1230                if author_tokens:
1231                    q['field-author'] = ' '.join(author_tokens)
1232                    terms.extend(author_tokens)
1233
1234        if not ('field-keywords' in q or 'field-isbn' in q or
1235                ('field-title' in q)):
1236            # Insufficient metadata to make an identify query
1237            return None, None
1238
1239        if not for_amazon:
1240            return terms, domain
1241
1242        if domain == 'nl':
1243            q['__mk_nl_NL'] = 'ÅMÅŽÕÑ'
1244            if 'field-keywords' not in q:
1245                q['field-keywords'] = ''
1246            for f in 'field-isbn field-title field-author'.split():
1247                q['field-keywords'] += ' ' + q.pop(f, '')
1248            q['field-keywords'] = q['field-keywords'].strip()
1249
1250        encoded_q = dict([(x.encode('utf-8', 'ignore'), y.encode(
1251            'utf-8', 'ignore')) for x, y in q.items()])
1252        url_query = urlencode(encoded_q)
1253        # amazon's servers want IRIs with unicode characters not percent esaped
1254        parts = []
1255        for x in url_query.split(b'&' if isinstance(url_query, bytes) else '&'):
1256            k, v = x.split(b'=' if isinstance(x, bytes) else '=', 1)
1257            parts.append('{}={}'.format(iri_quote_plus(unquote_plus(k)), iri_quote_plus(unquote_plus(v))))
1258        url_query = '&'.join(parts)
1259        url = 'https://www.amazon.%s/s/?' % self.get_website_domain(
1260            domain) + url_query
1261        return url, domain
1262
1263    # }}}
1264
1265    def get_cached_cover_url(self, identifiers):  # {{{
1266        url = None
1267        domain, asin = self.get_domain_and_asin(identifiers)
1268        if asin is None:
1269            isbn = identifiers.get('isbn', None)
1270            if isbn is not None:
1271                asin = self.cached_isbn_to_identifier(isbn)
1272        if asin is not None:
1273            url = self.cached_identifier_to_cover_url(asin)
1274
1275        return url
1276    # }}}
1277
1278    def parse_results_page(self, root, domain):  # {{{
1279        from lxml.html import tostring
1280
1281        matches = []
1282
1283        def title_ok(title):
1284            title = title.lower()
1285            bad = ['bulk pack', '[audiobook]', '[audio cd]',
1286                   '(a book companion)', '( slipcase with door )', ': free sampler']
1287            if self.domain == 'com':
1288                bad.extend(['(%s edition)' % x for x in ('spanish', 'german')])
1289            for x in bad:
1290                if x in title:
1291                    return False
1292            if title and title[0] in '[{' and re.search(r'\(\s*author\s*\)', title) is not None:
1293                # Bad entries in the catalog
1294                return False
1295            return True
1296
1297        for query in (
1298                '//div[contains(@class, "s-result-list")]//h2/a[@href]',
1299                '//div[contains(@class, "s-result-list")]//div[@data-index]//h5//a[@href]',
1300                r'//li[starts-with(@id, "result_")]//a[@href and contains(@class, "s-access-detail-page")]',
1301        ):
1302            result_links = root.xpath(query)
1303            if result_links:
1304                break
1305        for a in result_links:
1306            title = tostring(a, method='text', encoding='unicode')
1307            if title_ok(title):
1308                url = a.get('href')
1309                if url.startswith('/'):
1310                    url = 'https://www.amazon.%s%s' % (
1311                        self.get_website_domain(domain), url)
1312                matches.append(url)
1313
1314        if not matches:
1315            # Previous generation of results page markup
1316            for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
1317                links = div.xpath(r'descendant::a[@class="title" and @href]')
1318                if not links:
1319                    # New amazon markup
1320                    links = div.xpath('descendant::h3/a[@href]')
1321                for a in links:
1322                    title = tostring(a, method='text', encoding='unicode')
1323                    if title_ok(title):
1324                        url = a.get('href')
1325                        if url.startswith('/'):
1326                            url = 'https://www.amazon.%s%s' % (
1327                                self.get_website_domain(domain), url)
1328                        matches.append(url)
1329                    break
1330
1331        if not matches:
1332            # This can happen for some user agents that Amazon thinks are
1333            # mobile/less capable
1334            for td in root.xpath(
1335                    r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'):
1336                for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'):
1337                    title = tostring(a, method='text', encoding='unicode')
1338                    if title_ok(title):
1339                        url = a.get('href')
1340                        if url.startswith('/'):
1341                            url = 'https://www.amazon.%s%s' % (
1342                                self.get_website_domain(domain), url)
1343                        matches.append(url)
1344                    break
1345        if not matches and root.xpath('//form[@action="/errors/validateCaptcha"]'):
1346            raise CaptchaError('Amazon returned a CAPTCHA page. Recently Amazon has begun using statistical'
1347                               ' profiling to block access to its website. As such this metadata plugin is'
1348                               ' unlikely to ever work reliably.')
1349
1350        # Keep only the top 3 matches as the matches are sorted by relevance by
1351        # Amazon so lower matches are not likely to be very relevant
1352        return matches[:3]
1353    # }}}
1354
1355    def search_amazon(self, br, testing, log, abort, title, authors, identifiers, timeout):  # {{{
1356        from calibre.utils.cleantext import clean_ascii_chars
1357        from calibre.ebooks.chardet import xml_to_unicode
1358        matches = []
1359        query, domain = self.create_query(log, title=title, authors=authors,
1360                                          identifiers=identifiers)
1361        if query is None:
1362            log.error('Insufficient metadata to construct query')
1363            raise SearchFailed()
1364        try:
1365            raw = br.open_novisit(query, timeout=timeout).read().strip()
1366        except Exception as e:
1367            if callable(getattr(e, 'getcode', None)) and \
1368                    e.getcode() == 404:
1369                log.error('Query malformed: %r' % query)
1370                raise SearchFailed()
1371            attr = getattr(e, 'args', [None])
1372            attr = attr if attr else [None]
1373            if isinstance(attr[0], socket.timeout):
1374                msg = _('Amazon timed out. Try again later.')
1375                log.error(msg)
1376            else:
1377                msg = 'Failed to make identify query: %r' % query
1378                log.exception(msg)
1379            raise SearchFailed()
1380
1381        raw = clean_ascii_chars(xml_to_unicode(raw,
1382                                               strip_encoding_pats=True, resolve_entities=True)[0])
1383
1384        if testing:
1385            import tempfile
1386            with tempfile.NamedTemporaryFile(prefix='amazon_results_',
1387                                             suffix='.html', delete=False) as f:
1388                f.write(raw.encode('utf-8'))
1389            print('Downloaded html for results page saved in', f.name)
1390
1391        matches = []
1392        found = '<title>404 - ' not in raw
1393
1394        if found:
1395            try:
1396                root = parse_html(raw)
1397            except Exception:
1398                msg = 'Failed to parse amazon page for query: %r' % query
1399                log.exception(msg)
1400                raise SearchFailed()
1401
1402        matches = self.parse_results_page(root, domain)
1403
1404        return matches, query, domain, None
1405    # }}}
1406
1407    def search_search_engine(self, br, testing, log, abort, title, authors, identifiers, timeout, override_server=None):  # {{{
1408        from calibre.ebooks.metadata.sources.update import search_engines_module
1409        terms, domain = self.create_query(log, title=title, authors=authors,
1410                                          identifiers=identifiers, for_amazon=False)
1411        site = self.referrer_for_domain(
1412            domain)[len('https://'):].partition('/')[0]
1413        matches = []
1414        se = search_engines_module()
1415        server = override_server or self.server
1416        if server in ('bing',):
1417            urlproc, sfunc = se.bing_url_processor, se.bing_search
1418        elif server in ('auto', 'google'):
1419            urlproc, sfunc = se.google_url_processor, se.google_search
1420        elif server == 'wayback':
1421            urlproc, sfunc = se.wayback_url_processor, se.ddg_search
1422        results, qurl = sfunc(terms, site, log=log, br=br, timeout=timeout)
1423        br.set_current_header('Referer', qurl)
1424        for result in results:
1425            if abort.is_set():
1426                return matches, terms, domain, None
1427
1428            purl = urlparse(result.url)
1429            if '/dp/' in purl.path and site in purl.netloc:
1430                url = result.cached_url
1431                if url is None:
1432                    url = se.wayback_machine_cached_url(
1433                        result.url, br, timeout=timeout)
1434                if url is None:
1435                    log('Failed to find cached page for:', result.url)
1436                    continue
1437                if url not in matches:
1438                    matches.append(url)
1439                if len(matches) >= 3:
1440                    break
1441            else:
1442                log('Skipping non-book result:', result)
1443        if not matches:
1444            log('No search engine results for terms:', ' '.join(terms))
1445            if urlproc is se.google_url_processor:
1446                # Google does not cache adult titles
1447                log('Trying the bing search engine instead')
1448                return self.search_search_engine(br, testing, log, abort, title, authors, identifiers, timeout, 'bing')
1449        return matches, terms, domain, urlproc
1450    # }}}
1451
1452    def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
1453                 identifiers={}, timeout=60):
1454        '''
1455        Note this method will retry without identifiers automatically if no
1456        match is found with identifiers.
1457        '''
1458
1459        testing = getattr(self, 'running_a_test', False)
1460
1461        udata = self._get_book_url(identifiers)
1462        br = self.browser
1463        log('User-agent:', br.current_user_agent())
1464        log('Server:', self.server)
1465        if testing:
1466            print('User-agent:', br.current_user_agent())
1467        if udata is not None and not self.use_search_engine:
1468            # Try to directly get details page instead of running a search
1469            # Cannot use search engine as the directly constructed URL is
1470            # usually redirected to a full URL by amazon, and is therefore
1471            # not cached
1472            domain, idtype, asin, durl = udata
1473            if durl is not None:
1474                preparsed_root = parse_details_page(
1475                    durl, log, timeout, br, domain)
1476                if preparsed_root is not None:
1477                    qasin = parse_asin(preparsed_root[1], log, durl)
1478                    if qasin == asin:
1479                        w = Worker(durl, result_queue, br, log, 0, domain,
1480                                   self, testing=testing, preparsed_root=preparsed_root, timeout=timeout)
1481                        try:
1482                            w.get_details()
1483                            return
1484                        except Exception:
1485                            log.exception(
1486                                'get_details failed for url: %r' % durl)
1487        func = self.search_search_engine if self.use_search_engine else self.search_amazon
1488        try:
1489            matches, query, domain, cover_url_processor = func(
1490                br, testing, log, abort, title, authors, identifiers, timeout)
1491        except SearchFailed:
1492            return
1493
1494        if abort.is_set():
1495            return
1496
1497        if not matches:
1498            if identifiers and title and authors:
1499                log('No matches found with identifiers, retrying using only'
1500                    ' title and authors. Query: %r' % query)
1501                time.sleep(1)
1502                return self.identify(log, result_queue, abort, title=title,
1503                                     authors=authors, timeout=timeout)
1504            log.error('No matches found with query: %r' % query)
1505            return
1506
1507        workers = [Worker(
1508            url, result_queue, br, log, i, domain, self, testing=testing, timeout=timeout,
1509            cover_url_processor=cover_url_processor, filter_result=partial(
1510                self.filter_result, title, authors, identifiers)) for i, url in enumerate(matches)]
1511
1512        for w in workers:
1513            # Don't send all requests at the same time
1514            time.sleep(1)
1515            w.start()
1516            if abort.is_set():
1517                return
1518
1519        while not abort.is_set():
1520            a_worker_is_alive = False
1521            for w in workers:
1522                w.join(0.2)
1523                if abort.is_set():
1524                    break
1525                if w.is_alive():
1526                    a_worker_is_alive = True
1527            if not a_worker_is_alive:
1528                break
1529
1530        return None
1531    # }}}
1532
1533    def filter_result(self, title, authors, identifiers, mi, log):  # {{{
1534        if not self.use_search_engine:
1535            return True
1536        if title is not None:
1537
1538            def tokenize_title(x):
1539                return icu_lower(x).replace("'", '').replace('"', '').rstrip(':')
1540
1541            tokens = {tokenize_title(x) for x in title.split() if len(x) > 3}
1542            if tokens:
1543                result_tokens = {tokenize_title(x) for x in mi.title.split()}
1544                if not tokens.intersection(result_tokens):
1545                    log('Ignoring result:', mi.title, 'as its title does not match')
1546                    return False
1547        if authors:
1548            author_tokens = set()
1549            for author in authors:
1550                author_tokens |= {icu_lower(x) for x in author.split() if len(x) > 2}
1551            result_tokens = set()
1552            for author in mi.authors:
1553                result_tokens |= {icu_lower(x) for x in author.split() if len(x) > 2}
1554            if author_tokens and not author_tokens.intersection(result_tokens):
1555                log('Ignoring result:', mi.title, 'by', ' & '.join(mi.authors), 'as its author does not match')
1556                return False
1557        return True
1558    # }}}
1559
1560    def download_cover(self, log, result_queue, abort,  # {{{
1561                       title=None, authors=None, identifiers={}, timeout=60, get_best_cover=False):
1562        cached_url = self.get_cached_cover_url(identifiers)
1563        if cached_url is None:
1564            log.info('No cached cover found, running identify')
1565            rq = Queue()
1566            self.identify(log, rq, abort, title=title, authors=authors,
1567                          identifiers=identifiers)
1568            if abort.is_set():
1569                return
1570            if abort.is_set():
1571                return
1572            results = []
1573            while True:
1574                try:
1575                    results.append(rq.get_nowait())
1576                except Empty:
1577                    break
1578            results.sort(key=self.identify_results_keygen(
1579                title=title, authors=authors, identifiers=identifiers))
1580            for mi in results:
1581                cached_url = self.get_cached_cover_url(mi.identifiers)
1582                if cached_url is not None:
1583                    break
1584        if cached_url is None:
1585            log.info('No cover found')
1586            return
1587
1588        if abort.is_set():
1589            return
1590        log('Downloading cover from:', cached_url)
1591        br = self.browser
1592        if self.use_search_engine:
1593            br = br.clone_browser()
1594            br.set_current_header('Referer', self.referrer_for_domain(self.domain))
1595        try:
1596            time.sleep(1)
1597            cdata = br.open_novisit(
1598                cached_url, timeout=timeout).read()
1599            result_queue.put((self, cdata))
1600        except:
1601            log.exception('Failed to download cover from:', cached_url)
1602    # }}}
1603
1604
1605def manual_tests(domain, **kw):  # {{{
1606    # To run these test use:
1607    # calibre-debug -c "from calibre.ebooks.metadata.sources.amazon import *; manual_tests('com')"
1608    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
1609                                                      isbn_test, title_test, authors_test, comments_test, series_test)
1610    all_tests = {}
1611    all_tests['com'] = [  # {{{
1612        (   # Paperback with series
1613            {'identifiers': {'amazon': '1423146786'}},
1614            [title_test('The Heroes of Olympus, Book Five The Blood of Olympus',
1615                        exact=True), series_test('The Heroes of Olympus', 5)]
1616        ),
1617
1618        (   # Kindle edition with series
1619            {'identifiers': {'amazon': 'B0085UEQDO'}},
1620            [title_test('Three Parts Dead', exact=True),
1621             series_test('Craft Sequence', 1)]
1622        ),
1623
1624        (  # + in title and uses id="main-image" for cover
1625            {'identifiers': {'amazon': '1933988770'}},
1626            [title_test(
1627                'C++ Concurrency in Action: Practical Multithreading', exact=True)]
1628        ),
1629
1630
1631        (  # Different comments markup, using Book Description section
1632            {'identifiers': {'amazon': '0982514506'}},
1633            [title_test(
1634                "Griffin's Destiny: Book Three: The Griffin's Daughter Trilogy",
1635                exact=True),
1636             comments_test('Jelena'), comments_test('Ashinji'),
1637             ]
1638        ),
1639
1640        (  # # in title
1641            {'title': 'Expert C# 2008 Business Objects',
1642             'authors': ['Lhotka']},
1643            [title_test('Expert C#'),
1644             authors_test(['Rockford Lhotka'])
1645             ]
1646        ),
1647
1648        (  # No specific problems
1649            {'identifiers': {'isbn': '0743273567'}},
1650            [title_test('the great gatsby: the only authorized edition', exact=True),
1651             authors_test(['Francis Scott Fitzgerald'])]
1652        ),
1653
1654    ]
1655
1656    # }}}
1657
1658    all_tests['de'] = [  # {{{
1659        (  # umlaut in title/authors
1660            {'title': 'Flüsternde Wälder',
1661             'authors': ['Nicola Förg']},
1662            [title_test('Flüsternde Wälder'),
1663             authors_test(['Nicola Förg'], subset=True)
1664             ]
1665        ),
1666
1667
1668        (
1669            {'identifiers': {'isbn': '9783453314979'}},
1670            [title_test('Die letzten Wächter: Roman',
1671                        exact=False), authors_test(['Sergej Lukianenko'])
1672             ]
1673
1674        ),
1675
1676        (
1677            {'identifiers': {'isbn': '3548283519'}},
1678            [title_test('Wer Wind Sät: Der Fünfte Fall Für Bodenstein Und Kirchhoff',
1679                        exact=False), authors_test(['Nele Neuhaus'])
1680             ]
1681
1682        ),
1683    ]  # }}}
1684
1685    all_tests['it'] = [  # {{{
1686        (
1687            {'identifiers': {'isbn': '8838922195'}},
1688            [title_test('La briscola in cinque',
1689                        exact=True), authors_test(['Marco Malvaldi'])
1690             ]
1691
1692        ),
1693    ]  # }}}
1694
1695    all_tests['fr'] = [  # {{{
1696        (
1697            {'identifiers': {'amazon_fr': 'B07L7ST4RS'}},
1698            [title_test('Le secret de Lola', exact=True),
1699                authors_test(['Amélie BRIZIO'])
1700            ]
1701        ),
1702        (
1703            {'identifiers': {'isbn': '2221116798'}},
1704            [title_test('L\'étrange voyage de Monsieur Daldry',
1705                        exact=True), authors_test(['Marc Levy'])
1706             ]
1707
1708        ),
1709    ]  # }}}
1710
1711    all_tests['es'] = [  # {{{
1712        (
1713            {'identifiers': {'isbn': '8483460831'}},
1714            [title_test('Tiempos Interesantes',
1715                        exact=False), authors_test(['Terry Pratchett'])
1716             ]
1717
1718        ),
1719    ]  # }}}
1720
1721    all_tests['se'] = [  # {{{
1722        (
1723            {'identifiers': {'isbn': '9780552140287'}},
1724            [title_test('Men At Arms: A Discworld Novel: 14',
1725                        exact=False), authors_test(['Terry Pratchett'])
1726             ]
1727
1728        ),
1729    ]  # }}}
1730
1731    all_tests['jp'] = [  # {{{
1732        (  # Adult filtering test
1733            {'identifiers': {'isbn': '4799500066'}},
1734            [title_test('Bitch Trap'), ]
1735        ),
1736
1737        (  # isbn -> title, authors
1738            {'identifiers': {'isbn': '9784101302720'}},
1739            [title_test('精霊の守り人',
1740                        exact=True), authors_test(['上橋 菜穂子'])
1741             ]
1742        ),
1743        (  # title, authors -> isbn (will use Shift_JIS encoding in query.)
1744            {'title': '考えない練習',
1745             'authors': ['小池 龍之介']},
1746            [isbn_test('9784093881067'), ]
1747        ),
1748    ]  # }}}
1749
1750    all_tests['br'] = [  # {{{
1751        (
1752            {'title': 'Guerra dos Tronos'},
1753            [title_test('A Guerra dos Tronos - As Crônicas de Gelo e Fogo',
1754                        exact=True), authors_test(['George R. R. Martin'])
1755             ]
1756
1757        ),
1758    ]  # }}}
1759
1760    all_tests['nl'] = [  # {{{
1761        (
1762            {'title': 'Freakonomics'},
1763            [title_test('Freakonomics',
1764                        exact=True), authors_test(['Steven Levitt & Stephen Dubner & R. Kuitenbrouwer & O. Brenninkmeijer & A. van Den Berg'])
1765             ]
1766
1767        ),
1768    ]  # }}}
1769
1770    all_tests['cn'] = [  # {{{
1771        (
1772            {'identifiers': {'isbn': '9787115369512'}},
1773            [title_test('若为自由故 自由软件之父理查德斯托曼传', exact=True),
1774             authors_test(['[美]sam Williams', '邓楠,李凡希'])]
1775        ),
1776        (
1777            {'title': '爱上Raspberry Pi'},
1778            [title_test('爱上Raspberry Pi',
1779                        exact=True), authors_test(['Matt Richardson', 'Shawn Wallace', '李凡希'])
1780             ]
1781
1782        ),
1783    ]  # }}}
1784
1785    all_tests['ca'] = [  # {{{
1786        (   # Paperback with series
1787            {'identifiers': {'isbn': '9781623808747'}},
1788            [title_test('Parting Shot', exact=True),
1789             authors_test(['Mary Calmes'])]
1790        ),
1791        (  # # in title
1792            {'title': 'Expert C# 2008 Business Objects',
1793             'authors': ['Lhotka']},
1794            [title_test('Expert C# 2008 Business Objects'),
1795             authors_test(['Rockford Lhotka'])]
1796        ),
1797        (  # noscript description
1798            {'identifiers': {'amazon_ca': '162380874X'}},
1799            [title_test('Parting Shot', exact=True), authors_test(['Mary Calmes'])
1800             ]
1801        ),
1802    ]  # }}}
1803
1804    def do_test(domain, start=0, stop=None, server='auto'):
1805        tests = all_tests[domain]
1806        if stop is None:
1807            stop = len(tests)
1808        tests = tests[start:stop]
1809        test_identify_plugin(Amazon.name, tests, modify_plugin=lambda p: (
1810            setattr(p, 'testing_domain', domain),
1811            setattr(p, 'touched_fields', p.touched_fields - {'tags'}),
1812            setattr(p, 'testing_server', server),
1813        ))
1814
1815    do_test(domain, **kw)
1816# }}}
1817