#!/usr/local/bin/python3.8
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import json
import re
import time
from collections import defaultdict, namedtuple
try:
    from urllib.parse import parse_qs, quote_plus, urlencode, unquote
except ImportError:
    from urlparse import parse_qs
    from urllib import quote_plus, urlencode, unquote

from lxml import etree

from calibre import browser as _browser, prints, random_user_agent
from calibre.utils.monotonic import monotonic
from calibre.utils.random_ua import accept_header_for_ua

current_version = (1, 0, 8)
minimum_calibre_version = (2, 80, 0)


last_visited = defaultdict(lambda: 0)
Result = namedtuple('Result', 'url title cached_url')


def tostring(elem):
    return etree.tostring(elem, encoding='unicode', method='text', with_tail=False)


def browser():
    ua = random_user_agent(allow_ie=False)
    # ua = 'Mozilla/5.0 (Linux; Android 8.0.0; VTR-L29; rv:63.0) Gecko/20100101 Firefox/63.0'
    br = _browser(user_agent=ua)
    br.set_handle_gzip(True)
    br.addheaders += [
        ('Accept', accept_header_for_ua(ua)),
        ('Upgrade-insecure-requests', '1'),
    ]
    return br


def encode_query(**query):
    q = {k.encode('utf-8'): v.encode('utf-8') for k, v in query.items()}
    return urlencode(q).decode('utf-8')
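# For illustration only (the arguments below are made up): encode_query()
# simply percent-encodes keyword arguments into a query string, e.g.
# encode_query(q='a b', kp='1') should yield something like 'q=a+b&kp=1'
# (argument ordering may differ on older Python versions).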


def parse_html(raw):
    try:
        from html5_parser import parse
    except ImportError:
        # Old versions of calibre
        import html5lib
        return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
    else:
        return parse(raw)

def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60, save_raw=None):
    # rate limit queries per engine (key) to at most one every `limit` seconds
    delta = monotonic() - last_visited[key]
    if 0 < delta < limit:
        time.sleep(limit - delta)
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    finally:
        last_visited[key] = monotonic()
    if dump_raw is not None:
        with open(dump_raw, 'wb') as f:
            f.write(raw)
    if save_raw is not None:
        save_raw(raw)
    return parser(raw)
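# A minimal usage sketch (the URL and key below are illustrative, not
# endpoints used by this module):
#   br = browser()
#   root = query(br, 'https://example.com/search?q=foo', 'example', limit=1)
#   links = root.xpath('//a[@href]')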


def quote_term(x):
    ans = quote_plus(x.encode('utf-8'))
    if isinstance(ans, bytes):
        ans = ans.decode('utf-8')
    return ans


# DDG + Wayback machine {{{

def ddg_term(t):
    t = t.replace('"', '')
    if t.lower() in {'map', 'news'}:
        t = '"' + t + '"'
    if t in {'OR', 'AND', 'NOT'}:
        t = t.lower()
    return t

def ddg_href(url):
    if url.startswith('/'):
        # relative results are redirect URLs that carry the real destination
        # in the uddg query parameter
        q = url.partition('?')[2]
        url = parse_qs(q)['uddg'][0]
        if isinstance(url, bytes):
            url = url.decode('utf-8')
    return url
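# Sketch of the redirect form this handles (the href is illustrative):
#   ddg_href('/l/?uddg=https%3A%2F%2Fwww.example.com%2Fbook')
#       -> 'https://www.example.com/book'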


def wayback_machine_cached_url(url, br=None, log=prints, timeout=60):
    q = quote_term(url)
    br = br or browser()
    data = query(br, 'https://archive.org/wayback/available?url=' +
                 q, 'wayback', parser=json.loads, limit=0.25, timeout=timeout)
    try:
        closest = data['archived_snapshots']['closest']
        if closest['available']:
            return closest['url'].replace('http:', 'https:')
    except Exception:
        pass
    from pprint import pformat
    log('Response from wayback machine:', pformat(data))
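# The parsing above expects the availability API to answer with JSON shaped
# roughly like (values are illustrative):
#   {"archived_snapshots": {"closest": {"available": true,
#       "url": "http://web.archive.org/web/20200101000000/https://example.com/",
#       "timestamp": "20200101000000", "status": "200"}}}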


def wayback_url_processor(url):
    if url.startswith('/'):
        # Use original URL instead of absolutizing to wayback URL as wayback is
        # slow
        m = re.search('https?:', url)
        if m is None:
            url = 'https://web.archive.org' + url
        else:
            url = url[m.start():]
    return url
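# Sketch of the rewrites performed above (paths are illustrative):
#   '/web/20200101000000/https://example.com/x' -> 'https://example.com/x'
#   '/relative/no-scheme/path' -> 'https://web.archive.org/relative/no-scheme/path'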


def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
    # https://duck.co/help/results/syntax
    terms = [quote_term(ddg_term(t)) for t in terms]
    if site is not None:
        terms.append(quote_term(('site:' + site)))
    q = '+'.join(terms)
    url = 'https://duckduckgo.com/html/?q={q}&kp={kp}'.format(
        q=q, kp=1 if safe_search else -1)
    log('Making ddg query: ' + url)
    br = br or browser()
    root = query(br, url, 'ddg', dump_raw, timeout=timeout)
    ans = []
    for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
        ans.append(Result(ddg_href(a.get('href')), tostring(a), None))
    return ans, url


def ddg_develop():
    br = browser()
    for result in ddg_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]:
        if '/dp/' in result.url:
            print(result.title)
            print(' ', result.url)
            print(' ', wayback_machine_cached_url(result.url, br))
            print()
# }}}

# Bing {{{


def bing_term(t):
    t = t.replace('"', '')
    if t in {'OR', 'AND', 'NOT'}:
        t = t.lower()
    return t


def bing_url_processor(url):
    return url


def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
    # http://vlaurie.com/computers2/Articles/bing_advanced_search.htm
    terms = [quote_term(bing_term(t)) for t in terms]
    if site is not None:
        terms.append(quote_term(('site:' + site)))
    q = '+'.join(terms)
    url = 'https://www.bing.com/search?q={q}'.format(q=q)
    log('Making bing query: ' + url)
    br = br or browser()
    root = query(br, url, 'bing', dump_raw, timeout=timeout)
    ans = []
    for li in root.xpath('//*[@id="b_results"]/li[@class="b_algo"]'):
        a = li.xpath('descendant::h2/a[@href]') or li.xpath('descendant::div[@class="b_algoheader"]/a[@href]')
        a = a[0]
        title = tostring(a)
        try:
            div = li.xpath('descendant::div[@class="b_attribution" and @u]')[0]
        except IndexError:
            log('Ignoring {!r} as it has no cached page'.format(title))
            continue
        d, w = div.get('u').split('|')[-2:]
        cached_url = 'https://cc.bingj.com/cache.aspx?q={q}&d={d}&mkt=en-US&setlang=en-US&w={w}'.format(
            q=q, d=d, w=w)
        ans.append(Result(a.get('href'), title, cached_url))
    if not ans:
        title = ' '.join(root.xpath('//title/text()'))
        log('Failed to find any results on results page, with title:', title)
    return ans, url


def bing_develop():
    br = browser()
    for result in bing_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]:
        if '/dp/' in result.url:
            print(result.title)
            print(' ', result.url)
            print(' ', result.cached_url)
            print()
# }}}

# Google {{{


def google_term(t):
    t = t.replace('"', '')
    if t in {'OR', 'AND', 'NOT'}:
        t = t.lower()
    return t


def google_url_processor(url):
    return url

def google_extract_cache_urls(raw):
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8', 'replace')
    pat = re.compile(r'\\x22(https://webcache\.googleusercontent\.com/.+?)\\x22')
    upat = re.compile(r'\\\\u([0-9a-fA-F]{4})')
    cache_pat = re.compile('cache:([^:]+):(.+)')

    def urepl(m):
        return chr(int(m.group(1), 16))

    seen = set()
    ans = {}
    for m in pat.finditer(raw):
        cache_url = upat.sub(urepl, m.group(1))
        m = cache_pat.search(cache_url)
        cache_id, src_url = m.group(1), m.group(2)
        if cache_id in seen:
            continue
        seen.add(cache_id)
        src_url = src_url.split('+')[0]
        src_url = unquote(src_url)
        ans[src_url] = cache_url
    return ans
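# The function above maps original result URLs to their Google cache URLs by
# scanning the raw page for JS-escaped strings of the (illustrative) form
#   \x22https://webcache.googleusercontent.com/search?q=cache:<id>:<original url>+terms\x22
# undoing the \\uXXXX escapes, then pulling out <id> and the original URL.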


def google_parse_results(root, raw, log=prints):
    cache_url_map = google_extract_cache_urls(raw)
    # print('\n'.join(cache_url_map))
    ans = []
    for div in root.xpath('//*[@id="search"]//*[@id="rso"]//*[@class="g"]'):
        try:
            a = div.xpath('descendant::a[@href]')[0]
        except IndexError:
            log('Ignoring div with no main result link')
            continue
        title = tostring(a)
        src_url = a.get('href')
        if src_url in cache_url_map:
            cached_url = cache_url_map[src_url]
        else:
            try:
                c = div.xpath('descendant::*[@role="menuitem"]//a[@class="fl"]')[0]
            except IndexError:
                log('Ignoring {!r} as it has no cached page'.format(title))
                continue
            cached_url = c.get('href')
        ans.append(Result(a.get('href'), title, cached_url))
    if not ans:
        title = ' '.join(root.xpath('//title/text()'))
        log('Failed to find any results on results page, with title:', title)
    return ans


def google_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
    terms = [quote_term(google_term(t)) for t in terms]
    if site is not None:
        terms.append(quote_term(('site:' + site)))
    q = '+'.join(terms)
    url = 'https://www.google.com/search?q={q}'.format(q=q)
    log('Making google query: ' + url)
    br = br or browser()
    r = []
    root = query(br, url, 'google', dump_raw, timeout=timeout, save_raw=r.append)
    return google_parse_results(root, r[0], log=log), url


def google_develop(search_terms='1423146786', raw_from=''):
    if raw_from:
        with open(raw_from, 'rb') as f:
            raw = f.read()
        results = google_parse_results(parse_html(raw), raw)
    else:
        br = browser()
        results = google_search(search_terms.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]
    for result in results:
        if '/dp/' in result.url:
            print(result.title)
            print(' ', result.url)
            print(' ', result.cached_url)
            print()
# }}}


def resolve_url(url):
    prefix, rest = url.partition(':')[::2]
    if prefix == 'bing':
        return bing_url_processor(rest)
    if prefix == 'wayback':
        return wayback_url_processor(rest)
    return url

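# For reference (sketch, URLs illustrative): resolve_url() dispatches on a
# scheme-like prefix added by callers, e.g.
#   resolve_url('wayback:/web/20200101000000/https://example.com/x')
#       -> 'https://example.com/x'
#   resolve_url('bing:https://example.com/x') -> 'https://example.com/x'
# URLs without a known prefix are returned unchanged.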