#!/usr/local/bin/python3.8
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

import json
import re
import time
from collections import defaultdict, namedtuple
try:
    from urllib.parse import parse_qs, quote_plus, urlencode, unquote
except ImportError:
    from urlparse import parse_qs
    from urllib import quote_plus, urlencode, unquote

from lxml import etree

from calibre import browser as _browser, prints, random_user_agent
from calibre.utils.monotonic import monotonic
from calibre.utils.random_ua import accept_header_for_ua

current_version = (1, 0, 8)
minimum_calibre_version = (2, 80, 0)


last_visited = defaultdict(lambda: 0)
Result = namedtuple('Result', 'url title cached_url')


def tostring(elem):
    return etree.tostring(elem, encoding='unicode', method='text', with_tail=False)


def browser():
    ua = random_user_agent(allow_ie=False)
    # ua = 'Mozilla/5.0 (Linux; Android 8.0.0; VTR-L29; rv:63.0) Gecko/20100101 Firefox/63.0'
    br = _browser(user_agent=ua)
    br.set_handle_gzip(True)
    br.addheaders += [
        ('Accept', accept_header_for_ua(ua)),
        ('Upgrade-insecure-requests', '1'),
    ]
    return br


def encode_query(**query):
    q = {k.encode('utf-8'): v.encode('utf-8') for k, v in query.items()}
    return urlencode(q).decode('utf-8')


def parse_html(raw):
    try:
        from html5_parser import parse
    except ImportError:
        # Old versions of calibre
        import html5lib
        return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
    else:
        return parse(raw)


def query(br, url, key, dump_raw=None, limit=1, parser=parse_html, timeout=60, save_raw=None):
    # Rate limit: keep requests for the same key at least `limit` seconds
    # apart, sleeping for the remaining time rather than the elapsed time
    delta = monotonic() - last_visited[key]
    if delta < limit and delta > 0:
        time.sleep(limit - delta)
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    finally:
        last_visited[key] = monotonic()
    if dump_raw is not None:
        with open(dump_raw, 'wb') as f:
            f.write(raw)
    if save_raw is not None:
        save_raw(raw)
    return parser(raw)


def quote_term(x):
    ans = quote_plus(x.encode('utf-8'))
    if isinstance(ans, bytes):
        ans = ans.decode('utf-8')
    return ans


# DDG + Wayback machine {{{

def ddg_term(t):
    t = t.replace('"', '')
    if t.lower() in {'map', 'news'}:
        t = '"' + t + '"'
    if t in {'OR', 'AND', 'NOT'}:
        t = t.lower()
    return t


def ddg_href(url):
    if url.startswith('/'):
        q = url.partition('?')[2]
        url = parse_qs(q.encode('utf-8'))['uddg'][0].decode('utf-8')
    return url


def wayback_machine_cached_url(url, br=None, log=prints, timeout=60):
    q = quote_term(url)
    br = br or browser()
    data = query(br, 'https://archive.org/wayback/available?url=' +
                 q, 'wayback', parser=json.loads, limit=0.25, timeout=timeout)
    try:
        closest = data['archived_snapshots']['closest']
        if closest['available']:
            return closest['url'].replace('http:', 'https:')
    except Exception:
        pass
    from pprint import pformat
    log('Response from wayback machine:', pformat(data))


def wayback_url_processor(url):
    if url.startswith('/'):
        # Use original URL instead of absolutizing to wayback URL as wayback
        # is slow
        m = re.search('https?:', url)
        if m is None:
            url = 'https://web.archive.org' + url
        else:
            url = url[m.start():]
    return url
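

# A minimal development sketch in the style of the *_develop() helpers below,
# not part of the original module: it shows how the two wayback helpers above
# combine. wayback_machine_cached_url() asks the availability API for the
# closest snapshot and wayback_url_processor() recovers the original URL from
# the relative links such snapshots contain. The sample URLs are invented.
def wayback_develop(url='https://www.amazon.com/dp/1423146786'):
    br = browser()
    print('Cached snapshot:', wayback_machine_cached_url(url, br))
    # Snapshot pages link relatively; the processor strips the wayback prefix
    print(wayback_url_processor('/web/20190101000000/https://example.com'))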


def ddg_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
    # https://duck.co/help/results/syntax
    terms = [quote_term(ddg_term(t)) for t in terms]
    if site is not None:
        terms.append(quote_term(('site:' + site)))
    q = '+'.join(terms)
    url = 'https://duckduckgo.com/html/?q={q}&kp={kp}'.format(
        q=q, kp=1 if safe_search else -1)
    log('Making ddg query: ' + url)
    br = br or browser()
    root = query(br, url, 'ddg', dump_raw, timeout=timeout)
    ans = []
    for a in root.xpath('//*[@class="results"]//*[@class="result__title"]/a[@href and @class="result__a"]'):
        ans.append(Result(ddg_href(a.get('href')), tostring(a), None))
    return ans, url


def ddg_develop():
    br = browser()
    for result in ddg_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]:
        if '/dp/' in result.url:
            print(result.title)
            print(' ', result.url)
            print(' ', wayback_machine_cached_url(result.url, br))
            print()
# }}}

# Bing {{{


def bing_term(t):
    t = t.replace('"', '')
    if t in {'OR', 'AND', 'NOT'}:
        t = t.lower()
    return t


def bing_url_processor(url):
    return url


def bing_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
    # http://vlaurie.com/computers2/Articles/bing_advanced_search.htm
    terms = [quote_term(bing_term(t)) for t in terms]
    if site is not None:
        terms.append(quote_term(('site:' + site)))
    q = '+'.join(terms)
    url = 'https://www.bing.com/search?q={q}'.format(q=q)
    log('Making bing query: ' + url)
    br = br or browser()
    root = query(br, url, 'bing', dump_raw, timeout=timeout)
    ans = []
    for li in root.xpath('//*[@id="b_results"]/li[@class="b_algo"]'):
        a = li.xpath('descendant::h2/a[@href]') or li.xpath('descendant::div[@class="b_algoheader"]/a[@href]')
        a = a[0]
        title = tostring(a)
        try:
            div = li.xpath('descendant::div[@class="b_attribution" and @u]')[0]
        except IndexError:
            log('Ignoring {!r} as it has no cached page'.format(title))
            continue
        d, w = div.get('u').split('|')[-2:]
        cached_url = 'https://cc.bingj.com/cache.aspx?q={q}&d={d}&mkt=en-US&setlang=en-US&w={w}'.format(
            q=q, d=d, w=w)
        ans.append(Result(a.get('href'), title, cached_url))
    if not ans:
        title = ' '.join(root.xpath('//title/text()'))
        log('Failed to find any results on results page, with title:', title)
    return ans, url


def bing_develop():
    br = browser()
    for result in bing_search('heroes abercrombie'.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]:
        if '/dp/' in result.url:
            print(result.title)
            print(' ', result.url)
            print(' ', result.cached_url)
            print()
# }}}

# Google {{{


def google_term(t):
    t = t.replace('"', '')
    if t in {'OR', 'AND', 'NOT'}:
        t = t.lower()
    return t


def google_url_processor(url):
    return url


def google_extract_cache_urls(raw):
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8', 'replace')
    pat = re.compile(r'\\x22(https://webcache\.googleusercontent\.com/.+?)\\x22')
    upat = re.compile(r'\\\\u([0-9a-fA-F]{4})')
    cache_pat = re.compile('cache:([^:]+):(.+)')

    def urepl(m):
        return chr(int(m.group(1), 16))

    seen = set()
    ans = {}
    for m in pat.finditer(raw):
        cache_url = upat.sub(urepl, m.group(1))
        m = cache_pat.search(cache_url)
        cache_id, src_url = m.group(1), m.group(2)
        if cache_id in seen:
            continue
        seen.add(cache_id)
        src_url = src_url.split('+')[0]
        src_url = unquote(src_url)
        ans[src_url] = cache_url
    return ans
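

# An illustrative, self-contained check for google_extract_cache_urls() above,
# not part of the original module. The input mimics how cache links appear in
# the raw Google results payload: URLs wrapped in literal \x22 escapes with a
# cache:<id>:<source-url> token inside. The cache id and URLs are invented.
def google_cache_extract_develop():
    raw = (r'\x22https://webcache.googleusercontent.com/search?q='
           r'cache:AbC123:https://www.amazon.com/dp/1423146786+heroes\x22')
    # Expect {'https://www.amazon.com/dp/1423146786': 'https://webcache...'}
    print(google_extract_cache_urls(raw))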


def google_parse_results(root, raw, log=prints):
    cache_url_map = google_extract_cache_urls(raw)
    # print('\n'.join(cache_url_map))
    ans = []
    for div in root.xpath('//*[@id="search"]//*[@id="rso"]//*[@class="g"]'):
        try:
            a = div.xpath('descendant::a[@href]')[0]
        except IndexError:
            log('Ignoring div with no main result link')
            continue
        title = tostring(a)
        src_url = a.get('href')
        if src_url in cache_url_map:
            cached_url = cache_url_map[src_url]
        else:
            try:
                c = div.xpath('descendant::*[@role="menuitem"]//a[@class="fl"]')[0]
            except IndexError:
                log('Ignoring {!r} as it has no cached page'.format(title))
                continue
            cached_url = c.get('href')
        ans.append(Result(a.get('href'), title, cached_url))
    if not ans:
        title = ' '.join(root.xpath('//title/text()'))
        log('Failed to find any results on results page, with title:', title)
    return ans


def google_search(terms, site=None, br=None, log=prints, safe_search=False, dump_raw=None, timeout=60):
    terms = [quote_term(google_term(t)) for t in terms]
    if site is not None:
        terms.append(quote_term(('site:' + site)))
    q = '+'.join(terms)
    url = 'https://www.google.com/search?q={q}'.format(q=q)
    log('Making google query: ' + url)
    br = br or browser()
    r = []
    root = query(br, url, 'google', dump_raw, timeout=timeout, save_raw=r.append)
    return google_parse_results(root, r[0], log=log), url


def google_develop(search_terms='1423146786', raw_from=''):
    if raw_from:
        with open(raw_from, 'rb') as f:
            raw = f.read()
        results = google_parse_results(parse_html(raw), raw)
    else:
        br = browser()
        results = google_search(search_terms.split(), 'www.amazon.com', dump_raw='/t/raw.html', br=br)[0]
    for result in results:
        if '/dp/' in result.url:
            print(result.title)
            print(' ', result.url)
            print(' ', result.cached_url)
            print()
# }}}


def resolve_url(url):
    prefix, rest = url.partition(':')[::2]
    if prefix == 'bing':
        return bing_url_processor(rest)
    if prefix == 'wayback':
        return wayback_url_processor(rest)
    return url
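

# An illustrative usage sketch for resolve_url() above, not part of the
# original module: cached URLs can carry an engine prefix so that engine
# specific post-processing is applied lazily. The example inputs are invented.
def resolve_url_develop():
    # wayback: relative snapshot links are reduced to the original URL
    print(resolve_url('wayback:/web/20190101000000/https://example.com'))
    # bing: the processor is currently a no-op, the URL passes through
    print(resolve_url('bing:https://cc.bingj.com/cache.aspx?d=1&w=2'))
    # no known engine prefix ('https' is not one): returned unchanged
    print(resolve_url('https://example.com'))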