1#!/usr/local/bin/python3.8 2# 3# Copyright © 2008 Henri Hakkinen 4# Copyright © 2015-2021 Arun Prakash Jana <engineerarun@gmail.com> 5# 6# This program is free software: you can redistribute it and/or modify 7# it under the terms of the GNU General Public License as published by 8# the Free Software Foundation, either version 3 of the License, or 9# (at your option) any later version. 10# 11# This program is distributed in the hope that it will be useful, 12# but WITHOUT ANY WARRANTY; without even the implied warranty of 13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14# GNU General Public License for more details. 15# 16# You should have received a copy of the GNU General Public License 17# along with this program. If not, see <http://www.gnu.org/licenses/>. 18 19import argparse 20import atexit 21import base64 22import collections 23import codecs 24import functools 25import gzip 26import html.entities 27import html.parser 28import http.client 29from http.client import HTTPSConnection 30import locale 31import logging 32import os 33import platform 34import shutil 35import signal 36import socket 37import ssl 38import subprocess 39from subprocess import Popen, PIPE, DEVNULL 40import sys 41import textwrap 42import unicodedata 43import urllib.parse 44import uuid 45import webbrowser 46 47# Python optional dependency compatibility layer 48try: 49 import readline 50except ImportError: 51 pass 52 53try: 54 import setproctitle 55 setproctitle.setproctitle('googler') 56except (ImportError, Exception): 57 pass 58 59from typing import ( 60 Any, 61 Dict, 62 Generator, 63 Iterable, 64 Iterator, 65 List, 66 Match, 67 Optional, 68 Sequence, 69 Tuple, 70 Union, 71 cast, 72) 73 74# Basic setup 75 76logging.basicConfig(format='[%(levelname)s] %(message)s') 77logger = logging.getLogger() 78 79 80def sigint_handler(signum, frame): 81 print('\nInterrupted.', file=sys.stderr) 82 sys.exit(1) 83 84try: 85 signal.signal(signal.SIGINT, sigint_handler) 86except ValueError: 87 # signal only works in main thread 88 pass 89 90 91# Constants 92 93_VERSION_ = '4.3.2' 94_EPOCH_ = '20210115' 95 96COLORMAP = {k: '\x1b[%sm' % v for k, v in { 97 'a': '30', 'b': '31', 'c': '32', 'd': '33', 98 'e': '34', 'f': '35', 'g': '36', 'h': '37', 99 'i': '90', 'j': '91', 'k': '92', 'l': '93', 100 'm': '94', 'n': '95', 'o': '96', 'p': '97', 101 'A': '30;1', 'B': '31;1', 'C': '32;1', 'D': '33;1', 102 'E': '34;1', 'F': '35;1', 'G': '36;1', 'H': '37;1', 103 'I': '90;1', 'J': '91;1', 'K': '92;1', 'L': '93;1', 104 'M': '94;1', 'N': '95;1', 'O': '96;1', 'P': '97;1', 105 'x': '0', 'X': '1', 'y': '7', 'Y': '7;1', 106}.items()} 107 108USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36' 109 110text_browsers = ['elinks', 'links', 'lynx', 'w3m', 'www-browser'] 111 112# Self-upgrade parameters 113# 114# Downstream packagers are recommended to turn off the entire self-upgrade 115# mechanism through 116# 117# make disable-self-upgrade 118# 119# before running `make install'. 120 121ENABLE_SELF_UPGRADE_MECHANISM = False 122API_REPO_BASE = 'https://api.github.com/repos/jarun/googler' 123RAW_DOWNLOAD_REPO_BASE = 'https://raw.githubusercontent.com/jarun/googler' 124 125debugger = False 126 127 128# Monkeypatch textwrap for CJK wide characters. 129 130def monkeypatch_textwrap_for_cjk(): 131 try: 132 if textwrap.wrap.patched: 133 return 134 except AttributeError: 135 pass 136 psl_textwrap_wrap = textwrap.wrap 137 138 def textwrap_wrap(text, width=70, **kwargs): 139 if width <= 2: 140 width = 2 141 # We first add a U+0000 after each East Asian Fullwidth or East 142 # Asian Wide character, then fill to width - 1 (so that if a NUL 143 # character ends up on a new line, we still have one last column 144 # to spare for the preceding wide character). Finally we strip 145 # all the NUL characters. 146 # 147 # East Asian Width: https://www.unicode.org/reports/tr11/ 148 return [ 149 line.replace('\0', '') 150 for line in psl_textwrap_wrap( 151 ''.join( 152 ch + '\0' if unicodedata.east_asian_width(ch) in ('F', 'W') else ch 153 for ch in unicodedata.normalize('NFC', text) 154 ), 155 width=width - 1, 156 **kwargs 157 ) 158 ] 159 160 def textwrap_fill(text, width=70, **kwargs): 161 return '\n'.join(textwrap_wrap(text, width=width, **kwargs)) 162 163 textwrap.wrap = textwrap_wrap 164 textwrap.fill = textwrap_fill 165 textwrap.wrap.patched = True 166 textwrap.fill.patched = True 167 168 169monkeypatch_textwrap_for_cjk() 170 171 172CoordinateType = Tuple[int, int] 173 174 175class TrackedTextwrap: 176 """ 177 Implements a text wrapper that tracks the position of each source 178 character, and can correctly insert zero-width sequences at given 179 offsets of the source text. 180 181 Wrapping result should be the same as that from PSL textwrap.wrap 182 with default settings except expand_tabs=False. 183 """ 184 185 def __init__(self, text: str, width: int): 186 self._original = text 187 188 # Do the job of replace_whitespace first so that we can easily 189 # match text to wrapped lines later. Note that this operation 190 # does not change text length or offsets. 191 whitespace = "\t\n\v\f\r " 192 whitespace_trans = str.maketrans(whitespace, " " * len(whitespace)) 193 text = text.translate(whitespace_trans) 194 195 self._lines = textwrap.wrap( 196 text, width, expand_tabs=False, replace_whitespace=False 197 ) 198 199 # self._coords track the (row, column) coordinate of each source 200 # character in the result text. It is indexed by offset in 201 # source text. 202 self._coords = [] # type: List[CoordinateType] 203 offset = 0 204 try: 205 if not self._lines: 206 # Source text only has whitespaces. We add an empty line 207 # in order to produce meaningful coordinates. 208 self._lines = [""] 209 for row, line in enumerate(self._lines): 210 assert text[offset : offset + len(line)] == line 211 col = 0 212 for _ in line: 213 self._coords.append((row, col)) 214 offset += 1 215 col += 1 216 # All subsequent dropped whitespaces map to the last, imaginary column 217 # (the EOL character if you wish) of the current line. 218 while offset < len(text) and text[offset] == " ": 219 self._coords.append((row, col)) 220 offset += 1 221 # One past the final character (think of it as EOF) should 222 # be treated as a valid offset. 223 self._coords.append((row, col)) 224 except AssertionError: 225 raise RuntimeError( 226 "TrackedTextwrap: the impossible happened at offset {} of text {!r}".format( 227 offset, self._original 228 ) 229 ) 230 231 # seq should be a zero-width sequence, e.g., an ANSI escape sequence. 232 # May raise IndexError if offset is out of bounds. 233 def insert_zero_width_sequence(self, seq: str, offset: int) -> None: 234 row, col = self._coords[offset] 235 line = self._lines[row] 236 self._lines[row] = line[:col] + seq + line[col:] 237 238 # Shift coordinates of all characters after the given character 239 # on the same line. 240 shift = len(seq) 241 offset += 1 242 while offset < len(self._coords) and self._coords[offset][0] == row: 243 _, col = self._coords[offset] 244 self._coords[offset] = (row, col + shift) 245 offset += 1 246 247 @property 248 def original(self) -> str: 249 return self._original 250 251 @property 252 def lines(self) -> List[str]: 253 return self._lines 254 255 @property 256 def wrapped(self) -> str: 257 return "\n".join(self._lines) 258 259 # May raise IndexError if offset is out of bounds. 260 def get_coordinate(self, offset: int) -> CoordinateType: 261 return self._coords[offset] 262 263 264### begin dim (DOM implementation with CSS support) ### 265### https://github.com/zmwangx/dim/blob/master/dim.py ### 266 267import html 268import re 269from collections import OrderedDict 270from enum import Enum 271from html.parser import HTMLParser 272 273 274SelectorGroupLike = Union[str, "SelectorGroup", "Selector"] 275 276 277class Node(object): 278 """ 279 Represents a DOM node. 280 281 Parts of JavaScript's DOM ``Node`` API and ``Element`` API are 282 mirrored here, with extensions. In particular, ``querySelector`` and 283 ``querySelectorAll`` are mirrored. 284 285 Notable properties and methods: :meth:`attr()`, :attr:`classes`, 286 :attr:`html`, :attr:`text`, :meth:`ancestors()`, 287 :meth:`descendants()`, :meth:`select()`, :meth:`select_all()`, 288 :meth:`matched_by()`, 289 290 Attributes: 291 tag (:class:`Optional`\\[:class:`str`]) 292 attrs (:class:`Dict`\\[:class:`str`, :class:`str`]) 293 parent (:class:`Optional`\\[:class:`Node`]) 294 children (:class:`List`\\[:class:`Node`]) 295 """ 296 297 # Meant to be reimplemented by subclasses. 298 def __init__(self) -> None: 299 self.tag = None # type: Optional[str] 300 self.attrs = {} # type: Dict[str, str] 301 self.parent = None # type: Optional[Node] 302 self.children = [] # type: List[Node] 303 304 # Used in DOMBuilder. 305 self._partial = False 306 self._namespace = None # type: Optional[str] 307 308 # HTML representation of the node. Meant to be implemented by 309 # subclasses. 310 def __str__(self) -> str: # pragma: no cover 311 raise NotImplementedError 312 313 def select(self, selector: SelectorGroupLike) -> Optional["Node"]: 314 """DOM ``querySelector`` clone. Returns one match (if any).""" 315 selector = self._normalize_selector(selector) 316 for node in self._select_all(selector): 317 return node 318 return None 319 320 def query_selector(self, selector: SelectorGroupLike) -> Optional["Node"]: 321 """Alias of :meth:`select`.""" 322 return self.select(selector) 323 324 def select_all(self, selector: SelectorGroupLike) -> List["Node"]: 325 """DOM ``querySelectorAll`` clone. Returns all matches in a list.""" 326 selector = self._normalize_selector(selector) 327 return list(self._select_all(selector)) 328 329 def query_selector_all(self, selector: SelectorGroupLike) -> List["Node"]: 330 """Alias of :meth:`select_all`.""" 331 return self.select_all(selector) 332 333 def matched_by( 334 self, selector: SelectorGroupLike, root: Optional["Node"] = None 335 ) -> bool: 336 """ 337 Checks whether this node is matched by `selector`. 338 339 See :meth:`SelectorGroup.matches()`. 340 """ 341 selector = self._normalize_selector(selector) 342 return selector.matches(self, root=root) 343 344 @staticmethod 345 def _normalize_selector(selector: SelectorGroupLike) -> "SelectorGroup": 346 if isinstance(selector, str): 347 return SelectorGroup.from_str(selector) 348 if isinstance(selector, SelectorGroup): 349 return selector 350 if isinstance(selector, Selector): 351 return SelectorGroup([selector]) 352 raise ValueError("not a selector or group of selectors: %s" % repr(selector)) 353 354 def _select_all(self, selector: "SelectorGroup") -> Generator["Node", None, None]: 355 for descendant in self.descendants(): 356 if selector.matches(descendant, root=self): 357 yield descendant 358 359 def child_nodes(self) -> List["Node"]: 360 return self.children 361 362 def first_child(self) -> Optional["Node"]: 363 if self.children: 364 return self.children[0] 365 else: 366 return None 367 368 def first_element_child(self) -> Optional["Node"]: 369 for child in self.children: 370 if isinstance(child, ElementNode): 371 return child 372 return None 373 374 def last_child(self) -> Optional["Node"]: 375 if self.children: 376 return self.children[-1] 377 else: 378 return None 379 380 def last_element_child(self) -> Optional["Node"]: 381 for child in reversed(self.children): 382 if isinstance(child, ElementNode): 383 return child 384 return None 385 386 def next_sibling(self) -> Optional["Node"]: 387 """.. note:: Not O(1), use with caution.""" 388 next_siblings = self.next_siblings() 389 if next_siblings: 390 return next_siblings[0] 391 else: 392 return None 393 394 def next_siblings(self) -> List["Node"]: 395 parent = self.parent 396 if not parent: 397 return [] 398 try: 399 index = parent.children.index(self) 400 return parent.children[index + 1 :] 401 except ValueError: # pragma: no cover 402 raise ValueError("node is not found in children of its parent") 403 404 def next_element_sibling(self) -> Optional["ElementNode"]: 405 """.. note:: Not O(1), use with caution.""" 406 for sibling in self.next_siblings(): 407 if isinstance(sibling, ElementNode): 408 return sibling 409 return None 410 411 def previous_sibling(self) -> Optional["Node"]: 412 """.. note:: Not O(1), use with caution.""" 413 previous_siblings = self.previous_siblings() 414 if previous_siblings: 415 return previous_siblings[0] 416 else: 417 return None 418 419 def previous_siblings(self) -> List["Node"]: 420 """ 421 Compared to the natural DOM order, the order of returned nodes 422 are reversed. That is, the adjacent sibling (if any) is the 423 first in the returned list. 424 """ 425 parent = self.parent 426 if not parent: 427 return [] 428 try: 429 index = parent.children.index(self) 430 if index > 0: 431 return parent.children[index - 1 :: -1] 432 else: 433 return [] 434 except ValueError: # pragma: no cover 435 raise ValueError("node is not found in children of its parent") 436 437 def previous_element_sibling(self) -> Optional["ElementNode"]: 438 """.. note:: Not O(1), use with caution.""" 439 for sibling in self.previous_siblings(): 440 if isinstance(sibling, ElementNode): 441 return sibling 442 return None 443 444 def ancestors( 445 self, *, root: Optional["Node"] = None 446 ) -> Generator["Node", None, None]: 447 """ 448 Ancestors are generated in reverse order of depth, stopping at 449 `root`. 450 451 A :class:`RuntimeException` is raised if `root` is not in the 452 ancestral chain. 453 """ 454 if self is root: 455 return 456 ancestor = self.parent 457 while ancestor is not root: 458 if ancestor is None: 459 raise RuntimeError("provided root node not found in ancestral chain") 460 yield ancestor 461 ancestor = ancestor.parent 462 if root: 463 yield root 464 465 def descendants(self) -> Generator["Node", None, None]: 466 """Descendants are generated in depth-first order.""" 467 for child in self.children: 468 yield child 469 yield from child.descendants() 470 471 def attr(self, attr: str) -> Optional[str]: 472 """Returns the attribute if it exists on the node, otherwise ``None``.""" 473 return self.attrs.get(attr) 474 475 @property 476 def html(self) -> str: 477 """ 478 HTML representation of the node. 479 480 (For a :class:`TextNode`, :meth:`html` returns the escaped version of the 481 text. 482 """ 483 return str(self) 484 485 def outer_html(self) -> str: 486 """Alias of :attr:`html`.""" 487 return self.html 488 489 def inner_html(self) -> str: 490 """HTML representation of the node's children.""" 491 return "".join(child.html for child in self.children) 492 493 @property 494 def text(self) -> str: # pragma: no cover 495 """This property is expected to be implemented by subclasses.""" 496 raise NotImplementedError 497 498 def text_content(self) -> str: 499 """Alias of :attr:`text`.""" 500 return self.text 501 502 @property 503 def classes(self) -> List[str]: 504 return self.attrs.get("class", "").split() 505 506 def class_list(self) -> List[str]: 507 return self.classes 508 509 510class ElementNode(Node): 511 """ 512 Represents an element node. 513 514 Note that tag and attribute names are case-insensitive; attribute 515 values are case-sensitive. 516 """ 517 518 def __init__( 519 self, 520 tag: str, 521 attrs: Iterable[Tuple[str, Optional[str]]], 522 *, 523 parent: Optional["Node"] = None, 524 children: Optional[Sequence["Node"]] = None 525 ) -> None: 526 Node.__init__(self) 527 self.tag = tag.lower() # type: str 528 self.attrs = OrderedDict((attr.lower(), val or "") for attr, val in attrs) 529 self.parent = parent 530 self.children = list(children or []) 531 532 def __repr__(self) -> str: 533 s = "<" + self.tag 534 if self.attrs: 535 s += " attrs=%s" % repr(list(self.attrs.items())) 536 if self.children: 537 s += " children=%s" % repr(self.children) 538 s += ">" 539 return s 540 541 # https://ipython.readthedocs.io/en/stable/api/generated/IPython.lib.pretty.html 542 def _repr_pretty_(self, p: Any, cycle: bool) -> None: # pragma: no cover 543 if cycle: 544 raise RuntimeError("cycle detected in DOM tree") 545 p.text("<\x1b[1m%s\x1b[0m" % self.tag) 546 if self.attrs: 547 p.text(" attrs=%s" % repr(list(self.attrs.items()))) 548 if self.children: 549 p.text(" children=[") 550 if len(self.children) == 1 and isinstance(self.first_child(), TextNode): 551 p.text("\x1b[4m%s\x1b[0m" % repr(self.first_child())) 552 else: 553 with p.indent(2): 554 for child in self.children: 555 p.break_() 556 if hasattr(child, "_repr_pretty_"): 557 child._repr_pretty_(p, False) # type: ignore 558 else: 559 p.text("\x1b[4m%s\x1b[0m" % repr(child)) 560 p.text(",") 561 p.break_() 562 p.text("]") 563 p.text(">") 564 565 def __str__(self) -> str: 566 """HTML representation of the node.""" 567 s = "<" + self.tag 568 for attr, val in self.attrs.items(): 569 s += ' %s="%s"' % (attr, html.escape(val)) 570 if self.children: 571 s += ">" 572 s += "".join(str(child) for child in self.children) 573 s += "</%s>" % self.tag 574 else: 575 if _tag_is_void(self.tag): 576 s += "/>" 577 else: 578 s += "></%s>" % self.tag 579 return s 580 581 @property 582 def text(self) -> str: 583 """The concatenation of all descendant text nodes.""" 584 return "".join(child.text for child in self.children) 585 586 587class TextNode(str, Node): 588 """ 589 Represents a text node. 590 591 Subclasses :class:`Node` and :class:`str`. 592 """ 593 594 def __new__(cls, text: str) -> "TextNode": 595 s = str.__new__(cls, text) # type: ignore 596 s.parent = None 597 return s # type: ignore 598 599 def __init__(self, text: str) -> None: 600 Node.__init__(self) 601 602 def __repr__(self) -> str: 603 return "<%s>" % str.__repr__(self) 604 605 # HTML-escaped form of the text node. use text() for unescaped 606 # version. 607 def __str__(self) -> str: 608 return html.escape(self) 609 610 def __eq__(self, other: object) -> bool: 611 """ 612 Two text nodes are equal if and only if they are the same node. 613 614 For string comparison, use :attr:`text`. 615 """ 616 return self is other 617 618 def __ne__(self, other: object) -> bool: 619 """ 620 Two text nodes are non-equal if they are not the same node. 621 622 For string comparison, use :attr:`text`. 623 """ 624 return self is not other 625 626 @property 627 def text(self) -> str: 628 return str.__str__(self) 629 630 631class DOMBuilderException(Exception): 632 """ 633 Exception raised when :class:`DOMBuilder` detects a bad state. 634 635 Attributes: 636 pos (:class:`Tuple`\\[:class:`int`, :class:`int`]): 637 Line number and offset in HTML input. 638 why (:class:`str`): 639 Reason of the exception. 640 """ 641 642 def __init__(self, pos: Tuple[int, int], why: str) -> None: 643 self.pos = pos 644 self.why = why 645 646 def __str__(self) -> str: # pragma: no cover 647 return "DOM builder aborted at %d:%d: %s" % (self.pos[0], self.pos[1], self.why) 648 649 650class DOMBuilder(HTMLParser): 651 """ 652 HTML parser / DOM builder. 653 654 Subclasses :class:`html.parser.HTMLParser`. 655 656 Consume HTML and builds a :class:`Node` tree. Once finished, use 657 :attr:`root` to access the root of the tree. 658 659 This parser cannot parse malformed HTML with tag mismatch. 660 """ 661 662 def __init__(self) -> None: 663 super().__init__(convert_charrefs=True) 664 # _stack is the stack for nodes. Each node is pushed to the 665 # stack when its start tag is processed, and remains on the 666 # stack until its parent node is completed (end tag processed), 667 # at which point the node is attached to the parent node as a 668 # child and popped from the stack. 669 self._stack = [] # type: List[Node] 670 # _namespace_stack is another stack tracking the parsing 671 # context, which is generally the default namespace (None) but 672 # changes when parsing foreign objects (e.g. 'svg' when parsing 673 # an <svg>). The top element is always the current parsing 674 # context, so popping works differently from _stack: an element 675 # is popped as soon as the corresponding end tag is processed. 676 self._namespace_stack = [None] # type: List[Optional[str]] 677 678 def handle_starttag( 679 self, tag: str, attrs: Sequence[Tuple[str, Optional[str]]] 680 ) -> None: 681 node = ElementNode(tag, attrs) 682 node._partial = True 683 self._stack.append(node) 684 namespace = ( 685 tag.lower() 686 if _tag_encloses_foreign_namespace(tag) 687 else self._namespace_stack[-1] # Inherit parent namespace 688 ) 689 node._namespace = namespace 690 self._namespace_stack.append(namespace) 691 # For void elements (not in a foreign context), immediately 692 # invoke the end tag handler (see handle_startendtag()). 693 if not namespace and _tag_is_void(tag): 694 self.handle_endtag(tag) 695 696 def handle_endtag(self, tag: str) -> None: 697 tag = tag.lower() 698 children = [] 699 while self._stack and not self._stack[-1]._partial: 700 children.append(self._stack.pop()) 701 if not self._stack: 702 raise DOMBuilderException(self.getpos(), "extra end tag: %s" % repr(tag)) 703 parent = self._stack[-1] 704 if parent.tag != tag: 705 raise DOMBuilderException( 706 self.getpos(), 707 "expecting end tag %s, got %s" % (repr(parent.tag), repr(tag)), 708 ) 709 parent.children = list(reversed(children)) 710 parent._partial = False 711 for child in children: 712 child.parent = parent 713 self._namespace_stack.pop() 714 715 # Make parser behavior for explicitly and implicitly void elements 716 # (e.g., <hr> vs <hr/>) consistent. The former triggers 717 # handle_starttag only, whereas the latter triggers 718 # handle_startendtag (which by default triggers both handle_starttag 719 # and handle_endtag). See https://bugs.python.org/issue25258. 720 # 721 # An exception is foreign elements, which aren't considered void 722 # elements but can be explicitly marked as self-closing according to 723 # the HTML spec (e.g. <path/> is valid but <path> is not). 724 # Therefore, both handle_starttag and handle_endtag must be called, 725 # and handle_endtag should not be triggered from within 726 # handle_starttag in that case. 727 # 728 # Note that for simplicity we do not check whether the foreign 729 # element in question is allowed to be self-closing by spec. (The 730 # SVG spec unfortunately doesn't provide a readily available list of 731 # such elements.) 732 # 733 # https://html.spec.whatwg.org/multipage/syntax.html#foreign-elements 734 def handle_startendtag( 735 self, tag: str, attrs: Sequence[Tuple[str, Optional[str]]] 736 ) -> None: 737 if self._namespace_stack[-1] or _tag_encloses_foreign_namespace(tag): 738 self.handle_starttag(tag, attrs) 739 self.handle_endtag(tag) 740 else: 741 self.handle_starttag(tag, attrs) 742 743 def handle_data(self, text: str) -> None: 744 if not self._stack: 745 # Ignore text nodes before the first tag. 746 return 747 self._stack.append(TextNode(text)) 748 749 @property 750 def root(self) -> "Node": 751 """ 752 Finishes processing and returns the root node. 753 754 Raises :class:`DOMBuilderException` if there is no root tag or 755 root tag is not closed yet. 756 """ 757 if not self._stack: 758 raise DOMBuilderException(self.getpos(), "no root tag") 759 if self._stack[0]._partial: 760 raise DOMBuilderException(self.getpos(), "root tag not closed yet") 761 return self._stack[0] 762 763 764def parse_html(html: str, *, ParserClass: type = DOMBuilder) -> "Node": 765 """ 766 Parses HTML string, builds DOM, and returns root node. 767 768 The parser may raise :class:`DOMBuilderException`. 769 770 Args: 771 html: input HTML string 772 ParserClass: :class:`DOMBuilder` or a subclass 773 774 Returns: 775 Root note of the parsed tree. If the HTML string contains 776 multiple top-level elements, only the first is returned and the 777 rest are lost. 778 """ 779 builder = ParserClass() # type: DOMBuilder 780 builder.feed(html) 781 builder.close() 782 return builder.root 783 784 785class SelectorParserException(Exception): 786 """ 787 Exception raised when the selector parser fails to parse an input. 788 789 Attributes: 790 s (:class:`str`): 791 The input string to be parsed. 792 cursor (:class:`int`): 793 Cursor position where the failure occurred. 794 why (:class:`str`): 795 Reason of the failure. 796 """ 797 798 def __init__(self, s: str, cursor: int, why: str) -> None: 799 self.s = s 800 self.cursor = cursor 801 self.why = why 802 803 def __str__(self) -> str: # pragma: no cover 804 return "selector parser aborted at character %d of %s: %s" % ( 805 self.cursor, 806 repr(self.s), 807 self.why, 808 ) 809 810 811class SelectorGroup: 812 """ 813 Represents a group of CSS selectors. 814 815 A group of CSS selectors is simply a comma-separated list of 816 selectors. [#]_ See :class:`Selector` documentation for the scope of 817 support. 818 819 Typically, a :class:`SelectorGroup` is constructed from a string 820 (e.g., ``th.center, td.center``) using the factory function 821 :meth:`from_str`. 822 823 .. [#] https://www.w3.org/TR/selectors-3/#grouping 824 """ 825 826 def __init__(self, selectors: Iterable["Selector"]) -> None: 827 self._selectors = list(selectors) 828 829 def __repr__(self) -> str: 830 return "<SelectorGroup %s>" % repr(str(self)) 831 832 def __str__(self) -> str: 833 return ", ".join(str(selector) for selector in self._selectors) 834 835 def __len__(self) -> int: 836 return len(self._selectors) 837 838 def __getitem__(self, index: int) -> "Selector": 839 return self._selectors[index] 840 841 def __iter__(self) -> Iterator["Selector"]: 842 return iter(self._selectors) 843 844 @classmethod 845 def from_str(cls, s: str) -> "SelectorGroup": 846 """ 847 Parses input string into a group of selectors. 848 849 :class:`SelectorParserException` is raised on invalid input. See 850 :class:`Selector` documentation for the scope of support. 851 852 Args: 853 s: input string 854 855 Returns: 856 Parsed group of selectors. 857 """ 858 i = 0 859 selectors = [] 860 while i < len(s): 861 selector, i = Selector.from_str(s, i) 862 selectors.append(selector) 863 if not selectors: 864 raise SelectorParserException(s, i, "selector group is empty") 865 return cls(selectors) 866 867 def matches(self, node: "Node", root: Optional["Node"] = None) -> bool: 868 """ 869 Decides whether the group of selectors matches `node`. 870 871 The group of selectors matches `node` as long as one of the 872 selectors matches `node`. 873 874 If `root` is provided and child and/or descendant combinators 875 are involved, parent/ancestor lookup terminates at `root`. 876 """ 877 return any(selector.matches(node, root=root) for selector in self) 878 879 880class Selector: 881 """ 882 Represents a CSS selector. 883 884 Recall that a CSS selector is a chain of one or more *sequences of 885 simple selectors* separated by *combinators*. [#selectors-3]_ This 886 concept is represented as a cons list of sequences of simple 887 selectors (in right to left order). This class in fact holds a 888 single sequence, with an optional combinator and reference to the 889 previous sequence. 890 891 For instance, ``main#main p.important.definition > 892 a.term[id][href]`` would be parsed into (schematically) the 893 following structure:: 894 895 ">" tag='a' classes=('term') attrs=([id], [href]) ~> 896 " " tag='p' classes=('important', 'definition') ~> 897 tag='main' id='main' 898 899 Each line is held in a separate instance of :class:`Selector`, 900 linked together by the :attr:`previous` attribute. 901 902 Supported grammar (from selectors level 3 [#selectors-3]_): 903 904 - Type selectors; 905 - Universal selectors; 906 - Class selectors; 907 - ID selectors; 908 - Attribute selectors; 909 - Combinators. 910 911 Unsupported grammar: 912 913 - Pseudo-classes; 914 - Pseudo-elements; 915 - Namespace prefixes (``ns|``, ``*|``, ``|``) in any part of any 916 selector. 917 918 Rationale: 919 920 - Pseudo-classes have too many variants, a few of which even 921 complete with an admittedly not-so-complex minilanguage. These add 922 up to a lot of code. 923 - Pseudo-elements are useless outside rendering contexts, hence out of 924 scope. 925 - Namespace support is too niche to be worth the parsing headache. 926 *Using namespace prefixes may confuse the parser!* 927 928 Note that the parser only loosely follows the spec and priotizes 929 ease of parsing (which includes readability and *writability* of 930 regexes), so some invalid selectors may be accepted (in fact, false 931 positives abound, but accepting valid inputs is a much more 932 important goal than rejecting invalid inputs for this library), and 933 some valid selectors may be rejected (but as long as you stick to 934 the scope outlined above and common sense you should be fine; the 935 false negatives shouldn't be used by actual human beings anyway). 936 937 In particular, whitespace character is simplified to ``\\s`` (ASCII 938 mode) despite CSS spec not counting U+000B (VT) as whitespace, 939 identifiers are simplified to ``[\\w-]+`` (ASCII mode), and strings 940 (attribute selector values can be either identifiers or strings) 941 allow escaped quotes (i.e., ``\\'`` inside single-quoted strings and 942 ``\\"`` inside double-quoted strings) but everything else is 943 interpreted literally. The exact specs for CSS identifiers and 944 strings can be found at [#]_. 945 946 Certain selectors and combinators may be implemented in the parser 947 but not implemented in matching and/or selection APIs. 948 949 .. [#selectors-3] https://www.w3.org/TR/selectors-3/ 950 .. [#] https://www.w3.org/TR/CSS21/syndata.html 951 952 Attributes: 953 tag (:class:`Optional`\\[:class:`str`]): 954 Type selector. 955 classes (:class:`List`\\[:class:`str`]): 956 Class selectors. 957 id (:class:`Optional`\\[:class:`str`]): 958 ID selector. 959 attrs (:class:`List`\\[:class:`AttributeSelector`]): 960 Attribute selectors. 961 combinator (:class:`Optional`\\[:class:`Combinator`]): 962 Combinator with the previous sequence of simple selectors in 963 chain. 964 previous (:class:`Optional`\\[:class:`Selector`]): 965 Reference to the previous sequence of simple selectors in 966 chain. 967 968 """ 969 970 def __init__( 971 self, 972 *, 973 tag: Optional[str] = None, 974 classes: Optional[Sequence[str]] = None, 975 id: Optional[str] = None, 976 attrs: Optional[Sequence["AttributeSelector"]] = None, 977 combinator: Optional["Combinator"] = None, 978 previous: Optional["Selector"] = None 979 ) -> None: 980 self.tag = tag.lower() if tag else None 981 self.classes = list(classes or []) 982 self.id = id 983 self.attrs = list(attrs or []) 984 self.combinator = combinator 985 self.previous = previous 986 987 def __repr__(self) -> str: 988 return "<Selector %s>" % repr(str(self)) 989 990 def __str__(self) -> str: 991 sequences = [] 992 delimiters = [] 993 seq = self 994 while True: 995 sequences.append(seq._sequence_str_()) 996 if seq.previous: 997 if seq.combinator == Combinator.DESCENDANT: 998 delimiters.append(" ") 999 elif seq.combinator == Combinator.CHILD: 1000 delimiters.append(" > ") 1001 elif seq.combinator == Combinator.NEXT_SIBLING: 1002 delimiters.append(" + ") 1003 elif seq.combinator == Combinator.SUBSEQUENT_SIBLING: 1004 delimiters.append(" ~ ") 1005 else: # pragma: no cover 1006 raise RuntimeError( 1007 "unimplemented combinator: %s" % repr(self.combinator) 1008 ) 1009 seq = seq.previous 1010 else: 1011 delimiters.append("") 1012 break 1013 return "".join( 1014 delimiter + sequence 1015 for delimiter, sequence in zip(reversed(delimiters), reversed(sequences)) 1016 ) 1017 1018 # Format a single sequence of simple selectors, without combinator. 1019 def _sequence_str_(self) -> str: 1020 s = "" 1021 if self.tag: 1022 s += self.tag 1023 if self.classes: 1024 s += "".join(".%s" % class_ for class_ in self.classes) 1025 if self.id: 1026 s += "#%s" % self.id 1027 if self.attrs: 1028 s += "".join(str(attr) for attr in self.attrs) 1029 return s if s else "*" 1030 1031 @classmethod 1032 def from_str(cls, s: str, cursor: int = 0) -> Tuple["Selector", int]: 1033 """ 1034 Parses input string into selector. 1035 1036 This factory function only parses out one selector (up to a 1037 comma or EOS), so partial consumption is allowed --- an optional 1038 `cursor` is taken as input (0 by default) and the moved cursor 1039 (either after the comma or at EOS) is returned as part of the 1040 output. 1041 1042 :class:`SelectorParserException` is raised on invalid input. See 1043 :class:`Selector` documentation for the scope of support. 1044 1045 If you need to completely consume a string representing 1046 (potentially) a group of selectors, use 1047 :meth:`SelectorGroup.from_str()`. 1048 1049 Args: 1050 s: input string 1051 cursor: initial cursor position on `s` 1052 1053 Returns: 1054 A tuple containing the parsed selector and the moved the 1055 cursor (either after a comma-delimiter, or at EOS). 1056 """ 1057 # Simple selectors. 1058 TYPE_SEL = re.compile(r"[\w-]+", re.A) 1059 UNIVERSAL_SEL = re.compile(r"\*") 1060 ATTR_SEL = re.compile( 1061 r"""\[ 1062 \s*(?P<attr>[\w-]+)\s* 1063 ( 1064 (?P<op>[~|^$*]?=)\s* 1065 ( 1066 (?P<val_identifier>[\w-]+)| 1067 (?P<val_string> 1068 (?P<quote>['"]) 1069 (?P<val_string_inner>.*?) 1070 (?<!\\)(?P=quote) 1071 ) 1072 )\s* 1073 )? 1074 \]""", 1075 re.A | re.X, 1076 ) 1077 CLASS_SEL = re.compile(r"\.([\w-]+)", re.A) 1078 ID_SEL = re.compile(r"#([\w-]+)", re.A) 1079 PSEUDO_CLASS_SEL = re.compile(r":[\w-]+(\([^)]+\))?", re.A) 1080 PSEUDO_ELEM_SEL = re.compile(r"::[\w-]+", re.A) 1081 1082 # Combinators 1083 DESCENDANT_COM = re.compile(r"\s+") 1084 CHILD_COM = re.compile(r"\s*>\s*") 1085 NEXT_SIB_COM = re.compile(r"\s*\+\s*") 1086 SUB_SIB_COM = re.compile(r"\s*~\s*") 1087 1088 # Misc 1089 WHITESPACE = re.compile(r"\s*") 1090 END_OF_SELECTOR = re.compile(r"\s*($|,)") 1091 1092 tag = None 1093 classes = [] 1094 id = None 1095 attrs = [] 1096 combinator = None 1097 1098 selector = None 1099 previous_combinator = None 1100 1101 i = cursor 1102 1103 # Skip leading whitespace 1104 m = WHITESPACE.match(s, i) 1105 if m: 1106 i = m.end() 1107 1108 while i < len(s): 1109 # Parse one simple selector. 1110 # 1111 # PEP 572 (assignment expressions; the one that burned Guido 1112 # so much that he resigned as BDFL) would have been nice; it 1113 # would have saved us from all the regex match 1114 # reassignments, and worse still, the casts, since mypy 1115 # complains about getting Optional[Match[str]] instead of 1116 # Match[str]. 1117 if TYPE_SEL.match(s, i): 1118 if tag: 1119 raise SelectorParserException(s, i, "multiple type selectors found") 1120 m = cast(Match[str], TYPE_SEL.match(s, i)) 1121 tag = m.group() 1122 elif UNIVERSAL_SEL.match(s, i): 1123 m = cast(Match[str], UNIVERSAL_SEL.match(s, i)) 1124 elif ATTR_SEL.match(s, i): 1125 m = cast(Match[str], ATTR_SEL.match(s, i)) 1126 1127 attr = m.group("attr") 1128 op = m.group("op") 1129 val_identifier = m.group("val_identifier") 1130 quote = m.group("quote") 1131 val_string_inner = m.group("val_string_inner") 1132 if val_identifier is not None: 1133 val = val_identifier 1134 elif val_string_inner is not None: 1135 val = val_string_inner.replace("\\" + quote, quote) 1136 else: 1137 val = None 1138 1139 if op is None: 1140 type = AttributeSelectorType.BARE 1141 elif op == "=": 1142 type = AttributeSelectorType.EQUAL 1143 elif op == "~=": 1144 type = AttributeSelectorType.TILDE 1145 elif op == "|=": 1146 type = AttributeSelectorType.PIPE 1147 elif op == "^=": 1148 type = AttributeSelectorType.CARET 1149 elif op == "$=": 1150 type = AttributeSelectorType.DOLLAR 1151 elif op == "*=": 1152 type = AttributeSelectorType.ASTERISK 1153 else: # pragma: no cover 1154 raise SelectorParserException( 1155 s, 1156 i, 1157 "unrecognized operator %s in attribute selector" % repr(op), 1158 ) 1159 1160 attrs.append(AttributeSelector(attr, val, type)) 1161 elif CLASS_SEL.match(s, i): 1162 m = cast(Match[str], CLASS_SEL.match(s, i)) 1163 classes.append(m.group(1)) 1164 elif ID_SEL.match(s, i): 1165 if id: 1166 raise SelectorParserException(s, i, "multiple id selectors found") 1167 m = cast(Match[str], ID_SEL.match(s, i)) 1168 id = m.group(1) 1169 elif PSEUDO_CLASS_SEL.match(s, i): 1170 raise SelectorParserException(s, i, "pseudo-classes not supported") 1171 elif PSEUDO_ELEM_SEL.match(s, i): 1172 raise SelectorParserException(s, i, "pseudo-elements not supported") 1173 else: 1174 raise SelectorParserException( 1175 s, i, "expecting simple selector, found none" 1176 ) 1177 i = m.end() 1178 1179 # Try to parse a combinator, or end the selector. 1180 if CHILD_COM.match(s, i): 1181 m = cast(Match[str], CHILD_COM.match(s, i)) 1182 combinator = Combinator.CHILD 1183 elif NEXT_SIB_COM.match(s, i): 1184 m = cast(Match[str], NEXT_SIB_COM.match(s, i)) 1185 combinator = Combinator.NEXT_SIBLING 1186 elif SUB_SIB_COM.match(s, i): 1187 m = cast(Match[str], SUB_SIB_COM.match(s, i)) 1188 combinator = Combinator.SUBSEQUENT_SIBLING 1189 elif END_OF_SELECTOR.match(s, i): 1190 m = cast(Match[str], END_OF_SELECTOR.match(s, i)) 1191 combinator = None 1192 # Need to parse descendant combinator at the very end 1193 # because it could be a prefix to all previous cases. 1194 elif DESCENDANT_COM.match(s, i): 1195 m = cast(Match[str], DESCENDANT_COM.match(s, i)) 1196 combinator = Combinator.DESCENDANT 1197 else: 1198 continue 1199 i = m.end() 1200 1201 if combinator and i == len(s): 1202 raise SelectorParserException(s, i, "unexpected end at combinator") 1203 1204 selector = cls( 1205 tag=tag, 1206 classes=classes, 1207 id=id, 1208 attrs=attrs, 1209 combinator=previous_combinator, 1210 previous=selector, 1211 ) 1212 previous_combinator = combinator 1213 1214 # End of selector. 1215 if combinator is None: 1216 break 1217 1218 tag = None 1219 classes = [] 1220 id = None 1221 attrs = [] 1222 combinator = None 1223 1224 if not selector: 1225 raise SelectorParserException(s, i, "selector is empty") 1226 1227 return selector, i 1228 1229 def matches(self, node: "Node", root: Optional["Node"] = None) -> bool: 1230 """ 1231 Decides whether the selector matches `node`. 1232 1233 Each sequence of simple selectors in the selector's chain must 1234 be matched for a positive. 1235 1236 If `root` is provided and child and/or descendant combinators 1237 are involved, parent/ancestor lookup terminates at `root`. 1238 """ 1239 if self.tag: 1240 if not node.tag or node.tag != self.tag: 1241 return False 1242 if self.id: 1243 if node.attrs.get("id") != self.id: 1244 return False 1245 if self.classes: 1246 classes = node.classes 1247 for class_ in self.classes: 1248 if class_ not in classes: 1249 return False 1250 if self.attrs: 1251 for attr_selector in self.attrs: 1252 if not attr_selector.matches(node): 1253 return False 1254 1255 if not self.previous: 1256 return True 1257 1258 if self.combinator == Combinator.DESCENDANT: 1259 return any( 1260 self.previous.matches(ancestor, root=root) 1261 for ancestor in node.ancestors() 1262 ) 1263 elif self.combinator == Combinator.CHILD: 1264 if node is root or node.parent is None: 1265 return False 1266 else: 1267 return self.previous.matches(node.parent) 1268 elif self.combinator == Combinator.NEXT_SIBLING: 1269 sibling = node.previous_element_sibling() 1270 if not sibling: 1271 return False 1272 else: 1273 return self.previous.matches(sibling) 1274 elif self.combinator == Combinator.SUBSEQUENT_SIBLING: 1275 return any( 1276 self.previous.matches(sibling, root=root) 1277 for sibling in node.previous_siblings() 1278 if isinstance(sibling, ElementNode) 1279 ) 1280 else: # pragma: no cover 1281 raise RuntimeError("unimplemented combinator: %s" % repr(self.combinator)) 1282 1283 1284class AttributeSelector: 1285 """ 1286 Represents an attribute selector. 1287 1288 Attributes: 1289 attr (:class:`str`) 1290 val (:class:`Optional`\\[:class:`str`]) 1291 type (:class:`AttributeSelectorType`) 1292 """ 1293 1294 def __init__( 1295 self, attr: str, val: Optional[str], type: "AttributeSelectorType" 1296 ) -> None: 1297 self.attr = attr.lower() 1298 self.val = val 1299 self.type = type 1300 1301 def __repr__(self) -> str: 1302 return "<AttributeSelector %s>" % repr(str(self)) 1303 1304 def __str__(self) -> str: 1305 if self.type == AttributeSelectorType.BARE: 1306 fmt = "[{attr}{val:.0}]" 1307 elif self.type == AttributeSelectorType.EQUAL: 1308 fmt = "[{attr}={val}]" 1309 elif self.type == AttributeSelectorType.TILDE: 1310 fmt = "[{attr}~={val}]" 1311 elif self.type == AttributeSelectorType.PIPE: 1312 fmt = "[{attr}|={val}]" 1313 elif self.type == AttributeSelectorType.CARET: 1314 fmt = "[{attr}^={val}]" 1315 elif self.type == AttributeSelectorType.DOLLAR: 1316 fmt = "[{attr}$={val}]" 1317 elif self.type == AttributeSelectorType.ASTERISK: 1318 fmt = "[{attr}*={val}]" 1319 return fmt.format(attr=self.attr, val=repr(self.val)) 1320 1321 def matches(self, node: "Node") -> bool: 1322 val = node.attrs.get(self.attr) 1323 if val is None: 1324 return False 1325 if self.type == AttributeSelectorType.BARE: 1326 return True 1327 elif self.type == AttributeSelectorType.EQUAL: 1328 return val == self.val 1329 elif self.type == AttributeSelectorType.TILDE: 1330 return self.val in val.split() 1331 elif self.type == AttributeSelectorType.PIPE: 1332 return val == self.val or val.startswith("%s-" % self.val) 1333 elif self.type == AttributeSelectorType.CARET: 1334 return bool(self.val and val.startswith(self.val)) 1335 elif self.type == AttributeSelectorType.DOLLAR: 1336 return bool(self.val and val.endswith(self.val)) 1337 elif self.type == AttributeSelectorType.ASTERISK: 1338 return bool(self.val and self.val in val) 1339 else: # pragma: no cover 1340 raise RuntimeError("unimplemented attribute selector: %s" % repr(self.type)) 1341 1342 1343# Enum: basis for poor man's algebraic data type. 1344class AttributeSelectorType(Enum): 1345 """ 1346 Attribute selector types. 1347 1348 Members correspond to the following forms of attribute selector: 1349 1350 - :attr:`BARE`: ``[attr]``; 1351 - :attr:`EQUAL`: ``[attr=val]``; 1352 - :attr:`TILDE`: ``[attr~=val]``; 1353 - :attr:`PIPE`: ``[attr|=val]``; 1354 - :attr:`CARET`: ``[attr^=val]``; 1355 - :attr:`DOLLAR`: ``[attr$=val]``; 1356 - :attr:`ASTERISK`: ``[attr*=val]``. 1357 """ 1358 1359 # [attr] 1360 BARE = 1 1361 # [attr=val] 1362 EQUAL = 2 1363 # [attr~=val] 1364 TILDE = 3 1365 # [attr|=val] 1366 PIPE = 4 1367 # [attr^=val] 1368 CARET = 5 1369 # [attr$=val] 1370 DOLLAR = 6 1371 # [attr*=val] 1372 ASTERISK = 7 1373 1374 1375class Combinator(Enum): 1376 """ 1377 Combinator types. 1378 1379 Members correspond to the following combinators: 1380 1381 - :attr:`DESCENDANT`: ``A B``; 1382 - :attr:`CHILD`: ``A > B``; 1383 - :attr:`NEXT_SIBLING`: ``A + B``; 1384 - :attr:`SUBSEQUENT_SIBLING`: ``A ~ B``. 1385 """ 1386 1387 # ' ' 1388 DESCENDANT = 1 1389 # > 1390 CHILD = 2 1391 # + 1392 NEXT_SIBLING = 3 1393 # ~ 1394 SUBSEQUENT_SIBLING = 4 1395 1396 1397def _tag_is_void(tag: str) -> bool: 1398 """ 1399 Checks whether the tag corresponds to a void element. 1400 1401 https://www.w3.org/TR/html5/syntax.html#void-elements 1402 https://html.spec.whatwg.org/multipage/syntax.html#void-elements 1403 """ 1404 return tag.lower() in ( 1405 "area", 1406 "base", 1407 "br", 1408 "col", 1409 "embed", 1410 "hr", 1411 "img", 1412 "input", 1413 "link", 1414 "meta", 1415 "param", 1416 "source", 1417 "track", 1418 "wbr", 1419 ) 1420 1421 1422def _tag_encloses_foreign_namespace(tag: str) -> bool: 1423 """ 1424 Checks whether the tag encloses a foreign namespace (MathML or SVG). 1425 1426 https://html.spec.whatwg.org/multipage/syntax.html#foreign-elements 1427 """ 1428 return tag.lower() in ("math", "svg") 1429 1430 1431### end dim ### 1432 1433 1434# Global helper functions 1435 1436def open_url(url): 1437 """Open an URL in the user's default web browser. 1438 1439 The string attribute ``open_url.url_handler`` can be used to open URLs 1440 in a custom CLI script or utility. A subprocess is spawned with url as 1441 the parameter in this case instead of the usual webbrowser.open() call. 1442 1443 Whether the browser's output (both stdout and stderr) are suppressed 1444 depends on the boolean attribute ``open_url.suppress_browser_output``. 1445 If the attribute is not set upon a call, set it to a default value, 1446 which means False if BROWSER is set to a known text-based browser -- 1447 elinks, links, lynx, w3m or 'www-browser'; or True otherwise. 1448 1449 The string attribute ``open_url.override_text_browser`` can be used to 1450 ignore env var BROWSER as well as some known text-based browsers and 1451 attempt to open url in a GUI browser available. 1452 Note: If a GUI browser is indeed found, this option ignores the program 1453 option `show-browser-logs` 1454 """ 1455 logger.debug('Opening %s', url) 1456 1457 # Custom URL handler gets max priority 1458 if hasattr(open_url, 'url_handler'): 1459 subprocess.run([open_url.url_handler, url]) 1460 return 1461 1462 browser = webbrowser.get() 1463 if open_url.override_text_browser: 1464 browser_output = open_url.suppress_browser_output 1465 for name in [b for b in webbrowser._tryorder if b not in text_browsers]: 1466 browser = webbrowser.get(name) 1467 logger.debug(browser) 1468 1469 # Found a GUI browser, suppress browser output 1470 open_url.suppress_browser_output = True 1471 break 1472 1473 if open_url.suppress_browser_output: 1474 _stderr = os.dup(2) 1475 os.close(2) 1476 _stdout = os.dup(1) 1477 # Patch for GUI browsers on WSL 1478 if "microsoft" not in platform.uname()[3].lower(): 1479 os.close(1) 1480 fd = os.open(os.devnull, os.O_RDWR) 1481 os.dup2(fd, 2) 1482 os.dup2(fd, 1) 1483 try: 1484 browser.open(url, new=2) 1485 finally: 1486 if open_url.suppress_browser_output: 1487 os.close(fd) 1488 os.dup2(_stderr, 2) 1489 os.dup2(_stdout, 1) 1490 1491 if open_url.override_text_browser: 1492 open_url.suppress_browser_output = browser_output 1493 1494 1495def printerr(msg): 1496 """Print message, verbatim, to stderr. 1497 1498 ``msg`` could be any stringifiable value. 1499 """ 1500 print(msg, file=sys.stderr) 1501 1502 1503def unwrap(text): 1504 """Unwrap text.""" 1505 lines = text.split('\n') 1506 result = '' 1507 for i in range(len(lines) - 1): 1508 result += lines[i] 1509 if not lines[i]: 1510 # Paragraph break 1511 result += '\n\n' 1512 elif lines[i + 1]: 1513 # Next line is not paragraph break, add space 1514 result += ' ' 1515 # Handle last line 1516 result += lines[-1] if lines[-1] else '\n' 1517 return result 1518 1519 1520def check_stdout_encoding(): 1521 """Make sure stdout encoding is utf-8. 1522 1523 If not, print error message and instructions, then exit with 1524 status 1. 1525 1526 This function is a no-op on win32 because encoding on win32 is 1527 messy, and let's just hope for the best. /s 1528 """ 1529 if sys.platform == 'win32': 1530 return 1531 1532 # Use codecs.lookup to resolve text encoding alias 1533 encoding = codecs.lookup(sys.stdout.encoding).name 1534 if encoding != 'utf-8': 1535 locale_lang, locale_encoding = locale.getlocale() 1536 if locale_lang is None: 1537 locale_lang = '<unknown>' 1538 if locale_encoding is None: 1539 locale_encoding = '<unknown>' 1540 ioencoding = os.getenv('PYTHONIOENCODING', 'not set') 1541 sys.stderr.write(unwrap(textwrap.dedent("""\ 1542 stdout encoding '{encoding}' detected. googler requires utf-8 to 1543 work properly. The wrong encoding may be due to a non-UTF-8 1544 locale or an improper PYTHONIOENCODING. (For the record, your 1545 locale language is {locale_lang} and locale encoding is 1546 {locale_encoding}; your PYTHONIOENCODING is {ioencoding}.) 1547 1548 Please set a UTF-8 locale (e.g., en_US.UTF-8) or set 1549 PYTHONIOENCODING to utf-8. 1550 """.format( 1551 encoding=encoding, 1552 locale_lang=locale_lang, 1553 locale_encoding=locale_encoding, 1554 ioencoding=ioencoding, 1555 )))) 1556 sys.exit(1) 1557 1558 1559def time_it(description=None): 1560 def decorator(func): 1561 @functools.wraps(func) 1562 def wrapped(*args, **kwargs): 1563 # Only profile in debug mode. 1564 if not logger.isEnabledFor(logging.DEBUG): 1565 return func(*args, **kwargs) 1566 1567 import time 1568 mark = time.perf_counter() 1569 ret = func(*args, **kwargs) 1570 duration = time.perf_counter() - mark 1571 logger.debug('%s completed in \x1b[33m%.3fs\x1b[0m', description or func.__name__, duration) 1572 return ret 1573 1574 return wrapped 1575 1576 return decorator 1577 1578 1579# Classes 1580 1581class HardenedHTTPSConnection(HTTPSConnection): 1582 """Overrides HTTPSConnection.connect to specify TLS version 1583 1584 NOTE: TLS 1.2 is supported from Python 3.4 1585 """ 1586 1587 def __init__(self, host, address_family=0, **kwargs): 1588 HTTPSConnection.__init__(self, host, **kwargs) 1589 self.address_family = address_family 1590 1591 def connect(self, notweak=False): 1592 sock = self.create_socket_connection() 1593 1594 # Optimizations not available on OS X 1595 if not notweak and sys.platform.startswith('linux'): 1596 try: 1597 sock.setsockopt(socket.SOL_TCP, socket.TCP_DEFER_ACCEPT, 1) 1598 sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_QUICKACK, 1) 1599 sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, 524288) 1600 except OSError: 1601 # Doesn't work on Windows' Linux subsystem (#179) 1602 logger.debug('setsockopt failed') 1603 1604 if getattr(self, '_tunnel_host', None): 1605 self.sock = sock 1606 elif not notweak: 1607 # Try to use TLS 1.2 1608 ssl_context = None 1609 if hasattr(ssl, 'PROTOCOL_TLS'): 1610 # Since Python 3.5.3 1611 ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS) 1612 if hasattr(ssl_context, "minimum_version"): 1613 # Python 3.7 with OpenSSL 1.1.0g or later 1614 ssl_context.minimum_version = ssl.TLSVersion.TLSv1_2 1615 else: 1616 ssl_context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 | 1617 ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1) 1618 elif hasattr(ssl, 'PROTOCOL_TLSv1_2'): 1619 # Since Python 3.4 1620 ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2) 1621 if ssl_context: 1622 self.sock = ssl_context.wrap_socket(sock) 1623 return 1624 1625 # Fallback 1626 HTTPSConnection.connect(self) 1627 1628 # Adapted from socket.create_connection. 1629 # https://github.com/python/cpython/blob/bce4ddafdd188cc6deb1584728b67b9e149ca6a4/Lib/socket.py#L771-L813 1630 def create_socket_connection(self): 1631 err = None 1632 results = socket.getaddrinfo(self.host, self.port, self.address_family, socket.SOCK_STREAM) 1633 # Prefer IPv4 if address family isn't explicitly specified. 1634 if self.address_family == 0: 1635 results = sorted(results, key=lambda res: 1 if res[0] == socket.AF_INET else 2) 1636 for af, socktype, proto, canonname, sa in results: 1637 sock = None 1638 try: 1639 sock = socket.socket(af, socktype, proto) 1640 if self.timeout is not None: 1641 sock.settimeout(self.timeout) 1642 if self.source_address: 1643 sock.bind(self.source_address) 1644 sock.connect(sa) 1645 # Break explicitly a reference cycle 1646 err = None 1647 self.address_family = af 1648 logger.debug('Opened socket to %s:%d', 1649 sa[0] if af == socket.AF_INET else ('[%s]' % sa[0]), 1650 sa[1]) 1651 return sock 1652 1653 except socket.error as _: 1654 err = _ 1655 if sock is not None: 1656 sock.close() 1657 1658 if err is not None: 1659 try: 1660 raise err 1661 finally: 1662 # Break explicitly a reference cycle 1663 err = None 1664 else: 1665 raise socket.error("getaddrinfo returns an empty list") 1666 1667 1668class GoogleUrl(object): 1669 """ 1670 This class constructs the Google Search/News URL. 1671 1672 This class is modelled on urllib.parse.ParseResult for familiarity, 1673 which means it supports reading of all six attributes -- scheme, 1674 netloc, path, params, query, fragment -- of 1675 urllib.parse.ParseResult, as well as the geturl() method. 1676 1677 However, the attributes (properties) and methods listed below should 1678 be the preferred methods of access to this class. 1679 1680 Parameters 1681 ---------- 1682 opts : dict or argparse.Namespace, optional 1683 See the ``opts`` parameter of `update`. 1684 1685 Other Parameters 1686 ---------------- 1687 See "Other Parameters" of `update`. 1688 1689 Attributes 1690 ---------- 1691 hostname : str 1692 Read-write property. 1693 keywords : str or list of strs 1694 Read-write property. 1695 news : bool 1696 Read-only property. 1697 videos : bool 1698 Read-only property. 1699 url : str 1700 Read-only property. 1701 1702 Methods 1703 ------- 1704 full() 1705 relative() 1706 update(opts=None, **kwargs) 1707 set_queries(**kwargs) 1708 unset_queries(*args) 1709 next_page() 1710 prev_page() 1711 first_page() 1712 1713 """ 1714 1715 def __init__(self, opts=None, **kwargs): 1716 self.scheme = 'https' 1717 # self.netloc is a calculated property 1718 self.path = '/search' 1719 self.params = '' 1720 # self.query is a calculated property 1721 self.fragment = '' 1722 1723 self._tld = None 1724 self._num = 10 1725 self._start = 0 1726 self._keywords = [] 1727 self._sites = None 1728 self._exclude = None 1729 1730 self._query_dict = { 1731 'ie': 'UTF-8', 1732 'oe': 'UTF-8', 1733 #'gbv': '1', # control the presence of javascript on the page, 1=no js, 2=js 1734 'sei': base64.encodebytes(uuid.uuid4().bytes).decode("ascii").rstrip('=\n').replace('/', '_'), 1735 } 1736 1737 # In preloaded HTML parsing mode, set keywords to something so 1738 # that we are not tripped up by require_keywords. 1739 if opts.html_file and not opts.keywords: 1740 opts.keywords = ['<debug>'] 1741 1742 self.update(opts, **kwargs) 1743 1744 def __str__(self): 1745 return self.url 1746 1747 @property 1748 def url(self): 1749 """The full Google URL you want.""" 1750 return self.full() 1751 1752 @property 1753 def hostname(self): 1754 """The hostname.""" 1755 return self.netloc 1756 1757 @hostname.setter 1758 def hostname(self, hostname): 1759 self.netloc = hostname 1760 1761 @property 1762 def keywords(self): 1763 """The keywords, either a str or a list of strs.""" 1764 return self._keywords 1765 1766 @keywords.setter 1767 def keywords(self, keywords): 1768 self._keywords = keywords 1769 1770 @property 1771 def news(self): 1772 """Whether the URL is for Google News.""" 1773 return 'tbm' in self._query_dict and self._query_dict['tbm'] == 'nws' 1774 1775 @property 1776 def videos(self): 1777 """Whether the URL is for Google Videos.""" 1778 return 'tbm' in self._query_dict and self._query_dict['tbm'] == 'vid' 1779 1780 def full(self): 1781 """Return the full URL. 1782 1783 Returns 1784 ------- 1785 str 1786 1787 """ 1788 url = (self.scheme + ':') if self.scheme else '' 1789 url += '//' + self.netloc + self.relative() 1790 return url 1791 1792 def relative(self): 1793 """Return the relative URL (without scheme and authority). 1794 1795 Authority (see RFC 3986 section 3.2), or netloc in the 1796 terminology of urllib.parse, basically means the hostname 1797 here. The relative URL is good for making HTTP(S) requests to a 1798 known host. 1799 1800 Returns 1801 ------- 1802 str 1803 1804 """ 1805 rel = self.path 1806 if self.params: 1807 rel += ';' + self.params 1808 if self.query: 1809 rel += '?' + self.query 1810 if self.fragment: 1811 rel += '#' + self.fragment 1812 return rel 1813 1814 def update(self, opts=None, **kwargs): 1815 """Update the URL with the given options. 1816 1817 Parameters 1818 ---------- 1819 opts : dict or argparse.Namespace, optional 1820 Carries options that affect the Google Search/News URL. The 1821 list of currently recognized option keys with expected value 1822 types: 1823 1824 duration: str (GooglerArgumentParser.is_duration) 1825 exact: bool 1826 keywords: str or list of strs 1827 lang: str 1828 news: bool 1829 videos: bool 1830 num: int 1831 site: str 1832 start: int 1833 tld: str 1834 unfilter: bool 1835 1836 Other Parameters 1837 ---------------- 1838 kwargs 1839 The `kwargs` dict extends `opts`, that is, options can be 1840 specified either way, in `opts` or as individual keyword 1841 arguments. 1842 1843 """ 1844 1845 if opts is None: 1846 opts = {} 1847 if hasattr(opts, '__dict__'): 1848 opts = opts.__dict__ 1849 opts.update(kwargs) 1850 1851 qd = self._query_dict 1852 if opts.get('duration'): 1853 qd['tbs'] = 'qdr:%s' % opts['duration'] 1854 if 'exact' in opts: 1855 if opts['exact']: 1856 qd['nfpr'] = 1 1857 else: 1858 qd.pop('nfpr', None) 1859 if opts.get('from') or opts.get('to'): 1860 cd_min = opts.get('from') or '' 1861 cd_max = opts.get('to') or '' 1862 qd['tbs'] = 'cdr:1,cd_min:%s,cd_max:%s' % (cd_min, cd_max) 1863 if 'keywords' in opts: 1864 self._keywords = opts['keywords'] 1865 if 'lang' in opts and opts['lang']: 1866 qd['hl'] = opts['lang'] 1867 if 'geoloc' in opts and opts['geoloc']: 1868 qd['gl'] = opts['geoloc'] 1869 if 'news' in opts and opts['news']: 1870 qd['tbm'] = 'nws' 1871 elif 'videos' in opts and opts['videos']: 1872 qd['tbm'] = 'vid' 1873 else: 1874 qd.pop('tbm', None) 1875 if 'num' in opts: 1876 self._num = opts['num'] 1877 if 'sites' in opts: 1878 self._sites = opts['sites'] 1879 if 'exclude' in opts: 1880 self._exclude = opts['exclude'] 1881 if 'start' in opts: 1882 self._start = opts['start'] 1883 if 'tld' in opts: 1884 self._tld = opts['tld'] 1885 if 'unfilter' in opts and opts['unfilter']: 1886 qd['filter'] = 0 1887 1888 def set_queries(self, **kwargs): 1889 """Forcefully set queries outside the normal `update` mechanism. 1890 1891 Other Parameters 1892 ---------------- 1893 kwargs 1894 Arbitrary key value pairs to be set in the query string. All 1895 keys and values should be stringifiable. 1896 1897 Note that certain keys, e.g., ``q``, have their values 1898 constructed on the fly, so setting those has no actual 1899 effect. 1900 1901 """ 1902 for k, v in kwargs.items(): 1903 self._query_dict[k] = v 1904 1905 def unset_queries(self, *args): 1906 """Forcefully unset queries outside the normal `update` mechanism. 1907 1908 Other Parameters 1909 ---------------- 1910 args 1911 Arbitrary keys to be unset. No exception is raised if a key 1912 does not exist in the first place. 1913 1914 Note that certain keys, e.g., ``q``, are always included in 1915 the resulting URL, so unsetting those has no actual effect. 1916 1917 """ 1918 for k in args: 1919 self._query_dict.pop(k, None) 1920 1921 def next_page(self): 1922 """Navigate to the next page.""" 1923 self._start += self._num 1924 1925 def prev_page(self): 1926 """Navigate to the previous page. 1927 1928 Raises 1929 ------ 1930 ValueError 1931 If already at the first page (``start=0`` in the current 1932 query string). 1933 1934 """ 1935 if self._start == 0: 1936 raise ValueError('Already at the first page.') 1937 self._start = (self._start - self._num) if self._start > self._num else 0 1938 1939 def first_page(self): 1940 """Navigate to the first page. 1941 1942 Raises 1943 ------ 1944 ValueError 1945 If already at the first page (``start=0`` in the current 1946 query string). 1947 1948 """ 1949 if self._start == 0: 1950 raise ValueError('Already at the first page.') 1951 self._start = 0 1952 1953 # Data source: https://web.archive.org/web/20170615200243/https://en.wikipedia.org/wiki/List_of_Google_domains 1954 # Scraper script: https://gist.github.com/zmwangx/b976e83c14552fe18b71 1955 TLD_TO_DOMAIN_MAP = { 1956 'ac': 'google.ac', 'ad': 'google.ad', 'ae': 'google.ae', 1957 'af': 'google.com.af', 'ag': 'google.com.ag', 'ai': 'google.com.ai', 1958 'al': 'google.al', 'am': 'google.am', 'ao': 'google.co.ao', 1959 'ar': 'google.com.ar', 'as': 'google.as', 'at': 'google.at', 1960 'au': 'google.com.au', 'az': 'google.az', 'ba': 'google.ba', 1961 'bd': 'google.com.bd', 'be': 'google.be', 'bf': 'google.bf', 1962 'bg': 'google.bg', 'bh': 'google.com.bh', 'bi': 'google.bi', 1963 'bj': 'google.bj', 'bn': 'google.com.bn', 'bo': 'google.com.bo', 1964 'br': 'google.com.br', 'bs': 'google.bs', 'bt': 'google.bt', 1965 'bw': 'google.co.bw', 'by': 'google.by', 'bz': 'google.com.bz', 1966 'ca': 'google.ca', 'cat': 'google.cat', 'cc': 'google.cc', 1967 'cd': 'google.cd', 'cf': 'google.cf', 'cg': 'google.cg', 1968 'ch': 'google.ch', 'ci': 'google.ci', 'ck': 'google.co.ck', 1969 'cl': 'google.cl', 'cm': 'google.cm', 'cn': 'google.cn', 1970 'co': 'google.com.co', 'cr': 'google.co.cr', 'cu': 'google.com.cu', 1971 'cv': 'google.cv', 'cy': 'google.com.cy', 'cz': 'google.cz', 1972 'de': 'google.de', 'dj': 'google.dj', 'dk': 'google.dk', 1973 'dm': 'google.dm', 'do': 'google.com.do', 'dz': 'google.dz', 1974 'ec': 'google.com.ec', 'ee': 'google.ee', 'eg': 'google.com.eg', 1975 'es': 'google.es', 'et': 'google.com.et', 'fi': 'google.fi', 1976 'fj': 'google.com.fj', 'fm': 'google.fm', 'fr': 'google.fr', 1977 'ga': 'google.ga', 'ge': 'google.ge', 'gf': 'google.gf', 1978 'gg': 'google.gg', 'gh': 'google.com.gh', 'gi': 'google.com.gi', 1979 'gl': 'google.gl', 'gm': 'google.gm', 'gp': 'google.gp', 1980 'gr': 'google.gr', 'gt': 'google.com.gt', 'gy': 'google.gy', 1981 'hk': 'google.com.hk', 'hn': 'google.hn', 'hr': 'google.hr', 1982 'ht': 'google.ht', 'hu': 'google.hu', 'id': 'google.co.id', 1983 'ie': 'google.ie', 'il': 'google.co.il', 'im': 'google.im', 1984 'in': 'google.co.in', 'io': 'google.io', 'iq': 'google.iq', 1985 'is': 'google.is', 'it': 'google.it', 'je': 'google.je', 1986 'jm': 'google.com.jm', 'jo': 'google.jo', 'jp': 'google.co.jp', 1987 'ke': 'google.co.ke', 'kg': 'google.kg', 'kh': 'google.com.kh', 1988 'ki': 'google.ki', 'kr': 'google.co.kr', 'kw': 'google.com.kw', 1989 'kz': 'google.kz', 'la': 'google.la', 'lb': 'google.com.lb', 1990 'lc': 'google.com.lc', 'li': 'google.li', 'lk': 'google.lk', 1991 'ls': 'google.co.ls', 'lt': 'google.lt', 'lu': 'google.lu', 1992 'lv': 'google.lv', 'ly': 'google.com.ly', 'ma': 'google.co.ma', 1993 'md': 'google.md', 'me': 'google.me', 'mg': 'google.mg', 1994 'mk': 'google.mk', 'ml': 'google.ml', 'mm': 'google.com.mm', 1995 'mn': 'google.mn', 'ms': 'google.ms', 'mt': 'google.com.mt', 1996 'mu': 'google.mu', 'mv': 'google.mv', 'mw': 'google.mw', 1997 'mx': 'google.com.mx', 'my': 'google.com.my', 'mz': 'google.co.mz', 1998 'na': 'google.com.na', 'ne': 'google.ne', 'nf': 'google.com.nf', 1999 'ng': 'google.com.ng', 'ni': 'google.com.ni', 'nl': 'google.nl', 2000 'no': 'google.no', 'np': 'google.com.np', 'nr': 'google.nr', 2001 'nu': 'google.nu', 'nz': 'google.co.nz', 'om': 'google.com.om', 2002 'pa': 'google.com.pa', 'pe': 'google.com.pe', 'pg': 'google.com.pg', 2003 'ph': 'google.com.ph', 'pk': 'google.com.pk', 'pl': 'google.pl', 2004 'pn': 'google.co.pn', 'pr': 'google.com.pr', 'ps': 'google.ps', 2005 'pt': 'google.pt', 'py': 'google.com.py', 'qa': 'google.com.qa', 2006 'ro': 'google.ro', 'rs': 'google.rs', 'ru': 'google.ru', 2007 'rw': 'google.rw', 'sa': 'google.com.sa', 'sb': 'google.com.sb', 2008 'sc': 'google.sc', 'se': 'google.se', 'sg': 'google.com.sg', 2009 'sh': 'google.sh', 'si': 'google.si', 'sk': 'google.sk', 2010 'sl': 'google.com.sl', 'sm': 'google.sm', 'sn': 'google.sn', 2011 'so': 'google.so', 'sr': 'google.sr', 'st': 'google.st', 2012 'sv': 'google.com.sv', 'td': 'google.td', 'tg': 'google.tg', 2013 'th': 'google.co.th', 'tj': 'google.com.tj', 'tk': 'google.tk', 2014 'tl': 'google.tl', 'tm': 'google.tm', 'tn': 'google.tn', 2015 'to': 'google.to', 'tr': 'google.com.tr', 'tt': 'google.tt', 2016 'tw': 'google.com.tw', 'tz': 'google.co.tz', 'ua': 'google.com.ua', 2017 'ug': 'google.co.ug', 'uk': 'google.co.uk', 'uy': 'google.com.uy', 2018 'uz': 'google.co.uz', 'vc': 'google.com.vc', 've': 'google.co.ve', 2019 'vg': 'google.vg', 'vi': 'google.co.vi', 'vn': 'google.com.vn', 2020 'vu': 'google.vu', 'ws': 'google.ws', 'za': 'google.co.za', 2021 'zm': 'google.co.zm', 'zw': 'google.co.zw', 2022 } 2023 2024 @property 2025 def netloc(self): 2026 """The hostname.""" 2027 try: 2028 return 'www.' + self.TLD_TO_DOMAIN_MAP[self._tld] 2029 except KeyError: 2030 return 'www.google.com' 2031 2032 @property 2033 def query(self): 2034 """The query string.""" 2035 qd = {} 2036 qd.update(self._query_dict) 2037 if self._num != 10: # Skip sending the default 2038 qd['num'] = self._num 2039 if self._start: # Skip sending the default 2040 qd['start'] = self._start 2041 2042 # Construct the q query 2043 q = '' 2044 keywords = self._keywords 2045 sites = self._sites 2046 exclude = self._exclude 2047 if keywords: 2048 if isinstance(keywords, list): 2049 q += '+'.join(urllib.parse.quote_plus(kw) for kw in keywords) 2050 else: 2051 q += urllib.parse.quote_plus(keywords) 2052 if sites: 2053 q += '+OR'.join('+site:' + urllib.parse.quote_plus(site) for site in sites) 2054 if exclude: 2055 q += ''.join('+-site:' + urllib.parse.quote_plus(e) for e in exclude) 2056 qd['q'] = q 2057 return '&'.join('%s=%s' % (k, qd[k]) for k in sorted(qd.keys())) 2058 2059 2060class GoogleConnectionError(Exception): 2061 pass 2062 2063 2064class GoogleConnection(object): 2065 """ 2066 This class facilitates connecting to and fetching from Google. 2067 2068 Parameters 2069 ---------- 2070 See http.client.HTTPSConnection for documentation of the 2071 parameters. 2072 2073 Raises 2074 ------ 2075 GoogleConnectionError 2076 2077 Attributes 2078 ---------- 2079 host : str 2080 The currently connected host. Read-only property. Use 2081 `new_connection` to change host. 2082 2083 Methods 2084 ------- 2085 new_connection(host=None, port=None, timeout=45) 2086 renew_connection(timeout=45) 2087 fetch_page(url) 2088 close() 2089 2090 """ 2091 2092 def __init__(self, host, port=None, address_family=0, timeout=45, proxy=None, notweak=False): 2093 self._host = None 2094 self._port = None 2095 self._address_family = address_family 2096 self._proxy = proxy 2097 self._notweak = notweak 2098 self._conn = None 2099 self.new_connection(host, port=port, timeout=timeout) 2100 self.cookie = '' 2101 2102 @property 2103 def host(self): 2104 """The host currently connected to.""" 2105 return self._host 2106 2107 @time_it() 2108 def new_connection(self, host=None, port=None, timeout=45): 2109 """Close the current connection (if any) and establish a new one. 2110 2111 Parameters 2112 ---------- 2113 See http.client.HTTPSConnection for documentation of the 2114 parameters. Renew the connection (i.e., reuse the current host 2115 and port) if host is None or empty. 2116 2117 Raises 2118 ------ 2119 GoogleConnectionError 2120 2121 """ 2122 if self._conn: 2123 self._conn.close() 2124 2125 if not host: 2126 host = self._host 2127 port = self._port 2128 self._host = host 2129 self._port = port 2130 host_display = host + (':%d' % port if port else '') 2131 2132 proxy = self._proxy 2133 2134 if proxy: 2135 proxy_user_passwd, proxy_host_port = parse_proxy_spec(proxy) 2136 2137 logger.debug('Connecting to proxy server %s', proxy_host_port) 2138 self._conn = HardenedHTTPSConnection(proxy_host_port, 2139 address_family=self._address_family, timeout=timeout) 2140 2141 logger.debug('Tunnelling to host %s' % host_display) 2142 connect_headers = {} 2143 if proxy_user_passwd: 2144 connect_headers['Proxy-Authorization'] = 'Basic %s' % base64.b64encode( 2145 proxy_user_passwd.encode('utf-8') 2146 ).decode('utf-8') 2147 self._conn.set_tunnel(host, port=port, headers=connect_headers) 2148 2149 try: 2150 self._conn.connect(self._notweak) 2151 except Exception as e: 2152 msg = 'Failed to connect to proxy server %s: %s.' % (proxy, e) 2153 raise GoogleConnectionError(msg) 2154 else: 2155 logger.debug('Connecting to new host %s', host_display) 2156 self._conn = HardenedHTTPSConnection(host, port=port, 2157 address_family=self._address_family, timeout=timeout) 2158 try: 2159 self._conn.connect(self._notweak) 2160 except Exception as e: 2161 msg = 'Failed to connect to %s: %s.' % (host_display, e) 2162 raise GoogleConnectionError(msg) 2163 2164 def renew_connection(self, timeout=45): 2165 """Renew current connection. 2166 2167 Equivalent to ``new_connection(timeout=timeout)``. 2168 2169 """ 2170 self.new_connection(timeout=timeout) 2171 2172 @time_it() 2173 def fetch_page(self, url): 2174 """Fetch a URL. 2175 2176 Allows one reconnection and multiple redirections before failing 2177 and raising GoogleConnectionError. 2178 2179 Parameters 2180 ---------- 2181 url : str 2182 The URL to fetch, relative to the host. 2183 2184 Raises 2185 ------ 2186 GoogleConnectionError 2187 When not getting HTTP 200 even after the allowed one 2188 reconnection and/or one redirection, or when Google is 2189 blocking query due to unusual activity. 2190 2191 Returns 2192 ------- 2193 str 2194 Response payload, gunzipped (if applicable) and decoded (in UTF-8). 2195 2196 """ 2197 try: 2198 self._raw_get(url) 2199 except (http.client.HTTPException, OSError) as e: 2200 logger.debug('Got exception: %s.', e) 2201 logger.debug('Attempting to reconnect...') 2202 self.renew_connection() 2203 try: 2204 self._raw_get(url) 2205 except http.client.HTTPException as e: 2206 logger.debug('Got exception: %s.', e) 2207 raise GoogleConnectionError("Failed to get '%s'." % url) 2208 2209 resp = self._resp 2210 redirect_counter = 0 2211 while resp.status != 200 and redirect_counter < 3: 2212 if resp.status in {301, 302, 303, 307, 308}: 2213 redirection_url = resp.getheader('location', '') 2214 if 'sorry/IndexRedirect?' in redirection_url or 'sorry/index?' in redirection_url: 2215 msg = "Connection blocked due to unusual activity.\n" 2216 if self._conn.address_family == socket.AF_INET6: 2217 msg += textwrap.dedent("""\ 2218 You are connecting over IPv6 which is likely the problem. Try to make 2219 sure the machine has a working IPv4 network interface configured. 2220 See also the -4, --ipv4 option of googler.\n""") 2221 msg += textwrap.dedent("""\ 2222 THIS IS NOT A BUG, please do NOT report it as a bug unless you have specific 2223 information that may lead to the development of a workaround. 2224 You IP address is temporarily or permanently blocked by Google and requires 2225 reCAPTCHA-solving to use the service, which googler is not capable of. 2226 Possible causes include issuing too many queries in a short time frame, or 2227 operating from a shared / low reputation IP with a history of abuse. 2228 Please do NOT use googler for automated scraping.""") 2229 msg = " ".join(msg.splitlines()) 2230 raise GoogleConnectionError(msg) 2231 self._redirect(redirection_url) 2232 resp = self._resp 2233 redirect_counter += 1 2234 else: 2235 break 2236 2237 if resp.status != 200: 2238 raise GoogleConnectionError('Got HTTP %d: %s' % (resp.status, resp.reason)) 2239 2240 payload = resp.read() 2241 try: 2242 return gzip.decompress(payload).decode('utf-8') 2243 except OSError: 2244 # Not gzipped 2245 return payload.decode('utf-8') 2246 2247 def _redirect(self, url): 2248 """Redirect to and fetch a new URL. 2249 2250 Like `_raw_get`, the response is stored in ``self._resp``. A new 2251 connection is made if redirecting to a different host. 2252 2253 Parameters 2254 ---------- 2255 url : str 2256 If absolute and points to a different host, make a new 2257 connection. 2258 2259 Raises 2260 ------ 2261 GoogleConnectionError 2262 2263 """ 2264 logger.debug('Redirecting to URL %s', url) 2265 segments = urllib.parse.urlparse(url) 2266 2267 host = segments.netloc 2268 if host != self._host: 2269 self.new_connection(host) 2270 2271 relurl = urllib.parse.urlunparse(('', '') + segments[2:]) 2272 try: 2273 self._raw_get(relurl) 2274 except http.client.HTTPException as e: 2275 logger.debug('Got exception: %s.', e) 2276 raise GoogleConnectionError("Failed to get '%s'." % url) 2277 2278 def _raw_get(self, url): 2279 """Make a raw HTTP GET request. 2280 2281 No status check (which implies no redirection). Response can be 2282 accessed from ``self._resp``. 2283 2284 Parameters 2285 ---------- 2286 url : str 2287 URL relative to the host, used in the GET request. 2288 2289 Raises 2290 ------ 2291 http.client.HTTPException 2292 2293 """ 2294 logger.debug('Fetching URL %s', url) 2295 self._conn.request('GET', url, None, { 2296 'Accept': 'text/html', 2297 'Accept-Encoding': 'gzip', 2298 'User-Agent': USER_AGENT, 2299 'Cookie': self.cookie, 2300 'Connection': 'keep-alive', 2301 'DNT': '1', 2302 }) 2303 self._resp = self._conn.getresponse() 2304 if self.cookie == '': 2305 complete_cookie = self._resp.getheader('Set-Cookie') 2306 # Cookie won't be available if already blocked 2307 if complete_cookie is not None: 2308 self.cookie = complete_cookie[:complete_cookie.find(';')] 2309 logger.debug('Cookie: %s' % self.cookie) 2310 2311 def close(self): 2312 """Close the connection (if one is active).""" 2313 if self._conn: 2314 self._conn.close() 2315 2316 2317class GoogleParser(object): 2318 2319 def __init__(self, html, *, news=False, videos=False): 2320 self.news = news 2321 self.videos = videos 2322 self.autocorrected = False 2323 self.showing_results_for = None 2324 self.filtered = False 2325 self.results = [] 2326 self.parse(html) 2327 2328 @time_it() 2329 def parse(self, html): 2330 tree = parse_html(html) 2331 2332 if debugger: 2333 printerr('\x1b[1mInspect the DOM through the \x1b[4mtree\x1b[24m variable.\x1b[0m') 2334 printerr('') 2335 try: 2336 import IPython 2337 IPython.embed() 2338 except ImportError: 2339 import pdb 2340 pdb.set_trace() 2341 2342 # cw is short for collapse_whitespace. 2343 cw = lambda s: re.sub(r'[ \t\n\r]+', ' ', s) if s is not None else s 2344 2345 index = 0 2346 for div_g in tree.select_all('div.g'): 2347 if div_g.select('.hp-xpdbox'): 2348 # Skip smart cards. 2349 continue 2350 try: 2351 if div_g.select('.st'): 2352 # Old class structure, stopped working some time in 2353 # September 2020, but kept just in case. 2354 h3 = div_g.select('div.r h3') 2355 if h3: 2356 title = h3.text 2357 a = h3.parent 2358 else: 2359 h3 = div_g.select('h3.r') 2360 a = h3.select('a') 2361 title = a.text 2362 mime = div_g.select('.mime') 2363 if mime: 2364 title = mime.text + ' ' + title 2365 abstract_node = div_g.select('.st') 2366 metadata_node = div_g.select('.f') 2367 else: 2368 # Current structure as of October 2020. 2369 # Note that a filetype tag (e.g. PDF) is now pretty 2370 # damn hard to parse with confidence (that it'll 2371 # survive the slighest further change), so we don't. 2372 2373 # As of January 15th 2021, the html class is not rc anymore, it's tF2Cxc. 2374 # This approach is not very resilient to changes by Google, but it works for now. 2375 # title_node, details_node, *_ = div_g.select_all('div.rc > div') 2376 title_node, details_node, *_ = div_g.select_all('div.tF2Cxc > div') 2377 if 'yuRUbf' not in title_node.classes: 2378 logger.debug('unexpected title node class(es): expected %r, got %r', 2379 'yuRUbf', ' '.join(title_node.classes)) 2380 if 'IsZvec' not in details_node.classes: 2381 logger.debug('unexpected details node class(es): expected %r, got %r', 2382 'IsZvec', ' '.join(details_node.classes)) 2383 a = title_node.select('a') 2384 h3 = a.select('h3') 2385 title = h3.text 2386 abstract_node = details_node.select('span') 2387 metadata_node = details_node.select('.f, span ~ div') 2388 url = self.unwrap_link(a.attr('href')) 2389 matched_keywords = [] 2390 abstract = '' 2391 # BFS descendant nodes. Necessary to locate matches (b, 2392 # em) while skipping metadata (.f). 2393 abstract_nodes = collections.deque([abstract_node]) 2394 while abstract_nodes: 2395 node = abstract_nodes.popleft() 2396 if 'f' in node.classes: 2397 # .f is handled as metadata instead. 2398 continue 2399 if node.tag in ['b', 'em']: 2400 matched_keywords.append({'phrase': node.text, 'offset': len(abstract)}) 2401 abstract += node.text 2402 continue 2403 if not node.children: 2404 abstract += node.text 2405 continue 2406 for child in node.children: 2407 abstract_nodes.append(child) 2408 metadata = None 2409 try: 2410 # Sometimes there are multiple metadata fields 2411 # associated with a single entry, e.g. "Released", 2412 # "Producer(s)", "Genre", etc. for a song (sample 2413 # query: "never gonna give you up"). These need to 2414 # be delimited when displayed. 2415 metadata_fields = metadata_node.select_all('div > div.wFMWsc') 2416 if metadata_fields: 2417 metadata = ' | '.join(field.text for field in metadata_fields) 2418 elif not metadata_node.select('a') and not metadata_node.select('g-expandable-container'): 2419 metadata = metadata_node.text 2420 if metadata: 2421 metadata = ( 2422 metadata 2423 .replace('\u200e', '') 2424 .replace(' - ', ', ') 2425 .replace(' \u2014 ', ', ') 2426 .strip().rstrip(',') 2427 ) 2428 except AttributeError: 2429 pass 2430 except (AttributeError, ValueError): 2431 continue 2432 sitelinks = [] 2433 for td in div_g.select_all('td'): 2434 try: 2435 a = td.select('a') 2436 sl_title = a.text 2437 sl_url = self.unwrap_link(a.attr('href')) 2438 sl_abstract = td.select('div.s.st, div.s .st').text 2439 sitelink = Sitelink(cw(sl_title), sl_url, cw(sl_abstract)) 2440 if sitelink not in sitelinks: 2441 sitelinks.append(sitelink) 2442 except (AttributeError, ValueError): 2443 continue 2444 # cw cannot be applied to abstract here since it may screw 2445 # up offsets of matches. Instead, each relevant node's text 2446 # is whitespace-collapsed before being appended to abstract. 2447 # We then hope for the best. 2448 result = Result(index + 1, cw(title), url, abstract, 2449 metadata=cw(metadata), sitelinks=sitelinks, matches=matched_keywords) 2450 if result not in self.results: 2451 self.results.append(result) 2452 index += 1 2453 2454 if not self.results: 2455 for card in tree.select_all('g-card'): 2456 a = card.select('a[href]') 2457 if not a: 2458 continue 2459 url = self.unwrap_link(a.attr('href')) 2460 text_nodes = [] 2461 for node in a.descendants(): 2462 if isinstance(node, TextNode) and node.strip(): 2463 text_nodes.append(node.text) 2464 if len(text_nodes) != 4: 2465 continue 2466 publisher, title, abstract, publishing_time = text_nodes 2467 metadata = '%s, %s' % (publisher, publishing_time) 2468 index += 1 2469 self.results.append(Result(index, cw(title), url, cw(abstract), metadata=cw(metadata))) 2470 2471 # Showing results for ... 2472 # Search instead for ... 2473 spell_orig = tree.select("span.spell_orig") 2474 if spell_orig: 2475 showing_results_for_link = next( 2476 filter(lambda el: el.tag == "a", spell_orig.previous_siblings()), None 2477 ) 2478 if showing_results_for_link: 2479 self.autocorrected = True 2480 self.showing_results_for = showing_results_for_link.text 2481 2482 # No results found for ... 2483 # Results for ...: 2484 alt_query_infobox = tree.select('#topstuff') 2485 if alt_query_infobox: 2486 bolds = alt_query_infobox.select_all('div b') 2487 if len(bolds) == 2: 2488 self.showing_results_for = bolds[1].text 2489 2490 # In order to show you the most relevant results, we have 2491 # omitted some entries very similar to the N already displayed. 2492 # ... 2493 self.filtered = tree.select('p#ofr') is not None 2494 2495 # Unwraps /url?q=http://...&sa=... 2496 # TODO: don't unwrap if URL isn't in this form. 2497 @staticmethod 2498 def unwrap_link(link): 2499 qs = urllib.parse.urlparse(link).query 2500 try: 2501 url = urllib.parse.parse_qs(qs)['q'][0] 2502 except KeyError: 2503 return link 2504 else: 2505 if "://" in url: 2506 return url 2507 else: 2508 # Google's internal services link, e.g., 2509 # /search?q=google&..., which cannot be unwrapped into 2510 # an actual URL. 2511 raise ValueError(link) 2512 2513 2514class Sitelink(object): 2515 """Container for a sitelink.""" 2516 2517 def __init__(self, title, url, abstract): 2518 self.title = title 2519 self.url = url 2520 self.abstract = abstract 2521 self.index = '' 2522 2523 def __eq__(self, other): 2524 return ( 2525 self.title == other.title and 2526 self.url == other.url and 2527 self.abstract == other.abstract 2528 ) 2529 2530 def __hash__(self): 2531 return hash((self.title, self.url, self.abstract)) 2532 2533 2534Colors = collections.namedtuple('Colors', 'index, title, url, metadata, abstract, prompt, reset') 2535 2536 2537class Result(object): 2538 """ 2539 Container for one search result, with output helpers. 2540 2541 Parameters 2542 ---------- 2543 index : int or str 2544 title : str 2545 url : str 2546 abstract : str 2547 metadata : str, optional 2548 Only applicable to Google News results, with publisher name and 2549 publishing time. 2550 sitelinks : list, optional 2551 List of ``SiteLink`` objects. 2552 2553 Attributes 2554 ---------- 2555 index : str 2556 title : str 2557 url : str 2558 abstract : str 2559 metadata : str or None 2560 sitelinks : list 2561 matches : list 2562 2563 Class Variables 2564 --------------- 2565 colors : str 2566 2567 Methods 2568 ------- 2569 print() 2570 jsonizable_object() 2571 urltable() 2572 2573 """ 2574 2575 # Class variables 2576 colors = None 2577 urlexpand = True 2578 2579 def __init__(self, index, title, url, abstract, metadata=None, sitelinks=None, matches=None): 2580 index = str(index) 2581 self.index = index 2582 self.title = title 2583 self.url = url 2584 self.abstract = abstract 2585 self.metadata = metadata 2586 self.sitelinks = [] if sitelinks is None else sitelinks 2587 self.matches = [] if matches is None else matches 2588 2589 self._urltable = {index: url} 2590 subindex = 'a' 2591 for sitelink in self.sitelinks: 2592 fullindex = index + subindex 2593 sitelink.index = fullindex 2594 self._urltable[fullindex] = sitelink.url 2595 subindex = chr(ord(subindex) + 1) 2596 2597 def __eq__(self, other): 2598 return ( 2599 self.title == other.title and 2600 self.url == other.url and 2601 self.abstract == other.abstract and 2602 self.metadata == other.metadata and 2603 self.sitelinks == other.sitelinks and 2604 self.matches == other.matches 2605 ) 2606 2607 def __hash__(self): 2608 sitelinks_hashable = tuple(self.sitelinks) if self.sitelinks is not None else None 2609 matches_hashable = tuple(self.matches) if self.matches is not None else None 2610 return hash(self.title, self.url, self.abstract, self.metadata, sitelinks_hashable, matches_hashable) 2611 2612 def _print_title_and_url(self, index, title, url, indent=0): 2613 colors = self.colors 2614 2615 if not self.urlexpand: 2616 url = '[' + urllib.parse.urlparse(url).netloc + ']' 2617 2618 if colors: 2619 # Adjust index to print result index clearly 2620 print(" %s%s%-3s%s" % (' ' * indent, colors.index, index + '.', colors.reset), end='') 2621 if not self.urlexpand: 2622 print(' ' + colors.title + title + colors.reset + ' ' + colors.url + url + colors.reset) 2623 else: 2624 print(' ' + colors.title + title + colors.reset) 2625 print(' ' * (indent + 5) + colors.url + url + colors.reset) 2626 else: 2627 if self.urlexpand: 2628 print(' %s%-3s %s' % (' ' * indent, index + '.', title)) 2629 print(' %s%s' % (' ' * (indent + 4), url)) 2630 else: 2631 print(' %s%-3s %s %s' % (' ' * indent, index + '.', title, url)) 2632 2633 def _print_metadata_and_abstract(self, abstract, metadata=None, matches=None, indent=0): 2634 colors = self.colors 2635 try: 2636 columns, _ = os.get_terminal_size() 2637 except OSError: 2638 columns = 0 2639 2640 if metadata: 2641 if colors: 2642 print(' ' * (indent + 5) + colors.metadata + metadata + colors.reset) 2643 else: 2644 print(' ' * (indent + 5) + metadata) 2645 2646 if abstract: 2647 fillwidth = (columns - (indent + 6)) if columns > indent + 6 else len(abstract) 2648 wrapped_abstract = TrackedTextwrap(abstract, fillwidth) 2649 if colors: 2650 # Highlight matches. 2651 for match in matches or []: 2652 offset = match['offset'] 2653 span = len(match['phrase']) 2654 wrapped_abstract.insert_zero_width_sequence('\x1b[1m', offset) 2655 wrapped_abstract.insert_zero_width_sequence('\x1b[0m', offset + span) 2656 2657 if colors: 2658 print(colors.abstract, end='') 2659 for line in wrapped_abstract.lines: 2660 print('%s%s' % (' ' * (indent + 5), line)) 2661 if colors: 2662 print(colors.reset, end='') 2663 2664 print('') 2665 2666 def print(self): 2667 """Print the result entry.""" 2668 self._print_title_and_url(self.index, self.title, self.url) 2669 self._print_metadata_and_abstract(self.abstract, metadata=self.metadata, matches=self.matches) 2670 2671 for sitelink in self.sitelinks: 2672 self._print_title_and_url(sitelink.index, sitelink.title, sitelink.url, indent=4) 2673 self._print_metadata_and_abstract(sitelink.abstract, indent=4) 2674 2675 def jsonizable_object(self): 2676 """Return a JSON-serializable dict representing the result entry.""" 2677 obj = { 2678 'title': self.title, 2679 'url': self.url, 2680 'abstract': self.abstract 2681 } 2682 if self.metadata: 2683 obj['metadata'] = self.metadata 2684 if self.sitelinks: 2685 obj['sitelinks'] = [sitelink.__dict__ for sitelink in self.sitelinks] 2686 if self.matches: 2687 obj['matches'] = self.matches 2688 return obj 2689 2690 def urltable(self): 2691 """Return a index-to-URL table for the current result. 2692 2693 Normally, the table contains only a single entry, but when the result 2694 contains sitelinks, all sitelinks are included in this table. 2695 2696 Returns 2697 ------- 2698 dict 2699 A dict mapping indices (strs) to URLs (also strs). Indices of 2700 sitelinks are the original index appended by lowercase letters a, 2701 b, c, etc. 2702 2703 """ 2704 return self._urltable 2705 2706 @staticmethod 2707 def collapse_whitespace(s): 2708 return re.sub(r'[ \t\n\r]+', ' ', s) 2709 2710 2711class GooglerCmdException(Exception): 2712 pass 2713 2714 2715class NoKeywordsException(GooglerCmdException): 2716 pass 2717 2718 2719def require_keywords(method): 2720 # Require keywords to be set before we run a GooglerCmd method. If 2721 # no keywords have been set, raise a NoKeywordsException. 2722 @functools.wraps(method) 2723 def enforced_method(self, *args, **kwargs): 2724 if not self.keywords: 2725 raise NoKeywordsException('No keywords.') 2726 method(self, *args, **kwargs) 2727 2728 return enforced_method 2729 2730 2731def no_argument(method): 2732 # Normalize a do_* method of GooglerCmd that takes no argument to 2733 # one that takes an arg, but issue a warning when an nonempty 2734 # argument is given. 2735 @functools.wraps(method) 2736 def enforced_method(self, arg): 2737 if arg: 2738 method_name = arg.__name__ 2739 command_name = method_name[3:] if method_name.startswith('do_') else method_name 2740 logger.warning("Argument to the '%s' command ignored.", command_name) 2741 method(self) 2742 2743 return enforced_method 2744 2745 2746class GooglerCmd(object): 2747 """ 2748 Command line interpreter and executor class for googler. 2749 2750 Inspired by PSL cmd.Cmd. 2751 2752 Parameters 2753 ---------- 2754 opts : argparse.Namespace 2755 Options and/or arguments. 2756 2757 Attributes 2758 ---------- 2759 options : argparse.Namespace 2760 Options that are currently in effect. Read-only attribute. 2761 keywords : str or list or strs 2762 Current keywords. Read-only attribute 2763 2764 Methods 2765 ------- 2766 fetch() 2767 display_results(prelude='\n', json_output=False) 2768 fetch_and_display(prelude='\n', json_output=False, interactive=True) 2769 read_next_command() 2770 help() 2771 cmdloop() 2772 """ 2773 2774 # Class variables 2775 colors = None 2776 re_url_index = re.compile(r"\d+(a-z)?") 2777 2778 def __init__(self, opts): 2779 super().__init__() 2780 2781 self._opts = opts 2782 2783 self._google_url = GoogleUrl(opts) 2784 2785 if opts.html_file: 2786 # Preloaded HTML parsing mode, do not initialize connection. 2787 self._preload_from_file = opts.html_file 2788 self._conn = None 2789 else: 2790 self._preload_from_file = None 2791 proxy = opts.proxy if hasattr(opts, 'proxy') else None 2792 self._conn = GoogleConnection(self._google_url.hostname, 2793 address_family=opts.address_family, 2794 proxy=proxy, 2795 notweak=opts.notweak) 2796 atexit.register(self._conn.close) 2797 2798 self.results = [] 2799 self._autocorrected = None 2800 self._showing_results_for = None 2801 self._results_filtered = False 2802 self._urltable = {} 2803 2804 self.promptcolor = True if os.getenv('DISABLE_PROMPT_COLOR') is None else False 2805 2806 self.no_results_instructions_shown = False 2807 2808 @property 2809 def options(self): 2810 """Current options.""" 2811 return self._opts 2812 2813 @property 2814 def keywords(self): 2815 """Current keywords.""" 2816 return self._google_url.keywords 2817 2818 @require_keywords 2819 def fetch(self): 2820 """Fetch a page and parse for results. 2821 2822 Results are stored in ``self.results``. 2823 2824 Raises 2825 ------ 2826 GoogleConnectionError 2827 2828 See Also 2829 -------- 2830 fetch_and_display 2831 2832 """ 2833 # This method also sets self._results_filtered and 2834 # self._urltable. 2835 if self._preload_from_file: 2836 with open(self._preload_from_file, encoding='utf-8') as fp: 2837 page = fp.read() 2838 else: 2839 page = self._conn.fetch_page(self._google_url.relative()) 2840 if logger.isEnabledFor(logging.DEBUG): 2841 import tempfile 2842 fd, tmpfile = tempfile.mkstemp(prefix='googler-response-', suffix='.html') 2843 os.close(fd) 2844 with open(tmpfile, 'w', encoding='utf-8') as fp: 2845 fp.write(page) 2846 logger.debug("Response body written to '%s'.", tmpfile) 2847 2848 parser = GoogleParser(page, news=self._google_url.news, videos=self._google_url.videos) 2849 2850 self.results = parser.results 2851 self._autocorrected = parser.autocorrected 2852 self._showing_results_for = parser.showing_results_for 2853 self._results_filtered = parser.filtered 2854 self._urltable = {} 2855 for r in self.results: 2856 self._urltable.update(r.urltable()) 2857 2858 def warn_no_results(self): 2859 printerr('No results.') 2860 if self.no_results_instructions_shown: 2861 return 2862 2863 try: 2864 import json 2865 import urllib.error 2866 import urllib.request 2867 info_json_url = '%s/master/info.json' % RAW_DOWNLOAD_REPO_BASE 2868 logger.debug('Fetching %s for project status...', info_json_url) 2869 try: 2870 with urllib.request.urlopen(info_json_url, timeout=5) as response: 2871 try: 2872 info = json.load(response) 2873 except Exception: 2874 logger.error('Failed to decode project status from %s', info_json_url) 2875 raise RuntimeError 2876 except urllib.error.HTTPError as e: 2877 logger.error('Failed to fetch project status from %s: HTTP %d', info_json_url, e.code) 2878 raise RuntimeError 2879 epoch = info.get('epoch') 2880 if epoch > _EPOCH_: 2881 printerr('Your version of googler is broken due to Google-side changes.') 2882 tracking_issue = info.get('tracking_issue') 2883 fixed_on_master = info.get('fixed_on_master') 2884 fixed_in_release = info.get('fixed_in_release') 2885 if fixed_in_release: 2886 printerr('A new version, %s, has been released to address the changes.' % fixed_in_release) 2887 printerr('Please upgrade to the latest version.') 2888 elif fixed_on_master: 2889 printerr('The fix has been pushed to master, pending a release.') 2890 printerr('Please download the master version https://git.io/googler or wait for a release.') 2891 else: 2892 printerr('The issue is tracked at https://github.com/jarun/googler/issues/%s.' % tracking_issue) 2893 return 2894 except RuntimeError: 2895 pass 2896 2897 printerr('If you believe this is a bug, please review ' 2898 'https://git.io/googler-no-results before submitting a bug report.') 2899 self.no_results_instructions_shown = True 2900 2901 @require_keywords 2902 def display_results(self, prelude='\n', json_output=False): 2903 """Display results stored in ``self.results``. 2904 2905 Parameters 2906 ---------- 2907 See `fetch_and_display`. 2908 2909 """ 2910 if json_output: 2911 # JSON output 2912 import json 2913 results_object = [r.jsonizable_object() for r in self.results] 2914 print(json.dumps(results_object, indent=2, sort_keys=True, ensure_ascii=False)) 2915 else: 2916 # Regular output 2917 if not self.results: 2918 self.warn_no_results() 2919 else: 2920 sys.stderr.write(prelude) 2921 for r in self.results: 2922 r.print() 2923 2924 @require_keywords 2925 def showing_results_for_alert(self, interactive=True): 2926 colors = self.colors 2927 if self._showing_results_for: 2928 if colors: 2929 # Underline the query 2930 actual_query = '\x1b[4m' + self._showing_results_for + '\x1b[24m' 2931 else: 2932 actual_query = self._showing_results_for 2933 if self._autocorrected: 2934 if interactive: 2935 info = 'Showing results for %s; enter "x" for an exact search.' % actual_query 2936 else: 2937 info = 'Showing results for %s; use -x, --exact for an exact search.' % actual_query 2938 else: 2939 info = 'No results found; showing results for %s.' % actual_query 2940 if interactive: 2941 printerr('') 2942 if colors: 2943 printerr(colors.prompt + info + colors.reset) 2944 else: 2945 printerr('** ' + info) 2946 2947 @require_keywords 2948 def fetch_and_display(self, prelude='\n', json_output=False, interactive=True): 2949 """Fetch a page and display results. 2950 2951 Results are stored in ``self.results``. 2952 2953 Parameters 2954 ---------- 2955 prelude : str, optional 2956 A string that is written to stderr before showing actual results, 2957 usually serving as a separator. Default is an empty line. 2958 json_output : bool, optional 2959 Whether to dump results in JSON format. Default is False. 2960 interactive : bool, optional 2961 Whether to show contextual instructions, when e.g. Google 2962 has filtered the results. Default is True. 2963 2964 Raises 2965 ------ 2966 GoogleConnectionError 2967 2968 See Also 2969 -------- 2970 fetch 2971 display_results 2972 2973 """ 2974 self.fetch() 2975 self.showing_results_for_alert() 2976 self.display_results(prelude=prelude, json_output=json_output) 2977 if self._results_filtered: 2978 colors = self.colors 2979 info = 'Enter "unfilter" to show similar results Google omitted.' 2980 if colors: 2981 printerr(colors.prompt + info + colors.reset) 2982 else: 2983 printerr('** ' + info) 2984 printerr('') 2985 2986 def read_next_command(self): 2987 """Show omniprompt and read user command line. 2988 2989 Command line is always stripped, and each consecutive group of 2990 whitespace is replaced with a single space character. If the 2991 command line is empty after stripping, when ignore it and keep 2992 reading. Exit with status 0 if we get EOF or an empty line 2993 (pre-strip, that is, a raw <enter>) twice in a row. 2994 2995 The new command line (non-empty) is stored in ``self.cmd``. 2996 2997 """ 2998 colors = self.colors 2999 message = 'googler (? for help)' 3000 prompt = (colors.prompt + message + colors.reset + ' ') if (colors and self.promptcolor) else (message + ': ') 3001 enter_count = 0 3002 while True: 3003 try: 3004 cmd = input(prompt) 3005 except EOFError: 3006 sys.exit(0) 3007 3008 if not cmd: 3009 enter_count += 1 3010 if enter_count == 2: 3011 # Double <enter> 3012 sys.exit(0) 3013 else: 3014 enter_count = 0 3015 3016 cmd = ' '.join(cmd.split()) 3017 if cmd: 3018 self.cmd = cmd 3019 break 3020 3021 @staticmethod 3022 def help(): 3023 GooglerArgumentParser.print_omniprompt_help(sys.stderr) 3024 printerr('') 3025 3026 @require_keywords 3027 @no_argument 3028 def do_first(self): 3029 try: 3030 self._google_url.first_page() 3031 except ValueError as e: 3032 print(e, file=sys.stderr) 3033 return 3034 3035 self.fetch_and_display() 3036 3037 def do_google(self, arg): 3038 # Update keywords and reconstruct URL 3039 self._opts.keywords = arg 3040 self._google_url = GoogleUrl(self._opts) 3041 self.fetch_and_display() 3042 3043 @require_keywords 3044 @no_argument 3045 def do_next(self): 3046 # If > 5 results are being fetched each time, 3047 # block next when no parsed results in current fetch 3048 if not self.results and self._google_url._num > 5: 3049 printerr('No results.') 3050 else: 3051 self._google_url.next_page() 3052 self.fetch_and_display() 3053 3054 @require_keywords 3055 def do_open(self, *args): 3056 if not args: 3057 open_url(self._google_url.full()) 3058 return 3059 3060 for nav in args: 3061 if nav == 'a': 3062 for key, value in sorted(self._urltable.items()): 3063 open_url(self._urltable[key]) 3064 elif nav in self._urltable: 3065 open_url(self._urltable[nav]) 3066 elif '-' in nav: 3067 try: 3068 vals = [int(x) for x in nav.split('-')] 3069 if (len(vals) != 2): 3070 printerr('Invalid range %s.' % nav) 3071 continue 3072 3073 if vals[0] > vals[1]: 3074 vals[0], vals[1] = vals[1], vals[0] 3075 3076 for _id in range(vals[0], vals[1] + 1): 3077 if str(_id) in self._urltable: 3078 open_url(self._urltable[str(_id)]) 3079 else: 3080 printerr('Invalid index %s.' % _id) 3081 except ValueError: 3082 printerr('Invalid range %s.' % nav) 3083 else: 3084 printerr('Invalid index %s.' % nav) 3085 3086 @require_keywords 3087 @no_argument 3088 def do_previous(self): 3089 try: 3090 self._google_url.prev_page() 3091 except ValueError as e: 3092 print(e, file=sys.stderr) 3093 return 3094 3095 self.fetch_and_display() 3096 3097 @require_keywords 3098 @no_argument 3099 def do_exact(self): 3100 # Reset start to 0 when exact is applied. 3101 self._google_url.update(start=0, exact=True) 3102 self.fetch_and_display() 3103 3104 @require_keywords 3105 @no_argument 3106 def do_unfilter(self): 3107 # Reset start to 0 when unfilter is applied. 3108 self._google_url.update(start=0) 3109 self._google_url.set_queries(filter=0) 3110 self.fetch_and_display() 3111 3112 def copy_url(self, idx): 3113 try: 3114 try: 3115 content = self._urltable[idx].encode('utf-8') 3116 except KeyError: 3117 printerr('Invalid index.') 3118 return 3119 3120 # try copying the url to clipboard using native utilities 3121 copier_params = [] 3122 if sys.platform.startswith(('linux', 'freebsd', 'openbsd')): 3123 if shutil.which('xsel') is not None: 3124 copier_params = ['xsel', '-b', '-i'] 3125 elif shutil.which('xclip') is not None: 3126 copier_params = ['xclip', '-selection', 'clipboard'] 3127 elif shutil.which('wl-copy') is not None: 3128 copier_params = ['wl-copy'] 3129 elif shutil.which('termux-clipboard-set') is not None: 3130 copier_params = ['termux-clipboard-set'] 3131 elif sys.platform == 'darwin': 3132 copier_params = ['pbcopy'] 3133 elif sys.platform == 'win32': 3134 copier_params = ['clip'] 3135 3136 if copier_params: 3137 Popen(copier_params, stdin=PIPE, stdout=DEVNULL, stderr=DEVNULL).communicate(content) 3138 return 3139 3140 # If native clipboard utilities are absent, try to use terminal multiplexers 3141 # tmux 3142 if os.getenv('TMUX_PANE'): 3143 copier_params = ['tmux', 'set-buffer'] 3144 Popen(copier_params + [content], stdin=DEVNULL, stdout=DEVNULL, stderr=DEVNULL).communicate() 3145 return 3146 3147 # GNU Screen paste buffer 3148 if os.getenv('STY'): 3149 import tempfile 3150 copier_params = ['screen', '-X', 'readbuf', '-e', 'utf8'] 3151 tmpfd, tmppath = tempfile.mkstemp() 3152 try: 3153 with os.fdopen(tmpfd, 'wb') as fp: 3154 fp.write(content) 3155 copier_params.append(tmppath) 3156 Popen(copier_params, stdin=DEVNULL, stdout=DEVNULL, stderr=DEVNULL).communicate() 3157 finally: 3158 os.unlink(tmppath) 3159 return 3160 3161 printerr('failed to locate suitable clipboard utility') 3162 except Exception: 3163 raise NoKeywordsException 3164 3165 def cmdloop(self): 3166 """Run REPL.""" 3167 if self.keywords: 3168 self.fetch_and_display() 3169 else: 3170 printerr('Please initiate a query.') 3171 3172 while True: 3173 self.read_next_command() 3174 # TODO: Automatic dispatcher 3175 # 3176 # We can't write a dispatcher for now because that could 3177 # change behaviour of the prompt. However, we have already 3178 # laid a lot of ground work for the dispatcher, e.g., the 3179 # `no_argument' decorator. 3180 try: 3181 cmd = self.cmd 3182 if cmd == 'f': 3183 self.do_first('') 3184 elif cmd.startswith('g '): 3185 self.do_google(cmd[2:]) 3186 elif cmd == 'n': 3187 self.do_next('') 3188 elif cmd == 'o': 3189 self.do_open() 3190 elif cmd.startswith('o '): 3191 self.do_open(*cmd[2:].split()) 3192 elif cmd.startswith('O '): 3193 open_url.override_text_browser = True 3194 self.do_open(*cmd[2:].split()) 3195 open_url.override_text_browser = False 3196 elif cmd == 'p': 3197 self.do_previous('') 3198 elif cmd == 'q': 3199 break 3200 elif cmd == 'x': 3201 self.do_exact('') 3202 elif cmd == 'unfilter': 3203 self.do_unfilter('') 3204 elif cmd == '?': 3205 self.help() 3206 elif cmd in self._urltable: 3207 open_url(self._urltable[cmd]) 3208 elif self.keywords and cmd.isdigit() and int(cmd) < 100: 3209 printerr('Index out of bound. To search for the number, use g.') 3210 elif cmd == 'u': 3211 Result.urlexpand = not Result.urlexpand 3212 self.display_results() 3213 elif cmd.startswith('c ') and self.re_url_index.match(cmd[2:]): 3214 self.copy_url(cmd[2:]) 3215 else: 3216 self.do_google(cmd) 3217 except NoKeywordsException: 3218 printerr('Initiate a query first.') 3219 3220 3221class GooglerArgumentParser(argparse.ArgumentParser): 3222 """Custom argument parser for googler.""" 3223 3224 # Print omniprompt help 3225 @staticmethod 3226 def print_omniprompt_help(file=None): 3227 file = sys.stderr if file is None else file 3228 file.write(textwrap.dedent(""" 3229 omniprompt keys: 3230 n, p fetch the next or previous set of search results 3231 index open the result corresponding to index in browser 3232 f jump to the first page 3233 o [index|range|a ...] open space-separated result indices, numeric ranges 3234 (sitelinks unsupported in ranges), or all, in browser 3235 open the current search in browser, if no arguments 3236 O [index|range|a ...] like key 'o', but try to open in a GUI browser 3237 g keywords new Google search for 'keywords' with original options 3238 should be used to search omniprompt keys and indices 3239 c index copy url to clipboard 3240 u toggle url expansion 3241 q, ^D, double Enter exit googler 3242 ? show omniprompt help 3243 * other inputs issue a new search with original options 3244 """)) 3245 3246 # Print information on googler 3247 @staticmethod 3248 def print_general_info(file=None): 3249 file = sys.stderr if file is None else file 3250 file.write(textwrap.dedent(""" 3251 Version %s 3252 Copyright © 2008 Henri Hakkinen 3253 Copyright © 2015-2021 Arun Prakash Jana <engineerarun@gmail.com> 3254 Zhiming Wang <zmwangx@gmail.com> 3255 License: GPLv3 3256 Webpage: https://github.com/jarun/googler 3257 """ % _VERSION_)) 3258 3259 # Augment print_help to print more than synopsis and options 3260 def print_help(self, file=None): 3261 super().print_help(file) 3262 self.print_omniprompt_help(file) 3263 self.print_general_info(file) 3264 3265 # Automatically print full help text on error 3266 def error(self, message): 3267 sys.stderr.write('%s: error: %s\n\n' % (self.prog, message)) 3268 self.print_help(sys.stderr) 3269 self.exit(2) 3270 3271 # Type guards 3272 @staticmethod 3273 def positive_int(arg): 3274 """Try to convert a string into a positive integer.""" 3275 try: 3276 n = int(arg) 3277 assert n > 0 3278 return n 3279 except (ValueError, AssertionError): 3280 raise argparse.ArgumentTypeError('%s is not a positive integer' % arg) 3281 3282 @staticmethod 3283 def nonnegative_int(arg): 3284 """Try to convert a string into a nonnegative integer.""" 3285 try: 3286 n = int(arg) 3287 assert n >= 0 3288 return n 3289 except (ValueError, AssertionError): 3290 raise argparse.ArgumentTypeError('%s is not a non-negative integer' % arg) 3291 3292 @staticmethod 3293 def is_duration(arg): 3294 """Check if a string is a valid duration accepted by Google. 3295 3296 A valid duration is of the form dNUM, where d is a single letter h 3297 (hour), d (day), w (week), m (month), or y (year), and NUM is a 3298 non-negative integer. 3299 """ 3300 try: 3301 if arg[0] not in ('h', 'd', 'w', 'm', 'y') or int(arg[1:]) < 0: 3302 raise ValueError 3303 except (TypeError, IndexError, ValueError): 3304 raise argparse.ArgumentTypeError('%s is not a valid duration' % arg) 3305 return arg 3306 3307 @staticmethod 3308 def is_date(arg): 3309 """Check if a string is a valid date/month/year accepted by Google.""" 3310 if re.match(r'^(\d+/){0,2}\d+$', arg): 3311 return arg 3312 else: 3313 raise argparse.ArgumentTypeError('%s is not a valid date/month/year; ' 3314 'use the American date format with slashes') 3315 3316 @staticmethod 3317 def is_colorstr(arg): 3318 """Check if a string is a valid color string.""" 3319 try: 3320 assert len(arg) == 6 3321 for c in arg: 3322 assert c in COLORMAP 3323 except AssertionError: 3324 raise argparse.ArgumentTypeError('%s is not a valid color string' % arg) 3325 return arg 3326 3327 3328# Self-upgrade mechanism 3329 3330def system_is_windows(): 3331 """Checks if the underlying system is Windows (Cygwin included).""" 3332 return sys.platform in {'win32', 'cygwin'} 3333 3334 3335def get_latest_ref(include_git=False): 3336 """Helper for download_latest_googler.""" 3337 import urllib.request 3338 3339 if include_git: 3340 # Get SHA of latest commit on master 3341 request = urllib.request.Request('%s/commits/master' % API_REPO_BASE, 3342 headers={'Accept': 'application/vnd.github.v3.sha'}) 3343 response = urllib.request.urlopen(request) 3344 if response.status != 200: 3345 raise http.client.HTTPException(response.reason) 3346 return response.read().decode('utf-8') 3347 else: 3348 # Get name of latest tag 3349 request = urllib.request.Request('%s/releases?per_page=1' % API_REPO_BASE, 3350 headers={'Accept': 'application/vnd.github.v3+json'}) 3351 response = urllib.request.urlopen(request) 3352 if response.status != 200: 3353 raise http.client.HTTPException(response.reason) 3354 import json 3355 return json.loads(response.read().decode('utf-8'))[0]['tag_name'] 3356 3357 3358def download_latest_googler(include_git=False): 3359 """Download latest googler to a temp file. 3360 3361 By default, the latest released version is downloaded, but if 3362 `include_git` is specified, then the latest git master is downloaded 3363 instead. 3364 3365 Parameters 3366 ---------- 3367 include_git : bool, optional 3368 Download from git master. Default is False. 3369 3370 Returns 3371 ------- 3372 (git_ref, path): tuple 3373 A tuple containing the git reference (either name of the latest 3374 tag or SHA of the latest commit) and path to the downloaded 3375 file. 3376 3377 """ 3378 # Download googler to a tempfile 3379 git_ref = get_latest_ref(include_git=include_git) 3380 googler_download_url = '%s/%s/googler' % (RAW_DOWNLOAD_REPO_BASE, git_ref) 3381 printerr('Downloading %s' % googler_download_url) 3382 request = urllib.request.Request(googler_download_url, 3383 headers={'Accept-Encoding': 'gzip'}) 3384 import tempfile 3385 fd, path = tempfile.mkstemp() 3386 atexit.register(lambda: os.remove(path) if os.path.exists(path) else None) 3387 os.close(fd) 3388 with open(path, 'wb') as fp: 3389 with urllib.request.urlopen(request) as response: 3390 if response.status != 200: 3391 raise http.client.HTTPException(response.reason) 3392 payload = response.read() 3393 try: 3394 fp.write(gzip.decompress(payload)) 3395 except OSError: 3396 fp.write(payload) 3397 return git_ref, path 3398 3399 3400def self_replace(path): 3401 """Replace the current script with a specified file. 3402 3403 Both paths (the specified path and path to the current script) are 3404 resolved to absolute, symlink-free paths. Upon replacement, the 3405 owner and mode signatures of the current script are preserved. The 3406 caller needs to have the necessary permissions. 3407 3408 Replacement won't happen if the specified file is the same 3409 (content-wise) as the current script. 3410 3411 Parameters 3412 ---------- 3413 path : str 3414 Path to the replacement file. 3415 3416 Returns 3417 ------- 3418 bool 3419 True if replaced, False if skipped (specified file is the same 3420 as the current script). 3421 3422 """ 3423 if system_is_windows(): 3424 raise NotImplementedError('Self upgrade not supported on Windows.') 3425 3426 import filecmp 3427 import shutil 3428 3429 path = os.path.realpath(path) 3430 self_path = os.path.realpath(__file__) 3431 3432 if filecmp.cmp(path, self_path): 3433 return False 3434 3435 self_stat = os.stat(self_path) 3436 os.chown(path, self_stat.st_uid, self_stat.st_gid) 3437 os.chmod(path, self_stat.st_mode) 3438 3439 shutil.move(path, self_path) 3440 return True 3441 3442 3443def self_upgrade(include_git=False): 3444 """Perform in-place self-upgrade. 3445 3446 Parameters 3447 ---------- 3448 include_git : bool, optional 3449 See `download_latest_googler`. Default is False. 3450 3451 """ 3452 git_ref, path = download_latest_googler(include_git=include_git) 3453 if self_replace(path): 3454 printerr('Upgraded to %s.' % git_ref) 3455 else: 3456 printerr('Already up to date.') 3457 3458 3459def check_new_version(): 3460 try: 3461 from distutils.version import StrictVersion as Version 3462 except ImportError: 3463 # distutils not available (thanks distros), use a concise poor 3464 # man's version parser. 3465 class Version(tuple): 3466 def __new__(cls, version_str): 3467 def parseint(s): 3468 try: 3469 return int(s) 3470 except ValueError: 3471 return 0 3472 return tuple.__new__(cls, [parseint(s) for s in version_str.split('.')]) 3473 3474 import pathlib 3475 import tempfile 3476 import time 3477 cache = pathlib.Path(tempfile.gettempdir()) / 'googler-latest-version' 3478 latest_version_str = None 3479 # Try to load latest version string from cached location, if it 3480 # exists and is fresh enough. 3481 try: 3482 if cache.is_file() and time.time() - cache.stat().st_mtime < 86400: 3483 latest_version_str = cache.read_text().strip() 3484 except OSError: 3485 pass 3486 if not latest_version_str: 3487 try: 3488 latest_version_str = get_latest_ref().lstrip('v') 3489 cache.write_text(latest_version_str) 3490 except Exception: 3491 pass 3492 if not latest_version_str: 3493 return 3494 # Try to fetch latest version string from GitHub. 3495 try: 3496 current_version = Version(_VERSION_) 3497 latest_version = Version(latest_version_str) 3498 except ValueError: 3499 return 3500 if latest_version > current_version: 3501 print('\x1b[33;1mThe latest release of googler is v%s, please upgrade.\x1b[0m' 3502 % latest_version_str, 3503 file=sys.stderr) 3504 3505 3506# Miscellaneous functions 3507 3508def python_version(): 3509 return '%d.%d.%d' % sys.version_info[:3] 3510 3511 3512def https_proxy_from_environment(): 3513 return os.getenv('https_proxy') 3514 3515 3516def parse_proxy_spec(proxyspec): 3517 if '://' in proxyspec: 3518 pos = proxyspec.find('://') 3519 scheme = proxyspec[:pos] 3520 proxyspec = proxyspec[pos+3:] 3521 if scheme.lower() != 'http': 3522 # Only support HTTP proxies. 3523 # 3524 # In particular, we don't support HTTPS proxies since we 3525 # only speak plain HTTP to the proxy server, so don't give 3526 # users a false sense of security. 3527 raise NotImplementedError('Unsupported proxy scheme %s.' % scheme) 3528 3529 if '@' in proxyspec: 3530 pos = proxyspec.find('@') 3531 user_passwd = urllib.parse.unquote(proxyspec[:pos]) 3532 # Remove trailing '/' if any 3533 host_port = proxyspec[pos+1:].rstrip('/') 3534 else: 3535 user_passwd = None 3536 host_port = proxyspec.rstrip('/') 3537 3538 if ':' not in host_port: 3539 # Use port 1080 as default, following curl. 3540 host_port += ':1080' 3541 3542 return user_passwd, host_port 3543 3544 3545def set_win_console_mode(): 3546 # VT100 control sequences are supported on Windows 10 Anniversary Update and later. 3547 # https://docs.microsoft.com/en-us/windows/console/console-virtual-terminal-sequences 3548 # https://docs.microsoft.com/en-us/windows/console/setconsolemode 3549 if platform.release() == '10': 3550 STD_OUTPUT_HANDLE = -11 3551 STD_ERROR_HANDLE = -12 3552 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004 3553 try: 3554 from ctypes import windll, wintypes, byref 3555 kernel32 = windll.kernel32 3556 for nhandle in (STD_OUTPUT_HANDLE, STD_ERROR_HANDLE): 3557 handle = kernel32.GetStdHandle(nhandle) 3558 old_mode = wintypes.DWORD() 3559 if not kernel32.GetConsoleMode(handle, byref(old_mode)): 3560 raise RuntimeError('GetConsoleMode failed') 3561 new_mode = old_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING 3562 if not kernel32.SetConsoleMode(handle, new_mode): 3563 raise RuntimeError('SetConsoleMode failed') 3564 # Note: No need to restore at exit. SetConsoleMode seems to 3565 # be limited to the calling process. 3566 except Exception: 3567 pass 3568 3569 3570# Query autocompleter 3571 3572# This function is largely experimental and could raise any exception; 3573# you should be prepared to catch anything. When it works though, it 3574# returns a list of strings the prefix could autocomplete to (however, 3575# it is not guaranteed that they start with the specified prefix; for 3576# instance, they won't if the specified prefix ends in a punctuation 3577# mark.) 3578def completer_fetch_completions(prefix): 3579 import html 3580 import json 3581 import re 3582 import urllib.request 3583 3584 # One can pass the 'hl' query param to specify the language. We 3585 # ignore that for now. 3586 api_url = ('https://www.google.com/complete/search?client=psy-ab&q=%s' % 3587 urllib.parse.quote(prefix, safe='')) 3588 # A timeout of 3 seconds seems to be overly generous already. 3589 resp = urllib.request.urlopen(api_url, timeout=3) 3590 charset = resp.headers.get_content_charset() 3591 logger.debug('Completions charset: %s', charset) 3592 respobj = json.loads(resp.read().decode(charset)) 3593 3594 # The response object, once parsed as JSON, should look like 3595 # 3596 # ['git', 3597 # [['git<b>hub</b>', 0], 3598 # ['git', 0], 3599 # ['git<b>lab</b>', 0], 3600 # ['git<b> stash</b>', 0]], 3601 # {'q': 'oooAhRzoChqNmMbNaaDKXk1YY4k', 't': {'bpc': False, 'tlw': False}}] 3602 # 3603 # Note the each result entry need not have two members; e.g., for 3604 # 'gi', there is an entry ['gi<b>f</b>', 0, [131]]. 3605 HTML_TAG = re.compile(r'<[^>]+>') 3606 return [html.unescape(HTML_TAG.sub('', entry[0])) for entry in respobj[1]] 3607 3608 3609def completer_run(prefix): 3610 if prefix: 3611 completions = completer_fetch_completions(prefix) 3612 if completions: 3613 print('\n'.join(completions)) 3614 sys.exit(0) 3615 3616 3617def parse_args(args=None, namespace=None): 3618 """Parse googler arguments/options. 3619 3620 Parameters 3621 ---------- 3622 args : list, optional 3623 Arguments to parse. Default is ``sys.argv``. 3624 namespace : argparse.Namespace 3625 Namespace to write to. Default is a new namespace. 3626 3627 Returns 3628 ------- 3629 argparse.Namespace 3630 Namespace with parsed arguments / options. 3631 3632 """ 3633 3634 colorstr_env = os.getenv('GOOGLER_COLORS') 3635 3636 argparser = GooglerArgumentParser(description='Google from the command-line.') 3637 addarg = argparser.add_argument 3638 addarg('-s', '--start', type=argparser.nonnegative_int, default=0, 3639 metavar='N', help='start at the Nth result') 3640 addarg('-n', '--count', dest='num', type=argparser.positive_int, 3641 default=10, metavar='N', help='show N results (default 10)') 3642 addarg('-N', '--news', action='store_true', 3643 help='show results from news section') 3644 addarg('-V', '--videos', action='store_true', 3645 help='show results from videos section') 3646 addarg('-c', '--tld', metavar='TLD', 3647 help="""country-specific search with top-level domain .TLD, e.g., 'in' 3648 for India""") 3649 addarg('-l', '--lang', metavar='LANG', help='display in language LANG') 3650 addarg('-g', '--geoloc', metavar='CC', 3651 help="""country-specific geolocation search with country code CC, e.g. 3652 'in' for India. Country codes are the same as top-level domains""") 3653 addarg('-x', '--exact', action='store_true', 3654 help='disable automatic spelling correction') 3655 addarg('--colorize', nargs='?', choices=['auto', 'always', 'never'], 3656 const='always', default='auto', 3657 help="""whether to colorize output; defaults to 'auto', which enables 3658 color when stdout is a tty device; using --colorize without an argument 3659 is equivalent to --colorize=always""") 3660 addarg('-C', '--nocolor', action='store_true', 3661 help='equivalent to --colorize=never') 3662 addarg('--colors', dest='colorstr', type=argparser.is_colorstr, 3663 default=colorstr_env if colorstr_env else 'GKlgxy', metavar='COLORS', 3664 help='set output colors (see man page for details)') 3665 addarg('-j', '--first', '--lucky', dest='lucky', action='store_true', 3666 help='open the first result in web browser and exit') 3667 addarg('-t', '--time', dest='duration', type=argparser.is_duration, 3668 metavar='dN', help='time limit search ' 3669 '[h5 (5 hrs), d5 (5 days), w5 (5 weeks), m5 (5 months), y5 (5 years)]') 3670 addarg('--from', type=argparser.is_date, 3671 help="""starting date/month/year of date range; must use American date 3672 format with slashes, e.g., 2/24/2020, 2/2020, 2020; can be used in 3673 conjunction with --to, and overrides -t, --time""") 3674 addarg('--to', type=argparser.is_date, 3675 help='ending date/month/year of date range; see --from') 3676 addarg('-w', '--site', dest='sites', action='append', metavar='SITE', 3677 help='search a site using Google') 3678 addarg('-e', '--exclude', dest='exclude', action='append', metavar='SITE', 3679 help='exclude site from results') 3680 addarg('--unfilter', action='store_true', help='do not omit similar results') 3681 addarg('-p', '--proxy', default=https_proxy_from_environment(), 3682 help="""tunnel traffic through an HTTP proxy; 3683 PROXY is of the form [http://][user:password@]proxyhost[:port]""") 3684 addarg('--noua', action='store_true', help=argparse.SUPPRESS) 3685 addarg('--notweak', action='store_true', 3686 help='disable TCP optimizations and forced TLS 1.2') 3687 addarg('--json', action='store_true', 3688 help='output in JSON format; implies --noprompt') 3689 addarg('--url-handler', metavar='UTIL', 3690 help='custom script or cli utility to open results') 3691 addarg('--show-browser-logs', action='store_true', 3692 help='do not suppress browser output (stdout and stderr)') 3693 addarg('--np', '--noprompt', dest='noninteractive', action='store_true', 3694 help='search and exit, do not prompt') 3695 addarg('-4', '--ipv4', action='store_const', dest='address_family', 3696 const=socket.AF_INET, default=0, 3697 help="""only connect over IPv4 3698 (by default, IPv4 is preferred but IPv6 is used as a fallback)""") 3699 addarg('-6', '--ipv6', action='store_const', dest='address_family', 3700 const=socket.AF_INET6, default=0, 3701 help='only connect over IPv6') 3702 addarg('keywords', nargs='*', metavar='KEYWORD', help='search keywords') 3703 if ENABLE_SELF_UPGRADE_MECHANISM and not system_is_windows(): 3704 addarg('-u', '--upgrade', action='store_true', 3705 help='perform in-place self-upgrade') 3706 addarg('--include-git', action='store_true', 3707 help='when used with --upgrade, get latest git master') 3708 addarg('-v', '--version', action='version', version=_VERSION_) 3709 addarg('-d', '--debug', action='store_true', help='enable debugging') 3710 # Hidden option for interacting with DOM in an IPython/pdb shell 3711 addarg('-D', '--debugger', action='store_true', help=argparse.SUPPRESS) 3712 # Hidden option for parsing dumped HTML 3713 addarg('--parse', dest='html_file', help=argparse.SUPPRESS) 3714 addarg('--complete', help=argparse.SUPPRESS) 3715 3716 parsed = argparser.parse_args(args, namespace) 3717 if parsed.nocolor: 3718 parsed.colorize = 'never' 3719 3720 return parsed 3721 3722 3723def main(): 3724 try: 3725 opts = parse_args() 3726 3727 # Set logging level 3728 if opts.debug: 3729 logger.setLevel(logging.DEBUG) 3730 logger.debug('googler version %s', _VERSION_) 3731 logger.debug('Python version %s', python_version()) 3732 logger.debug('Platform: %s', platform.platform()) 3733 check_new_version() 3734 3735 if opts.debugger: 3736 global debugger 3737 debugger = True 3738 3739 # Handle query completer 3740 if opts.complete is not None: 3741 completer_run(opts.complete) 3742 3743 # Handle self-upgrade 3744 if hasattr(opts, 'upgrade') and opts.upgrade: 3745 self_upgrade(include_git=opts.include_git) 3746 sys.exit(0) 3747 3748 check_stdout_encoding() 3749 3750 if opts.keywords: 3751 try: 3752 # Add cmdline args to readline history 3753 readline.add_history(' '.join(opts.keywords)) 3754 except Exception: 3755 pass 3756 3757 # Set colors 3758 if opts.colorize == 'always': 3759 colorize = True 3760 elif opts.colorize == 'auto': 3761 colorize = sys.stdout.isatty() 3762 else: # opts.colorize == 'never' 3763 colorize = False 3764 3765 if colorize: 3766 colors = Colors(*[COLORMAP[c] for c in opts.colorstr], reset=COLORMAP['x']) 3767 else: 3768 colors = None 3769 Result.colors = colors 3770 Result.urlexpand = True if os.getenv('DISABLE_URL_EXPANSION') is None else False 3771 GooglerCmd.colors = colors 3772 3773 # Try to enable ANSI color support in cmd or PowerShell on Windows 10 3774 if sys.platform == 'win32' and sys.stdout.isatty() and colorize: 3775 set_win_console_mode() 3776 3777 if opts.url_handler is not None: 3778 open_url.url_handler = opts.url_handler 3779 else: 3780 # Set text browser override to False 3781 open_url.override_text_browser = False 3782 3783 # Handle browser output suppression 3784 if opts.show_browser_logs or (os.getenv('BROWSER') in text_browsers): 3785 open_url.suppress_browser_output = False 3786 else: 3787 open_url.suppress_browser_output = True 3788 3789 if opts.noua: 3790 logger.warning('--noua option has been deprecated and has no effect (see #284)') 3791 3792 repl = GooglerCmd(opts) 3793 3794 # Non-interactive mode 3795 if opts.json or opts.lucky or opts.noninteractive or opts.html_file: 3796 repl.fetch() 3797 if opts.lucky: 3798 if repl.results: 3799 open_url(repl.results[0].url) 3800 else: 3801 print('No results.', file=sys.stderr) 3802 else: 3803 repl.showing_results_for_alert(interactive=False) 3804 repl.display_results(json_output=opts.json) 3805 sys.exit(0) 3806 3807 # Interactive mode 3808 repl.cmdloop() 3809 except Exception as e: 3810 # With debugging on, let the exception through for a traceback; 3811 # otherwise, only print the exception error message. 3812 if logger.isEnabledFor(logging.DEBUG): 3813 raise 3814 else: 3815 logger.error(e) 3816 sys.exit(1) 3817 3818if __name__ == '__main__': 3819 main() 3820