1#!/usr/local/bin/python3.8
2#
3# Copyright © 2008 Henri Hakkinen
4# Copyright © 2015-2021 Arun Prakash Jana <engineerarun@gmail.com>
5#
6# This program is free software: you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation, either version 3 of the License, or
9# (at your option) any later version.
10#
11# This program is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program.  If not, see <http://www.gnu.org/licenses/>.
18
19import argparse
20import atexit
21import base64
22import collections
23import codecs
24import functools
25import gzip
26import html.entities
27import html.parser
28import http.client
29from http.client import HTTPSConnection
30import locale
31import logging
32import os
33import platform
34import shutil
35import signal
36import socket
37import ssl
38import subprocess
39from subprocess import Popen, PIPE, DEVNULL
40import sys
41import textwrap
42import unicodedata
43import urllib.parse
44import uuid
45import webbrowser
46
47# Python optional dependency compatibility layer
48try:
49    import readline
50except ImportError:
51    pass
52
53try:
54    import setproctitle
55    setproctitle.setproctitle('googler')
56except (ImportError, Exception):
57    pass
58
59from typing import (
60    Any,
61    Dict,
62    Generator,
63    Iterable,
64    Iterator,
65    List,
66    Match,
67    Optional,
68    Sequence,
69    Tuple,
70    Union,
71    cast,
72)
73
74# Basic setup
75
76logging.basicConfig(format='[%(levelname)s] %(message)s')
77logger = logging.getLogger()
78
79
80def sigint_handler(signum, frame):
81    print('\nInterrupted.', file=sys.stderr)
82    sys.exit(1)
83
84try:
85    signal.signal(signal.SIGINT, sigint_handler)
86except ValueError:
87    # signal only works in main thread
88    pass
89
90
91# Constants
92
93_VERSION_ = '4.3.2'
94_EPOCH_ = '20210115'
95
96COLORMAP = {k: '\x1b[%sm' % v for k, v in {
97    'a': '30', 'b': '31', 'c': '32', 'd': '33',
98    'e': '34', 'f': '35', 'g': '36', 'h': '37',
99    'i': '90', 'j': '91', 'k': '92', 'l': '93',
100    'm': '94', 'n': '95', 'o': '96', 'p': '97',
101    'A': '30;1', 'B': '31;1', 'C': '32;1', 'D': '33;1',
102    'E': '34;1', 'F': '35;1', 'G': '36;1', 'H': '37;1',
103    'I': '90;1', 'J': '91;1', 'K': '92;1', 'L': '93;1',
104    'M': '94;1', 'N': '95;1', 'O': '96;1', 'P': '97;1',
105    'x': '0', 'X': '1', 'y': '7', 'Y': '7;1',
106}.items()}
107
108USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
109
110text_browsers = ['elinks', 'links', 'lynx', 'w3m', 'www-browser']
111
112# Self-upgrade parameters
113#
114# Downstream packagers are recommended to turn off the entire self-upgrade
115# mechanism through
116#
117#     make disable-self-upgrade
118#
119# before running `make install'.
120
121ENABLE_SELF_UPGRADE_MECHANISM = False
122API_REPO_BASE = 'https://api.github.com/repos/jarun/googler'
123RAW_DOWNLOAD_REPO_BASE = 'https://raw.githubusercontent.com/jarun/googler'
124
125debugger = False
126
127
128# Monkeypatch textwrap for CJK wide characters.
129
130def monkeypatch_textwrap_for_cjk():
131    try:
132        if textwrap.wrap.patched:
133            return
134    except AttributeError:
135        pass
136    psl_textwrap_wrap = textwrap.wrap
137
138    def textwrap_wrap(text, width=70, **kwargs):
139        if width <= 2:
140            width = 2
141        # We first add a U+0000 after each East Asian Fullwidth or East
142        # Asian Wide character, then fill to width - 1 (so that if a NUL
143        # character ends up on a new line, we still have one last column
144        # to spare for the preceding wide character). Finally we strip
145        # all the NUL characters.
146        #
147        # East Asian Width: https://www.unicode.org/reports/tr11/
148        return [
149            line.replace('\0', '')
150            for line in psl_textwrap_wrap(
151                ''.join(
152                    ch + '\0' if unicodedata.east_asian_width(ch) in ('F', 'W') else ch
153                    for ch in unicodedata.normalize('NFC', text)
154                ),
155                width=width - 1,
156                **kwargs
157            )
158        ]
159
160    def textwrap_fill(text, width=70, **kwargs):
161        return '\n'.join(textwrap_wrap(text, width=width, **kwargs))
162
163    textwrap.wrap = textwrap_wrap
164    textwrap.fill = textwrap_fill
165    textwrap.wrap.patched = True
166    textwrap.fill.patched = True
167
168
169monkeypatch_textwrap_for_cjk()
170
171
172CoordinateType = Tuple[int, int]
173
174
175class TrackedTextwrap:
176    """
177    Implements a text wrapper that tracks the position of each source
178    character, and can correctly insert zero-width sequences at given
179    offsets of the source text.
180
181    Wrapping result should be the same as that from PSL textwrap.wrap
182    with default settings except expand_tabs=False.
183    """
184
185    def __init__(self, text: str, width: int):
186        self._original = text
187
188        # Do the job of replace_whitespace first so that we can easily
189        # match text to wrapped lines later. Note that this operation
190        # does not change text length or offsets.
191        whitespace = "\t\n\v\f\r "
192        whitespace_trans = str.maketrans(whitespace, " " * len(whitespace))
193        text = text.translate(whitespace_trans)
194
195        self._lines = textwrap.wrap(
196            text, width, expand_tabs=False, replace_whitespace=False
197        )
198
199        # self._coords track the (row, column) coordinate of each source
200        # character in the result text. It is indexed by offset in
201        # source text.
202        self._coords = []  # type: List[CoordinateType]
203        offset = 0
204        try:
205            if not self._lines:
206                # Source text only has whitespaces. We add an empty line
207                # in order to produce meaningful coordinates.
208                self._lines = [""]
209            for row, line in enumerate(self._lines):
210                assert text[offset : offset + len(line)] == line
211                col = 0
212                for _ in line:
213                    self._coords.append((row, col))
214                    offset += 1
215                    col += 1
216                # All subsequent dropped whitespaces map to the last, imaginary column
217                # (the EOL character if you wish) of the current line.
218                while offset < len(text) and text[offset] == " ":
219                    self._coords.append((row, col))
220                    offset += 1
221            # One past the final character (think of it as EOF) should
222            # be treated as a valid offset.
223            self._coords.append((row, col))
224        except AssertionError:
225            raise RuntimeError(
226                "TrackedTextwrap: the impossible happened at offset {} of text {!r}".format(
227                    offset, self._original
228                )
229            )
230
231    # seq should be a zero-width sequence, e.g., an ANSI escape sequence.
232    # May raise IndexError if offset is out of bounds.
233    def insert_zero_width_sequence(self, seq: str, offset: int) -> None:
234        row, col = self._coords[offset]
235        line = self._lines[row]
236        self._lines[row] = line[:col] + seq + line[col:]
237
238        # Shift coordinates of all characters after the given character
239        # on the same line.
240        shift = len(seq)
241        offset += 1
242        while offset < len(self._coords) and self._coords[offset][0] == row:
243            _, col = self._coords[offset]
244            self._coords[offset] = (row, col + shift)
245            offset += 1
246
247    @property
248    def original(self) -> str:
249        return self._original
250
251    @property
252    def lines(self) -> List[str]:
253        return self._lines
254
255    @property
256    def wrapped(self) -> str:
257        return "\n".join(self._lines)
258
259    # May raise IndexError if offset is out of bounds.
260    def get_coordinate(self, offset: int) -> CoordinateType:
261        return self._coords[offset]
262
263
264### begin dim (DOM implementation with CSS support) ###
265### https://github.com/zmwangx/dim/blob/master/dim.py ###
266
267import html
268import re
269from collections import OrderedDict
270from enum import Enum
271from html.parser import HTMLParser
272
273
274SelectorGroupLike = Union[str, "SelectorGroup", "Selector"]
275
276
277class Node(object):
278    """
279    Represents a DOM node.
280
281    Parts of JavaScript's DOM ``Node`` API and ``Element`` API are
282    mirrored here, with extensions. In particular, ``querySelector`` and
283    ``querySelectorAll`` are mirrored.
284
285    Notable properties and methods: :meth:`attr()`, :attr:`classes`,
286    :attr:`html`, :attr:`text`, :meth:`ancestors()`,
287    :meth:`descendants()`, :meth:`select()`, :meth:`select_all()`,
288    :meth:`matched_by()`,
289
290    Attributes:
291        tag      (:class:`Optional`\\[:class:`str`])
292        attrs    (:class:`Dict`\\[:class:`str`, :class:`str`])
293        parent   (:class:`Optional`\\[:class:`Node`])
294        children (:class:`List`\\[:class:`Node`])
295    """
296
297    # Meant to be reimplemented by subclasses.
298    def __init__(self) -> None:
299        self.tag = None  # type: Optional[str]
300        self.attrs = {}  # type: Dict[str, str]
301        self.parent = None  # type: Optional[Node]
302        self.children = []  # type: List[Node]
303
304        # Used in DOMBuilder.
305        self._partial = False
306        self._namespace = None  # type: Optional[str]
307
308    # HTML representation of the node. Meant to be implemented by
309    # subclasses.
310    def __str__(self) -> str:  # pragma: no cover
311        raise NotImplementedError
312
313    def select(self, selector: SelectorGroupLike) -> Optional["Node"]:
314        """DOM ``querySelector`` clone. Returns one match (if any)."""
315        selector = self._normalize_selector(selector)
316        for node in self._select_all(selector):
317            return node
318        return None
319
320    def query_selector(self, selector: SelectorGroupLike) -> Optional["Node"]:
321        """Alias of :meth:`select`."""
322        return self.select(selector)
323
324    def select_all(self, selector: SelectorGroupLike) -> List["Node"]:
325        """DOM ``querySelectorAll`` clone. Returns all matches in a list."""
326        selector = self._normalize_selector(selector)
327        return list(self._select_all(selector))
328
329    def query_selector_all(self, selector: SelectorGroupLike) -> List["Node"]:
330        """Alias of :meth:`select_all`."""
331        return self.select_all(selector)
332
333    def matched_by(
334        self, selector: SelectorGroupLike, root: Optional["Node"] = None
335    ) -> bool:
336        """
337        Checks whether this node is matched by `selector`.
338
339        See :meth:`SelectorGroup.matches()`.
340        """
341        selector = self._normalize_selector(selector)
342        return selector.matches(self, root=root)
343
344    @staticmethod
345    def _normalize_selector(selector: SelectorGroupLike) -> "SelectorGroup":
346        if isinstance(selector, str):
347            return SelectorGroup.from_str(selector)
348        if isinstance(selector, SelectorGroup):
349            return selector
350        if isinstance(selector, Selector):
351            return SelectorGroup([selector])
352        raise ValueError("not a selector or group of selectors: %s" % repr(selector))
353
354    def _select_all(self, selector: "SelectorGroup") -> Generator["Node", None, None]:
355        for descendant in self.descendants():
356            if selector.matches(descendant, root=self):
357                yield descendant
358
359    def child_nodes(self) -> List["Node"]:
360        return self.children
361
362    def first_child(self) -> Optional["Node"]:
363        if self.children:
364            return self.children[0]
365        else:
366            return None
367
368    def first_element_child(self) -> Optional["Node"]:
369        for child in self.children:
370            if isinstance(child, ElementNode):
371                return child
372        return None
373
374    def last_child(self) -> Optional["Node"]:
375        if self.children:
376            return self.children[-1]
377        else:
378            return None
379
380    def last_element_child(self) -> Optional["Node"]:
381        for child in reversed(self.children):
382            if isinstance(child, ElementNode):
383                return child
384        return None
385
386    def next_sibling(self) -> Optional["Node"]:
387        """.. note:: Not O(1), use with caution."""
388        next_siblings = self.next_siblings()
389        if next_siblings:
390            return next_siblings[0]
391        else:
392            return None
393
394    def next_siblings(self) -> List["Node"]:
395        parent = self.parent
396        if not parent:
397            return []
398        try:
399            index = parent.children.index(self)
400            return parent.children[index + 1 :]
401        except ValueError:  # pragma: no cover
402            raise ValueError("node is not found in children of its parent")
403
404    def next_element_sibling(self) -> Optional["ElementNode"]:
405        """.. note:: Not O(1), use with caution."""
406        for sibling in self.next_siblings():
407            if isinstance(sibling, ElementNode):
408                return sibling
409        return None
410
411    def previous_sibling(self) -> Optional["Node"]:
412        """.. note:: Not O(1), use with caution."""
413        previous_siblings = self.previous_siblings()
414        if previous_siblings:
415            return previous_siblings[0]
416        else:
417            return None
418
419    def previous_siblings(self) -> List["Node"]:
420        """
421        Compared to the natural DOM order, the order of returned nodes
422        are reversed. That is, the adjacent sibling (if any) is the
423        first in the returned list.
424        """
425        parent = self.parent
426        if not parent:
427            return []
428        try:
429            index = parent.children.index(self)
430            if index > 0:
431                return parent.children[index - 1 :: -1]
432            else:
433                return []
434        except ValueError:  # pragma: no cover
435            raise ValueError("node is not found in children of its parent")
436
437    def previous_element_sibling(self) -> Optional["ElementNode"]:
438        """.. note:: Not O(1), use with caution."""
439        for sibling in self.previous_siblings():
440            if isinstance(sibling, ElementNode):
441                return sibling
442        return None
443
444    def ancestors(
445        self, *, root: Optional["Node"] = None
446    ) -> Generator["Node", None, None]:
447        """
448        Ancestors are generated in reverse order of depth, stopping at
449        `root`.
450
451        A :class:`RuntimeException` is raised if `root` is not in the
452        ancestral chain.
453        """
454        if self is root:
455            return
456        ancestor = self.parent
457        while ancestor is not root:
458            if ancestor is None:
459                raise RuntimeError("provided root node not found in ancestral chain")
460            yield ancestor
461            ancestor = ancestor.parent
462        if root:
463            yield root
464
465    def descendants(self) -> Generator["Node", None, None]:
466        """Descendants are generated in depth-first order."""
467        for child in self.children:
468            yield child
469            yield from child.descendants()
470
471    def attr(self, attr: str) -> Optional[str]:
472        """Returns the attribute if it exists on the node, otherwise ``None``."""
473        return self.attrs.get(attr)
474
475    @property
476    def html(self) -> str:
477        """
478        HTML representation of the node.
479
480        (For a :class:`TextNode`, :meth:`html` returns the escaped version of the
481        text.
482        """
483        return str(self)
484
485    def outer_html(self) -> str:
486        """Alias of :attr:`html`."""
487        return self.html
488
489    def inner_html(self) -> str:
490        """HTML representation of the node's children."""
491        return "".join(child.html for child in self.children)
492
493    @property
494    def text(self) -> str:  # pragma: no cover
495        """This property is expected to be implemented by subclasses."""
496        raise NotImplementedError
497
498    def text_content(self) -> str:
499        """Alias of :attr:`text`."""
500        return self.text
501
502    @property
503    def classes(self) -> List[str]:
504        return self.attrs.get("class", "").split()
505
506    def class_list(self) -> List[str]:
507        return self.classes
508
509
510class ElementNode(Node):
511    """
512    Represents an element node.
513
514    Note that tag and attribute names are case-insensitive; attribute
515    values are case-sensitive.
516    """
517
518    def __init__(
519        self,
520        tag: str,
521        attrs: Iterable[Tuple[str, Optional[str]]],
522        *,
523        parent: Optional["Node"] = None,
524        children: Optional[Sequence["Node"]] = None
525    ) -> None:
526        Node.__init__(self)
527        self.tag = tag.lower()  # type: str
528        self.attrs = OrderedDict((attr.lower(), val or "") for attr, val in attrs)
529        self.parent = parent
530        self.children = list(children or [])
531
532    def __repr__(self) -> str:
533        s = "<" + self.tag
534        if self.attrs:
535            s += " attrs=%s" % repr(list(self.attrs.items()))
536        if self.children:
537            s += " children=%s" % repr(self.children)
538        s += ">"
539        return s
540
541    # https://ipython.readthedocs.io/en/stable/api/generated/IPython.lib.pretty.html
542    def _repr_pretty_(self, p: Any, cycle: bool) -> None:  # pragma: no cover
543        if cycle:
544            raise RuntimeError("cycle detected in DOM tree")
545        p.text("<\x1b[1m%s\x1b[0m" % self.tag)
546        if self.attrs:
547            p.text(" attrs=%s" % repr(list(self.attrs.items())))
548        if self.children:
549            p.text(" children=[")
550            if len(self.children) == 1 and isinstance(self.first_child(), TextNode):
551                p.text("\x1b[4m%s\x1b[0m" % repr(self.first_child()))
552            else:
553                with p.indent(2):
554                    for child in self.children:
555                        p.break_()
556                        if hasattr(child, "_repr_pretty_"):
557                            child._repr_pretty_(p, False)  # type: ignore
558                        else:
559                            p.text("\x1b[4m%s\x1b[0m" % repr(child))
560                        p.text(",")
561                p.break_()
562            p.text("]")
563        p.text(">")
564
565    def __str__(self) -> str:
566        """HTML representation of the node."""
567        s = "<" + self.tag
568        for attr, val in self.attrs.items():
569            s += ' %s="%s"' % (attr, html.escape(val))
570        if self.children:
571            s += ">"
572            s += "".join(str(child) for child in self.children)
573            s += "</%s>" % self.tag
574        else:
575            if _tag_is_void(self.tag):
576                s += "/>"
577            else:
578                s += "></%s>" % self.tag
579        return s
580
581    @property
582    def text(self) -> str:
583        """The concatenation of all descendant text nodes."""
584        return "".join(child.text for child in self.children)
585
586
587class TextNode(str, Node):
588    """
589    Represents a text node.
590
591    Subclasses :class:`Node` and :class:`str`.
592    """
593
594    def __new__(cls, text: str) -> "TextNode":
595        s = str.__new__(cls, text)  # type: ignore
596        s.parent = None
597        return s  # type: ignore
598
599    def __init__(self, text: str) -> None:
600        Node.__init__(self)
601
602    def __repr__(self) -> str:
603        return "<%s>" % str.__repr__(self)
604
605    # HTML-escaped form of the text node. use text() for unescaped
606    # version.
607    def __str__(self) -> str:
608        return html.escape(self)
609
610    def __eq__(self, other: object) -> bool:
611        """
612        Two text nodes are equal if and only if they are the same node.
613
614        For string comparison, use :attr:`text`.
615        """
616        return self is other
617
618    def __ne__(self, other: object) -> bool:
619        """
620        Two text nodes are non-equal if they are not the same node.
621
622        For string comparison, use :attr:`text`.
623        """
624        return self is not other
625
626    @property
627    def text(self) -> str:
628        return str.__str__(self)
629
630
631class DOMBuilderException(Exception):
632    """
633    Exception raised when :class:`DOMBuilder` detects a bad state.
634
635    Attributes:
636        pos (:class:`Tuple`\\[:class:`int`, :class:`int`]):
637            Line number and offset in HTML input.
638        why (:class:`str`):
639            Reason of the exception.
640    """
641
642    def __init__(self, pos: Tuple[int, int], why: str) -> None:
643        self.pos = pos
644        self.why = why
645
646    def __str__(self) -> str:  # pragma: no cover
647        return "DOM builder aborted at %d:%d: %s" % (self.pos[0], self.pos[1], self.why)
648
649
650class DOMBuilder(HTMLParser):
651    """
652    HTML parser / DOM builder.
653
654    Subclasses :class:`html.parser.HTMLParser`.
655
656    Consume HTML and builds a :class:`Node` tree. Once finished, use
657    :attr:`root` to access the root of the tree.
658
659    This parser cannot parse malformed HTML with tag mismatch.
660    """
661
662    def __init__(self) -> None:
663        super().__init__(convert_charrefs=True)
664        # _stack is the stack for nodes. Each node is pushed to the
665        # stack when its start tag is processed, and remains on the
666        # stack until its parent node is completed (end tag processed),
667        # at which point the node is attached to the parent node as a
668        # child and popped from the stack.
669        self._stack = []  # type: List[Node]
670        # _namespace_stack is another stack tracking the parsing
671        # context, which is generally the default namespace (None) but
672        # changes when parsing foreign objects (e.g. 'svg' when parsing
673        # an <svg>). The top element is always the current parsing
674        # context, so popping works differently from _stack: an element
675        # is popped as soon as the corresponding end tag is processed.
676        self._namespace_stack = [None]  # type: List[Optional[str]]
677
678    def handle_starttag(
679        self, tag: str, attrs: Sequence[Tuple[str, Optional[str]]]
680    ) -> None:
681        node = ElementNode(tag, attrs)
682        node._partial = True
683        self._stack.append(node)
684        namespace = (
685            tag.lower()
686            if _tag_encloses_foreign_namespace(tag)
687            else self._namespace_stack[-1]  # Inherit parent namespace
688        )
689        node._namespace = namespace
690        self._namespace_stack.append(namespace)
691        # For void elements (not in a foreign context), immediately
692        # invoke the end tag handler (see handle_startendtag()).
693        if not namespace and _tag_is_void(tag):
694            self.handle_endtag(tag)
695
696    def handle_endtag(self, tag: str) -> None:
697        tag = tag.lower()
698        children = []
699        while self._stack and not self._stack[-1]._partial:
700            children.append(self._stack.pop())
701        if not self._stack:
702            raise DOMBuilderException(self.getpos(), "extra end tag: %s" % repr(tag))
703        parent = self._stack[-1]
704        if parent.tag != tag:
705            raise DOMBuilderException(
706                self.getpos(),
707                "expecting end tag %s, got %s" % (repr(parent.tag), repr(tag)),
708            )
709        parent.children = list(reversed(children))
710        parent._partial = False
711        for child in children:
712            child.parent = parent
713        self._namespace_stack.pop()
714
715    # Make parser behavior for explicitly and implicitly void elements
716    # (e.g., <hr> vs <hr/>) consistent. The former triggers
717    # handle_starttag only, whereas the latter triggers
718    # handle_startendtag (which by default triggers both handle_starttag
719    # and handle_endtag). See https://bugs.python.org/issue25258.
720    #
721    # An exception is foreign elements, which aren't considered void
722    # elements but can be explicitly marked as self-closing according to
723    # the HTML spec (e.g. <path/> is valid but <path> is not).
724    # Therefore, both handle_starttag and handle_endtag must be called,
725    # and handle_endtag should not be triggered from within
726    # handle_starttag in that case.
727    #
728    # Note that for simplicity we do not check whether the foreign
729    # element in question is allowed to be self-closing by spec. (The
730    # SVG spec unfortunately doesn't provide a readily available list of
731    # such elements.)
732    #
733    # https://html.spec.whatwg.org/multipage/syntax.html#foreign-elements
734    def handle_startendtag(
735        self, tag: str, attrs: Sequence[Tuple[str, Optional[str]]]
736    ) -> None:
737        if self._namespace_stack[-1] or _tag_encloses_foreign_namespace(tag):
738            self.handle_starttag(tag, attrs)
739            self.handle_endtag(tag)
740        else:
741            self.handle_starttag(tag, attrs)
742
743    def handle_data(self, text: str) -> None:
744        if not self._stack:
745            # Ignore text nodes before the first tag.
746            return
747        self._stack.append(TextNode(text))
748
749    @property
750    def root(self) -> "Node":
751        """
752        Finishes processing and returns the root node.
753
754        Raises :class:`DOMBuilderException` if there is no root tag or
755        root tag is not closed yet.
756        """
757        if not self._stack:
758            raise DOMBuilderException(self.getpos(), "no root tag")
759        if self._stack[0]._partial:
760            raise DOMBuilderException(self.getpos(), "root tag not closed yet")
761        return self._stack[0]
762
763
764def parse_html(html: str, *, ParserClass: type = DOMBuilder) -> "Node":
765    """
766    Parses HTML string, builds DOM, and returns root node.
767
768    The parser may raise :class:`DOMBuilderException`.
769
770    Args:
771        html: input HTML string
772        ParserClass: :class:`DOMBuilder` or a subclass
773
774    Returns:
775        Root note of the parsed tree. If the HTML string contains
776        multiple top-level elements, only the first is returned and the
777        rest are lost.
778    """
779    builder = ParserClass()  # type: DOMBuilder
780    builder.feed(html)
781    builder.close()
782    return builder.root
783
784
785class SelectorParserException(Exception):
786    """
787    Exception raised when the selector parser fails to parse an input.
788
789    Attributes:
790        s (:class:`str`):
791            The input string to be parsed.
792        cursor (:class:`int`):
793            Cursor position where the failure occurred.
794        why (:class:`str`):
795            Reason of the failure.
796    """
797
798    def __init__(self, s: str, cursor: int, why: str) -> None:
799        self.s = s
800        self.cursor = cursor
801        self.why = why
802
803    def __str__(self) -> str:  # pragma: no cover
804        return "selector parser aborted at character %d of %s: %s" % (
805            self.cursor,
806            repr(self.s),
807            self.why,
808        )
809
810
811class SelectorGroup:
812    """
813    Represents a group of CSS selectors.
814
815    A group of CSS selectors is simply a comma-separated list of
816    selectors. [#]_ See :class:`Selector` documentation for the scope of
817    support.
818
819    Typically, a :class:`SelectorGroup` is constructed from a string
820    (e.g., ``th.center, td.center``) using the factory function
821    :meth:`from_str`.
822
823    .. [#] https://www.w3.org/TR/selectors-3/#grouping
824    """
825
826    def __init__(self, selectors: Iterable["Selector"]) -> None:
827        self._selectors = list(selectors)
828
829    def __repr__(self) -> str:
830        return "<SelectorGroup %s>" % repr(str(self))
831
832    def __str__(self) -> str:
833        return ", ".join(str(selector) for selector in self._selectors)
834
835    def __len__(self) -> int:
836        return len(self._selectors)
837
838    def __getitem__(self, index: int) -> "Selector":
839        return self._selectors[index]
840
841    def __iter__(self) -> Iterator["Selector"]:
842        return iter(self._selectors)
843
844    @classmethod
845    def from_str(cls, s: str) -> "SelectorGroup":
846        """
847        Parses input string into a group of selectors.
848
849        :class:`SelectorParserException` is raised on invalid input. See
850        :class:`Selector` documentation for the scope of support.
851
852        Args:
853            s: input string
854
855        Returns:
856            Parsed group of selectors.
857        """
858        i = 0
859        selectors = []
860        while i < len(s):
861            selector, i = Selector.from_str(s, i)
862            selectors.append(selector)
863        if not selectors:
864            raise SelectorParserException(s, i, "selector group is empty")
865        return cls(selectors)
866
867    def matches(self, node: "Node", root: Optional["Node"] = None) -> bool:
868        """
869        Decides whether the group of selectors matches `node`.
870
871        The group of selectors matches `node` as long as one of the
872        selectors matches `node`.
873
874        If `root` is provided and child and/or descendant combinators
875        are involved, parent/ancestor lookup terminates at `root`.
876        """
877        return any(selector.matches(node, root=root) for selector in self)
878
879
880class Selector:
881    """
882    Represents a CSS selector.
883
884    Recall that a CSS selector is a chain of one or more *sequences of
885    simple selectors* separated by *combinators*. [#selectors-3]_ This
886    concept is represented as a cons list of sequences of simple
887    selectors (in right to left order). This class in fact holds a
888    single sequence, with an optional combinator and reference to the
889    previous sequence.
890
891    For instance, ``main#main p.important.definition >
892    a.term[id][href]`` would be parsed into (schematically) the
893    following structure::
894
895        ">" tag='a' classes=('term') attrs=([id], [href]) ~>
896        " " tag='p' classes=('important', 'definition') ~>
897        tag='main' id='main'
898
899    Each line is held in a separate instance of :class:`Selector`,
900    linked together by the :attr:`previous` attribute.
901
902    Supported grammar (from selectors level 3 [#selectors-3]_):
903
904    - Type selectors;
905    - Universal selectors;
906    - Class selectors;
907    - ID selectors;
908    - Attribute selectors;
909    - Combinators.
910
911    Unsupported grammar:
912
913    - Pseudo-classes;
914    - Pseudo-elements;
915    - Namespace prefixes (``ns|``, ``*|``, ``|``) in any part of any
916      selector.
917
918    Rationale:
919
920    - Pseudo-classes have too many variants, a few of which even
921      complete with an admittedly not-so-complex minilanguage. These add
922      up to a lot of code.
923    - Pseudo-elements are useless outside rendering contexts, hence out of
924      scope.
925    - Namespace support is too niche to be worth the parsing headache.
926      *Using namespace prefixes may confuse the parser!*
927
928    Note that the parser only loosely follows the spec and priotizes
929    ease of parsing (which includes readability and *writability* of
930    regexes), so some invalid selectors may be accepted (in fact, false
931    positives abound, but accepting valid inputs is a much more
932    important goal than rejecting invalid inputs for this library), and
933    some valid selectors may be rejected (but as long as you stick to
934    the scope outlined above and common sense you should be fine; the
935    false negatives shouldn't be used by actual human beings anyway).
936
937    In particular, whitespace character is simplified to ``\\s`` (ASCII
938    mode) despite CSS spec not counting U+000B (VT) as whitespace,
939    identifiers are simplified to ``[\\w-]+`` (ASCII mode), and strings
940    (attribute selector values can be either identifiers or strings)
941    allow escaped quotes (i.e., ``\\'`` inside single-quoted strings and
942    ``\\"`` inside double-quoted strings) but everything else is
943    interpreted literally. The exact specs for CSS identifiers and
944    strings can be found at [#]_.
945
946    Certain selectors and combinators may be implemented in the parser
947    but not implemented in matching and/or selection APIs.
948
949    .. [#selectors-3] https://www.w3.org/TR/selectors-3/
950    .. [#] https://www.w3.org/TR/CSS21/syndata.html
951
952    Attributes:
953        tag (:class:`Optional`\\[:class:`str`]):
954            Type selector.
955        classes (:class:`List`\\[:class:`str`]):
956            Class selectors.
957        id (:class:`Optional`\\[:class:`str`]):
958            ID selector.
959        attrs (:class:`List`\\[:class:`AttributeSelector`]):
960            Attribute selectors.
961        combinator (:class:`Optional`\\[:class:`Combinator`]):
962            Combinator with the previous sequence of simple selectors in
963            chain.
964        previous (:class:`Optional`\\[:class:`Selector`]):
965            Reference to the previous sequence of simple selectors in
966            chain.
967
968    """
969
970    def __init__(
971        self,
972        *,
973        tag: Optional[str] = None,
974        classes: Optional[Sequence[str]] = None,
975        id: Optional[str] = None,
976        attrs: Optional[Sequence["AttributeSelector"]] = None,
977        combinator: Optional["Combinator"] = None,
978        previous: Optional["Selector"] = None
979    ) -> None:
980        self.tag = tag.lower() if tag else None
981        self.classes = list(classes or [])
982        self.id = id
983        self.attrs = list(attrs or [])
984        self.combinator = combinator
985        self.previous = previous
986
987    def __repr__(self) -> str:
988        return "<Selector %s>" % repr(str(self))
989
990    def __str__(self) -> str:
991        sequences = []
992        delimiters = []
993        seq = self
994        while True:
995            sequences.append(seq._sequence_str_())
996            if seq.previous:
997                if seq.combinator == Combinator.DESCENDANT:
998                    delimiters.append(" ")
999                elif seq.combinator == Combinator.CHILD:
1000                    delimiters.append(" > ")
1001                elif seq.combinator == Combinator.NEXT_SIBLING:
1002                    delimiters.append(" + ")
1003                elif seq.combinator == Combinator.SUBSEQUENT_SIBLING:
1004                    delimiters.append(" ~ ")
1005                else:  # pragma: no cover
1006                    raise RuntimeError(
1007                        "unimplemented combinator: %s" % repr(self.combinator)
1008                    )
1009                seq = seq.previous
1010            else:
1011                delimiters.append("")
1012                break
1013        return "".join(
1014            delimiter + sequence
1015            for delimiter, sequence in zip(reversed(delimiters), reversed(sequences))
1016        )
1017
1018    # Format a single sequence of simple selectors, without combinator.
1019    def _sequence_str_(self) -> str:
1020        s = ""
1021        if self.tag:
1022            s += self.tag
1023        if self.classes:
1024            s += "".join(".%s" % class_ for class_ in self.classes)
1025        if self.id:
1026            s += "#%s" % self.id
1027        if self.attrs:
1028            s += "".join(str(attr) for attr in self.attrs)
1029        return s if s else "*"
1030
1031    @classmethod
1032    def from_str(cls, s: str, cursor: int = 0) -> Tuple["Selector", int]:
1033        """
1034        Parses input string into selector.
1035
1036        This factory function only parses out one selector (up to a
1037        comma or EOS), so partial consumption is allowed --- an optional
1038        `cursor` is taken as input (0 by default) and the moved cursor
1039        (either after the comma or at EOS) is returned as part of the
1040        output.
1041
1042        :class:`SelectorParserException` is raised on invalid input. See
1043        :class:`Selector` documentation for the scope of support.
1044
1045        If you need to completely consume a string representing
1046        (potentially) a group of selectors, use
1047        :meth:`SelectorGroup.from_str()`.
1048
1049        Args:
1050            s:      input string
1051            cursor: initial cursor position on `s`
1052
1053        Returns:
1054            A tuple containing the parsed selector and the moved the
1055            cursor (either after a comma-delimiter, or at EOS).
1056        """
1057        # Simple selectors.
1058        TYPE_SEL = re.compile(r"[\w-]+", re.A)
1059        UNIVERSAL_SEL = re.compile(r"\*")
1060        ATTR_SEL = re.compile(
1061            r"""\[
1062            \s*(?P<attr>[\w-]+)\s*
1063            (
1064                (?P<op>[~|^$*]?=)\s*
1065                (
1066                    (?P<val_identifier>[\w-]+)|
1067                    (?P<val_string>
1068                        (?P<quote>['"])
1069                        (?P<val_string_inner>.*?)
1070                        (?<!\\)(?P=quote)
1071                    )
1072                )\s*
1073            )?
1074            \]""",
1075            re.A | re.X,
1076        )
1077        CLASS_SEL = re.compile(r"\.([\w-]+)", re.A)
1078        ID_SEL = re.compile(r"#([\w-]+)", re.A)
1079        PSEUDO_CLASS_SEL = re.compile(r":[\w-]+(\([^)]+\))?", re.A)
1080        PSEUDO_ELEM_SEL = re.compile(r"::[\w-]+", re.A)
1081
1082        # Combinators
1083        DESCENDANT_COM = re.compile(r"\s+")
1084        CHILD_COM = re.compile(r"\s*>\s*")
1085        NEXT_SIB_COM = re.compile(r"\s*\+\s*")
1086        SUB_SIB_COM = re.compile(r"\s*~\s*")
1087
1088        # Misc
1089        WHITESPACE = re.compile(r"\s*")
1090        END_OF_SELECTOR = re.compile(r"\s*($|,)")
1091
1092        tag = None
1093        classes = []
1094        id = None
1095        attrs = []
1096        combinator = None
1097
1098        selector = None
1099        previous_combinator = None
1100
1101        i = cursor
1102
1103        # Skip leading whitespace
1104        m = WHITESPACE.match(s, i)
1105        if m:
1106            i = m.end()
1107
1108        while i < len(s):
1109            # Parse one simple selector.
1110            #
1111            # PEP 572 (assignment expressions; the one that burned Guido
1112            # so much that he resigned as BDFL) would have been nice; it
1113            # would have saved us from all the regex match
1114            # reassignments, and worse still, the casts, since mypy
1115            # complains about getting Optional[Match[str]] instead of
1116            # Match[str].
1117            if TYPE_SEL.match(s, i):
1118                if tag:
1119                    raise SelectorParserException(s, i, "multiple type selectors found")
1120                m = cast(Match[str], TYPE_SEL.match(s, i))
1121                tag = m.group()
1122            elif UNIVERSAL_SEL.match(s, i):
1123                m = cast(Match[str], UNIVERSAL_SEL.match(s, i))
1124            elif ATTR_SEL.match(s, i):
1125                m = cast(Match[str], ATTR_SEL.match(s, i))
1126
1127                attr = m.group("attr")
1128                op = m.group("op")
1129                val_identifier = m.group("val_identifier")
1130                quote = m.group("quote")
1131                val_string_inner = m.group("val_string_inner")
1132                if val_identifier is not None:
1133                    val = val_identifier
1134                elif val_string_inner is not None:
1135                    val = val_string_inner.replace("\\" + quote, quote)
1136                else:
1137                    val = None
1138
1139                if op is None:
1140                    type = AttributeSelectorType.BARE
1141                elif op == "=":
1142                    type = AttributeSelectorType.EQUAL
1143                elif op == "~=":
1144                    type = AttributeSelectorType.TILDE
1145                elif op == "|=":
1146                    type = AttributeSelectorType.PIPE
1147                elif op == "^=":
1148                    type = AttributeSelectorType.CARET
1149                elif op == "$=":
1150                    type = AttributeSelectorType.DOLLAR
1151                elif op == "*=":
1152                    type = AttributeSelectorType.ASTERISK
1153                else:  # pragma: no cover
1154                    raise SelectorParserException(
1155                        s,
1156                        i,
1157                        "unrecognized operator %s in attribute selector" % repr(op),
1158                    )
1159
1160                attrs.append(AttributeSelector(attr, val, type))
1161            elif CLASS_SEL.match(s, i):
1162                m = cast(Match[str], CLASS_SEL.match(s, i))
1163                classes.append(m.group(1))
1164            elif ID_SEL.match(s, i):
1165                if id:
1166                    raise SelectorParserException(s, i, "multiple id selectors found")
1167                m = cast(Match[str], ID_SEL.match(s, i))
1168                id = m.group(1)
1169            elif PSEUDO_CLASS_SEL.match(s, i):
1170                raise SelectorParserException(s, i, "pseudo-classes not supported")
1171            elif PSEUDO_ELEM_SEL.match(s, i):
1172                raise SelectorParserException(s, i, "pseudo-elements not supported")
1173            else:
1174                raise SelectorParserException(
1175                    s, i, "expecting simple selector, found none"
1176                )
1177            i = m.end()
1178
1179            # Try to parse a combinator, or end the selector.
1180            if CHILD_COM.match(s, i):
1181                m = cast(Match[str], CHILD_COM.match(s, i))
1182                combinator = Combinator.CHILD
1183            elif NEXT_SIB_COM.match(s, i):
1184                m = cast(Match[str], NEXT_SIB_COM.match(s, i))
1185                combinator = Combinator.NEXT_SIBLING
1186            elif SUB_SIB_COM.match(s, i):
1187                m = cast(Match[str], SUB_SIB_COM.match(s, i))
1188                combinator = Combinator.SUBSEQUENT_SIBLING
1189            elif END_OF_SELECTOR.match(s, i):
1190                m = cast(Match[str], END_OF_SELECTOR.match(s, i))
1191                combinator = None
1192            # Need to parse descendant combinator at the very end
1193            # because it could be a prefix to all previous cases.
1194            elif DESCENDANT_COM.match(s, i):
1195                m = cast(Match[str], DESCENDANT_COM.match(s, i))
1196                combinator = Combinator.DESCENDANT
1197            else:
1198                continue
1199            i = m.end()
1200
1201            if combinator and i == len(s):
1202                raise SelectorParserException(s, i, "unexpected end at combinator")
1203
1204            selector = cls(
1205                tag=tag,
1206                classes=classes,
1207                id=id,
1208                attrs=attrs,
1209                combinator=previous_combinator,
1210                previous=selector,
1211            )
1212            previous_combinator = combinator
1213
1214            # End of selector.
1215            if combinator is None:
1216                break
1217
1218            tag = None
1219            classes = []
1220            id = None
1221            attrs = []
1222            combinator = None
1223
1224        if not selector:
1225            raise SelectorParserException(s, i, "selector is empty")
1226
1227        return selector, i
1228
1229    def matches(self, node: "Node", root: Optional["Node"] = None) -> bool:
1230        """
1231        Decides whether the selector matches `node`.
1232
1233        Each sequence of simple selectors in the selector's chain must
1234        be matched for a positive.
1235
1236        If `root` is provided and child and/or descendant combinators
1237        are involved, parent/ancestor lookup terminates at `root`.
1238        """
1239        if self.tag:
1240            if not node.tag or node.tag != self.tag:
1241                return False
1242        if self.id:
1243            if node.attrs.get("id") != self.id:
1244                return False
1245        if self.classes:
1246            classes = node.classes
1247            for class_ in self.classes:
1248                if class_ not in classes:
1249                    return False
1250        if self.attrs:
1251            for attr_selector in self.attrs:
1252                if not attr_selector.matches(node):
1253                    return False
1254
1255        if not self.previous:
1256            return True
1257
1258        if self.combinator == Combinator.DESCENDANT:
1259            return any(
1260                self.previous.matches(ancestor, root=root)
1261                for ancestor in node.ancestors()
1262            )
1263        elif self.combinator == Combinator.CHILD:
1264            if node is root or node.parent is None:
1265                return False
1266            else:
1267                return self.previous.matches(node.parent)
1268        elif self.combinator == Combinator.NEXT_SIBLING:
1269            sibling = node.previous_element_sibling()
1270            if not sibling:
1271                return False
1272            else:
1273                return self.previous.matches(sibling)
1274        elif self.combinator == Combinator.SUBSEQUENT_SIBLING:
1275            return any(
1276                self.previous.matches(sibling, root=root)
1277                for sibling in node.previous_siblings()
1278                if isinstance(sibling, ElementNode)
1279            )
1280        else:  # pragma: no cover
1281            raise RuntimeError("unimplemented combinator: %s" % repr(self.combinator))
1282
1283
1284class AttributeSelector:
1285    """
1286    Represents an attribute selector.
1287
1288    Attributes:
1289        attr (:class:`str`)
1290        val  (:class:`Optional`\\[:class:`str`])
1291        type (:class:`AttributeSelectorType`)
1292    """
1293
1294    def __init__(
1295        self, attr: str, val: Optional[str], type: "AttributeSelectorType"
1296    ) -> None:
1297        self.attr = attr.lower()
1298        self.val = val
1299        self.type = type
1300
1301    def __repr__(self) -> str:
1302        return "<AttributeSelector %s>" % repr(str(self))
1303
1304    def __str__(self) -> str:
1305        if self.type == AttributeSelectorType.BARE:
1306            fmt = "[{attr}{val:.0}]"
1307        elif self.type == AttributeSelectorType.EQUAL:
1308            fmt = "[{attr}={val}]"
1309        elif self.type == AttributeSelectorType.TILDE:
1310            fmt = "[{attr}~={val}]"
1311        elif self.type == AttributeSelectorType.PIPE:
1312            fmt = "[{attr}|={val}]"
1313        elif self.type == AttributeSelectorType.CARET:
1314            fmt = "[{attr}^={val}]"
1315        elif self.type == AttributeSelectorType.DOLLAR:
1316            fmt = "[{attr}$={val}]"
1317        elif self.type == AttributeSelectorType.ASTERISK:
1318            fmt = "[{attr}*={val}]"
1319        return fmt.format(attr=self.attr, val=repr(self.val))
1320
1321    def matches(self, node: "Node") -> bool:
1322        val = node.attrs.get(self.attr)
1323        if val is None:
1324            return False
1325        if self.type == AttributeSelectorType.BARE:
1326            return True
1327        elif self.type == AttributeSelectorType.EQUAL:
1328            return val == self.val
1329        elif self.type == AttributeSelectorType.TILDE:
1330            return self.val in val.split()
1331        elif self.type == AttributeSelectorType.PIPE:
1332            return val == self.val or val.startswith("%s-" % self.val)
1333        elif self.type == AttributeSelectorType.CARET:
1334            return bool(self.val and val.startswith(self.val))
1335        elif self.type == AttributeSelectorType.DOLLAR:
1336            return bool(self.val and val.endswith(self.val))
1337        elif self.type == AttributeSelectorType.ASTERISK:
1338            return bool(self.val and self.val in val)
1339        else:  # pragma: no cover
1340            raise RuntimeError("unimplemented attribute selector: %s" % repr(self.type))
1341
1342
1343# Enum: basis for poor man's algebraic data type.
1344class AttributeSelectorType(Enum):
1345    """
1346    Attribute selector types.
1347
1348    Members correspond to the following forms of attribute selector:
1349
1350    - :attr:`BARE`: ``[attr]``;
1351    - :attr:`EQUAL`: ``[attr=val]``;
1352    - :attr:`TILDE`: ``[attr~=val]``;
1353    - :attr:`PIPE`: ``[attr|=val]``;
1354    - :attr:`CARET`: ``[attr^=val]``;
1355    - :attr:`DOLLAR`: ``[attr$=val]``;
1356    - :attr:`ASTERISK`: ``[attr*=val]``.
1357    """
1358
1359    # [attr]
1360    BARE = 1
1361    # [attr=val]
1362    EQUAL = 2
1363    # [attr~=val]
1364    TILDE = 3
1365    # [attr|=val]
1366    PIPE = 4
1367    # [attr^=val]
1368    CARET = 5
1369    # [attr$=val]
1370    DOLLAR = 6
1371    # [attr*=val]
1372    ASTERISK = 7
1373
1374
1375class Combinator(Enum):
1376    """
1377    Combinator types.
1378
1379    Members correspond to the following combinators:
1380
1381    - :attr:`DESCENDANT`: ``A B``;
1382    - :attr:`CHILD`: ``A > B``;
1383    - :attr:`NEXT_SIBLING`: ``A + B``;
1384    - :attr:`SUBSEQUENT_SIBLING`: ``A ~ B``.
1385    """
1386
1387    # ' '
1388    DESCENDANT = 1
1389    # >
1390    CHILD = 2
1391    # +
1392    NEXT_SIBLING = 3
1393    # ~
1394    SUBSEQUENT_SIBLING = 4
1395
1396
1397def _tag_is_void(tag: str) -> bool:
1398    """
1399    Checks whether the tag corresponds to a void element.
1400
1401    https://www.w3.org/TR/html5/syntax.html#void-elements
1402    https://html.spec.whatwg.org/multipage/syntax.html#void-elements
1403    """
1404    return tag.lower() in (
1405        "area",
1406        "base",
1407        "br",
1408        "col",
1409        "embed",
1410        "hr",
1411        "img",
1412        "input",
1413        "link",
1414        "meta",
1415        "param",
1416        "source",
1417        "track",
1418        "wbr",
1419    )
1420
1421
1422def _tag_encloses_foreign_namespace(tag: str) -> bool:
1423    """
1424    Checks whether the tag encloses a foreign namespace (MathML or SVG).
1425
1426    https://html.spec.whatwg.org/multipage/syntax.html#foreign-elements
1427    """
1428    return tag.lower() in ("math", "svg")
1429
1430
1431### end dim ###
1432
1433
1434# Global helper functions
1435
1436def open_url(url):
1437    """Open an URL in the user's default web browser.
1438
1439    The string attribute ``open_url.url_handler`` can be used to open URLs
1440    in a custom CLI script or utility. A subprocess is spawned with url as
1441    the parameter in this case instead of the usual webbrowser.open() call.
1442
1443    Whether the browser's output (both stdout and stderr) are suppressed
1444    depends on the boolean attribute ``open_url.suppress_browser_output``.
1445    If the attribute is not set upon a call, set it to a default value,
1446    which means False if BROWSER is set to a known text-based browser --
1447    elinks, links, lynx, w3m or 'www-browser'; or True otherwise.
1448
1449    The string attribute ``open_url.override_text_browser`` can be used to
1450    ignore env var BROWSER as well as some known text-based browsers and
1451    attempt to open url in a GUI browser available.
1452    Note: If a GUI browser is indeed found, this option ignores the program
1453          option `show-browser-logs`
1454    """
1455    logger.debug('Opening %s', url)
1456
1457    # Custom URL handler gets max priority
1458    if hasattr(open_url, 'url_handler'):
1459        subprocess.run([open_url.url_handler, url])
1460        return
1461
1462    browser = webbrowser.get()
1463    if open_url.override_text_browser:
1464        browser_output = open_url.suppress_browser_output
1465        for name in [b for b in webbrowser._tryorder if b not in text_browsers]:
1466            browser = webbrowser.get(name)
1467            logger.debug(browser)
1468
1469            # Found a GUI browser, suppress browser output
1470            open_url.suppress_browser_output = True
1471            break
1472
1473    if open_url.suppress_browser_output:
1474        _stderr = os.dup(2)
1475        os.close(2)
1476        _stdout = os.dup(1)
1477        # Patch for GUI browsers on WSL
1478        if "microsoft" not in platform.uname()[3].lower():
1479            os.close(1)
1480        fd = os.open(os.devnull, os.O_RDWR)
1481        os.dup2(fd, 2)
1482        os.dup2(fd, 1)
1483    try:
1484        browser.open(url, new=2)
1485    finally:
1486        if open_url.suppress_browser_output:
1487            os.close(fd)
1488            os.dup2(_stderr, 2)
1489            os.dup2(_stdout, 1)
1490
1491    if open_url.override_text_browser:
1492        open_url.suppress_browser_output = browser_output
1493
1494
1495def printerr(msg):
1496    """Print message, verbatim, to stderr.
1497
1498    ``msg`` could be any stringifiable value.
1499    """
1500    print(msg, file=sys.stderr)
1501
1502
1503def unwrap(text):
1504    """Unwrap text."""
1505    lines = text.split('\n')
1506    result = ''
1507    for i in range(len(lines) - 1):
1508        result += lines[i]
1509        if not lines[i]:
1510            # Paragraph break
1511            result += '\n\n'
1512        elif lines[i + 1]:
1513            # Next line is not paragraph break, add space
1514            result += ' '
1515    # Handle last line
1516    result += lines[-1] if lines[-1] else '\n'
1517    return result
1518
1519
1520def check_stdout_encoding():
1521    """Make sure stdout encoding is utf-8.
1522
1523    If not, print error message and instructions, then exit with
1524    status 1.
1525
1526    This function is a no-op on win32 because encoding on win32 is
1527    messy, and let's just hope for the best. /s
1528    """
1529    if sys.platform == 'win32':
1530        return
1531
1532    # Use codecs.lookup to resolve text encoding alias
1533    encoding = codecs.lookup(sys.stdout.encoding).name
1534    if encoding != 'utf-8':
1535        locale_lang, locale_encoding = locale.getlocale()
1536        if locale_lang is None:
1537            locale_lang = '<unknown>'
1538        if locale_encoding is None:
1539            locale_encoding = '<unknown>'
1540        ioencoding = os.getenv('PYTHONIOENCODING', 'not set')
1541        sys.stderr.write(unwrap(textwrap.dedent("""\
1542        stdout encoding '{encoding}' detected. googler requires utf-8 to
1543        work properly. The wrong encoding may be due to a non-UTF-8
1544        locale or an improper PYTHONIOENCODING. (For the record, your
1545        locale language is {locale_lang} and locale encoding is
1546        {locale_encoding}; your PYTHONIOENCODING is {ioencoding}.)
1547
1548        Please set a UTF-8 locale (e.g., en_US.UTF-8) or set
1549        PYTHONIOENCODING to utf-8.
1550        """.format(
1551            encoding=encoding,
1552            locale_lang=locale_lang,
1553            locale_encoding=locale_encoding,
1554            ioencoding=ioencoding,
1555        ))))
1556        sys.exit(1)
1557
1558
1559def time_it(description=None):
1560    def decorator(func):
1561        @functools.wraps(func)
1562        def wrapped(*args, **kwargs):
1563            # Only profile in debug mode.
1564            if not logger.isEnabledFor(logging.DEBUG):
1565                return func(*args, **kwargs)
1566
1567            import time
1568            mark = time.perf_counter()
1569            ret = func(*args, **kwargs)
1570            duration = time.perf_counter() - mark
1571            logger.debug('%s completed in \x1b[33m%.3fs\x1b[0m', description or func.__name__, duration)
1572            return ret
1573
1574        return wrapped
1575
1576    return decorator
1577
1578
1579# Classes
1580
1581class HardenedHTTPSConnection(HTTPSConnection):
1582    """Overrides HTTPSConnection.connect to specify TLS version
1583
1584    NOTE: TLS 1.2 is supported from Python 3.4
1585    """
1586
1587    def __init__(self, host, address_family=0, **kwargs):
1588        HTTPSConnection.__init__(self, host, **kwargs)
1589        self.address_family = address_family
1590
1591    def connect(self, notweak=False):
1592        sock = self.create_socket_connection()
1593
1594        # Optimizations not available on OS X
1595        if not notweak and sys.platform.startswith('linux'):
1596            try:
1597                sock.setsockopt(socket.SOL_TCP, socket.TCP_DEFER_ACCEPT, 1)
1598                sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_QUICKACK, 1)
1599                sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, 524288)
1600            except OSError:
1601                # Doesn't work on Windows' Linux subsystem (#179)
1602                logger.debug('setsockopt failed')
1603
1604        if getattr(self, '_tunnel_host', None):
1605            self.sock = sock
1606        elif not notweak:
1607            # Try to use TLS 1.2
1608            ssl_context = None
1609            if hasattr(ssl, 'PROTOCOL_TLS'):
1610                # Since Python 3.5.3
1611                ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS)
1612                if hasattr(ssl_context, "minimum_version"):
1613                    # Python 3.7 with OpenSSL 1.1.0g or later
1614                    ssl_context.minimum_version = ssl.TLSVersion.TLSv1_2
1615                else:
1616                    ssl_context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 |
1617                                            ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
1618            elif hasattr(ssl, 'PROTOCOL_TLSv1_2'):
1619                # Since Python 3.4
1620                ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
1621            if ssl_context:
1622                self.sock = ssl_context.wrap_socket(sock)
1623                return
1624
1625        # Fallback
1626        HTTPSConnection.connect(self)
1627
1628    # Adapted from socket.create_connection.
1629    # https://github.com/python/cpython/blob/bce4ddafdd188cc6deb1584728b67b9e149ca6a4/Lib/socket.py#L771-L813
1630    def create_socket_connection(self):
1631        err = None
1632        results = socket.getaddrinfo(self.host, self.port, self.address_family, socket.SOCK_STREAM)
1633        # Prefer IPv4 if address family isn't explicitly specified.
1634        if self.address_family == 0:
1635            results = sorted(results, key=lambda res: 1 if res[0] == socket.AF_INET else 2)
1636        for af, socktype, proto, canonname, sa in results:
1637            sock = None
1638            try:
1639                sock = socket.socket(af, socktype, proto)
1640                if self.timeout is not None:
1641                    sock.settimeout(self.timeout)
1642                if self.source_address:
1643                    sock.bind(self.source_address)
1644                sock.connect(sa)
1645                # Break explicitly a reference cycle
1646                err = None
1647                self.address_family = af
1648                logger.debug('Opened socket to %s:%d',
1649                             sa[0] if af == socket.AF_INET else ('[%s]' % sa[0]),
1650                             sa[1])
1651                return sock
1652
1653            except socket.error as _:
1654                err = _
1655                if sock is not None:
1656                    sock.close()
1657
1658        if err is not None:
1659            try:
1660                raise err
1661            finally:
1662                # Break explicitly a reference cycle
1663                err = None
1664        else:
1665            raise socket.error("getaddrinfo returns an empty list")
1666
1667
1668class GoogleUrl(object):
1669    """
1670    This class constructs the Google Search/News URL.
1671
1672    This class is modelled on urllib.parse.ParseResult for familiarity,
1673    which means it supports reading of all six attributes -- scheme,
1674    netloc, path, params, query, fragment -- of
1675    urllib.parse.ParseResult, as well as the geturl() method.
1676
1677    However, the attributes (properties) and methods listed below should
1678    be the preferred methods of access to this class.
1679
1680    Parameters
1681    ----------
1682    opts : dict or argparse.Namespace, optional
1683        See the ``opts`` parameter of `update`.
1684
1685    Other Parameters
1686    ----------------
1687    See "Other Parameters" of `update`.
1688
1689    Attributes
1690    ----------
1691    hostname : str
1692        Read-write property.
1693    keywords : str or list of strs
1694        Read-write property.
1695    news : bool
1696        Read-only property.
1697    videos : bool
1698        Read-only property.
1699    url : str
1700        Read-only property.
1701
1702    Methods
1703    -------
1704    full()
1705    relative()
1706    update(opts=None, **kwargs)
1707    set_queries(**kwargs)
1708    unset_queries(*args)
1709    next_page()
1710    prev_page()
1711    first_page()
1712
1713    """
1714
1715    def __init__(self, opts=None, **kwargs):
1716        self.scheme = 'https'
1717        # self.netloc is a calculated property
1718        self.path = '/search'
1719        self.params = ''
1720        # self.query is a calculated property
1721        self.fragment = ''
1722
1723        self._tld = None
1724        self._num = 10
1725        self._start = 0
1726        self._keywords = []
1727        self._sites = None
1728        self._exclude = None
1729
1730        self._query_dict = {
1731            'ie': 'UTF-8',
1732            'oe': 'UTF-8',
1733            #'gbv': '1',  # control the presence of javascript on the page, 1=no js, 2=js
1734            'sei': base64.encodebytes(uuid.uuid4().bytes).decode("ascii").rstrip('=\n').replace('/', '_'),
1735        }
1736
1737        # In preloaded HTML parsing mode, set keywords to something so
1738        # that we are not tripped up by require_keywords.
1739        if opts.html_file and not opts.keywords:
1740            opts.keywords = ['<debug>']
1741
1742        self.update(opts, **kwargs)
1743
1744    def __str__(self):
1745        return self.url
1746
1747    @property
1748    def url(self):
1749        """The full Google URL you want."""
1750        return self.full()
1751
1752    @property
1753    def hostname(self):
1754        """The hostname."""
1755        return self.netloc
1756
1757    @hostname.setter
1758    def hostname(self, hostname):
1759        self.netloc = hostname
1760
1761    @property
1762    def keywords(self):
1763        """The keywords, either a str or a list of strs."""
1764        return self._keywords
1765
1766    @keywords.setter
1767    def keywords(self, keywords):
1768        self._keywords = keywords
1769
1770    @property
1771    def news(self):
1772        """Whether the URL is for Google News."""
1773        return 'tbm' in self._query_dict and self._query_dict['tbm'] == 'nws'
1774
1775    @property
1776    def videos(self):
1777        """Whether the URL is for Google Videos."""
1778        return 'tbm' in self._query_dict and self._query_dict['tbm'] == 'vid'
1779
1780    def full(self):
1781        """Return the full URL.
1782
1783        Returns
1784        -------
1785        str
1786
1787        """
1788        url = (self.scheme + ':') if self.scheme else ''
1789        url += '//' + self.netloc + self.relative()
1790        return url
1791
1792    def relative(self):
1793        """Return the relative URL (without scheme and authority).
1794
1795        Authority (see RFC 3986 section 3.2), or netloc in the
1796        terminology of urllib.parse, basically means the hostname
1797        here. The relative URL is good for making HTTP(S) requests to a
1798        known host.
1799
1800        Returns
1801        -------
1802        str
1803
1804        """
1805        rel = self.path
1806        if self.params:
1807            rel += ';' + self.params
1808        if self.query:
1809            rel += '?' + self.query
1810        if self.fragment:
1811            rel += '#' + self.fragment
1812        return rel
1813
1814    def update(self, opts=None, **kwargs):
1815        """Update the URL with the given options.
1816
1817        Parameters
1818        ----------
1819        opts : dict or argparse.Namespace, optional
1820            Carries options that affect the Google Search/News URL. The
1821            list of currently recognized option keys with expected value
1822            types:
1823
1824                duration: str (GooglerArgumentParser.is_duration)
1825                exact: bool
1826                keywords: str or list of strs
1827                lang: str
1828                news: bool
1829                videos: bool
1830                num: int
1831                site: str
1832                start: int
1833                tld: str
1834                unfilter: bool
1835
1836        Other Parameters
1837        ----------------
1838        kwargs
1839            The `kwargs` dict extends `opts`, that is, options can be
1840            specified either way, in `opts` or as individual keyword
1841            arguments.
1842
1843        """
1844
1845        if opts is None:
1846            opts = {}
1847        if hasattr(opts, '__dict__'):
1848            opts = opts.__dict__
1849        opts.update(kwargs)
1850
1851        qd = self._query_dict
1852        if opts.get('duration'):
1853            qd['tbs'] = 'qdr:%s' % opts['duration']
1854        if 'exact' in opts:
1855            if opts['exact']:
1856                qd['nfpr'] = 1
1857            else:
1858                qd.pop('nfpr', None)
1859        if opts.get('from') or opts.get('to'):
1860            cd_min = opts.get('from') or ''
1861            cd_max = opts.get('to') or ''
1862            qd['tbs'] = 'cdr:1,cd_min:%s,cd_max:%s' % (cd_min, cd_max)
1863        if 'keywords' in opts:
1864            self._keywords = opts['keywords']
1865        if 'lang' in opts and opts['lang']:
1866            qd['hl'] = opts['lang']
1867        if 'geoloc' in opts and opts['geoloc']:
1868            qd['gl'] = opts['geoloc']
1869        if 'news' in opts and opts['news']:
1870            qd['tbm'] = 'nws'
1871        elif 'videos' in opts and opts['videos']:
1872            qd['tbm'] = 'vid'
1873        else:
1874            qd.pop('tbm', None)
1875        if 'num' in opts:
1876            self._num = opts['num']
1877        if 'sites' in opts:
1878            self._sites = opts['sites']
1879        if 'exclude' in opts:
1880            self._exclude = opts['exclude']
1881        if 'start' in opts:
1882            self._start = opts['start']
1883        if 'tld' in opts:
1884            self._tld = opts['tld']
1885        if 'unfilter' in opts and opts['unfilter']:
1886            qd['filter'] = 0
1887
1888    def set_queries(self, **kwargs):
1889        """Forcefully set queries outside the normal `update` mechanism.
1890
1891        Other Parameters
1892        ----------------
1893        kwargs
1894            Arbitrary key value pairs to be set in the query string. All
1895            keys and values should be stringifiable.
1896
1897            Note that certain keys, e.g., ``q``, have their values
1898            constructed on the fly, so setting those has no actual
1899            effect.
1900
1901        """
1902        for k, v in kwargs.items():
1903            self._query_dict[k] = v
1904
1905    def unset_queries(self, *args):
1906        """Forcefully unset queries outside the normal `update` mechanism.
1907
1908        Other Parameters
1909        ----------------
1910        args
1911            Arbitrary keys to be unset. No exception is raised if a key
1912            does not exist in the first place.
1913
1914            Note that certain keys, e.g., ``q``, are always included in
1915            the resulting URL, so unsetting those has no actual effect.
1916
1917        """
1918        for k in args:
1919            self._query_dict.pop(k, None)
1920
1921    def next_page(self):
1922        """Navigate to the next page."""
1923        self._start += self._num
1924
1925    def prev_page(self):
1926        """Navigate to the previous page.
1927
1928        Raises
1929        ------
1930        ValueError
1931            If already at the first page (``start=0`` in the current
1932            query string).
1933
1934        """
1935        if self._start == 0:
1936            raise ValueError('Already at the first page.')
1937        self._start = (self._start - self._num) if self._start > self._num else 0
1938
1939    def first_page(self):
1940        """Navigate to the first page.
1941
1942        Raises
1943        ------
1944        ValueError
1945            If already at the first page (``start=0`` in the current
1946            query string).
1947
1948        """
1949        if self._start == 0:
1950            raise ValueError('Already at the first page.')
1951        self._start = 0
1952
1953    # Data source: https://web.archive.org/web/20170615200243/https://en.wikipedia.org/wiki/List_of_Google_domains
1954    # Scraper script: https://gist.github.com/zmwangx/b976e83c14552fe18b71
1955    TLD_TO_DOMAIN_MAP = {
1956        'ac': 'google.ac',      'ad': 'google.ad',      'ae': 'google.ae',
1957        'af': 'google.com.af',  'ag': 'google.com.ag',  'ai': 'google.com.ai',
1958        'al': 'google.al',      'am': 'google.am',      'ao': 'google.co.ao',
1959        'ar': 'google.com.ar',  'as': 'google.as',      'at': 'google.at',
1960        'au': 'google.com.au',  'az': 'google.az',      'ba': 'google.ba',
1961        'bd': 'google.com.bd',  'be': 'google.be',      'bf': 'google.bf',
1962        'bg': 'google.bg',      'bh': 'google.com.bh',  'bi': 'google.bi',
1963        'bj': 'google.bj',      'bn': 'google.com.bn',  'bo': 'google.com.bo',
1964        'br': 'google.com.br',  'bs': 'google.bs',      'bt': 'google.bt',
1965        'bw': 'google.co.bw',   'by': 'google.by',      'bz': 'google.com.bz',
1966        'ca': 'google.ca',      'cat': 'google.cat',    'cc': 'google.cc',
1967        'cd': 'google.cd',      'cf': 'google.cf',      'cg': 'google.cg',
1968        'ch': 'google.ch',      'ci': 'google.ci',      'ck': 'google.co.ck',
1969        'cl': 'google.cl',      'cm': 'google.cm',      'cn': 'google.cn',
1970        'co': 'google.com.co',  'cr': 'google.co.cr',   'cu': 'google.com.cu',
1971        'cv': 'google.cv',      'cy': 'google.com.cy',  'cz': 'google.cz',
1972        'de': 'google.de',      'dj': 'google.dj',      'dk': 'google.dk',
1973        'dm': 'google.dm',      'do': 'google.com.do',  'dz': 'google.dz',
1974        'ec': 'google.com.ec',  'ee': 'google.ee',      'eg': 'google.com.eg',
1975        'es': 'google.es',      'et': 'google.com.et',  'fi': 'google.fi',
1976        'fj': 'google.com.fj',  'fm': 'google.fm',      'fr': 'google.fr',
1977        'ga': 'google.ga',      'ge': 'google.ge',      'gf': 'google.gf',
1978        'gg': 'google.gg',      'gh': 'google.com.gh',  'gi': 'google.com.gi',
1979        'gl': 'google.gl',      'gm': 'google.gm',      'gp': 'google.gp',
1980        'gr': 'google.gr',      'gt': 'google.com.gt',  'gy': 'google.gy',
1981        'hk': 'google.com.hk',  'hn': 'google.hn',      'hr': 'google.hr',
1982        'ht': 'google.ht',      'hu': 'google.hu',      'id': 'google.co.id',
1983        'ie': 'google.ie',      'il': 'google.co.il',   'im': 'google.im',
1984        'in': 'google.co.in',   'io': 'google.io',      'iq': 'google.iq',
1985        'is': 'google.is',      'it': 'google.it',      'je': 'google.je',
1986        'jm': 'google.com.jm',  'jo': 'google.jo',      'jp': 'google.co.jp',
1987        'ke': 'google.co.ke',   'kg': 'google.kg',      'kh': 'google.com.kh',
1988        'ki': 'google.ki',      'kr': 'google.co.kr',   'kw': 'google.com.kw',
1989        'kz': 'google.kz',      'la': 'google.la',      'lb': 'google.com.lb',
1990        'lc': 'google.com.lc',  'li': 'google.li',      'lk': 'google.lk',
1991        'ls': 'google.co.ls',   'lt': 'google.lt',      'lu': 'google.lu',
1992        'lv': 'google.lv',      'ly': 'google.com.ly',  'ma': 'google.co.ma',
1993        'md': 'google.md',      'me': 'google.me',      'mg': 'google.mg',
1994        'mk': 'google.mk',      'ml': 'google.ml',      'mm': 'google.com.mm',
1995        'mn': 'google.mn',      'ms': 'google.ms',      'mt': 'google.com.mt',
1996        'mu': 'google.mu',      'mv': 'google.mv',      'mw': 'google.mw',
1997        'mx': 'google.com.mx',  'my': 'google.com.my',  'mz': 'google.co.mz',
1998        'na': 'google.com.na',  'ne': 'google.ne',      'nf': 'google.com.nf',
1999        'ng': 'google.com.ng',  'ni': 'google.com.ni',  'nl': 'google.nl',
2000        'no': 'google.no',      'np': 'google.com.np',  'nr': 'google.nr',
2001        'nu': 'google.nu',      'nz': 'google.co.nz',   'om': 'google.com.om',
2002        'pa': 'google.com.pa',  'pe': 'google.com.pe',  'pg': 'google.com.pg',
2003        'ph': 'google.com.ph',  'pk': 'google.com.pk',  'pl': 'google.pl',
2004        'pn': 'google.co.pn',   'pr': 'google.com.pr',  'ps': 'google.ps',
2005        'pt': 'google.pt',      'py': 'google.com.py',  'qa': 'google.com.qa',
2006        'ro': 'google.ro',      'rs': 'google.rs',      'ru': 'google.ru',
2007        'rw': 'google.rw',      'sa': 'google.com.sa',  'sb': 'google.com.sb',
2008        'sc': 'google.sc',      'se': 'google.se',      'sg': 'google.com.sg',
2009        'sh': 'google.sh',      'si': 'google.si',      'sk': 'google.sk',
2010        'sl': 'google.com.sl',  'sm': 'google.sm',      'sn': 'google.sn',
2011        'so': 'google.so',      'sr': 'google.sr',      'st': 'google.st',
2012        'sv': 'google.com.sv',  'td': 'google.td',      'tg': 'google.tg',
2013        'th': 'google.co.th',   'tj': 'google.com.tj',  'tk': 'google.tk',
2014        'tl': 'google.tl',      'tm': 'google.tm',      'tn': 'google.tn',
2015        'to': 'google.to',      'tr': 'google.com.tr',  'tt': 'google.tt',
2016        'tw': 'google.com.tw',  'tz': 'google.co.tz',   'ua': 'google.com.ua',
2017        'ug': 'google.co.ug',   'uk': 'google.co.uk',   'uy': 'google.com.uy',
2018        'uz': 'google.co.uz',   'vc': 'google.com.vc',  've': 'google.co.ve',
2019        'vg': 'google.vg',      'vi': 'google.co.vi',   'vn': 'google.com.vn',
2020        'vu': 'google.vu',      'ws': 'google.ws',      'za': 'google.co.za',
2021        'zm': 'google.co.zm',   'zw': 'google.co.zw',
2022    }
2023
2024    @property
2025    def netloc(self):
2026        """The hostname."""
2027        try:
2028            return 'www.' + self.TLD_TO_DOMAIN_MAP[self._tld]
2029        except KeyError:
2030            return 'www.google.com'
2031
2032    @property
2033    def query(self):
2034        """The query string."""
2035        qd = {}
2036        qd.update(self._query_dict)
2037        if self._num != 10:  # Skip sending the default
2038            qd['num'] = self._num
2039        if self._start:  # Skip sending the default
2040            qd['start'] = self._start
2041
2042        # Construct the q query
2043        q = ''
2044        keywords = self._keywords
2045        sites = self._sites
2046        exclude = self._exclude
2047        if keywords:
2048            if isinstance(keywords, list):
2049                q += '+'.join(urllib.parse.quote_plus(kw) for kw in keywords)
2050            else:
2051                q += urllib.parse.quote_plus(keywords)
2052        if sites:
2053            q += '+OR'.join('+site:' + urllib.parse.quote_plus(site) for site in sites)
2054        if exclude:
2055            q += ''.join('+-site:' + urllib.parse.quote_plus(e) for e in exclude)
2056        qd['q'] = q
2057        return '&'.join('%s=%s' % (k, qd[k]) for k in sorted(qd.keys()))
2058
2059
2060class GoogleConnectionError(Exception):
2061    pass
2062
2063
2064class GoogleConnection(object):
2065    """
2066    This class facilitates connecting to and fetching from Google.
2067
2068    Parameters
2069    ----------
2070    See http.client.HTTPSConnection for documentation of the
2071    parameters.
2072
2073    Raises
2074    ------
2075    GoogleConnectionError
2076
2077    Attributes
2078    ----------
2079    host : str
2080        The currently connected host. Read-only property. Use
2081        `new_connection` to change host.
2082
2083    Methods
2084    -------
2085    new_connection(host=None, port=None, timeout=45)
2086    renew_connection(timeout=45)
2087    fetch_page(url)
2088    close()
2089
2090    """
2091
2092    def __init__(self, host, port=None, address_family=0, timeout=45, proxy=None, notweak=False):
2093        self._host = None
2094        self._port = None
2095        self._address_family = address_family
2096        self._proxy = proxy
2097        self._notweak = notweak
2098        self._conn = None
2099        self.new_connection(host, port=port, timeout=timeout)
2100        self.cookie = ''
2101
2102    @property
2103    def host(self):
2104        """The host currently connected to."""
2105        return self._host
2106
2107    @time_it()
2108    def new_connection(self, host=None, port=None, timeout=45):
2109        """Close the current connection (if any) and establish a new one.
2110
2111        Parameters
2112        ----------
2113        See http.client.HTTPSConnection for documentation of the
2114        parameters. Renew the connection (i.e., reuse the current host
2115        and port) if host is None or empty.
2116
2117        Raises
2118        ------
2119        GoogleConnectionError
2120
2121        """
2122        if self._conn:
2123            self._conn.close()
2124
2125        if not host:
2126            host = self._host
2127            port = self._port
2128        self._host = host
2129        self._port = port
2130        host_display = host + (':%d' % port if port else '')
2131
2132        proxy = self._proxy
2133
2134        if proxy:
2135            proxy_user_passwd, proxy_host_port = parse_proxy_spec(proxy)
2136
2137            logger.debug('Connecting to proxy server %s', proxy_host_port)
2138            self._conn = HardenedHTTPSConnection(proxy_host_port,
2139                                                 address_family=self._address_family, timeout=timeout)
2140
2141            logger.debug('Tunnelling to host %s' % host_display)
2142            connect_headers = {}
2143            if proxy_user_passwd:
2144                connect_headers['Proxy-Authorization'] = 'Basic %s' % base64.b64encode(
2145                    proxy_user_passwd.encode('utf-8')
2146                ).decode('utf-8')
2147            self._conn.set_tunnel(host, port=port, headers=connect_headers)
2148
2149            try:
2150                self._conn.connect(self._notweak)
2151            except Exception as e:
2152                msg = 'Failed to connect to proxy server %s: %s.' % (proxy, e)
2153                raise GoogleConnectionError(msg)
2154        else:
2155            logger.debug('Connecting to new host %s', host_display)
2156            self._conn = HardenedHTTPSConnection(host, port=port,
2157                                                 address_family=self._address_family, timeout=timeout)
2158            try:
2159                self._conn.connect(self._notweak)
2160            except Exception as e:
2161                msg = 'Failed to connect to %s: %s.' % (host_display, e)
2162                raise GoogleConnectionError(msg)
2163
2164    def renew_connection(self, timeout=45):
2165        """Renew current connection.
2166
2167        Equivalent to ``new_connection(timeout=timeout)``.
2168
2169        """
2170        self.new_connection(timeout=timeout)
2171
2172    @time_it()
2173    def fetch_page(self, url):
2174        """Fetch a URL.
2175
2176        Allows one reconnection and multiple redirections before failing
2177        and raising GoogleConnectionError.
2178
2179        Parameters
2180        ----------
2181        url : str
2182            The URL to fetch, relative to the host.
2183
2184        Raises
2185        ------
2186        GoogleConnectionError
2187            When not getting HTTP 200 even after the allowed one
2188            reconnection and/or one redirection, or when Google is
2189            blocking query due to unusual activity.
2190
2191        Returns
2192        -------
2193        str
2194            Response payload, gunzipped (if applicable) and decoded (in UTF-8).
2195
2196        """
2197        try:
2198            self._raw_get(url)
2199        except (http.client.HTTPException, OSError) as e:
2200            logger.debug('Got exception: %s.', e)
2201            logger.debug('Attempting to reconnect...')
2202            self.renew_connection()
2203            try:
2204                self._raw_get(url)
2205            except http.client.HTTPException as e:
2206                logger.debug('Got exception: %s.', e)
2207                raise GoogleConnectionError("Failed to get '%s'." % url)
2208
2209        resp = self._resp
2210        redirect_counter = 0
2211        while resp.status != 200 and redirect_counter < 3:
2212            if resp.status in {301, 302, 303, 307, 308}:
2213                redirection_url = resp.getheader('location', '')
2214                if 'sorry/IndexRedirect?' in redirection_url or 'sorry/index?' in redirection_url:
2215                    msg = "Connection blocked due to unusual activity.\n"
2216                    if self._conn.address_family == socket.AF_INET6:
2217                        msg += textwrap.dedent("""\
2218                        You are connecting over IPv6 which is likely the problem. Try to make
2219                        sure the machine has a working IPv4 network interface configured.
2220                        See also the -4, --ipv4 option of googler.\n""")
2221                    msg += textwrap.dedent("""\
2222                    THIS IS NOT A BUG, please do NOT report it as a bug unless you have specific
2223                    information that may lead to the development of a workaround.
2224                    You IP address is temporarily or permanently blocked by Google and requires
2225                    reCAPTCHA-solving to use the service, which googler is not capable of.
2226                    Possible causes include issuing too many queries in a short time frame, or
2227                    operating from a shared / low reputation IP with a history of abuse.
2228                    Please do NOT use googler for automated scraping.""")
2229                    msg = " ".join(msg.splitlines())
2230                    raise GoogleConnectionError(msg)
2231                self._redirect(redirection_url)
2232                resp = self._resp
2233                redirect_counter += 1
2234            else:
2235                break
2236
2237        if resp.status != 200:
2238            raise GoogleConnectionError('Got HTTP %d: %s' % (resp.status, resp.reason))
2239
2240        payload = resp.read()
2241        try:
2242            return gzip.decompress(payload).decode('utf-8')
2243        except OSError:
2244            # Not gzipped
2245            return payload.decode('utf-8')
2246
2247    def _redirect(self, url):
2248        """Redirect to and fetch a new URL.
2249
2250        Like `_raw_get`, the response is stored in ``self._resp``. A new
2251        connection is made if redirecting to a different host.
2252
2253        Parameters
2254        ----------
2255        url : str
2256            If absolute and points to a different host, make a new
2257            connection.
2258
2259        Raises
2260        ------
2261        GoogleConnectionError
2262
2263        """
2264        logger.debug('Redirecting to URL %s', url)
2265        segments = urllib.parse.urlparse(url)
2266
2267        host = segments.netloc
2268        if host != self._host:
2269            self.new_connection(host)
2270
2271        relurl = urllib.parse.urlunparse(('', '') + segments[2:])
2272        try:
2273            self._raw_get(relurl)
2274        except http.client.HTTPException as e:
2275            logger.debug('Got exception: %s.', e)
2276            raise GoogleConnectionError("Failed to get '%s'." % url)
2277
2278    def _raw_get(self, url):
2279        """Make a raw HTTP GET request.
2280
2281        No status check (which implies no redirection). Response can be
2282        accessed from ``self._resp``.
2283
2284        Parameters
2285        ----------
2286        url : str
2287            URL relative to the host, used in the GET request.
2288
2289        Raises
2290        ------
2291        http.client.HTTPException
2292
2293        """
2294        logger.debug('Fetching URL %s', url)
2295        self._conn.request('GET', url, None, {
2296            'Accept': 'text/html',
2297            'Accept-Encoding': 'gzip',
2298            'User-Agent': USER_AGENT,
2299            'Cookie': self.cookie,
2300            'Connection': 'keep-alive',
2301            'DNT': '1',
2302        })
2303        self._resp = self._conn.getresponse()
2304        if self.cookie == '':
2305            complete_cookie = self._resp.getheader('Set-Cookie')
2306            # Cookie won't be available if already blocked
2307            if complete_cookie is not None:
2308                self.cookie = complete_cookie[:complete_cookie.find(';')]
2309                logger.debug('Cookie: %s' % self.cookie)
2310
2311    def close(self):
2312        """Close the connection (if one is active)."""
2313        if self._conn:
2314            self._conn.close()
2315
2316
2317class GoogleParser(object):
2318
2319    def __init__(self, html, *, news=False, videos=False):
2320        self.news = news
2321        self.videos = videos
2322        self.autocorrected = False
2323        self.showing_results_for = None
2324        self.filtered = False
2325        self.results = []
2326        self.parse(html)
2327
2328    @time_it()
2329    def parse(self, html):
2330        tree = parse_html(html)
2331
2332        if debugger:
2333            printerr('\x1b[1mInspect the DOM through the \x1b[4mtree\x1b[24m variable.\x1b[0m')
2334            printerr('')
2335            try:
2336                import IPython
2337                IPython.embed()
2338            except ImportError:
2339                import pdb
2340                pdb.set_trace()
2341
2342        # cw is short for collapse_whitespace.
2343        cw = lambda s: re.sub(r'[ \t\n\r]+', ' ', s) if s is not None else s
2344
2345        index = 0
2346        for div_g in tree.select_all('div.g'):
2347            if div_g.select('.hp-xpdbox'):
2348                # Skip smart cards.
2349                continue
2350            try:
2351                if div_g.select('.st'):
2352                    # Old class structure, stopped working some time in
2353                    # September 2020, but kept just in case.
2354                    h3 = div_g.select('div.r h3')
2355                    if h3:
2356                        title = h3.text
2357                        a = h3.parent
2358                    else:
2359                        h3 = div_g.select('h3.r')
2360                        a = h3.select('a')
2361                        title = a.text
2362                        mime = div_g.select('.mime')
2363                        if mime:
2364                            title = mime.text + ' ' + title
2365                    abstract_node = div_g.select('.st')
2366                    metadata_node = div_g.select('.f')
2367                else:
2368                    # Current structure as of October 2020.
2369                    # Note that a filetype tag (e.g. PDF) is now pretty
2370                    # damn hard to parse with confidence (that it'll
2371                    # survive the slighest further change), so we don't.
2372
2373                    # As of January 15th 2021, the html class is not rc anymore, it's tF2Cxc.
2374                    # This approach is not very resilient to changes by Google, but it works for now.
2375                    # title_node, details_node, *_ = div_g.select_all('div.rc > div')
2376                    title_node, details_node, *_ = div_g.select_all('div.tF2Cxc > div')
2377                    if 'yuRUbf' not in title_node.classes:
2378                        logger.debug('unexpected title node class(es): expected %r, got %r',
2379                                     'yuRUbf', ' '.join(title_node.classes))
2380                    if 'IsZvec' not in details_node.classes:
2381                        logger.debug('unexpected details node class(es): expected %r, got %r',
2382                                     'IsZvec', ' '.join(details_node.classes))
2383                    a = title_node.select('a')
2384                    h3 = a.select('h3')
2385                    title = h3.text
2386                    abstract_node = details_node.select('span')
2387                    metadata_node = details_node.select('.f, span ~ div')
2388                url = self.unwrap_link(a.attr('href'))
2389                matched_keywords = []
2390                abstract = ''
2391                # BFS descendant nodes. Necessary to locate matches (b,
2392                # em) while skipping metadata (.f).
2393                abstract_nodes = collections.deque([abstract_node])
2394                while abstract_nodes:
2395                    node = abstract_nodes.popleft()
2396                    if 'f' in node.classes:
2397                        # .f is handled as metadata instead.
2398                        continue
2399                    if node.tag in ['b', 'em']:
2400                        matched_keywords.append({'phrase': node.text, 'offset': len(abstract)})
2401                        abstract += node.text
2402                        continue
2403                    if not node.children:
2404                        abstract += node.text
2405                        continue
2406                    for child in node.children:
2407                        abstract_nodes.append(child)
2408                metadata = None
2409                try:
2410                    # Sometimes there are multiple metadata fields
2411                    # associated with a single entry, e.g. "Released",
2412                    # "Producer(s)", "Genre", etc. for a song (sample
2413                    # query: "never gonna give you up"). These need to
2414                    # be delimited when displayed.
2415                    metadata_fields = metadata_node.select_all('div > div.wFMWsc')
2416                    if metadata_fields:
2417                        metadata = ' | '.join(field.text for field in metadata_fields)
2418                    elif not metadata_node.select('a') and not metadata_node.select('g-expandable-container'):
2419                        metadata = metadata_node.text
2420                    if metadata:
2421                        metadata = (
2422                            metadata
2423                            .replace('\u200e', '')
2424                            .replace(' - ', ', ')
2425                            .replace(' \u2014 ', ', ')
2426                            .strip().rstrip(',')
2427                        )
2428                except AttributeError:
2429                    pass
2430            except (AttributeError, ValueError):
2431                continue
2432            sitelinks = []
2433            for td in div_g.select_all('td'):
2434                try:
2435                    a = td.select('a')
2436                    sl_title = a.text
2437                    sl_url = self.unwrap_link(a.attr('href'))
2438                    sl_abstract = td.select('div.s.st, div.s .st').text
2439                    sitelink = Sitelink(cw(sl_title), sl_url, cw(sl_abstract))
2440                    if sitelink not in sitelinks:
2441                        sitelinks.append(sitelink)
2442                except (AttributeError, ValueError):
2443                    continue
2444            # cw cannot be applied to abstract here since it may screw
2445            # up offsets of matches. Instead, each relevant node's text
2446            # is whitespace-collapsed before being appended to abstract.
2447            # We then hope for the best.
2448            result = Result(index + 1, cw(title), url, abstract,
2449                            metadata=cw(metadata), sitelinks=sitelinks, matches=matched_keywords)
2450            if result not in self.results:
2451                self.results.append(result)
2452                index += 1
2453
2454        if not self.results:
2455            for card in tree.select_all('g-card'):
2456                a = card.select('a[href]')
2457                if not a:
2458                    continue
2459                url = self.unwrap_link(a.attr('href'))
2460                text_nodes = []
2461                for node in a.descendants():
2462                    if isinstance(node, TextNode) and node.strip():
2463                        text_nodes.append(node.text)
2464                if len(text_nodes) != 4:
2465                    continue
2466                publisher, title, abstract, publishing_time = text_nodes
2467                metadata = '%s, %s' % (publisher, publishing_time)
2468                index += 1
2469                self.results.append(Result(index, cw(title), url, cw(abstract), metadata=cw(metadata)))
2470
2471        # Showing results for ...
2472        # Search instead for ...
2473        spell_orig = tree.select("span.spell_orig")
2474        if spell_orig:
2475            showing_results_for_link = next(
2476                filter(lambda el: el.tag == "a", spell_orig.previous_siblings()), None
2477            )
2478            if showing_results_for_link:
2479                self.autocorrected = True
2480                self.showing_results_for = showing_results_for_link.text
2481
2482        # No results found for ...
2483        # Results for ...:
2484        alt_query_infobox = tree.select('#topstuff')
2485        if alt_query_infobox:
2486            bolds = alt_query_infobox.select_all('div b')
2487            if len(bolds) == 2:
2488                self.showing_results_for = bolds[1].text
2489
2490        # In order to show you the most relevant results, we have
2491        # omitted some entries very similar to the N already displayed.
2492        # ...
2493        self.filtered = tree.select('p#ofr') is not None
2494
2495    # Unwraps /url?q=http://...&sa=...
2496    # TODO: don't unwrap if URL isn't in this form.
2497    @staticmethod
2498    def unwrap_link(link):
2499        qs = urllib.parse.urlparse(link).query
2500        try:
2501            url = urllib.parse.parse_qs(qs)['q'][0]
2502        except KeyError:
2503            return link
2504        else:
2505            if "://" in url:
2506                return url
2507            else:
2508                # Google's internal services link, e.g.,
2509                # /search?q=google&..., which cannot be unwrapped into
2510                # an actual URL.
2511                raise ValueError(link)
2512
2513
2514class Sitelink(object):
2515    """Container for a sitelink."""
2516
2517    def __init__(self, title, url, abstract):
2518        self.title = title
2519        self.url = url
2520        self.abstract = abstract
2521        self.index = ''
2522
2523    def __eq__(self, other):
2524        return (
2525            self.title == other.title and
2526            self.url == other.url and
2527            self.abstract == other.abstract
2528        )
2529
2530    def __hash__(self):
2531        return hash((self.title, self.url, self.abstract))
2532
2533
2534Colors = collections.namedtuple('Colors', 'index, title, url, metadata, abstract, prompt, reset')
2535
2536
2537class Result(object):
2538    """
2539    Container for one search result, with output helpers.
2540
2541    Parameters
2542    ----------
2543    index : int or str
2544    title : str
2545    url : str
2546    abstract : str
2547    metadata : str, optional
2548        Only applicable to Google News results, with publisher name and
2549        publishing time.
2550    sitelinks : list, optional
2551        List of ``SiteLink`` objects.
2552
2553    Attributes
2554    ----------
2555    index : str
2556    title : str
2557    url : str
2558    abstract : str
2559    metadata : str or None
2560    sitelinks : list
2561    matches : list
2562
2563    Class Variables
2564    ---------------
2565    colors : str
2566
2567    Methods
2568    -------
2569    print()
2570    jsonizable_object()
2571    urltable()
2572
2573    """
2574
2575    # Class variables
2576    colors = None
2577    urlexpand = True
2578
2579    def __init__(self, index, title, url, abstract, metadata=None, sitelinks=None, matches=None):
2580        index = str(index)
2581        self.index = index
2582        self.title = title
2583        self.url = url
2584        self.abstract = abstract
2585        self.metadata = metadata
2586        self.sitelinks = [] if sitelinks is None else sitelinks
2587        self.matches = [] if matches is None else matches
2588
2589        self._urltable = {index: url}
2590        subindex = 'a'
2591        for sitelink in self.sitelinks:
2592            fullindex = index + subindex
2593            sitelink.index = fullindex
2594            self._urltable[fullindex] = sitelink.url
2595            subindex = chr(ord(subindex) + 1)
2596
2597    def __eq__(self, other):
2598        return (
2599            self.title == other.title and
2600            self.url == other.url and
2601            self.abstract == other.abstract and
2602            self.metadata == other.metadata and
2603            self.sitelinks == other.sitelinks and
2604            self.matches == other.matches
2605        )
2606
2607    def __hash__(self):
2608        sitelinks_hashable = tuple(self.sitelinks) if self.sitelinks is not None else None
2609        matches_hashable = tuple(self.matches) if self.matches is not None else None
2610        return hash(self.title, self.url, self.abstract, self.metadata, sitelinks_hashable, matches_hashable)
2611
2612    def _print_title_and_url(self, index, title, url, indent=0):
2613        colors = self.colors
2614
2615        if not self.urlexpand:
2616            url = '[' + urllib.parse.urlparse(url).netloc + ']'
2617
2618        if colors:
2619            # Adjust index to print result index clearly
2620            print(" %s%s%-3s%s" % (' ' * indent, colors.index, index + '.', colors.reset), end='')
2621            if not self.urlexpand:
2622                print(' ' + colors.title + title + colors.reset + ' ' + colors.url + url + colors.reset)
2623            else:
2624                print(' ' + colors.title + title + colors.reset)
2625                print(' ' * (indent + 5) + colors.url + url + colors.reset)
2626        else:
2627            if self.urlexpand:
2628                print(' %s%-3s %s' % (' ' * indent, index + '.', title))
2629                print(' %s%s' % (' ' * (indent + 4), url))
2630            else:
2631                print(' %s%-3s %s %s' % (' ' * indent, index + '.', title, url))
2632
2633    def _print_metadata_and_abstract(self, abstract, metadata=None, matches=None, indent=0):
2634        colors = self.colors
2635        try:
2636            columns, _ = os.get_terminal_size()
2637        except OSError:
2638            columns = 0
2639
2640        if metadata:
2641            if colors:
2642                print(' ' * (indent + 5) + colors.metadata + metadata + colors.reset)
2643            else:
2644                print(' ' * (indent + 5) + metadata)
2645
2646        if abstract:
2647            fillwidth = (columns - (indent + 6)) if columns > indent + 6 else len(abstract)
2648            wrapped_abstract = TrackedTextwrap(abstract, fillwidth)
2649            if colors:
2650                # Highlight matches.
2651                for match in matches or []:
2652                    offset = match['offset']
2653                    span = len(match['phrase'])
2654                    wrapped_abstract.insert_zero_width_sequence('\x1b[1m', offset)
2655                    wrapped_abstract.insert_zero_width_sequence('\x1b[0m', offset + span)
2656
2657            if colors:
2658                print(colors.abstract, end='')
2659            for line in wrapped_abstract.lines:
2660                print('%s%s' % (' ' * (indent + 5), line))
2661            if colors:
2662                print(colors.reset, end='')
2663
2664        print('')
2665
2666    def print(self):
2667        """Print the result entry."""
2668        self._print_title_and_url(self.index, self.title, self.url)
2669        self._print_metadata_and_abstract(self.abstract, metadata=self.metadata, matches=self.matches)
2670
2671        for sitelink in self.sitelinks:
2672            self._print_title_and_url(sitelink.index, sitelink.title, sitelink.url, indent=4)
2673            self._print_metadata_and_abstract(sitelink.abstract, indent=4)
2674
2675    def jsonizable_object(self):
2676        """Return a JSON-serializable dict representing the result entry."""
2677        obj = {
2678            'title': self.title,
2679            'url': self.url,
2680            'abstract': self.abstract
2681        }
2682        if self.metadata:
2683            obj['metadata'] = self.metadata
2684        if self.sitelinks:
2685            obj['sitelinks'] = [sitelink.__dict__ for sitelink in self.sitelinks]
2686        if self.matches:
2687            obj['matches'] = self.matches
2688        return obj
2689
2690    def urltable(self):
2691        """Return a index-to-URL table for the current result.
2692
2693        Normally, the table contains only a single entry, but when the result
2694        contains sitelinks, all sitelinks are included in this table.
2695
2696        Returns
2697        -------
2698        dict
2699            A dict mapping indices (strs) to URLs (also strs). Indices of
2700            sitelinks are the original index appended by lowercase letters a,
2701            b, c, etc.
2702
2703        """
2704        return self._urltable
2705
2706    @staticmethod
2707    def collapse_whitespace(s):
2708        return re.sub(r'[ \t\n\r]+', ' ', s)
2709
2710
2711class GooglerCmdException(Exception):
2712    pass
2713
2714
2715class NoKeywordsException(GooglerCmdException):
2716    pass
2717
2718
2719def require_keywords(method):
2720    # Require keywords to be set before we run a GooglerCmd method. If
2721    # no keywords have been set, raise a NoKeywordsException.
2722    @functools.wraps(method)
2723    def enforced_method(self, *args, **kwargs):
2724        if not self.keywords:
2725            raise NoKeywordsException('No keywords.')
2726        method(self, *args, **kwargs)
2727
2728    return enforced_method
2729
2730
2731def no_argument(method):
2732    # Normalize a do_* method of GooglerCmd that takes no argument to
2733    # one that takes an arg, but issue a warning when an nonempty
2734    # argument is given.
2735    @functools.wraps(method)
2736    def enforced_method(self, arg):
2737        if arg:
2738            method_name = arg.__name__
2739            command_name = method_name[3:] if method_name.startswith('do_') else method_name
2740            logger.warning("Argument to the '%s' command ignored.", command_name)
2741        method(self)
2742
2743    return enforced_method
2744
2745
2746class GooglerCmd(object):
2747    """
2748    Command line interpreter and executor class for googler.
2749
2750    Inspired by PSL cmd.Cmd.
2751
2752    Parameters
2753    ----------
2754    opts : argparse.Namespace
2755        Options and/or arguments.
2756
2757    Attributes
2758    ----------
2759    options : argparse.Namespace
2760        Options that are currently in effect. Read-only attribute.
2761    keywords : str or list or strs
2762        Current keywords. Read-only attribute
2763
2764    Methods
2765    -------
2766    fetch()
2767    display_results(prelude='\n', json_output=False)
2768    fetch_and_display(prelude='\n', json_output=False, interactive=True)
2769    read_next_command()
2770    help()
2771    cmdloop()
2772    """
2773
2774    # Class variables
2775    colors = None
2776    re_url_index = re.compile(r"\d+(a-z)?")
2777
2778    def __init__(self, opts):
2779        super().__init__()
2780
2781        self._opts = opts
2782
2783        self._google_url = GoogleUrl(opts)
2784
2785        if opts.html_file:
2786            # Preloaded HTML parsing mode, do not initialize connection.
2787            self._preload_from_file = opts.html_file
2788            self._conn = None
2789        else:
2790            self._preload_from_file = None
2791            proxy = opts.proxy if hasattr(opts, 'proxy') else None
2792            self._conn = GoogleConnection(self._google_url.hostname,
2793                                        address_family=opts.address_family,
2794                                        proxy=proxy,
2795                                        notweak=opts.notweak)
2796            atexit.register(self._conn.close)
2797
2798        self.results = []
2799        self._autocorrected = None
2800        self._showing_results_for = None
2801        self._results_filtered = False
2802        self._urltable = {}
2803
2804        self.promptcolor = True if os.getenv('DISABLE_PROMPT_COLOR') is None else False
2805
2806        self.no_results_instructions_shown = False
2807
2808    @property
2809    def options(self):
2810        """Current options."""
2811        return self._opts
2812
2813    @property
2814    def keywords(self):
2815        """Current keywords."""
2816        return self._google_url.keywords
2817
2818    @require_keywords
2819    def fetch(self):
2820        """Fetch a page and parse for results.
2821
2822        Results are stored in ``self.results``.
2823
2824        Raises
2825        ------
2826        GoogleConnectionError
2827
2828        See Also
2829        --------
2830        fetch_and_display
2831
2832        """
2833        # This method also sets self._results_filtered and
2834        # self._urltable.
2835        if self._preload_from_file:
2836            with open(self._preload_from_file, encoding='utf-8') as fp:
2837                page = fp.read()
2838        else:
2839            page = self._conn.fetch_page(self._google_url.relative())
2840            if logger.isEnabledFor(logging.DEBUG):
2841                import tempfile
2842                fd, tmpfile = tempfile.mkstemp(prefix='googler-response-', suffix='.html')
2843                os.close(fd)
2844                with open(tmpfile, 'w', encoding='utf-8') as fp:
2845                    fp.write(page)
2846                logger.debug("Response body written to '%s'.", tmpfile)
2847
2848        parser = GoogleParser(page, news=self._google_url.news, videos=self._google_url.videos)
2849
2850        self.results = parser.results
2851        self._autocorrected = parser.autocorrected
2852        self._showing_results_for = parser.showing_results_for
2853        self._results_filtered = parser.filtered
2854        self._urltable = {}
2855        for r in self.results:
2856            self._urltable.update(r.urltable())
2857
2858    def warn_no_results(self):
2859        printerr('No results.')
2860        if self.no_results_instructions_shown:
2861            return
2862
2863        try:
2864            import json
2865            import urllib.error
2866            import urllib.request
2867            info_json_url = '%s/master/info.json' % RAW_DOWNLOAD_REPO_BASE
2868            logger.debug('Fetching %s for project status...', info_json_url)
2869            try:
2870                with urllib.request.urlopen(info_json_url, timeout=5) as response:
2871                    try:
2872                        info = json.load(response)
2873                    except Exception:
2874                        logger.error('Failed to decode project status from %s', info_json_url)
2875                        raise RuntimeError
2876            except urllib.error.HTTPError as e:
2877                logger.error('Failed to fetch project status from %s: HTTP %d', info_json_url, e.code)
2878                raise RuntimeError
2879            epoch = info.get('epoch')
2880            if epoch > _EPOCH_:
2881                printerr('Your version of googler is broken due to Google-side changes.')
2882                tracking_issue = info.get('tracking_issue')
2883                fixed_on_master = info.get('fixed_on_master')
2884                fixed_in_release = info.get('fixed_in_release')
2885                if fixed_in_release:
2886                    printerr('A new version, %s, has been released to address the changes.' % fixed_in_release)
2887                    printerr('Please upgrade to the latest version.')
2888                elif fixed_on_master:
2889                    printerr('The fix has been pushed to master, pending a release.')
2890                    printerr('Please download the master version https://git.io/googler or wait for a release.')
2891                else:
2892                    printerr('The issue is tracked at https://github.com/jarun/googler/issues/%s.' % tracking_issue)
2893                return
2894        except RuntimeError:
2895            pass
2896
2897        printerr('If you believe this is a bug, please review '
2898                 'https://git.io/googler-no-results before submitting a bug report.')
2899        self.no_results_instructions_shown = True
2900
2901    @require_keywords
2902    def display_results(self, prelude='\n', json_output=False):
2903        """Display results stored in ``self.results``.
2904
2905        Parameters
2906        ----------
2907        See `fetch_and_display`.
2908
2909        """
2910        if json_output:
2911            # JSON output
2912            import json
2913            results_object = [r.jsonizable_object() for r in self.results]
2914            print(json.dumps(results_object, indent=2, sort_keys=True, ensure_ascii=False))
2915        else:
2916            # Regular output
2917            if not self.results:
2918                self.warn_no_results()
2919            else:
2920                sys.stderr.write(prelude)
2921                for r in self.results:
2922                    r.print()
2923
2924    @require_keywords
2925    def showing_results_for_alert(self, interactive=True):
2926        colors = self.colors
2927        if self._showing_results_for:
2928            if colors:
2929                # Underline the query
2930                actual_query = '\x1b[4m' + self._showing_results_for + '\x1b[24m'
2931            else:
2932                actual_query = self._showing_results_for
2933            if self._autocorrected:
2934                if interactive:
2935                    info = 'Showing results for %s; enter "x" for an exact search.' % actual_query
2936                else:
2937                    info = 'Showing results for %s; use -x, --exact for an exact search.' % actual_query
2938            else:
2939                info = 'No results found; showing results for %s.' % actual_query
2940            if interactive:
2941                printerr('')
2942            if colors:
2943                printerr(colors.prompt + info + colors.reset)
2944            else:
2945                printerr('** ' + info)
2946
2947    @require_keywords
2948    def fetch_and_display(self, prelude='\n', json_output=False, interactive=True):
2949        """Fetch a page and display results.
2950
2951        Results are stored in ``self.results``.
2952
2953        Parameters
2954        ----------
2955        prelude : str, optional
2956            A string that is written to stderr before showing actual results,
2957            usually serving as a separator. Default is an empty line.
2958        json_output : bool, optional
2959            Whether to dump results in JSON format. Default is False.
2960        interactive : bool, optional
2961            Whether to show contextual instructions, when e.g. Google
2962            has filtered the results. Default is True.
2963
2964        Raises
2965        ------
2966        GoogleConnectionError
2967
2968        See Also
2969        --------
2970        fetch
2971        display_results
2972
2973        """
2974        self.fetch()
2975        self.showing_results_for_alert()
2976        self.display_results(prelude=prelude, json_output=json_output)
2977        if self._results_filtered:
2978            colors = self.colors
2979            info = 'Enter "unfilter" to show similar results Google omitted.'
2980            if colors:
2981                printerr(colors.prompt + info + colors.reset)
2982            else:
2983                printerr('** ' + info)
2984            printerr('')
2985
2986    def read_next_command(self):
2987        """Show omniprompt and read user command line.
2988
2989        Command line is always stripped, and each consecutive group of
2990        whitespace is replaced with a single space character. If the
2991        command line is empty after stripping, when ignore it and keep
2992        reading. Exit with status 0 if we get EOF or an empty line
2993        (pre-strip, that is, a raw <enter>) twice in a row.
2994
2995        The new command line (non-empty) is stored in ``self.cmd``.
2996
2997        """
2998        colors = self.colors
2999        message = 'googler (? for help)'
3000        prompt = (colors.prompt + message + colors.reset + ' ') if (colors and self.promptcolor) else (message + ': ')
3001        enter_count = 0
3002        while True:
3003            try:
3004                cmd = input(prompt)
3005            except EOFError:
3006                sys.exit(0)
3007
3008            if not cmd:
3009                enter_count += 1
3010                if enter_count == 2:
3011                    # Double <enter>
3012                    sys.exit(0)
3013            else:
3014                enter_count = 0
3015
3016            cmd = ' '.join(cmd.split())
3017            if cmd:
3018                self.cmd = cmd
3019                break
3020
3021    @staticmethod
3022    def help():
3023        GooglerArgumentParser.print_omniprompt_help(sys.stderr)
3024        printerr('')
3025
3026    @require_keywords
3027    @no_argument
3028    def do_first(self):
3029        try:
3030            self._google_url.first_page()
3031        except ValueError as e:
3032            print(e, file=sys.stderr)
3033            return
3034
3035        self.fetch_and_display()
3036
3037    def do_google(self, arg):
3038        # Update keywords and reconstruct URL
3039        self._opts.keywords = arg
3040        self._google_url = GoogleUrl(self._opts)
3041        self.fetch_and_display()
3042
3043    @require_keywords
3044    @no_argument
3045    def do_next(self):
3046        # If > 5 results are being fetched each time,
3047        # block next when no parsed results in current fetch
3048        if not self.results and self._google_url._num > 5:
3049            printerr('No results.')
3050        else:
3051            self._google_url.next_page()
3052            self.fetch_and_display()
3053
3054    @require_keywords
3055    def do_open(self, *args):
3056        if not args:
3057            open_url(self._google_url.full())
3058            return
3059
3060        for nav in args:
3061            if nav == 'a':
3062                for key, value in sorted(self._urltable.items()):
3063                    open_url(self._urltable[key])
3064            elif nav in self._urltable:
3065                open_url(self._urltable[nav])
3066            elif '-' in nav:
3067                try:
3068                    vals = [int(x) for x in nav.split('-')]
3069                    if (len(vals) != 2):
3070                        printerr('Invalid range %s.' % nav)
3071                        continue
3072
3073                    if vals[0] > vals[1]:
3074                        vals[0], vals[1] = vals[1], vals[0]
3075
3076                    for _id in range(vals[0], vals[1] + 1):
3077                        if str(_id) in self._urltable:
3078                            open_url(self._urltable[str(_id)])
3079                        else:
3080                            printerr('Invalid index %s.' % _id)
3081                except ValueError:
3082                    printerr('Invalid range %s.' % nav)
3083            else:
3084                printerr('Invalid index %s.' % nav)
3085
3086    @require_keywords
3087    @no_argument
3088    def do_previous(self):
3089        try:
3090            self._google_url.prev_page()
3091        except ValueError as e:
3092            print(e, file=sys.stderr)
3093            return
3094
3095        self.fetch_and_display()
3096
3097    @require_keywords
3098    @no_argument
3099    def do_exact(self):
3100        # Reset start to 0 when exact is applied.
3101        self._google_url.update(start=0, exact=True)
3102        self.fetch_and_display()
3103
3104    @require_keywords
3105    @no_argument
3106    def do_unfilter(self):
3107        # Reset start to 0 when unfilter is applied.
3108        self._google_url.update(start=0)
3109        self._google_url.set_queries(filter=0)
3110        self.fetch_and_display()
3111
3112    def copy_url(self, idx):
3113        try:
3114            try:
3115                content = self._urltable[idx].encode('utf-8')
3116            except KeyError:
3117                printerr('Invalid index.')
3118                return
3119
3120            # try copying the url to clipboard using native utilities
3121            copier_params = []
3122            if sys.platform.startswith(('linux', 'freebsd', 'openbsd')):
3123                if shutil.which('xsel') is not None:
3124                    copier_params = ['xsel', '-b', '-i']
3125                elif shutil.which('xclip') is not None:
3126                    copier_params = ['xclip', '-selection', 'clipboard']
3127                elif shutil.which('wl-copy') is not None:
3128                    copier_params = ['wl-copy']
3129                elif shutil.which('termux-clipboard-set') is not None:
3130                    copier_params = ['termux-clipboard-set']
3131            elif sys.platform == 'darwin':
3132                copier_params = ['pbcopy']
3133            elif sys.platform == 'win32':
3134                copier_params = ['clip']
3135
3136            if copier_params:
3137                Popen(copier_params, stdin=PIPE, stdout=DEVNULL, stderr=DEVNULL).communicate(content)
3138                return
3139
3140            # If native clipboard utilities are absent, try to use terminal multiplexers
3141            # tmux
3142            if os.getenv('TMUX_PANE'):
3143                copier_params = ['tmux', 'set-buffer']
3144                Popen(copier_params + [content], stdin=DEVNULL, stdout=DEVNULL, stderr=DEVNULL).communicate()
3145                return
3146
3147            # GNU Screen paste buffer
3148            if os.getenv('STY'):
3149                import tempfile
3150                copier_params = ['screen', '-X', 'readbuf', '-e', 'utf8']
3151                tmpfd, tmppath = tempfile.mkstemp()
3152                try:
3153                    with os.fdopen(tmpfd, 'wb') as fp:
3154                        fp.write(content)
3155                    copier_params.append(tmppath)
3156                    Popen(copier_params, stdin=DEVNULL, stdout=DEVNULL, stderr=DEVNULL).communicate()
3157                finally:
3158                    os.unlink(tmppath)
3159                return
3160
3161            printerr('failed to locate suitable clipboard utility')
3162        except Exception:
3163            raise NoKeywordsException
3164
3165    def cmdloop(self):
3166        """Run REPL."""
3167        if self.keywords:
3168            self.fetch_and_display()
3169        else:
3170            printerr('Please initiate a query.')
3171
3172        while True:
3173            self.read_next_command()
3174            # TODO: Automatic dispatcher
3175            #
3176            # We can't write a dispatcher for now because that could
3177            # change behaviour of the prompt. However, we have already
3178            # laid a lot of ground work for the dispatcher, e.g., the
3179            # `no_argument' decorator.
3180            try:
3181                cmd = self.cmd
3182                if cmd == 'f':
3183                    self.do_first('')
3184                elif cmd.startswith('g '):
3185                    self.do_google(cmd[2:])
3186                elif cmd == 'n':
3187                    self.do_next('')
3188                elif cmd == 'o':
3189                    self.do_open()
3190                elif cmd.startswith('o '):
3191                    self.do_open(*cmd[2:].split())
3192                elif cmd.startswith('O '):
3193                    open_url.override_text_browser = True
3194                    self.do_open(*cmd[2:].split())
3195                    open_url.override_text_browser = False
3196                elif cmd == 'p':
3197                    self.do_previous('')
3198                elif cmd == 'q':
3199                    break
3200                elif cmd == 'x':
3201                    self.do_exact('')
3202                elif cmd == 'unfilter':
3203                    self.do_unfilter('')
3204                elif cmd == '?':
3205                    self.help()
3206                elif cmd in self._urltable:
3207                    open_url(self._urltable[cmd])
3208                elif self.keywords and cmd.isdigit() and int(cmd) < 100:
3209                    printerr('Index out of bound. To search for the number, use g.')
3210                elif cmd == 'u':
3211                    Result.urlexpand = not Result.urlexpand
3212                    self.display_results()
3213                elif cmd.startswith('c ') and self.re_url_index.match(cmd[2:]):
3214                    self.copy_url(cmd[2:])
3215                else:
3216                    self.do_google(cmd)
3217            except NoKeywordsException:
3218                printerr('Initiate a query first.')
3219
3220
3221class GooglerArgumentParser(argparse.ArgumentParser):
3222    """Custom argument parser for googler."""
3223
3224    # Print omniprompt help
3225    @staticmethod
3226    def print_omniprompt_help(file=None):
3227        file = sys.stderr if file is None else file
3228        file.write(textwrap.dedent("""
3229        omniprompt keys:
3230          n, p                  fetch the next or previous set of search results
3231          index                 open the result corresponding to index in browser
3232          f                     jump to the first page
3233          o [index|range|a ...] open space-separated result indices, numeric ranges
3234                                (sitelinks unsupported in ranges), or all, in browser
3235                                open the current search in browser, if no arguments
3236          O [index|range|a ...] like key 'o', but try to open in a GUI browser
3237          g keywords            new Google search for 'keywords' with original options
3238                                should be used to search omniprompt keys and indices
3239          c index               copy url to clipboard
3240          u                     toggle url expansion
3241          q, ^D, double Enter   exit googler
3242          ?                     show omniprompt help
3243          *                     other inputs issue a new search with original options
3244        """))
3245
3246    # Print information on googler
3247    @staticmethod
3248    def print_general_info(file=None):
3249        file = sys.stderr if file is None else file
3250        file.write(textwrap.dedent("""
3251        Version %s
3252        Copyright © 2008 Henri Hakkinen
3253        Copyright © 2015-2021 Arun Prakash Jana <engineerarun@gmail.com>
3254        Zhiming Wang <zmwangx@gmail.com>
3255        License: GPLv3
3256        Webpage: https://github.com/jarun/googler
3257        """ % _VERSION_))
3258
3259    # Augment print_help to print more than synopsis and options
3260    def print_help(self, file=None):
3261        super().print_help(file)
3262        self.print_omniprompt_help(file)
3263        self.print_general_info(file)
3264
3265    # Automatically print full help text on error
3266    def error(self, message):
3267        sys.stderr.write('%s: error: %s\n\n' % (self.prog, message))
3268        self.print_help(sys.stderr)
3269        self.exit(2)
3270
3271    # Type guards
3272    @staticmethod
3273    def positive_int(arg):
3274        """Try to convert a string into a positive integer."""
3275        try:
3276            n = int(arg)
3277            assert n > 0
3278            return n
3279        except (ValueError, AssertionError):
3280            raise argparse.ArgumentTypeError('%s is not a positive integer' % arg)
3281
3282    @staticmethod
3283    def nonnegative_int(arg):
3284        """Try to convert a string into a nonnegative integer."""
3285        try:
3286            n = int(arg)
3287            assert n >= 0
3288            return n
3289        except (ValueError, AssertionError):
3290            raise argparse.ArgumentTypeError('%s is not a non-negative integer' % arg)
3291
3292    @staticmethod
3293    def is_duration(arg):
3294        """Check if a string is a valid duration accepted by Google.
3295
3296        A valid duration is of the form dNUM, where d is a single letter h
3297        (hour), d (day), w (week), m (month), or y (year), and NUM is a
3298        non-negative integer.
3299        """
3300        try:
3301            if arg[0] not in ('h', 'd', 'w', 'm', 'y') or int(arg[1:]) < 0:
3302                raise ValueError
3303        except (TypeError, IndexError, ValueError):
3304            raise argparse.ArgumentTypeError('%s is not a valid duration' % arg)
3305        return arg
3306
3307    @staticmethod
3308    def is_date(arg):
3309        """Check if a string is a valid date/month/year accepted by Google."""
3310        if re.match(r'^(\d+/){0,2}\d+$', arg):
3311            return arg
3312        else:
3313            raise argparse.ArgumentTypeError('%s is not a valid date/month/year; '
3314                                             'use the American date format with slashes')
3315
3316    @staticmethod
3317    def is_colorstr(arg):
3318        """Check if a string is a valid color string."""
3319        try:
3320            assert len(arg) == 6
3321            for c in arg:
3322                assert c in COLORMAP
3323        except AssertionError:
3324            raise argparse.ArgumentTypeError('%s is not a valid color string' % arg)
3325        return arg
3326
3327
3328# Self-upgrade mechanism
3329
3330def system_is_windows():
3331    """Checks if the underlying system is Windows (Cygwin included)."""
3332    return sys.platform in {'win32', 'cygwin'}
3333
3334
3335def get_latest_ref(include_git=False):
3336    """Helper for download_latest_googler."""
3337    import urllib.request
3338
3339    if include_git:
3340        # Get SHA of latest commit on master
3341        request = urllib.request.Request('%s/commits/master' % API_REPO_BASE,
3342                                         headers={'Accept': 'application/vnd.github.v3.sha'})
3343        response = urllib.request.urlopen(request)
3344        if response.status != 200:
3345            raise http.client.HTTPException(response.reason)
3346        return response.read().decode('utf-8')
3347    else:
3348        # Get name of latest tag
3349        request = urllib.request.Request('%s/releases?per_page=1' % API_REPO_BASE,
3350                                         headers={'Accept': 'application/vnd.github.v3+json'})
3351        response = urllib.request.urlopen(request)
3352        if response.status != 200:
3353            raise http.client.HTTPException(response.reason)
3354        import json
3355        return json.loads(response.read().decode('utf-8'))[0]['tag_name']
3356
3357
3358def download_latest_googler(include_git=False):
3359    """Download latest googler to a temp file.
3360
3361    By default, the latest released version is downloaded, but if
3362    `include_git` is specified, then the latest git master is downloaded
3363    instead.
3364
3365    Parameters
3366    ----------
3367    include_git : bool, optional
3368        Download from git master. Default is False.
3369
3370    Returns
3371    -------
3372    (git_ref, path): tuple
3373         A tuple containing the git reference (either name of the latest
3374         tag or SHA of the latest commit) and path to the downloaded
3375         file.
3376
3377    """
3378    # Download googler to a tempfile
3379    git_ref = get_latest_ref(include_git=include_git)
3380    googler_download_url = '%s/%s/googler' % (RAW_DOWNLOAD_REPO_BASE, git_ref)
3381    printerr('Downloading %s' % googler_download_url)
3382    request = urllib.request.Request(googler_download_url,
3383                                     headers={'Accept-Encoding': 'gzip'})
3384    import tempfile
3385    fd, path = tempfile.mkstemp()
3386    atexit.register(lambda: os.remove(path) if os.path.exists(path) else None)
3387    os.close(fd)
3388    with open(path, 'wb') as fp:
3389        with urllib.request.urlopen(request) as response:
3390            if response.status != 200:
3391                raise http.client.HTTPException(response.reason)
3392            payload = response.read()
3393            try:
3394                fp.write(gzip.decompress(payload))
3395            except OSError:
3396                fp.write(payload)
3397    return git_ref, path
3398
3399
3400def self_replace(path):
3401    """Replace the current script with a specified file.
3402
3403    Both paths (the specified path and path to the current script) are
3404    resolved to absolute, symlink-free paths. Upon replacement, the
3405    owner and mode signatures of the current script are preserved. The
3406    caller needs to have the necessary permissions.
3407
3408    Replacement won't happen if the specified file is the same
3409    (content-wise) as the current script.
3410
3411    Parameters
3412    ----------
3413    path : str
3414        Path to the replacement file.
3415
3416    Returns
3417    -------
3418    bool
3419        True if replaced, False if skipped (specified file is the same
3420        as the current script).
3421
3422    """
3423    if system_is_windows():
3424        raise NotImplementedError('Self upgrade not supported on Windows.')
3425
3426    import filecmp
3427    import shutil
3428
3429    path = os.path.realpath(path)
3430    self_path = os.path.realpath(__file__)
3431
3432    if filecmp.cmp(path, self_path):
3433        return False
3434
3435    self_stat = os.stat(self_path)
3436    os.chown(path, self_stat.st_uid, self_stat.st_gid)
3437    os.chmod(path, self_stat.st_mode)
3438
3439    shutil.move(path, self_path)
3440    return True
3441
3442
3443def self_upgrade(include_git=False):
3444    """Perform in-place self-upgrade.
3445
3446    Parameters
3447    ----------
3448    include_git : bool, optional
3449        See `download_latest_googler`. Default is False.
3450
3451    """
3452    git_ref, path = download_latest_googler(include_git=include_git)
3453    if self_replace(path):
3454        printerr('Upgraded to %s.' % git_ref)
3455    else:
3456        printerr('Already up to date.')
3457
3458
3459def check_new_version():
3460    try:
3461        from distutils.version import StrictVersion as Version
3462    except ImportError:
3463        # distutils not available (thanks distros), use a concise poor
3464        # man's version parser.
3465        class Version(tuple):
3466            def __new__(cls, version_str):
3467                def parseint(s):
3468                    try:
3469                        return int(s)
3470                    except ValueError:
3471                        return 0
3472                return tuple.__new__(cls, [parseint(s) for s in version_str.split('.')])
3473
3474    import pathlib
3475    import tempfile
3476    import time
3477    cache = pathlib.Path(tempfile.gettempdir()) / 'googler-latest-version'
3478    latest_version_str = None
3479    # Try to load latest version string from cached location, if it
3480    # exists and is fresh enough.
3481    try:
3482        if cache.is_file() and time.time() - cache.stat().st_mtime < 86400:
3483            latest_version_str = cache.read_text().strip()
3484    except OSError:
3485        pass
3486    if not latest_version_str:
3487        try:
3488            latest_version_str = get_latest_ref().lstrip('v')
3489            cache.write_text(latest_version_str)
3490        except Exception:
3491            pass
3492    if not latest_version_str:
3493        return
3494    # Try to fetch latest version string from GitHub.
3495    try:
3496        current_version = Version(_VERSION_)
3497        latest_version = Version(latest_version_str)
3498    except ValueError:
3499        return
3500    if latest_version > current_version:
3501        print('\x1b[33;1mThe latest release of googler is v%s, please upgrade.\x1b[0m'
3502              % latest_version_str,
3503              file=sys.stderr)
3504
3505
3506# Miscellaneous functions
3507
3508def python_version():
3509    return '%d.%d.%d' % sys.version_info[:3]
3510
3511
3512def https_proxy_from_environment():
3513    return os.getenv('https_proxy')
3514
3515
3516def parse_proxy_spec(proxyspec):
3517    if '://' in proxyspec:
3518        pos = proxyspec.find('://')
3519        scheme = proxyspec[:pos]
3520        proxyspec = proxyspec[pos+3:]
3521        if scheme.lower() != 'http':
3522            # Only support HTTP proxies.
3523            #
3524            # In particular, we don't support HTTPS proxies since we
3525            # only speak plain HTTP to the proxy server, so don't give
3526            # users a false sense of security.
3527            raise NotImplementedError('Unsupported proxy scheme %s.' % scheme)
3528
3529    if '@' in proxyspec:
3530        pos = proxyspec.find('@')
3531        user_passwd = urllib.parse.unquote(proxyspec[:pos])
3532        # Remove trailing '/' if any
3533        host_port = proxyspec[pos+1:].rstrip('/')
3534    else:
3535        user_passwd = None
3536        host_port = proxyspec.rstrip('/')
3537
3538    if ':' not in host_port:
3539        # Use port 1080 as default, following curl.
3540        host_port += ':1080'
3541
3542    return user_passwd, host_port
3543
3544
3545def set_win_console_mode():
3546    # VT100 control sequences are supported on Windows 10 Anniversary Update and later.
3547    # https://docs.microsoft.com/en-us/windows/console/console-virtual-terminal-sequences
3548    # https://docs.microsoft.com/en-us/windows/console/setconsolemode
3549    if platform.release() == '10':
3550        STD_OUTPUT_HANDLE = -11
3551        STD_ERROR_HANDLE = -12
3552        ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
3553        try:
3554            from ctypes import windll, wintypes, byref
3555            kernel32 = windll.kernel32
3556            for nhandle in (STD_OUTPUT_HANDLE, STD_ERROR_HANDLE):
3557                handle = kernel32.GetStdHandle(nhandle)
3558                old_mode = wintypes.DWORD()
3559                if not kernel32.GetConsoleMode(handle, byref(old_mode)):
3560                    raise RuntimeError('GetConsoleMode failed')
3561                new_mode = old_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING
3562                if not kernel32.SetConsoleMode(handle, new_mode):
3563                    raise RuntimeError('SetConsoleMode failed')
3564            # Note: No need to restore at exit. SetConsoleMode seems to
3565            # be limited to the calling process.
3566        except Exception:
3567            pass
3568
3569
3570# Query autocompleter
3571
3572# This function is largely experimental and could raise any exception;
3573# you should be prepared to catch anything. When it works though, it
3574# returns a list of strings the prefix could autocomplete to (however,
3575# it is not guaranteed that they start with the specified prefix; for
3576# instance, they won't if the specified prefix ends in a punctuation
3577# mark.)
3578def completer_fetch_completions(prefix):
3579    import html
3580    import json
3581    import re
3582    import urllib.request
3583
3584    # One can pass the 'hl' query param to specify the language. We
3585    # ignore that for now.
3586    api_url = ('https://www.google.com/complete/search?client=psy-ab&q=%s' %
3587               urllib.parse.quote(prefix, safe=''))
3588    # A timeout of 3 seconds seems to be overly generous already.
3589    resp = urllib.request.urlopen(api_url, timeout=3)
3590    charset = resp.headers.get_content_charset()
3591    logger.debug('Completions charset: %s', charset)
3592    respobj = json.loads(resp.read().decode(charset))
3593
3594    # The response object, once parsed as JSON, should look like
3595    #
3596    # ['git',
3597    #  [['git<b>hub</b>', 0],
3598    #   ['git', 0],
3599    #   ['git<b>lab</b>', 0],
3600    #   ['git<b> stash</b>', 0]],
3601    #  {'q': 'oooAhRzoChqNmMbNaaDKXk1YY4k', 't': {'bpc': False, 'tlw': False}}]
3602    #
3603    # Note the each result entry need not have two members; e.g., for
3604    # 'gi', there is an entry ['gi<b>f</b>', 0, [131]].
3605    HTML_TAG = re.compile(r'<[^>]+>')
3606    return [html.unescape(HTML_TAG.sub('', entry[0])) for entry in respobj[1]]
3607
3608
3609def completer_run(prefix):
3610    if prefix:
3611        completions = completer_fetch_completions(prefix)
3612        if completions:
3613            print('\n'.join(completions))
3614    sys.exit(0)
3615
3616
3617def parse_args(args=None, namespace=None):
3618    """Parse googler arguments/options.
3619
3620    Parameters
3621    ----------
3622    args : list, optional
3623        Arguments to parse. Default is ``sys.argv``.
3624    namespace : argparse.Namespace
3625        Namespace to write to. Default is a new namespace.
3626
3627    Returns
3628    -------
3629    argparse.Namespace
3630        Namespace with parsed arguments / options.
3631
3632    """
3633
3634    colorstr_env = os.getenv('GOOGLER_COLORS')
3635
3636    argparser = GooglerArgumentParser(description='Google from the command-line.')
3637    addarg = argparser.add_argument
3638    addarg('-s', '--start', type=argparser.nonnegative_int, default=0,
3639           metavar='N', help='start at the Nth result')
3640    addarg('-n', '--count', dest='num', type=argparser.positive_int,
3641           default=10, metavar='N', help='show N results (default 10)')
3642    addarg('-N', '--news', action='store_true',
3643           help='show results from news section')
3644    addarg('-V', '--videos', action='store_true',
3645           help='show results from videos section')
3646    addarg('-c', '--tld', metavar='TLD',
3647           help="""country-specific search with top-level domain .TLD, e.g., 'in'
3648           for India""")
3649    addarg('-l', '--lang', metavar='LANG', help='display in language LANG')
3650    addarg('-g', '--geoloc', metavar='CC',
3651           help="""country-specific geolocation search with country code CC, e.g.
3652           'in' for India. Country codes are the same as top-level domains""")
3653    addarg('-x', '--exact', action='store_true',
3654           help='disable automatic spelling correction')
3655    addarg('--colorize', nargs='?', choices=['auto', 'always', 'never'],
3656           const='always', default='auto',
3657           help="""whether to colorize output; defaults to 'auto', which enables
3658           color when stdout is a tty device; using --colorize without an argument
3659           is equivalent to --colorize=always""")
3660    addarg('-C', '--nocolor', action='store_true',
3661           help='equivalent to --colorize=never')
3662    addarg('--colors', dest='colorstr', type=argparser.is_colorstr,
3663           default=colorstr_env if colorstr_env else 'GKlgxy', metavar='COLORS',
3664           help='set output colors (see man page for details)')
3665    addarg('-j', '--first', '--lucky', dest='lucky', action='store_true',
3666           help='open the first result in web browser and exit')
3667    addarg('-t', '--time', dest='duration', type=argparser.is_duration,
3668           metavar='dN', help='time limit search '
3669           '[h5 (5 hrs), d5 (5 days), w5 (5 weeks), m5 (5 months), y5 (5 years)]')
3670    addarg('--from', type=argparser.is_date,
3671           help="""starting date/month/year of date range; must use American date
3672           format with slashes, e.g., 2/24/2020, 2/2020, 2020; can be used in
3673           conjunction with --to, and overrides -t, --time""")
3674    addarg('--to', type=argparser.is_date,
3675           help='ending date/month/year of date range; see --from')
3676    addarg('-w', '--site', dest='sites', action='append', metavar='SITE',
3677           help='search a site using Google')
3678    addarg('-e', '--exclude', dest='exclude', action='append', metavar='SITE',
3679           help='exclude site from results')
3680    addarg('--unfilter', action='store_true', help='do not omit similar results')
3681    addarg('-p', '--proxy', default=https_proxy_from_environment(),
3682           help="""tunnel traffic through an HTTP proxy;
3683           PROXY is of the form [http://][user:password@]proxyhost[:port]""")
3684    addarg('--noua', action='store_true', help=argparse.SUPPRESS)
3685    addarg('--notweak', action='store_true',
3686           help='disable TCP optimizations and forced TLS 1.2')
3687    addarg('--json', action='store_true',
3688           help='output in JSON format; implies --noprompt')
3689    addarg('--url-handler', metavar='UTIL',
3690           help='custom script or cli utility to open results')
3691    addarg('--show-browser-logs', action='store_true',
3692           help='do not suppress browser output (stdout and stderr)')
3693    addarg('--np', '--noprompt', dest='noninteractive', action='store_true',
3694           help='search and exit, do not prompt')
3695    addarg('-4', '--ipv4', action='store_const', dest='address_family',
3696           const=socket.AF_INET, default=0,
3697           help="""only connect over IPv4
3698           (by default, IPv4 is preferred but IPv6 is used as a fallback)""")
3699    addarg('-6', '--ipv6', action='store_const', dest='address_family',
3700           const=socket.AF_INET6, default=0,
3701           help='only connect over IPv6')
3702    addarg('keywords', nargs='*', metavar='KEYWORD', help='search keywords')
3703    if ENABLE_SELF_UPGRADE_MECHANISM and not system_is_windows():
3704        addarg('-u', '--upgrade', action='store_true',
3705               help='perform in-place self-upgrade')
3706        addarg('--include-git', action='store_true',
3707               help='when used with --upgrade, get latest git master')
3708    addarg('-v', '--version', action='version', version=_VERSION_)
3709    addarg('-d', '--debug', action='store_true', help='enable debugging')
3710    # Hidden option for interacting with DOM in an IPython/pdb shell
3711    addarg('-D', '--debugger', action='store_true', help=argparse.SUPPRESS)
3712    # Hidden option for parsing dumped HTML
3713    addarg('--parse', dest='html_file', help=argparse.SUPPRESS)
3714    addarg('--complete', help=argparse.SUPPRESS)
3715
3716    parsed = argparser.parse_args(args, namespace)
3717    if parsed.nocolor:
3718        parsed.colorize = 'never'
3719
3720    return parsed
3721
3722
3723def main():
3724    try:
3725        opts = parse_args()
3726
3727        # Set logging level
3728        if opts.debug:
3729            logger.setLevel(logging.DEBUG)
3730            logger.debug('googler version %s', _VERSION_)
3731            logger.debug('Python version %s', python_version())
3732            logger.debug('Platform: %s', platform.platform())
3733            check_new_version()
3734
3735        if opts.debugger:
3736            global debugger
3737            debugger = True
3738
3739        # Handle query completer
3740        if opts.complete is not None:
3741            completer_run(opts.complete)
3742
3743        # Handle self-upgrade
3744        if hasattr(opts, 'upgrade') and opts.upgrade:
3745            self_upgrade(include_git=opts.include_git)
3746            sys.exit(0)
3747
3748        check_stdout_encoding()
3749
3750        if opts.keywords:
3751            try:
3752                # Add cmdline args to readline history
3753                readline.add_history(' '.join(opts.keywords))
3754            except Exception:
3755                pass
3756
3757        # Set colors
3758        if opts.colorize == 'always':
3759            colorize = True
3760        elif opts.colorize == 'auto':
3761            colorize = sys.stdout.isatty()
3762        else:  # opts.colorize == 'never'
3763            colorize = False
3764
3765        if colorize:
3766            colors = Colors(*[COLORMAP[c] for c in opts.colorstr], reset=COLORMAP['x'])
3767        else:
3768            colors = None
3769        Result.colors = colors
3770        Result.urlexpand = True if os.getenv('DISABLE_URL_EXPANSION') is None else False
3771        GooglerCmd.colors = colors
3772
3773        # Try to enable ANSI color support in cmd or PowerShell on Windows 10
3774        if sys.platform == 'win32' and sys.stdout.isatty() and colorize:
3775            set_win_console_mode()
3776
3777        if opts.url_handler is not None:
3778            open_url.url_handler = opts.url_handler
3779        else:
3780            # Set text browser override to False
3781            open_url.override_text_browser = False
3782
3783            # Handle browser output suppression
3784            if opts.show_browser_logs or (os.getenv('BROWSER') in text_browsers):
3785                open_url.suppress_browser_output = False
3786            else:
3787                open_url.suppress_browser_output = True
3788
3789        if opts.noua:
3790            logger.warning('--noua option has been deprecated and has no effect (see #284)')
3791
3792        repl = GooglerCmd(opts)
3793
3794        # Non-interactive mode
3795        if opts.json or opts.lucky or opts.noninteractive or opts.html_file:
3796            repl.fetch()
3797            if opts.lucky:
3798                if repl.results:
3799                    open_url(repl.results[0].url)
3800                else:
3801                    print('No results.', file=sys.stderr)
3802            else:
3803                repl.showing_results_for_alert(interactive=False)
3804                repl.display_results(json_output=opts.json)
3805            sys.exit(0)
3806
3807        # Interactive mode
3808        repl.cmdloop()
3809    except Exception as e:
3810        # With debugging on, let the exception through for a traceback;
3811        # otherwise, only print the exception error message.
3812        if logger.isEnabledFor(logging.DEBUG):
3813            raise
3814        else:
3815            logger.error(e)
3816            sys.exit(1)
3817
3818if __name__ == '__main__':
3819    main()
3820