1# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:
2
3# Copyright 2014-2021 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
4#
5# This file is part of qutebrowser.
6#
7# qutebrowser is free software: you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation, either version 3 of the License, or
10# (at your option) any later version.
11#
12# qutebrowser is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with qutebrowser.  If not, see <https://www.gnu.org/licenses/>.
19
20"""Utils regarding URL handling."""
21
22import re
23import base64
24import os.path
25import ipaddress
26import posixpath
27import urllib.parse
28import mimetypes
29from typing import Optional, Tuple, Union, Iterable
30
31from PyQt5.QtCore import QUrl
32from PyQt5.QtNetwork import QHostInfo, QHostAddress, QNetworkProxy
33
34from qutebrowser.api import cmdutils
35from qutebrowser.config import config
36from qutebrowser.utils import log, qtutils, message, utils
37from qutebrowser.browser.network import pac
38
39
40# FIXME: we probably could raise some exceptions on invalid URLs
41# https://github.com/qutebrowser/qutebrowser/issues/108
42
43
# URL schemes supported by QtWebEngine.
# NOTE(review): this list is not referenced by any function visible in this
# module; presumably it is consumed by other modules - verify against callers
# before changing its contents or type.
WEBENGINE_SCHEMES = [
    'about',
    'data',
    'file',
    'filesystem',
    'ftp',
    'http',
    'https',
    'javascript',
    'ws',
    'wss',
]
57
58
class Error(Exception):

    """Base class for errors in this module.

    InvalidUrlError derives from it, and parse_javascript_url raises it
    directly.
    """
62
63
class InvalidUrlError(Error):

    """Exception signalling that a function was handed an invalid QUrl.

    Attributes:
        url: The offending (invalid) QUrl.
        msg: The error text, as built by get_errstring().
    """

    def __init__(self, url: QUrl) -> None:
        # Constructing this error for a valid URL is a programming mistake.
        if url.isValid():
            display = url.toDisplayString()
            raise ValueError("Got valid URL {}!".format(display))
        msg = get_errstring(url)
        super().__init__(msg)
        self.url = url
        self.msg = msg
74
75
def _parse_search_term(s: str) -> Tuple[Optional[str], Optional[str]]:
    """Split user input into a search engine name and a search term.

    Args:
        s: The raw input string.

    Return:
        An (engine, term) tuple. engine is None when the default engine
        should be used; term is None when the input is only the name of a
        search engine and url.open_base_url is enabled.

    Raises:
        ValueError: If s is empty or consists only of whitespace.
    """
    s = s.strip()
    words = s.split(maxsplit=1)
    if not words:
        raise ValueError("Empty search term!")

    # Default outcome: no explicit engine, the whole input is the term.
    engine: Optional[str] = None
    term: Optional[str] = s

    if len(words) == 2:
        # "<engine> <term>" - only honoured if the first word is a known
        # search engine.
        if words[0] in config.val.url.searchengines:
            engine, term = words
    elif config.val.url.open_base_url and s in config.val.url.searchengines:
        # A lone engine name: no term, the caller opens the engine itself.
        engine, term = s, None

    log.url.debug("engine {}, term {!r}".format(engine, term))
    return (engine, term)
107
108
def _get_search_url(txt: str) -> QUrl:
    """Build the search engine URL for the given text.

    Args:
        txt: Text to search for, optionally prefixed by a search engine name.

    Return:
        The search URL as a QUrl. If only an engine name was given, this is
        the engine's URL with path, fragment and query removed.
    """
    log.url.debug("Finding search engine for {!r}".format(txt))
    engine, term = _parse_search_term(txt)
    template = config.val.url.searchengines[engine or 'DEFAULT']

    if not term:
        # No term: open the engine's base URL instead of a search.
        url = QUrl.fromUserInput(template)
        url.setPath(None)  # type: ignore[arg-type]
        url.setFragment(None)  # type: ignore[arg-type]
        url.setQuery(None)  # type: ignore[call-overload]
    else:
        # The positional placeholder and {semiquoted} keep "/" unescaped,
        # {quoted} escapes everything, {unquoted} is the raw term.
        semiquoted = urllib.parse.quote(term)
        fully_quoted = urllib.parse.quote(term, safe='')
        filled = template.format(semiquoted,
                                 unquoted=term,
                                 quoted=fully_quoted,
                                 semiquoted=semiquoted)
        url = QUrl.fromUserInput(filled)

    qtutils.ensure_valid(url)
    return url
138
139
def _is_url_naive(urlstr: str) -> bool:
    """Heuristically decide whether a string looks like a URL.

    Args:
        urlstr: The candidate URL, as string.

    Return:
        True if the string is considered a URL, False otherwise.
    """
    url = QUrl.fromUserInput(urlstr)
    assert url.isValid()
    host = url.host()

    # Qt happily turns inputs such as "23.42", "1337" or "0xDEAD" into IP
    # addresses. Only accept an IP host if it literally appears in the input.
    host_is_ip = not utils.raises(ValueError, ipaddress.ip_address, host)
    if host_is_ip and host in urlstr:
        return True

    tld_pattern = r'\.([^.0-9_-]+|xn--[a-z0-9-]+)$'
    forbidden_pattern = r'[\u0000-\u002c\u002f\u003a-\u0060\u007b-\u00b6]'
    if not re.search(tld_pattern, host):
        return False
    return not re.search(forbidden_pattern, host)
163
164
def _is_url_dns(urlstr: str) -> bool:
    """Decide whether a string is a URL by resolving its host via DNS.

    Args:
        urlstr: The candidate URL, as string.

    Return:
        True if the host could be looked up without error, False otherwise.
    """
    url = QUrl.fromUserInput(urlstr)
    assert url.isValid()

    # Qt treats things like "23.42" or "1337" or "0xDEAD" as valid addresses,
    # which we don't want: reject inputs Qt parses as an address although
    # Python's ipaddress module does not.
    not_a_real_ip = utils.raises(ValueError, ipaddress.ip_address, urlstr)
    if not_a_real_ip and not QHostAddress(urlstr).isNull():
        log.url.debug("Bogus IP URL -> False")
        return False

    host = url.host()
    if host:
        log.url.debug("Doing DNS request for {}".format(host))
        lookup = QHostInfo.fromName(host)
        return not lookup.error()

    log.url.debug("URL has no host -> False")
    return False
191
192
def fuzzy_url(urlstr: str,
              cwd: str = None,
              relative: bool = False,
              do_search: bool = True,
              force_search: bool = False) -> QUrl:
    """Turn user input (a URL, a local path or a search term) into a QUrl.

    Args:
        urlstr: The user input as a string.
        cwd: The current working directory, or None.
        relative: Whether to resolve relative files.
        do_search: Whether to perform a search on non-URLs.
        force_search: Whether to force a search even if the content can be
                      interpreted as a URL or a path.

    Return:
        A QUrl pointing to an existing local file, a search page or the
        address given by the user.

    Raises:
        InvalidUrlError: If the resulting QUrl is invalid.
    """
    urlstr = urlstr.strip()
    local_path = get_path_if_valid(urlstr, cwd=cwd, relative=relative,
                                   check_exists=True)

    # is_url() is only consulted when no search is forced, the input is not
    # an existing path and searching is allowed at all.
    if force_search or (local_path is None and
                        do_search and
                        not is_url(urlstr)):
        # probably a search term
        log.url.debug("URL is a fuzzy search term")
        try:
            url = _get_search_url(urlstr)
        except ValueError:  # invalid search engine
            url = QUrl.fromUserInput(urlstr)
    elif local_path is not None:
        url = QUrl.fromLocalFile(local_path)
    else:  # probably an address
        log.url.debug("URL is a fuzzy address")
        url = QUrl.fromUserInput(urlstr)

    log.url.debug("Converting fuzzy term {!r} to URL -> {}".format(
        urlstr, url.toDisplayString()))
    ensure_valid(url)
    return url
231
232
def _has_explicit_scheme(url: QUrl) -> bool:
    """Tell whether a URL was given with an explicit scheme.

    Args:
        url: The URL as QUrl.

    Return:
        True if the URL is valid, has a scheme, has a host or a path, and
        its path does not start with a colon.
    """
    if not url.isValid():
        return False
    if not url.scheme():
        return False

    path = url.path()
    if not (url.host() or path):
        return False

    # Generic URI syntax would actually allow a second colon right after the
    # scheme delimiter. We don't know of any URIs using this and want to
    # support e.g. searching for scoped C++ symbols, so this is treated as
    # "not a URI".
    return not path.startswith(':')
246
247
def is_special_url(url: QUrl) -> bool:
    """Check whether a URL uses one of the about/qute/file schemes.

    Args:
        url: The URL as QUrl.

    Return:
        True for valid URLs with a special scheme, False otherwise
        (including all invalid URLs).
    """
    if url.isValid():
        return url.scheme() in ('about', 'qute', 'file')
    return False
258
259
def is_url(urlstr: str) -> bool:
    """Decide whether user input should be treated as a URL.

    The decision depends on the url.auto_search setting.

    Args:
        urlstr: The URL as string.

    Return:
        True if the input is considered a URL, False otherwise.

    Raises:
        ValueError: If url.auto_search has an unknown value.
    """
    autosearch = config.val.url.auto_search

    log.url.debug("Checking if {!r} is a URL (autosearch={}).".format(
        urlstr, autosearch))

    urlstr = urlstr.strip()
    qurl = QUrl(urlstr)
    qurl_userinput = QUrl.fromUserInput(urlstr)

    if autosearch == 'never':
        # Without autosearch, everything is a URL unless it starts with an
        # explicit search engine.
        try:
            engine, _term = _parse_search_term(urlstr)
        except ValueError:
            return False
        return engine is None

    if not qurl_userinput.isValid():
        # This also catches non-URLs containing spaces.
        return False

    explicit_scheme = _has_explicit_scheme(qurl) and ' ' not in urlstr

    if explicit_scheme:
        # URLs with explicit schemes are always URLs
        log.url.debug("Contains explicit scheme")
        result = True
    elif autosearch == 'schemeless':
        # With autosearch=schemeless, only inputs with an explicit scheme
        # count as URLs - and we already know there is none here.
        log.url.debug("No explicit scheme in given URL, treating as non-URL")
        result = False
    elif qurl_userinput.host() in ['localhost', '127.0.0.1', '::1']:
        log.url.debug("Is localhost.")
        result = True
    elif is_special_url(qurl):
        log.url.debug("Is a special URL.")
        result = True
    elif autosearch == 'dns':
        log.url.debug("Checking via DNS check")
        # QUrl.fromUserInput is used here, as the user might enter "foo.de"
        # and that should be treated as URL.
        username_ok = ' ' not in qurl_userinput.userName()
        result = username_ok and _is_url_dns(urlstr)
    elif autosearch == 'naive':
        log.url.debug("Checking via naive check")
        username_ok = ' ' not in qurl_userinput.userName()
        result = username_ok and _is_url_naive(urlstr)
    else:  # pragma: no cover
        raise ValueError("Invalid autosearch value")

    log.url.debug("url = {}".format(result))
    return result
320
321
def ensure_valid(url: QUrl) -> None:
    """Raise InvalidUrlError if the given QUrl is not valid.

    Args:
        url: The QUrl to check.
    """
    if url.isValid():
        return
    raise InvalidUrlError(url)
325
326
def invalid_url_error(url: QUrl, action: str) -> None:
    """Report an invalid URL via message.error.

    Args:
        url: The invalid QUrl.
        action: The action which was interrupted by the error.

    Raises:
        ValueError: If the given URL is actually valid.
    """
    if url.isValid():
        shown = url.toDisplayString()
        raise ValueError(
            "Calling invalid_url_error with valid URL {}".format(shown))

    prefix = "Trying to {} with invalid URL".format(action)
    message.error(get_errstring(url, prefix))
339
340
def raise_cmdexc_if_invalid(url: QUrl) -> None:
    """Check if the given QUrl is invalid, and if so, raise a CommandError.

    Args:
        url: The QUrl to validate.

    Raises:
        cmdutils.CommandError: If the URL is invalid. Its message is the text
                               of the underlying InvalidUrlError, which is
                               attached as the explicit cause.
    """
    try:
        ensure_valid(url)
    except InvalidUrlError as e:
        # Chain explicitly, so a traceback shows this as a deliberate
        # translation and not as a second error during handling.
        raise cmdutils.CommandError(str(e)) from e
347
348
def get_path_if_valid(pathstr: str,
                      cwd: str = None,
                      relative: bool = False,
                      check_exists: bool = False) -> Optional[str]:
    """Interpret a string as a local path, if possible.

    Args:
        pathstr: The path as string.
        cwd: The current working directory, or None.
        relative: Whether to resolve relative files.
        check_exists: Whether to check if the file actually exists on the
                      filesystem.

    Return:
        The (absolute or cwd-joined) path if it is a valid path, None
        otherwise.
    """
    pathstr = pathstr.strip()
    log.url.debug("Checking if {!r} is a path".format(pathstr))
    expanded = os.path.expanduser(pathstr)

    path: Optional[str] = None
    if os.path.isabs(expanded):
        path = expanded
    elif relative:
        if cwd:
            path = os.path.join(cwd, expanded)
        else:
            try:
                path = os.path.abspath(expanded)
            except OSError:
                # Could not be resolved: path stays None.
                pass

    if path is None or not check_exists:
        return path

    try:
        exists = os.path.exists(path)
    except UnicodeEncodeError:
        log.url.debug(
            "URL contains characters which are not present in the "
            "current locale")
        return None

    if not exists:
        return None
    log.url.debug("URL is a local file")
    return path
395
396
def filename_from_url(url: QUrl, fallback: str = None) -> Optional[str]:
    """Derive a suitable filename from a URL.

    Args:
        url: The URL to parse, as a QUrl.
        fallback: Value to use if no name can be determined.

    Return:
        The suggested filename as a string, or the fallback (None by
        default).
    """
    if not url.isValid():
        return fallback

    if url.scheme().lower() == 'data':
        # data: URLs carry no name, so build one from the guessed mimetype.
        mimetype, _encoding = mimetypes.guess_type(url.toString())
        if mimetype:
            extension = utils.mimetype_extension(mimetype) or ''
            return 'download' + extension
        return fallback

    basename = posixpath.basename(url.path())
    if basename:
        return basename

    host = url.host()
    if host:
        return host + '.html'
    return fallback
425
426
# Type of the (scheme, host, port) tuple returned by host_tuple().
HostTupleType = Tuple[str, str, int]
428
429
def host_tuple(url: QUrl) -> HostTupleType:
    """Build a (scheme, host, port) tuple for a QUrl.

    This is suitable to identify a connection, e.g. for SSL errors.

    Args:
        url: The URL as a QUrl.

    Return:
        The (scheme, host, port) tuple. If the URL has no explicit port, the
        default port for http, https or ftp is used.

    Raises:
        InvalidUrlError: If the URL is invalid.
        ValueError: If the URL has no host, or no port can be determined.
    """
    ensure_valid(url)
    scheme = url.scheme()
    host = url.host()
    port = url.port()
    assert scheme
    if not host:
        raise ValueError("Got URL {} without host.".format(
            url.toDisplayString()))

    if port != -1:
        return scheme, host, port

    # No explicit port given (-1): fall back to the scheme's default port.
    default_ports = {
        'http': 80,
        'https': 443,
        'ftp': 21,
    }
    try:
        return scheme, host, default_ports[scheme]
    except KeyError:
        raise ValueError("Got URL {} with unknown port.".format(
            url.toDisplayString()))
453
454
def get_errstring(url: QUrl, base: str = "Invalid URL") -> str:
    """Build an error message for a URL.

    Args:
        url: The URL as a QUrl.
        base: The base error string.

    Return:
        The base string, with " - " and url.errorString() appended if the
        latter is non-empty.
    """
    detail = url.errorString()
    if not detail:
        return base
    return base + " - {}".format(detail)
470
471
def same_domain(url1: QUrl, url2: QUrl) -> bool:
    """Check whether two URLs belong to the same website.

    What counts as a "top level domain" is taken from QUrl.topLevelDomain();
    only the label directly in front of it is compared, all further
    subdomains are ignored.

    For example, example.com and www.example.com are considered the same,
    but example.co.uk and test.co.uk are not.

    URLs with different schemes or ports are never considered the same.

    Return:
        True if the domains are the same, False otherwise.
    """
    ensure_valid(url1)
    ensure_valid(url2)

    if url1.scheme() != url2.scheme() or url1.port() != url2.port():
        return False

    suffix1 = url1.topLevelDomain()
    suffix2 = url2.topLevelDomain()
    host1 = url1.host()
    host2 = url2.host()

    if not suffix1:
        # No known suffix: fall back to comparing the full hosts.
        return host1 == host2
    if suffix1 != suffix2:
        return False

    # Strip the suffix and keep the last remaining label of each host.
    label1 = host1[:-len(suffix1)].split('.')[-1]
    label2 = host2[:-len(suffix2)].split('.')[-1]
    return label1 == label2
505
506
def encoded_url(url: QUrl) -> str:
    """Get the fully encoded form of a URL as a string.

    Args:
        url: The url to encode as QUrl.

    Return:
        The result of url.toEncoded(), decoded as ASCII.
    """
    raw = url.toEncoded().data()
    return raw.decode('ascii')
514
515
def file_url(path: str) -> str:
    """Build a fully encoded file:// URL string for a local path.

    Arguments:
        path: The absolute path to the local file

    Return:
        The URL as a string.
    """
    return QUrl.fromLocalFile(path).toString(
        QUrl.FullyEncoded)  # type: ignore[arg-type]
524
525
def data_url(mimetype: str, data: bytes) -> QUrl:
    """Build a base64-encoded data: QUrl from a mimetype and raw bytes.

    Args:
        mimetype: The mimetype to put into the URL.
        data: The payload, which gets base64-encoded.

    Return:
        The resulting data: URL as a QUrl.
    """
    payload = base64.b64encode(data).decode('ascii')
    uri = 'data:{};base64,{}'.format(mimetype, payload)
    url = QUrl(uri)
    qtutils.ensure_valid(url)
    return url
532
533
def safe_display_string(qurl: QUrl) -> str:
    """Get an IDN-homograph phishing safe display form of the given QUrl.

    If the host contains a Punycode-encoded ("xn--") label and its decoded
    form differs from the encoded one, the encoded hostname is prepended in
    parentheses, to make sure those URLs are distinguishable.

    See https://github.com/qutebrowser/qutebrowser/issues/2547
    and https://bugreports.qt.io/browse/QTBUG-60365

    Raises:
        InvalidUrlError: If the URL is invalid.
    """
    ensure_valid(qurl)

    encoded_host = qurl.host(QUrl.FullyEncoded)
    # https://bugreports.qt.io/browse/QTBUG-60364
    assert '..' not in encoded_host, qurl
    decoded_host = qurl.host(QUrl.FullyDecoded)

    display = qurl.toDisplayString()
    has_punycode = any(label.startswith('xn--')
                       for label in encoded_host.split('.'))
    if has_punycode and encoded_host != decoded_host:
        return '({}) {}'.format(encoded_host, display)
    return display
554
555
class InvalidProxyTypeError(Exception):

    """Exception raised by proxy_from_url for an unknown proxy type.

    Args:
        typ: The offending proxy type (the URL scheme), used in the message.
    """

    def __init__(self, typ: str) -> None:
        text = "Invalid proxy type {}!".format(typ)
        super().__init__(text)
562
563
def proxy_from_url(url: QUrl) -> Union[QNetworkProxy, pac.PACFetcher]:
    """Create a proxy object from a proxy URL.

    Args:
        url: URL of a proxy (possibly with credentials).

    Return:
        For pac+http/pac+https/pac+file URLs, a pac.PACFetcher on which
        fetch() was already called; otherwise a new QNetworkProxy.

    Raises:
        InvalidUrlError: If the URL is invalid.
        InvalidProxyTypeError: If the URL's scheme is not a known proxy type.
    """
    ensure_valid(url)

    scheme = url.scheme()
    if scheme in ('pac+http', 'pac+https', 'pac+file'):
        fetcher = pac.PACFetcher(url)
        fetcher.fetch()
        return fetcher

    proxy_types = {
        'http': QNetworkProxy.HttpProxy,
        'socks': QNetworkProxy.Socks5Proxy,
        'socks5': QNetworkProxy.Socks5Proxy,
        'direct': QNetworkProxy.NoProxy,
    }
    if scheme not in proxy_types:
        raise InvalidProxyTypeError(scheme)

    proxy = QNetworkProxy(proxy_types[scheme], url.host())

    # Only override what the URL actually specifies.
    port = url.port()
    if port != -1:
        proxy.setPort(port)

    user = url.userName()
    if user:
        proxy.setUser(user)

    password = url.password()
    if password:
        proxy.setPassword(password)

    return proxy
599
600
def parse_javascript_url(url: QUrl) -> str:
    """Extract the JavaScript source code from a javascript: URL.

    See https://wiki.whatwg.org/wiki/URL_schemes#javascript:_URLs
    and https://github.com/whatwg/url/issues/385

    Args:
        url: The javascript: URL as a QUrl.

    Return:
        The percent-decoded code following the "javascript:" prefix.

    Raises:
        InvalidUrlError: If the URL is invalid.
        Error: If the scheme is not javascript, the URL has an authority
               component, or the resulting code is empty.
    """
    ensure_valid(url)
    if url.scheme() != 'javascript':
        raise Error("Expected a javascript:... URL")

    authority = url.authority()
    if authority:
        raise Error("URL contains unexpected components: {}"
                    .format(authority))

    encoded = url.toString(QUrl.FullyEncoded)  # type: ignore[arg-type]
    decoded = urllib.parse.unquote(encoded)

    prefix = 'javascript:'
    code = decoded[len(prefix):]
    if code:
        return code
    raise Error("Resulted in empty JavaScript code")
622
623
def widened_hostnames(hostname: str) -> Iterable[str]:
    """Yield a hostname followed by its successively wider parent names.

    Each step drops everything up to and including the first dot, until
    nothing is left.

    Ex: a.c.foo -> [a.c.foo, c.foo, foo]
    """
    remaining = hostname
    while remaining:
        yield remaining
        _label, _dot, remaining = remaining.partition(".")
631