# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:

# Copyright 2014-2021 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
#
# This file is part of qutebrowser.
#
# qutebrowser is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# qutebrowser is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with qutebrowser. If not, see <https://www.gnu.org/licenses/>.

"""Utils regarding URL handling."""

import re
import base64
import os.path
import ipaddress
import posixpath
import urllib.parse
import mimetypes
from typing import Optional, Tuple, Union, Iterable

from PyQt5.QtCore import QUrl
from PyQt5.QtNetwork import QHostInfo, QHostAddress, QNetworkProxy

from qutebrowser.api import cmdutils
from qutebrowser.config import config
from qutebrowser.utils import log, qtutils, message, utils
from qutebrowser.browser.network import pac


# FIXME: we probably could raise some exceptions on invalid URLs
# https://github.com/qutebrowser/qutebrowser/issues/108


# URL schemes supported by QtWebEngine
WEBENGINE_SCHEMES = [
    'about',
    'data',
    'file',
    'filesystem',
    'ftp',
    'http',
    'https',
    'javascript',
    'ws',
    'wss',
]


class Error(Exception):

    """Base class for errors in this module."""


class InvalidUrlError(Error):

    """Error raised if a function got an invalid URL.

    Attributes:
        url: The invalid QUrl this error was created for.
        msg: The error message, as built by get_errstring().
    """

    def __init__(self, url: QUrl) -> None:
        # Constructing this error for a valid URL is a programming mistake.
        if url.isValid():
            raise ValueError("Got valid URL {}!".format(url.toDisplayString()))
        self.url = url
        self.msg = get_errstring(url)
        super().__init__(self.msg)


def _parse_search_term(s: str) -> Tuple[Optional[str], Optional[str]]:
    """Get a search engine name and search term from a string.

    Args:
        s: The string to get a search engine for.

    Return:
        A (engine, term) tuple, where engine is None for the default engine.
        term is None if s only consists of a search engine name and
        url.open_base_url is set.

    Raises:
        ValueError: If s is empty or only consists of whitespace.
    """
    s = s.strip()
    split = s.split(maxsplit=1)
    if not split:
        raise ValueError("Empty search term!")

    if len(split) == 2:
        if split[0] in config.val.url.searchengines:
            engine: Optional[str] = split[0]
            term: Optional[str] = split[1]
        else:
            engine = None
            term = s
    else:
        if config.val.url.open_base_url and s in config.val.url.searchengines:
            engine = s
            term = None
        else:
            engine = None
            term = s

    log.url.debug("engine {}, term {!r}".format(engine, term))
    return (engine, term)


def _get_search_url(txt: str) -> QUrl:
    """Get a search engine URL for a text.

    Args:
        txt: Text to search for.

    Return:
        The search URL as a QUrl.

    Raises:
        ValueError: If txt is empty (propagated from _parse_search_term).
    """
    log.url.debug("Finding search engine for {!r}".format(txt))
    engine, term = _parse_search_term(txt)
    if not engine:
        engine = 'DEFAULT'
    if term:
        template = config.val.url.searchengines[engine]
        # "semiquoted" keeps slashes unquoted, "quoted" quotes everything.
        semiquoted_term = urllib.parse.quote(term)
        quoted_term = urllib.parse.quote(term, safe='')
        evaluated = template.format(semiquoted_term,
                                    unquoted=term,
                                    quoted=quoted_term,
                                    semiquoted=semiquoted_term)
        url = QUrl.fromUserInput(evaluated)
    else:
        # No search term: open the base URL of the search engine by removing
        # path, fragment and query from its template.
        url = QUrl.fromUserInput(config.val.url.searchengines[engine])
        url.setPath(None)  # type: ignore[arg-type]
        url.setFragment(None)  # type: ignore[arg-type]
        url.setQuery(None)  # type: ignore[call-overload]
    qtutils.ensure_valid(url)
    return url


def _is_url_naive(urlstr: str) -> bool:
    """Naive check if given URL is really a URL.

    Args:
        urlstr: The URL to check for, as string.

    Return:
        True if the URL really is a URL, False otherwise.
    """
    url = QUrl.fromUserInput(urlstr)
    assert url.isValid()
    host = url.host()

    # Valid IPv4/IPv6 address. Qt converts things like "23.42" or "1337" or
    # "0xDEAD" to IP addresses, which we don't like, so we check if the host
    # from Qt is part of the input.
    if (not utils.raises(ValueError, ipaddress.ip_address, host) and
            host in urlstr):
        return True

    tld = r'\.([^.0-9_-]+|xn--[a-z0-9-]+)$'
    forbidden = r'[\u0000-\u002c\u002f\u003a-\u0060\u007b-\u00b6]'
    return bool(re.search(tld, host) and not re.search(forbidden, host))


def _is_url_dns(urlstr: str) -> bool:
    """Check if a URL is really a URL via DNS.

    Args:
        urlstr: The URL to check for as a string.

    Return:
        True if the URL really is a URL, False otherwise.
    """
    url = QUrl.fromUserInput(urlstr)
    assert url.isValid()

    if (utils.raises(ValueError, ipaddress.ip_address, urlstr) and
            not QHostAddress(urlstr).isNull()):
        log.url.debug("Bogus IP URL -> False")
        # Qt treats things like "23.42" or "1337" or "0xDEAD" as valid URLs
        # which we don't want to.
        return False

    host = url.host()
    if not host:
        log.url.debug("URL has no host -> False")
        return False
    log.url.debug("Doing DNS request for {}".format(host))
    info = QHostInfo.fromName(host)
    return not info.error()


def fuzzy_url(urlstr: str,
              cwd: Optional[str] = None,
              relative: bool = False,
              do_search: bool = True,
              force_search: bool = False) -> QUrl:
    """Get a QUrl based on a user input which is URL or search term.

    Args:
        urlstr: URL to load as a string.
        cwd: The current working directory, or None.
        relative: Whether to resolve relative files.
        do_search: Whether to perform a search on non-URLs.
        force_search: Whether to force a search even if the content can be
                      interpreted as a URL or a path.

    Return:
        A target QUrl to a search page or the original URL.

    Raises:
        InvalidUrlError: If the resulting URL is invalid.
    """
    urlstr = urlstr.strip()

    # With force_search, a matching local path is ignored anyways, so there
    # is no need to touch the filesystem at all.
    path = None
    if not force_search:
        path = get_path_if_valid(urlstr, cwd=cwd, relative=relative,
                                 check_exists=True)

    if path is not None:
        url = QUrl.fromLocalFile(path)
    elif force_search or (do_search and not is_url(urlstr)):
        # probably a search term
        log.url.debug("URL is a fuzzy search term")
        try:
            url = _get_search_url(urlstr)
        except ValueError:  # e.g. empty search term
            url = QUrl.fromUserInput(urlstr)
    else:  # probably an address
        log.url.debug("URL is a fuzzy address")
        url = QUrl.fromUserInput(urlstr)
    log.url.debug("Converting fuzzy term {!r} to URL -> {}".format(
        urlstr, url.toDisplayString()))
    ensure_valid(url)
    return url


def _has_explicit_scheme(url: QUrl) -> bool:
    """Check if a url has an explicit scheme given.

    Args:
        url: The URL as QUrl.

    Return:
        True if the URL is valid, has a scheme and a host or path, and its
        path does not start with a colon.
    """
    # Note that generic URI syntax actually would allow a second colon
    # after the scheme delimiter. Since we don't know of any URIs
    # using this and want to support e.g. searching for scoped C++
    # symbols, we treat this as not a URI anyways.
    return bool(url.isValid() and url.scheme() and
                (url.host() or url.path()) and
                not url.path().startswith(':'))


def is_special_url(url: QUrl) -> bool:
    """Return True if url is an about:... or other special URL.

    Args:
        url: The URL as QUrl.
    """
    if not url.isValid():
        return False
    special_schemes = ('about', 'qute', 'file')
    return url.scheme() in special_schemes


def is_url(urlstr: str) -> bool:
    """Check if url seems to be a valid URL.

    Args:
        urlstr: The URL as string.

    Return:
        True if it is a valid URL, False otherwise.
    """
    autosearch = config.val.url.auto_search

    log.url.debug("Checking if {!r} is a URL (autosearch={}).".format(
        urlstr, autosearch))

    urlstr = urlstr.strip()

    if autosearch == 'never':
        # no autosearch, so everything is a URL unless it has an explicit
        # search engine.
        try:
            engine, _term = _parse_search_term(urlstr)
        except ValueError:
            return False
        else:
            return engine is None

    qurl = QUrl(urlstr)
    qurl_userinput = QUrl.fromUserInput(urlstr)

    if not qurl_userinput.isValid():
        # This will also catch non-URLs containing spaces.
        return False

    if _has_explicit_scheme(qurl) and ' ' not in urlstr:
        # URLs with explicit schemes are always URLs
        log.url.debug("Contains explicit scheme")
        url = True
    elif autosearch == 'schemeless':
        # When autosearch=schemeless, URLs must contain schemes to be valid.
        # We only get here if the explicit scheme check above failed.
        log.url.debug("No explicit scheme in given URL, treating as non-URL")
        url = False
    elif qurl_userinput.host() in ['localhost', '127.0.0.1', '::1']:
        log.url.debug("Is localhost.")
        url = True
    elif is_special_url(qurl):
        # Special URLs (see is_special_url) are treated as URLs
        log.url.debug("Is a special URL.")
        url = True
    elif autosearch == 'dns':
        log.url.debug("Checking via DNS check")
        # We want to use QUrl.fromUserInput here, as the user might enter
        # "foo.de" and that should be treated as URL here.
        url = ' ' not in qurl_userinput.userName() and _is_url_dns(urlstr)
    elif autosearch == 'naive':
        log.url.debug("Checking via naive check")
        url = ' ' not in qurl_userinput.userName() and _is_url_naive(urlstr)
    else:  # pragma: no cover
        raise ValueError("Invalid autosearch value")
    log.url.debug("url = {}".format(url))
    return url


def ensure_valid(url: QUrl) -> None:
    """Raise an InvalidUrlError if the given QUrl is invalid.

    Args:
        url: The URL as QUrl.
    """
    if not url.isValid():
        raise InvalidUrlError(url)


def invalid_url_error(url: QUrl, action: str) -> None:
    """Display an error message for a URL.

    Args:
        url: The invalid URL as QUrl.
        action: The action which was interrupted by the error.

    Raises:
        ValueError: If the given URL is actually valid.
    """
    if url.isValid():
        raise ValueError("Calling invalid_url_error with valid URL {}".format(
            url.toDisplayString()))
    errstring = get_errstring(
        url, "Trying to {} with invalid URL".format(action))
    message.error(errstring)


def raise_cmdexc_if_invalid(url: QUrl) -> None:
    """Check if the given QUrl is invalid, and if so, raise a CommandError."""
    try:
        ensure_valid(url)
    except InvalidUrlError as e:
        raise cmdutils.CommandError(str(e)) from e


def get_path_if_valid(pathstr: str,
                      cwd: Optional[str] = None,
                      relative: bool = False,
                      check_exists: bool = False) -> Optional[str]:
    """Check if path is a valid path.

    Args:
        pathstr: The path as string.
        cwd: The current working directory, or None.
        relative: Whether to resolve relative files.
        check_exists: Whether to check if the file
                      actually exists on the filesystem.

    Return:
        The path if it is a valid path, None otherwise.
    """
    pathstr = pathstr.strip()
    log.url.debug("Checking if {!r} is a path".format(pathstr))
    expanded = os.path.expanduser(pathstr)

    if os.path.isabs(expanded):
        path: Optional[str] = expanded
    elif relative and cwd:
        path = os.path.join(cwd, expanded)
    elif relative:
        try:
            path = os.path.abspath(expanded)
        except OSError:
            path = None
    else:
        path = None

    if check_exists and path is not None:
        try:
            if os.path.exists(path):
                log.url.debug("URL is a local file")
            else:
                path = None
        except UnicodeEncodeError:
            log.url.debug(
                "URL contains characters which are not present in the "
                "current locale")
            path = None

    return path


def filename_from_url(url: QUrl,
                      fallback: Optional[str] = None) -> Optional[str]:
    """Get a suitable filename from a URL.

    Args:
        url: The URL to parse, as a QUrl.
        fallback: Value to use if no name can be determined.

    Return:
        The suggested filename as a string, or None.
    """
    if not url.isValid():
        return fallback

    if url.scheme().lower() == 'data':
        # data: URLs have no path to take a name from, so build one from
        # the guessed mimetype instead.
        mimetype, _encoding = mimetypes.guess_type(url.toString())
        if not mimetype:
            return fallback

        ext = utils.mimetype_extension(mimetype) or ''
        return 'download' + ext

    pathname = posixpath.basename(url.path())
    if pathname:
        return pathname
    elif url.host():
        return url.host() + '.html'
    else:
        return fallback


HostTupleType = Tuple[str, str, int]


def host_tuple(url: QUrl) -> HostTupleType:
    """Get a (scheme, host, port) tuple from a QUrl.

    This is suitable to identify a connection, e.g. for SSL errors.

    Raises:
        InvalidUrlError: If the URL is invalid.
        ValueError: If the URL has no host, or has no explicit port and a
                    scheme without a known default port.
    """
    ensure_valid(url)
    scheme, host, port = url.scheme(), url.host(), url.port()
    assert scheme
    if not host:
        raise ValueError("Got URL {} without host.".format(
            url.toDisplayString()))
    if port == -1:
        # No explicit port given, fall back to the default for the scheme.
        port_mapping = {
            'http': 80,
            'https': 443,
            'ftp': 21,
        }
        try:
            port = port_mapping[scheme]
        except KeyError:
            # The KeyError is an implementation detail, so don't chain it.
            raise ValueError("Got URL {} with unknown port.".format(
                url.toDisplayString())) from None
    return scheme, host, port


def get_errstring(url: QUrl, base: str = "Invalid URL") -> str:
    """Get an error string for a URL.

    Args:
        url: The URL as a QUrl.
        base: The base error string.

    Return:
        A new string with url.errorString() appended if available.
    """
    url_error = url.errorString()
    if url_error:
        return base + " - {}".format(url_error)
    else:
        return base


def same_domain(url1: QUrl, url2: QUrl) -> bool:
    """Check if url1 and url2 belong to the same website.

    This will use a "public suffix list" to determine what a "top level domain"
    is. All further domains are ignored.

    For example example.com and www.example.com are considered the same. but
    example.co.uk and test.co.uk are not.

    If the URL's schemes or ports are different, they are always treated as not
    equal.

    Args:
        url1: The first URL as QUrl.
        url2: The second URL as QUrl.

    Return:
        True if the domains are the same, False otherwise.
    """
    ensure_valid(url1)
    ensure_valid(url2)

    if url1.scheme() != url2.scheme():
        return False
    if url1.port() != url2.port():
        return False

    suffix1 = url1.topLevelDomain()
    suffix2 = url2.topLevelDomain()
    if not suffix1:
        # No known suffix, so we can only compare the full hosts.
        return url1.host() == url2.host()

    if suffix1 != suffix2:
        return False

    # Compare the last label in front of the public suffix.
    domain1 = url1.host()[:-len(suffix1)].split('.')[-1]
    domain2 = url2.host()[:-len(suffix2)].split('.')[-1]
    return domain1 == domain2


def encoded_url(url: QUrl) -> str:
    """Return the fully encoded url as string.

    Args:
        url: The url to encode as QUrl.
    """
    return url.toEncoded().data().decode('ascii')


def file_url(path: str) -> str:
    """Return a file:// url (as string) to the given local path.

    Arguments:
        path: The absolute path to the local file
    """
    url = QUrl.fromLocalFile(path)
    return url.toString(QUrl.FullyEncoded)  # type: ignore[arg-type]


def data_url(mimetype: str, data: bytes) -> QUrl:
    """Get a data: QUrl for the given data."""
    b64 = base64.b64encode(data).decode('ascii')
    url = QUrl('data:{};base64,{}'.format(mimetype, b64))
    qtutils.ensure_valid(url)
    return url


def safe_display_string(qurl: QUrl) -> str:
    """Get a IDN-homograph phishing safe form of the given QUrl.

    If we're dealing with a Punycode-encoded URL, this prepends the hostname in
    its encoded form, to make sure those URLs are distinguishable.

    See https://github.com/qutebrowser/qutebrowser/issues/2547
    and https://bugreports.qt.io/browse/QTBUG-60365
    """
    ensure_valid(qurl)

    host = qurl.host(QUrl.FullyEncoded)
    assert '..' not in host, qurl  # https://bugreports.qt.io/browse/QTBUG-60364

    decoded_host = qurl.host(QUrl.FullyDecoded)
    has_punycode = any(part.startswith('xn--') for part in host.split('.'))
    if has_punycode and host != decoded_host:
        return '({}) {}'.format(host, qurl.toDisplayString())

    return qurl.toDisplayString()


class InvalidProxyTypeError(Exception):

    """Error raised when proxy_from_url gets an unknown proxy type."""

    def __init__(self, typ: str) -> None:
        super().__init__("Invalid proxy type {}!".format(typ))


def proxy_from_url(url: QUrl) -> Union[QNetworkProxy, pac.PACFetcher]:
    """Create a QNetworkProxy or PACFetcher from a proxy QUrl.

    Args:
        url: URL of a proxy (possibly with credentials).

    Return:
        A pac.PACFetcher (with the fetch already started) for pac+http,
        pac+https and pac+file URLs, a new QNetworkProxy otherwise.

    Raises:
        InvalidUrlError: If the URL is invalid.
        InvalidProxyTypeError: If the URL's scheme is no known proxy type.
    """
    ensure_valid(url)

    scheme = url.scheme()
    if scheme in ['pac+http', 'pac+https', 'pac+file']:
        fetcher = pac.PACFetcher(url)
        fetcher.fetch()
        return fetcher

    types = {
        'http': QNetworkProxy.HttpProxy,
        'socks': QNetworkProxy.Socks5Proxy,
        'socks5': QNetworkProxy.Socks5Proxy,
        'direct': QNetworkProxy.NoProxy,
    }
    if scheme not in types:
        raise InvalidProxyTypeError(scheme)

    proxy = QNetworkProxy(types[scheme], url.host())

    # Only set the optional parts which are actually given in the URL.
    if url.port() != -1:
        proxy.setPort(url.port())
    if url.userName():
        proxy.setUser(url.userName())
    if url.password():
        proxy.setPassword(url.password())
    return proxy


def parse_javascript_url(url: QUrl) -> str:
    """Get JavaScript source from the given URL.

    See https://wiki.whatwg.org/wiki/URL_schemes#javascript:_URLs
    and https://github.com/whatwg/url/issues/385

    Raises:
        InvalidUrlError: If the URL is invalid.
        Error: If the URL is no javascript: URL, contains an authority, or
               results in empty code.
    """
    ensure_valid(url)
    if url.scheme() != 'javascript':
        raise Error("Expected a javascript:... URL")
    if url.authority():
        raise Error("URL contains unexpected components: {}"
                    .format(url.authority()))

    urlstr = url.toString(QUrl.FullyEncoded)  # type: ignore[arg-type]
    urlstr = urllib.parse.unquote(urlstr)

    code = urlstr[len('javascript:'):]
    if not code:
        raise Error("Resulted in empty JavaScript code")

    return code


def widened_hostnames(hostname: str) -> Iterable[str]:
    """A generator for widening string hostnames.

    Ex: a.c.foo -> [a.c.foo, c.foo, foo]
    """
    while hostname:
        yield hostname
        hostname = hostname.partition(".")[-1]