# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:

# Copyright 2018-2021 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
#
# This file is part of qutebrowser.
#
# qutebrowser is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# qutebrowser is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with qutebrowser.  If not, see <https://www.gnu.org/licenses/>.

"""A Chromium-like URL matching pattern.

See:
https://developer.chrome.com/apps/match_patterns
https://cs.chromium.org/chromium/src/extensions/common/url_pattern.cc
https://cs.chromium.org/chromium/src/extensions/common/url_pattern.h

Based on the following commit in Chromium:
https://chromium.googlesource.com/chromium/src/+/6f4a6681eae01c2036336c18b06303e16a304a7c
(October 10 2020, newest commit as per October 28th 2020)
"""

import ipaddress
import fnmatch
import urllib.parse
from typing import Any, Optional, Tuple

from PyQt5.QtCore import QUrl

from qutebrowser.utils import utils, qtutils


class ParseError(Exception):

    """Raised when a pattern could not be parsed."""


class UrlPattern:

    """A Chromium-like URL matching pattern.

    Class attributes:
        _DEFAULT_PORTS: The default ports used for schemes which support ports.
        _SCHEMES_WITHOUT_HOST: Schemes which don't need a host.

    Attributes:
        host: The host to match to, or None for any host.
        _pattern: The given pattern as string.
        _match_all: Whether the pattern should match all URLs.
        _match_subdomains: Whether the pattern should match subdomains of the
                           given host.
        _scheme: The scheme to match to, or None to match any scheme.
                 Note that with Chromium, '*'/None only matches http/https and
                 not file/ftp. We deviate from that as per-URL settings aren't
                 security relevant.
        _path: The path to match to, or None for any path.
        _port: The port to match to as integer, or None for any port.
    """

    _DEFAULT_PORTS = {'https': 443, 'http': 80, 'ftp': 21}
    _SCHEMES_WITHOUT_HOST = ['about', 'file', 'data', 'javascript']

    def __init__(self, pattern: str) -> None:
        """Parse the given pattern string.

        Args:
            pattern: The pattern to parse, or the literal '<all_urls>'.

        Raises:
            ParseError: If the pattern contains a NUL byte, can't be parsed by
                        urllib.parse, or one of its components is invalid.
        """
        # Make sure all attributes are initialized if we exit early.
        self._pattern = pattern
        self._match_all = False
        self._match_subdomains: bool = False
        self._scheme: Optional[str] = None
        self.host: Optional[str] = None
        self._path: Optional[str] = None
        self._port: Optional[int] = None

        # > The special pattern <all_urls> matches any URL that starts with a
        # > permitted scheme.
        if pattern == '<all_urls>':
            self._match_all = True
            return

        if '\0' in pattern:
            raise ParseError("May not contain NUL byte")

        pattern = self._fixup_pattern(pattern)

        # We use urllib.parse instead of QUrl here because it can handle
        # hosts with * in them.
        try:
            parsed = urllib.parse.urlparse(pattern)
        except ValueError as e:
            # Chain the cause so the underlying urllib error stays visible.
            raise ParseError(str(e)) from e

        assert parsed is not None

        # The scheme has to be parsed first: _init_host, _init_path and
        # _init_port all read self._scheme.
        self._init_scheme(parsed)
        self._init_host(parsed)
        self._init_path(parsed)
        self._init_port(parsed)

    def _to_tuple(self) -> Tuple[
        bool,  # _match_all
        bool,  # _match_subdomains
        Optional[str],  # _scheme
        Optional[str],  # host
        Optional[str],  # _path
        Optional[int],  # _port
    ]:
        """Get a pattern with information used for __eq__/__hash__."""
        return (self._match_all, self._match_subdomains, self._scheme,
                self.host, self._path, self._port)

    def __hash__(self) -> int:
        """Hash the parsed components (not the original pattern string)."""
        return hash(self._to_tuple())

    def __eq__(self, other: Any) -> bool:
        """Compare the parsed components with another UrlPattern.

        Returns NotImplemented for objects which aren't a UrlPattern.
        """
        if not isinstance(other, UrlPattern):
            return NotImplemented
        return self._to_tuple() == other._to_tuple()

    def __repr__(self) -> str:
        return utils.get_repr(self, pattern=self._pattern, constructor=True)

    def __str__(self) -> str:
        """Get the pattern string as it was originally given."""
        return self._pattern

    def _fixup_pattern(self, pattern: str) -> str:
        """Make sure the given pattern is parseable by urllib.parse.

        The placeholder scheme 'any' is used for "any scheme"; _init_scheme
        translates it back to None.
        """
        if pattern.startswith('*:'):  # Any scheme, but *:// is unparsable
            pattern = 'any:' + pattern[2:]

        # str.startswith accepts a tuple of prefixes.
        schemes = tuple(s + ':' for s in self._SCHEMES_WITHOUT_HOST)
        if '://' not in pattern and not pattern.startswith(schemes):
            pattern = 'any://' + pattern

        # Chromium handles file://foo like file:///foo
        # FIXME This doesn't actually strip the hostname correctly.
        if (pattern.startswith('file://') and
                not pattern.startswith('file:///')):
            pattern = 'file:///' + pattern[len("file://"):]

        return pattern

    def _init_scheme(self, parsed: urllib.parse.ParseResult) -> None:
        """Parse the scheme from the given URL.

        Deviation from Chromium:
        - We assume * when no scheme has been given.

        Raises:
            ParseError: If the parsed URL has an empty scheme.
        """
        if not parsed.scheme:
            raise ParseError("Missing scheme")

        if parsed.scheme == 'any':
            # Placeholder inserted by _fixup_pattern: match every scheme.
            self._scheme = None
            return

        self._scheme = parsed.scheme

    def _init_path(self, parsed: urllib.parse.ParseResult) -> None:
        """Parse the path from the given URL.

        Deviation from Chromium:
        - We assume * when no path has been given.

        Raises:
            ParseError: If an about: pattern has an empty/blank path.
        """
        if self._scheme == 'about' and not parsed.path.strip():
            raise ParseError("Pattern without path")

        if parsed.path == '/*':
            self._path = None
        elif not parsed.path:
            # When the user doesn't add a trailing slash, we assume the pattern
            # matches any path.
            self._path = None
        else:
            self._path = parsed.path

    def _init_host(self, parsed: urllib.parse.ParseResult) -> None:
        """Parse the host from the given URL.

        Deviation from Chromium:
        - http://:1234/ is not a valid URL because it has no host.
        - We don't allow patterns for dot/space hosts which QUrl considers
          invalid.

        Raises:
            ParseError: If the host is missing for a scheme which needs one,
                        is an invalid IPv6 literal, consists only of
                        dots/spaces or uses a wildcard other than '*' or a
                        leading '*.'.
        """
        if parsed.hostname is None or not parsed.hostname.strip():
            if self._scheme not in self._SCHEMES_WITHOUT_HOST:
                raise ParseError("Pattern without host")
            assert self.host is None
            return

        if parsed.netloc.startswith('['):
            # Using QUrl parsing to minimize ipv6 addresses
            url = QUrl()
            url.setHost(parsed.hostname)
            if not url.isValid():
                raise ParseError(url.errorString())
            self.host = url.host()
            return

        if parsed.hostname == '*':
            self._match_subdomains = True
            hostname = None
        elif parsed.hostname.startswith('*.'):
            if len(parsed.hostname) == 2:
                # We don't allow just '*.' as a host.
                raise ParseError("Pattern without host")
            self._match_subdomains = True
            hostname = parsed.hostname[2:]
        elif set(parsed.hostname) in {frozenset('.'), frozenset('. ')}:
            # Host made up of nothing but dots (and spaces).
            raise ParseError("Invalid host")
        else:
            hostname = parsed.hostname

        if hostname is None:
            self.host = None
        elif '*' in hostname:
            # Only * or *.foo is allowed as host.
            raise ParseError("Invalid host wildcard")
        else:
            # Trailing dots are stripped here and in _matches_host, so that
            # both sides are compared in the same form.
            self.host = hostname.rstrip('.')

    def _init_port(self, parsed: urllib.parse.ParseResult) -> None:
        """Parse the port from the given URL.

        Deviation from Chromium:
        - We use None instead of "*" if there's no port filter.

        Raises:
            ParseError: If the port is empty or invalid, or if a port was
                        given for a scheme without an entry in _DEFAULT_PORTS.
        """
        if parsed.netloc.endswith(':*'):
            # We can't access parsed.port as it tries to run int()
            self._port = None
        elif parsed.netloc.endswith(':'):
            raise ParseError("Invalid port: Port is empty")
        else:
            try:
                self._port = parsed.port
            except ValueError as e:
                raise ParseError("Invalid port: {}".format(e)) from e

        scheme_has_port = (self._scheme in self._DEFAULT_PORTS or
                           self._scheme is None)
        if self._port is not None and not scheme_has_port:
            raise ParseError("Ports are unsupported with {} scheme".format(
                self._scheme))

    def _matches_scheme(self, scheme: str) -> bool:
        """Check if the given scheme matches (None matches any scheme)."""
        return self._scheme is None or self._scheme == scheme

    def _matches_host(self, host: str) -> bool:
        """Check if the given host matches the pattern's host.

        A trailing dot on the given host is ignored. Subdomains only match if
        the pattern was given with a '*.' prefix, and never for IP addresses.
        """
        # FIXME what about multiple dots?
        host = host.rstrip('.')

        # If we have no host in the match pattern, that means that we're
        # matching all hosts, which means we have a match no matter what the
        # test host is.
        # Contrary to Chromium, we don't need to check for
        # self._match_subdomains, as we want to return True here for e.g.
        # file:// as well.
        if self.host is None:
            return True

        # If the hosts are exactly equal, we have a match.
        if host == self.host:
            return True

        # Otherwise, we can only match if our match pattern matches subdomains.
        if not self._match_subdomains:
            return False

        # We don't do subdomain matching against IP addresses, so we can give
        # up now if the test host is an IP address.
        if not utils.raises(ValueError, ipaddress.ip_address, host):
            return False

        # Check if the test host is a subdomain of our host: there needs to be
        # at least one character in front of the separating dot.
        if len(host) <= (len(self.host) + 1):
            return False

        return host.endswith('.' + self.host)

    def _matches_port(self, scheme: str, port: int) -> bool:
        """Check if the given port matches (None matches any port).

        A port of -1 is replaced by the scheme's entry in _DEFAULT_PORTS, if
        there is one, before comparing.
        """
        if port == -1 and scheme in self._DEFAULT_PORTS:
            port = self._DEFAULT_PORTS[scheme]
        return self._port is None or self._port == port

    def _matches_path(self, path: str) -> bool:
        """Match the URL's path.

        Deviations from Chromium:
        - Chromium only matches <all_urls> with "javascript:" (pathless); but
          we also match *://*/* and friends.
        """
        if self._path is None:
            return True

        # Match 'google.com' with 'google.com/'
        if path + '/*' == self._path:
            return True

        # FIXME Chromium seems to have a more optimized glob matching which
        # doesn't rely on regexes. Do we need that too?
        return fnmatch.fnmatchcase(path, self._path)

    def matches(self, qurl: QUrl) -> bool:
        """Check if the pattern matches the given QUrl."""
        # NOTE(review): presumably raises for an invalid QUrl - confirm
        # against qtutils.ensure_valid.
        qtutils.ensure_valid(qurl)

        if self._match_all:
            return True

        if not self._matches_scheme(qurl.scheme()):
            return False
        # FIXME ignore for file:// like Chromium?
        if not self._matches_host(qurl.host()):
            return False
        if not self._matches_port(qurl.scheme(), qurl.port()):
            return False
        if not self._matches_path(qurl.path()):
            return False

        return True