1# vim: ft=python fileencoding=utf-8 sts=4 sw=4 et:
2
3# Copyright 2018-2021 Florian Bruhin (The Compiler) <mail@qutebrowser.org>
4#
5# This file is part of qutebrowser.
6#
7# qutebrowser is free software: you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation, either version 3 of the License, or
10# (at your option) any later version.
11#
12# qutebrowser is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with qutebrowser.  If not, see <https://www.gnu.org/licenses/>.
19
20"""A Chromium-like URL matching pattern.
21
22See:
23https://developer.chrome.com/apps/match_patterns
24https://cs.chromium.org/chromium/src/extensions/common/url_pattern.cc
25https://cs.chromium.org/chromium/src/extensions/common/url_pattern.h
26
27Based on the following commit in Chromium:
28https://chromium.googlesource.com/chromium/src/+/6f4a6681eae01c2036336c18b06303e16a304a7c
29(October 10 2020, newest commit as per October 28th 2020)
30"""
31
32import ipaddress
33import fnmatch
34import urllib.parse
35from typing import Any, Optional, Tuple
36
37from PyQt5.QtCore import QUrl
38
39from qutebrowser.utils import utils, qtutils
40
41
class ParseError(Exception):
    """Error signalling that a URL pattern string could not be parsed."""
45
46
class UrlPattern:

    """A Chromium-like URL matching pattern.

    The pattern string is split into scheme, host, path and port components
    when the object is created; matches() then compares a QUrl against them.

    Class attributes:
        _DEFAULT_PORTS: The default ports used for schemes which support ports.
        _SCHEMES_WITHOUT_HOST: Schemes which don't need a host.

    Attributes:
        host: The host to match to, or None for any host.
        _pattern: The given pattern as string.
        _match_all: Whether the pattern should match all URLs.
        _match_subdomains: Whether the pattern should match subdomains of the
                           given host.
        _scheme: The scheme to match to, or None to match any scheme.
                 Note that with Chromium, '*'/None only matches http/https and
                 not file/ftp. We deviate from that as per-URL settings aren't
                 security relevant.
        _path: The path to match to, or None for any path.
        _port: The port to match to as integer, or None for any port.
    """

    _DEFAULT_PORTS = {'https': 443, 'http': 80, 'ftp': 21}
    _SCHEMES_WITHOUT_HOST = ['about', 'file', 'data', 'javascript']

    def __init__(self, pattern: str) -> None:
        """Parse the given pattern string.

        Args:
            pattern: The pattern to parse, e.g. '*://*.example.com/*', or the
                     special value '<all_urls>'.

        Raises:
            ParseError: If the pattern contains a NUL byte or if one of its
                        components can't be parsed.
        """
        # Make sure all attributes are initialized if we exit early.
        self._pattern = pattern
        self._match_all = False
        self._match_subdomains: bool = False
        self._scheme: Optional[str] = None
        self.host: Optional[str] = None
        self._path: Optional[str] = None
        self._port: Optional[int] = None

        # > The special pattern <all_urls> matches any URL that starts with a
        # > permitted scheme.
        if pattern == '<all_urls>':
            self._match_all = True
            return

        if '\0' in pattern:
            raise ParseError("May not contain NUL byte")

        pattern = self._fixup_pattern(pattern)

        # We use urllib.parse instead of QUrl here because it can handle
        # hosts with * in them.
        try:
            parsed = urllib.parse.urlparse(pattern)
        except ValueError as e:
            # Chain explicitly so the urllib error is kept as __cause__.
            raise ParseError(str(e)) from e

        assert parsed is not None

        # The scheme has to be initialized first, as the host/path/port
        # parsing below reads self._scheme.
        self._init_scheme(parsed)
        self._init_host(parsed)
        self._init_path(parsed)
        self._init_port(parsed)

    def _to_tuple(self) -> Tuple[
        bool,  # _match_all
        bool,  # _match_subdomains
        Optional[str],  # _scheme
        Optional[str],  # host
        Optional[str],  # _path
        Optional[int],  # _port
    ]:
        """Get a tuple with the information used for __eq__/__hash__.

        The original pattern string is deliberately not part of the tuple, so
        that differently spelled patterns which parse to the same components
        compare equal.
        """
        return (self._match_all, self._match_subdomains, self._scheme,
                self.host, self._path, self._port)

    def __hash__(self) -> int:
        """Hash based on the parsed components (consistent with __eq__)."""
        return hash(self._to_tuple())

    def __eq__(self, other: Any) -> bool:
        """Compare the parsed components with another UrlPattern."""
        if not isinstance(other, UrlPattern):
            return NotImplemented
        return self._to_tuple() == other._to_tuple()

    def __repr__(self) -> str:
        return utils.get_repr(self, pattern=self._pattern, constructor=True)

    def __str__(self) -> str:
        """Get the pattern string exactly as it was passed to __init__."""
        return self._pattern

    def _fixup_pattern(self, pattern: str) -> str:
        """Make sure the given pattern is parseable by urllib.parse.

        Args:
            pattern: The pattern as given by the user.

        Return:
            The pattern with a wildcard scheme replaced by (or a missing
            scheme filled in with) the placeholder scheme 'any', and with
            file://foo rewritten to file:///foo.
        """
        if pattern.startswith('*:'):  # Any scheme, but *:// is unparsable
            pattern = 'any:' + pattern[2:]

        schemes = tuple(s + ':' for s in self._SCHEMES_WITHOUT_HOST)
        if '://' not in pattern and not pattern.startswith(schemes):
            # No scheme given at all, treat it like a wildcard scheme.
            pattern = 'any://' + pattern

        # Chromium handles file://foo like file:///foo
        # FIXME This doesn't actually strip the hostname correctly.
        if (pattern.startswith('file://') and
                not pattern.startswith('file:///')):
            pattern = 'file:///' + pattern[len("file://"):]

        return pattern

    def _init_scheme(self, parsed: urllib.parse.ParseResult) -> None:
        """Parse the scheme from the given URL.

        The placeholder scheme 'any' (inserted by _fixup_pattern) is stored as
        None, which means "match every scheme".

        Deviation from Chromium:
        - We assume * when no scheme has been given.

        Args:
            parsed: The parsed (fixed up) pattern.

        Raises:
            ParseError: If the parsed pattern has no scheme.
        """
        if not parsed.scheme:
            raise ParseError("Missing scheme")

        if parsed.scheme == 'any':
            self._scheme = None
            return

        self._scheme = parsed.scheme

    def _init_path(self, parsed: urllib.parse.ParseResult) -> None:
        """Parse the path from the given URL.

        Deviation from Chromium:
        - We assume * when no path has been given.

        Args:
            parsed: The parsed (fixed up) pattern.

        Raises:
            ParseError: If an about: pattern has an empty path.
        """
        if self._scheme == 'about' and not parsed.path.strip():
            raise ParseError("Pattern without path")

        if parsed.path == '/*':
            self._path = None
        elif not parsed.path:
            # When the user doesn't add a trailing slash, we assume the pattern
            # matches any path.
            self._path = None
        else:
            self._path = parsed.path

    def _init_host(self, parsed: urllib.parse.ParseResult) -> None:
        """Parse the host from the given URL.

        Sets self.host and self._match_subdomains.

        Deviation from Chromium:
        - http://:1234/ is not a valid URL because it has no host.
        - We don't allow patterns for dot/space hosts which QUrl considers
          invalid.

        Args:
            parsed: The parsed (fixed up) pattern.

        Raises:
            ParseError: If the host is missing for a scheme which needs one,
                        is an invalid IPv6 address, consists only of
                        dots/spaces or contains a wildcard anywhere but in a
                        leading '*' / '*.' component.
        """
        if parsed.hostname is None or not parsed.hostname.strip():
            if self._scheme not in self._SCHEMES_WITHOUT_HOST:
                raise ParseError("Pattern without host")
            assert self.host is None
            return

        if parsed.netloc.startswith('['):
            # Using QUrl parsing to minimize ipv6 addresses
            url = QUrl()
            url.setHost(parsed.hostname)
            if not url.isValid():
                raise ParseError(url.errorString())
            self.host = url.host()
            return

        if parsed.hostname == '*':
            self._match_subdomains = True
            hostname = None
        elif parsed.hostname.startswith('*.'):
            if len(parsed.hostname) == 2:
                # We don't allow just '*.' as a host.
                raise ParseError("Pattern without host")
            self._match_subdomains = True
            hostname = parsed.hostname[2:]
        elif set(parsed.hostname) in {frozenset('.'), frozenset('. ')}:
            # Host only consists of dots (and possibly spaces).
            raise ParseError("Invalid host")
        else:
            hostname = parsed.hostname

        if hostname is None:
            self.host = None
        elif '*' in hostname:
            # Only * or *.foo is allowed as host.
            raise ParseError("Invalid host wildcard")
        else:
            # Trailing dots are stripped here and in _matches_host, so that
            # 'example.com.' and 'example.com' are treated the same.
            self.host = hostname.rstrip('.')

    def _init_port(self, parsed: urllib.parse.ParseResult) -> None:
        """Parse the port from the given URL.

        Deviation from Chromium:
        - We use None instead of "*" if there's no port filter.

        Args:
            parsed: The parsed (fixed up) pattern.

        Raises:
            ParseError: If the port is empty, can't be parsed by urllib, or if
                        a port was given for a scheme without port support.
        """
        if parsed.netloc.endswith(':*'):
            # We can't access parsed.port as it tries to run int()
            self._port = None
        elif parsed.netloc.endswith(':'):
            raise ParseError("Invalid port: Port is empty")
        else:
            try:
                self._port = parsed.port
            except ValueError as e:
                # Chain explicitly so the urllib error is kept as __cause__.
                raise ParseError("Invalid port: {}".format(e)) from e

        # A wildcard scheme (None) may carry a port as well.
        scheme_has_port = (self._scheme is None or
                           self._scheme in self._DEFAULT_PORTS)
        if self._port is not None and not scheme_has_port:
            raise ParseError("Ports are unsupported with {} scheme".format(
                self._scheme))

    def _matches_scheme(self, scheme: str) -> bool:
        """Check whether the given URL scheme is accepted by the pattern."""
        return self._scheme is None or self._scheme == scheme

    def _matches_host(self, host: str) -> bool:
        """Check whether the given URL host is accepted by the pattern.

        Args:
            host: The host of the URL which is being matched.

        Return:
            True if the pattern has no host filter, if the hosts are equal
            (ignoring trailing dots), or if subdomain matching is enabled and
            the given (non-IP) host is a subdomain of the pattern's host.
        """
        # FIXME what about multiple dots?
        host = host.rstrip('.')

        # If we have no host in the match pattern, that means that we're
        # matching all hosts, which means we have a match no matter what the
        # test host is.
        # Contrary to Chromium, we don't need to check for
        # self._match_subdomains, as we want to return True here for e.g.
        # file:// as well.
        if self.host is None:
            return True

        # If the hosts are exactly equal, we have a match.
        if host == self.host:
            return True

        # Otherwise, we can only match if our match pattern matches subdomains.
        if not self._match_subdomains:
            return False

        # We don't do subdomain matching against IP addresses, so we can give
        # up now if the test host is an IP address.
        if not utils.raises(ValueError, ipaddress.ip_address, host):
            return False

        # Check if the test host is a subdomain of our host.
        if len(host) <= (len(self.host) + 1):
            return False

        if not host.endswith(self.host):
            return False

        # The character right before the matched suffix needs to be a dot,
        # otherwise 'notexample.com' would match '*.example.com'.
        return host[len(host) - len(self.host) - 1] == '.'

    def _matches_port(self, scheme: str, port: int) -> bool:
        """Check whether the given URL port is accepted by the pattern.

        Args:
            scheme: The scheme of the URL, used to look up the default port.
            port: The port of the URL; -1 is replaced by the scheme's default
                  port if the scheme has one.
        """
        if port == -1 and scheme in self._DEFAULT_PORTS:
            port = self._DEFAULT_PORTS[scheme]
        return self._port is None or self._port == port

    def _matches_path(self, path: str) -> bool:
        """Match the URL's path.

        Deviations from Chromium:
        - Chromium only matches <all_urls> with "javascript:" (pathless); but
          we also match *://*/* and friends.

        Args:
            path: The path of the URL which is being matched.

        Return:
            True if the pattern has no path filter or if the path matches the
            pattern's path glob (compared case-sensitively).
        """
        if self._path is None:
            return True

        # Match 'google.com' with 'google.com/'
        if path + '/*' == self._path:
            return True

        # FIXME Chromium seems to have a more optimized glob matching which
        # doesn't rely on regexes. Do we need that too?
        return fnmatch.fnmatchcase(path, self._path)

    def matches(self, qurl: QUrl) -> bool:
        """Check if the pattern matches the given QUrl.

        The URL is passed to qtutils.ensure_valid() first; after that, scheme,
        host, port and path all need to match (or the pattern needs to be
        '<all_urls>').

        Args:
            qurl: The URL to check.
        """
        qtutils.ensure_valid(qurl)

        if self._match_all:
            return True

        if not self._matches_scheme(qurl.scheme()):
            return False
        # FIXME ignore for file:// like Chromium?
        if not self._matches_host(qurl.host()):
            return False
        if not self._matches_port(qurl.scheme(), qurl.port()):
            return False
        if not self._matches_path(qurl.path()):
            return False

        return True
330