1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // The matching logic distinguishes between the terms URL pattern and
6 // subpattern. A URL pattern usually stands for the full thing, e.g.
7 // "example.com^*path*par=val^", whereas subpattern denotes a maximal substring
8 // of a pattern not containing the wildcard '*' character. For the example above
9 // the subpatterns are: "example.com^", "path" and "par=val^".
10 //
11 // The separator placeholder '^' symbol is used in subpatterns to match any
12 // separator character, which is any ASCII symbol except letters, digits, and
13 // the following: '_', '-', '.', '%'. Note that the separator placeholder
14 // character '^' is itself a separator, as well as '\0'.
15
16 #include "components/url_pattern_index/url_pattern.h"
17
18 #include <stddef.h>
19
20 #include <algorithm>
21 #include <ostream>
22
23 #include "base/check_op.h"
24 #include "base/notreached.h"
25 #include "base/numerics/checked_math.h"
26 #include "base/strings/string_util.h"
27 #include "components/url_pattern_index/flat/url_pattern_index_generated.h"
28 #include "components/url_pattern_index/fuzzy_pattern_matching.h"
29 #include "components/url_pattern_index/string_splitter.h"
30 #include "url/gurl.h"
31 #include "url/third_party/mozilla/url_parse.h"
32
33 namespace url_pattern_index {
34
35 namespace {
36
37 constexpr char kWildcard = '*';
38
39 class IsWildcard {
40 public:
operator ()(char c) const41 bool operator()(char c) const { return c == kWildcard; }
42 };
43
ConvertUrlPatternType(flat::UrlPatternType type)44 proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) {
45 switch (type) {
46 case flat::UrlPatternType_SUBSTRING:
47 return proto::URL_PATTERN_TYPE_SUBSTRING;
48 case flat::UrlPatternType_WILDCARDED:
49 return proto::URL_PATTERN_TYPE_WILDCARDED;
50 case flat::UrlPatternType_REGEXP:
51 return proto::URL_PATTERN_TYPE_REGEXP;
52 default:
53 return proto::URL_PATTERN_TYPE_UNSPECIFIED;
54 }
55 }
56
ConvertAnchorType(flat::AnchorType type)57 proto::AnchorType ConvertAnchorType(flat::AnchorType type) {
58 switch (type) {
59 case flat::AnchorType_NONE:
60 return proto::ANCHOR_TYPE_NONE;
61 case flat::AnchorType_BOUNDARY:
62 return proto::ANCHOR_TYPE_BOUNDARY;
63 case flat::AnchorType_SUBDOMAIN:
64 return proto::ANCHOR_TYPE_SUBDOMAIN;
65 default:
66 return proto::ANCHOR_TYPE_UNSPECIFIED;
67 }
68 }
69
ConvertString(const flatbuffers::String * string)70 base::StringPiece ConvertString(const flatbuffers::String* string) {
71 return string ? base::StringPiece(string->data(), string->size())
72 : base::StringPiece();
73 }
74
HasAnyUpperAscii(base::StringPiece string)75 bool HasAnyUpperAscii(base::StringPiece string) {
76 return std::any_of(string.begin(), string.end(), base::IsAsciiUpper<char>);
77 }
78
79 // Returns whether |position| within the |url| belongs to its |host| component
80 // and corresponds to the beginning of a (sub-)domain.
IsSubdomainAnchored(base::StringPiece url,url::Component host,size_t position)81 inline bool IsSubdomainAnchored(base::StringPiece url,
82 url::Component host,
83 size_t position) {
84 DCHECK_LE(position, url.size());
85 const size_t host_begin = static_cast<size_t>(host.begin);
86 const size_t host_end = static_cast<size_t>(host.end());
87 DCHECK_LE(host_end, url.size());
88
89 return position == host_begin ||
90 (position > host_begin && position <= host_end &&
91 url[position - 1] == '.');
92 }
93
94 // Returns the position of the leftmost occurrence of a |subpattern| in the
95 // |text| starting no earlier than |from| the specified position. If the
96 // |subpattern| has separator placeholders, searches for a fuzzy occurrence.
FindSubpattern(base::StringPiece text,base::StringPiece subpattern,size_t from=0)97 size_t FindSubpattern(base::StringPiece text,
98 base::StringPiece subpattern,
99 size_t from = 0) {
100 const bool is_fuzzy =
101 (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos);
102 return is_fuzzy ? FindFuzzy(text, subpattern, from)
103 : text.find(subpattern, from);
104 }
105
106 // Same as FindSubpattern(url, subpattern), but searches for an occurrence that
107 // starts at the beginning of a (sub-)domain within the url's |host| component.
FindSubdomainAnchoredSubpattern(base::StringPiece url,url::Component host,base::StringPiece subpattern)108 size_t FindSubdomainAnchoredSubpattern(base::StringPiece url,
109 url::Component host,
110 base::StringPiece subpattern) {
111 const bool is_fuzzy =
112 (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos);
113
114 // Any match found after the end of the host will be discarded, so just
115 // avoid searching there for the subpattern to begin with.
116 //
117 // Check for overflow.
118 size_t max_match_end = 0;
119 if (!base::CheckAdd(host.end(), subpattern.length())
120 .AssignIfValid(&max_match_end)) {
121 return base::StringPiece::npos;
122 }
123 const base::StringPiece url_match_candidate = url.substr(0, max_match_end);
124 const base::StringPiece url_host = url.substr(0, host.end());
125
126 for (size_t position = static_cast<size_t>(host.begin);
127 position <= static_cast<size_t>(host.end()); ++position) {
128 // Enforce as a loop precondition that we are always anchored at a
129 // sub-domain before calling find. This is to reduce the number of potential
130 // searches for |subpattern|.
131 DCHECK(IsSubdomainAnchored(url, host, position));
132
133 position = is_fuzzy ? FindFuzzy(url_match_candidate, subpattern, position)
134 : url_match_candidate.find(subpattern, position);
135 if (position == base::StringPiece::npos ||
136 IsSubdomainAnchored(url, host, position)) {
137 return position;
138 }
139
140 // Enforce the loop precondition. This skips |position| to the next '.',
141 // within the host, which the loop itself increments to the anchored
142 // sub-domain.
143 position = url_host.find('.', position);
144 if (position == base::StringPiece::npos)
145 break;
146 }
147 return base::StringPiece::npos;
148 }
149
150 // Helper for DoesTextMatchLastSubpattern. Treats kSeparatorPlaceholder as not
151 // matching the end of the text.
DoesTextMatchLastSubpatternInternal(proto::AnchorType anchor_left,proto::AnchorType anchor_right,base::StringPiece text,url::Component url_host,base::StringPiece subpattern)152 bool DoesTextMatchLastSubpatternInternal(proto::AnchorType anchor_left,
153 proto::AnchorType anchor_right,
154 base::StringPiece text,
155 url::Component url_host,
156 base::StringPiece subpattern) {
157 // Enumerate all possible combinations of |anchor_left| and |anchor_right|.
158 if (anchor_left == proto::ANCHOR_TYPE_NONE &&
159 anchor_right == proto::ANCHOR_TYPE_NONE) {
160 return FindSubpattern(text, subpattern) != base::StringPiece::npos;
161 }
162
163 if (anchor_left == proto::ANCHOR_TYPE_NONE &&
164 anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
165 return EndsWithFuzzy(text, subpattern);
166 }
167
168 if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY &&
169 anchor_right == proto::ANCHOR_TYPE_NONE) {
170 return StartsWithFuzzy(text, subpattern);
171 }
172
173 if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY &&
174 anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
175 return text.size() == subpattern.size() &&
176 StartsWithFuzzy(text, subpattern);
177 }
178
179 if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN &&
180 anchor_right == proto::ANCHOR_TYPE_NONE) {
181 return url_host.is_nonempty() &&
182 FindSubdomainAnchoredSubpattern(text, url_host, subpattern) !=
183 base::StringPiece::npos;
184 }
185
186 if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN &&
187 anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
188 return url_host.is_nonempty() && text.size() >= subpattern.size() &&
189 IsSubdomainAnchored(text, url_host,
190 text.size() - subpattern.size()) &&
191 EndsWithFuzzy(text, subpattern);
192 }
193
194 NOTREACHED();
195 return false;
196 }
197
198 // Matches the last |subpattern| against |text|. Special treatment is required
199 // for the last subpattern since |kSeparatorPlaceholder| can also match the end
200 // of the text.
DoesTextMatchLastSubpattern(proto::AnchorType anchor_left,proto::AnchorType anchor_right,base::StringPiece text,url::Component url_host,base::StringPiece subpattern)201 bool DoesTextMatchLastSubpattern(proto::AnchorType anchor_left,
202 proto::AnchorType anchor_right,
203 base::StringPiece text,
204 url::Component url_host,
205 base::StringPiece subpattern) {
206 DCHECK(!subpattern.empty());
207
208 if (DoesTextMatchLastSubpatternInternal(anchor_left, anchor_right, text,
209 url_host, subpattern)) {
210 return true;
211 }
212
213 // If the last |subpattern| ends with kSeparatorPlaceholder, then it can also
214 // match the end of text.
215 if (subpattern.back() == kSeparatorPlaceholder) {
216 subpattern.remove_suffix(1);
217 return DoesTextMatchLastSubpatternInternal(
218 anchor_left, proto::ANCHOR_TYPE_BOUNDARY, text, url_host, subpattern);
219 }
220
221 return false;
222 }
223
224 // Returns whether the given |url_pattern| matches the given |url_spec|.
225 // Compares the pattern the the url in a case-sensitive manner.
IsCaseSensitiveMatch(base::StringPiece url_pattern,proto::AnchorType anchor_left,proto::AnchorType anchor_right,base::StringPiece url_spec,url::Component url_host)226 bool IsCaseSensitiveMatch(base::StringPiece url_pattern,
227 proto::AnchorType anchor_left,
228 proto::AnchorType anchor_right,
229 base::StringPiece url_spec,
230 url::Component url_host) {
231 DCHECK(!url_spec.empty());
232
233 StringSplitter<IsWildcard> subpatterns(url_pattern);
234 auto subpattern_it = subpatterns.begin();
235 auto subpattern_end = subpatterns.end();
236
237 // No subpatterns.
238 if (subpattern_it == subpattern_end) {
239 return anchor_left == proto::ANCHOR_TYPE_NONE ||
240 anchor_right == proto::ANCHOR_TYPE_NONE;
241 }
242
243 base::StringPiece subpattern = *subpattern_it;
244 ++subpattern_it;
245
246 // There is only one |subpattern|.
247 if (subpattern_it == subpattern_end) {
248 return DoesTextMatchLastSubpattern(anchor_left, anchor_right, url_spec,
249 url_host, subpattern);
250 }
251
252 // Otherwise, the first |subpattern| does not have to be a suffix. But it
253 // still can have a left anchor. Check and handle that.
254 base::StringPiece text = url_spec;
255 if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY) {
256 if (!StartsWithFuzzy(url_spec, subpattern))
257 return false;
258 text.remove_prefix(subpattern.size());
259 } else if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) {
260 if (!url_host.is_nonempty())
261 return false;
262 const size_t match_begin =
263 FindSubdomainAnchoredSubpattern(url_spec, url_host, subpattern);
264 if (match_begin == base::StringPiece::npos)
265 return false;
266 text.remove_prefix(match_begin + subpattern.size());
267 } else {
268 DCHECK_EQ(anchor_left, proto::ANCHOR_TYPE_NONE);
269 // Get back to the initial |subpattern|, process it in the loop below.
270 subpattern_it = subpatterns.begin();
271 }
272
273 DCHECK(subpattern_it != subpattern_end);
274 subpattern = *subpattern_it;
275
276 // Consecutively find all the remaining subpatterns in the |text|. Handle the
277 // last subpattern outside the loop.
278 while (++subpattern_it != subpattern_end) {
279 DCHECK(!subpattern.empty());
280
281 const size_t match_position = FindSubpattern(text, subpattern);
282 if (match_position == base::StringPiece::npos)
283 return false;
284 text.remove_prefix(match_position + subpattern.size());
285
286 subpattern = *subpattern_it;
287 }
288
289 return DoesTextMatchLastSubpattern(proto::ANCHOR_TYPE_NONE, anchor_right,
290 text, url::Component(), subpattern);
291 }
292
293 } // namespace
294
UrlInfo(const GURL & url)295 UrlPattern::UrlInfo::UrlInfo(const GURL& url)
296 : spec_(url.possibly_invalid_spec()),
297 host_(url.parsed_for_possibly_invalid_spec().host) {
298 DCHECK(url.is_valid());
299 }
300
GetLowerCaseSpec() const301 base::StringPiece UrlPattern::UrlInfo::GetLowerCaseSpec() const {
302 if (lower_case_spec_cached_)
303 return *lower_case_spec_cached_;
304
305 if (!HasAnyUpperAscii(spec_)) {
306 lower_case_spec_cached_ = spec_;
307 } else {
308 lower_case_spec_owner_ = base::ToLowerASCII(spec_);
309 lower_case_spec_cached_ = lower_case_spec_owner_;
310 }
311 return *lower_case_spec_cached_;
312 }
313
314 UrlPattern::UrlInfo::~UrlInfo() = default;
315
316 UrlPattern::UrlPattern() = default;
317
UrlPattern(base::StringPiece url_pattern,proto::UrlPatternType type,MatchCase match_case)318 UrlPattern::UrlPattern(base::StringPiece url_pattern,
319 proto::UrlPatternType type,
320 MatchCase match_case)
321 : type_(type), url_pattern_(url_pattern), match_case_(match_case) {}
322
UrlPattern(base::StringPiece url_pattern,proto::AnchorType anchor_left,proto::AnchorType anchor_right)323 UrlPattern::UrlPattern(base::StringPiece url_pattern,
324 proto::AnchorType anchor_left,
325 proto::AnchorType anchor_right)
326 : type_(proto::URL_PATTERN_TYPE_WILDCARDED),
327 url_pattern_(url_pattern),
328 anchor_left_(anchor_left),
329 anchor_right_(anchor_right) {}
330
UrlPattern(const flat::UrlRule & rule)331 UrlPattern::UrlPattern(const flat::UrlRule& rule)
332 : type_(ConvertUrlPatternType(rule.url_pattern_type())),
333 url_pattern_(ConvertString(rule.url_pattern())),
334 anchor_left_(ConvertAnchorType(rule.anchor_left())),
335 anchor_right_(ConvertAnchorType(rule.anchor_right())),
336 match_case_(rule.options() & flat::OptionFlag_IS_CASE_INSENSITIVE
337 ? MatchCase::kFalse
338 : MatchCase::kTrue) {}
339
340 UrlPattern::~UrlPattern() = default;
341
MatchesUrl(const UrlInfo & url) const342 bool UrlPattern::MatchesUrl(const UrlInfo& url) const {
343 DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING ||
344 type_ == proto::URL_PATTERN_TYPE_WILDCARDED);
345 DCHECK(base::IsStringASCII(url_pattern_));
346 DCHECK(base::IsStringASCII(url.spec()));
347 DCHECK(base::IsStringASCII(url.GetLowerCaseSpec()));
348
349 // Pre-process patterns to ensure left anchored and right anchored patterns
350 // don't begin and end with a wildcard respectively i.e. change "|*xyz" to
351 // "*xyz" and "xyz*|" to "xyz*".
352 proto::AnchorType anchor_left = anchor_left_;
353 proto::AnchorType anchor_right = anchor_right_;
354 if (!url_pattern_.empty()) {
355 if (url_pattern_.front() == kWildcard) {
356 // Note: We don't handle "||*" and expect clients to disallow it.
357 DCHECK_NE(proto::ANCHOR_TYPE_SUBDOMAIN, anchor_left_);
358 anchor_left = proto::ANCHOR_TYPE_NONE;
359 }
360 if (url_pattern_.back() == kWildcard)
361 anchor_right = proto::ANCHOR_TYPE_NONE;
362 }
363
364 if (match_case()) {
365 return IsCaseSensitiveMatch(url_pattern_, anchor_left, anchor_right,
366 url.spec(), url.host());
367 }
368
369 // Use the lower-cased url for case-insensitive comparison. Case-insensitive
370 // patterns should already be lower-cased.
371 DCHECK(!HasAnyUpperAscii(url_pattern_));
372 return IsCaseSensitiveMatch(url_pattern_, anchor_left, anchor_right,
373 url.GetLowerCaseSpec(), url.host());
374 }
375
operator <<(std::ostream & out,const UrlPattern & pattern)376 std::ostream& operator<<(std::ostream& out, const UrlPattern& pattern) {
377 switch (pattern.anchor_left()) {
378 case proto::ANCHOR_TYPE_SUBDOMAIN:
379 out << '|';
380 FALLTHROUGH;
381 case proto::ANCHOR_TYPE_BOUNDARY:
382 out << '|';
383 FALLTHROUGH;
384 default:
385 break;
386 }
387 out << pattern.url_pattern();
388 if (pattern.anchor_right() == proto::ANCHOR_TYPE_BOUNDARY)
389 out << '|';
390 if (pattern.match_case())
391 out << "$match-case";
392
393 return out;
394 }
395
396 } // namespace url_pattern_index
397