1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // The matching logic distinguishes between the terms URL pattern and
6 // subpattern. A URL pattern usually stands for the full thing, e.g.
7 // "example.com^*path*par=val^", whereas subpattern denotes a maximal substring
8 // of a pattern not containing the wildcard '*' character. For the example above
9 // the subpatterns are: "example.com^", "path" and "par=val^".
10 //
11 // The separator placeholder '^' symbol is used in subpatterns to match any
12 // separator character, which is any ASCII symbol except letters, digits, and
13 // the following: '_', '-', '.', '%'. Note that the separator placeholder
14 // character '^' is itself a separator, as well as '\0'.
15 
16 #include "components/url_pattern_index/url_pattern.h"
17 
18 #include <stddef.h>
19 
20 #include <algorithm>
21 #include <ostream>
22 
23 #include "base/check_op.h"
24 #include "base/notreached.h"
25 #include "base/numerics/checked_math.h"
26 #include "base/strings/string_util.h"
27 #include "components/url_pattern_index/flat/url_pattern_index_generated.h"
28 #include "components/url_pattern_index/fuzzy_pattern_matching.h"
29 #include "components/url_pattern_index/string_splitter.h"
30 #include "url/gurl.h"
31 #include "url/third_party/mozilla/url_parse.h"
32 
33 namespace url_pattern_index {
34 
35 namespace {
36 
37 constexpr char kWildcard = '*';
38 
39 class IsWildcard {
40  public:
operator ()(char c) const41   bool operator()(char c) const { return c == kWildcard; }
42 };
43 
ConvertUrlPatternType(flat::UrlPatternType type)44 proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) {
45   switch (type) {
46     case flat::UrlPatternType_SUBSTRING:
47       return proto::URL_PATTERN_TYPE_SUBSTRING;
48     case flat::UrlPatternType_WILDCARDED:
49       return proto::URL_PATTERN_TYPE_WILDCARDED;
50     case flat::UrlPatternType_REGEXP:
51       return proto::URL_PATTERN_TYPE_REGEXP;
52     default:
53       return proto::URL_PATTERN_TYPE_UNSPECIFIED;
54   }
55 }
56 
ConvertAnchorType(flat::AnchorType type)57 proto::AnchorType ConvertAnchorType(flat::AnchorType type) {
58   switch (type) {
59     case flat::AnchorType_NONE:
60       return proto::ANCHOR_TYPE_NONE;
61     case flat::AnchorType_BOUNDARY:
62       return proto::ANCHOR_TYPE_BOUNDARY;
63     case flat::AnchorType_SUBDOMAIN:
64       return proto::ANCHOR_TYPE_SUBDOMAIN;
65     default:
66       return proto::ANCHOR_TYPE_UNSPECIFIED;
67   }
68 }
69 
ConvertString(const flatbuffers::String * string)70 base::StringPiece ConvertString(const flatbuffers::String* string) {
71   return string ? base::StringPiece(string->data(), string->size())
72                 : base::StringPiece();
73 }
74 
HasAnyUpperAscii(base::StringPiece string)75 bool HasAnyUpperAscii(base::StringPiece string) {
76   return std::any_of(string.begin(), string.end(), base::IsAsciiUpper<char>);
77 }
78 
79 // Returns whether |position| within the |url| belongs to its |host| component
80 // and corresponds to the beginning of a (sub-)domain.
IsSubdomainAnchored(base::StringPiece url,url::Component host,size_t position)81 inline bool IsSubdomainAnchored(base::StringPiece url,
82                                 url::Component host,
83                                 size_t position) {
84   DCHECK_LE(position, url.size());
85   const size_t host_begin = static_cast<size_t>(host.begin);
86   const size_t host_end = static_cast<size_t>(host.end());
87   DCHECK_LE(host_end, url.size());
88 
89   return position == host_begin ||
90          (position > host_begin && position <= host_end &&
91           url[position - 1] == '.');
92 }
93 
94 // Returns the position of the leftmost occurrence of a |subpattern| in the
95 // |text| starting no earlier than |from| the specified position. If the
96 // |subpattern| has separator placeholders, searches for a fuzzy occurrence.
FindSubpattern(base::StringPiece text,base::StringPiece subpattern,size_t from=0)97 size_t FindSubpattern(base::StringPiece text,
98                       base::StringPiece subpattern,
99                       size_t from = 0) {
100   const bool is_fuzzy =
101       (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos);
102   return is_fuzzy ? FindFuzzy(text, subpattern, from)
103                   : text.find(subpattern, from);
104 }
105 
106 // Same as FindSubpattern(url, subpattern), but searches for an occurrence that
107 // starts at the beginning of a (sub-)domain within the url's |host| component.
FindSubdomainAnchoredSubpattern(base::StringPiece url,url::Component host,base::StringPiece subpattern)108 size_t FindSubdomainAnchoredSubpattern(base::StringPiece url,
109                                        url::Component host,
110                                        base::StringPiece subpattern) {
111   const bool is_fuzzy =
112       (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos);
113 
114   // Any match found after the end of the host will be discarded, so just
115   // avoid searching there for the subpattern to begin with.
116   //
117   // Check for overflow.
118   size_t max_match_end = 0;
119   if (!base::CheckAdd(host.end(), subpattern.length())
120            .AssignIfValid(&max_match_end)) {
121     return base::StringPiece::npos;
122   }
123   const base::StringPiece url_match_candidate = url.substr(0, max_match_end);
124   const base::StringPiece url_host = url.substr(0, host.end());
125 
126   for (size_t position = static_cast<size_t>(host.begin);
127        position <= static_cast<size_t>(host.end()); ++position) {
128     // Enforce as a loop precondition that we are always anchored at a
129     // sub-domain before calling find. This is to reduce the number of potential
130     // searches for |subpattern|.
131     DCHECK(IsSubdomainAnchored(url, host, position));
132 
133     position = is_fuzzy ? FindFuzzy(url_match_candidate, subpattern, position)
134                         : url_match_candidate.find(subpattern, position);
135     if (position == base::StringPiece::npos ||
136         IsSubdomainAnchored(url, host, position)) {
137       return position;
138     }
139 
140     // Enforce the loop precondition. This skips |position| to the next '.',
141     // within the host, which the loop itself increments to the anchored
142     // sub-domain.
143     position = url_host.find('.', position);
144     if (position == base::StringPiece::npos)
145       break;
146   }
147   return base::StringPiece::npos;
148 }
149 
150 // Helper for DoesTextMatchLastSubpattern. Treats kSeparatorPlaceholder as not
151 // matching the end of the text.
DoesTextMatchLastSubpatternInternal(proto::AnchorType anchor_left,proto::AnchorType anchor_right,base::StringPiece text,url::Component url_host,base::StringPiece subpattern)152 bool DoesTextMatchLastSubpatternInternal(proto::AnchorType anchor_left,
153                                          proto::AnchorType anchor_right,
154                                          base::StringPiece text,
155                                          url::Component url_host,
156                                          base::StringPiece subpattern) {
157   // Enumerate all possible combinations of |anchor_left| and |anchor_right|.
158   if (anchor_left == proto::ANCHOR_TYPE_NONE &&
159       anchor_right == proto::ANCHOR_TYPE_NONE) {
160     return FindSubpattern(text, subpattern) != base::StringPiece::npos;
161   }
162 
163   if (anchor_left == proto::ANCHOR_TYPE_NONE &&
164       anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
165     return EndsWithFuzzy(text, subpattern);
166   }
167 
168   if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY &&
169       anchor_right == proto::ANCHOR_TYPE_NONE) {
170     return StartsWithFuzzy(text, subpattern);
171   }
172 
173   if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY &&
174       anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
175     return text.size() == subpattern.size() &&
176            StartsWithFuzzy(text, subpattern);
177   }
178 
179   if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN &&
180       anchor_right == proto::ANCHOR_TYPE_NONE) {
181     return url_host.is_nonempty() &&
182            FindSubdomainAnchoredSubpattern(text, url_host, subpattern) !=
183                base::StringPiece::npos;
184   }
185 
186   if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN &&
187       anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
188     return url_host.is_nonempty() && text.size() >= subpattern.size() &&
189            IsSubdomainAnchored(text, url_host,
190                                text.size() - subpattern.size()) &&
191            EndsWithFuzzy(text, subpattern);
192   }
193 
194   NOTREACHED();
195   return false;
196 }
197 
198 // Matches the last |subpattern| against |text|. Special treatment is required
199 // for the last subpattern since |kSeparatorPlaceholder| can also match the end
200 // of the text.
DoesTextMatchLastSubpattern(proto::AnchorType anchor_left,proto::AnchorType anchor_right,base::StringPiece text,url::Component url_host,base::StringPiece subpattern)201 bool DoesTextMatchLastSubpattern(proto::AnchorType anchor_left,
202                                  proto::AnchorType anchor_right,
203                                  base::StringPiece text,
204                                  url::Component url_host,
205                                  base::StringPiece subpattern) {
206   DCHECK(!subpattern.empty());
207 
208   if (DoesTextMatchLastSubpatternInternal(anchor_left, anchor_right, text,
209                                           url_host, subpattern)) {
210     return true;
211   }
212 
213   // If the last |subpattern| ends with kSeparatorPlaceholder, then it can also
214   // match the end of text.
215   if (subpattern.back() == kSeparatorPlaceholder) {
216     subpattern.remove_suffix(1);
217     return DoesTextMatchLastSubpatternInternal(
218         anchor_left, proto::ANCHOR_TYPE_BOUNDARY, text, url_host, subpattern);
219   }
220 
221   return false;
222 }
223 
224 // Returns whether the given |url_pattern| matches the given |url_spec|.
225 // Compares the pattern the the url in a case-sensitive manner.
IsCaseSensitiveMatch(base::StringPiece url_pattern,proto::AnchorType anchor_left,proto::AnchorType anchor_right,base::StringPiece url_spec,url::Component url_host)226 bool IsCaseSensitiveMatch(base::StringPiece url_pattern,
227                           proto::AnchorType anchor_left,
228                           proto::AnchorType anchor_right,
229                           base::StringPiece url_spec,
230                           url::Component url_host) {
231   DCHECK(!url_spec.empty());
232 
233   StringSplitter<IsWildcard> subpatterns(url_pattern);
234   auto subpattern_it = subpatterns.begin();
235   auto subpattern_end = subpatterns.end();
236 
237   // No subpatterns.
238   if (subpattern_it == subpattern_end) {
239     return anchor_left == proto::ANCHOR_TYPE_NONE ||
240            anchor_right == proto::ANCHOR_TYPE_NONE;
241   }
242 
243   base::StringPiece subpattern = *subpattern_it;
244   ++subpattern_it;
245 
246   // There is only one |subpattern|.
247   if (subpattern_it == subpattern_end) {
248     return DoesTextMatchLastSubpattern(anchor_left, anchor_right, url_spec,
249                                        url_host, subpattern);
250   }
251 
252   // Otherwise, the first |subpattern| does not have to be a suffix. But it
253   // still can have a left anchor. Check and handle that.
254   base::StringPiece text = url_spec;
255   if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY) {
256     if (!StartsWithFuzzy(url_spec, subpattern))
257       return false;
258     text.remove_prefix(subpattern.size());
259   } else if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) {
260     if (!url_host.is_nonempty())
261       return false;
262     const size_t match_begin =
263         FindSubdomainAnchoredSubpattern(url_spec, url_host, subpattern);
264     if (match_begin == base::StringPiece::npos)
265       return false;
266     text.remove_prefix(match_begin + subpattern.size());
267   } else {
268     DCHECK_EQ(anchor_left, proto::ANCHOR_TYPE_NONE);
269     // Get back to the initial |subpattern|, process it in the loop below.
270     subpattern_it = subpatterns.begin();
271   }
272 
273   DCHECK(subpattern_it != subpattern_end);
274   subpattern = *subpattern_it;
275 
276   // Consecutively find all the remaining subpatterns in the |text|. Handle the
277   // last subpattern outside the loop.
278   while (++subpattern_it != subpattern_end) {
279     DCHECK(!subpattern.empty());
280 
281     const size_t match_position = FindSubpattern(text, subpattern);
282     if (match_position == base::StringPiece::npos)
283       return false;
284     text.remove_prefix(match_position + subpattern.size());
285 
286     subpattern = *subpattern_it;
287   }
288 
289   return DoesTextMatchLastSubpattern(proto::ANCHOR_TYPE_NONE, anchor_right,
290                                      text, url::Component(), subpattern);
291 }
292 
293 }  // namespace
294 
UrlInfo(const GURL & url)295 UrlPattern::UrlInfo::UrlInfo(const GURL& url)
296     : spec_(url.possibly_invalid_spec()),
297       host_(url.parsed_for_possibly_invalid_spec().host) {
298   DCHECK(url.is_valid());
299 }
300 
GetLowerCaseSpec() const301 base::StringPiece UrlPattern::UrlInfo::GetLowerCaseSpec() const {
302   if (lower_case_spec_cached_)
303     return *lower_case_spec_cached_;
304 
305   if (!HasAnyUpperAscii(spec_)) {
306     lower_case_spec_cached_ = spec_;
307   } else {
308     lower_case_spec_owner_ = base::ToLowerASCII(spec_);
309     lower_case_spec_cached_ = lower_case_spec_owner_;
310   }
311   return *lower_case_spec_cached_;
312 }
313 
314 UrlPattern::UrlInfo::~UrlInfo() = default;
315 
316 UrlPattern::UrlPattern() = default;
317 
UrlPattern(base::StringPiece url_pattern,proto::UrlPatternType type,MatchCase match_case)318 UrlPattern::UrlPattern(base::StringPiece url_pattern,
319                        proto::UrlPatternType type,
320                        MatchCase match_case)
321     : type_(type), url_pattern_(url_pattern), match_case_(match_case) {}
322 
UrlPattern(base::StringPiece url_pattern,proto::AnchorType anchor_left,proto::AnchorType anchor_right)323 UrlPattern::UrlPattern(base::StringPiece url_pattern,
324                        proto::AnchorType anchor_left,
325                        proto::AnchorType anchor_right)
326     : type_(proto::URL_PATTERN_TYPE_WILDCARDED),
327       url_pattern_(url_pattern),
328       anchor_left_(anchor_left),
329       anchor_right_(anchor_right) {}
330 
UrlPattern(const flat::UrlRule & rule)331 UrlPattern::UrlPattern(const flat::UrlRule& rule)
332     : type_(ConvertUrlPatternType(rule.url_pattern_type())),
333       url_pattern_(ConvertString(rule.url_pattern())),
334       anchor_left_(ConvertAnchorType(rule.anchor_left())),
335       anchor_right_(ConvertAnchorType(rule.anchor_right())),
336       match_case_(rule.options() & flat::OptionFlag_IS_CASE_INSENSITIVE
337                       ? MatchCase::kFalse
338                       : MatchCase::kTrue) {}
339 
340 UrlPattern::~UrlPattern() = default;
341 
MatchesUrl(const UrlInfo & url) const342 bool UrlPattern::MatchesUrl(const UrlInfo& url) const {
343   DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING ||
344          type_ == proto::URL_PATTERN_TYPE_WILDCARDED);
345   DCHECK(base::IsStringASCII(url_pattern_));
346   DCHECK(base::IsStringASCII(url.spec()));
347   DCHECK(base::IsStringASCII(url.GetLowerCaseSpec()));
348 
349   // Pre-process patterns to ensure left anchored and right anchored patterns
350   // don't begin and end with a wildcard respectively i.e. change "|*xyz" to
351   // "*xyz" and "xyz*|" to "xyz*".
352   proto::AnchorType anchor_left = anchor_left_;
353   proto::AnchorType anchor_right = anchor_right_;
354   if (!url_pattern_.empty()) {
355     if (url_pattern_.front() == kWildcard) {
356       // Note: We don't handle "||*" and expect clients to disallow it.
357       DCHECK_NE(proto::ANCHOR_TYPE_SUBDOMAIN, anchor_left_);
358       anchor_left = proto::ANCHOR_TYPE_NONE;
359     }
360     if (url_pattern_.back() == kWildcard)
361       anchor_right = proto::ANCHOR_TYPE_NONE;
362   }
363 
364   if (match_case()) {
365     return IsCaseSensitiveMatch(url_pattern_, anchor_left, anchor_right,
366                                 url.spec(), url.host());
367   }
368 
369   // Use the lower-cased url for case-insensitive comparison. Case-insensitive
370   // patterns should already be lower-cased.
371   DCHECK(!HasAnyUpperAscii(url_pattern_));
372   return IsCaseSensitiveMatch(url_pattern_, anchor_left, anchor_right,
373                               url.GetLowerCaseSpec(), url.host());
374 }
375 
operator <<(std::ostream & out,const UrlPattern & pattern)376 std::ostream& operator<<(std::ostream& out, const UrlPattern& pattern) {
377   switch (pattern.anchor_left()) {
378     case proto::ANCHOR_TYPE_SUBDOMAIN:
379       out << '|';
380       FALLTHROUGH;
381     case proto::ANCHOR_TYPE_BOUNDARY:
382       out << '|';
383       FALLTHROUGH;
384     default:
385       break;
386   }
387   out << pattern.url_pattern();
388   if (pattern.anchor_right() == proto::ANCHOR_TYPE_BOUNDARY)
389     out << '|';
390   if (pattern.match_case())
391     out << "$match-case";
392 
393   return out;
394 }
395 
396 }  // namespace url_pattern_index
397