1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/omnibox/browser/scored_history_match.h"
6 
7 #include <algorithm>
8 #include <memory>
9 #include <numeric>
10 #include <utility>
11 
12 #include "base/auto_reset.h"
13 #include "base/bind.h"
14 #include "base/i18n/break_iterator.h"
15 #include "base/strings/string16.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/test/scoped_feature_list.h"
18 #include "build/build_config.h"
19 #include "components/omnibox/browser/omnibox_field_trial.h"
20 #include "components/omnibox/common/omnibox_features.h"
21 #include "components/search_engines/search_terms_data.h"
22 #include "testing/gmock/include/gmock/gmock.h"
23 #include "testing/gtest/include/gtest/gtest.h"
24 
25 using base::ASCIIToUTF16;
26 using testing::ElementsAre;
27 using testing::Pair;
28 
29 namespace {
30 
31 // Returns a VisitInfoVector that includes |num_visits| spread over the
32 // last |frequency|*|num_visits| days (relative to |now|).  A frequency of
33 // one means one visit each day, two means every other day, etc.
CreateVisitInfoVector(int num_visits,int frequency,base::Time now)34 VisitInfoVector CreateVisitInfoVector(int num_visits,
35                                       int frequency,
36                                       base::Time now) {
37   VisitInfoVector visits;
38   for (int i = 0; i < num_visits; ++i) {
39     visits.push_back(
40         std::make_pair(now - base::TimeDelta::FromDays(i * frequency),
41                        ui::PAGE_TRANSITION_LINK));
42   }
43   return visits;
44 }
45 
46 }  // namespace
47 
48 class ScoredHistoryMatchTest : public testing::Test {
49  protected:
50   // Convenience function to create a history::URLRow with basic data for |url|,
51   // |title|, |visit_count|, and |typed_count|. |days_since_last_visit| gives
52   // the number of days ago to which to set the URL's last_visit.
53   history::URLRow MakeURLRow(const char* url,
54                              const char* title,
55                              int visit_count,
56                              int days_since_last_visit,
57                              int typed_count);
58 
59   // Convenience function to set the word starts information from a
60   // history::URLRow's URL and title.
61   void PopulateWordStarts(const history::URLRow& url_row,
62                           RowWordStarts* word_starts);
63 
64   // Convenience functions for easily creating vectors of search terms.
65   String16Vector Make1Term(const char* term) const;
66   String16Vector Make2Terms(const char* term_1, const char* term_2) const;
67 
68   // Convenience function for GetTopicalityScore() that builds the term match
69   // and word break information automatically that are needed to call
70   // GetTopicalityScore().
71   float GetTopicalityScoreOfTermAgainstURLAndTitle(
72       const std::vector<const std::string>&,
73       const WordStarts term_word_starts,
74       const GURL& url,
75       const base::string16& title);
76 };
77 
MakeURLRow(const char * url,const char * title,int visit_count,int days_since_last_visit,int typed_count)78 history::URLRow ScoredHistoryMatchTest::MakeURLRow(const char* url,
79                                                    const char* title,
80                                                    int visit_count,
81                                                    int days_since_last_visit,
82                                                    int typed_count) {
83   history::URLRow row(GURL(url), 0);
84   row.set_title(ASCIIToUTF16(title));
85   row.set_visit_count(visit_count);
86   row.set_typed_count(typed_count);
87   row.set_last_visit(base::Time::NowFromSystemTime() -
88                      base::TimeDelta::FromDays(days_since_last_visit));
89   return row;
90 }
91 
PopulateWordStarts(const history::URLRow & url_row,RowWordStarts * word_starts)92 void ScoredHistoryMatchTest::PopulateWordStarts(const history::URLRow& url_row,
93                                                 RowWordStarts* word_starts) {
94   String16SetFromString16(ASCIIToUTF16(url_row.url().spec()),
95                           &word_starts->url_word_starts_);
96   String16SetFromString16(url_row.title(), &word_starts->title_word_starts_);
97 }
98 
Make1Term(const char * term) const99 String16Vector ScoredHistoryMatchTest::Make1Term(const char* term) const {
100   String16Vector original_terms;
101   original_terms.push_back(ASCIIToUTF16(term));
102   return original_terms;
103 }
104 
Make2Terms(const char * term_1,const char * term_2) const105 String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1,
106                                                   const char* term_2) const {
107   String16Vector original_terms;
108   original_terms.push_back(ASCIIToUTF16(term_1));
109   original_terms.push_back(ASCIIToUTF16(term_2));
110   return original_terms;
111 }
112 
GetTopicalityScoreOfTermAgainstURLAndTitle(const std::vector<const std::string> & terms,const WordStarts term_word_starts,const GURL & url,const base::string16 & title)113 float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle(
114     const std::vector<const std::string>& terms,
115     const WordStarts term_word_starts,
116     const GURL& url,
117     const base::string16& title) {
118   String16Vector term_vector;
119   std::transform(terms.begin(), terms.end(), std::back_inserter(term_vector),
120                  [](auto term) { return base::UTF8ToUTF16(term); });
121   std::string terms_joint =
122       std::accumulate(std::next(terms.begin()), terms.end(), terms[0],
123                       [](std::string accumulator, std::string term) {
124                         return accumulator + " " + term;
125                       });
126   RowWordStarts row_word_starts;
127   base::string16 url_string = base::UTF8ToUTF16(url.spec());
128   String16SetFromString16(url_string, &row_word_starts.url_word_starts_);
129   String16SetFromString16(title, &row_word_starts.title_word_starts_);
130   auto row = history::URLRow(GURL(url));
131   row.set_title(title);
132   ScoredHistoryMatch scored_match(
133       row, VisitInfoVector(), base::UTF8ToUTF16(terms_joint), term_vector,
134       term_word_starts, row_word_starts, false, 1, base::Time::Max());
135   scored_match.topicality_threshold_ = -1;
136   return scored_match.GetTopicalityScore(term_vector.size(), url,
137                                          base::OffsetAdjuster::Adjustments(),
138                                          term_word_starts, row_word_starts);
139 }
140 
TEST_F(ScoredHistoryMatchTest,Scoring)141 TEST_F(ScoredHistoryMatchTest, Scoring) {
142   // We use NowFromSystemTime() because MakeURLRow uses the same function
143   // to calculate last visit time when building a row.
144   base::Time now = base::Time::NowFromSystemTime();
145 
146   history::URLRow row_a(MakeURLRow("http://fedcba", "abcd bcd", 3, 30, 1));
147   RowWordStarts word_starts_a;
148   PopulateWordStarts(row_a, &word_starts_a);
149   WordStarts one_word_no_offset(1, 0u);
150   VisitInfoVector visits_a = CreateVisitInfoVector(3, 30, now);
151   // Mark one visit as typed.
152   visits_a[0].second = ui::PAGE_TRANSITION_TYPED;
153   ScoredHistoryMatch scored_a(row_a, visits_a, ASCIIToUTF16("abc"),
154                               Make1Term("abc"), one_word_no_offset,
155                               word_starts_a, false, 1, now);
156 
157   // Test scores based on visit_count.
158   history::URLRow row_b(MakeURLRow("http://abcdef", "abcd bcd", 10, 30, 1));
159   RowWordStarts word_starts_b;
160   PopulateWordStarts(row_b, &word_starts_b);
161   VisitInfoVector visits_b = CreateVisitInfoVector(10, 30, now);
162   visits_b[0].second = ui::PAGE_TRANSITION_TYPED;
163   ScoredHistoryMatch scored_b(row_b, visits_b, ASCIIToUTF16("abc"),
164                               Make1Term("abc"), one_word_no_offset,
165                               word_starts_b, false, 1, now);
166   EXPECT_GT(scored_b.raw_score, scored_a.raw_score);
167 
168   // Test scores based on last_visit.
169   history::URLRow row_c(MakeURLRow("http://abcdef", "abcd bcd", 3, 10, 1));
170   RowWordStarts word_starts_c;
171   PopulateWordStarts(row_c, &word_starts_c);
172   VisitInfoVector visits_c = CreateVisitInfoVector(3, 10, now);
173   visits_c[0].second = ui::PAGE_TRANSITION_TYPED;
174   ScoredHistoryMatch scored_c(row_c, visits_c, ASCIIToUTF16("abc"),
175                               Make1Term("abc"), one_word_no_offset,
176                               word_starts_c, false, 1, now);
177   EXPECT_GT(scored_c.raw_score, scored_a.raw_score);
178 
179   // Test scores based on typed_count.
180   history::URLRow row_d(MakeURLRow("http://abcdef", "abcd bcd", 3, 30, 3));
181   RowWordStarts word_starts_d;
182   PopulateWordStarts(row_d, &word_starts_d);
183   VisitInfoVector visits_d = CreateVisitInfoVector(3, 30, now);
184   visits_d[0].second = ui::PAGE_TRANSITION_TYPED;
185   visits_d[1].second = ui::PAGE_TRANSITION_TYPED;
186   visits_d[2].second = ui::PAGE_TRANSITION_TYPED;
187   ScoredHistoryMatch scored_d(row_d, visits_d, ASCIIToUTF16("abc"),
188                               Make1Term("abc"), one_word_no_offset,
189                               word_starts_d, false, 1, now);
190   EXPECT_GT(scored_d.raw_score, scored_a.raw_score);
191 
192   // Test scores based on a terms appearing multiple times.
193   history::URLRow row_e(MakeURLRow(
194       "http://csi.csi.csi/csi_csi",
195       "CSI Guide to CSI Las Vegas, CSI New York, CSI Provo", 3, 30, 3));
196   RowWordStarts word_starts_e;
197   PopulateWordStarts(row_e, &word_starts_e);
198   const VisitInfoVector visits_e = visits_d;
199   ScoredHistoryMatch scored_e(row_e, visits_e, ASCIIToUTF16("csi"),
200                               Make1Term("csi"), one_word_no_offset,
201                               word_starts_e, false, 1, now);
202   EXPECT_LT(scored_e.raw_score, 1400);
203 
204   // Test that a result with only a mid-term match (i.e., not at a word
205   // boundary) scores 0.
206   ScoredHistoryMatch scored_f(row_a, visits_a, ASCIIToUTF16("cd"),
207                               Make1Term("cd"), one_word_no_offset,
208                               word_starts_a, false, 1, now);
209   EXPECT_EQ(scored_f.raw_score, 0);
210 }
211 
// Verifies that marking a URL as bookmarked raises its raw score once a
// non-default bookmark value is in effect.
TEST_F(ScoredHistoryMatchTest, ScoringBookmarks) {
  // We use NowFromSystemTime() because MakeURLRow uses the same function
  // to calculate last visit time when building a row.
  base::Time now = base::Time::NowFromSystemTime();

  history::URLRow row(MakeURLRow("http://fedcba", "abcd bcd", 8, 3, 1));
  RowWordStarts word_starts;
  PopulateWordStarts(row, &word_starts);
  WordStarts one_word_no_offset(1, 0u);
  VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);

  // Reference score: the URL is not bookmarked.
  ScoredHistoryMatch scored(row, visits, ASCIIToUTF16("abc"), Make1Term("abc"),
                            one_word_no_offset, word_starts, false, 1, now);

  // Now check that if URL is bookmarked then its score increases.  The
  // bookmark value is overridden to 5 for the rest of this test and restored
  // when |reset| goes out of scope.
  base::AutoReset<float> reset(&ScoredHistoryMatch::bookmark_value_, 5);
  ScoredHistoryMatch scored_with_bookmark(row, visits, ASCIIToUTF16("abc"),
                                          Make1Term("abc"), one_word_no_offset,
                                          word_starts, true, 1, now);
  EXPECT_GT(scored_with_bookmark.raw_score, scored.raw_score);
}
233 
// Verifies that a match in the TLD only adds to the raw score when
// ScoredHistoryMatch::allow_tld_matches_ is enabled.
TEST_F(ScoredHistoryMatchTest, ScoringTLD) {
  // We use NowFromSystemTime() because MakeURLRow uses the same function
  // to calculate last visit time when building a row.
  base::Time now = base::Time::NowFromSystemTime();

  // By default, a tld match should not contribute to the suggestion score.
  history::URLRow row(MakeURLRow("http://fedcba.com/", "", 8, 3, 1));
  RowWordStarts word_starts;
  PopulateWordStarts(row, &word_starts);
  WordStarts two_words_no_offsets(2, 0u);
  VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);
  ScoredHistoryMatch scored(row, visits, ASCIIToUTF16("fed com"),
                            Make2Terms("fed", "com"), two_words_no_offsets,
                            word_starts, false, 1, now);
  EXPECT_GT(scored.raw_score, 0);

  // Now allow credit for the match in the TLD.  The flag is restored when
  // |reset| goes out of scope.
  base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_tld_matches_, true);
  ScoredHistoryMatch scored_with_tld(
      row, visits, ASCIIToUTF16("fed com"), Make2Terms("fed", "com"),
      two_words_no_offsets, word_starts, false, 1, now);
  EXPECT_GT(scored_with_tld.raw_score, 0);

  EXPECT_GT(scored_with_tld.raw_score, scored.raw_score);
}
261 
// Verifies that a match in the scheme only adds to the raw score when
// ScoredHistoryMatch::allow_scheme_matches_ is enabled.
TEST_F(ScoredHistoryMatchTest, ScoringScheme) {
  // We use NowFromSystemTime() because MakeURLRow uses the same function
  // to calculate last visit time when building a row.
  base::Time now = base::Time::NowFromSystemTime();

  // By default, a scheme match should not contribute to the suggestion score.
  history::URLRow row(MakeURLRow("http://fedcba/", "", 8, 3, 1));
  RowWordStarts word_starts;
  PopulateWordStarts(row, &word_starts);
  WordStarts two_words_no_offsets(2, 0u);
  VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);
  ScoredHistoryMatch scored(row, visits, ASCIIToUTF16("fed http"),
                            Make2Terms("fed", "http"), two_words_no_offsets,
                            word_starts, false, 1, now);
  EXPECT_GT(scored.raw_score, 0);

  // Now allow credit for the match in the scheme.  The flag is restored when
  // |reset| goes out of scope.
  base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_scheme_matches_, true);
  ScoredHistoryMatch scored_with_scheme(
      row, visits, ASCIIToUTF16("fed http"), Make2Terms("fed", "http"),
      two_words_no_offsets, word_starts, false, 1, now);
  EXPECT_GT(scored_with_scheme.raw_score, 0);

  EXPECT_GT(scored_with_scheme.raw_score, scored.raw_score);
}
289 
TEST_F(ScoredHistoryMatchTest,MatchURLComponents)290 TEST_F(ScoredHistoryMatchTest, MatchURLComponents) {
291   // We use NowFromSystemTime() because MakeURLRow uses the same function
292   // to calculate last visit time when building a row.
293   base::Time now = base::Time::NowFromSystemTime();
294   RowWordStarts word_starts;
295   WordStarts one_word_no_offset(1, 0u);
296   VisitInfoVector visits;
297 
298   {
299     history::URLRow row(
300         MakeURLRow("http://www.google.com", "abcdef", 3, 30, 1));
301     PopulateWordStarts(row, &word_starts);
302     ScoredHistoryMatch scored_a(row, visits, ASCIIToUTF16("g"), Make1Term("g"),
303                                 one_word_no_offset, word_starts, false, 1, now);
304     EXPECT_FALSE(scored_a.match_in_scheme);
305     EXPECT_FALSE(scored_a.match_in_subdomain);
306     ScoredHistoryMatch scored_b(row, visits, ASCIIToUTF16("w"), Make1Term("w"),
307                                 one_word_no_offset, word_starts, false, 1, now);
308     EXPECT_FALSE(scored_b.match_in_scheme);
309     EXPECT_TRUE(scored_b.match_in_subdomain);
310     ScoredHistoryMatch scored_c(row, visits, ASCIIToUTF16("h"), Make1Term("h"),
311                                 one_word_no_offset, word_starts, false, 1, now);
312     EXPECT_TRUE(scored_c.match_in_scheme);
313     EXPECT_FALSE(scored_c.match_in_subdomain);
314     ScoredHistoryMatch scored_d(row, visits, ASCIIToUTF16("o"), Make1Term("o"),
315                                 one_word_no_offset, word_starts, false, 1, now);
316     EXPECT_FALSE(scored_d.match_in_scheme);
317     EXPECT_FALSE(scored_d.match_in_subdomain);
318   }
319 
320   {
321     history::URLRow row(MakeURLRow("http://teams.foo.com", "abcdef", 3, 30, 1));
322     PopulateWordStarts(row, &word_starts);
323     ScoredHistoryMatch scored_a(row, visits, ASCIIToUTF16("t"), Make1Term("t"),
324                                 one_word_no_offset, word_starts, false, 1, now);
325     EXPECT_FALSE(scored_a.match_in_scheme);
326     EXPECT_TRUE(scored_a.match_in_subdomain);
327     ScoredHistoryMatch scored_b(row, visits, ASCIIToUTF16("f"), Make1Term("f"),
328                                 one_word_no_offset, word_starts, false, 1, now);
329     EXPECT_FALSE(scored_b.match_in_scheme);
330     EXPECT_FALSE(scored_b.match_in_subdomain);
331     ScoredHistoryMatch scored_c(row, visits, ASCIIToUTF16("o"), Make1Term("o"),
332                                 one_word_no_offset, word_starts, false, 1, now);
333     EXPECT_FALSE(scored_c.match_in_scheme);
334     EXPECT_FALSE(scored_c.match_in_subdomain);
335   }
336 
337   {
338     history::URLRow row(MakeURLRow("http://en.m.foo.com", "abcdef", 3, 30, 1));
339     PopulateWordStarts(row, &word_starts);
340     ScoredHistoryMatch scored_a(row, visits, ASCIIToUTF16("e"), Make1Term("e"),
341                                 one_word_no_offset, word_starts, false, 1, now);
342     EXPECT_FALSE(scored_a.match_in_scheme);
343     EXPECT_TRUE(scored_a.match_in_subdomain);
344     ScoredHistoryMatch scored_b(row, visits, ASCIIToUTF16("m"), Make1Term("m"),
345                                 one_word_no_offset, word_starts, false, 1, now);
346     EXPECT_FALSE(scored_b.match_in_scheme);
347     EXPECT_TRUE(scored_b.match_in_subdomain);
348     ScoredHistoryMatch scored_c(row, visits, ASCIIToUTF16("f"), Make1Term("f"),
349                                 one_word_no_offset, word_starts, false, 1, now);
350     EXPECT_FALSE(scored_c.match_in_scheme);
351     EXPECT_FALSE(scored_c.match_in_subdomain);
352   }
353 
354   {
355     history::URLRow row(
356         MakeURLRow("https://www.testing.com/xxx?yyy#zzz", "abcdef", 3, 30, 1));
357     PopulateWordStarts(row, &word_starts);
358     ScoredHistoryMatch scored_a(row, visits, ASCIIToUTF16("t"), Make1Term("t"),
359                                 one_word_no_offset, word_starts, false, 1, now);
360     EXPECT_FALSE(scored_a.match_in_scheme);
361     EXPECT_FALSE(scored_a.match_in_subdomain);
362     ScoredHistoryMatch scored_b(row, visits, ASCIIToUTF16("h"), Make1Term("h"),
363                                 one_word_no_offset, word_starts, false, 1, now);
364     EXPECT_TRUE(scored_b.match_in_scheme);
365     EXPECT_FALSE(scored_b.match_in_subdomain);
366     ScoredHistoryMatch scored_c(row, visits, ASCIIToUTF16("w"), Make1Term("w"),
367                                 one_word_no_offset, word_starts, false, 1, now);
368     EXPECT_FALSE(scored_c.match_in_scheme);
369     EXPECT_TRUE(scored_c.match_in_subdomain);
370     ScoredHistoryMatch scored_d(row, visits, ASCIIToUTF16("x"), Make1Term("x"),
371                                 one_word_no_offset, word_starts, false, 1, now);
372     EXPECT_FALSE(scored_d.match_in_scheme);
373     EXPECT_FALSE(scored_d.match_in_subdomain);
374     ScoredHistoryMatch scored_e(row, visits, ASCIIToUTF16("y"), Make1Term("y"),
375                                 one_word_no_offset, word_starts, false, 1, now);
376     EXPECT_FALSE(scored_e.match_in_scheme);
377     EXPECT_FALSE(scored_e.match_in_subdomain);
378     ScoredHistoryMatch scored_f(row, visits, ASCIIToUTF16("z"), Make1Term("z"),
379                                 one_word_no_offset, word_starts, false, 1, now);
380     EXPECT_FALSE(scored_f.match_in_scheme);
381     EXPECT_FALSE(scored_f.match_in_subdomain);
382     ScoredHistoryMatch scored_g(row, visits, ASCIIToUTF16("https://www"),
383                                 Make1Term("https://www"), one_word_no_offset,
384                                 word_starts, false, 1, now);
385     EXPECT_TRUE(scored_g.match_in_scheme);
386     EXPECT_TRUE(scored_g.match_in_subdomain);
387     ScoredHistoryMatch scored_h(row, visits, ASCIIToUTF16("testing.com/x"),
388                                 Make1Term("testing.com/x"), one_word_no_offset,
389                                 word_starts, false, 1, now);
390     EXPECT_FALSE(scored_h.match_in_scheme);
391     EXPECT_FALSE(scored_h.match_in_subdomain);
392     ScoredHistoryMatch scored_i(row, visits,
393                                 ASCIIToUTF16("https://www.testing.com/x"),
394                                 Make1Term("https://www.testing.com/x"),
395                                 one_word_no_offset, word_starts, false, 1, now);
396     EXPECT_TRUE(scored_i.match_in_scheme);
397     EXPECT_TRUE(scored_i.match_in_subdomain);
398   }
399 
400   {
401     history::URLRow row(
402         MakeURLRow("http://www.xn--1lq90ic7f1rc.cn/xnblah", "abcd", 3, 30, 1));
403     PopulateWordStarts(row, &word_starts);
404     ScoredHistoryMatch scored_a(row, visits, ASCIIToUTF16("x"), Make1Term("x"),
405                                 one_word_no_offset, word_starts, false, 1, now);
406     EXPECT_FALSE(scored_a.match_in_scheme);
407     EXPECT_FALSE(scored_a.match_in_subdomain);
408     ScoredHistoryMatch scored_b(row, visits, ASCIIToUTF16("xn"),
409                                 Make1Term("xn"), one_word_no_offset,
410                                 word_starts, false, 1, now);
411     EXPECT_FALSE(scored_b.match_in_scheme);
412     EXPECT_FALSE(scored_b.match_in_subdomain);
413     ScoredHistoryMatch scored_c(row, visits, ASCIIToUTF16("w"), Make1Term("w"),
414                                 one_word_no_offset, word_starts, false, 1, now);
415     EXPECT_FALSE(scored_c.match_in_scheme);
416     EXPECT_TRUE(scored_c.match_in_subdomain);
417   }
418 }
419 
TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) {
  // A hostname match must score the same whether or not the URL is written
  // with a trailing slash.
  const base::string16 title = ASCIIToUTF16("Non-Matching Title");
  const float with_slash = GetTopicalityScoreOfTermAgainstURLAndTitle(
      {"def"}, {0}, GURL("http://abc.def.com/"), title);
  const float without_slash = GetTopicalityScoreOfTermAgainstURLAndTitle(
      {"def"}, {0}, GURL("http://abc.def.com"), title);
  EXPECT_EQ(without_slash, with_slash);
}
429 
// Exercises ScoredHistoryMatch::FilterTermMatchesByWordStarts() with
// hand-built term matches and word starts, checking which match offsets
// survive filtering over different position ranges.
TEST_F(ScoredHistoryMatchTest, FilterMatches) {
  // For ease in interpreting this test, imagine the URL
  //    http://test.com/default/foo.aspxhome/hello.html.
  //    012345678901234567890123456789012345678901234567
  //              1         2         3         4
  // We test how FilterTermMatchesByWordStarts() reacts to various
  // one-character inputs.
  //
  // The single search term has its word start at offset 0.
  WordStarts terms_to_word_starts_offsets;
  terms_to_word_starts_offsets.push_back(0);
  // Word starts of the sample URL: "http" (0), "test" (7), "com" (12),
  // "default" (16), "foo" (24), "aspxhome" (28), "hello" (37), "html" (43).
  WordStarts word_starts;
  word_starts.push_back(0);
  word_starts.push_back(7);
  word_starts.push_back(12);
  word_starts.push_back(16);
  word_starts.push_back(24);
  word_starts.push_back(28);
  word_starts.push_back(37);
  word_starts.push_back(43);

  // Check that "h" matches "http", "hello", and "html" but not "aspxhome" when
  // asked to filter non-word-start matches after the hostname.  The "15" in
  // the filter call below is the position of the "/" ending the hostname.
  // Throughout this test the second TermMatch argument is the match offset
  // that the expectations read back via |offset|.
  TermMatches term_matches;
  term_matches.push_back(TermMatch(0, 0, 1));
  term_matches.push_back(TermMatch(0, 32, 1));
  term_matches.push_back(TermMatch(0, 37, 1));
  term_matches.push_back(TermMatch(0, 43, 1));
  TermMatches filtered_term_matches =
      ScoredHistoryMatch::FilterTermMatchesByWordStarts(
          term_matches, terms_to_word_starts_offsets, word_starts, 15,
          std::string::npos);
  ASSERT_EQ(3u, filtered_term_matches.size());
  EXPECT_EQ(0u, filtered_term_matches[0].offset);
  EXPECT_EQ(37u, filtered_term_matches[1].offset);
  EXPECT_EQ(43u, filtered_term_matches[2].offset);
  // The "http" match should remain after removing the mid-word matches in the
  // scheme, because it sits at a word start.  The previous result is fed back
  // in as the input here.
  // NOTE(review): the ":" ending the scheme is at position 4 in the sample
  // URL, yet this call passes 5 as the end position (the equivalent call
  // further down passes 4).  Confirm which value is intended.
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      filtered_term_matches, terms_to_word_starts_offsets, word_starts, 0, 5);
  ASSERT_EQ(3u, filtered_term_matches.size());
  EXPECT_EQ(0u, filtered_term_matches[0].offset);
  EXPECT_EQ(37u, filtered_term_matches[1].offset);
  EXPECT_EQ(43u, filtered_term_matches[2].offset);

  // Check that "t" matches "http" twice and "test" twice but not "default" or
  // "html" when asked to filter non-word-start matches after the hostname.
  // NOTE(review): in the sample URL the "t" of "html" is at offset 44; the
  // last match below uses 45, which is likewise not a word start.
  term_matches.clear();
  term_matches.push_back(TermMatch(0, 1, 1));
  term_matches.push_back(TermMatch(0, 2, 1));
  term_matches.push_back(TermMatch(0, 7, 1));
  term_matches.push_back(TermMatch(0, 10, 1));
  term_matches.push_back(TermMatch(0, 22, 1));
  term_matches.push_back(TermMatch(0, 45, 1));
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      term_matches, terms_to_word_starts_offsets, word_starts, 15,
      std::string::npos);
  ASSERT_EQ(4u, filtered_term_matches.size());
  EXPECT_EQ(1u, filtered_term_matches[0].offset);
  EXPECT_EQ(2u, filtered_term_matches[1].offset);
  EXPECT_EQ(7u, filtered_term_matches[2].offset);
  EXPECT_EQ(10u, filtered_term_matches[3].offset);
  // The "http" matches should disappear after removing mid-word matches in the
  // scheme.  The "4" is the position of the ":" character ending the scheme.
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      filtered_term_matches, terms_to_word_starts_offsets, word_starts, 0, 4);
  ASSERT_EQ(2u, filtered_term_matches.size());
  EXPECT_EQ(7u, filtered_term_matches[0].offset);
  EXPECT_EQ(10u, filtered_term_matches[1].offset);

  // Check that "e" matches "test" but not "default" or "hello" when asked to
  // filter non-word-start matches after the hostname.
  term_matches.clear();
  term_matches.push_back(TermMatch(0, 8, 1));
  term_matches.push_back(TermMatch(0, 17, 1));
  term_matches.push_back(TermMatch(0, 38, 1));
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      term_matches, terms_to_word_starts_offsets, word_starts, 15,
      std::string::npos);
  ASSERT_EQ(1u, filtered_term_matches.size());
  EXPECT_EQ(8u, filtered_term_matches[0].offset);

  // Check that "d" matches "default" when asked to filter non-word-start
  // matches after the hostname.
  term_matches.clear();
  term_matches.push_back(TermMatch(0, 16, 1));
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      term_matches, terms_to_word_starts_offsets, word_starts, 15,
      std::string::npos);
  ASSERT_EQ(1u, filtered_term_matches.size());
  EXPECT_EQ(16u, filtered_term_matches[0].offset);

  // Check that "a" matches "aspxhome" but not "default" when asked to filter
  // non-word-start matches after the hostname.
  term_matches.clear();
  term_matches.push_back(TermMatch(0, 19, 1));
  term_matches.push_back(TermMatch(0, 28, 1));
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      term_matches, terms_to_word_starts_offsets, word_starts, 15,
      std::string::npos);
  ASSERT_EQ(1u, filtered_term_matches.size());
  EXPECT_EQ(28u, filtered_term_matches[0].offset);

  // Check that ".a" matches "aspxhome", i.e., that we recognize that it
  // is a valid match at a word break.  To recognize this,
  // |terms_to_word_starts_offsets| must record that the "word" in this term
  // starts at the second character, hence the {1} passed below: the match
  // begins at the "." (offset 27) and the word start (28) is one further on.
  term_matches.clear();
  term_matches.push_back(TermMatch(0, 27, 1));
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      term_matches, /*terms_to_word_starts_offsets*/ {1}, word_starts, 15,
      std::string::npos);
  ASSERT_EQ(1u, filtered_term_matches.size());
  EXPECT_EQ(27u, filtered_term_matches[0].offset);

  // Check "de" + "fa" + "lt" matches "defa" when |allow_midword_continuations|
  // is true.  "fa" (offset 18) directly continues "de" (offsets 16-17) and is
  // kept; "lt" (offset 21) does not directly follow "fa" and is dropped.
  term_matches.clear();
  term_matches.push_back(TermMatch(0, 16, 2));
  term_matches.push_back(TermMatch(1, 18, 2));
  term_matches.push_back(TermMatch(2, 21, 2));
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      term_matches, {0, 0, 0}, word_starts, 15, std::string::npos, true);
  ASSERT_EQ(2u, filtered_term_matches.size());
  EXPECT_EQ(16u, filtered_term_matches[0].offset);
  EXPECT_EQ(18u, filtered_term_matches[1].offset);

  // Check "de" + "fa" + "lt" matches "de" when |allow_midword_continuations| is
  // false: only the match at the word start (offset 16) survives.
  term_matches.clear();
  term_matches.push_back(TermMatch(0, 16, 2));
  term_matches.push_back(TermMatch(1, 18, 2));
  term_matches.push_back(TermMatch(2, 21, 2));
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      term_matches, {0, 0, 0}, word_starts, 15, std::string::npos, false);
  ASSERT_EQ(1u, filtered_term_matches.size());
  EXPECT_EQ(16u, filtered_term_matches[0].offset);
}
567 
TEST_F(ScoredHistoryMatchTest,GetFrequency)568 TEST_F(ScoredHistoryMatchTest, GetFrequency) {
569   // Build a fake ScoredHistoryMatch, which we'll then reuse multiple times.
570   history::URLRow row(GURL("http://foo"));
571   RowWordStarts row_word_starts;
572   PopulateWordStarts(row, &row_word_starts);
573   base::Time now(base::Time::Max());
574   VisitInfoVector visits;
575   ScoredHistoryMatch match(row, visits, ASCIIToUTF16("foo"), Make1Term("foo"),
576                            WordStarts{0}, row_word_starts, false, 1, now);
577 
578   // Record the score for one untyped visit.
579   visits = {{now, ui::PAGE_TRANSITION_LINK}};
580   const float one_untyped_score = match.GetFrequency(now, false, visits);
581 
582   // The score for one typed visit should be larger.
583   visits = VisitInfoVector{{now, ui::PAGE_TRANSITION_TYPED}};
584   const float one_typed_score = match.GetFrequency(now, false, visits);
585   EXPECT_GT(one_typed_score, one_untyped_score);
586 
587   // It shouldn't matter if the typed visit has a transition qualifier.
588   visits = {
589       {now, ui::PageTransitionFromInt(ui::PAGE_TRANSITION_TYPED |
590                                       ui::PAGE_TRANSITION_SERVER_REDIRECT)}};
591   EXPECT_EQ(one_typed_score, match.GetFrequency(now, false, visits));
592 
593   // A score for one untyped visit to a bookmarked page should be larger than
594   // the one untyped visit to a non-bookmarked page.
595   visits = {{now, ui::PAGE_TRANSITION_LINK}};
596   EXPECT_GE(match.GetFrequency(now, true, visits), one_untyped_score);
597 
598   // Now consider pages visited twice, with one visit being typed and one
599   // untyped.
600 
601   // A two-visit score should have a higher score than the single typed visit
602   // score.
603   visits = {{now, ui::PAGE_TRANSITION_TYPED},
604             {now - base::TimeDelta::FromDays(1), ui::PAGE_TRANSITION_LINK}};
605   const float two_visits_score = match.GetFrequency(now, false, visits);
606   EXPECT_GT(two_visits_score, one_typed_score);
607 
608   // Add an third untyped visit.
609   visits.push_back(
610       {now - base::TimeDelta::FromDays(2), ui::PAGE_TRANSITION_LINK});
611 
612   // The score should be higher than the two-visit score.
613   const float three_visits_score = match.GetFrequency(now, false, visits);
614   EXPECT_GT(three_visits_score, two_visits_score);
615 
616   // If we're only supposed to consider the most recent two visits, then the
617   // score should be the same as in the two-visit case.
618   {
619     base::AutoReset<size_t> tmp1(&ScoredHistoryMatch::max_visits_to_score_, 2);
620     EXPECT_EQ(two_visits_score, match.GetFrequency(now, false, visits));
621 
622     // Check again with the third visit being typed.
623     visits[2].second = ui::PAGE_TRANSITION_TYPED;
624     EXPECT_EQ(two_visits_score, match.GetFrequency(now, false, visits));
625   }
626 }
627 
TEST_F(ScoredHistoryMatchTest, GetDocumentSpecificityScore) {
  // One placeholder ScoredHistoryMatch is shared by every expectation below;
  // only the count passed to GetDocumentSpecificityScore() varies.
  history::URLRow url_row(GURL("http://foo"));
  RowWordStarts word_starts_in_row;
  PopulateWordStarts(url_row, &word_starts_in_row);
  VisitInfoVector no_visits;
  const base::Time now = base::Time::Max();
  ScoredHistoryMatch scored_match(url_row, no_visits, ASCIIToUTF16("foo"),
                                  Make1Term("foo"), WordStarts{0},
                                  word_starts_in_row, false, 1, now);

  // Scores observed before this test installs any override table.
  EXPECT_EQ(3.0, scored_match.GetDocumentSpecificityScore(1));
  EXPECT_EQ(1.0, scored_match.GetDocumentSpecificityScore(5));
  EXPECT_EQ(1.0, scored_match.GetDocumentSpecificityScore(50));

  // Point the override at a table owned by this test.  The AutoReset puts the
  // previous pointer back when the test body exits.
  OmniboxFieldTrial::NumMatchesScores specificity_table = {{1, 3.0}};
  base::AutoReset<OmniboxFieldTrial::NumMatchesScores*> scoped_override(
      &ScoredHistoryMatch::matches_to_specificity_override_,
      &specificity_table);

  // Single-entry table: a count of 1 yields 3.0; a count beyond the table
  // yields 1.0.
  EXPECT_EQ(3.0, scored_match.GetDocumentSpecificityScore(1));
  EXPECT_EQ(1.0, scored_match.GetDocumentSpecificityScore(5));

  // Two-entry table: counts of 2 and 3 both yield the {3, 1.5} entry's score,
  // and a count of 4 is past the last entry.
  specificity_table = {{1, 3.0}, {3, 1.5}};
  EXPECT_EQ(3.0, scored_match.GetDocumentSpecificityScore(1));
  EXPECT_EQ(1.5, scored_match.GetDocumentSpecificityScore(2));
  EXPECT_EQ(1.5, scored_match.GetDocumentSpecificityScore(3));
  EXPECT_EQ(1.0, scored_match.GetDocumentSpecificityScore(4));
}
657 
658 // This function only tests scoring of single terms that match exactly
659 // once somewhere in the URL or title.
TEST_F(ScoredHistoryMatchTest,GetTopicalityScore)660 TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) {
661   GURL url("http://abc.def.com/path1/path2?arg1=val1&arg2=val2#hash_fragment");
662   base::string16 title = ASCIIToUTF16("here is a - title");
663   auto Score = [&](const std::vector<const std::string>& term_vector,
664                    const WordStarts term_word_starts) {
665     return GetTopicalityScoreOfTermAgainstURLAndTitle(
666         term_vector, term_word_starts, url, title);
667   };
668   const float hostname_score = Score({"abc"}, {0});
669   const float hostname_mid_word_score = Score({"bc"}, {0});
670   const float hostname_score_preceeding_punctuation = Score({"://abc"}, {3});
671   const float domain_name_score = Score({"def"}, {0});
672   const float domain_name_mid_word_score = Score({"ef"}, {0});
673   const float domain_name_score_preceeding_dot = Score({".def"}, {1});
674   const float tld_score = Score({"com"}, {0});
675   const float tld_mid_word_score = Score({"om"}, {0});
676   const float tld_score_preceeding_dot = Score({".com"}, {1});
677   const float path_score = Score({"path1"}, {0});
678   const float path_mid_word_score = Score({"ath1"}, {0});
679   const float path_score_preceeding_slash = Score({"/path1"}, {1});
680   const float arg_score = Score({"arg1"}, {0});
681   const float arg_mid_word_score = Score({"rg1"}, {0});
682   const float arg_score_preceeding_question_mark = Score({"?arg1"}, {1});
683   const float protocol_score = Score({"htt"}, {0});
684   const float protocol_mid_word_score = Score({"tt"}, {0});
685   const float title_score = Score({"her"}, {0});
686   const float title_mid_word_score = Score({"er"}, {0});
687   const float wordless_match_at_title_mid_word_score = Score({"-"}, {1});
688   // Verify hostname and domain name > path > arg.
689   EXPECT_GT(hostname_score, path_score);
690   EXPECT_GT(domain_name_score, path_score);
691   EXPECT_GT(path_score, arg_score);
692   // Verify leading punctuation doesn't confuse scoring.
693   EXPECT_EQ(hostname_score, hostname_score_preceeding_punctuation);
694   EXPECT_EQ(domain_name_score, domain_name_score_preceeding_dot);
695   EXPECT_EQ(tld_score, tld_score_preceeding_dot);
696   EXPECT_EQ(path_score, path_score_preceeding_slash);
697   EXPECT_EQ(arg_score, arg_score_preceeding_question_mark);
698   // Verify that domain name > path and domain name > arg for non-word
699   // boundaries.
700   EXPECT_GT(hostname_mid_word_score, path_mid_word_score);
701   EXPECT_GT(domain_name_mid_word_score, path_mid_word_score);
702   EXPECT_GT(domain_name_mid_word_score, arg_mid_word_score);
703   EXPECT_GT(hostname_mid_word_score, arg_mid_word_score);
704   // Also verify that the matches at non-word-boundaries all score
705   // worse than the matches at word boundaries.  These three sets suffice.
706   EXPECT_GT(arg_score, hostname_mid_word_score);
707   EXPECT_GT(arg_score, domain_name_mid_word_score);
708   EXPECT_GT(title_score, title_mid_word_score);
709   // Verify mid word scores are scored 0 unless 1) in the host or domain 2) or
710   // the match contains no words.
711   EXPECT_GT(hostname_mid_word_score, 0);
712   EXPECT_GT(domain_name_mid_word_score, 0);
713   EXPECT_EQ(tld_mid_word_score, 0);
714   EXPECT_EQ(path_mid_word_score, 0);
715   EXPECT_EQ(arg_mid_word_score, 0);
716   EXPECT_EQ(protocol_mid_word_score, 0);
717   EXPECT_EQ(title_mid_word_score, 0);
718   EXPECT_GT(wordless_match_at_title_mid_word_score, 0);
719   // Check that title matches fit somewhere reasonable compared to the
720   // various types of URL matches.
721   EXPECT_GT(title_score, arg_score);
722   EXPECT_GT(arg_score, title_mid_word_score);
723   // Finally, verify that protocol matches and top level domain name
724   // matches (.com, .net, etc.) score worse than some of the mid-word
725   // matches that actually count.
726   EXPECT_GT(hostname_mid_word_score, protocol_score);
727   EXPECT_GT(hostname_mid_word_score, protocol_mid_word_score);
728   EXPECT_GT(hostname_mid_word_score, tld_score);
729   EXPECT_GT(hostname_mid_word_score, tld_mid_word_score);
730 }
731 
TEST_F(ScoredHistoryMatchTest, GetTopicalityScore_MidwordMatching) {
  GURL url("http://abc.def.com/path1/path2?arg1=val1&arg2=val2#hash_fragment");
  base::string16 title = ASCIIToUTF16("here is a - title");
  // Scores |term_vector| (with its |term_word_starts|) against the fixed URL
  // and title above.
  auto ScoreTerms = [&](const std::vector<const std::string>& term_vector,
                        const WordStarts term_word_starts) {
    return GetTopicalityScoreOfTermAgainstURLAndTitle(
        term_vector, term_word_starts, url, title);
  };

  // Four scenarios: a word-start term alone, a mid-word term alone, a
  // word-start term followed by a mid-word term that continues it, and a
  // word-start term plus a mid-word term that does not continue it.
  const float word_start_only = ScoreTerms({"frag"}, {0u});
  const float midword_only = ScoreTerms({"ment"}, {0u});
  const float word_start_then_continuation =
      ScoreTerms({"frag", "ment"}, {0u, 0u});
  const float word_start_then_disjoint = ScoreTerms({"frag", "ent"}, {0u, 0u});

  // The word-start match scores on its own.
  EXPECT_GT(word_start_only, 0);
  // A lone mid-word match is not scored.
  EXPECT_EQ(midword_only, 0);
  // Both two-term inputs still score.
  EXPECT_GT(word_start_then_continuation, 0);
  EXPECT_GT(word_start_then_disjoint, 0);
  // The continuing mid-word match is allowed and scored, so that input beats
  // the one whose mid-word term is disjoint and contributes nothing.
  EXPECT_GT(word_start_then_continuation, word_start_then_disjoint);
}
756 
757 // Test the function GetFinalRelevancyScore().
TEST_F(ScoredHistoryMatchTest,GetFinalRelevancyScore)758 TEST_F(ScoredHistoryMatchTest, GetFinalRelevancyScore) {
759   // relevance_buckets = "0.0:100,1.0:200,4.0:500,8.0:900,10.0:1000";
760   ScoredHistoryMatch::ScoreMaxRelevances relevance_buckets = {
761       {0.0, 100}, {1.0, 200}, {4.0, 500}, {8.0, 900}, {10.0, 1000}};
762   base::AutoReset<ScoredHistoryMatch::ScoreMaxRelevances*> tmp(
763       &ScoredHistoryMatch::relevance_buckets_override_, &relevance_buckets);
764 
765   // Check when topicality score is zero.
766   float topicality_score = 0.0;
767   float frequency_score = 10.0;
768   float specificity_score = 1.0;
769   // intermediate_score = 0.0 * 10.0 * 1.0 = 0.0.
770   EXPECT_EQ(0, ScoredHistoryMatch::GetFinalRelevancyScore(
771                    topicality_score, frequency_score, specificity_score));
772 
773   // Check when intermediate score falls at the border range.
774   topicality_score = 0.4f;
775   frequency_score = 10.0f;
776   // intermediate_score = 0.4 * 10.0 * 1.0 = 4.0.
777   EXPECT_EQ(500, ScoredHistoryMatch::GetFinalRelevancyScore(
778                      topicality_score, frequency_score, specificity_score));
779 
780   // Checking the score that falls into one of the buckets.
781   topicality_score = 0.5f;
782   frequency_score = 10.0f;
783   // intermediate_score = 0.5 * 10.0 * 1.0 = 5.0.
784   EXPECT_EQ(600,  // 500 + (((900 - 500)/(8 -4)) * 1) = 600.
785             ScoredHistoryMatch::GetFinalRelevancyScore(
786                 topicality_score, frequency_score, specificity_score));
787 
788   // Never give the score greater than maximum specified.
789   topicality_score = 0.5f;
790   frequency_score = 22.0f;
791   // intermediate_score = 0.5 * 22.0 * 1.0 = 11.0
792   EXPECT_EQ(1000, ScoredHistoryMatch::GetFinalRelevancyScore(
793                       topicality_score, frequency_score, specificity_score));
794 }
795 
796 // Test the function GetHQPBucketsFromString().
TEST_F(ScoredHistoryMatchTest,GetHQPBucketsFromString)797 TEST_F(ScoredHistoryMatchTest, GetHQPBucketsFromString) {
798   std::string buckets_str = "0.0:400,1.5:600,12.0:1300,20.0:1399";
799   std::vector<ScoredHistoryMatch::ScoreMaxRelevance> hqp_buckets =
800       ScoredHistoryMatch::GetHQPBucketsFromString(buckets_str);
801   EXPECT_THAT(hqp_buckets, ElementsAre(Pair(0.0, 400), Pair(1.5, 600),
802                                        Pair(12.0, 1300), Pair(20.0, 1399)));
803   // Test using an invalid string.
804   buckets_str = "0.0,400,1.5,600";
805   hqp_buckets = ScoredHistoryMatch::GetHQPBucketsFromString(buckets_str);
806   EXPECT_TRUE(hqp_buckets.empty());
807 }
808