1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/omnibox/browser/scored_history_match.h"
6
7 #include <algorithm>
8 #include <memory>
9 #include <numeric>
10 #include <utility>
11
12 #include "base/auto_reset.h"
13 #include "base/bind.h"
14 #include "base/i18n/break_iterator.h"
15 #include "base/strings/string16.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/test/scoped_feature_list.h"
18 #include "build/build_config.h"
19 #include "components/omnibox/browser/omnibox_field_trial.h"
20 #include "components/omnibox/common/omnibox_features.h"
21 #include "components/search_engines/search_terms_data.h"
22 #include "testing/gmock/include/gmock/gmock.h"
23 #include "testing/gtest/include/gtest/gtest.h"
24
25 using base::ASCIIToUTF16;
26 using testing::ElementsAre;
27 using testing::Pair;
28
29 namespace {
30
31 // Returns a VisitInfoVector that includes |num_visits| spread over the
32 // last |frequency|*|num_visits| days (relative to |now|). A frequency of
33 // one means one visit each day, two means every other day, etc.
CreateVisitInfoVector(int num_visits,int frequency,base::Time now)34 VisitInfoVector CreateVisitInfoVector(int num_visits,
35 int frequency,
36 base::Time now) {
37 VisitInfoVector visits;
38 for (int i = 0; i < num_visits; ++i) {
39 visits.push_back(
40 std::make_pair(now - base::TimeDelta::FromDays(i * frequency),
41 ui::PAGE_TRANSITION_LINK));
42 }
43 return visits;
44 }
45
46 } // namespace
47
48 class ScoredHistoryMatchTest : public testing::Test {
49 protected:
50 // Convenience function to create a history::URLRow with basic data for |url|,
51 // |title|, |visit_count|, and |typed_count|. |days_since_last_visit| gives
52 // the number of days ago to which to set the URL's last_visit.
53 history::URLRow MakeURLRow(const char* url,
54 const char* title,
55 int visit_count,
56 int days_since_last_visit,
57 int typed_count);
58
59 // Convenience function to set the word starts information from a
60 // history::URLRow's URL and title.
61 void PopulateWordStarts(const history::URLRow& url_row,
62 RowWordStarts* word_starts);
63
64 // Convenience functions for easily creating vectors of search terms.
65 String16Vector Make1Term(const char* term) const;
66 String16Vector Make2Terms(const char* term_1, const char* term_2) const;
67
68 // Convenience function for GetTopicalityScore() that builds the term match
69 // and word break information automatically that are needed to call
70 // GetTopicalityScore().
71 float GetTopicalityScoreOfTermAgainstURLAndTitle(
72 const std::vector<const std::string>&,
73 const WordStarts term_word_starts,
74 const GURL& url,
75 const base::string16& title);
76 };
77
MakeURLRow(const char * url,const char * title,int visit_count,int days_since_last_visit,int typed_count)78 history::URLRow ScoredHistoryMatchTest::MakeURLRow(const char* url,
79 const char* title,
80 int visit_count,
81 int days_since_last_visit,
82 int typed_count) {
83 history::URLRow row(GURL(url), 0);
84 row.set_title(ASCIIToUTF16(title));
85 row.set_visit_count(visit_count);
86 row.set_typed_count(typed_count);
87 row.set_last_visit(base::Time::NowFromSystemTime() -
88 base::TimeDelta::FromDays(days_since_last_visit));
89 return row;
90 }
91
PopulateWordStarts(const history::URLRow & url_row,RowWordStarts * word_starts)92 void ScoredHistoryMatchTest::PopulateWordStarts(const history::URLRow& url_row,
93 RowWordStarts* word_starts) {
94 String16SetFromString16(ASCIIToUTF16(url_row.url().spec()),
95 &word_starts->url_word_starts_);
96 String16SetFromString16(url_row.title(), &word_starts->title_word_starts_);
97 }
98
Make1Term(const char * term) const99 String16Vector ScoredHistoryMatchTest::Make1Term(const char* term) const {
100 String16Vector original_terms;
101 original_terms.push_back(ASCIIToUTF16(term));
102 return original_terms;
103 }
104
Make2Terms(const char * term_1,const char * term_2) const105 String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1,
106 const char* term_2) const {
107 String16Vector original_terms;
108 original_terms.push_back(ASCIIToUTF16(term_1));
109 original_terms.push_back(ASCIIToUTF16(term_2));
110 return original_terms;
111 }
112
GetTopicalityScoreOfTermAgainstURLAndTitle(const std::vector<const std::string> & terms,const WordStarts term_word_starts,const GURL & url,const base::string16 & title)113 float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle(
114 const std::vector<const std::string>& terms,
115 const WordStarts term_word_starts,
116 const GURL& url,
117 const base::string16& title) {
118 String16Vector term_vector;
119 std::transform(terms.begin(), terms.end(), std::back_inserter(term_vector),
120 [](auto term) { return base::UTF8ToUTF16(term); });
121 std::string terms_joint =
122 std::accumulate(std::next(terms.begin()), terms.end(), terms[0],
123 [](std::string accumulator, std::string term) {
124 return accumulator + " " + term;
125 });
126 RowWordStarts row_word_starts;
127 base::string16 url_string = base::UTF8ToUTF16(url.spec());
128 String16SetFromString16(url_string, &row_word_starts.url_word_starts_);
129 String16SetFromString16(title, &row_word_starts.title_word_starts_);
130 auto row = history::URLRow(GURL(url));
131 row.set_title(title);
132 ScoredHistoryMatch scored_match(
133 row, VisitInfoVector(), base::UTF8ToUTF16(terms_joint), term_vector,
134 term_word_starts, row_word_starts, false, 1, base::Time::Max());
135 scored_match.topicality_threshold_ = -1;
136 return scored_match.GetTopicalityScore(term_vector.size(), url,
137 base::OffsetAdjuster::Adjustments(),
138 term_word_starts, row_word_starts);
139 }
140
TEST_F(ScoredHistoryMatchTest, Scoring) {
  // MakeURLRow() derives last-visit times from NowFromSystemTime(), so the
  // reference time used for scoring is taken from the same clock.
  const base::Time now = base::Time::NowFromSystemTime();
  const WordStarts one_word_no_offset(1, 0u);
  const String16Vector abc_terms = Make1Term("abc");

  // Baseline: three visits spaced 30 days apart, the most recent one typed.
  history::URLRow row_a(MakeURLRow("http://fedcba", "abcd bcd", 3, 30, 1));
  RowWordStarts word_starts_a;
  PopulateWordStarts(row_a, &word_starts_a);
  VisitInfoVector visits_a = CreateVisitInfoVector(3, 30, now);
  visits_a.front().second = ui::PAGE_TRANSITION_TYPED;
  ScoredHistoryMatch scored_a(row_a, visits_a, ASCIIToUTF16("abc"), abc_terms,
                              one_word_no_offset, word_starts_a, false, 1,
                              now);

  // A higher visit_count should beat the baseline.
  history::URLRow row_b(MakeURLRow("http://abcdef", "abcd bcd", 10, 30, 1));
  RowWordStarts word_starts_b;
  PopulateWordStarts(row_b, &word_starts_b);
  VisitInfoVector visits_b = CreateVisitInfoVector(10, 30, now);
  visits_b.front().second = ui::PAGE_TRANSITION_TYPED;
  ScoredHistoryMatch scored_b(row_b, visits_b, ASCIIToUTF16("abc"), abc_terms,
                              one_word_no_offset, word_starts_b, false, 1,
                              now);
  EXPECT_GT(scored_b.raw_score, scored_a.raw_score);

  // More closely spaced (hence more recent) visits should beat the baseline.
  history::URLRow row_c(MakeURLRow("http://abcdef", "abcd bcd", 3, 10, 1));
  RowWordStarts word_starts_c;
  PopulateWordStarts(row_c, &word_starts_c);
  VisitInfoVector visits_c = CreateVisitInfoVector(3, 10, now);
  visits_c.front().second = ui::PAGE_TRANSITION_TYPED;
  ScoredHistoryMatch scored_c(row_c, visits_c, ASCIIToUTF16("abc"), abc_terms,
                              one_word_no_offset, word_starts_c, false, 1,
                              now);
  EXPECT_GT(scored_c.raw_score, scored_a.raw_score);

  // A higher typed_count (all three visits typed) should beat the baseline.
  history::URLRow row_d(MakeURLRow("http://abcdef", "abcd bcd", 3, 30, 3));
  RowWordStarts word_starts_d;
  PopulateWordStarts(row_d, &word_starts_d);
  VisitInfoVector visits_d = CreateVisitInfoVector(3, 30, now);
  for (auto& visit : visits_d)
    visit.second = ui::PAGE_TRANSITION_TYPED;
  ScoredHistoryMatch scored_d(row_d, visits_d, ASCIIToUTF16("abc"), abc_terms,
                              one_word_no_offset, word_starts_d, false, 1,
                              now);
  EXPECT_GT(scored_d.raw_score, scored_a.raw_score);

  // A term that appears many times in the URL and title must stay below 1400.
  history::URLRow row_e(MakeURLRow(
      "http://csi.csi.csi/csi_csi",
      "CSI Guide to CSI Las Vegas, CSI New York, CSI Provo", 3, 30, 3));
  RowWordStarts word_starts_e;
  PopulateWordStarts(row_e, &word_starts_e);
  const VisitInfoVector visits_e(visits_d);
  ScoredHistoryMatch scored_e(row_e, visits_e, ASCIIToUTF16("csi"),
                              Make1Term("csi"), one_word_no_offset,
                              word_starts_e, false, 1, now);
  EXPECT_LT(scored_e.raw_score, 1400);

  // A result whose only match is mid-term (i.e., not at a word boundary)
  // scores 0.
  ScoredHistoryMatch scored_f(row_a, visits_a, ASCIIToUTF16("cd"),
                              Make1Term("cd"), one_word_no_offset,
                              word_starts_a, false, 1, now);
  EXPECT_EQ(scored_f.raw_score, 0);
}
211
TEST_F(ScoredHistoryMatchTest, ScoringBookmarks) {
  // MakeURLRow() derives last-visit times from NowFromSystemTime(), so the
  // reference time used for scoring is taken from the same clock.
  const base::Time now = base::Time::NowFromSystemTime();

  history::URLRow row(MakeURLRow("http://fedcba", "abcd bcd", 8, 3, 1));
  RowWordStarts word_starts;
  PopulateWordStarts(row, &word_starts);
  const WordStarts one_word_no_offset(1, 0u);
  const VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);

  // Score the row as a non-bookmarked URL first.
  ScoredHistoryMatch scored(row, visits, ASCIIToUTF16("abc"), Make1Term("abc"),
                            one_word_no_offset, word_starts, false, 1, now);

  // With bookmark_value_ temporarily raised to 5, the same row scored as a
  // bookmarked URL must come out higher.
  base::AutoReset<float> reset(&ScoredHistoryMatch::bookmark_value_, 5);
  ScoredHistoryMatch scored_with_bookmark(row, visits, ASCIIToUTF16("abc"),
                                          Make1Term("abc"), one_word_no_offset,
                                          word_starts, true, 1, now);
  EXPECT_GT(scored_with_bookmark.raw_score, scored.raw_score);
}
233
TEST_F(ScoredHistoryMatchTest, ScoringTLD) {
  // MakeURLRow() derives last-visit times from NowFromSystemTime(), so the
  // reference time used for scoring is taken from the same clock.
  const base::Time now = base::Time::NowFromSystemTime();

  history::URLRow row(MakeURLRow("http://fedcba.com/", "", 8, 3, 1));
  RowWordStarts word_starts;
  PopulateWordStarts(row, &word_starts);
  const WordStarts two_words_no_offsets(2, 0u);
  const VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);

  // By default, a TLD match should not contribute to the suggestion score;
  // the "fed" match alone still yields a positive score.
  ScoredHistoryMatch scored(row, visits, ASCIIToUTF16("fed com"),
                            Make2Terms("fed", "com"), two_words_no_offsets,
                            word_starts, false, 1, now);
  EXPECT_GT(scored.raw_score, 0);

  // Now allow credit for the match in the TLD; the score must increase.
  base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_tld_matches_, true);
  ScoredHistoryMatch scored_with_tld(
      row, visits, ASCIIToUTF16("fed com"), Make2Terms("fed", "com"),
      two_words_no_offsets, word_starts, false, 1, now);
  EXPECT_GT(scored_with_tld.raw_score, 0);

  EXPECT_GT(scored_with_tld.raw_score, scored.raw_score);
}
261
TEST_F(ScoredHistoryMatchTest, ScoringScheme) {
  // MakeURLRow() derives last-visit times from NowFromSystemTime(), so the
  // reference time used for scoring is taken from the same clock.
  const base::Time now = base::Time::NowFromSystemTime();

  history::URLRow row(MakeURLRow("http://fedcba/", "", 8, 3, 1));
  RowWordStarts word_starts;
  PopulateWordStarts(row, &word_starts);
  const WordStarts two_words_no_offsets(2, 0u);
  const VisitInfoVector visits = CreateVisitInfoVector(8, 3, now);

  // By default, a scheme match should not contribute to the suggestion
  // score; the "fed" match alone still yields a positive score.
  ScoredHistoryMatch scored(row, visits, ASCIIToUTF16("fed http"),
                            Make2Terms("fed", "http"), two_words_no_offsets,
                            word_starts, false, 1, now);
  EXPECT_GT(scored.raw_score, 0);

  // Now allow credit for the match in the scheme; the score must increase.
  base::AutoReset<bool> reset(&ScoredHistoryMatch::allow_scheme_matches_, true);
  ScoredHistoryMatch scored_with_scheme(
      row, visits, ASCIIToUTF16("fed http"), Make2Terms("fed", "http"),
      two_words_no_offsets, word_starts, false, 1, now);
  EXPECT_GT(scored_with_scheme.raw_score, 0);

  EXPECT_GT(scored_with_scheme.raw_score, scored.raw_score);
}
289
TEST_F(ScoredHistoryMatchTest, MatchURLComponents) {
  // MakeURLRow() derives last-visit times from NowFromSystemTime(), so the
  // reference time used for scoring is taken from the same clock.
  const base::Time now = base::Time::NowFromSystemTime();
  RowWordStarts word_starts;
  const WordStarts one_word_no_offset(1, 0u);
  const VisitInfoVector visits;

  // Scores |term| as a single-word input against |row| (using the current
  // contents of |word_starts|) and checks the match_in_scheme and
  // match_in_subdomain flags of the result. The trace names the URL and term
  // so that a failure inside this shared helper can be attributed.
  auto expect_components = [&](const history::URLRow& row, const char* term,
                               bool in_scheme, bool in_subdomain) {
    SCOPED_TRACE(row.url().spec() + " / " + term);
    ScoredHistoryMatch scored(row, visits, ASCIIToUTF16(term), Make1Term(term),
                              one_word_no_offset, word_starts, false, 1, now);
    EXPECT_EQ(in_scheme, scored.match_in_scheme);
    EXPECT_EQ(in_subdomain, scored.match_in_subdomain);
  };

  {
    history::URLRow row(
        MakeURLRow("http://www.google.com", "abcdef", 3, 30, 1));
    PopulateWordStarts(row, &word_starts);
    expect_components(row, "g", false, false);
    expect_components(row, "w", false, true);
    expect_components(row, "h", true, false);
    expect_components(row, "o", false, false);
  }

  {
    history::URLRow row(MakeURLRow("http://teams.foo.com", "abcdef", 3, 30, 1));
    PopulateWordStarts(row, &word_starts);
    expect_components(row, "t", false, true);
    expect_components(row, "f", false, false);
    expect_components(row, "o", false, false);
  }

  {
    history::URLRow row(MakeURLRow("http://en.m.foo.com", "abcdef", 3, 30, 1));
    PopulateWordStarts(row, &word_starts);
    expect_components(row, "e", false, true);
    expect_components(row, "m", false, true);
    expect_components(row, "f", false, false);
  }

  {
    history::URLRow row(
        MakeURLRow("https://www.testing.com/xxx?yyy#zzz", "abcdef", 3, 30, 1));
    PopulateWordStarts(row, &word_starts);
    expect_components(row, "t", false, false);
    expect_components(row, "h", true, false);
    expect_components(row, "w", false, true);
    expect_components(row, "x", false, false);
    expect_components(row, "y", false, false);
    expect_components(row, "z", false, false);
    expect_components(row, "https://www", true, true);
    expect_components(row, "testing.com/x", false, false);
    expect_components(row, "https://www.testing.com/x", true, true);
  }

  {
    history::URLRow row(
        MakeURLRow("http://www.xn--1lq90ic7f1rc.cn/xnblah", "abcd", 3, 30, 1));
    PopulateWordStarts(row, &word_starts);
    expect_components(row, "x", false, false);
    expect_components(row, "xn", false, false);
    expect_components(row, "w", false, true);
  }
}
419
// A hostname match must score the same whether or not the URL was written
// with a trailing slash.
TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) {
  const base::string16 title = ASCIIToUTF16("Non-Matching Title");

  const float with_slash = GetTopicalityScoreOfTermAgainstURLAndTitle(
      {"def"}, {0}, GURL("http://abc.def.com/"), title);
  const float without_slash = GetTopicalityScoreOfTermAgainstURLAndTitle(
      {"def"}, {0}, GURL("http://abc.def.com"), title);

  EXPECT_EQ(without_slash, with_slash);
}
429
TEST_F(ScoredHistoryMatchTest, FilterMatches) {
  // For ease in interpreting this test, imagine the URL
  //      http://test.com/default/foo.aspxhome/hello.html.
  //      012345678901234567890123456789012345678901234567
  //                1         2         3         4
  // We test how FilterTermMatchesByWordStarts() reacts to various
  // one-character inputs.
  const WordStarts terms_to_word_starts_offsets = {0};
  const WordStarts word_starts = {0, 7, 12, 16, 24, 28, 37, 43};

  // Check that "h" matches "http", "hello", and "html" but not "aspxhome" when
  // asked to filter non-word-start matches after the hostname. The "15" in
  // the filter call below is the position of the "/" ending the hostname.
  TermMatches term_matches = {TermMatch(0, 0, 1), TermMatch(0, 32, 1),
                              TermMatch(0, 37, 1), TermMatch(0, 43, 1)};
  TermMatches filtered_term_matches =
      ScoredHistoryMatch::FilterTermMatchesByWordStarts(
          term_matches, terms_to_word_starts_offsets, word_starts, 15,
          std::string::npos);
  ASSERT_EQ(3u, filtered_term_matches.size());
  EXPECT_EQ(0u, filtered_term_matches[0].offset);
  EXPECT_EQ(37u, filtered_term_matches[1].offset);
  EXPECT_EQ(43u, filtered_term_matches[2].offset);
  // The "http" match should remain after removing the mid-word matches in the
  // scheme. The "4" is the position of the ":" character ending the scheme.
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      filtered_term_matches, terms_to_word_starts_offsets, word_starts, 0, 4);
  ASSERT_EQ(3u, filtered_term_matches.size());
  EXPECT_EQ(0u, filtered_term_matches[0].offset);
  EXPECT_EQ(37u, filtered_term_matches[1].offset);
  EXPECT_EQ(43u, filtered_term_matches[2].offset);

  // Check that "t" matches "http" twice and "test" twice but not "default" or
  // "html" when asked to filter non-word-start matches after the hostname.
  term_matches = {TermMatch(0, 1, 1),  TermMatch(0, 2, 1),
                  TermMatch(0, 7, 1),  TermMatch(0, 10, 1),
                  TermMatch(0, 22, 1), TermMatch(0, 45, 1)};
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      term_matches, terms_to_word_starts_offsets, word_starts, 15,
      std::string::npos);
  ASSERT_EQ(4u, filtered_term_matches.size());
  EXPECT_EQ(1u, filtered_term_matches[0].offset);
  EXPECT_EQ(2u, filtered_term_matches[1].offset);
  EXPECT_EQ(7u, filtered_term_matches[2].offset);
  EXPECT_EQ(10u, filtered_term_matches[3].offset);
  // The "http" matches should disappear after removing mid-word matches in the
  // scheme.
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      filtered_term_matches, terms_to_word_starts_offsets, word_starts, 0, 4);
  ASSERT_EQ(2u, filtered_term_matches.size());
  EXPECT_EQ(7u, filtered_term_matches[0].offset);
  EXPECT_EQ(10u, filtered_term_matches[1].offset);

  // Check that "e" matches "test" but not "default" or "hello" when asked to
  // filter non-word-start matches after the hostname.
  term_matches = {TermMatch(0, 8, 1), TermMatch(0, 17, 1),
                  TermMatch(0, 38, 1)};
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      term_matches, terms_to_word_starts_offsets, word_starts, 15,
      std::string::npos);
  ASSERT_EQ(1u, filtered_term_matches.size());
  EXPECT_EQ(8u, filtered_term_matches[0].offset);

  // Check that "d" matches "default" when asked to filter non-word-start
  // matches after the hostname.
  term_matches = {TermMatch(0, 16, 1)};
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      term_matches, terms_to_word_starts_offsets, word_starts, 15,
      std::string::npos);
  ASSERT_EQ(1u, filtered_term_matches.size());
  EXPECT_EQ(16u, filtered_term_matches[0].offset);

  // Check that "a" matches "aspxhome" but not "default" when asked to filter
  // non-word-start matches after the hostname.
  term_matches = {TermMatch(0, 19, 1), TermMatch(0, 28, 1)};
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      term_matches, terms_to_word_starts_offsets, word_starts, 15,
      std::string::npos);
  ASSERT_EQ(1u, filtered_term_matches.size());
  EXPECT_EQ(28u, filtered_term_matches[0].offset);

  // Check that ".a" matches "aspxhome", i.e., that we recognize that it
  // is a valid match at a word break. To recognize this,
  // |terms_to_word_starts_offsets| must record that the "word" in this term
  // starts at the second character.
  term_matches = {TermMatch(0, 27, 1)};
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      term_matches, /*terms_to_word_starts_offsets*/ {1}, word_starts, 15,
      std::string::npos);
  ASSERT_EQ(1u, filtered_term_matches.size());
  EXPECT_EQ(27u, filtered_term_matches[0].offset);

  // Check "de" + "fa" + "lt" matches "defa" when |allow_midword_continuations|
  // is true.
  term_matches = {TermMatch(0, 16, 2), TermMatch(1, 18, 2),
                  TermMatch(2, 21, 2)};
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      term_matches, {0, 0, 0}, word_starts, 15, std::string::npos, true);
  ASSERT_EQ(2u, filtered_term_matches.size());
  EXPECT_EQ(16u, filtered_term_matches[0].offset);
  EXPECT_EQ(18u, filtered_term_matches[1].offset);

  // Check "de" + "fa" + "lt" matches "de" when |allow_midword_continuations| is
  // false. The input matches are unchanged from the previous case.
  filtered_term_matches = ScoredHistoryMatch::FilterTermMatchesByWordStarts(
      term_matches, {0, 0, 0}, word_starts, 15, std::string::npos, false);
  ASSERT_EQ(1u, filtered_term_matches.size());
  EXPECT_EQ(16u, filtered_term_matches[0].offset);
}
567
TEST_F(ScoredHistoryMatchTest, GetFrequency) {
  // A throwaway ScoredHistoryMatch that every GetFrequency() call below
  // reuses.
  history::URLRow row(GURL("http://foo"));
  RowWordStarts row_word_starts;
  PopulateWordStarts(row, &row_word_starts);
  const base::Time now = base::Time::Max();
  VisitInfoVector visits;
  ScoredHistoryMatch match(row, visits, ASCIIToUTF16("foo"), Make1Term("foo"),
                           WordStarts{0}, row_word_starts, false, 1, now);

  // Baseline: a single untyped visit.
  visits = {{now, ui::PAGE_TRANSITION_LINK}};
  const float one_untyped_score = match.GetFrequency(now, false, visits);

  // A single typed visit should score higher than the untyped one.
  visits = VisitInfoVector{{now, ui::PAGE_TRANSITION_TYPED}};
  const float one_typed_score = match.GetFrequency(now, false, visits);
  EXPECT_GT(one_typed_score, one_untyped_score);

  // A transition qualifier on the typed visit should not change the score.
  const auto typed_with_redirect = ui::PageTransitionFromInt(
      ui::PAGE_TRANSITION_TYPED | ui::PAGE_TRANSITION_SERVER_REDIRECT);
  visits = {{now, typed_with_redirect}};
  EXPECT_EQ(one_typed_score, match.GetFrequency(now, false, visits));

  // One untyped visit to a bookmarked page should score at least as high as
  // one untyped visit to a non-bookmarked page.
  visits = {{now, ui::PAGE_TRANSITION_LINK}};
  EXPECT_GE(match.GetFrequency(now, true, visits), one_untyped_score);

  // Now consider pages visited twice, with one visit being typed and one
  // untyped. Two visits should score higher than the single typed visit.
  visits = {{now, ui::PAGE_TRANSITION_TYPED},
            {now - base::TimeDelta::FromDays(1), ui::PAGE_TRANSITION_LINK}};
  const float two_visits_score = match.GetFrequency(now, false, visits);
  EXPECT_GT(two_visits_score, one_typed_score);

  // Adding a third, untyped visit should raise the score again.
  visits.emplace_back(now - base::TimeDelta::FromDays(2),
                      ui::PAGE_TRANSITION_LINK);
  const float three_visits_score = match.GetFrequency(now, false, visits);
  EXPECT_GT(three_visits_score, two_visits_score);

  // When only the two most recent visits are to be considered, the score
  // should match the two-visit case regardless of what the third visit is.
  {
    base::AutoReset<size_t> tmp1(&ScoredHistoryMatch::max_visits_to_score_, 2);
    EXPECT_EQ(two_visits_score, match.GetFrequency(now, false, visits));

    // Check again with the third visit being typed.
    visits.back().second = ui::PAGE_TRANSITION_TYPED;
    EXPECT_EQ(two_visits_score, match.GetFrequency(now, false, visits));
  }
}
627
TEST_F(ScoredHistoryMatchTest, GetDocumentSpecificityScore) {
  // A throwaway ScoredHistoryMatch that every call below reuses.
  history::URLRow row(GURL("http://foo"));
  RowWordStarts row_word_starts;
  PopulateWordStarts(row, &row_word_starts);
  const base::Time now = base::Time::Max();
  VisitInfoVector visits;
  ScoredHistoryMatch match(row, visits, ASCIIToUTF16("foo"), Make1Term("foo"),
                           WordStarts{0}, row_word_starts, false, 1, now);

  // Expectations with no override installed.
  EXPECT_EQ(3.0, match.GetDocumentSpecificityScore(1));
  EXPECT_EQ(1.0, match.GetDocumentSpecificityScore(5));
  EXPECT_EQ(1.0, match.GetDocumentSpecificityScore(50));

  // Point the override at a local table for the rest of the test; the table
  // is reassigned below to exercise different mappings.
  OmniboxFieldTrial::NumMatchesScores matches_to_specificity;
  base::AutoReset<OmniboxFieldTrial::NumMatchesScores*> tmp(
      &ScoredHistoryMatch::matches_to_specificity_override_,
      &matches_to_specificity);

  // Single-entry table.
  matches_to_specificity = {{1, 3.0}};
  EXPECT_EQ(3.0, match.GetDocumentSpecificityScore(1));
  EXPECT_EQ(1.0, match.GetDocumentSpecificityScore(5));

  // Two-entry table.
  matches_to_specificity = {{1, 3.0}, {3, 1.5}};
  EXPECT_EQ(3.0, match.GetDocumentSpecificityScore(1));
  EXPECT_EQ(1.5, match.GetDocumentSpecificityScore(2));
  EXPECT_EQ(1.5, match.GetDocumentSpecificityScore(3));
  EXPECT_EQ(1.0, match.GetDocumentSpecificityScore(4));
}
657
658 // This function only tests scoring of single terms that match exactly
659 // once somewhere in the URL or title.
TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) {
  GURL url("http://abc.def.com/path1/path2?arg1=val1&arg2=val2#hash_fragment");
  base::string16 title = ASCIIToUTF16("here is a - title");

  // Scores |terms| against the fixed |url| and |title| above. |word_starts|
  // is forwarded untouched; judging by the inputs below it appears to give the
  // offset within each term at which its first word begins (e.g. 3 for
  // "://abc") -- TODO confirm against the helper's definition.
  auto score_term = [&](const std::vector<const std::string>& terms,
                        const WordStarts& word_starts) {
    return GetTopicalityScoreOfTermAgainstURLAndTitle(terms, word_starts, url,
                                                      title);
  };

  // Hostname ("abc"): whole-word prefix, mid-word, and with leading "://".
  const float host_word_start = score_term({"abc"}, {0});
  const float host_mid_word = score_term({"bc"}, {0});
  const float host_after_punctuation = score_term({"://abc"}, {3});

  // Domain name ("def").
  const float domain_word_start = score_term({"def"}, {0});
  const float domain_mid_word = score_term({"ef"}, {0});
  const float domain_after_dot = score_term({".def"}, {1});

  // Top-level domain ("com").
  const float tld_word_start = score_term({"com"}, {0});
  const float tld_mid_word = score_term({"om"}, {0});
  const float tld_after_dot = score_term({".com"}, {1});

  // Path ("path1").
  const float path_word_start = score_term({"path1"}, {0});
  const float path_mid_word = score_term({"ath1"}, {0});
  const float path_after_slash = score_term({"/path1"}, {1});

  // Query argument ("arg1").
  const float arg_word_start = score_term({"arg1"}, {0});
  const float arg_mid_word = score_term({"rg1"}, {0});
  const float arg_after_question_mark = score_term({"?arg1"}, {1});

  // Protocol ("http").
  const float protocol_word_start = score_term({"htt"}, {0});
  const float protocol_mid_word = score_term({"tt"}, {0});

  // Title ("here is a - title"); the last term is pure punctuation.
  const float title_word_start = score_term({"her"}, {0});
  const float title_mid_word = score_term({"er"}, {0});
  const float title_wordless_mid_word = score_term({"-"}, {1});

  // Word-boundary ranking: hostname and domain name beat the path, which in
  // turn beats the query argument.
  EXPECT_GT(host_word_start, path_word_start);
  EXPECT_GT(domain_word_start, path_word_start);
  EXPECT_GT(path_word_start, arg_word_start);

  // Punctuation in front of the term must not change the score.
  EXPECT_EQ(host_word_start, host_after_punctuation);
  EXPECT_EQ(domain_word_start, domain_after_dot);
  EXPECT_EQ(tld_word_start, tld_after_dot);
  EXPECT_EQ(path_word_start, path_after_slash);
  EXPECT_EQ(arg_word_start, arg_after_question_mark);

  // The same hostname/domain-over-path/arg ordering holds for matches that
  // do not begin at a word boundary.
  EXPECT_GT(host_mid_word, path_mid_word);
  EXPECT_GT(domain_mid_word, path_mid_word);
  EXPECT_GT(domain_mid_word, arg_mid_word);
  EXPECT_GT(host_mid_word, arg_mid_word);

  // Matches away from a word boundary score below matches at one. Checking
  // these three combinations is enough.
  EXPECT_GT(arg_word_start, host_mid_word);
  EXPECT_GT(arg_word_start, domain_mid_word);
  EXPECT_GT(title_word_start, title_mid_word);

  // A mid-word match is worth 0 unless it falls in the hostname or domain
  // name, or the matched term contains no words at all.
  EXPECT_GT(host_mid_word, 0);
  EXPECT_GT(domain_mid_word, 0);
  EXPECT_EQ(tld_mid_word, 0);
  EXPECT_EQ(path_mid_word, 0);
  EXPECT_EQ(arg_mid_word, 0);
  EXPECT_EQ(protocol_mid_word, 0);
  EXPECT_EQ(title_mid_word, 0);
  EXPECT_GT(title_wordless_mid_word, 0);

  // Title matches should land in a sensible spot relative to URL matches.
  EXPECT_GT(title_word_start, arg_word_start);
  EXPECT_GT(arg_word_start, title_mid_word);

  // Protocol and top-level-domain (.com, .net, ...) matches rank below even
  // the mid-word matches that do earn a score.
  EXPECT_GT(host_mid_word, protocol_word_start);
  EXPECT_GT(host_mid_word, protocol_mid_word);
  EXPECT_GT(host_mid_word, tld_word_start);
  EXPECT_GT(host_mid_word, tld_mid_word);
}
731
TEST_F(ScoredHistoryMatchTest, GetTopicalityScore_MidwordMatching) {
  GURL url("http://abc.def.com/path1/path2?arg1=val1&arg2=val2#hash_fragment");
  base::string16 title = ASCIIToUTF16("here is a - title");

  // Scores a list of terms (with their word-start offsets) against the fixed
  // |url| and |title| above.
  auto score_terms = [&](const std::vector<const std::string>& terms,
                         const WordStarts& word_starts) {
    return GetTopicalityScoreOfTermAgainstURLAndTitle(terms, word_starts, url,
                                                      title);
  };

  // All terms target "fragment" in the URL's "#hash_fragment":
  //   "frag"          - the leading part of "fragment".
  //   "ment"          - the trailing part, immediately following "frag".
  //   "frag" + "ment" - the two parts are adjacent, covering "fragment".
  //   "frag" + "ent"  - the unmatched "m" leaves a gap between the parts.
  const float frag_alone = score_terms({"frag"}, {0u});
  const float ment_alone = score_terms({"ment"}, {0u});
  const float frag_then_ment = score_terms({"frag", "ment"}, {0u, 0u});
  const float frag_then_ent = score_terms({"frag", "ent"}, {0u, 0u});

  // A match at the start of a word earns a score.
  EXPECT_GT(frag_alone, 0);
  // A lone mid-word match, not attached to any other match, earns nothing.
  EXPECT_EQ(ment_alone, 0);
  // Either pairing still scores, thanks to the "frag" match.
  EXPECT_GT(frag_then_ment, 0);
  EXPECT_GT(frag_then_ent, 0);
  // The mid-word term only adds to the score when it continues the preceding
  // match; the disjoint "ent" contributes nothing extra.
  EXPECT_GT(frag_then_ment, frag_then_ent);
}
756
757 // Test the function GetFinalRelevancyScore().
TEST_F(ScoredHistoryMatchTest, GetFinalRelevancyScore) {
  // Equivalent to the bucket string
  // "0.0:100,1.0:200,4.0:500,8.0:900,10.0:1000".
  ScoredHistoryMatch::ScoreMaxRelevances buckets = {
      {0.0, 100}, {1.0, 200}, {4.0, 500}, {8.0, 900}, {10.0, 1000}};
  // Point the override at |buckets| for the duration of this test only.
  base::AutoReset<ScoredHistoryMatch::ScoreMaxRelevances*> scoped_override(
      &ScoredHistoryMatch::relevance_buckets_override_, &buckets);

  // Specificity stays at 1.0 throughout, so each case's intermediate score
  // is simply topicality * frequency.
  constexpr float kSpecificity = 1.0f;

  // Zero topicality: intermediate score = 0.0 * 10.0 * 1.0 = 0.0.
  EXPECT_EQ(0, ScoredHistoryMatch::GetFinalRelevancyScore(0.0f, 10.0f,
                                                          kSpecificity));

  // Exactly on a bucket boundary: 0.4 * 10.0 * 1.0 = 4.0.
  EXPECT_EQ(500, ScoredHistoryMatch::GetFinalRelevancyScore(0.4f, 10.0f,
                                                            kSpecificity));

  // Between two buckets: 0.5 * 10.0 * 1.0 = 5.0, one unit past the
  // (4.0, 500) bucket on the way to (8.0, 900):
  //   500 + ((900 - 500) / (8 - 4)) * 1 = 600.
  EXPECT_EQ(600, ScoredHistoryMatch::GetFinalRelevancyScore(0.5f, 10.0f,
                                                            kSpecificity));

  // Past the last bucket: 0.5 * 22.0 * 1.0 = 11.0. The result must never
  // exceed the largest configured relevance.
  EXPECT_EQ(1000, ScoredHistoryMatch::GetFinalRelevancyScore(0.5f, 22.0f,
                                                             kSpecificity));
}
795
796 // Test the function GetHQPBucketsFromString().
TEST_F(ScoredHistoryMatchTest, GetHQPBucketsFromString) {
  // A well-formed string of comma-separated "score:relevance" pairs yields
  // one bucket per pair, in the order given.
  const std::string well_formed = "0.0:400,1.5:600,12.0:1300,20.0:1399";
  const std::vector<ScoredHistoryMatch::ScoreMaxRelevance> parsed_buckets =
      ScoredHistoryMatch::GetHQPBucketsFromString(well_formed);
  EXPECT_THAT(parsed_buckets,
              ElementsAre(Pair(0.0, 400), Pair(1.5, 600), Pair(12.0, 1300),
                          Pair(20.0, 1399)));

  // A string without any ':' separators is invalid and yields no buckets.
  const std::string malformed = "0.0,400,1.5,600";
  const std::vector<ScoredHistoryMatch::ScoreMaxRelevance> rejected_buckets =
      ScoredHistoryMatch::GetHQPBucketsFromString(malformed);
  EXPECT_TRUE(rejected_buckets.empty());
}
808