1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/omnibox/browser/in_memory_url_index_types.h"
6 
7 #include <algorithm>
8 #include <functional>
9 #include <iterator>
10 #include <numeric>
11 #include <set>
12 
13 #include "base/i18n/break_iterator.h"
14 #include "base/i18n/case_conversion.h"
15 #include "base/strings/string_util.h"
16 #include "base/trace_event/memory_usage_estimator.h"
17 #include "components/omnibox/browser/tailored_word_break_iterator.h"
18 #include "net/base/escape.h"
19 
20 namespace {
21 // The maximum number of characters to consider from an URL and page title
22 // while matching user-typed terms.
23 const size_t kMaxSignificantChars = 200;
24 
String16VectorFromString16Internal(base::string16 word,size_t previous_postion,bool break_on_space,String16Vector * words,WordStarts * word_starts)25 void String16VectorFromString16Internal(base::string16 word,
26                                         size_t previous_postion,
27                                         bool break_on_space,
28                                         String16Vector* words,
29                                         WordStarts* word_starts) {
30   size_t initial_whitespace = 0;
31   if (break_on_space) {
32     base::string16 trimmed_word;
33     base::TrimWhitespace(word, base::TRIM_LEADING, &trimmed_word);
34     initial_whitespace = word.length() - trimmed_word.length();
35     base::TrimWhitespace(trimmed_word, base::TRIM_TRAILING, &word);
36   }
37   if (word.empty())
38     return;
39   words->push_back(word);
40   if (!word_starts)
41     return;
42   size_t word_start = previous_postion + initial_whitespace;
43   if (word_start < kMaxSignificantChars)
44     word_starts->push_back(word_start);
45 }
46 }
47 
48 // Matches within URL and Title Strings ----------------------------------------
49 
MatchTermsInString(const String16Vector & terms,const base::string16 & cleaned_string)50 TermMatches MatchTermsInString(const String16Vector& terms,
51                                const base::string16& cleaned_string) {
52   TermMatches matches;
53   for (size_t i = 0; i < terms.size(); ++i) {
54     TermMatches term_matches = MatchTermInString(terms[i], cleaned_string, i);
55     matches.insert(matches.end(), term_matches.begin(), term_matches.end());
56   }
57   return matches;
58 }
59 
MatchTermInString(const base::string16 & term,const base::string16 & cleaned_string,int term_num)60 TermMatches MatchTermInString(const base::string16& term,
61                               const base::string16& cleaned_string,
62                               int term_num) {
63   const size_t kMaxCompareLength = 2048;
64   const base::string16& short_string =
65       (cleaned_string.length() > kMaxCompareLength) ?
66       cleaned_string.substr(0, kMaxCompareLength) : cleaned_string;
67   TermMatches matches;
68   for (size_t location = short_string.find(term);
69        location != base::string16::npos;
70        location = short_string.find(term, location + 1))
71     matches.push_back(TermMatch(term_num, location, term.length()));
72   return matches;
73 }
74 
75 // Comparison function for sorting TermMatches by their offsets.
SortMatchComparator(const TermMatch & m1,const TermMatch & m2)76 bool SortMatchComparator(const TermMatch& m1, const TermMatch& m2) {
77   // Return the match that occurs first (smallest offset). In the case of a tie,
78   // return the longer match.
79   return m1.offset == m2.offset ? m1.length > m2.length : m1.offset < m2.offset;
80 }
81 
SortMatches(const TermMatches & matches)82 TermMatches SortMatches(const TermMatches& matches) {
83   TermMatches sorted_matches(matches);
84   std::sort(sorted_matches.begin(), sorted_matches.end(), SortMatchComparator);
85   return sorted_matches;
86 }
87 
88 // Assumes |sorted_matches| is already sorted.
DeoverlapMatches(const TermMatches & sorted_matches)89 TermMatches DeoverlapMatches(const TermMatches& sorted_matches) {
90   TermMatches out;
91   std::copy_if(
92       sorted_matches.begin(), sorted_matches.end(), std::back_inserter(out),
93       [&out](const TermMatch& match) {
94         return out.empty() ||
95                match.offset >= (out.back().offset + out.back().length); });
96   return out;
97 }
98 
OffsetsFromTermMatches(const TermMatches & matches)99 std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches) {
100   std::vector<size_t> offsets;
101   for (const auto& match : matches) {
102     offsets.push_back(match.offset);
103     offsets.push_back(match.offset + match.length);
104   }
105   return offsets;
106 }
107 
ReplaceOffsetsInTermMatches(const TermMatches & matches,const std::vector<size_t> & offsets)108 TermMatches ReplaceOffsetsInTermMatches(const TermMatches& matches,
109                                         const std::vector<size_t>& offsets) {
110   DCHECK_EQ(2 * matches.size(), offsets.size());
111   TermMatches new_matches;
112   auto offset_iter = offsets.begin();
113   for (auto term_iter = matches.begin(); term_iter != matches.end();
114        ++term_iter, ++offset_iter) {
115     const size_t starting_offset = *offset_iter;
116     ++offset_iter;
117     const size_t ending_offset = *offset_iter;
118     if ((starting_offset != base::string16::npos) &&
119         (ending_offset != base::string16::npos) &&
120         (starting_offset != ending_offset)) {
121       TermMatch new_match(*term_iter);
122       new_match.offset = starting_offset;
123       new_match.length = ending_offset - starting_offset;
124       new_matches.push_back(new_match);
125     }
126   }
127   return new_matches;
128 }
129 
130 // Utility Functions -----------------------------------------------------------
131 
String16SetFromString16(const base::string16 & cleaned_uni_string,WordStarts * word_starts)132 String16Set String16SetFromString16(const base::string16& cleaned_uni_string,
133                                     WordStarts* word_starts) {
134   String16Vector words =
135       String16VectorFromString16(cleaned_uni_string, false, word_starts);
136   for (auto& word : words)
137     word = base::i18n::ToLower(word).substr(0, kMaxSignificantChars);
138   return String16Set(std::make_move_iterator(words.begin()),
139                      std::make_move_iterator(words.end()));
140 }
141 
String16VectorFromString16(const base::string16 & cleaned_uni_string,bool break_on_space,WordStarts * word_starts)142 String16Vector String16VectorFromString16(
143     const base::string16& cleaned_uni_string,
144     bool break_on_space,
145     WordStarts* word_starts) {
146   if (word_starts)
147     word_starts->clear();
148   base::i18n::BreakIterator::BreakType break_mode =
149       break_on_space ? base::i18n::BreakIterator::BREAK_SPACE
150                      : base::i18n::BreakIterator::BREAK_WORD;
151   String16Vector words;
152   if (!break_on_space) {
153     TailoredWordBreakIterator iter(cleaned_uni_string, break_mode);
154     if (!iter.Init())
155       return words;
156     while (iter.Advance()) {
157       if (iter.IsWord()) {
158         String16VectorFromString16Internal(iter.GetString(), iter.prev(), false,
159                                            &words, word_starts);
160       }
161     }
162   } else {
163     base::i18n::BreakIterator iter(cleaned_uni_string, break_mode);
164     if (!iter.Init())
165       return words;
166     while (iter.Advance()) {
167       String16VectorFromString16Internal(iter.GetString(), iter.prev(), true,
168                                          &words, word_starts);
169     }
170   }
171   return words;
172 }
173 
Char16SetFromString16(const base::string16 & term)174 Char16Set Char16SetFromString16(const base::string16& term) {
175   return Char16Set(term.begin(), term.end());
176 }
177 
178 // HistoryInfoMapValue ---------------------------------------------------------
179 
180 HistoryInfoMapValue::HistoryInfoMapValue() = default;
181 HistoryInfoMapValue::HistoryInfoMapValue(const HistoryInfoMapValue& other) =
182     default;
183 HistoryInfoMapValue::HistoryInfoMapValue(HistoryInfoMapValue&& other) = default;
184 HistoryInfoMapValue& HistoryInfoMapValue::operator=(
185     const HistoryInfoMapValue& other) = default;
186 HistoryInfoMapValue& HistoryInfoMapValue::operator=(
187     HistoryInfoMapValue&& other) = default;
188 HistoryInfoMapValue::~HistoryInfoMapValue() = default;
189 
EstimateMemoryUsage() const190 size_t HistoryInfoMapValue::EstimateMemoryUsage() const {
191   return base::trace_event::EstimateMemoryUsage(url_row) +
192          base::trace_event::EstimateMemoryUsage(visits);
193 }
194 
195 // RowWordStarts ---------------------------------------------------------------
196 
197 RowWordStarts::RowWordStarts() = default;
198 RowWordStarts::RowWordStarts(const RowWordStarts& other) = default;
199 RowWordStarts::RowWordStarts(RowWordStarts&& other) = default;
200 RowWordStarts& RowWordStarts::operator=(const RowWordStarts& other) = default;
201 RowWordStarts& RowWordStarts::operator=(RowWordStarts&& other) = default;
202 RowWordStarts::~RowWordStarts() = default;
203 
EstimateMemoryUsage() const204 size_t RowWordStarts::EstimateMemoryUsage() const {
205   return base::trace_event::EstimateMemoryUsage(url_word_starts_) +
206          base::trace_event::EstimateMemoryUsage(title_word_starts_);
207 }
208 
Clear()209 void RowWordStarts::Clear() {
210   url_word_starts_.clear();
211   title_word_starts_.clear();
212 }
213