1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/omnibox/browser/in_memory_url_index_types.h"
6
7 #include <algorithm>
8 #include <functional>
9 #include <iterator>
10 #include <numeric>
11 #include <set>
12
13 #include "base/i18n/break_iterator.h"
14 #include "base/i18n/case_conversion.h"
15 #include "base/strings/string_util.h"
16 #include "base/trace_event/memory_usage_estimator.h"
17 #include "components/omnibox/browser/tailored_word_break_iterator.h"
18 #include "net/base/escape.h"
19
20 namespace {
21 // The maximum number of characters to consider from an URL and page title
22 // while matching user-typed terms.
23 const size_t kMaxSignificantChars = 200;
24
String16VectorFromString16Internal(base::string16 word,size_t previous_postion,bool break_on_space,String16Vector * words,WordStarts * word_starts)25 void String16VectorFromString16Internal(base::string16 word,
26 size_t previous_postion,
27 bool break_on_space,
28 String16Vector* words,
29 WordStarts* word_starts) {
30 size_t initial_whitespace = 0;
31 if (break_on_space) {
32 base::string16 trimmed_word;
33 base::TrimWhitespace(word, base::TRIM_LEADING, &trimmed_word);
34 initial_whitespace = word.length() - trimmed_word.length();
35 base::TrimWhitespace(trimmed_word, base::TRIM_TRAILING, &word);
36 }
37 if (word.empty())
38 return;
39 words->push_back(word);
40 if (!word_starts)
41 return;
42 size_t word_start = previous_postion + initial_whitespace;
43 if (word_start < kMaxSignificantChars)
44 word_starts->push_back(word_start);
45 }
46 }
47
48 // Matches within URL and Title Strings ----------------------------------------
49
MatchTermsInString(const String16Vector & terms,const base::string16 & cleaned_string)50 TermMatches MatchTermsInString(const String16Vector& terms,
51 const base::string16& cleaned_string) {
52 TermMatches matches;
53 for (size_t i = 0; i < terms.size(); ++i) {
54 TermMatches term_matches = MatchTermInString(terms[i], cleaned_string, i);
55 matches.insert(matches.end(), term_matches.begin(), term_matches.end());
56 }
57 return matches;
58 }
59
MatchTermInString(const base::string16 & term,const base::string16 & cleaned_string,int term_num)60 TermMatches MatchTermInString(const base::string16& term,
61 const base::string16& cleaned_string,
62 int term_num) {
63 const size_t kMaxCompareLength = 2048;
64 const base::string16& short_string =
65 (cleaned_string.length() > kMaxCompareLength) ?
66 cleaned_string.substr(0, kMaxCompareLength) : cleaned_string;
67 TermMatches matches;
68 for (size_t location = short_string.find(term);
69 location != base::string16::npos;
70 location = short_string.find(term, location + 1))
71 matches.push_back(TermMatch(term_num, location, term.length()));
72 return matches;
73 }
74
75 // Comparison function for sorting TermMatches by their offsets.
SortMatchComparator(const TermMatch & m1,const TermMatch & m2)76 bool SortMatchComparator(const TermMatch& m1, const TermMatch& m2) {
77 // Return the match that occurs first (smallest offset). In the case of a tie,
78 // return the longer match.
79 return m1.offset == m2.offset ? m1.length > m2.length : m1.offset < m2.offset;
80 }
81
SortMatches(const TermMatches & matches)82 TermMatches SortMatches(const TermMatches& matches) {
83 TermMatches sorted_matches(matches);
84 std::sort(sorted_matches.begin(), sorted_matches.end(), SortMatchComparator);
85 return sorted_matches;
86 }
87
88 // Assumes |sorted_matches| is already sorted.
DeoverlapMatches(const TermMatches & sorted_matches)89 TermMatches DeoverlapMatches(const TermMatches& sorted_matches) {
90 TermMatches out;
91 std::copy_if(
92 sorted_matches.begin(), sorted_matches.end(), std::back_inserter(out),
93 [&out](const TermMatch& match) {
94 return out.empty() ||
95 match.offset >= (out.back().offset + out.back().length); });
96 return out;
97 }
98
OffsetsFromTermMatches(const TermMatches & matches)99 std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches) {
100 std::vector<size_t> offsets;
101 for (const auto& match : matches) {
102 offsets.push_back(match.offset);
103 offsets.push_back(match.offset + match.length);
104 }
105 return offsets;
106 }
107
ReplaceOffsetsInTermMatches(const TermMatches & matches,const std::vector<size_t> & offsets)108 TermMatches ReplaceOffsetsInTermMatches(const TermMatches& matches,
109 const std::vector<size_t>& offsets) {
110 DCHECK_EQ(2 * matches.size(), offsets.size());
111 TermMatches new_matches;
112 auto offset_iter = offsets.begin();
113 for (auto term_iter = matches.begin(); term_iter != matches.end();
114 ++term_iter, ++offset_iter) {
115 const size_t starting_offset = *offset_iter;
116 ++offset_iter;
117 const size_t ending_offset = *offset_iter;
118 if ((starting_offset != base::string16::npos) &&
119 (ending_offset != base::string16::npos) &&
120 (starting_offset != ending_offset)) {
121 TermMatch new_match(*term_iter);
122 new_match.offset = starting_offset;
123 new_match.length = ending_offset - starting_offset;
124 new_matches.push_back(new_match);
125 }
126 }
127 return new_matches;
128 }
129
130 // Utility Functions -----------------------------------------------------------
131
String16SetFromString16(const base::string16 & cleaned_uni_string,WordStarts * word_starts)132 String16Set String16SetFromString16(const base::string16& cleaned_uni_string,
133 WordStarts* word_starts) {
134 String16Vector words =
135 String16VectorFromString16(cleaned_uni_string, false, word_starts);
136 for (auto& word : words)
137 word = base::i18n::ToLower(word).substr(0, kMaxSignificantChars);
138 return String16Set(std::make_move_iterator(words.begin()),
139 std::make_move_iterator(words.end()));
140 }
141
String16VectorFromString16(const base::string16 & cleaned_uni_string,bool break_on_space,WordStarts * word_starts)142 String16Vector String16VectorFromString16(
143 const base::string16& cleaned_uni_string,
144 bool break_on_space,
145 WordStarts* word_starts) {
146 if (word_starts)
147 word_starts->clear();
148 base::i18n::BreakIterator::BreakType break_mode =
149 break_on_space ? base::i18n::BreakIterator::BREAK_SPACE
150 : base::i18n::BreakIterator::BREAK_WORD;
151 String16Vector words;
152 if (!break_on_space) {
153 TailoredWordBreakIterator iter(cleaned_uni_string, break_mode);
154 if (!iter.Init())
155 return words;
156 while (iter.Advance()) {
157 if (iter.IsWord()) {
158 String16VectorFromString16Internal(iter.GetString(), iter.prev(), false,
159 &words, word_starts);
160 }
161 }
162 } else {
163 base::i18n::BreakIterator iter(cleaned_uni_string, break_mode);
164 if (!iter.Init())
165 return words;
166 while (iter.Advance()) {
167 String16VectorFromString16Internal(iter.GetString(), iter.prev(), true,
168 &words, word_starts);
169 }
170 }
171 return words;
172 }
173
Char16SetFromString16(const base::string16 & term)174 Char16Set Char16SetFromString16(const base::string16& term) {
175 return Char16Set(term.begin(), term.end());
176 }
177
178 // HistoryInfoMapValue ---------------------------------------------------------
179
180 HistoryInfoMapValue::HistoryInfoMapValue() = default;
181 HistoryInfoMapValue::HistoryInfoMapValue(const HistoryInfoMapValue& other) =
182 default;
183 HistoryInfoMapValue::HistoryInfoMapValue(HistoryInfoMapValue&& other) = default;
184 HistoryInfoMapValue& HistoryInfoMapValue::operator=(
185 const HistoryInfoMapValue& other) = default;
186 HistoryInfoMapValue& HistoryInfoMapValue::operator=(
187 HistoryInfoMapValue&& other) = default;
188 HistoryInfoMapValue::~HistoryInfoMapValue() = default;
189
EstimateMemoryUsage() const190 size_t HistoryInfoMapValue::EstimateMemoryUsage() const {
191 return base::trace_event::EstimateMemoryUsage(url_row) +
192 base::trace_event::EstimateMemoryUsage(visits);
193 }
194
195 // RowWordStarts ---------------------------------------------------------------
196
197 RowWordStarts::RowWordStarts() = default;
198 RowWordStarts::RowWordStarts(const RowWordStarts& other) = default;
199 RowWordStarts::RowWordStarts(RowWordStarts&& other) = default;
200 RowWordStarts& RowWordStarts::operator=(const RowWordStarts& other) = default;
201 RowWordStarts& RowWordStarts::operator=(RowWordStarts&& other) = default;
202 RowWordStarts::~RowWordStarts() = default;
203
EstimateMemoryUsage() const204 size_t RowWordStarts::EstimateMemoryUsage() const {
205 return base::trace_event::EstimateMemoryUsage(url_word_starts_) +
206 base::trace_event::EstimateMemoryUsage(title_word_starts_);
207 }
208
Clear()209 void RowWordStarts::Clear() {
210 url_word_starts_.clear();
211 title_word_starts_.clear();
212 }
213