1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef COMPONENTS_OMNIBOX_BROWSER_IN_MEMORY_URL_INDEX_TYPES_H_
6 #define COMPONENTS_OMNIBOX_BROWSER_IN_MEMORY_URL_INDEX_TYPES_H_
7 
8 #include <stddef.h>
9 
10 #include <map>
11 #include <unordered_map>
12 #include <vector>
13 
14 #include "base/containers/flat_set.h"
15 #include "base/strings/string16.h"
16 #include "components/history/core/browser/history_types.h"
17 #include "url/gurl.h"
18 
19 // Convenience Types -----------------------------------------------------------
20 
21 typedef std::vector<base::string16> String16Vector;
22 typedef base::flat_set<base::string16> String16Set;
23 typedef base::flat_set<base::char16> Char16Set;
24 typedef std::vector<base::char16> Char16Vector;
25 
26 // A vector that contains the offsets at which each word starts within a string.
27 typedef std::vector<size_t> WordStarts;
28 
29 // Matches within URL and Title Strings ----------------------------------------
30 
31 // Specifies where an omnibox term occurs within a string. Used for specifying
32 // highlights in AutocompleteMatches (ACMatchClassifications) and to assist in
33 // scoring a result.
34 struct TermMatch {
TermMatchTermMatch35   TermMatch() : term_num(0), offset(0), length(0) {}
TermMatchTermMatch36   TermMatch(int term_num, size_t offset, size_t length)
37       : term_num(term_num),
38         offset(offset),
39         length(length) {}
40 
41   int term_num;  // The index of the term in the original search string.
42   size_t offset;  // The starting offset of the substring match.
43   size_t length;  // The length of the substring match.
44 };
45 typedef std::vector<TermMatch> TermMatches;
46 
47 // Returns the joined TermMatches of each term. See MatchTermInString.
48 TermMatches MatchTermsInString(const String16Vector& terms,
49                                const base::string16& cleaned_string);
50 
51 // Returns a TermMatches which has an entry for each occurrence of the
52 // string |term| found in the string |cleaned_string|. Use
53 // CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before passing
54 // |cleaned_string| to this function. The function marks each match
55 // with |term_num| so that the resulting TermMatches can be merged
56 // with other TermMatches for other terms. Note that only the first
57 // 2,048 characters of |string| are considered during the match
58 // operation.
59 TermMatches MatchTermInString(const base::string16& term,
60                               const base::string16& cleaned_string,
61                               int term_num);
62 
63 // Sorts |matches| by offset and returns the result.
64 TermMatches SortMatches(const TermMatches& matches);
65 
66 // Removes overlapping substring matches from |matches| and returns the
67 // cleaned up matches.  Assumes |matches| is already sorted.
68 TermMatches DeoverlapMatches(const TermMatches& sorted_matches);
69 
70 // Extracts and returns the offsets from |matches|.  This includes both
71 // the offsets corresponding to the beginning of a match and the offsets
72 // corresponding to the end of a match (i.e., offset+length for that match).
73 std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches);
74 
75 // Replaces the offsets and lengths in |matches| with those given in |offsets|.
76 // |offsets| gives beginning and ending offsets for each match; this function
77 // translates (beginning, ending) offset into (beginning offset, length of
78 // match).  It deletes any matches for which an endpoint is npos and returns
79 // the updated list of matches.
80 TermMatches ReplaceOffsetsInTermMatches(const TermMatches& matches,
81                                         const std::vector<size_t>& offsets);
82 
83 // Utility Functions -----------------------------------------------------------
84 
85 // Breaks the string |cleaned_uni_string| down into individual words.
86 // Use CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before
87 // passing |cleaned_uni_string| to this function. If |word_starts| is
88 // not NULL then clears and pushes the offsets within
89 // |cleaned_uni_string| at which each word starts onto
90 // |word_starts|. These offsets are collected only up to the first
91 // kMaxSignificantChars of |cleaned_uni_string|.
92 String16Set String16SetFromString16(const base::string16& cleaned_uni_string,
93                                     WordStarts* word_starts);
94 
95 // Breaks the |cleaned_uni_string| string down into individual words and
96 // return a vector with the individual words in their original order. Use
97 // CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before passing
98 // |cleaned_uni_string| to this function. If |break_on_space| is false then
99 // the string is broken using BreakIterator's BREAK_WORD detection logic,
100 // augmented so that it additionally breaks words at underscores. The resulting
101 // list will contain only words containing alpha-numeric characters. If
102 // |break_on_space| is true then the string is broken only at whitespace (no
103 // word-boundary logic, no breaking at underscores). (|break_on_space| tells
104 // BreakIterator to use BREAK_SPACE logic.) For more details, refer to the
105 // comments in base/i18n/break_iterator.h.) If |word_starts| is not NULL
106 // then clears and pushes the word starts onto |word_starts|.
107 //
108 // Example:
109 //   Given: |cleaned_uni_string|: "http://www.google.com/ harry the_rabbit."
110 //   With |break_on_space| false the returned list will contain:
111 //    "http", "www", "google", "com", "harry", "the", "rabbit"
112 //   With |break_on_space| true the returned list will contain:
113 //    "http://", "www.google.com/", "harry", "the_rabbit."
114 String16Vector String16VectorFromString16(
115     const base::string16& cleaned_uni_string,
116     bool break_on_space,
117     WordStarts* word_starts);
118 
119 // Breaks the |uni_word| string down into its individual characters.
120 // Note that this is temporarily intended to work on a single word, but
121 // _will_ work on a string of words, perhaps with unexpected results.
122 // TODO(mrossetti): Lots of optimizations possible here for not restarting
123 // a search if the user is just typing along. Also, change this to uniString
124 // and properly handle substring matches, scoring and sorting the results
125 // by score. Also, provide the metrics for where the matches occur so that
126 // the UI can highlight the matched sections.
127 Char16Set Char16SetFromString16(const base::string16& uni_word);
128 
129 // Support for InMemoryURLIndex Private Data -----------------------------------
130 
131 // An index into a list of all of the words we have indexed.
132 typedef size_t WordID;
133 
134 // A map allowing a WordID to be determined given a word.
135 typedef std::map<base::string16, WordID> WordMap;
136 
137 // A map from character to the word_ids of words containing that character.
138 typedef base::flat_set<WordID> WordIDSet;  // An index into the WordList.
139 typedef std::map<base::char16, WordIDSet> CharWordIDMap;
140 
141 // A map from word (by word_id) to history items containing that word.
142 typedef history::URLID HistoryID;
143 typedef base::flat_set<HistoryID> HistoryIDSet;
144 typedef std::vector<HistoryID> HistoryIDVector;
145 typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap;
146 typedef std::map<HistoryID, WordIDSet> HistoryIDWordMap;
147 
148 
149 // Information used in scoring a particular URL.
150 typedef std::vector<history::VisitInfo> VisitInfoVector;
151 struct HistoryInfoMapValue {
152   HistoryInfoMapValue();
153   HistoryInfoMapValue(const HistoryInfoMapValue& other);
154   HistoryInfoMapValue(HistoryInfoMapValue&& other);
155   HistoryInfoMapValue& operator=(const HistoryInfoMapValue& other);
156   HistoryInfoMapValue& operator=(HistoryInfoMapValue&& other);
157   ~HistoryInfoMapValue();
158 
159   // Estimates dynamic memory usage.
160   // See base/trace_event/memory_usage_estimator.h for more info.
161   size_t EstimateMemoryUsage() const;
162 
163   // This field is always populated.
164   history::URLRow url_row;
165 
166   // This field gets filled in asynchronously after a visit.  As such,
167   // it's almost always correct.  If it's wrong, it's likely to either
168   // be empty (if this URL was recently added to the index) or
169   // slightly out-of-date (one visit behind).
170   VisitInfoVector visits;
171 };
172 
173 // A map from history_id to the history's URL and title.
174 typedef std::unordered_map<HistoryID, HistoryInfoMapValue> HistoryInfoMap;
175 
176 // A map from history_id to URL and page title word start metrics.
177 struct RowWordStarts {
178   RowWordStarts();
179   RowWordStarts(const RowWordStarts& other);
180   RowWordStarts(RowWordStarts&& other);
181   RowWordStarts& operator=(const RowWordStarts& other);
182   RowWordStarts& operator=(RowWordStarts&& other);
183   ~RowWordStarts();
184 
185   // Estimates dynamic memory usage.
186   // See base/trace_event/memory_usage_estimator.h for more info.
187   size_t EstimateMemoryUsage() const;
188 
189   // Clears both url_word_starts_ and title_word_starts_.
190   void Clear();
191 
192   WordStarts url_word_starts_;
193   WordStarts title_word_starts_;
194 };
195 typedef std::map<HistoryID, RowWordStarts> WordStartsMap;
196 
197 #endif  // COMPONENTS_OMNIBOX_BROWSER_IN_MEMORY_URL_INDEX_TYPES_H_
198