1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef COMPONENTS_OMNIBOX_BROWSER_IN_MEMORY_URL_INDEX_TYPES_H_ 6 #define COMPONENTS_OMNIBOX_BROWSER_IN_MEMORY_URL_INDEX_TYPES_H_ 7 8 #include <stddef.h> 9 10 #include <map> 11 #include <unordered_map> 12 #include <vector> 13 14 #include "base/containers/flat_set.h" 15 #include "base/strings/string16.h" 16 #include "components/history/core/browser/history_types.h" 17 #include "url/gurl.h" 18 19 // Convenience Types ----------------------------------------------------------- 20 21 typedef std::vector<base::string16> String16Vector; 22 typedef base::flat_set<base::string16> String16Set; 23 typedef base::flat_set<base::char16> Char16Set; 24 typedef std::vector<base::char16> Char16Vector; 25 26 // A vector that contains the offsets at which each word starts within a string. 27 typedef std::vector<size_t> WordStarts; 28 29 // Matches within URL and Title Strings ---------------------------------------- 30 31 // Specifies where an omnibox term occurs within a string. Used for specifying 32 // highlights in AutocompleteMatches (ACMatchClassifications) and to assist in 33 // scoring a result. 34 struct TermMatch { TermMatchTermMatch35 TermMatch() : term_num(0), offset(0), length(0) {} TermMatchTermMatch36 TermMatch(int term_num, size_t offset, size_t length) 37 : term_num(term_num), 38 offset(offset), 39 length(length) {} 40 41 int term_num; // The index of the term in the original search string. 42 size_t offset; // The starting offset of the substring match. 43 size_t length; // The length of the substring match. 44 }; 45 typedef std::vector<TermMatch> TermMatches; 46 47 // Returns the joined TermMatches of each term. See MatchTermInString. 48 TermMatches MatchTermsInString(const String16Vector& terms, 49 const base::string16& cleaned_string); 50 51 // Returns a TermMatches which has an entry for each occurrence of the 52 // string |term| found in the string |cleaned_string|. Use 53 // CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before passing 54 // |cleaned_string| to this function. The function marks each match 55 // with |term_num| so that the resulting TermMatches can be merged 56 // with other TermMatches for other terms. Note that only the first 57 // 2,048 characters of |string| are considered during the match 58 // operation. 59 TermMatches MatchTermInString(const base::string16& term, 60 const base::string16& cleaned_string, 61 int term_num); 62 63 // Sorts |matches| by offset and returns the result. 64 TermMatches SortMatches(const TermMatches& matches); 65 66 // Removes overlapping substring matches from |matches| and returns the 67 // cleaned up matches. Assumes |matches| is already sorted. 68 TermMatches DeoverlapMatches(const TermMatches& sorted_matches); 69 70 // Extracts and returns the offsets from |matches|. This includes both 71 // the offsets corresponding to the beginning of a match and the offsets 72 // corresponding to the end of a match (i.e., offset+length for that match). 73 std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches); 74 75 // Replaces the offsets and lengths in |matches| with those given in |offsets|. 76 // |offsets| gives beginning and ending offsets for each match; this function 77 // translates (beginning, ending) offset into (beginning offset, length of 78 // match). It deletes any matches for which an endpoint is npos and returns 79 // the updated list of matches. 80 TermMatches ReplaceOffsetsInTermMatches(const TermMatches& matches, 81 const std::vector<size_t>& offsets); 82 83 // Utility Functions ----------------------------------------------------------- 84 85 // Breaks the string |cleaned_uni_string| down into individual words. 86 // Use CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before 87 // passing |cleaned_uni_string| to this function. If |word_starts| is 88 // not NULL then clears and pushes the offsets within 89 // |cleaned_uni_string| at which each word starts onto 90 // |word_starts|. These offsets are collected only up to the first 91 // kMaxSignificantChars of |cleaned_uni_string|. 92 String16Set String16SetFromString16(const base::string16& cleaned_uni_string, 93 WordStarts* word_starts); 94 95 // Breaks the |cleaned_uni_string| string down into individual words and 96 // return a vector with the individual words in their original order. Use 97 // CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before passing 98 // |cleaned_uni_string| to this function. If |break_on_space| is false then 99 // the string is broken using BreakIterator's BREAK_WORD detection logic, 100 // augmented so that it additionally breaks words at underscores. The resulting 101 // list will contain only words containing alpha-numeric characters. If 102 // |break_on_space| is true then the string is broken only at whitespace (no 103 // word-boundary logic, no breaking at underscores). (|break_on_space| tells 104 // BreakIterator to use BREAK_SPACE logic.) For more details, refer to the 105 // comments in base/i18n/break_iterator.h.) If |word_starts| is not NULL 106 // then clears and pushes the word starts onto |word_starts|. 107 // 108 // Example: 109 // Given: |cleaned_uni_string|: "http://www.google.com/ harry the_rabbit." 110 // With |break_on_space| false the returned list will contain: 111 // "http", "www", "google", "com", "harry", "the", "rabbit" 112 // With |break_on_space| true the returned list will contain: 113 // "http://", "www.google.com/", "harry", "the_rabbit." 114 String16Vector String16VectorFromString16( 115 const base::string16& cleaned_uni_string, 116 bool break_on_space, 117 WordStarts* word_starts); 118 119 // Breaks the |uni_word| string down into its individual characters. 120 // Note that this is temporarily intended to work on a single word, but 121 // _will_ work on a string of words, perhaps with unexpected results. 122 // TODO(mrossetti): Lots of optimizations possible here for not restarting 123 // a search if the user is just typing along. Also, change this to uniString 124 // and properly handle substring matches, scoring and sorting the results 125 // by score. Also, provide the metrics for where the matches occur so that 126 // the UI can highlight the matched sections. 127 Char16Set Char16SetFromString16(const base::string16& uni_word); 128 129 // Support for InMemoryURLIndex Private Data ----------------------------------- 130 131 // An index into a list of all of the words we have indexed. 132 typedef size_t WordID; 133 134 // A map allowing a WordID to be determined given a word. 135 typedef std::map<base::string16, WordID> WordMap; 136 137 // A map from character to the word_ids of words containing that character. 138 typedef base::flat_set<WordID> WordIDSet; // An index into the WordList. 139 typedef std::map<base::char16, WordIDSet> CharWordIDMap; 140 141 // A map from word (by word_id) to history items containing that word. 142 typedef history::URLID HistoryID; 143 typedef base::flat_set<HistoryID> HistoryIDSet; 144 typedef std::vector<HistoryID> HistoryIDVector; 145 typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap; 146 typedef std::map<HistoryID, WordIDSet> HistoryIDWordMap; 147 148 149 // Information used in scoring a particular URL. 150 typedef std::vector<history::VisitInfo> VisitInfoVector; 151 struct HistoryInfoMapValue { 152 HistoryInfoMapValue(); 153 HistoryInfoMapValue(const HistoryInfoMapValue& other); 154 HistoryInfoMapValue(HistoryInfoMapValue&& other); 155 HistoryInfoMapValue& operator=(const HistoryInfoMapValue& other); 156 HistoryInfoMapValue& operator=(HistoryInfoMapValue&& other); 157 ~HistoryInfoMapValue(); 158 159 // Estimates dynamic memory usage. 160 // See base/trace_event/memory_usage_estimator.h for more info. 161 size_t EstimateMemoryUsage() const; 162 163 // This field is always populated. 164 history::URLRow url_row; 165 166 // This field gets filled in asynchronously after a visit. As such, 167 // it's almost always correct. If it's wrong, it's likely to either 168 // be empty (if this URL was recently added to the index) or 169 // slightly out-of-date (one visit behind). 170 VisitInfoVector visits; 171 }; 172 173 // A map from history_id to the history's URL and title. 174 typedef std::unordered_map<HistoryID, HistoryInfoMapValue> HistoryInfoMap; 175 176 // A map from history_id to URL and page title word start metrics. 177 struct RowWordStarts { 178 RowWordStarts(); 179 RowWordStarts(const RowWordStarts& other); 180 RowWordStarts(RowWordStarts&& other); 181 RowWordStarts& operator=(const RowWordStarts& other); 182 RowWordStarts& operator=(RowWordStarts&& other); 183 ~RowWordStarts(); 184 185 // Estimates dynamic memory usage. 186 // See base/trace_event/memory_usage_estimator.h for more info. 187 size_t EstimateMemoryUsage() const; 188 189 // Clears both url_word_starts_ and title_word_starts_. 190 void Clear(); 191 192 WordStarts url_word_starts_; 193 WordStarts title_word_starts_; 194 }; 195 typedef std::map<HistoryID, RowWordStarts> WordStartsMap; 196 197 #endif // COMPONENTS_OMNIBOX_BROWSER_IN_MEMORY_URL_INDEX_TYPES_H_ 198