in_memory_url_index_types.h - OpenGrok cross reference for /dports/www/chromium-legacy/chromium-88.0.4324.182/components/omnibox/browser/in_memory_url_index_types.h

// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_OMNIBOX_BROWSER_IN_MEMORY_URL_INDEX_TYPES_H_
#define COMPONENTS_OMNIBOX_BROWSER_IN_MEMORY_URL_INDEX_TYPES_H_

#include <stddef.h>

#include <map>
#include <unordered_map>
#include <vector>

#include "base/containers/flat_set.h"
#include "base/strings/string16.h"
#include "components/history/core/browser/history_types.h"
#include "url/gurl.h"

// Convenience Types -----------------------------------------------------------

typedef std::vector<base::string16> String16Vector;
typedef base::flat_set<base::string16> String16Set;
typedef base::flat_set<base::char16> Char16Set;
typedef std::vector<base::char16> Char16Vector;

// A vector that contains the offsets at which each word starts within a string.
typedef std::vector<size_t> WordStarts;

// Matches within URL and Title Strings ----------------------------------------

// Specifies where an omnibox term occurs within a string. Used for specifying
// highlights in AutocompleteMatches (ACMatchClassifications) and to assist in
// scoring a result.
struct TermMatch {
  TermMatch() : term_num(0), offset(0), length(0) {}
  TermMatch(int term_num, size_t offset, size_t length)
      : term_num(term_num),
        offset(offset),
        length(length) {}

  int term_num;  // The index of the term in the original search string.
  size_t offset;  // The starting offset of the substring match.
  size_t length;  // The length of the substring match.
};
typedef std::vector<TermMatch> TermMatches;

// Returns the joined TermMatches of each term. See MatchTermInString.
TermMatches MatchTermsInString(const String16Vector& terms,
                               const base::string16& cleaned_string);

// Returns a TermMatches which has an entry for each occurrence of the
// string |term| found in the string |cleaned_string|. Use
// CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before passing
// |cleaned_string| to this function. The function marks each match
// with |term_num| so that the resulting TermMatches can be merged
// with other TermMatches for other terms. Note that only the first
// 2,048 characters of |string| are considered during the match
// operation.
TermMatches MatchTermInString(const base::string16& term,
                              const base::string16& cleaned_string,
                              int term_num);

// Sorts |matches| by offset and returns the result.
TermMatches SortMatches(const TermMatches& matches);

// Removes overlapping substring matches from |matches| and returns the
// cleaned up matches.  Assumes |matches| is already sorted.
TermMatches DeoverlapMatches(const TermMatches& sorted_matches);

// Extracts and returns the offsets from |matches|.  This includes both
// the offsets corresponding to the beginning of a match and the offsets
// corresponding to the end of a match (i.e., offset+length for that match).
std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches);

// Replaces the offsets and lengths in |matches| with those given in |offsets|.
// |offsets| gives beginning and ending offsets for each match; this function
// translates (beginning, ending) offset into (beginning offset, length of
// match).  It deletes any matches for which an endpoint is npos and returns
// the updated list of matches.
TermMatches ReplaceOffsetsInTermMatches(const TermMatches& matches,
                                        const std::vector<size_t>& offsets);

// Utility Functions -----------------------------------------------------------

// Breaks the string |cleaned_uni_string| down into individual words.
// Use CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before
// passing |cleaned_uni_string| to this function. If |word_starts| is
// not NULL then clears and pushes the offsets within
// |cleaned_uni_string| at which each word starts onto
// |word_starts|. These offsets are collected only up to the first
// kMaxSignificantChars of |cleaned_uni_string|.
String16Set String16SetFromString16(const base::string16& cleaned_uni_string,
                                    WordStarts* word_starts);

// Breaks the |cleaned_uni_string| string down into individual words and
// return a vector with the individual words in their original order. Use
// CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before passing
// |cleaned_uni_string| to this function. If |break_on_space| is false then
// the string is broken using BreakIterator's BREAK_WORD detection logic,
// augmented so that it additionally breaks words at underscores. The resulting
// list will contain only words containing alpha-numeric characters. If
// |break_on_space| is true then the string is broken only at whitespace (no
// word-boundary logic, no breaking at underscores). (|break_on_space| tells
// BreakIterator to use BREAK_SPACE logic.) For more details, refer to the
// comments in base/i18n/break_iterator.h.) If |word_starts| is not NULL
// then clears and pushes the word starts onto |word_starts|.
//
// Example:
//   Given: |cleaned_uni_string|: "http://www.google.com/ harry the_rabbit."
//   With |break_on_space| false the returned list will contain:
//    "http", "www", "google", "com", "harry", "the", "rabbit"
//   With |break_on_space| true the returned list will contain:
//    "http://", "www.google.com/", "harry", "the_rabbit."
String16Vector String16VectorFromString16(
    const base::string16& cleaned_uni_string,
    bool break_on_space,
    WordStarts* word_starts);

// Breaks the |uni_word| string down into its individual characters.
// Note that this is temporarily intended to work on a single word, but
// _will_ work on a string of words, perhaps with unexpected results.
// TODO(mrossetti): Lots of optimizations possible here for not restarting
// a search if the user is just typing along. Also, change this to uniString
// and properly handle substring matches, scoring and sorting the results
// by score. Also, provide the metrics for where the matches occur so that
// the UI can highlight the matched sections.
Char16Set Char16SetFromString16(const base::string16& uni_word);

// Support for InMemoryURLIndex Private Data -----------------------------------

// An index into a list of all of the words we have indexed.
typedef size_t WordID;

// A map allowing a WordID to be determined given a word.
typedef std::map<base::string16, WordID> WordMap;

// A map from character to the word_ids of words containing that character.
typedef base::flat_set<WordID> WordIDSet;  // An index into the WordList.
typedef std::map<base::char16, WordIDSet> CharWordIDMap;

// A map from word (by word_id) to history items containing that word.
typedef history::URLID HistoryID;
typedef base::flat_set<HistoryID> HistoryIDSet;
typedef std::vector<HistoryID> HistoryIDVector;
typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap;
typedef std::map<HistoryID, WordIDSet> HistoryIDWordMap;


// Information used in scoring a particular URL.
typedef std::vector<history::VisitInfo> VisitInfoVector;
struct HistoryInfoMapValue {
  HistoryInfoMapValue();
  HistoryInfoMapValue(const HistoryInfoMapValue& other);
  HistoryInfoMapValue(HistoryInfoMapValue&& other);
  HistoryInfoMapValue& operator=(const HistoryInfoMapValue& other);
  HistoryInfoMapValue& operator=(HistoryInfoMapValue&& other);
  ~HistoryInfoMapValue();

  // Estimates dynamic memory usage.
  // See base/trace_event/memory_usage_estimator.h for more info.
  size_t EstimateMemoryUsage() const;

  // This field is always populated.
  history::URLRow url_row;

  // This field gets filled in asynchronously after a visit.  As such,
  // it's almost always correct.  If it's wrong, it's likely to either
  // be empty (if this URL was recently added to the index) or
  // slightly out-of-date (one visit behind).
  VisitInfoVector visits;
};

// A map from history_id to the history's URL and title.
typedef std::unordered_map<HistoryID, HistoryInfoMapValue> HistoryInfoMap;

// A map from history_id to URL and page title word start metrics.
struct RowWordStarts {
  RowWordStarts();
  RowWordStarts(const RowWordStarts& other);
  RowWordStarts(RowWordStarts&& other);
  RowWordStarts& operator=(const RowWordStarts& other);
  RowWordStarts& operator=(RowWordStarts&& other);
  ~RowWordStarts();

  // Estimates dynamic memory usage.
  // See base/trace_event/memory_usage_estimator.h for more info.
  size_t EstimateMemoryUsage() const;

  // Clears both url_word_starts_ and title_word_starts_.
  void Clear();

  WordStarts url_word_starts_;
  WordStarts title_word_starts_;
};
typedef std::map<HistoryID, RowWordStarts> WordStartsMap;

#endif  // COMPONENTS_OMNIBOX_BROWSER_IN_MEMORY_URL_INDEX_TYPES_H_