1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/offline_pages/core/model/get_pages_task.h"
6
7 #include <algorithm>
8 #include <string>
9 #include <utility>
10
11 #include "base/bind.h"
12 #include "base/files/file_path.h"
13 #include "base/memory/ptr_util.h"
14 #include "base/strings/string_number_conversions.h"
15 #include "components/offline_pages/core/offline_page_client_policy.h"
16 #include "components/offline_pages/core/offline_page_item_utils.h"
17 #include "components/offline_pages/core/offline_store_utils.h"
18 #include "sql/database.h"
19 #include "sql/statement.h"
20 #include "sql/transaction.h"
21 #include "url/gurl.h"
22
23 namespace offline_pages {
24 namespace {
25
26 using ReadResult = GetPagesTask::ReadResult;
27
// Column list used by the SELECT query in this file. The row readers below
// (OfflinePageClientId and MakeOfflinePageItem) address columns by their
// zero-based position in this list, so any reordering or insertion here must
// be mirrored in those functions.
#define OFFLINE_PAGE_PROJECTION \
  " offline_id,creation_time,file_size,last_access_time," \
  "access_count,system_download_id,file_missing_time," \
  "client_namespace,client_id,online_url," \
  "file_path,title,original_url,request_origin,digest," \
  "snippet,attribution"
34
OfflinePageClientId(const sql::Statement & statement)35 ClientId OfflinePageClientId(const sql::Statement& statement) {
36 return ClientId(statement.ColumnString(7), statement.ColumnString(8));
37 }
38
39 // Create an offline page item from a SQL result.
40 // Expects the order of columns as defined by OFFLINE_PAGE_PROJECTION macro.
MakeOfflinePageItem(const sql::Statement & statement)41 OfflinePageItem MakeOfflinePageItem(const sql::Statement& statement) {
42 OfflinePageItem item;
43 item.offline_id = statement.ColumnInt64(0);
44 item.creation_time = store_utils::FromDatabaseTime(statement.ColumnInt64(1));
45 item.file_size = statement.ColumnInt64(2);
46 item.last_access_time =
47 store_utils::FromDatabaseTime(statement.ColumnInt64(3));
48 item.access_count = statement.ColumnInt(4);
49 item.system_download_id = statement.ColumnInt64(5);
50 item.file_missing_time =
51 store_utils::FromDatabaseTime(statement.ColumnInt64(6));
52 item.client_id = OfflinePageClientId(statement);
53 item.url = GURL(statement.ColumnString(9));
54 item.file_path = base::FilePath(
55 store_utils::FromDatabaseFilePath(statement.ColumnString(10)));
56 item.title = statement.ColumnString16(11);
57 item.original_url_if_different = GURL(statement.ColumnString(12));
58 item.request_origin = statement.ColumnString(13);
59 item.digest = statement.ColumnString(14);
60 item.snippet = statement.ColumnString(15);
61 item.attribution = statement.ColumnString(16);
62 return item;
63 }
64
65 // Returns a pattern to be used in an SQLite LIKE expression to match a
66 // a URL ignoring the fragment. Warning: this match produces false positives,
67 // so the URL match must be verified by doing
68 // UrlWithoutFragment(a)==UrlWithoutFragment(b).
RelaxedLikePattern(const GURL & url)69 std::string RelaxedLikePattern(const GURL& url) {
70 // In a LIKE expression, % matches any number of characters, and _ matches any
71 // single character.
72 // Replace % with _ in the URL to because _ is more restrictive.
73 // Append a % to match any URL with our URL as a prefix, just in case the URL
74 // being matched has a fragment.
75 std::string string_to_match = UrlWithoutFragment(url).spec();
76 std::replace(string_to_match.begin(), string_to_match.end(), '%', '_');
77 string_to_match.push_back('%');
78 return string_to_match;
79 }
80
81 } // namespace
82
// ReadResult's default constructor, copy constructor and destructor are all
// compiler-generated; they are defined here, out of line.
GetPagesTask::ReadResult::ReadResult() = default;
GetPagesTask::ReadResult::ReadResult(const ReadResult& other) = default;
GetPagesTask::ReadResult::~ReadResult() = default;
86
GetPagesTask(OfflinePageMetadataStore * store,const PageCriteria & criteria,MultipleOfflinePageItemCallback callback)87 GetPagesTask::GetPagesTask(OfflinePageMetadataStore* store,
88 const PageCriteria& criteria,
89 MultipleOfflinePageItemCallback callback)
90 : store_(store),
91 criteria_(criteria),
92 callback_(std::move(callback)) {
93 DCHECK(store_);
94 DCHECK(!callback_.is_null());
95 }
96
97 GetPagesTask::~GetPagesTask() = default;
98
Run()99 void GetPagesTask::Run() {
100 store_->Execute(base::BindOnce(&GetPagesTask::ReadPagesWithCriteriaSync,
101 std::move(criteria_)),
102 base::BindOnce(&GetPagesTask::CompleteWithResult,
103 weak_ptr_factory_.GetWeakPtr()),
104 ReadResult());
105 }
106
CompleteWithResult(ReadResult result)107 void GetPagesTask::CompleteWithResult(ReadResult result) {
108 std::move(callback_).Run(result.pages);
109 TaskComplete();
110 }
111
112 // Some comments on query performance as of March 2019:
113 // - SQLite stores data in row-oriented fashion, so there's little cost to
114 // querying additional columns.
115 // - SQLite supports REGEXP, but it's slow, seems hardly worth using. LIKE is
116 // fast.
117 // - Adding more simple conditions to the WHERE clause seems to hardly increase
118 // runtime, so it's advantageous to add new conditions if they are likely to
119 // eliminate output.
120 // - When a single item is returned from a query, using a WHERE clause is about
121 // 10x faster compared to just querying all rows and filtering the output in
122 // C++.
123 // - The below query can process 10K rows in ~1ms (in-memory db).
124 // - If offline_id is in criteria, SQLite will use the index to evaluate the
125 // query quickly. Otherwise, we need to read the whole table anyway. Unless
126 // the db is loaded to memory, and disk access will likely dwarf any
127 // other query optimizations.
ReadPagesWithCriteriaSync(const PageCriteria & criteria,sql::Database * db)128 ReadResult GetPagesTask::ReadPagesWithCriteriaSync(
129 const PageCriteria& criteria,
130 sql::Database* db) {
131 ReadResult result;
132
133 // Quick return for known empty results.
134 if ((criteria.offline_ids && criteria.offline_ids.value().empty()) ||
135 (criteria.client_ids && criteria.client_ids.value().empty()) ||
136 (criteria.client_namespaces &&
137 criteria.client_namespaces.value().empty())) {
138 result.success = true;
139 return result;
140 }
141
142 // Note: the WHERE clause here is a relaxed form of |criteria|, so returned
143 // items must be re-checked with |MeetsCriteria|.
144 static const char kSql[] =
145 "SELECT " OFFLINE_PAGE_PROJECTION
146 " FROM offlinepages_v1"
147 " WHERE"
148 " offline_id BETWEEN ? AND ?"
149 " AND (? OR file_size=?)"
150 " AND (? OR digest=?)"
151 " AND (? OR instr(?,client_namespace)>0)"
152 " AND (? OR request_origin=?)"
153 " AND (? OR instr(?,client_id)>0)"
154 " AND (? OR online_url LIKE ? OR original_url LIKE ?)"
155 // Order by either creation_time or last_access_time, depending on
156 // bound parameters.
157 " ORDER BY creation_time*?+last_access_time*?";
158
159 sql::Statement statement(db->GetCachedStatement(SQL_FROM_HERE, kSql));
160
161 int param = 0;
162
163 if (criteria.offline_ids) {
164 const std::vector<int64_t> ids = criteria.offline_ids.value();
165 auto min_max = std::minmax_element(ids.begin(), ids.end());
166 statement.BindInt64(param++, *min_max.first);
167 statement.BindInt64(param++, *min_max.second);
168 } else {
169 statement.BindInt64(param++, INT64_MIN);
170 statement.BindInt64(param++, INT64_MAX);
171 }
172
173 statement.BindBool(param++, !criteria.file_size);
174 statement.BindInt64(param++, criteria.file_size.value_or(0));
175
176 statement.BindBool(param++, criteria.digest.empty());
177 statement.BindString(param++, criteria.digest);
178
179 // For namespace and client_id, we use SQL's substring match function,
180 // instr(), to provided an inexact match within the query. In both cases, we
181 // pass SQLite a string equal to the concatenation of all possible values we
182 // want to find, and then search that string for the row's namespace and
183 // client_id respectively.
184 std::vector<std::string> potential_namespaces =
185 PotentiallyMatchingNamespaces(criteria);
186 if (!potential_namespaces.empty()) {
187 statement.BindBool(param++, false);
188 statement.BindString(param++, base::JoinString(potential_namespaces, ""));
189 } else {
190 statement.BindBool(param++, true);
191 statement.BindString(param++, "");
192 }
193
194 statement.BindBool(param++, criteria.request_origin.empty());
195 statement.BindString(param++, criteria.request_origin);
196
197 if (criteria.client_ids) {
198 // Collect all client ids into a single string for matching in the query
199 // with substring match (instr()).
200 std::string concatenated_ids;
201 for (const ClientId& id : criteria.client_ids.value()) {
202 concatenated_ids += id.id;
203 }
204 statement.BindBool(param++, false);
205 statement.BindString(param++, concatenated_ids);
206 } else {
207 statement.BindBool(param++, true);
208 statement.BindString(param++, std::string());
209 }
210
211 const std::string url_pattern = !criteria.url.is_empty()
212 ? RelaxedLikePattern(criteria.url)
213 : std::string();
214
215 statement.BindBool(param++, criteria.url.is_empty());
216 statement.BindString(param++, url_pattern);
217 statement.BindString(param++, url_pattern);
218
219 // ORDER BY criteria.
220 switch (criteria.result_order) {
221 case PageCriteria::kDescendingCreationTime:
222 statement.BindInt64(param++, -1);
223 statement.BindInt64(param++, 0);
224 break;
225 case PageCriteria::kAscendingAccessTime:
226 statement.BindInt64(param++, 0);
227 statement.BindInt64(param++, 1);
228 break;
229 case PageCriteria::kDescendingAccessTime:
230 statement.BindInt64(param++, 0);
231 statement.BindInt64(param++, -1);
232 break;
233 }
234
235 while (statement.Step()) {
236 // Initially, read just the client ID to avoid creating the offline item
237 // if it's filtered out.
238 if (!MeetsCriteria(criteria, OfflinePageClientId(statement))) {
239 continue;
240 }
241 OfflinePageItem item = MakeOfflinePageItem(statement);
242 if (!MeetsCriteria(criteria, item))
243 continue;
244
245 result.pages.push_back(std::move(item));
246 if (criteria.maximum_matches == result.pages.size())
247 break;
248 }
249
250 result.success = statement.Succeeded();
251 if (!result.success) {
252 DLOG(ERROR) << "ReadPagesWithCriteriaSync: statement.Succeeded()=false";
253 result.pages.clear();
254 }
255 return result;
256 }
257
258 } // namespace offline_pages
259