1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/offline_pages/core/model/get_pages_task.h"
6 
7 #include <algorithm>
8 #include <string>
9 #include <utility>
10 
11 #include "base/bind.h"
12 #include "base/files/file_path.h"
13 #include "base/memory/ptr_util.h"
14 #include "base/strings/string_number_conversions.h"
15 #include "components/offline_pages/core/offline_page_client_policy.h"
16 #include "components/offline_pages/core/offline_page_item_utils.h"
17 #include "components/offline_pages/core/offline_store_utils.h"
18 #include "sql/database.h"
19 #include "sql/statement.h"
20 #include "sql/transaction.h"
21 #include "url/gurl.h"
22 
23 namespace offline_pages {
24 namespace {
25 
26 using ReadResult = GetPagesTask::ReadResult;
27 
// Column list used by the SELECT statement in this file. The zero-based
// position of a column in this list is the index used to read it from a
// sql::Statement row (see OfflinePageClientId and MakeOfflinePageItem):
//    0 offline_id          1 creation_time    2 file_size
//    3 last_access_time    4 access_count     5 system_download_id
//    6 file_missing_time   7 client_namespace 8 client_id
//    9 online_url         10 file_path       11 title
//   12 original_url       13 request_origin  14 digest
//   15 snippet            16 attribution
// Changing the order here requires updating those readers.
#define OFFLINE_PAGE_PROJECTION                           \
  " offline_id,creation_time,file_size,last_access_time," \
  "access_count,system_download_id,file_missing_time,"    \
  "client_namespace,client_id,online_url,"                \
  "file_path,title,original_url,request_origin,digest,"   \
  "snippet,attribution"
34 
OfflinePageClientId(const sql::Statement & statement)35 ClientId OfflinePageClientId(const sql::Statement& statement) {
36   return ClientId(statement.ColumnString(7), statement.ColumnString(8));
37 }
38 
39 // Create an offline page item from a SQL result.
40 // Expects the order of columns as defined by OFFLINE_PAGE_PROJECTION macro.
MakeOfflinePageItem(const sql::Statement & statement)41 OfflinePageItem MakeOfflinePageItem(const sql::Statement& statement) {
42   OfflinePageItem item;
43   item.offline_id = statement.ColumnInt64(0);
44   item.creation_time = store_utils::FromDatabaseTime(statement.ColumnInt64(1));
45   item.file_size = statement.ColumnInt64(2);
46   item.last_access_time =
47       store_utils::FromDatabaseTime(statement.ColumnInt64(3));
48   item.access_count = statement.ColumnInt(4);
49   item.system_download_id = statement.ColumnInt64(5);
50   item.file_missing_time =
51       store_utils::FromDatabaseTime(statement.ColumnInt64(6));
52   item.client_id = OfflinePageClientId(statement);
53   item.url = GURL(statement.ColumnString(9));
54   item.file_path = base::FilePath(
55       store_utils::FromDatabaseFilePath(statement.ColumnString(10)));
56   item.title = statement.ColumnString16(11);
57   item.original_url_if_different = GURL(statement.ColumnString(12));
58   item.request_origin = statement.ColumnString(13);
59   item.digest = statement.ColumnString(14);
60   item.snippet = statement.ColumnString(15);
61   item.attribution = statement.ColumnString(16);
62   return item;
63 }
64 
65 // Returns a pattern to be used in an SQLite LIKE expression to match a
66 // a URL ignoring the fragment. Warning: this match produces false positives,
67 // so the URL match must be verified by doing
68 // UrlWithoutFragment(a)==UrlWithoutFragment(b).
RelaxedLikePattern(const GURL & url)69 std::string RelaxedLikePattern(const GURL& url) {
70   // In a LIKE expression, % matches any number of characters, and _ matches any
71   // single character.
72   // Replace % with _ in the URL to because _ is more restrictive.
73   // Append a % to match any URL with our URL as a prefix, just in case the URL
74   // being matched has a fragment.
75   std::string string_to_match = UrlWithoutFragment(url).spec();
76   std::replace(string_to_match.begin(), string_to_match.end(), '%', '_');
77   string_to_match.push_back('%');
78   return string_to_match;
79 }
80 
81 }  // namespace
82 
// ReadResult's default constructor, copy constructor and destructor are
// compiler-generated; they are defined here, out of line.
GetPagesTask::ReadResult::ReadResult() = default;
GetPagesTask::ReadResult::ReadResult(const ReadResult& other) = default;
GetPagesTask::ReadResult::~ReadResult() = default;
86 
// Creates a task that will read the pages matching |criteria| from |store|
// and report them through |callback|. Both |store| and |callback| must be
// non-null (DCHECKed). The constructor only stores its arguments; the query
// is issued from Run().
GetPagesTask::GetPagesTask(OfflinePageMetadataStore* store,
                           const PageCriteria& criteria,
                           MultipleOfflinePageItemCallback callback)
    : store_(store),
      criteria_(criteria),
      callback_(std::move(callback)) {
  DCHECK(store_);
  DCHECK(!callback_.is_null());
}

// Compiler-generated destructor, defined out of line.
GetPagesTask::~GetPagesTask() = default;
98 
Run()99 void GetPagesTask::Run() {
100   store_->Execute(base::BindOnce(&GetPagesTask::ReadPagesWithCriteriaSync,
101                                  std::move(criteria_)),
102                   base::BindOnce(&GetPagesTask::CompleteWithResult,
103                                  weak_ptr_factory_.GetWeakPtr()),
104                   ReadResult());
105 }
106 
// Reply for the read started in Run(). Passes |result.pages| to the callback
// supplied at construction and then signals task completion.
// |result.success| is not inspected here; ReadPagesWithCriteriaSync clears
// |pages| when its statement fails, so that failure is reported as an empty
// list.
void GetPagesTask::CompleteWithResult(ReadResult result) {
  // |callback_| is consumed by this call.
  std::move(callback_).Run(result.pages);
  // NOTE(review): presumably nothing may touch |this| after TaskComplete();
  // confirm against the Task base class before adding code below.
  TaskComplete();
}
111 
112 // Some comments on query performance as of March 2019:
113 // - SQLite stores data in row-oriented fashion, so there's little cost to
114 //   querying additional columns.
115 // - SQLite supports REGEXP, but it's slow, seems hardly worth using. LIKE is
116 //   fast.
117 // - Adding more simple conditions to the WHERE clause seems to hardly increase
118 //   runtime, so it's advantageous to add new conditions if they are likely to
119 //   eliminate output.
120 // - When a single item is returned from a query, using a WHERE clause is about
121 //   10x faster compared to just querying all rows and filtering the output in
122 //   C++.
123 // - The below query can process 10K rows in ~1ms (in-memory db).
124 // - If offline_id is in criteria, SQLite will use the index to evaluate the
125 //   query quickly. Otherwise, we need to read the whole table anyway. Unless
126 //   the db is loaded to memory, and disk access will likely dwarf any
127 //   other query optimizations.
ReadPagesWithCriteriaSync(const PageCriteria & criteria,sql::Database * db)128 ReadResult GetPagesTask::ReadPagesWithCriteriaSync(
129     const PageCriteria& criteria,
130     sql::Database* db) {
131   ReadResult result;
132 
133   // Quick return for known empty results.
134   if ((criteria.offline_ids && criteria.offline_ids.value().empty()) ||
135       (criteria.client_ids && criteria.client_ids.value().empty()) ||
136       (criteria.client_namespaces &&
137        criteria.client_namespaces.value().empty())) {
138     result.success = true;
139     return result;
140   }
141 
142   // Note: the WHERE clause here is a relaxed form of |criteria|, so returned
143   // items must be re-checked with |MeetsCriteria|.
144   static const char kSql[] =
145       "SELECT " OFFLINE_PAGE_PROJECTION
146       " FROM offlinepages_v1"
147       " WHERE"
148       " offline_id BETWEEN ? AND ?"
149       " AND (? OR file_size=?)"
150       " AND (? OR digest=?)"
151       " AND (? OR instr(?,client_namespace)>0)"
152       " AND (? OR request_origin=?)"
153       " AND (? OR instr(?,client_id)>0)"
154       " AND (? OR online_url LIKE ? OR original_url LIKE ?)"
155       // Order by either creation_time or last_access_time, depending on
156       // bound parameters.
157       " ORDER BY creation_time*?+last_access_time*?";
158 
159   sql::Statement statement(db->GetCachedStatement(SQL_FROM_HERE, kSql));
160 
161   int param = 0;
162 
163   if (criteria.offline_ids) {
164     const std::vector<int64_t> ids = criteria.offline_ids.value();
165     auto min_max = std::minmax_element(ids.begin(), ids.end());
166     statement.BindInt64(param++, *min_max.first);
167     statement.BindInt64(param++, *min_max.second);
168   } else {
169     statement.BindInt64(param++, INT64_MIN);
170     statement.BindInt64(param++, INT64_MAX);
171   }
172 
173   statement.BindBool(param++, !criteria.file_size);
174   statement.BindInt64(param++, criteria.file_size.value_or(0));
175 
176   statement.BindBool(param++, criteria.digest.empty());
177   statement.BindString(param++, criteria.digest);
178 
179   // For namespace and client_id, we use SQL's substring match function,
180   // instr(), to provided an inexact match within the query. In both cases, we
181   // pass SQLite a string equal to the concatenation of all possible values we
182   // want to find, and then search that string for the row's namespace and
183   // client_id respectively.
184   std::vector<std::string> potential_namespaces =
185       PotentiallyMatchingNamespaces(criteria);
186   if (!potential_namespaces.empty()) {
187     statement.BindBool(param++, false);
188     statement.BindString(param++, base::JoinString(potential_namespaces, ""));
189   } else {
190     statement.BindBool(param++, true);
191     statement.BindString(param++, "");
192   }
193 
194   statement.BindBool(param++, criteria.request_origin.empty());
195   statement.BindString(param++, criteria.request_origin);
196 
197   if (criteria.client_ids) {
198     // Collect all client ids into a single string for matching in the query
199     // with substring match (instr()).
200     std::string concatenated_ids;
201     for (const ClientId& id : criteria.client_ids.value()) {
202       concatenated_ids += id.id;
203     }
204     statement.BindBool(param++, false);
205     statement.BindString(param++, concatenated_ids);
206   } else {
207     statement.BindBool(param++, true);
208     statement.BindString(param++, std::string());
209   }
210 
211   const std::string url_pattern = !criteria.url.is_empty()
212                                       ? RelaxedLikePattern(criteria.url)
213                                       : std::string();
214 
215   statement.BindBool(param++, criteria.url.is_empty());
216   statement.BindString(param++, url_pattern);
217   statement.BindString(param++, url_pattern);
218 
219   // ORDER BY criteria.
220   switch (criteria.result_order) {
221     case PageCriteria::kDescendingCreationTime:
222       statement.BindInt64(param++, -1);
223       statement.BindInt64(param++, 0);
224       break;
225     case PageCriteria::kAscendingAccessTime:
226       statement.BindInt64(param++, 0);
227       statement.BindInt64(param++, 1);
228       break;
229     case PageCriteria::kDescendingAccessTime:
230       statement.BindInt64(param++, 0);
231       statement.BindInt64(param++, -1);
232       break;
233   }
234 
235   while (statement.Step()) {
236     // Initially, read just the client ID to avoid creating the offline item
237     // if it's filtered out.
238     if (!MeetsCriteria(criteria, OfflinePageClientId(statement))) {
239       continue;
240     }
241     OfflinePageItem item = MakeOfflinePageItem(statement);
242     if (!MeetsCriteria(criteria, item))
243       continue;
244 
245     result.pages.push_back(std::move(item));
246     if (criteria.maximum_matches == result.pages.size())
247       break;
248   }
249 
250   result.success = statement.Succeeded();
251   if (!result.success) {
252     DLOG(ERROR) << "ReadPagesWithCriteriaSync: statement.Succeeded()=false";
253     result.pages.clear();
254   }
255   return result;
256 }
257 
258 }  // namespace offline_pages
259