1 // Copyright 2018 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/offline_pages/core/model/startup_maintenance_task.h"
6
7 #include <map>
8 #include <set>
9 #include <string>
10 #include <vector>
11
12 #include "base/bind.h"
13 #include "base/files/file_enumerator.h"
14 #include "base/files/file_util.h"
15 #include "base/metrics/histogram_functions.h"
16 #include "base/metrics/histogram_macros.h"
17 #include "base/numerics/safe_conversions.h"
18 #include "base/trace_event/trace_event.h"
19 #include "components/offline_pages/core/archive_manager.h"
20 #include "components/offline_pages/core/model/delete_page_task.h"
21 #include "components/offline_pages/core/offline_page_client_policy.h"
22 #include "components/offline_pages/core/offline_page_metadata_store.h"
23 #include "components/offline_pages/core/offline_store_utils.h"
24 #include "sql/database.h"
25 #include "sql/statement.h"
26 #include "sql/transaction.h"
27
28 namespace offline_pages {
29
30 namespace {
31
32 #define OFFLINE_PAGES_TABLE_NAME "offlinepages_v1"
33
34 struct PageInfo {
35 int64_t offline_id;
36 base::FilePath file_path;
37 };
38
GetPageInfosByNamespaces(const std::vector<std::string> & temp_namespaces,sql::Database * db)39 std::vector<PageInfo> GetPageInfosByNamespaces(
40 const std::vector<std::string>& temp_namespaces,
41 sql::Database* db) {
42 std::vector<PageInfo> result;
43
44 static const char kSql[] =
45 "SELECT offline_id, file_path"
46 " FROM " OFFLINE_PAGES_TABLE_NAME " WHERE client_namespace = ?";
47
48 for (const auto& temp_namespace : temp_namespaces) {
49 sql::Statement statement(db->GetCachedStatement(SQL_FROM_HERE, kSql));
50 statement.BindString(0, temp_namespace);
51 while (statement.Step()) {
52 result.push_back(
53 {statement.ColumnInt64(0),
54 store_utils::FromDatabaseFilePath(statement.ColumnString(1))});
55 }
56 }
57
58 return result;
59 }
60
GetAllArchives(const base::FilePath & archives_dir)61 std::set<base::FilePath> GetAllArchives(const base::FilePath& archives_dir) {
62 std::set<base::FilePath> result;
63 base::FileEnumerator file_enumerator(archives_dir, false,
64 base::FileEnumerator::FILES,
65 FILE_PATH_LITERAL("*.mhtml"));
66 for (auto archive_path = file_enumerator.Next(); !archive_path.empty();
67 archive_path = file_enumerator.Next()) {
68 result.insert(archive_path);
69 }
70 return result;
71 }
72
DeleteFiles(const std::vector<base::FilePath> & file_paths)73 bool DeleteFiles(const std::vector<base::FilePath>& file_paths) {
74 bool result = true;
75 for (const auto& file_path : file_paths)
76 result = base::DeleteFile(file_path) && result;
77 return result;
78 }
79
80 // This method is clearing the private dir(the legacy dir).
81 // - For all files associated with temporary pages:
82 // The strategy is if any temporary page
83 // is still left behind in the legacy dir, delete them.
84 // - For all files associated with persistent pages:
85 // Leave them as-is, since they might be still in use.
86 // - For all files without any associated DB entry:
87 // Delete the files, since they're 'headless' and has no way to be accessed.
ClearLegacyPagesInPrivateDirSync(sql::Database * db,const base::FilePath & private_dir)88 SyncOperationResult ClearLegacyPagesInPrivateDirSync(
89 sql::Database* db,
90 const base::FilePath& private_dir) {
91 // One large database transaction that will:
92 // 1. Get temporary page infos from the database.
93 // 2. Get persistent page infos from the database, in case they're in private
94 // dir.
95 // 3. Get all file paths in private dir as a set F.
96 // 4. For each temporary page info:
97 // - If its file path is in F, record its offline id for deletion.
98 // 5. For each persistent page info:
99 // - If its file path is in F, remove it from F.
100 // 6. Delete page entries by recorded offline ids, and delete the remaining
101 // files in F.
102 sql::Transaction transaction(db);
103 if (!transaction.Begin())
104 return SyncOperationResult::TRANSACTION_BEGIN_ERROR;
105
106 std::vector<PageInfo> temporary_page_infos =
107 GetPageInfosByNamespaces(GetTemporaryPolicyNamespaces(), db);
108 std::vector<PageInfo> persistent_page_infos =
109 GetPageInfosByNamespaces(GetPersistentPolicyNamespaces(), db);
110 std::map<base::FilePath, PageInfo> path_to_page_info;
111
112 std::set<base::FilePath> archive_paths = GetAllArchives(private_dir);
113 std::vector<int64_t> offline_ids_to_delete;
114
115 for (const auto& page_info : temporary_page_infos) {
116 if (archive_paths.find(page_info.file_path) != archive_paths.end())
117 offline_ids_to_delete.push_back(page_info.offline_id);
118 }
119 for (const auto& page_info : persistent_page_infos) {
120 auto iter = archive_paths.find(page_info.file_path);
121 if (iter != archive_paths.end())
122 archive_paths.erase(iter);
123 }
124
125 // Try to delete the pages by offline ids collected above.
126 // If there's any database related errors, the function will return failure,
127 // and the database operations will be rolled back since the transaction will
128 // not be committed.
129 if (!DeletePageTask::DeletePagesFromDbSync(offline_ids_to_delete, db))
130 return SyncOperationResult::DB_OPERATION_ERROR;
131
132 if (!transaction.Commit())
133 return SyncOperationResult::TRANSACTION_COMMIT_ERROR;
134
135 std::vector<base::FilePath> files_to_delete(archive_paths.begin(),
136 archive_paths.end());
137 if (!DeleteFiles(files_to_delete))
138 return SyncOperationResult::FILE_OPERATION_ERROR;
139
140 size_t headless_file_count =
141 files_to_delete.size() - offline_ids_to_delete.size();
142 if (headless_file_count > 0) {
143 UMA_HISTOGRAM_COUNTS_1M(
144 "OfflinePages.ConsistencyCheck.Legacy.DeletedHeadlessFileCount",
145 headless_file_count);
146 }
147
148 return SyncOperationResult::SUCCESS;
149 }
150
CheckTemporaryPageConsistencySync(sql::Database * db,const base::FilePath & archives_dir)151 SyncOperationResult CheckTemporaryPageConsistencySync(
152 sql::Database* db,
153 const base::FilePath& archives_dir) {
154 // One large database transaction that will:
155 // 1. Get page infos by |namespaces| from the database.
156 // 2. Decide which pages to delete.
157 // 3. Delete metadata entries from the database.
158 sql::Transaction transaction(db);
159 if (!transaction.Begin())
160 return SyncOperationResult::TRANSACTION_BEGIN_ERROR;
161
162 std::vector<PageInfo> page_infos =
163 GetPageInfosByNamespaces(GetTemporaryPolicyNamespaces(), db);
164
165 std::set<base::FilePath> page_info_paths;
166 std::vector<int64_t> offline_ids_to_delete;
167 for (const auto& page_info : page_infos) {
168 // Get pages whose archive files does not exist and delete.
169 if (!base::PathExists(page_info.file_path)) {
170 offline_ids_to_delete.push_back(page_info.offline_id);
171 } else {
172 // Extract existing file paths from |page_infos| so that we can do a
173 // faster matching later.
174 page_info_paths.insert(page_info.file_path);
175 }
176 }
177
178 if (offline_ids_to_delete.size() > 0) {
179 // Try to delete the pages by offline ids collected above. If there's any
180 // database related errors, the function will return false, and the database
181 // operations will be rolled back since the transaction will not be
182 // committed.
183 if (!DeletePageTask::DeletePagesFromDbSync(offline_ids_to_delete, db))
184 return SyncOperationResult::DB_OPERATION_ERROR;
185 UMA_HISTOGRAM_COUNTS_1M(
186 "OfflinePages.ConsistencyCheck.Temporary.PagesMissingArchiveFileCount",
187 base::saturated_cast<int32_t>(offline_ids_to_delete.size()));
188 }
189
190 if (!transaction.Commit())
191 return SyncOperationResult::TRANSACTION_COMMIT_ERROR;
192
193 // Delete any files in the temporary archive directory that no longer have
194 // associated entries in the database.
195 std::set<base::FilePath> archive_paths = GetAllArchives(archives_dir);
196 std::vector<base::FilePath> files_to_delete;
197 for (const auto& archive_path : archive_paths) {
198 if (page_info_paths.find(archive_path) == page_info_paths.end())
199 files_to_delete.push_back(archive_path);
200 }
201
202 if (files_to_delete.size() > 0) {
203 if (!DeleteFiles(files_to_delete))
204 return SyncOperationResult::FILE_OPERATION_ERROR;
205 UMA_HISTOGRAM_COUNTS_1M(
206 "OfflinePages.ConsistencyCheck.Temporary.PagesMissingDbEntryCount",
207 static_cast<int32_t>(files_to_delete.size()));
208 }
209
210 return SyncOperationResult::SUCCESS;
211 }
212
ReportStorageUsageSync(sql::Database * db)213 void ReportStorageUsageSync(sql::Database* db) {
214 static const char kSql[] =
215 "SELECT sum(file_size) FROM " OFFLINE_PAGES_TABLE_NAME
216 " WHERE client_namespace = ?";
217 for (const auto& name_space : GetAllPolicyNamespaces()) {
218 sql::Statement statement(db->GetCachedStatement(SQL_FROM_HERE, kSql));
219 statement.BindString(0, name_space);
220 int size_in_kib = 0;
221 while (statement.Step()) {
222 size_in_kib = base::saturated_cast<int>(statement.ColumnInt64(0) / 1024);
223 }
224 base::UmaHistogramCustomCounts(
225 "OfflinePages.ClearStoragePreRunUsage2." + name_space, size_in_kib, 1,
226 10000000, 50);
227 }
228 }
229
StartupMaintenanceSync(const base::FilePath & temporary_archives_dir,const base::FilePath & private_archives_dir,sql::Database * db)230 bool StartupMaintenanceSync(
231 const base::FilePath& temporary_archives_dir,
232 const base::FilePath& private_archives_dir,
233 sql::Database* db) {
234 // Clear temporary pages that are in legacy directory, which is also the
235 // directory that serves as the 'private' directory.
236 SyncOperationResult result =
237 ClearLegacyPagesInPrivateDirSync(db, private_archives_dir);
238
239 // Clear temporary pages in cache directory.
240 result = CheckTemporaryPageConsistencySync(db, temporary_archives_dir);
241 UMA_HISTOGRAM_ENUMERATION("OfflinePages.ConsistencyCheck.Temporary.Result",
242 result);
243
244 // Report storage usage UMA, |temporary_namespaces| + |persistent_namespaces|
245 // should be all namespaces. This is implicitly checked by the
246 // TestReportStorageUsage unit test.
247 ReportStorageUsageSync(db);
248
249 return true;
250 }
251
252 } // namespace
253
StartupMaintenanceTask(OfflinePageMetadataStore * store,ArchiveManager * archive_manager)254 StartupMaintenanceTask::StartupMaintenanceTask(OfflinePageMetadataStore* store,
255 ArchiveManager* archive_manager)
256 : store_(store), archive_manager_(archive_manager) {
257 DCHECK(store_);
258 DCHECK(archive_manager_);
259 }
260
261 StartupMaintenanceTask::~StartupMaintenanceTask() = default;
262
Run()263 void StartupMaintenanceTask::Run() {
264 TRACE_EVENT_ASYNC_BEGIN0("offline_pages", "StartupMaintenanceTask running",
265 this);
266 store_->Execute(
267 base::BindOnce(&StartupMaintenanceSync,
268 archive_manager_->GetTemporaryArchivesDir(),
269 archive_manager_->GetPrivateArchivesDir()),
270 base::BindOnce(&StartupMaintenanceTask::OnStartupMaintenanceDone,
271 weak_ptr_factory_.GetWeakPtr()),
272 false);
273 }
274
OnStartupMaintenanceDone(bool result)275 void StartupMaintenanceTask::OnStartupMaintenanceDone(bool result) {
276 TRACE_EVENT_ASYNC_END1("offline_pages", "StartupMaintenanceTask running",
277 this, "result", result);
278 TaskComplete();
279 }
280
281 } // namespace offline_pages
282