1 // Copyright 2018 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/offline_pages/core/model/startup_maintenance_task.h"
6 
7 #include <map>
8 #include <set>
9 #include <string>
10 #include <vector>
11 
12 #include "base/bind.h"
13 #include "base/files/file_enumerator.h"
14 #include "base/files/file_util.h"
15 #include "base/metrics/histogram_functions.h"
16 #include "base/metrics/histogram_macros.h"
17 #include "base/numerics/safe_conversions.h"
18 #include "base/trace_event/trace_event.h"
19 #include "components/offline_pages/core/archive_manager.h"
20 #include "components/offline_pages/core/model/delete_page_task.h"
21 #include "components/offline_pages/core/offline_page_client_policy.h"
22 #include "components/offline_pages/core/offline_page_metadata_store.h"
23 #include "components/offline_pages/core/offline_store_utils.h"
24 #include "sql/database.h"
25 #include "sql/statement.h"
26 #include "sql/transaction.h"
27 
28 namespace offline_pages {
29 
30 namespace {
31 
// Name of the database table queried by the statements in this file. This is a
// macro rather than a constant so that it can be spliced into the SQL string
// literals below through adjacent string literal concatenation.
#define OFFLINE_PAGES_TABLE_NAME "offlinepages_v1"
33 
// The subset of a page's database row that the consistency checks in this file
// need: its id and the location of its archive file.
struct PageInfo {
  // Value of the offline_id column of the row.
  int64_t offline_id;
  // Value of the file_path column of the row, converted with
  // store_utils::FromDatabaseFilePath().
  base::FilePath file_path;
};
38 
GetPageInfosByNamespaces(const std::vector<std::string> & temp_namespaces,sql::Database * db)39 std::vector<PageInfo> GetPageInfosByNamespaces(
40     const std::vector<std::string>& temp_namespaces,
41     sql::Database* db) {
42   std::vector<PageInfo> result;
43 
44   static const char kSql[] =
45       "SELECT offline_id, file_path"
46       " FROM " OFFLINE_PAGES_TABLE_NAME " WHERE client_namespace = ?";
47 
48   for (const auto& temp_namespace : temp_namespaces) {
49     sql::Statement statement(db->GetCachedStatement(SQL_FROM_HERE, kSql));
50     statement.BindString(0, temp_namespace);
51     while (statement.Step()) {
52       result.push_back(
53           {statement.ColumnInt64(0),
54            store_utils::FromDatabaseFilePath(statement.ColumnString(1))});
55     }
56   }
57 
58   return result;
59 }
60 
GetAllArchives(const base::FilePath & archives_dir)61 std::set<base::FilePath> GetAllArchives(const base::FilePath& archives_dir) {
62   std::set<base::FilePath> result;
63   base::FileEnumerator file_enumerator(archives_dir, false,
64                                        base::FileEnumerator::FILES,
65                                        FILE_PATH_LITERAL("*.mhtml"));
66   for (auto archive_path = file_enumerator.Next(); !archive_path.empty();
67        archive_path = file_enumerator.Next()) {
68     result.insert(archive_path);
69   }
70   return result;
71 }
72 
DeleteFiles(const std::vector<base::FilePath> & file_paths)73 bool DeleteFiles(const std::vector<base::FilePath>& file_paths) {
74   bool result = true;
75   for (const auto& file_path : file_paths)
76     result = base::DeleteFile(file_path, false) && result;
77   return result;
78 }
79 
80 // This method is clearing the private dir(the legacy dir).
81 // - For all files associated with temporary pages:
82 //   The strategy is if any temporary page
83 //   is still left behind in the legacy dir, delete them.
84 // - For all files associated with persistent pages:
85 //   Leave them as-is, since they might be still in use.
86 // - For all files without any associated DB entry:
87 //   Delete the files, since they're 'headless' and has no way to be accessed.
ClearLegacyPagesInPrivateDirSync(sql::Database * db,const base::FilePath & private_dir)88 SyncOperationResult ClearLegacyPagesInPrivateDirSync(
89     sql::Database* db,
90     const base::FilePath& private_dir) {
91   // One large database transaction that will:
92   // 1. Get temporary page infos from the database.
93   // 2. Get persistent page infos from the database, in case they're in private
94   //    dir.
95   // 3. Get all file paths in private dir as a set F.
96   // 4. For each temporary page info:
97   //    - If its file path is in F, record its offline id for deletion.
98   // 5. For each persistent page info:
99   //    - If its file path is in F, remove it from F.
100   // 6. Delete page entries by recorded offline ids, and delete the remaining
101   //    files in F.
102   sql::Transaction transaction(db);
103   if (!transaction.Begin())
104     return SyncOperationResult::TRANSACTION_BEGIN_ERROR;
105 
106   std::vector<PageInfo> temporary_page_infos =
107       GetPageInfosByNamespaces(GetTemporaryPolicyNamespaces(), db);
108   std::vector<PageInfo> persistent_page_infos =
109       GetPageInfosByNamespaces(GetPersistentPolicyNamespaces(), db);
110   std::map<base::FilePath, PageInfo> path_to_page_info;
111 
112   std::set<base::FilePath> archive_paths = GetAllArchives(private_dir);
113   std::vector<int64_t> offline_ids_to_delete;
114 
115   for (const auto& page_info : temporary_page_infos) {
116     if (archive_paths.find(page_info.file_path) != archive_paths.end())
117       offline_ids_to_delete.push_back(page_info.offline_id);
118   }
119   for (const auto& page_info : persistent_page_infos) {
120     auto iter = archive_paths.find(page_info.file_path);
121     if (iter != archive_paths.end())
122       archive_paths.erase(iter);
123   }
124 
125   // Try to delete the pages by offline ids collected above.
126   // If there's any database related errors, the function will return failure,
127   // and the database operations will be rolled back since the transaction will
128   // not be committed.
129   if (!DeletePageTask::DeletePagesFromDbSync(offline_ids_to_delete, db))
130     return SyncOperationResult::DB_OPERATION_ERROR;
131 
132   if (!transaction.Commit())
133     return SyncOperationResult::TRANSACTION_COMMIT_ERROR;
134 
135   std::vector<base::FilePath> files_to_delete(archive_paths.begin(),
136                                               archive_paths.end());
137   if (!DeleteFiles(files_to_delete))
138     return SyncOperationResult::FILE_OPERATION_ERROR;
139 
140   size_t headless_file_count =
141       files_to_delete.size() - offline_ids_to_delete.size();
142   if (headless_file_count > 0) {
143     UMA_HISTOGRAM_COUNTS_1M(
144         "OfflinePages.ConsistencyCheck.Legacy.DeletedHeadlessFileCount",
145         headless_file_count);
146   }
147 
148   return SyncOperationResult::SUCCESS;
149 }
150 
CheckTemporaryPageConsistencySync(sql::Database * db,const base::FilePath & archives_dir)151 SyncOperationResult CheckTemporaryPageConsistencySync(
152     sql::Database* db,
153     const base::FilePath& archives_dir) {
154   // One large database transaction that will:
155   // 1. Get page infos by |namespaces| from the database.
156   // 2. Decide which pages to delete.
157   // 3. Delete metadata entries from the database.
158   sql::Transaction transaction(db);
159   if (!transaction.Begin())
160     return SyncOperationResult::TRANSACTION_BEGIN_ERROR;
161 
162   std::vector<PageInfo> page_infos =
163       GetPageInfosByNamespaces(GetTemporaryPolicyNamespaces(), db);
164 
165   std::set<base::FilePath> page_info_paths;
166   std::vector<int64_t> offline_ids_to_delete;
167   for (const auto& page_info : page_infos) {
168     // Get pages whose archive files does not exist and delete.
169     if (!base::PathExists(page_info.file_path)) {
170       offline_ids_to_delete.push_back(page_info.offline_id);
171     } else {
172       // Extract existing file paths from |page_infos| so that we can do a
173       // faster matching later.
174       page_info_paths.insert(page_info.file_path);
175     }
176   }
177 
178   if (offline_ids_to_delete.size() > 0) {
179     // Try to delete the pages by offline ids collected above. If there's any
180     // database related errors, the function will return false, and the database
181     // operations will be rolled back since the transaction will not be
182     // committed.
183     if (!DeletePageTask::DeletePagesFromDbSync(offline_ids_to_delete, db))
184       return SyncOperationResult::DB_OPERATION_ERROR;
185     UMA_HISTOGRAM_COUNTS_1M(
186         "OfflinePages.ConsistencyCheck.Temporary.PagesMissingArchiveFileCount",
187         base::saturated_cast<int32_t>(offline_ids_to_delete.size()));
188   }
189 
190   if (!transaction.Commit())
191     return SyncOperationResult::TRANSACTION_COMMIT_ERROR;
192 
193   // Delete any files in the temporary archive directory that no longer have
194   // associated entries in the database.
195   std::set<base::FilePath> archive_paths = GetAllArchives(archives_dir);
196   std::vector<base::FilePath> files_to_delete;
197   for (const auto& archive_path : archive_paths) {
198     if (page_info_paths.find(archive_path) == page_info_paths.end())
199       files_to_delete.push_back(archive_path);
200   }
201 
202   if (files_to_delete.size() > 0) {
203     if (!DeleteFiles(files_to_delete))
204       return SyncOperationResult::FILE_OPERATION_ERROR;
205     UMA_HISTOGRAM_COUNTS_1M(
206         "OfflinePages.ConsistencyCheck.Temporary.PagesMissingDbEntryCount",
207         static_cast<int32_t>(files_to_delete.size()));
208   }
209 
210   return SyncOperationResult::SUCCESS;
211 }
212 
ReportStorageUsageSync(sql::Database * db)213 void ReportStorageUsageSync(sql::Database* db) {
214   static const char kSql[] =
215       "SELECT sum(file_size) FROM " OFFLINE_PAGES_TABLE_NAME
216       " WHERE client_namespace = ?";
217   for (const auto& name_space : GetAllPolicyNamespaces()) {
218     sql::Statement statement(db->GetCachedStatement(SQL_FROM_HERE, kSql));
219     statement.BindString(0, name_space);
220     int size_in_kib = 0;
221     while (statement.Step()) {
222       size_in_kib = base::saturated_cast<int>(statement.ColumnInt64(0) / 1024);
223     }
224     base::UmaHistogramCustomCounts(
225         "OfflinePages.ClearStoragePreRunUsage2." + name_space, size_in_kib, 1,
226         10000000, 50);
227   }
228 }
229 
StartupMaintenanceSync(const base::FilePath & temporary_archives_dir,const base::FilePath & private_archives_dir,sql::Database * db)230 bool StartupMaintenanceSync(
231     const base::FilePath& temporary_archives_dir,
232     const base::FilePath& private_archives_dir,
233     sql::Database* db) {
234   // Clear temporary pages that are in legacy directory, which is also the
235   // directory that serves as the 'private' directory.
236   SyncOperationResult result =
237       ClearLegacyPagesInPrivateDirSync(db, private_archives_dir);
238 
239   // Clear temporary pages in cache directory.
240   result = CheckTemporaryPageConsistencySync(db, temporary_archives_dir);
241   UMA_HISTOGRAM_ENUMERATION("OfflinePages.ConsistencyCheck.Temporary.Result",
242                             result);
243 
244   // Report storage usage UMA, |temporary_namespaces| + |persistent_namespaces|
245   // should be all namespaces. This is implicitly checked by the
246   // TestReportStorageUsage unit test.
247   ReportStorageUsageSync(db);
248 
249   return true;
250 }
251 
252 }  // namespace
253 
// Stores the |store| and |archive_manager| pointers for later use by Run().
// Both are DCHECKed to be non-null.
StartupMaintenanceTask::StartupMaintenanceTask(OfflinePageMetadataStore* store,
                                               ArchiveManager* archive_manager)
    : store_(store), archive_manager_(archive_manager) {
  DCHECK(store_);
  DCHECK(archive_manager_);
}
260 
// Nothing to release explicitly; the compiler-generated destructor is used.
StartupMaintenanceTask::~StartupMaintenanceTask() = default;
262 
Run()263 void StartupMaintenanceTask::Run() {
264   TRACE_EVENT_ASYNC_BEGIN0("offline_pages", "StartupMaintenanceTask running",
265                            this);
266   store_->Execute(
267       base::BindOnce(&StartupMaintenanceSync,
268                      archive_manager_->GetTemporaryArchivesDir(),
269                      archive_manager_->GetPrivateArchivesDir()),
270       base::BindOnce(&StartupMaintenanceTask::OnStartupMaintenanceDone,
271                      weak_ptr_factory_.GetWeakPtr()),
272       false);
273 }
274 
// Result callback for the work scheduled in Run(). Closes the
// "StartupMaintenanceTask running" async trace event, attaching |result| to
// it, and then marks the task as complete.
void StartupMaintenanceTask::OnStartupMaintenanceDone(bool result) {
  TRACE_EVENT_ASYNC_END1("offline_pages", "StartupMaintenanceTask running",
                         this, "result", result);
  TaskComplete();
}
280 
281 }  // namespace offline_pages
282