1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2012 Facebook.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file.
9
10 #ifndef ROCKSDB_LITE
11
12 #include "utilities/checkpoint/checkpoint_impl.h"
13
14 #include <algorithm>
15 #include <cinttypes>
16 #include <string>
17 #include <tuple>
18 #include <unordered_set>
19 #include <vector>
20
21 #include "db/wal_manager.h"
22 #include "file/file_util.h"
23 #include "file/filename.h"
24 #include "logging/logging.h"
25 #include "port/port.h"
26 #include "rocksdb/db.h"
27 #include "rocksdb/env.h"
28 #include "rocksdb/metadata.h"
29 #include "rocksdb/options.h"
30 #include "rocksdb/transaction_log.h"
31 #include "rocksdb/types.h"
32 #include "rocksdb/utilities/checkpoint.h"
33 #include "test_util/sync_point.h"
34 #include "util/cast_util.h"
35 #include "util/file_checksum_helper.h"
36
37 namespace ROCKSDB_NAMESPACE {
38
Create(DB * db,Checkpoint ** checkpoint_ptr)39 Status Checkpoint::Create(DB* db, Checkpoint** checkpoint_ptr) {
40 *checkpoint_ptr = new CheckpointImpl(db);
41 return Status::OK();
42 }
43
CreateCheckpoint(const std::string &,uint64_t,uint64_t *)44 Status Checkpoint::CreateCheckpoint(const std::string& /*checkpoint_dir*/,
45 uint64_t /*log_size_for_flush*/,
46 uint64_t* /*sequence_number_ptr*/) {
47 return Status::NotSupported("");
48 }
49
CleanStagingDirectory(const std::string & full_private_path,Logger * info_log)50 void CheckpointImpl::CleanStagingDirectory(const std::string& full_private_path,
51 Logger* info_log) {
52 std::vector<std::string> subchildren;
53 Status s = db_->GetEnv()->FileExists(full_private_path);
54 if (s.IsNotFound()) {
55 return;
56 }
57 ROCKS_LOG_INFO(info_log, "File exists %s -- %s", full_private_path.c_str(),
58 s.ToString().c_str());
59 s = db_->GetEnv()->GetChildren(full_private_path, &subchildren);
60 if (s.ok()) {
61 for (auto& subchild : subchildren) {
62 std::string subchild_path = full_private_path + "/" + subchild;
63 s = db_->GetEnv()->DeleteFile(subchild_path);
64 ROCKS_LOG_INFO(info_log, "Delete file %s -- %s", subchild_path.c_str(),
65 s.ToString().c_str());
66 }
67 }
68 // finally delete the private dir
69 s = db_->GetEnv()->DeleteDir(full_private_path);
70 ROCKS_LOG_INFO(info_log, "Delete dir %s -- %s", full_private_path.c_str(),
71 s.ToString().c_str());
72 }
73
ExportColumnFamily(ColumnFamilyHandle *,const std::string &,ExportImportFilesMetaData **)74 Status Checkpoint::ExportColumnFamily(
75 ColumnFamilyHandle* /*handle*/, const std::string& /*export_dir*/,
76 ExportImportFilesMetaData** /*metadata*/) {
77 return Status::NotSupported("");
78 }
79
80 // Builds an openable snapshot of RocksDB
CreateCheckpoint(const std::string & checkpoint_dir,uint64_t log_size_for_flush,uint64_t * sequence_number_ptr)81 Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir,
82 uint64_t log_size_for_flush,
83 uint64_t* sequence_number_ptr) {
84 DBOptions db_options = db_->GetDBOptions();
85
86 Status s = db_->GetEnv()->FileExists(checkpoint_dir);
87 if (s.ok()) {
88 return Status::InvalidArgument("Directory exists");
89 } else if (!s.IsNotFound()) {
90 assert(s.IsIOError());
91 return s;
92 }
93
94 ROCKS_LOG_INFO(
95 db_options.info_log,
96 "Started the snapshot process -- creating snapshot in directory %s",
97 checkpoint_dir.c_str());
98
99 size_t final_nonslash_idx = checkpoint_dir.find_last_not_of('/');
100 if (final_nonslash_idx == std::string::npos) {
101 // npos means it's only slashes or empty. Non-empty means it's the root
102 // directory, but it shouldn't be because we verified above the directory
103 // doesn't exist.
104 assert(checkpoint_dir.empty());
105 return Status::InvalidArgument("invalid checkpoint directory name");
106 }
107
108 std::string full_private_path =
109 checkpoint_dir.substr(0, final_nonslash_idx + 1) + ".tmp";
110 ROCKS_LOG_INFO(db_options.info_log,
111 "Snapshot process -- using temporary directory %s",
112 full_private_path.c_str());
113 CleanStagingDirectory(full_private_path, db_options.info_log.get());
114 // create snapshot directory
115 s = db_->GetEnv()->CreateDir(full_private_path);
116 uint64_t sequence_number = 0;
117 if (s.ok()) {
118 // enable file deletions
119 s = db_->DisableFileDeletions();
120 const bool disabled_file_deletions = s.ok();
121
122 if (s.ok() || s.IsNotSupported()) {
123 s = CreateCustomCheckpoint(
124 [&](const std::string& src_dirname, const std::string& fname,
125 FileType) {
126 ROCKS_LOG_INFO(db_options.info_log, "Hard Linking %s",
127 fname.c_str());
128 return db_->GetFileSystem()->LinkFile(
129 src_dirname + "/" + fname, full_private_path + "/" + fname,
130 IOOptions(), nullptr);
131 } /* link_file_cb */,
132 [&](const std::string& src_dirname, const std::string& fname,
133 uint64_t size_limit_bytes, FileType,
134 const std::string& /* checksum_func_name */,
135 const std::string& /* checksum_val */) {
136 ROCKS_LOG_INFO(db_options.info_log, "Copying %s", fname.c_str());
137 return CopyFile(db_->GetFileSystem(), src_dirname + "/" + fname,
138 full_private_path + "/" + fname, size_limit_bytes,
139 db_options.use_fsync);
140 } /* copy_file_cb */,
141 [&](const std::string& fname, const std::string& contents, FileType) {
142 ROCKS_LOG_INFO(db_options.info_log, "Creating %s", fname.c_str());
143 return CreateFile(db_->GetFileSystem(),
144 full_private_path + "/" + fname, contents,
145 db_options.use_fsync);
146 } /* create_file_cb */,
147 &sequence_number, log_size_for_flush);
148
149 // we copied all the files, enable file deletions
150 if (disabled_file_deletions) {
151 Status ss = db_->EnableFileDeletions(false);
152 assert(ss.ok());
153 ss.PermitUncheckedError();
154 }
155 }
156 }
157
158 if (s.ok()) {
159 // move tmp private backup to real snapshot directory
160 s = db_->GetEnv()->RenameFile(full_private_path, checkpoint_dir);
161 }
162 if (s.ok()) {
163 std::unique_ptr<Directory> checkpoint_directory;
164 s = db_->GetEnv()->NewDirectory(checkpoint_dir, &checkpoint_directory);
165 if (s.ok() && checkpoint_directory != nullptr) {
166 s = checkpoint_directory->Fsync();
167 }
168 }
169
170 if (s.ok()) {
171 if (sequence_number_ptr != nullptr) {
172 *sequence_number_ptr = sequence_number;
173 }
174 // here we know that we succeeded and installed the new snapshot
175 ROCKS_LOG_INFO(db_options.info_log, "Snapshot DONE. All is good");
176 ROCKS_LOG_INFO(db_options.info_log, "Snapshot sequence number: %" PRIu64,
177 sequence_number);
178 } else {
179 // clean all the files we might have created
180 ROCKS_LOG_INFO(db_options.info_log, "Snapshot failed -- %s",
181 s.ToString().c_str());
182 CleanStagingDirectory(full_private_path, db_options.info_log.get());
183 }
184 return s;
185 }
186
CreateCustomCheckpoint(std::function<Status (const std::string & src_dirname,const std::string & src_fname,FileType type)> link_file_cb,std::function<Status (const std::string & src_dirname,const std::string & src_fname,uint64_t size_limit_bytes,FileType type,const std::string & checksum_func_name,const std::string & checksum_val)> copy_file_cb,std::function<Status (const std::string & fname,const std::string & contents,FileType type)> create_file_cb,uint64_t * sequence_number,uint64_t log_size_for_flush,bool get_live_table_checksum)187 Status CheckpointImpl::CreateCustomCheckpoint(
188 std::function<Status(const std::string& src_dirname,
189 const std::string& src_fname, FileType type)>
190 link_file_cb,
191 std::function<Status(
192 const std::string& src_dirname, const std::string& src_fname,
193 uint64_t size_limit_bytes, FileType type,
194 const std::string& checksum_func_name, const std::string& checksum_val)>
195 copy_file_cb,
196 std::function<Status(const std::string& fname, const std::string& contents,
197 FileType type)>
198 create_file_cb,
199 uint64_t* sequence_number, uint64_t log_size_for_flush,
200 bool get_live_table_checksum) {
201 *sequence_number = db_->GetLatestSequenceNumber();
202
203 LiveFilesStorageInfoOptions opts;
204 opts.include_checksum_info = get_live_table_checksum;
205 opts.wal_size_for_flush = log_size_for_flush;
206
207 std::vector<LiveFileStorageInfo> infos;
208 {
209 Status s = db_->GetLiveFilesStorageInfo(opts, &infos);
210 if (!s.ok()) {
211 return s;
212 }
213 }
214
215 // Verify that everything except WAL files are in same directory
216 // (db_paths / cf_paths not supported)
217 std::unordered_set<std::string> dirs;
218 for (auto& info : infos) {
219 if (info.file_type != kWalFile) {
220 dirs.insert(info.directory);
221 }
222 }
223 if (dirs.size() > 1) {
224 return Status::NotSupported(
225 "db_paths / cf_paths not supported for Checkpoint nor BackupEngine");
226 }
227
228 bool same_fs = true;
229
230 for (auto& info : infos) {
231 Status s;
232 if (!info.replacement_contents.empty()) {
233 // Currently should only be used for CURRENT file.
234 assert(info.file_type == kCurrentFile);
235
236 if (info.size != info.replacement_contents.size()) {
237 s = Status::Corruption("Inconsistent size metadata for " +
238 info.relative_filename);
239 } else {
240 s = create_file_cb(info.relative_filename, info.replacement_contents,
241 info.file_type);
242 }
243 } else {
244 if (same_fs && !info.trim_to_size) {
245 s = link_file_cb(info.directory, info.relative_filename,
246 info.file_type);
247 if (s.IsNotSupported()) {
248 same_fs = false;
249 s = Status::OK();
250 }
251 s.MustCheck();
252 }
253 if (!same_fs || info.trim_to_size) {
254 assert(info.file_checksum_func_name.empty() ==
255 !opts.include_checksum_info);
256 // no assertion on file_checksum because empty is used for both "not
257 // set" and "unknown"
258 if (opts.include_checksum_info) {
259 s = copy_file_cb(info.directory, info.relative_filename, info.size,
260 info.file_type, info.file_checksum_func_name,
261 info.file_checksum);
262 } else {
263 s = copy_file_cb(info.directory, info.relative_filename, info.size,
264 info.file_type, kUnknownFileChecksumFuncName,
265 kUnknownFileChecksum);
266 }
267 }
268 }
269 if (!s.ok()) {
270 return s;
271 }
272 }
273
274 return Status::OK();
275 }
276
277 // Exports all live SST files of a specified Column Family onto export_dir,
278 // returning SST files information in metadata.
ExportColumnFamily(ColumnFamilyHandle * handle,const std::string & export_dir,ExportImportFilesMetaData ** metadata)279 Status CheckpointImpl::ExportColumnFamily(
280 ColumnFamilyHandle* handle, const std::string& export_dir,
281 ExportImportFilesMetaData** metadata) {
282 auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handle);
283 const auto cf_name = cfh->GetName();
284 const auto db_options = db_->GetDBOptions();
285
286 assert(metadata != nullptr);
287 assert(*metadata == nullptr);
288 auto s = db_->GetEnv()->FileExists(export_dir);
289 if (s.ok()) {
290 return Status::InvalidArgument("Specified export_dir exists");
291 } else if (!s.IsNotFound()) {
292 assert(s.IsIOError());
293 return s;
294 }
295
296 const auto final_nonslash_idx = export_dir.find_last_not_of('/');
297 if (final_nonslash_idx == std::string::npos) {
298 return Status::InvalidArgument("Specified export_dir invalid");
299 }
300 ROCKS_LOG_INFO(db_options.info_log,
301 "[%s] export column family onto export directory %s",
302 cf_name.c_str(), export_dir.c_str());
303
304 // Create a temporary export directory.
305 const auto tmp_export_dir =
306 export_dir.substr(0, final_nonslash_idx + 1) + ".tmp";
307 s = db_->GetEnv()->CreateDir(tmp_export_dir);
308
309 if (s.ok()) {
310 s = db_->Flush(ROCKSDB_NAMESPACE::FlushOptions(), handle);
311 }
312
313 ColumnFamilyMetaData db_metadata;
314 if (s.ok()) {
315 // Export live sst files with file deletions disabled.
316 s = db_->DisableFileDeletions();
317 if (s.ok()) {
318 db_->GetColumnFamilyMetaData(handle, &db_metadata);
319
320 s = ExportFilesInMetaData(
321 db_options, db_metadata,
322 [&](const std::string& src_dirname, const std::string& fname) {
323 ROCKS_LOG_INFO(db_options.info_log, "[%s] HardLinking %s",
324 cf_name.c_str(), fname.c_str());
325 return db_->GetEnv()->LinkFile(src_dirname + fname,
326 tmp_export_dir + fname);
327 } /*link_file_cb*/,
328 [&](const std::string& src_dirname, const std::string& fname) {
329 ROCKS_LOG_INFO(db_options.info_log, "[%s] Copying %s",
330 cf_name.c_str(), fname.c_str());
331 return CopyFile(db_->GetFileSystem(), src_dirname + fname,
332 tmp_export_dir + fname, 0, db_options.use_fsync);
333 } /*copy_file_cb*/);
334
335 const auto enable_status = db_->EnableFileDeletions(false /*force*/);
336 if (s.ok()) {
337 s = enable_status;
338 }
339 }
340 }
341
342 auto moved_to_user_specified_dir = false;
343 if (s.ok()) {
344 // Move temporary export directory to the actual export directory.
345 s = db_->GetEnv()->RenameFile(tmp_export_dir, export_dir);
346 }
347
348 if (s.ok()) {
349 // Fsync export directory.
350 moved_to_user_specified_dir = true;
351 std::unique_ptr<Directory> dir_ptr;
352 s = db_->GetEnv()->NewDirectory(export_dir, &dir_ptr);
353 if (s.ok()) {
354 assert(dir_ptr != nullptr);
355 s = dir_ptr->Fsync();
356 }
357 }
358
359 if (s.ok()) {
360 // Export of files succeeded. Fill in the metadata information.
361 auto result_metadata = new ExportImportFilesMetaData();
362 result_metadata->db_comparator_name = handle->GetComparator()->Name();
363 for (const auto& level_metadata : db_metadata.levels) {
364 for (const auto& file_metadata : level_metadata.files) {
365 LiveFileMetaData live_file_metadata;
366 live_file_metadata.size = file_metadata.size;
367 live_file_metadata.name = std::move(file_metadata.name);
368 live_file_metadata.file_number = file_metadata.file_number;
369 live_file_metadata.db_path = export_dir;
370 live_file_metadata.smallest_seqno = file_metadata.smallest_seqno;
371 live_file_metadata.largest_seqno = file_metadata.largest_seqno;
372 live_file_metadata.smallestkey = std::move(file_metadata.smallestkey);
373 live_file_metadata.largestkey = std::move(file_metadata.largestkey);
374 live_file_metadata.oldest_blob_file_number =
375 file_metadata.oldest_blob_file_number;
376 live_file_metadata.level = level_metadata.level;
377 result_metadata->files.push_back(live_file_metadata);
378 }
379 *metadata = result_metadata;
380 }
381 ROCKS_LOG_INFO(db_options.info_log, "[%s] Export succeeded.",
382 cf_name.c_str());
383 } else {
384 // Failure: Clean up all the files/directories created.
385 ROCKS_LOG_INFO(db_options.info_log, "[%s] Export failed. %s",
386 cf_name.c_str(), s.ToString().c_str());
387 std::vector<std::string> subchildren;
388 const auto cleanup_dir =
389 moved_to_user_specified_dir ? export_dir : tmp_export_dir;
390 db_->GetEnv()->GetChildren(cleanup_dir, &subchildren);
391 for (const auto& subchild : subchildren) {
392 const auto subchild_path = cleanup_dir + "/" + subchild;
393 const auto status = db_->GetEnv()->DeleteFile(subchild_path);
394 if (!status.ok()) {
395 ROCKS_LOG_WARN(db_options.info_log, "Failed to cleanup file %s: %s",
396 subchild_path.c_str(), status.ToString().c_str());
397 }
398 }
399 const auto status = db_->GetEnv()->DeleteDir(cleanup_dir);
400 if (!status.ok()) {
401 ROCKS_LOG_WARN(db_options.info_log, "Failed to cleanup dir %s: %s",
402 cleanup_dir.c_str(), status.ToString().c_str());
403 }
404 }
405 return s;
406 }
407
ExportFilesInMetaData(const DBOptions & db_options,const ColumnFamilyMetaData & metadata,std::function<Status (const std::string & src_dirname,const std::string & src_fname)> link_file_cb,std::function<Status (const std::string & src_dirname,const std::string & src_fname)> copy_file_cb)408 Status CheckpointImpl::ExportFilesInMetaData(
409 const DBOptions& db_options, const ColumnFamilyMetaData& metadata,
410 std::function<Status(const std::string& src_dirname,
411 const std::string& src_fname)>
412 link_file_cb,
413 std::function<Status(const std::string& src_dirname,
414 const std::string& src_fname)>
415 copy_file_cb) {
416 Status s;
417 auto hardlink_file = true;
418
419 // Copy/hard link files in metadata.
420 size_t num_files = 0;
421 for (const auto& level_metadata : metadata.levels) {
422 for (const auto& file_metadata : level_metadata.files) {
423 uint64_t number;
424 FileType type;
425 const auto ok = ParseFileName(file_metadata.name, &number, &type);
426 if (!ok) {
427 s = Status::Corruption("Could not parse file name");
428 break;
429 }
430
431 // We should only get sst files here.
432 assert(type == kTableFile);
433 assert(file_metadata.size > 0 && file_metadata.name[0] == '/');
434 const auto src_fname = file_metadata.name;
435 ++num_files;
436
437 if (hardlink_file) {
438 s = link_file_cb(db_->GetName(), src_fname);
439 if (num_files == 1 && s.IsNotSupported()) {
440 // Fallback to copy if link failed due to cross-device directories.
441 hardlink_file = false;
442 s = Status::OK();
443 }
444 }
445 if (!hardlink_file) {
446 s = copy_file_cb(db_->GetName(), src_fname);
447 }
448 if (!s.ok()) {
449 break;
450 }
451 }
452 }
453 ROCKS_LOG_INFO(db_options.info_log, "Number of table files %" ROCKSDB_PRIszt,
454 num_files);
455
456 return s;
457 }
458 } // namespace ROCKSDB_NAMESPACE
459
460 #endif // ROCKSDB_LITE
461