1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6
7 #ifndef ROCKSDB_LITE
8
9 #include <algorithm>
10 #include <cstdint>
11 #include <memory>
12 #include <string>
13 #include <vector>
14
15 #include "db/db_impl/db_impl.h"
16 #include "db/job_context.h"
17 #include "db/version_set.h"
18 #include "file/file_util.h"
19 #include "file/filename.h"
20 #include "logging/logging.h"
21 #include "port/port.h"
22 #include "rocksdb/db.h"
23 #include "rocksdb/env.h"
24 #include "rocksdb/metadata.h"
25 #include "rocksdb/types.h"
26 #include "test_util/sync_point.h"
27 #include "util/file_checksum_helper.h"
28 #include "util/mutexlock.h"
29
30 namespace ROCKSDB_NAMESPACE {
31
FlushForGetLiveFiles()32 Status DBImpl::FlushForGetLiveFiles() {
33 mutex_.AssertHeld();
34
35 // flush all dirty data to disk.
36 Status status;
37 if (immutable_db_options_.atomic_flush) {
38 autovector<ColumnFamilyData*> cfds;
39 SelectColumnFamiliesForAtomicFlush(&cfds);
40 mutex_.Unlock();
41 status =
42 AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kGetLiveFiles);
43 if (status.IsColumnFamilyDropped()) {
44 status = Status::OK();
45 }
46 mutex_.Lock();
47 } else {
48 for (auto cfd : *versions_->GetColumnFamilySet()) {
49 if (cfd->IsDropped()) {
50 continue;
51 }
52 cfd->Ref();
53 mutex_.Unlock();
54 status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles);
55 TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
56 TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
57 mutex_.Lock();
58 cfd->UnrefAndTryDelete();
59 if (!status.ok() && !status.IsColumnFamilyDropped()) {
60 break;
61 } else if (status.IsColumnFamilyDropped()) {
62 status = Status::OK();
63 }
64 }
65 }
66 versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
67 return status;
68 }
69
GetLiveFiles(std::vector<std::string> & ret,uint64_t * manifest_file_size,bool flush_memtable)70 Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
71 uint64_t* manifest_file_size,
72 bool flush_memtable) {
73 *manifest_file_size = 0;
74
75 mutex_.Lock();
76
77 if (flush_memtable) {
78 Status status = FlushForGetLiveFiles();
79 if (!status.ok()) {
80 mutex_.Unlock();
81 ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
82 status.ToString().c_str());
83 return status;
84 }
85 }
86
87 // Make a set of all of the live table and blob files
88 std::vector<uint64_t> live_table_files;
89 std::vector<uint64_t> live_blob_files;
90 for (auto cfd : *versions_->GetColumnFamilySet()) {
91 if (cfd->IsDropped()) {
92 continue;
93 }
94 cfd->current()->AddLiveFiles(&live_table_files, &live_blob_files);
95 }
96
97 ret.clear();
98 ret.reserve(live_table_files.size() + live_blob_files.size() +
99 3); // for CURRENT + MANIFEST + OPTIONS
100
101 // create names of the live files. The names are not absolute
102 // paths, instead they are relative to dbname_;
103 for (const auto& table_file_number : live_table_files) {
104 ret.emplace_back(MakeTableFileName("", table_file_number));
105 }
106
107 for (const auto& blob_file_number : live_blob_files) {
108 ret.emplace_back(BlobFileName("", blob_file_number));
109 }
110
111 ret.emplace_back(CurrentFileName(""));
112 ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number()));
113 // The OPTIONS file number is zero in read-write mode when OPTIONS file
114 // writing failed and the DB was configured with
115 // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
116 // number is zero when no OPTIONS file exist at all. In those cases we do not
117 // record any OPTIONS file in the live file list.
118 if (versions_->options_file_number() != 0) {
119 ret.emplace_back(OptionsFileName("", versions_->options_file_number()));
120 }
121
122 // find length of manifest file while holding the mutex lock
123 *manifest_file_size = versions_->manifest_file_size();
124
125 mutex_.Unlock();
126 return Status::OK();
127 }
128
GetSortedWalFiles(VectorLogPtr & files)129 Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
130 {
131 // If caller disabled deletions, this function should return files that are
132 // guaranteed not to be deleted until deletions are re-enabled. We need to
133 // wait for pending purges to finish since WalManager doesn't know which
134 // files are going to be purged. Additional purges won't be scheduled as
135 // long as deletions are disabled (so the below loop must terminate).
136 InstrumentedMutexLock l(&mutex_);
137 while (disable_delete_obsolete_files_ > 0 &&
138 (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0)) {
139 bg_cv_.Wait();
140 }
141 }
142
143 // Disable deletion in order to avoid the case where a file is deleted in
144 // the middle of the process so IO error is returned.
145 Status s = DisableFileDeletions();
146 bool file_deletion_supported = !s.IsNotSupported();
147 if (s.ok() || !file_deletion_supported) {
148 s = wal_manager_.GetSortedWalFiles(files);
149 if (file_deletion_supported) {
150 Status s2 = EnableFileDeletions(false);
151 if (!s2.ok() && s.ok()) {
152 s = s2;
153 }
154 }
155 }
156
157 return s;
158 }
159
GetCurrentWalFile(std::unique_ptr<LogFile> * current_log_file)160 Status DBImpl::GetCurrentWalFile(std::unique_ptr<LogFile>* current_log_file) {
161 uint64_t current_logfile_number;
162 {
163 InstrumentedMutexLock l(&mutex_);
164 current_logfile_number = logfile_number_;
165 }
166
167 return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file);
168 }
169
GetLiveFilesStorageInfo(const LiveFilesStorageInfoOptions & opts,std::vector<LiveFileStorageInfo> * files)170 Status DBImpl::GetLiveFilesStorageInfo(
171 const LiveFilesStorageInfoOptions& opts,
172 std::vector<LiveFileStorageInfo>* files) {
173 // To avoid returning partial results, only move to ouput on success
174 assert(files);
175 files->clear();
176 std::vector<LiveFileStorageInfo> results;
177
178 // NOTE: This implementation was largely migrated from Checkpoint.
179
180 Status s;
181 VectorLogPtr live_wal_files;
182 bool flush_memtable = true;
183 if (!immutable_db_options_.allow_2pc) {
184 if (opts.wal_size_for_flush == port::kMaxUint64) {
185 flush_memtable = false;
186 } else if (opts.wal_size_for_flush > 0) {
187 // If out standing log files are small, we skip the flush.
188 s = GetSortedWalFiles(live_wal_files);
189
190 if (!s.ok()) {
191 return s;
192 }
193
194 // Don't flush column families if total log size is smaller than
195 // log_size_for_flush. We copy the log files instead.
196 // We may be able to cover 2PC case too.
197 uint64_t total_wal_size = 0;
198 for (auto& wal : live_wal_files) {
199 total_wal_size += wal->SizeFileBytes();
200 }
201 if (total_wal_size < opts.wal_size_for_flush) {
202 flush_memtable = false;
203 }
204 live_wal_files.clear();
205 }
206 }
207
208 // This is a modified version of GetLiveFiles, to get access to more
209 // metadata.
210 mutex_.Lock();
211 if (flush_memtable) {
212 Status status = FlushForGetLiveFiles();
213 if (!status.ok()) {
214 mutex_.Unlock();
215 ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
216 status.ToString().c_str());
217 return status;
218 }
219 }
220
221 // Make a set of all of the live table and blob files
222 for (auto cfd : *versions_->GetColumnFamilySet()) {
223 if (cfd->IsDropped()) {
224 continue;
225 }
226 VersionStorageInfo& vsi = *cfd->current()->storage_info();
227 auto& cf_paths = cfd->ioptions()->cf_paths;
228
229 auto GetDir = [&](size_t path_id) {
230 // Matching TableFileName() behavior
231 if (path_id >= cf_paths.size()) {
232 assert(false);
233 return cf_paths.back().path;
234 } else {
235 return cf_paths[path_id].path;
236 }
237 };
238
239 for (int level = 0; level < vsi.num_levels(); ++level) {
240 const auto& level_files = vsi.LevelFiles(level);
241 for (const auto& meta : level_files) {
242 assert(meta);
243
244 results.emplace_back();
245 LiveFileStorageInfo& info = results.back();
246
247 info.relative_filename = MakeTableFileName(meta->fd.GetNumber());
248 info.directory = GetDir(meta->fd.GetPathId());
249 info.file_number = meta->fd.GetNumber();
250 info.file_type = kTableFile;
251 info.size = meta->fd.GetFileSize();
252 if (opts.include_checksum_info) {
253 info.file_checksum_func_name = meta->file_checksum_func_name;
254 info.file_checksum = meta->file_checksum;
255 if (info.file_checksum_func_name.empty()) {
256 info.file_checksum_func_name = kUnknownFileChecksumFuncName;
257 info.file_checksum = kUnknownFileChecksum;
258 }
259 }
260 info.temperature = meta->temperature;
261 }
262 }
263 const auto& blob_files = vsi.GetBlobFiles();
264 for (const auto& pair : blob_files) {
265 const auto& meta = pair.second;
266 assert(meta);
267
268 results.emplace_back();
269 LiveFileStorageInfo& info = results.back();
270
271 info.relative_filename = BlobFileName(meta->GetBlobFileNumber());
272 info.directory = GetName(); // TODO?: support db_paths/cf_paths
273 info.file_number = meta->GetBlobFileNumber();
274 info.file_type = kBlobFile;
275 info.size = meta->GetBlobFileSize();
276 if (opts.include_checksum_info) {
277 info.file_checksum_func_name = meta->GetChecksumMethod();
278 info.file_checksum = meta->GetChecksumValue();
279 if (info.file_checksum_func_name.empty()) {
280 info.file_checksum_func_name = kUnknownFileChecksumFuncName;
281 info.file_checksum = kUnknownFileChecksum;
282 }
283 }
284 // TODO?: info.temperature
285 }
286 }
287
288 // Capture some final info before releasing mutex
289 const uint64_t manifest_number = versions_->manifest_file_number();
290 const uint64_t manifest_size = versions_->manifest_file_size();
291 const uint64_t options_number = versions_->options_file_number();
292 const uint64_t options_size = versions_->options_file_size_;
293 const uint64_t min_log_num = MinLogNumberToKeep();
294
295 mutex_.Unlock();
296
297 std::string manifest_fname = DescriptorFileName(manifest_number);
298 { // MANIFEST
299 results.emplace_back();
300 LiveFileStorageInfo& info = results.back();
301
302 info.relative_filename = manifest_fname;
303 info.directory = GetName();
304 info.file_number = manifest_number;
305 info.file_type = kDescriptorFile;
306 info.size = manifest_size;
307 info.trim_to_size = true;
308 if (opts.include_checksum_info) {
309 info.file_checksum_func_name = kUnknownFileChecksumFuncName;
310 info.file_checksum = kUnknownFileChecksum;
311 }
312 }
313
314 { // CURRENT
315 results.emplace_back();
316 LiveFileStorageInfo& info = results.back();
317
318 info.relative_filename = kCurrentFileName;
319 info.directory = GetName();
320 info.file_type = kCurrentFile;
321 // CURRENT could be replaced so we have to record the contents we want
322 // for it
323 info.replacement_contents = manifest_fname + "\n";
324 info.size = manifest_fname.size() + 1;
325 if (opts.include_checksum_info) {
326 info.file_checksum_func_name = kUnknownFileChecksumFuncName;
327 info.file_checksum = kUnknownFileChecksum;
328 }
329 }
330
331 // The OPTIONS file number is zero in read-write mode when OPTIONS file
332 // writing failed and the DB was configured with
333 // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
334 // number is zero when no OPTIONS file exist at all. In those cases we do not
335 // record any OPTIONS file in the live file list.
336 if (options_number != 0) {
337 results.emplace_back();
338 LiveFileStorageInfo& info = results.back();
339
340 info.relative_filename = OptionsFileName(options_number);
341 info.directory = GetName();
342 info.file_number = options_number;
343 info.file_type = kOptionsFile;
344 info.size = options_size;
345 if (opts.include_checksum_info) {
346 info.file_checksum_func_name = kUnknownFileChecksumFuncName;
347 info.file_checksum = kUnknownFileChecksum;
348 }
349 }
350
351 // Some legacy testing stuff TODO: carefully clean up obsolete parts
352 TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:FlushDone");
353
354 TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1");
355 TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2");
356
357 if (s.ok()) {
358 s = FlushWAL(false /* sync */);
359 }
360
361 TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1");
362 TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2");
363
364 // if we have more than one column family, we need to also get WAL files
365 if (s.ok()) {
366 s = GetSortedWalFiles(live_wal_files);
367 }
368 if (!s.ok()) {
369 return s;
370 }
371
372 size_t wal_size = live_wal_files.size();
373
374 ROCKS_LOG_INFO(immutable_db_options_.info_log,
375 "Number of log files %" ROCKSDB_PRIszt, live_wal_files.size());
376
377 // Link WAL files. Copy exact size of last one because it is the only one
378 // that has changes after the last flush.
379 auto wal_dir = immutable_db_options_.GetWalDir();
380 for (size_t i = 0; s.ok() && i < wal_size; ++i) {
381 if ((live_wal_files[i]->Type() == kAliveLogFile) &&
382 (!flush_memtable || live_wal_files[i]->LogNumber() >= min_log_num)) {
383 results.emplace_back();
384 LiveFileStorageInfo& info = results.back();
385 auto f = live_wal_files[i]->PathName();
386 assert(!f.empty() && f[0] == '/');
387 info.relative_filename = f.substr(1);
388 info.directory = wal_dir;
389 info.file_number = live_wal_files[i]->LogNumber();
390 info.file_type = kWalFile;
391 info.size = live_wal_files[i]->SizeFileBytes();
392 // Only last should need to be trimmed
393 info.trim_to_size = (i + 1 == wal_size);
394 if (opts.include_checksum_info) {
395 info.file_checksum_func_name = kUnknownFileChecksumFuncName;
396 info.file_checksum = kUnknownFileChecksum;
397 }
398 }
399 }
400
401 if (s.ok()) {
402 // Only move output on success
403 *files = std::move(results);
404 }
405 return s;
406 }
407
408 } // namespace ROCKSDB_NAMESPACE
409
410 #endif // ROCKSDB_LITE
411