1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 
7 #ifndef ROCKSDB_LITE
8 
9 #include <algorithm>
10 #include <cstdint>
11 #include <memory>
12 #include <string>
13 #include <vector>
14 
15 #include "db/db_impl/db_impl.h"
16 #include "db/job_context.h"
17 #include "db/version_set.h"
18 #include "file/file_util.h"
19 #include "file/filename.h"
20 #include "logging/logging.h"
21 #include "port/port.h"
22 #include "rocksdb/db.h"
23 #include "rocksdb/env.h"
24 #include "rocksdb/metadata.h"
25 #include "rocksdb/types.h"
26 #include "test_util/sync_point.h"
27 #include "util/file_checksum_helper.h"
28 #include "util/mutexlock.h"
29 
30 namespace ROCKSDB_NAMESPACE {
31 
FlushForGetLiveFiles()32 Status DBImpl::FlushForGetLiveFiles() {
33   mutex_.AssertHeld();
34 
35   // flush all dirty data to disk.
36   Status status;
37   if (immutable_db_options_.atomic_flush) {
38     autovector<ColumnFamilyData*> cfds;
39     SelectColumnFamiliesForAtomicFlush(&cfds);
40     mutex_.Unlock();
41     status =
42         AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kGetLiveFiles);
43     if (status.IsColumnFamilyDropped()) {
44       status = Status::OK();
45     }
46     mutex_.Lock();
47   } else {
48     for (auto cfd : *versions_->GetColumnFamilySet()) {
49       if (cfd->IsDropped()) {
50         continue;
51       }
52       cfd->Ref();
53       mutex_.Unlock();
54       status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles);
55       TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
56       TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
57       mutex_.Lock();
58       cfd->UnrefAndTryDelete();
59       if (!status.ok() && !status.IsColumnFamilyDropped()) {
60         break;
61       } else if (status.IsColumnFamilyDropped()) {
62         status = Status::OK();
63       }
64     }
65   }
66   versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
67   return status;
68 }
69 
GetLiveFiles(std::vector<std::string> & ret,uint64_t * manifest_file_size,bool flush_memtable)70 Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
71                             uint64_t* manifest_file_size,
72                             bool flush_memtable) {
73   *manifest_file_size = 0;
74 
75   mutex_.Lock();
76 
77   if (flush_memtable) {
78     Status status = FlushForGetLiveFiles();
79     if (!status.ok()) {
80       mutex_.Unlock();
81       ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
82                       status.ToString().c_str());
83       return status;
84     }
85   }
86 
87   // Make a set of all of the live table and blob files
88   std::vector<uint64_t> live_table_files;
89   std::vector<uint64_t> live_blob_files;
90   for (auto cfd : *versions_->GetColumnFamilySet()) {
91     if (cfd->IsDropped()) {
92       continue;
93     }
94     cfd->current()->AddLiveFiles(&live_table_files, &live_blob_files);
95   }
96 
97   ret.clear();
98   ret.reserve(live_table_files.size() + live_blob_files.size() +
99               3);  // for CURRENT + MANIFEST + OPTIONS
100 
101   // create names of the live files. The names are not absolute
102   // paths, instead they are relative to dbname_;
103   for (const auto& table_file_number : live_table_files) {
104     ret.emplace_back(MakeTableFileName("", table_file_number));
105   }
106 
107   for (const auto& blob_file_number : live_blob_files) {
108     ret.emplace_back(BlobFileName("", blob_file_number));
109   }
110 
111   ret.emplace_back(CurrentFileName(""));
112   ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number()));
113   // The OPTIONS file number is zero in read-write mode when OPTIONS file
114   // writing failed and the DB was configured with
115   // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
116   // number is zero when no OPTIONS file exist at all. In those cases we do not
117   // record any OPTIONS file in the live file list.
118   if (versions_->options_file_number() != 0) {
119     ret.emplace_back(OptionsFileName("", versions_->options_file_number()));
120   }
121 
122   // find length of manifest file while holding the mutex lock
123   *manifest_file_size = versions_->manifest_file_size();
124 
125   mutex_.Unlock();
126   return Status::OK();
127 }
128 
GetSortedWalFiles(VectorLogPtr & files)129 Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
130   {
131     // If caller disabled deletions, this function should return files that are
132     // guaranteed not to be deleted until deletions are re-enabled. We need to
133     // wait for pending purges to finish since WalManager doesn't know which
134     // files are going to be purged. Additional purges won't be scheduled as
135     // long as deletions are disabled (so the below loop must terminate).
136     InstrumentedMutexLock l(&mutex_);
137     while (disable_delete_obsolete_files_ > 0 &&
138            (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0)) {
139       bg_cv_.Wait();
140     }
141   }
142 
143   // Disable deletion in order to avoid the case where a file is deleted in
144   // the middle of the process so IO error is returned.
145   Status s = DisableFileDeletions();
146   bool file_deletion_supported = !s.IsNotSupported();
147   if (s.ok() || !file_deletion_supported) {
148     s = wal_manager_.GetSortedWalFiles(files);
149     if (file_deletion_supported) {
150       Status s2 = EnableFileDeletions(false);
151       if (!s2.ok() && s.ok()) {
152         s = s2;
153       }
154     }
155   }
156 
157   return s;
158 }
159 
GetCurrentWalFile(std::unique_ptr<LogFile> * current_log_file)160 Status DBImpl::GetCurrentWalFile(std::unique_ptr<LogFile>* current_log_file) {
161   uint64_t current_logfile_number;
162   {
163     InstrumentedMutexLock l(&mutex_);
164     current_logfile_number = logfile_number_;
165   }
166 
167   return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file);
168 }
169 
GetLiveFilesStorageInfo(const LiveFilesStorageInfoOptions & opts,std::vector<LiveFileStorageInfo> * files)170 Status DBImpl::GetLiveFilesStorageInfo(
171     const LiveFilesStorageInfoOptions& opts,
172     std::vector<LiveFileStorageInfo>* files) {
173   // To avoid returning partial results, only move to ouput on success
174   assert(files);
175   files->clear();
176   std::vector<LiveFileStorageInfo> results;
177 
178   // NOTE: This implementation was largely migrated from Checkpoint.
179 
180   Status s;
181   VectorLogPtr live_wal_files;
182   bool flush_memtable = true;
183   if (!immutable_db_options_.allow_2pc) {
184     if (opts.wal_size_for_flush == port::kMaxUint64) {
185       flush_memtable = false;
186     } else if (opts.wal_size_for_flush > 0) {
187       // If out standing log files are small, we skip the flush.
188       s = GetSortedWalFiles(live_wal_files);
189 
190       if (!s.ok()) {
191         return s;
192       }
193 
194       // Don't flush column families if total log size is smaller than
195       // log_size_for_flush. We copy the log files instead.
196       // We may be able to cover 2PC case too.
197       uint64_t total_wal_size = 0;
198       for (auto& wal : live_wal_files) {
199         total_wal_size += wal->SizeFileBytes();
200       }
201       if (total_wal_size < opts.wal_size_for_flush) {
202         flush_memtable = false;
203       }
204       live_wal_files.clear();
205     }
206   }
207 
208   // This is a modified version of GetLiveFiles, to get access to more
209   // metadata.
210   mutex_.Lock();
211   if (flush_memtable) {
212     Status status = FlushForGetLiveFiles();
213     if (!status.ok()) {
214       mutex_.Unlock();
215       ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
216                       status.ToString().c_str());
217       return status;
218     }
219   }
220 
221   // Make a set of all of the live table and blob files
222   for (auto cfd : *versions_->GetColumnFamilySet()) {
223     if (cfd->IsDropped()) {
224       continue;
225     }
226     VersionStorageInfo& vsi = *cfd->current()->storage_info();
227     auto& cf_paths = cfd->ioptions()->cf_paths;
228 
229     auto GetDir = [&](size_t path_id) {
230       // Matching TableFileName() behavior
231       if (path_id >= cf_paths.size()) {
232         assert(false);
233         return cf_paths.back().path;
234       } else {
235         return cf_paths[path_id].path;
236       }
237     };
238 
239     for (int level = 0; level < vsi.num_levels(); ++level) {
240       const auto& level_files = vsi.LevelFiles(level);
241       for (const auto& meta : level_files) {
242         assert(meta);
243 
244         results.emplace_back();
245         LiveFileStorageInfo& info = results.back();
246 
247         info.relative_filename = MakeTableFileName(meta->fd.GetNumber());
248         info.directory = GetDir(meta->fd.GetPathId());
249         info.file_number = meta->fd.GetNumber();
250         info.file_type = kTableFile;
251         info.size = meta->fd.GetFileSize();
252         if (opts.include_checksum_info) {
253           info.file_checksum_func_name = meta->file_checksum_func_name;
254           info.file_checksum = meta->file_checksum;
255           if (info.file_checksum_func_name.empty()) {
256             info.file_checksum_func_name = kUnknownFileChecksumFuncName;
257             info.file_checksum = kUnknownFileChecksum;
258           }
259         }
260         info.temperature = meta->temperature;
261       }
262     }
263     const auto& blob_files = vsi.GetBlobFiles();
264     for (const auto& pair : blob_files) {
265       const auto& meta = pair.second;
266       assert(meta);
267 
268       results.emplace_back();
269       LiveFileStorageInfo& info = results.back();
270 
271       info.relative_filename = BlobFileName(meta->GetBlobFileNumber());
272       info.directory = GetName();  // TODO?: support db_paths/cf_paths
273       info.file_number = meta->GetBlobFileNumber();
274       info.file_type = kBlobFile;
275       info.size = meta->GetBlobFileSize();
276       if (opts.include_checksum_info) {
277         info.file_checksum_func_name = meta->GetChecksumMethod();
278         info.file_checksum = meta->GetChecksumValue();
279         if (info.file_checksum_func_name.empty()) {
280           info.file_checksum_func_name = kUnknownFileChecksumFuncName;
281           info.file_checksum = kUnknownFileChecksum;
282         }
283       }
284       // TODO?: info.temperature
285     }
286   }
287 
288   // Capture some final info before releasing mutex
289   const uint64_t manifest_number = versions_->manifest_file_number();
290   const uint64_t manifest_size = versions_->manifest_file_size();
291   const uint64_t options_number = versions_->options_file_number();
292   const uint64_t options_size = versions_->options_file_size_;
293   const uint64_t min_log_num = MinLogNumberToKeep();
294 
295   mutex_.Unlock();
296 
297   std::string manifest_fname = DescriptorFileName(manifest_number);
298   {  // MANIFEST
299     results.emplace_back();
300     LiveFileStorageInfo& info = results.back();
301 
302     info.relative_filename = manifest_fname;
303     info.directory = GetName();
304     info.file_number = manifest_number;
305     info.file_type = kDescriptorFile;
306     info.size = manifest_size;
307     info.trim_to_size = true;
308     if (opts.include_checksum_info) {
309       info.file_checksum_func_name = kUnknownFileChecksumFuncName;
310       info.file_checksum = kUnknownFileChecksum;
311     }
312   }
313 
314   {  // CURRENT
315     results.emplace_back();
316     LiveFileStorageInfo& info = results.back();
317 
318     info.relative_filename = kCurrentFileName;
319     info.directory = GetName();
320     info.file_type = kCurrentFile;
321     // CURRENT could be replaced so we have to record the contents we want
322     // for it
323     info.replacement_contents = manifest_fname + "\n";
324     info.size = manifest_fname.size() + 1;
325     if (opts.include_checksum_info) {
326       info.file_checksum_func_name = kUnknownFileChecksumFuncName;
327       info.file_checksum = kUnknownFileChecksum;
328     }
329   }
330 
331   // The OPTIONS file number is zero in read-write mode when OPTIONS file
332   // writing failed and the DB was configured with
333   // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
334   // number is zero when no OPTIONS file exist at all. In those cases we do not
335   // record any OPTIONS file in the live file list.
336   if (options_number != 0) {
337     results.emplace_back();
338     LiveFileStorageInfo& info = results.back();
339 
340     info.relative_filename = OptionsFileName(options_number);
341     info.directory = GetName();
342     info.file_number = options_number;
343     info.file_type = kOptionsFile;
344     info.size = options_size;
345     if (opts.include_checksum_info) {
346       info.file_checksum_func_name = kUnknownFileChecksumFuncName;
347       info.file_checksum = kUnknownFileChecksum;
348     }
349   }
350 
351   // Some legacy testing stuff  TODO: carefully clean up obsolete parts
352   TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:FlushDone");
353 
354   TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1");
355   TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2");
356 
357   if (s.ok()) {
358     s = FlushWAL(false /* sync */);
359   }
360 
361   TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1");
362   TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2");
363 
364   // if we have more than one column family, we need to also get WAL files
365   if (s.ok()) {
366     s = GetSortedWalFiles(live_wal_files);
367   }
368   if (!s.ok()) {
369     return s;
370   }
371 
372   size_t wal_size = live_wal_files.size();
373 
374   ROCKS_LOG_INFO(immutable_db_options_.info_log,
375                  "Number of log files %" ROCKSDB_PRIszt, live_wal_files.size());
376 
377   // Link WAL files. Copy exact size of last one because it is the only one
378   // that has changes after the last flush.
379   auto wal_dir = immutable_db_options_.GetWalDir();
380   for (size_t i = 0; s.ok() && i < wal_size; ++i) {
381     if ((live_wal_files[i]->Type() == kAliveLogFile) &&
382         (!flush_memtable || live_wal_files[i]->LogNumber() >= min_log_num)) {
383       results.emplace_back();
384       LiveFileStorageInfo& info = results.back();
385       auto f = live_wal_files[i]->PathName();
386       assert(!f.empty() && f[0] == '/');
387       info.relative_filename = f.substr(1);
388       info.directory = wal_dir;
389       info.file_number = live_wal_files[i]->LogNumber();
390       info.file_type = kWalFile;
391       info.size = live_wal_files[i]->SizeFileBytes();
392       // Only last should need to be trimmed
393       info.trim_to_size = (i + 1 == wal_size);
394       if (opts.include_checksum_info) {
395         info.file_checksum_func_name = kUnknownFileChecksumFuncName;
396         info.file_checksum = kUnknownFileChecksum;
397       }
398     }
399   }
400 
401   if (s.ok()) {
402     // Only move output on success
403     *files = std::move(results);
404   }
405   return s;
406 }
407 
408 }  // namespace ROCKSDB_NAMESPACE
409 
410 #endif  // ROCKSDB_LITE
411