1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #include "db/flush_job.h"
11
12 #include <cinttypes>
13
14 #include <algorithm>
15 #include <vector>
16
17 #include "db/builder.h"
18 #include "db/db_iter.h"
19 #include "db/dbformat.h"
20 #include "db/event_helpers.h"
21 #include "db/log_reader.h"
22 #include "db/log_writer.h"
23 #include "db/memtable.h"
24 #include "db/memtable_list.h"
25 #include "db/merge_context.h"
26 #include "db/range_tombstone_fragmenter.h"
27 #include "db/version_set.h"
28 #include "file/file_util.h"
29 #include "file/filename.h"
30 #include "logging/event_logger.h"
31 #include "logging/log_buffer.h"
32 #include "logging/logging.h"
33 #include "monitoring/iostats_context_imp.h"
34 #include "monitoring/perf_context_imp.h"
35 #include "monitoring/thread_status_util.h"
36 #include "port/port.h"
37 #include "rocksdb/db.h"
38 #include "rocksdb/env.h"
39 #include "rocksdb/statistics.h"
40 #include "rocksdb/status.h"
41 #include "rocksdb/table.h"
42 #include "table/merging_iterator.h"
43 #include "table/table_builder.h"
44 #include "table/two_level_iterator.h"
45 #include "test_util/sync_point.h"
46 #include "util/coding.h"
47 #include "util/mutexlock.h"
48 #include "util/stop_watch.h"
49
50 namespace ROCKSDB_NAMESPACE {
51
GetFlushReasonString(FlushReason flush_reason)52 const char* GetFlushReasonString (FlushReason flush_reason) {
53 switch (flush_reason) {
54 case FlushReason::kOthers:
55 return "Other Reasons";
56 case FlushReason::kGetLiveFiles:
57 return "Get Live Files";
58 case FlushReason::kShutDown:
59 return "Shut down";
60 case FlushReason::kExternalFileIngestion:
61 return "External File Ingestion";
62 case FlushReason::kManualCompaction:
63 return "Manual Compaction";
64 case FlushReason::kWriteBufferManager:
65 return "Write Buffer Manager";
66 case FlushReason::kWriteBufferFull:
67 return "Write Buffer Full";
68 case FlushReason::kTest:
69 return "Test";
70 case FlushReason::kDeleteFiles:
71 return "Delete Files";
72 case FlushReason::kAutoCompaction:
73 return "Auto Compaction";
74 case FlushReason::kManualFlush:
75 return "Manual Flush";
76 case FlushReason::kErrorRecovery:
77 return "Error Recovery";
78 case FlushReason::kWalFull:
79 return "WAL Full";
80 default:
81 return "Invalid";
82 }
83 }
84
FlushJob(const std::string & dbname,ColumnFamilyData * cfd,const ImmutableDBOptions & db_options,const MutableCFOptions & mutable_cf_options,uint64_t max_memtable_id,const FileOptions & file_options,VersionSet * versions,InstrumentedMutex * db_mutex,std::atomic<bool> * shutting_down,std::vector<SequenceNumber> existing_snapshots,SequenceNumber earliest_write_conflict_snapshot,SnapshotChecker * snapshot_checker,JobContext * job_context,LogBuffer * log_buffer,FSDirectory * db_directory,FSDirectory * output_file_directory,CompressionType output_compression,Statistics * stats,EventLogger * event_logger,bool measure_io_stats,const bool sync_output_directory,const bool write_manifest,Env::Priority thread_pri,const std::shared_ptr<IOTracer> & io_tracer,const std::string & db_id,const std::string & db_session_id,std::string full_history_ts_low,BlobFileCompletionCallback * blob_callback)85 FlushJob::FlushJob(
86 const std::string& dbname, ColumnFamilyData* cfd,
87 const ImmutableDBOptions& db_options,
88 const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id,
89 const FileOptions& file_options, VersionSet* versions,
90 InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down,
91 std::vector<SequenceNumber> existing_snapshots,
92 SequenceNumber earliest_write_conflict_snapshot,
93 SnapshotChecker* snapshot_checker, JobContext* job_context,
94 LogBuffer* log_buffer, FSDirectory* db_directory,
95 FSDirectory* output_file_directory, CompressionType output_compression,
96 Statistics* stats, EventLogger* event_logger, bool measure_io_stats,
97 const bool sync_output_directory, const bool write_manifest,
98 Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
99 const std::string& db_id, const std::string& db_session_id,
100 std::string full_history_ts_low, BlobFileCompletionCallback* blob_callback)
101 : dbname_(dbname),
102 db_id_(db_id),
103 db_session_id_(db_session_id),
104 cfd_(cfd),
105 db_options_(db_options),
106 mutable_cf_options_(mutable_cf_options),
107 max_memtable_id_(max_memtable_id),
108 file_options_(file_options),
109 versions_(versions),
110 db_mutex_(db_mutex),
111 shutting_down_(shutting_down),
112 existing_snapshots_(std::move(existing_snapshots)),
113 earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
114 snapshot_checker_(snapshot_checker),
115 job_context_(job_context),
116 log_buffer_(log_buffer),
117 db_directory_(db_directory),
118 output_file_directory_(output_file_directory),
119 output_compression_(output_compression),
120 stats_(stats),
121 event_logger_(event_logger),
122 measure_io_stats_(measure_io_stats),
123 sync_output_directory_(sync_output_directory),
124 write_manifest_(write_manifest),
125 edit_(nullptr),
126 base_(nullptr),
127 pick_memtable_called(false),
128 thread_pri_(thread_pri),
129 io_tracer_(io_tracer),
130 clock_(db_options_.clock),
131 full_history_ts_low_(std::move(full_history_ts_low)),
132 blob_callback_(blob_callback) {
133 // Update the thread status to indicate flush.
134 ReportStartedFlush();
135 TEST_SYNC_POINT("FlushJob::FlushJob()");
136 }
137
~FlushJob()138 FlushJob::~FlushJob() {
139 io_status_.PermitUncheckedError();
140 ThreadStatusUtil::ResetThreadStatus();
141 }
142
ReportStartedFlush()143 void FlushJob::ReportStartedFlush() {
144 ThreadStatusUtil::SetColumnFamily(cfd_, cfd_->ioptions()->env,
145 db_options_.enable_thread_tracking);
146 ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH);
147 ThreadStatusUtil::SetThreadOperationProperty(
148 ThreadStatus::COMPACTION_JOB_ID,
149 job_context_->job_id);
150 IOSTATS_RESET(bytes_written);
151 }
152
ReportFlushInputSize(const autovector<MemTable * > & mems)153 void FlushJob::ReportFlushInputSize(const autovector<MemTable*>& mems) {
154 uint64_t input_size = 0;
155 for (auto* mem : mems) {
156 input_size += mem->ApproximateMemoryUsage();
157 }
158 ThreadStatusUtil::IncreaseThreadOperationProperty(
159 ThreadStatus::FLUSH_BYTES_MEMTABLES,
160 input_size);
161 }
162
RecordFlushIOStats()163 void FlushJob::RecordFlushIOStats() {
164 RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written));
165 ThreadStatusUtil::IncreaseThreadOperationProperty(
166 ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
167 IOSTATS_RESET(bytes_written);
168 }
PickMemTable()169 void FlushJob::PickMemTable() {
170 db_mutex_->AssertHeld();
171 assert(!pick_memtable_called);
172 pick_memtable_called = true;
173 // Save the contents of the earliest memtable as a new Table
174 cfd_->imm()->PickMemtablesToFlush(max_memtable_id_, &mems_);
175 if (mems_.empty()) {
176 return;
177 }
178
179 ReportFlushInputSize(mems_);
180
181 // entries mems are (implicitly) sorted in ascending order by their created
182 // time. We will use the first memtable's `edit` to keep the meta info for
183 // this flush.
184 MemTable* m = mems_[0];
185 edit_ = m->GetEdits();
186 edit_->SetPrevLogNumber(0);
187 // SetLogNumber(log_num) indicates logs with number smaller than log_num
188 // will no longer be picked up for recovery.
189 edit_->SetLogNumber(mems_.back()->GetNextLogNumber());
190 edit_->SetColumnFamily(cfd_->GetID());
191
192 // path 0 for level 0 file.
193 meta_.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
194
195 base_ = cfd_->current();
196 base_->Ref(); // it is likely that we do not need this reference
197 }
198
Run(LogsWithPrepTracker * prep_tracker,FileMetaData * file_meta)199 Status FlushJob::Run(LogsWithPrepTracker* prep_tracker,
200 FileMetaData* file_meta) {
201 TEST_SYNC_POINT("FlushJob::Start");
202 db_mutex_->AssertHeld();
203 assert(pick_memtable_called);
204 AutoThreadOperationStageUpdater stage_run(
205 ThreadStatus::STAGE_FLUSH_RUN);
206 if (mems_.empty()) {
207 ROCKS_LOG_BUFFER(log_buffer_, "[%s] Nothing in memtable to flush",
208 cfd_->GetName().c_str());
209 return Status::OK();
210 }
211
212 // I/O measurement variables
213 PerfLevel prev_perf_level = PerfLevel::kEnableTime;
214 uint64_t prev_write_nanos = 0;
215 uint64_t prev_fsync_nanos = 0;
216 uint64_t prev_range_sync_nanos = 0;
217 uint64_t prev_prepare_write_nanos = 0;
218 uint64_t prev_cpu_write_nanos = 0;
219 uint64_t prev_cpu_read_nanos = 0;
220 if (measure_io_stats_) {
221 prev_perf_level = GetPerfLevel();
222 SetPerfLevel(PerfLevel::kEnableTime);
223 prev_write_nanos = IOSTATS(write_nanos);
224 prev_fsync_nanos = IOSTATS(fsync_nanos);
225 prev_range_sync_nanos = IOSTATS(range_sync_nanos);
226 prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
227 prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
228 prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
229 }
230
231 // This will release and re-acquire the mutex.
232 Status s = WriteLevel0Table();
233
234 if (s.ok() && cfd_->IsDropped()) {
235 s = Status::ColumnFamilyDropped("Column family dropped during compaction");
236 }
237 if ((s.ok() || s.IsColumnFamilyDropped()) &&
238 shutting_down_->load(std::memory_order_acquire)) {
239 s = Status::ShutdownInProgress("Database shutdown");
240 }
241
242 if (!s.ok()) {
243 cfd_->imm()->RollbackMemtableFlush(mems_, meta_.fd.GetNumber());
244 } else if (write_manifest_) {
245 TEST_SYNC_POINT("FlushJob::InstallResults");
246 // Replace immutable memtable with the generated Table
247 IOStatus tmp_io_s;
248 s = cfd_->imm()->TryInstallMemtableFlushResults(
249 cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_,
250 meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_,
251 log_buffer_, &committed_flush_jobs_info_, &tmp_io_s);
252 if (!tmp_io_s.ok()) {
253 io_status_ = tmp_io_s;
254 }
255 }
256
257 if (s.ok() && file_meta != nullptr) {
258 *file_meta = meta_;
259 }
260 RecordFlushIOStats();
261
262 // When measure_io_stats_ is true, the default 512 bytes is not enough.
263 auto stream = event_logger_->LogToBuffer(log_buffer_, 1024);
264 stream << "job" << job_context_->job_id << "event"
265 << "flush_finished";
266 stream << "output_compression"
267 << CompressionTypeToString(output_compression_);
268 stream << "lsm_state";
269 stream.StartArray();
270 auto vstorage = cfd_->current()->storage_info();
271 for (int level = 0; level < vstorage->num_levels(); ++level) {
272 stream << vstorage->NumLevelFiles(level);
273 }
274 stream.EndArray();
275
276 const auto& blob_files = vstorage->GetBlobFiles();
277 if (!blob_files.empty()) {
278 stream << "blob_file_head" << blob_files.begin()->first;
279 stream << "blob_file_tail" << blob_files.rbegin()->first;
280 }
281
282 stream << "immutable_memtables" << cfd_->imm()->NumNotFlushed();
283
284 if (measure_io_stats_) {
285 if (prev_perf_level != PerfLevel::kEnableTime) {
286 SetPerfLevel(prev_perf_level);
287 }
288 stream << "file_write_nanos" << (IOSTATS(write_nanos) - prev_write_nanos);
289 stream << "file_range_sync_nanos"
290 << (IOSTATS(range_sync_nanos) - prev_range_sync_nanos);
291 stream << "file_fsync_nanos" << (IOSTATS(fsync_nanos) - prev_fsync_nanos);
292 stream << "file_prepare_write_nanos"
293 << (IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos);
294 stream << "file_cpu_write_nanos"
295 << (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos);
296 stream << "file_cpu_read_nanos"
297 << (IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos);
298 }
299
300 return s;
301 }
302
Cancel()303 void FlushJob::Cancel() {
304 db_mutex_->AssertHeld();
305 assert(base_ != nullptr);
306 base_->Unref();
307 }
308
WriteLevel0Table()309 Status FlushJob::WriteLevel0Table() {
310 AutoThreadOperationStageUpdater stage_updater(
311 ThreadStatus::STAGE_FLUSH_WRITE_L0);
312 db_mutex_->AssertHeld();
313 const uint64_t start_micros = clock_->NowMicros();
314 const uint64_t start_cpu_micros = clock_->CPUNanos() / 1000;
315 Status s;
316
317 std::vector<BlobFileAddition> blob_file_additions;
318
319 {
320 auto write_hint = cfd_->CalculateSSTWriteHint(0);
321 db_mutex_->Unlock();
322 if (log_buffer_) {
323 log_buffer_->FlushBufferToLog();
324 }
325 // memtables and range_del_iters store internal iterators over each data
326 // memtable and its associated range deletion memtable, respectively, at
327 // corresponding indexes.
328 std::vector<InternalIterator*> memtables;
329 std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
330 range_del_iters;
331 ReadOptions ro;
332 ro.total_order_seek = true;
333 Arena arena;
334 uint64_t total_num_entries = 0, total_num_deletes = 0;
335 uint64_t total_data_size = 0;
336 size_t total_memory_usage = 0;
337 for (MemTable* m : mems_) {
338 ROCKS_LOG_INFO(
339 db_options_.info_log,
340 "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
341 cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
342 memtables.push_back(m->NewIterator(ro, &arena));
343 auto* range_del_iter =
344 m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
345 if (range_del_iter != nullptr) {
346 range_del_iters.emplace_back(range_del_iter);
347 }
348 total_num_entries += m->num_entries();
349 total_num_deletes += m->num_deletes();
350 total_data_size += m->get_data_size();
351 total_memory_usage += m->ApproximateMemoryUsage();
352 }
353
354 event_logger_->Log() << "job" << job_context_->job_id << "event"
355 << "flush_started"
356 << "num_memtables" << mems_.size() << "num_entries"
357 << total_num_entries << "num_deletes"
358 << total_num_deletes << "total_data_size"
359 << total_data_size << "memory_usage"
360 << total_memory_usage << "flush_reason"
361 << GetFlushReasonString(cfd_->GetFlushReason());
362
363 {
364 ScopedArenaIterator iter(
365 NewMergingIterator(&cfd_->internal_comparator(), &memtables[0],
366 static_cast<int>(memtables.size()), &arena));
367 ROCKS_LOG_INFO(db_options_.info_log,
368 "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
369 cfd_->GetName().c_str(), job_context_->job_id,
370 meta_.fd.GetNumber());
371
372 TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression",
373 &output_compression_);
374 int64_t _current_time = 0;
375 auto status = clock_->GetCurrentTime(&_current_time);
376 // Safe to proceed even if GetCurrentTime fails. So, log and proceed.
377 if (!status.ok()) {
378 ROCKS_LOG_WARN(
379 db_options_.info_log,
380 "Failed to get current time to populate creation_time property. "
381 "Status: %s",
382 status.ToString().c_str());
383 }
384 const uint64_t current_time = static_cast<uint64_t>(_current_time);
385
386 uint64_t oldest_key_time =
387 mems_.front()->ApproximateOldestKeyTime();
388
389 // It's not clear whether oldest_key_time is always available. In case
390 // it is not available, use current_time.
391 uint64_t oldest_ancester_time = std::min(current_time, oldest_key_time);
392
393 TEST_SYNC_POINT_CALLBACK(
394 "FlushJob::WriteLevel0Table:oldest_ancester_time",
395 &oldest_ancester_time);
396 meta_.oldest_ancester_time = oldest_ancester_time;
397
398 meta_.file_creation_time = current_time;
399
400 uint64_t creation_time = (cfd_->ioptions()->compaction_style ==
401 CompactionStyle::kCompactionStyleFIFO)
402 ? current_time
403 : meta_.oldest_ancester_time;
404
405 uint64_t num_input_entries = 0;
406 IOStatus io_s;
407 const std::string* const full_history_ts_low =
408 (full_history_ts_low_.empty()) ? nullptr : &full_history_ts_low_;
409 TableBuilderOptions tboptions(
410 *cfd_->ioptions(), mutable_cf_options_, cfd_->internal_comparator(),
411 cfd_->int_tbl_prop_collector_factories(), output_compression_,
412 mutable_cf_options_.compression_opts, cfd_->GetID(), cfd_->GetName(),
413 0 /* level */, false /* is_bottommost */,
414 TableFileCreationReason::kFlush, creation_time, oldest_key_time,
415 current_time, db_id_, db_session_id_, 0 /* target_file_size */);
416 s = BuildTable(
417 dbname_, versions_, db_options_, tboptions, file_options_,
418 cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_,
419 &blob_file_additions, existing_snapshots_,
420 earliest_write_conflict_snapshot_, snapshot_checker_,
421 mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
422 &io_s, io_tracer_, event_logger_, job_context_->job_id, Env::IO_HIGH,
423 &table_properties_, write_hint, full_history_ts_low, blob_callback_,
424 &num_input_entries);
425 if (!io_s.ok()) {
426 io_status_ = io_s;
427 }
428 if (num_input_entries != total_num_entries && s.ok()) {
429 std::string msg = "Expected " + ToString(total_num_entries) +
430 " entries in memtables, but read " +
431 ToString(num_input_entries);
432 ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s",
433 cfd_->GetName().c_str(), job_context_->job_id,
434 msg.c_str());
435 if (db_options_.flush_verify_memtable_count) {
436 s = Status::Corruption(msg);
437 }
438 }
439 LogFlush(db_options_.info_log);
440 }
441 ROCKS_LOG_INFO(db_options_.info_log,
442 "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
443 " bytes %s"
444 "%s",
445 cfd_->GetName().c_str(), job_context_->job_id,
446 meta_.fd.GetNumber(), meta_.fd.GetFileSize(),
447 s.ToString().c_str(),
448 meta_.marked_for_compaction ? " (needs compaction)" : "");
449
450 if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) {
451 s = output_file_directory_->Fsync(IOOptions(), nullptr);
452 }
453 TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_);
454 db_mutex_->Lock();
455 }
456 base_->Unref();
457
458 // Note that if file_size is zero, the file has been deleted and
459 // should not be added to the manifest.
460 const bool has_output = meta_.fd.GetFileSize() > 0;
461
462 if (s.ok() && has_output) {
463 // if we have more than 1 background thread, then we cannot
464 // insert files directly into higher levels because some other
465 // threads could be concurrently producing compacted files for
466 // that key range.
467 // Add file to L0
468 edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(),
469 meta_.fd.GetFileSize(), meta_.smallest, meta_.largest,
470 meta_.fd.smallest_seqno, meta_.fd.largest_seqno,
471 meta_.marked_for_compaction, meta_.oldest_blob_file_number,
472 meta_.oldest_ancester_time, meta_.file_creation_time,
473 meta_.file_checksum, meta_.file_checksum_func_name);
474
475 edit_->SetBlobFileAdditions(std::move(blob_file_additions));
476 }
477 #ifndef ROCKSDB_LITE
478 // Piggyback FlushJobInfo on the first first flushed memtable.
479 mems_[0]->SetFlushJobInfo(GetFlushJobInfo());
480 #endif // !ROCKSDB_LITE
481
482 // Note that here we treat flush as level 0 compaction in internal stats
483 InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
484 stats.micros = clock_->NowMicros() - start_micros;
485 stats.cpu_micros = clock_->CPUNanos() / 1000 - start_cpu_micros;
486
487 if (has_output) {
488 stats.bytes_written = meta_.fd.GetFileSize();
489 stats.num_output_files = 1;
490 }
491
492 const auto& blobs = edit_->GetBlobFileAdditions();
493 for (const auto& blob : blobs) {
494 stats.bytes_written_blob += blob.GetTotalBlobBytes();
495 }
496
497 stats.num_output_files_blob = static_cast<int>(blobs.size());
498
499 RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros);
500 cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats);
501 cfd_->internal_stats()->AddCFStats(
502 InternalStats::BYTES_FLUSHED,
503 stats.bytes_written + stats.bytes_written_blob);
504 RecordFlushIOStats();
505 return s;
506 }
507
508 #ifndef ROCKSDB_LITE
GetFlushJobInfo() const509 std::unique_ptr<FlushJobInfo> FlushJob::GetFlushJobInfo() const {
510 db_mutex_->AssertHeld();
511 std::unique_ptr<FlushJobInfo> info(new FlushJobInfo{});
512 info->cf_id = cfd_->GetID();
513 info->cf_name = cfd_->GetName();
514
515 const uint64_t file_number = meta_.fd.GetNumber();
516 info->file_path =
517 MakeTableFileName(cfd_->ioptions()->cf_paths[0].path, file_number);
518 info->file_number = file_number;
519 info->oldest_blob_file_number = meta_.oldest_blob_file_number;
520 info->thread_id = db_options_.env->GetThreadID();
521 info->job_id = job_context_->job_id;
522 info->smallest_seqno = meta_.fd.smallest_seqno;
523 info->largest_seqno = meta_.fd.largest_seqno;
524 info->table_properties = table_properties_;
525 info->flush_reason = cfd_->GetFlushReason();
526 return info;
527 }
528 #endif // !ROCKSDB_LITE
529
530 } // namespace ROCKSDB_NAMESPACE
531