//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9 
10 #include "db/builder.h"
11 
12 #include <algorithm>
13 #include <deque>
14 #include <vector>
15 
16 #include "db/blob/blob_file_builder.h"
17 #include "db/compaction/compaction_iterator.h"
18 #include "db/dbformat.h"
19 #include "db/event_helpers.h"
20 #include "db/internal_stats.h"
21 #include "db/merge_helper.h"
22 #include "db/output_validator.h"
23 #include "db/range_del_aggregator.h"
24 #include "db/table_cache.h"
25 #include "db/version_edit.h"
26 #include "file/file_util.h"
27 #include "file/filename.h"
28 #include "file/read_write_util.h"
29 #include "file/writable_file_writer.h"
30 #include "monitoring/iostats_context_imp.h"
31 #include "monitoring/thread_status_util.h"
32 #include "options/options_helper.h"
33 #include "rocksdb/db.h"
34 #include "rocksdb/env.h"
35 #include "rocksdb/iterator.h"
36 #include "rocksdb/options.h"
37 #include "rocksdb/table.h"
38 #include "table/block_based/block_based_table_builder.h"
39 #include "table/format.h"
40 #include "table/internal_iterator.h"
41 #include "test_util/sync_point.h"
42 #include "util/stop_watch.h"
43 
44 namespace ROCKSDB_NAMESPACE {
45 
46 class TableFactory;
47 
NewTableBuilder(const TableBuilderOptions & tboptions,WritableFileWriter * file)48 TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
49                               WritableFileWriter* file) {
50   assert((tboptions.column_family_id ==
51           TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
52          tboptions.column_family_name.empty());
53   return tboptions.ioptions.table_factory->NewTableBuilder(tboptions, file);
54 }
55 
BuildTable(const std::string & dbname,VersionSet * versions,const ImmutableDBOptions & db_options,const TableBuilderOptions & tboptions,const FileOptions & file_options,TableCache * table_cache,InternalIterator * iter,std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>> range_del_iters,FileMetaData * meta,std::vector<BlobFileAddition> * blob_file_additions,std::vector<SequenceNumber> snapshots,SequenceNumber earliest_write_conflict_snapshot,SnapshotChecker * snapshot_checker,bool paranoid_file_checks,InternalStats * internal_stats,IOStatus * io_status,const std::shared_ptr<IOTracer> & io_tracer,EventLogger * event_logger,int job_id,const Env::IOPriority io_priority,TableProperties * table_properties,Env::WriteLifeTimeHint write_hint,const std::string * full_history_ts_low,BlobFileCompletionCallback * blob_callback,uint64_t * num_input_entries)56 Status BuildTable(
57     const std::string& dbname, VersionSet* versions,
58     const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
59     const FileOptions& file_options, TableCache* table_cache,
60     InternalIterator* iter,
61     std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
62         range_del_iters,
63     FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,
64     std::vector<SequenceNumber> snapshots,
65     SequenceNumber earliest_write_conflict_snapshot,
66     SnapshotChecker* snapshot_checker, bool paranoid_file_checks,
67     InternalStats* internal_stats, IOStatus* io_status,
68     const std::shared_ptr<IOTracer>& io_tracer, EventLogger* event_logger,
69     int job_id, const Env::IOPriority io_priority,
70     TableProperties* table_properties, Env::WriteLifeTimeHint write_hint,
71     const std::string* full_history_ts_low,
72     BlobFileCompletionCallback* blob_callback, uint64_t* num_input_entries) {
73   assert((tboptions.column_family_id ==
74           TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
75          tboptions.column_family_name.empty());
76   auto& mutable_cf_options = tboptions.moptions;
77   auto& ioptions = tboptions.ioptions;
78   // Reports the IOStats for flush for every following bytes.
79   const size_t kReportFlushIOStatsEvery = 1048576;
80   OutputValidator output_validator(
81       tboptions.internal_comparator,
82       /*enable_order_check=*/
83       mutable_cf_options.check_flush_compaction_key_order,
84       /*enable_hash=*/paranoid_file_checks);
85   Status s;
86   meta->fd.file_size = 0;
87   iter->SeekToFirst();
88   std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
89       new CompactionRangeDelAggregator(&tboptions.internal_comparator,
90                                        snapshots));
91   uint64_t num_unfragmented_tombstones = 0;
92   for (auto& range_del_iter : range_del_iters) {
93     num_unfragmented_tombstones +=
94         range_del_iter->num_unfragmented_tombstones();
95     range_del_agg->AddTombstones(std::move(range_del_iter));
96   }
97 
98   std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(),
99                                     meta->fd.GetPathId());
100   std::vector<std::string> blob_file_paths;
101   std::string file_checksum = kUnknownFileChecksum;
102   std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
103 #ifndef ROCKSDB_LITE
104   EventHelpers::NotifyTableFileCreationStarted(ioptions.listeners, dbname,
105                                                tboptions.column_family_name,
106                                                fname, job_id, tboptions.reason);
107 #endif  // !ROCKSDB_LITE
108   Env* env = db_options.env;
109   assert(env);
110   FileSystem* fs = db_options.fs.get();
111   assert(fs);
112 
113   TableProperties tp;
114   if (iter->Valid() || !range_del_agg->IsEmpty()) {
115     std::unique_ptr<CompactionFilter> compaction_filter;
116     if (ioptions.compaction_filter_factory != nullptr &&
117         ioptions.compaction_filter_factory->ShouldFilterTableFileCreation(
118             tboptions.reason)) {
119       CompactionFilter::Context context;
120       context.is_full_compaction = false;
121       context.is_manual_compaction = false;
122       context.column_family_id = tboptions.column_family_id;
123       context.reason = tboptions.reason;
124       compaction_filter =
125           ioptions.compaction_filter_factory->CreateCompactionFilter(context);
126       if (compaction_filter != nullptr &&
127           !compaction_filter->IgnoreSnapshots()) {
128         s.PermitUncheckedError();
129         return Status::NotSupported(
130             "CompactionFilter::IgnoreSnapshots() = false is not supported "
131             "anymore.");
132       }
133     }
134 
135     TableBuilder* builder;
136     std::unique_ptr<WritableFileWriter> file_writer;
137     {
138       std::unique_ptr<FSWritableFile> file;
139 #ifndef NDEBUG
140       bool use_direct_writes = file_options.use_direct_writes;
141       TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes);
142 #endif  // !NDEBUG
143       IOStatus io_s = NewWritableFile(fs, fname, &file, file_options);
144       assert(s.ok());
145       s = io_s;
146       if (io_status->ok()) {
147         *io_status = io_s;
148       }
149       if (!s.ok()) {
150         EventHelpers::LogAndNotifyTableFileCreationFinished(
151             event_logger, ioptions.listeners, dbname,
152             tboptions.column_family_name, fname, job_id, meta->fd,
153             kInvalidBlobFileNumber, tp, tboptions.reason, s, file_checksum,
154             file_checksum_func_name);
155         return s;
156       }
157       FileTypeSet tmp_set = ioptions.checksum_handoff_file_types;
158       file->SetIOPriority(io_priority);
159       file->SetWriteLifeTimeHint(write_hint);
160       file_writer.reset(new WritableFileWriter(
161           std::move(file), fname, file_options, ioptions.clock, io_tracer,
162           ioptions.stats, ioptions.listeners,
163           ioptions.file_checksum_gen_factory.get(),
164           tmp_set.Contains(FileType::kTableFile)));
165 
166       builder = NewTableBuilder(tboptions, file_writer.get());
167     }
168 
169     MergeHelper merge(
170         env, tboptions.internal_comparator.user_comparator(),
171         ioptions.merge_operator.get(), compaction_filter.get(), ioptions.logger,
172         true /* internal key corruption is not ok */,
173         snapshots.empty() ? 0 : snapshots.back(), snapshot_checker);
174 
175     std::unique_ptr<BlobFileBuilder> blob_file_builder(
176         (mutable_cf_options.enable_blob_files && blob_file_additions)
177             ? new BlobFileBuilder(versions, fs, &ioptions, &mutable_cf_options,
178                                   &file_options, job_id,
179                                   tboptions.column_family_id,
180                                   tboptions.column_family_name, io_priority,
181                                   write_hint, io_tracer, blob_callback,
182                                   &blob_file_paths, blob_file_additions)
183             : nullptr);
184 
185     CompactionIterator c_iter(
186         iter, tboptions.internal_comparator.user_comparator(), &merge,
187         kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot,
188         snapshot_checker, env, ShouldReportDetailedTime(env, ioptions.stats),
189         true /* internal key corruption is not ok */, range_del_agg.get(),
190         blob_file_builder.get(), ioptions.allow_data_in_errors,
191         /*compaction=*/nullptr, compaction_filter.get(),
192         /*shutting_down=*/nullptr,
193         /*preserve_deletes_seqnum=*/0, /*manual_compaction_paused=*/nullptr,
194         db_options.info_log, full_history_ts_low);
195 
196     c_iter.SeekToFirst();
197     for (; c_iter.Valid(); c_iter.Next()) {
198       const Slice& key = c_iter.key();
199       const Slice& value = c_iter.value();
200       const ParsedInternalKey& ikey = c_iter.ikey();
201       // Generate a rolling 64-bit hash of the key and values
202       s = output_validator.Add(key, value);
203       if (!s.ok()) {
204         break;
205       }
206       builder->Add(key, value);
207       meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type);
208 
209       // TODO(noetzli): Update stats after flush, too.
210       if (io_priority == Env::IO_HIGH &&
211           IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
212         ThreadStatusUtil::SetThreadOperationProperty(
213             ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
214       }
215     }
216     if (!s.ok()) {
217       c_iter.status().PermitUncheckedError();
218     } else if (!c_iter.status().ok()) {
219       s = c_iter.status();
220     }
221 
222     if (s.ok()) {
223       auto range_del_it = range_del_agg->NewIterator();
224       for (range_del_it->SeekToFirst(); range_del_it->Valid();
225            range_del_it->Next()) {
226         auto tombstone = range_del_it->Tombstone();
227         auto kv = tombstone.Serialize();
228         builder->Add(kv.first.Encode(), kv.second);
229         meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(),
230                                        tombstone.seq_,
231                                        tboptions.internal_comparator);
232       }
233     }
234 
235     TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable");
236     const bool empty = builder->IsEmpty();
237     if (num_input_entries != nullptr) {
238       *num_input_entries =
239           c_iter.num_input_entry_scanned() + num_unfragmented_tombstones;
240     }
241     if (!s.ok() || empty) {
242       builder->Abandon();
243     } else {
244       s = builder->Finish();
245     }
246     if (io_status->ok()) {
247       *io_status = builder->io_status();
248     }
249 
250     if (s.ok() && !empty) {
251       uint64_t file_size = builder->FileSize();
252       meta->fd.file_size = file_size;
253       meta->marked_for_compaction = builder->NeedCompact();
254       assert(meta->fd.GetFileSize() > 0);
255       tp = builder->GetTableProperties(); // refresh now that builder is finished
256       if (table_properties) {
257         *table_properties = tp;
258       }
259     }
260     delete builder;
261 
262     // Finish and check for file errors
263     TEST_SYNC_POINT("BuildTable:BeforeSyncTable");
264     if (s.ok() && !empty) {
265       StopWatch sw(ioptions.clock, ioptions.stats, TABLE_SYNC_MICROS);
266       *io_status = file_writer->Sync(ioptions.use_fsync);
267     }
268     TEST_SYNC_POINT("BuildTable:BeforeCloseTableFile");
269     if (s.ok() && io_status->ok() && !empty) {
270       *io_status = file_writer->Close();
271     }
272     if (s.ok() && io_status->ok() && !empty) {
273       // Add the checksum information to file metadata.
274       meta->file_checksum = file_writer->GetFileChecksum();
275       meta->file_checksum_func_name = file_writer->GetFileChecksumFuncName();
276       file_checksum = meta->file_checksum;
277       file_checksum_func_name = meta->file_checksum_func_name;
278     }
279 
280     if (s.ok()) {
281       s = *io_status;
282     }
283 
284     if (blob_file_builder) {
285       if (s.ok()) {
286         s = blob_file_builder->Finish();
287       } else {
288         blob_file_builder->Abandon();
289       }
290       blob_file_builder.reset();
291     }
292 
293     // TODO Also check the IO status when create the Iterator.
294 
295     if (s.ok() && !empty) {
296       // Verify that the table is usable
297       // We set for_compaction to false and don't OptimizeForCompactionTableRead
298       // here because this is a special case after we finish the table building
299       // No matter whether use_direct_io_for_flush_and_compaction is true,
300       // we will regrad this verification as user reads since the goal is
301       // to cache it here for further user reads
302       ReadOptions read_options;
303       std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
304           read_options, file_options, tboptions.internal_comparator, *meta,
305           nullptr /* range_del_agg */,
306           mutable_cf_options.prefix_extractor.get(), nullptr,
307           (internal_stats == nullptr) ? nullptr
308                                       : internal_stats->GetFileReadHist(0),
309           TableReaderCaller::kFlush, /*arena=*/nullptr,
310           /*skip_filter=*/false, tboptions.level_at_creation,
311           MaxFileSizeForL0MetaPin(mutable_cf_options),
312           /*smallest_compaction_key=*/nullptr,
313           /*largest_compaction_key*/ nullptr,
314           /*allow_unprepared_value*/ false));
315       s = it->status();
316       if (s.ok() && paranoid_file_checks) {
317         OutputValidator file_validator(tboptions.internal_comparator,
318                                        /*enable_order_check=*/true,
319                                        /*enable_hash=*/true);
320         for (it->SeekToFirst(); it->Valid(); it->Next()) {
321           // Generate a rolling 64-bit hash of the key and values
322           file_validator.Add(it->key(), it->value()).PermitUncheckedError();
323         }
324         s = it->status();
325         if (s.ok() && !output_validator.CompareValidator(file_validator)) {
326           s = Status::Corruption("Paranoid checksums do not match");
327         }
328       }
329     }
330   }
331 
332   // Check for input iterator errors
333   if (!iter->status().ok()) {
334     s = iter->status();
335   }
336 
337   if (!s.ok() || meta->fd.GetFileSize() == 0) {
338     TEST_SYNC_POINT("BuildTable:BeforeDeleteFile");
339 
340     constexpr IODebugContext* dbg = nullptr;
341 
342     Status ignored = fs->DeleteFile(fname, IOOptions(), dbg);
343     ignored.PermitUncheckedError();
344 
345     assert(blob_file_additions || blob_file_paths.empty());
346 
347     if (blob_file_additions) {
348       for (const std::string& blob_file_path : blob_file_paths) {
349         ignored = DeleteDBFile(&db_options, blob_file_path, dbname,
350                                /*force_bg=*/false, /*force_fg=*/false);
351         ignored.PermitUncheckedError();
352         TEST_SYNC_POINT("BuildTable::AfterDeleteFile");
353       }
354     }
355   }
356 
357   if (meta->fd.GetFileSize() == 0) {
358     fname = "(nil)";
359   }
360   // Output to event logger and fire events.
361   EventHelpers::LogAndNotifyTableFileCreationFinished(
362       event_logger, ioptions.listeners, dbname, tboptions.column_family_name,
363       fname, job_id, meta->fd, meta->oldest_blob_file_number, tp,
364       tboptions.reason, s, file_checksum, file_checksum_func_name);
365 
366   return s;
367 }
368 
369 }  // namespace ROCKSDB_NAMESPACE
370