//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include <atomic>
#include <cstdlib>
#include <functional>
#include <memory>

#include "db/db_test_util.h"
#include "db/read_callback.h"
#include "options/options_helper.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/iostats_context.h"
#include "rocksdb/persistent_cache.h"
#include "rocksdb/trace_record.h"
#include "rocksdb/trace_record_result.h"
#include "rocksdb/utilities/replayer.h"
#include "rocksdb/wal_filter.h"
#include "test_util/testutil.h"
#include "util/random.h"
#include "utilities/fault_injection_env.h"

namespace ROCKSDB_NAMESPACE {

class DBTest2 : public DBTestBase {
 public:
  DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {}

 protected:
#ifndef ROCKSDB_LITE
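  // Reads the DB property for total live SST file size at the given
  // temperature and parses it as an integer (0 when no live SST files are
  // at that temperature).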
  uint64_t GetSstSizeHelper(Temperature temperature) {
    std::string prop;
    EXPECT_TRUE(
        dbfull()->GetProperty(DB::Properties::kLiveSstFilesSizeAtTemperature +
                                  ToString(static_cast<uint8_t>(temperature)),
                              &prop));
    // Parse with strtoull: SST sizes can exceed the range of int.
    return static_cast<uint64_t>(std::strtoull(prop.c_str(), nullptr, 10));
  }
#endif  // ROCKSDB_LITE
};

#ifndef ROCKSDB_LITE
TEST_F(DBTest2, OpenForReadOnly) {
  DB* db_ptr = nullptr;
  std::string dbname = test::PerThreadDBPath("db_readonly");
  Options options = CurrentOptions();
  options.create_if_missing = true;
  // OpenForReadOnly should fail but will create <dbname> in the file system
  ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr));
  // Since <dbname> is created, we should be able to delete the dir
  // We first get the list of files under <dbname>
  // There should not be any subdirectories -- this is not checked here
  std::vector<std::string> files;
  ASSERT_OK(env_->GetChildren(dbname, &files));
  for (auto& f : files) {
    ASSERT_OK(env_->DeleteFile(dbname + "/" + f));
  }
  // <dbname> should be empty now and we should be able to delete it
  ASSERT_OK(env_->DeleteDir(dbname));
  options.create_if_missing = false;
  // OpenForReadOnly should fail since <dbname> was successfully deleted
  ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr));
  // With create_if_missing false, there should not be a dir in the file system
  ASSERT_NOK(env_->FileExists(dbname));
}

TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) {
  DB* db_ptr = nullptr;
  std::string dbname = test::PerThreadDBPath("db_readonly");
  Options options = CurrentOptions();
  options.create_if_missing = true;

  ColumnFamilyOptions cf_options(options);
  std::vector<ColumnFamilyDescriptor> column_families;
  column_families.push_back(
      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
  column_families.push_back(ColumnFamilyDescriptor("goku", cf_options));
  std::vector<ColumnFamilyHandle*> handles;
  // OpenForReadOnly should fail but will create <dbname> in the file system
  ASSERT_NOK(
      DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr));
  // Since <dbname> is created, we should be able to delete the dir
  // We first get the list of files under <dbname>
  // There should not be any subdirectories -- this is not checked here
  std::vector<std::string> files;
  ASSERT_OK(env_->GetChildren(dbname, &files));
  for (auto& f : files) {
    ASSERT_OK(env_->DeleteFile(dbname + "/" + f));
  }
  // <dbname> should be empty now and we should be able to delete it
  ASSERT_OK(env_->DeleteDir(dbname));
  options.create_if_missing = false;
  // OpenForReadOnly should fail since <dbname> was successfully deleted
  ASSERT_NOK(
      DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr));
  // With create_if_missing false, there should not be a dir in the file system
  ASSERT_NOK(env_->FileExists(dbname));
}

class TestReadOnlyWithCompressedCache
    : public DBTestBase,
      public testing::WithParamInterface<std::tuple<int, bool>> {
 public:
  TestReadOnlyWithCompressedCache()
      : DBTestBase("test_readonly_with_compressed_cache",
                   /*env_do_fsync=*/true) {
    max_open_files_ = std::get<0>(GetParam());
    use_mmap_ = std::get<1>(GetParam());
  }
  int max_open_files_;
  bool use_mmap_;
};

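// A read-only DB should serve Gets through the compressed block cache: with
// the uncompressed block cache disabled, the second Get of a key should hit
// the compressed cache, except when mmap reads bypass it.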
TEST_P(TestReadOnlyWithCompressedCache, ReadOnlyWithCompressedCache) {
  if (use_mmap_ && !IsMemoryMappedAccessSupported()) {
    ROCKSDB_GTEST_SKIP("Test requires MMAP support");
    return;
  }
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("foo2", "barbarbarbarbarbarbarbar"));
  ASSERT_OK(Flush());

  DB* db_ptr = nullptr;
  Options options = CurrentOptions();
  options.allow_mmap_reads = use_mmap_;
  options.max_open_files = max_open_files_;
  options.compression = kSnappyCompression;
  BlockBasedTableOptions table_options;
  table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
  table_options.no_block_cache = true;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  options.statistics = CreateDBStatistics();

  ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db_ptr));

  std::string v;
  ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v));
  ASSERT_EQ("bar", v);
  ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
  ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v));
  ASSERT_EQ("bar", v);
  if (Snappy_Supported()) {
    if (use_mmap_) {
      ASSERT_EQ(0,
                options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
    } else {
      ASSERT_EQ(1,
                options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
    }
  }

  delete db_ptr;
}

INSTANTIATE_TEST_CASE_P(TestReadOnlyWithCompressedCache,
                        TestReadOnlyWithCompressedCache,
                        ::testing::Combine(::testing::Values(-1, 100),
                                           ::testing::Bool()));

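// Asserts on every flush that the new SST has a partitioned index with more
// than one partition and that its index keys are internal keys, not user
// keys.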
class PartitionedIndexTestListener : public EventListener {
 public:
  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
    ASSERT_GT(info.table_properties.index_partitions, 1);
    ASSERT_EQ(info.table_properties.index_key_is_user_key, 0);
  }
};

TEST_F(DBTest2, PartitionedIndexUserToInternalKey) {
  const int kValueSize = 10500;
  const int kNumEntriesPerFile = 1000;
  const int kNumFiles = 3;
  const int kNumDistinctKeys = 30;

  BlockBasedTableOptions table_options;
  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
  table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
  PartitionedIndexTestListener* listener = new PartitionedIndexTestListener();
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  options.listeners.emplace_back(listener);
  std::vector<const Snapshot*> snapshots;
  Reopen(options);
  Random rnd(301);

  for (int i = 0; i < kNumFiles; i++) {
    for (int j = 0; j < kNumEntriesPerFile; j++) {
      int key_id = (i * kNumEntriesPerFile + j) % kNumDistinctKeys;
      std::string value = rnd.RandomString(kValueSize);
      ASSERT_OK(Put("keykey_" + std::to_string(key_id), value));
      snapshots.push_back(db_->GetSnapshot());
    }
    ASSERT_OK(Flush());
  }

  for (auto s : snapshots) {
    db_->ReleaseSnapshot(s);
  }
}

#endif  // ROCKSDB_LITE

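// Exercises prefix Bloom filters (capped prefix extractor, whole-key
// filtering off) under ReverseBytewiseComparator, with and without the
// filter block being held in a tiny block cache (the test parameter).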
class PrefixFullBloomWithReverseComparator
    : public DBTestBase,
      public ::testing::WithParamInterface<bool> {
 public:
  PrefixFullBloomWithReverseComparator()
      : DBTestBase("prefix_bloom_reverse", /*env_do_fsync=*/true) {}
  void SetUp() override { if_cache_filter_ = GetParam(); }
  bool if_cache_filter_;
};

TEST_P(PrefixFullBloomWithReverseComparator,
       PrefixFullBloomWithReverseComparator) {
  Options options = last_options_;
  options.comparator = ReverseBytewiseComparator();
  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  BlockBasedTableOptions bbto;
  if (if_cache_filter_) {
    bbto.no_block_cache = false;
    bbto.cache_index_and_filter_blocks = true;
    bbto.block_cache = NewLRUCache(1);
  }
  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
  bbto.whole_key_filtering = false;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  DestroyAndReopen(options);

  ASSERT_OK(dbfull()->Put(WriteOptions(), "bar123", "foo"));
  ASSERT_OK(dbfull()->Put(WriteOptions(), "bar234", "foo2"));
  ASSERT_OK(dbfull()->Put(WriteOptions(), "foo123", "foo3"));

  ASSERT_OK(dbfull()->Flush(FlushOptions()));

  if (bbto.block_cache) {
    bbto.block_cache->EraseUnRefEntries();
  }

  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
  iter->Seek("bar345");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("bar234", iter->key().ToString());
  ASSERT_EQ("foo2", iter->value().ToString());
  iter->Next();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("bar123", iter->key().ToString());
  ASSERT_EQ("foo", iter->value().ToString());

  iter->Seek("foo234");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("foo123", iter->key().ToString());
  ASSERT_EQ("foo3", iter->value().ToString());

  iter->Seek("bar");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(!iter->Valid());
}

INSTANTIATE_TEST_CASE_P(PrefixFullBloomWithReverseComparator,
                        PrefixFullBloomWithReverseComparator, testing::Bool());

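// "rocksdb.iterator.super-version-number" is captured when an iterator is
// created: a flush installs a new super version and increases it, a plain
// Put does not, and an existing iterator keeps its original value.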
TEST_F(DBTest2, IteratorPropertyVersionNumber) {
  ASSERT_OK(Put("", ""));
  Iterator* iter1 = db_->NewIterator(ReadOptions());
  ASSERT_OK(iter1->status());
  std::string prop_value;
  ASSERT_OK(
      iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
  uint64_t version_number1 =
      static_cast<uint64_t>(std::atoi(prop_value.c_str()));

  ASSERT_OK(Put("", ""));
  ASSERT_OK(Flush());

  Iterator* iter2 = db_->NewIterator(ReadOptions());
  ASSERT_OK(iter2->status());
  ASSERT_OK(
      iter2->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
  uint64_t version_number2 =
      static_cast<uint64_t>(std::atoi(prop_value.c_str()));

  ASSERT_GT(version_number2, version_number1);

  ASSERT_OK(Put("", ""));

  Iterator* iter3 = db_->NewIterator(ReadOptions());
  ASSERT_OK(iter3->status());
  ASSERT_OK(
      iter3->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
  uint64_t version_number3 =
      static_cast<uint64_t>(std::atoi(prop_value.c_str()));

  ASSERT_EQ(version_number2, version_number3);

  iter1->SeekToFirst();
  ASSERT_OK(
      iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
  uint64_t version_number1_new =
      static_cast<uint64_t>(std::atoi(prop_value.c_str()));
  ASSERT_EQ(version_number1, version_number1_new);

  delete iter1;
  delete iter2;
  delete iter3;
}

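// Smoke test: reads should still succeed after reopening a DB whose index
// and filter blocks live in the block cache.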
TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  BlockBasedTableOptions table_options;
  table_options.cache_index_and_filter_blocks = true;
  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  CreateAndReopenWithCF({"pikachu"}, options);

  ASSERT_OK(Put(1, "a", "begin"));
  ASSERT_OK(Put(1, "z", "end"));
  ASSERT_OK(Flush(1));
  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));

  std::string value = Get(1, "a");
}

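// max_successive_merges bounds how many consecutive Merge operands a key
// accumulates in the memtable before the write path folds them into a Put
// (which is why a merge operator is required); here the bound is lowered
// across a reopen, which recovery must tolerate.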
TEST_F(DBTest2, MaxSuccessiveMergesChangeWithDBRecovery) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  options.max_successive_merges = 3;
  options.merge_operator = MergeOperators::CreatePutOperator();
  options.disable_auto_compactions = true;
  DestroyAndReopen(options);
  ASSERT_OK(Put("poi", "Finch"));
  ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Reese"));
  ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Shaw"));
  ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Root"));
  options.max_successive_merges = 2;
  Reopen(options);
}

#ifndef ROCKSDB_LITE
class DBTestSharedWriteBufferAcrossCFs
    : public DBTestBase,
      public testing::WithParamInterface<std::tuple<bool, bool>> {
 public:
  DBTestSharedWriteBufferAcrossCFs()
      : DBTestBase("db_test_shared_write_buffer", /*env_do_fsync=*/true) {}
  void SetUp() override {
    use_old_interface_ = std::get<0>(GetParam());
    cost_cache_ = std::get<1>(GetParam());
  }
  bool use_old_interface_;
  bool cost_cache_;
};

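// The WriteBufferManager triggers flushes before the configured size is
// actually reached (internally the soft limit is roughly 7/8 of the size,
// so about 105000 for 120000); each block below checks which CF the manager
// picked by counting SST files per CF.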
TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  auto flush_listener = std::make_shared<FlushCounterListener>();
  options.listeners.push_back(flush_listener);
  // Don't trip the listener at shutdown.
  options.avoid_flush_during_shutdown = true;

  // Avoid nondeterministic values from malloc_usable_size():
  // force the arena block size to 1.
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "Arena::Arena:0", [&](void* arg) {
        size_t* block_size = static_cast<size_t*>(arg);
        *block_size = 1;
      });

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "Arena::AllocateNewBlock:0", [&](void* arg) {
        std::pair<size_t*, size_t*>* pair =
            static_cast<std::pair<size_t*, size_t*>*>(arg);
        *std::get<0>(*pair) = *std::get<1>(*pair);
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  // The total soft write buffer size is about 105000
  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
  ASSERT_LT(cache->GetUsage(), 256 * 1024);

  if (use_old_interface_) {
    options.db_write_buffer_size = 120000;  // this is the real limit
  } else if (!cost_cache_) {
    options.write_buffer_manager.reset(new WriteBufferManager(114285));
  } else {
    options.write_buffer_manager.reset(new WriteBufferManager(114285, cache));
  }
  options.write_buffer_size = 500000;  // this is never hit
  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);

  WriteOptions wo;
  wo.disableWAL = true;

  std::function<void()> wait_flush = [&]() {
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3]));
  };

  // Create some data and flush "default" and "nikitich" so that they
  // become the most recently flushed CFs.
  flush_listener->expected_flush_reason = FlushReason::kManualFlush;
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  ASSERT_OK(Flush(3));
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
  ASSERT_OK(Flush(0));
  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
            static_cast<uint64_t>(1));
  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
            static_cast<uint64_t>(1));

  flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
  ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
  if (cost_cache_) {
    ASSERT_GE(cache->GetUsage(), 256 * 1024);
    ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
  }
  wait_flush();
  ASSERT_OK(Put(0, Key(1), DummyString(60000), wo));
  if (cost_cache_) {
    ASSERT_GE(cache->GetUsage(), 256 * 1024);
    ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
  }
  wait_flush();
  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
  // No flush should trigger
  wait_flush();
  {
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
              static_cast<uint64_t>(1));
  }

  // Trigger a flush. Flushing "nikitich".
  ASSERT_OK(Put(3, Key(2), DummyString(30000), wo));
  wait_flush();
  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
  wait_flush();
  {
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
              static_cast<uint64_t>(2));
  }

  // Without hitting the threshold, no flush should trigger.
  ASSERT_OK(Put(2, Key(1), DummyString(30000), wo));
  wait_flush();
  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
  wait_flush();
  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
  wait_flush();
  {
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
              static_cast<uint64_t>(2));
  }

  // Hit the write buffer limit again. "default"
  // will have been flushed.
  ASSERT_OK(Put(2, Key(2), DummyString(10000), wo));
  wait_flush();
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  wait_flush();
  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
  wait_flush();
  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
  wait_flush();
  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
  wait_flush();
  {
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
              static_cast<uint64_t>(2));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
              static_cast<uint64_t>(2));
  }
  // Trigger another flush. This time "dobrynia". "pikachu" should not
  // be flushed, even though it has never been flushed.
  ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
  wait_flush();
  ASSERT_OK(Put(2, Key(1), DummyString(80000), wo));
  wait_flush();
  ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
  wait_flush();
  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
  wait_flush();

  {
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
              static_cast<uint64_t>(2));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
              static_cast<uint64_t>(2));
  }
  if (cost_cache_) {
    ASSERT_GE(cache->GetUsage(), 256 * 1024);
    Close();
    options.write_buffer_manager.reset();
    last_options_.write_buffer_manager.reset();
    ASSERT_LT(cache->GetUsage(), 256 * 1024);
  }
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

INSTANTIATE_TEST_CASE_P(DBTestSharedWriteBufferAcrossCFs,
                        DBTestSharedWriteBufferAcrossCFs,
                        ::testing::Values(std::make_tuple(true, false),
                                          std::make_tuple(false, false),
                                          std::make_tuple(false, true)));

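// One WriteBufferManager shared by two DB instances: memtable memory from
// both DBs counts toward the same limit, so writes to one DB can trigger
// flushes in the other.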
TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
  std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2");
  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  auto flush_listener = std::make_shared<FlushCounterListener>();
  options.listeners.push_back(flush_listener);
  // Don't trip the listener at shutdown.
  options.avoid_flush_during_shutdown = true;
  // Avoid nondeterministic values from malloc_usable_size():
  // force the arena block size to 1.
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "Arena::Arena:0", [&](void* arg) {
        size_t* block_size = static_cast<size_t*>(arg);
        *block_size = 1;
      });

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "Arena::AllocateNewBlock:0", [&](void* arg) {
        std::pair<size_t*, size_t*>* pair =
            static_cast<std::pair<size_t*, size_t*>*>(arg);
        *std::get<0>(*pair) = *std::get<1>(*pair);
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  options.write_buffer_size = 500000;  // this is never hit
  // Use a write buffer total size so that the soft limit is about
  // 105000.
  options.write_buffer_manager.reset(new WriteBufferManager(120000));
  CreateAndReopenWithCF({"cf1", "cf2"}, options);

  ASSERT_OK(DestroyDB(dbname2, options));
  DB* db2 = nullptr;
  ASSERT_OK(DB::Open(options, dbname2, &db2));

  WriteOptions wo;
  wo.disableWAL = true;

  std::function<void()> wait_flush = [&]() {
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
    ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
  };

  // Trigger a flush on cf2
  flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
  ASSERT_OK(Put(2, Key(1), DummyString(70000), wo));
  wait_flush();
  ASSERT_OK(Put(0, Key(1), DummyString(20000), wo));
  wait_flush();

  // Insert to DB2
  ASSERT_OK(db2->Put(wo, Key(2), DummyString(20000)));
  wait_flush();

  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
  wait_flush();
  ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
  {
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") +
                  GetNumberOfSstFilesForColumnFamily(db_, "cf1") +
                  GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
              static_cast<uint64_t>(0));
  }

  // Trigger a flush of another CF in DB1
  ASSERT_OK(db2->Put(wo, Key(2), DummyString(70000)));
  wait_flush();
  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
  wait_flush();
  {
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
              static_cast<uint64_t>(0));
  }

  // Trigger a flush in DB2.
  ASSERT_OK(db2->Put(wo, Key(3), DummyString(40000)));
  wait_flush();
  ASSERT_OK(db2->Put(wo, Key(1), DummyString(1)));
  wait_flush();
  ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
  {
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
              static_cast<uint64_t>(1));
  }

  delete db2;
  ASSERT_OK(DestroyDB(dbname2, options));

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

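// A WriteBufferManager with buffer_size == 0 enforces no flush limit, but
// with a cache attached it still charges memtable memory to that cache,
// which is what the usage assertion below observes.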
TEST_F(DBTest2, TestWriteBufferNoLimitWithCache) {
  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  std::shared_ptr<Cache> cache =
      NewLRUCache(LRUCacheOptions(10000000, 1, false, 0.0));
  options.write_buffer_size = 50000;  // this is never hit
  // A buffer size of 0 means no write buffer limit; the manager is used
  // only to charge memtable memory to the cache.
  options.write_buffer_manager.reset(new WriteBufferManager(0, cache));
  Reopen(options);

  ASSERT_OK(Put("foo", "bar"));
  // One dummy entry is 256KB.
  ASSERT_GT(cache->GetUsage(), 128000);
}

namespace {
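// Asserts via MultiGet that every key in keys_must_exist is readable from db
// and that every key in keys_must_not_exist returns NotFound.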
  void ValidateKeyExistence(DB* db, const std::vector<Slice>& keys_must_exist,
    const std::vector<Slice>& keys_must_not_exist) {
    // Ensure that expected keys exist
    std::vector<std::string> values;
    if (keys_must_exist.size() > 0) {
      std::vector<Status> status_list =
        db->MultiGet(ReadOptions(), keys_must_exist, &values);
      for (size_t i = 0; i < keys_must_exist.size(); i++) {
        ASSERT_OK(status_list[i]);
      }
    }

    // Ensure that given keys don't exist
    if (keys_must_not_exist.size() > 0) {
      std::vector<Status> status_list =
        db->MultiGet(ReadOptions(), keys_must_not_exist, &values);
      for (size_t i = 0; i < keys_must_not_exist.size(); i++) {
        ASSERT_TRUE(status_list[i].IsNotFound());
      }
    }
  }

}  // namespace

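// Runs recovery once per WalFilter::WalProcessingOption (kContinueProcessing,
// kIgnoreCurrentRecord, kStopReplay, kCorruptedRecord), applying the option
// at a single record, and verifies which keys survive in each case.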
TEST_F(DBTest2, WalFilterTest) {
  class TestWalFilter : public WalFilter {
  private:
    // Processing option that is requested to be applied at the given index
    WalFilter::WalProcessingOption wal_processing_option_;
    // Index at which to apply wal_processing_option_
    // At other indexes the default, WalProcessingOption::kContinueProcessing,
    // is returned.
    size_t apply_option_at_record_index_;
    // Current record index, incremented with each record encountered.
    size_t current_record_index_;

  public:
    TestWalFilter(WalFilter::WalProcessingOption wal_processing_option,
      size_t apply_option_for_record_index)
      : wal_processing_option_(wal_processing_option),
      apply_option_at_record_index_(apply_option_for_record_index),
      current_record_index_(0) {}

    WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
                                  WriteBatch* /*new_batch*/,
                                  bool* /*batch_changed*/) const override {
      WalFilter::WalProcessingOption option_to_return;

      if (current_record_index_ == apply_option_at_record_index_) {
        option_to_return = wal_processing_option_;
      } else {
        option_to_return = WalProcessingOption::kContinueProcessing;
      }

      // The filter is passed as a const object so that RocksDB does not
      // modify it; we modify it for our own purposes here, hence the
      // const_cast.
      (const_cast<TestWalFilter*>(this)->current_record_index_)++;

      return option_to_return;
    }

    const char* Name() const override { return "TestWalFilter"; }
  };

  // Create 3 batches with two keys each
  std::vector<std::vector<std::string>> batch_keys(3);

  batch_keys[0].push_back("key1");
  batch_keys[0].push_back("key2");
  batch_keys[1].push_back("key3");
  batch_keys[1].push_back("key4");
  batch_keys[2].push_back("key5");
  batch_keys[2].push_back("key6");

  // Test with all WAL processing options
  for (int option = 0;
    option < static_cast<int>(
    WalFilter::WalProcessingOption::kWalProcessingOptionMax);
  option++) {
    Options options = OptionsForLogIterTest();
    DestroyAndReopen(options);
    CreateAndReopenWithCF({ "pikachu" }, options);

    // Write given keys in given batches
    for (size_t i = 0; i < batch_keys.size(); i++) {
      WriteBatch batch;
      for (size_t j = 0; j < batch_keys[i].size(); j++) {
        ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
      }
      ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
    }

    WalFilter::WalProcessingOption wal_processing_option =
      static_cast<WalFilter::WalProcessingOption>(option);

    // Create a test filter that applies wal_processing_option at the chosen
    // record index
    size_t apply_option_for_record_index = 1;
    TestWalFilter test_wal_filter(wal_processing_option,
      apply_option_for_record_index);

    // Reopen database with option to use WAL filter
    options = OptionsForLogIterTest();
    options.wal_filter = &test_wal_filter;
    Status status =
      TryReopenWithColumnFamilies({ "default", "pikachu" }, options);
    if (wal_processing_option ==
      WalFilter::WalProcessingOption::kCorruptedRecord) {
      ASSERT_NOK(status);
      // In case of corruption we can turn off paranoid_checks to reopen the
      // database
      options.paranoid_checks = false;
      ReopenWithColumnFamilies({ "default", "pikachu" }, options);
    } else {
      ASSERT_OK(status);
    }

    // Compute which keys we expect to be found
    // and which we expect not to be found after recovery.
    std::vector<Slice> keys_must_exist;
    std::vector<Slice> keys_must_not_exist;
    switch (wal_processing_option) {
    case WalFilter::WalProcessingOption::kCorruptedRecord:
    case WalFilter::WalProcessingOption::kContinueProcessing: {
      fprintf(stderr, "Testing with complete WAL processing\n");
      // we expect all records to be processed
      for (size_t i = 0; i < batch_keys.size(); i++) {
        for (size_t j = 0; j < batch_keys[i].size(); j++) {
          keys_must_exist.push_back(Slice(batch_keys[i][j]));
        }
      }
      break;
    }
    case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: {
      fprintf(stderr,
        "Testing with ignoring record %" ROCKSDB_PRIszt " only\n",
        apply_option_for_record_index);
      // We expect the record at apply_option_for_record_index to be not
      // found.
      for (size_t i = 0; i < batch_keys.size(); i++) {
        for (size_t j = 0; j < batch_keys[i].size(); j++) {
          if (i == apply_option_for_record_index) {
            keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
          } else {
            keys_must_exist.push_back(Slice(batch_keys[i][j]));
          }
        }
      }
      break;
    }
    case WalFilter::WalProcessingOption::kStopReplay: {
      fprintf(stderr,
        "Testing with stopping replay from record %" ROCKSDB_PRIszt
        "\n",
        apply_option_for_record_index);
      // We expect records from apply_option_for_record_index onward to be
      // not found.
      for (size_t i = 0; i < batch_keys.size(); i++) {
        for (size_t j = 0; j < batch_keys[i].size(); j++) {
          if (i >= apply_option_for_record_index) {
            keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
          } else {
            keys_must_exist.push_back(Slice(batch_keys[i][j]));
          }
        }
      }
      break;
    }
    default:
      FAIL();  // unhandled case
    }

    bool checked_after_reopen = false;

    while (true) {
      // Ensure that expected keys exist
      // and unexpected keys don't exist after recovery
      ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);

      if (checked_after_reopen) {
        break;
      }

      // Reopen the database again (without the WAL filter) to make sure
      // the previous log(s) are not used (even if they were skipped).
      options = OptionsForLogIterTest();
      ReopenWithColumnFamilies({ "default", "pikachu" }, options);

      checked_after_reopen = true;
    }
  }
}

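// A WAL filter may rewrite batches during recovery: from a given record
// index on, this one replaces each batch with a copy that keeps only its
// first key(s).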
TEST_F(DBTest2, WalFilterTestWithChangeBatch) {
  class ChangeBatchHandler : public WriteBatch::Handler {
  private:
    // Batch to insert keys in
    WriteBatch* new_write_batch_;
    // Number of keys to add in the new batch
    size_t num_keys_to_add_in_new_batch_;
    // Number of keys added to new batch
    size_t num_keys_added_;

  public:
    ChangeBatchHandler(WriteBatch* new_write_batch,
      size_t num_keys_to_add_in_new_batch)
      : new_write_batch_(new_write_batch),
      num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
      num_keys_added_(0) {}
    void Put(const Slice& key, const Slice& value) override {
      if (num_keys_added_ < num_keys_to_add_in_new_batch_) {
        ASSERT_OK(new_write_batch_->Put(key, value));
        ++num_keys_added_;
      }
    }
  };

  class TestWalFilterWithChangeBatch : public WalFilter {
  private:
    // Index at which to start changing records
    size_t change_records_from_index_;
    // Number of keys to add in the new batch
    size_t num_keys_to_add_in_new_batch_;
    // Current record index, incremented with each record encountered.
    size_t current_record_index_;

  public:
    TestWalFilterWithChangeBatch(size_t change_records_from_index,
      size_t num_keys_to_add_in_new_batch)
      : change_records_from_index_(change_records_from_index),
      num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
      current_record_index_(0) {}

    WalProcessingOption LogRecord(const WriteBatch& batch,
                                  WriteBatch* new_batch,
                                  bool* batch_changed) const override {
      if (current_record_index_ >= change_records_from_index_) {
        ChangeBatchHandler handler(new_batch, num_keys_to_add_in_new_batch_);
        Status s = batch.Iterate(&handler);
        if (s.ok()) {
          *batch_changed = true;
        } else {
          assert(false);
        }
      }

      // The filter is passed as a const object so that RocksDB does not
      // modify it; we modify it for our own purposes here, hence the
      // const_cast.
      (const_cast<TestWalFilterWithChangeBatch*>(this)
        ->current_record_index_)++;

      return WalProcessingOption::kContinueProcessing;
    }

    const char* Name() const override { return "TestWalFilterWithChangeBatch"; }
  };

  std::vector<std::vector<std::string>> batch_keys(3);

  batch_keys[0].push_back("key1");
  batch_keys[0].push_back("key2");
  batch_keys[1].push_back("key3");
  batch_keys[1].push_back("key4");
  batch_keys[2].push_back("key5");
  batch_keys[2].push_back("key6");

  Options options = OptionsForLogIterTest();
  DestroyAndReopen(options);
  CreateAndReopenWithCF({ "pikachu" }, options);

  // Write given keys in given batches
  for (size_t i = 0; i < batch_keys.size(); i++) {
    WriteBatch batch;
    for (size_t j = 0; j < batch_keys[i].size(); j++) {
      ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
    }
    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
  }

  // Create a test filter that rewrites batches starting at the given record
  // index
  size_t change_records_from_index = 1;
  size_t num_keys_to_add_in_new_batch = 1;
  TestWalFilterWithChangeBatch test_wal_filter_with_change_batch(
    change_records_from_index, num_keys_to_add_in_new_batch);

  // Reopen database with option to use WAL filter
  options = OptionsForLogIterTest();
  options.wal_filter = &test_wal_filter_with_change_batch;
  ReopenWithColumnFamilies({ "default", "pikachu" }, options);

  // Ensure that all keys exist before change_records_from_index_,
  // and that after that index only a single key exists,
  // as our filter adds only a single key to each batch
  std::vector<Slice> keys_must_exist;
  std::vector<Slice> keys_must_not_exist;

  for (size_t i = 0; i < batch_keys.size(); i++) {
    for (size_t j = 0; j < batch_keys[i].size(); j++) {
      if (i >= change_records_from_index && j >= num_keys_to_add_in_new_batch) {
        keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
      } else {
        keys_must_exist.push_back(Slice(batch_keys[i][j]));
      }
    }
  }

  bool checked_after_reopen = false;

  while (true) {
    // Ensure that expected keys exist
    // and unexpected keys don't exist after recovery
    ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);

    if (checked_after_reopen) {
      break;
    }

    // Reopen the database again (without the WAL filter) to make sure
    // the previous log(s) are not used (even if they were skipped).
    options = OptionsForLogIterTest();
    ReopenWithColumnFamilies({ "default", "pikachu" }, options);

    checked_after_reopen = true;
  }
}

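// A WAL filter must not grow a batch: recovery with a filter that adds an
// extra key is expected to fail with NotSupported and leave the DB unchanged.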
TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) {
  class TestWalFilterWithChangeBatchAddExtraKeys : public WalFilter {
  public:
   WalProcessingOption LogRecord(const WriteBatch& batch, WriteBatch* new_batch,
                                 bool* batch_changed) const override {
     *new_batch = batch;
     Status s = new_batch->Put("key_extra", "value_extra");
     if (s.ok()) {
       *batch_changed = true;
     } else {
       assert(false);
     }
     return WalProcessingOption::kContinueProcessing;
   }

   const char* Name() const override {
     return "WalFilterTestWithChangeBatchExtraKeys";
   }
  };

  std::vector<std::vector<std::string>> batch_keys(3);

  batch_keys[0].push_back("key1");
  batch_keys[0].push_back("key2");
  batch_keys[1].push_back("key3");
  batch_keys[1].push_back("key4");
  batch_keys[2].push_back("key5");
  batch_keys[2].push_back("key6");

  Options options = OptionsForLogIterTest();
  DestroyAndReopen(options);
  CreateAndReopenWithCF({ "pikachu" }, options);

  // Write given keys in given batches
  for (size_t i = 0; i < batch_keys.size(); i++) {
    WriteBatch batch;
    for (size_t j = 0; j < batch_keys[i].size(); j++) {
      ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
    }
    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
  }

  // Create a test filter that would add extra keys
  TestWalFilterWithChangeBatchAddExtraKeys test_wal_filter_extra_keys;

  // Reopen database with option to use WAL filter
  options = OptionsForLogIterTest();
  options.wal_filter = &test_wal_filter_extra_keys;
  Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
  ASSERT_TRUE(status.IsNotSupported());

  // Reopen without the filter; now reopen should succeed, since the previous
  // attempt to open must not have altered the db.
  options = OptionsForLogIterTest();
  ReopenWithColumnFamilies({ "default", "pikachu" }, options);

  std::vector<Slice> keys_must_exist;
  std::vector<Slice> keys_must_not_exist;  // empty vector

  for (size_t i = 0; i < batch_keys.size(); i++) {
    for (size_t j = 0; j < batch_keys[i].size(); j++) {
      keys_must_exist.push_back(Slice(batch_keys[i][j]));
    }
  }

  ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
}

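// Uses ColumnFamilyLogNumberMap() to learn each column family's minimum live
// WAL number, then verifies LogRecordFound() surfaces only records not yet
// flushed for that CF: post-flush keys for "default", pre- and post-flush
// keys for "pikachu".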
TEST_F(DBTest2, WalFilterTestWithColumnFamilies) {
  class TestWalFilterWithColumnFamilies : public WalFilter {
  private:
    // column_family_id -> log_number map (provided to WALFilter)
    std::map<uint32_t, uint64_t> cf_log_number_map_;
    // column_family_name -> column_family_id map (provided to WALFilter)
    std::map<std::string, uint32_t> cf_name_id_map_;
    // column_family_id -> keys_found_in_wal map
    // We store keys that are applicable to the column_family
    // during recovery (i.e. aren't already flushed to SST file(s))
    // for verification against the keys we expect.
    std::map<uint32_t, std::vector<std::string>> cf_wal_keys_;
  public:
   void ColumnFamilyLogNumberMap(
       const std::map<uint32_t, uint64_t>& cf_lognumber_map,
       const std::map<std::string, uint32_t>& cf_name_id_map) override {
     cf_log_number_map_ = cf_lognumber_map;
     cf_name_id_map_ = cf_name_id_map;
   }

   WalProcessingOption LogRecordFound(unsigned long long log_number,
                                      const std::string& /*log_file_name*/,
                                      const WriteBatch& batch,
                                      WriteBatch* /*new_batch*/,
                                      bool* /*batch_changed*/) override {
     class LogRecordBatchHandler : public WriteBatch::Handler {
      private:
        const std::map<uint32_t, uint64_t>& cf_log_number_map_;
        std::map<uint32_t, std::vector<std::string>>& cf_wal_keys_;
        unsigned long long log_number_;
      public:
        LogRecordBatchHandler(unsigned long long current_log_number,
          const std::map<uint32_t, uint64_t>& cf_log_number_map,
          std::map<uint32_t, std::vector<std::string>>& cf_wal_keys) :
          cf_log_number_map_(cf_log_number_map),
          cf_wal_keys_(cf_wal_keys),
          log_number_(current_log_number) {}

        Status PutCF(uint32_t column_family_id, const Slice& key,
                     const Slice& /*value*/) override {
          auto it = cf_log_number_map_.find(column_family_id);
          assert(it != cf_log_number_map_.end());
          unsigned long long log_number_for_cf = it->second;
          // If the current record is applicable for column_family_id
          // (i.e. isn't flushed to SST file(s) for column_family_id)
          // add it to the cf_wal_keys_ map for verification.
          if (log_number_ >= log_number_for_cf) {
            cf_wal_keys_[column_family_id].push_back(std::string(key.data(),
              key.size()));
          }
          return Status::OK();
        }
      } handler(log_number, cf_log_number_map_, cf_wal_keys_);

      Status s = batch.Iterate(&handler);
      if (!s.ok()) {
        // TODO(AR) is this ok?
        return WalProcessingOption::kCorruptedRecord;
      }

      return WalProcessingOption::kContinueProcessing;
   }

   const char* Name() const override {
     return "WalFilterTestWithColumnFamilies";
   }

    const std::map<uint32_t, std::vector<std::string>>& GetColumnFamilyKeys() {
      return cf_wal_keys_;
    }

    const std::map<std::string, uint32_t>& GetColumnFamilyNameIdMap() {
      return cf_name_id_map_;
    }
  };

  std::vector<std::vector<std::string>> batch_keys_pre_flush(3);

  batch_keys_pre_flush[0].push_back("key1");
  batch_keys_pre_flush[0].push_back("key2");
  batch_keys_pre_flush[1].push_back("key3");
  batch_keys_pre_flush[1].push_back("key4");
  batch_keys_pre_flush[2].push_back("key5");
  batch_keys_pre_flush[2].push_back("key6");

  Options options = OptionsForLogIterTest();
  DestroyAndReopen(options);
  CreateAndReopenWithCF({ "pikachu" }, options);

  // Write given keys in given batches
  for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
    WriteBatch batch;
    for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
      ASSERT_OK(batch.Put(handles_[0], batch_keys_pre_flush[i][j],
                          DummyString(1024)));
      ASSERT_OK(batch.Put(handles_[1], batch_keys_pre_flush[i][j],
                          DummyString(1024)));
    }
    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
  }

  // Flush the default column family
  ASSERT_OK(db_->Flush(FlushOptions(), handles_[0]));

  // Do some more writes
  std::vector<std::vector<std::string>> batch_keys_post_flush(3);

  batch_keys_post_flush[0].push_back("key7");
  batch_keys_post_flush[0].push_back("key8");
  batch_keys_post_flush[1].push_back("key9");
  batch_keys_post_flush[1].push_back("key10");
  batch_keys_post_flush[2].push_back("key11");
  batch_keys_post_flush[2].push_back("key12");

  // Write given keys in given batches
  for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
    WriteBatch batch;
    for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
      ASSERT_OK(batch.Put(handles_[0], batch_keys_post_flush[i][j],
                          DummyString(1024)));
      ASSERT_OK(batch.Put(handles_[1], batch_keys_post_flush[i][j],
                          DummyString(1024)));
    }
    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
  }

  // On recovery we should find only the second batch applicable to the
  // default CF, but both batches applicable to the pikachu CF.

  // Create a test filter that records, per column family, the keys it sees
  // during recovery
  TestWalFilterWithColumnFamilies test_wal_filter_column_families;

  // Reopen database with option to use WAL filter
  options = OptionsForLogIterTest();
  options.wal_filter = &test_wal_filter_column_families;
  Status status =
    TryReopenWithColumnFamilies({ "default", "pikachu" }, options);
  ASSERT_TRUE(status.ok());

  // Verify that handles_[0] only has post_flush keys
  // while handles_[1] has pre- and post-flush keys
  auto cf_wal_keys = test_wal_filter_column_families.GetColumnFamilyKeys();
  auto name_id_map = test_wal_filter_column_families.GetColumnFamilyNameIdMap();
  size_t index = 0;
  auto keys_cf = cf_wal_keys[name_id_map[kDefaultColumnFamilyName]];
  // Default column family: only post_flush keys are expected
  for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
    for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
      Slice key_from_the_log(keys_cf[index++]);
      Slice batch_key(batch_keys_post_flush[i][j]);
      ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
    }
  }
  ASSERT_EQ(index, keys_cf.size());

  index = 0;
  keys_cf = cf_wal_keys[name_id_map["pikachu"]];
  // Pikachu column family: all keys are expected
  for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
    for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
      Slice key_from_the_log(keys_cf[index++]);
      Slice batch_key(batch_keys_pre_flush[i][j]);
      ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
    }
  }

  for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
    for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
      Slice key_from_the_log(keys_cf[index++]);
      Slice batch_key(batch_keys_post_flush[i][j]);
      ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
    }
  }
  ASSERT_EQ(index, keys_cf.size());
}

TEST_F(DBTest2, PresetCompressionDict) {
  // Verifies that compression ratio improves when dictionary is enabled, and
  // improves even further when the dictionary is trained by ZSTD.
  const size_t kBlockSizeBytes = 4 << 10;
  const size_t kL0FileBytes = 128 << 10;
  const size_t kApproxPerBlockOverheadBytes = 50;
  const int kNumL0Files = 5;

  Options options;
  // Make sure to use any custom env that the test is configured with.
  options.env = CurrentOptions().env;
  options.allow_concurrent_memtable_write = false;
  options.arena_block_size = kBlockSizeBytes;
  options.create_if_missing = true;
  options.disable_auto_compactions = true;
  options.level0_file_num_compaction_trigger = kNumL0Files;
  options.memtable_factory.reset(
      test::NewSpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes));
  options.num_levels = 2;
  options.target_file_size_base = kL0FileBytes;
  options.target_file_size_multiplier = 2;
  options.write_buffer_size = kL0FileBytes;
  BlockBasedTableOptions table_options;
  table_options.block_size = kBlockSizeBytes;
  std::vector<CompressionType> compression_types;
  if (Zlib_Supported()) {
    compression_types.push_back(kZlibCompression);
  }
#if LZ4_VERSION_NUMBER >= 10400  // r124+
  compression_types.push_back(kLZ4Compression);
  compression_types.push_back(kLZ4HCCompression);
#endif                          // LZ4_VERSION_NUMBER >= 10400
  if (ZSTD_Supported()) {
    compression_types.push_back(kZSTD);
  }

  enum DictionaryTypes : int {
    kWithoutDict,
    kWithDict,
    kWithZSTDTrainedDict,
    kDictEnd,
  };

  for (auto compression_type : compression_types) {
    options.compression = compression_type;
    size_t bytes_without_dict = 0;
    size_t bytes_with_dict = 0;
    size_t bytes_with_zstd_trained_dict = 0;
    for (int i = kWithoutDict; i < kDictEnd; i++) {
      // First iteration: compress without preset dictionary
      // Second iteration: compress with preset dictionary
      // Third iteration (zstd only): compress with zstd-trained dictionary
      //
      // To make sure the compression dictionary has the intended effect, we
      // verify the compressed size is smaller in successive iterations. Also
      // in the non-first iterations, verify the data we get out is the same
      // data we put in.
      switch (i) {
        case kWithoutDict:
          options.compression_opts.max_dict_bytes = 0;
          options.compression_opts.zstd_max_train_bytes = 0;
          break;
        case kWithDict:
          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
          options.compression_opts.zstd_max_train_bytes = 0;
          break;
        case kWithZSTDTrainedDict:
          if (compression_type != kZSTD) {
            continue;
          }
          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
          break;
        default:
          assert(false);
      }

      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
      CreateAndReopenWithCF({"pikachu"}, options);
      Random rnd(301);
      std::string seq_datas[10];
      for (int j = 0; j < 10; ++j) {
        seq_datas[j] =
            rnd.RandomString(kBlockSizeBytes - kApproxPerBlockOverheadBytes);
      }

      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
      for (int j = 0; j < kNumL0Files; ++j) {
        for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) {
          auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k;
          ASSERT_OK(Put(1, Key(static_cast<int>(key_num)),
                        seq_datas[(key_num / 10) % 10]));
        }
        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
        ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1));
      }
      ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
                                            true /* disallow_trivial_move */));
      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
      ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);

      // Get the live sst files size
      size_t total_sst_bytes = TotalSize(1);
      if (i == kWithoutDict) {
        bytes_without_dict = total_sst_bytes;
      } else if (i == kWithDict) {
        bytes_with_dict = total_sst_bytes;
      } else if (i == kWithZSTDTrainedDict) {
        bytes_with_zstd_trained_dict = total_sst_bytes;
      }

      for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes);
           j++) {
        ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast<int>(j))));
      }
      if (i == kWithDict) {
        ASSERT_GT(bytes_without_dict, bytes_with_dict);
      } else if (i == kWithZSTDTrainedDict) {
        // In zstd compression, it is sometimes possible that using a trained
        // dictionary does not get as good a compression ratio as without
        // training.
        // But using a dictionary (with or without training) should always get
        // a better compression ratio than not using one.
        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
                    bytes_without_dict > bytes_with_zstd_trained_dict);
      }

      DestroyAndReopen(options);
    }
  }
}
1383 
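// The preceding test drives all three dictionary modes purely through
// `CompressionOptions`. As a minimal standalone sketch (the option values
// here are illustrative assumptions, not taken from the test above):
//
//   Options dict_opts;
//   dict_opts.compression = kZSTD;
//   dict_opts.compression_opts.max_dict_bytes = 16 << 10;  // dict size cap
//   dict_opts.compression_opts.zstd_max_train_bytes = 128 << 10;
//   // With zstd_max_train_bytes == 0, a dictionary is still used, but it is
//   // assembled from sampled raw blocks rather than trained by zstd.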
TEST_F(DBTest2, PresetCompressionDictLocality) {
  if (!ZSTD_Supported()) {
    return;
  }
  // Verifies that compression dictionary is generated from local data. The
  // verification simply checks all output SSTs have different compression
  // dictionaries. We do not verify effectiveness as that'd likely be flaky in
  // the future.
  const int kNumEntriesPerFile = 1 << 10;  // 1K entries
  const int kNumBytesPerEntry = 1 << 10;   // 1KB
  const int kNumFiles = 4;
  Options options = CurrentOptions();
  options.compression = kZSTD;
  options.compression_opts.max_dict_bytes = 1 << 14;        // 16KB
  options.compression_opts.zstd_max_train_bytes = 1 << 18;  // 256KB
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
  BlockBasedTableOptions table_options;
  table_options.cache_index_and_filter_blocks = true;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  Reopen(options);

  Random rnd(301);
  for (int i = 0; i < kNumFiles; ++i) {
    for (int j = 0; j < kNumEntriesPerFile; ++j) {
      ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j),
                    rnd.RandomString(kNumBytesPerEntry)));
    }
    ASSERT_OK(Flush());
    MoveFilesToLevel(1);
    ASSERT_EQ(NumTableFilesAtLevel(1), i + 1);
  }

  // Store all the dictionaries generated during a full compaction.
  std::vector<std::string> compression_dicts;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
      [&](void* arg) {
        compression_dicts.emplace_back(static_cast<Slice*>(arg)->ToString());
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  CompactRangeOptions compact_range_opts;
  compact_range_opts.bottommost_level_compaction =
      BottommostLevelCompaction::kForceOptimized;
  ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));

  // Dictionary compression should not be so good as to compress four totally
  // random files into one. If it does then there's probably something wrong
  // with the test.
  ASSERT_GT(NumTableFilesAtLevel(1), 1);

  // Furthermore, there should be one compression dictionary generated per
  // file, and they should all differ from each other.
  ASSERT_EQ(NumTableFilesAtLevel(1),
            static_cast<int>(compression_dicts.size()));
  for (size_t i = 1; i < compression_dicts.size(); ++i) {
    std::string& a = compression_dicts[i - 1];
    std::string& b = compression_dicts[i];
    size_t alen = a.size();
    size_t blen = b.size();
    ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0);
  }
}

class PresetCompressionDictTest
    : public DBTestBase,
      public testing::WithParamInterface<std::tuple<CompressionType, bool>> {
 public:
  PresetCompressionDictTest()
      : DBTestBase("db_test2", false /* env_do_fsync */),
        compression_type_(std::get<0>(GetParam())),
        bottommost_(std::get<1>(GetParam())) {}

 protected:
  const CompressionType compression_type_;
  const bool bottommost_;
};

INSTANTIATE_TEST_CASE_P(
    DBTest2, PresetCompressionDictTest,
    ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()),
                       ::testing::Bool()));

TEST_P(PresetCompressionDictTest, Flush) {
  // Verifies that dictionary is generated and written during flush only when
  // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the
  // size of the dictionary is within expectations according to the limit on
  // buffering set by `CompressionOptions::max_dict_buffer_bytes`.
  const size_t kValueLen = 256;
  const size_t kKeysPerFile = 1 << 10;
  const size_t kDictLen = 16 << 10;
  const size_t kBlockLen = 4 << 10;

  Options options = CurrentOptions();
  if (bottommost_) {
    options.bottommost_compression = compression_type_;
    options.bottommost_compression_opts.enabled = true;
    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
  } else {
    options.compression = compression_type_;
    options.compression_opts.max_dict_bytes = kDictLen;
    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
  }
  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kKeysPerFile));
  options.statistics = CreateDBStatistics();
  BlockBasedTableOptions bbto;
  bbto.block_size = kBlockLen;
  bbto.cache_index_and_filter_blocks = true;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  Reopen(options);

  Random rnd(301);
  for (size_t i = 0; i <= kKeysPerFile; ++i) {
    ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(kValueLen)));
  }
  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());

  // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
  // compression dictionary exists since dictionaries would be preloaded when
  // the flush finishes.
  if (bottommost_) {
    // Flush is never considered bottommost. This should change in the future
    // since flushed files may have nothing underneath them, like the one in
    // this test case.
    ASSERT_EQ(
        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
        0);
  } else {
    ASSERT_GT(
        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
        0);
    // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
    // number of bytes needs to be adjusted in case the cached block is in
    // ZSTD's digested dictionary format.
    if (compression_type_ != kZSTD &&
        compression_type_ != kZSTDNotFinalCompression) {
      // Although we limited buffering to `kBlockLen`, there may be up to two
      // blocks of data included in the dictionary since we only check limit
      // after each block is built.
      ASSERT_LE(TestGetTickerCount(options,
                                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
                2 * kBlockLen);
    }
  }
}

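// The ticker check used above generalizes beyond this test: a hedged sketch
// of detecting a preset dictionary after any flush or compaction, assuming
// `options.statistics` was installed beforehand:
//
//   uint64_t dict_bytes = options.statistics->getTickerCount(
//       BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
//   // dict_bytes > 0 implies a compression dictionary block was written and
//   // preloaded into the block cache by the job that just finished.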
TEST_P(PresetCompressionDictTest, CompactNonBottommost) {
  // Verifies that dictionary is generated and written during compaction to
  // non-bottommost level only when `ColumnFamilyOptions::compression` enables
  // dictionary. Also verifies the size of the dictionary is within expectations
  // according to the limit on buffering set by
  // `CompressionOptions::max_dict_buffer_bytes`.
  const size_t kValueLen = 256;
  const size_t kKeysPerFile = 1 << 10;
  const size_t kDictLen = 16 << 10;
  const size_t kBlockLen = 4 << 10;

  Options options = CurrentOptions();
  if (bottommost_) {
    options.bottommost_compression = compression_type_;
    options.bottommost_compression_opts.enabled = true;
    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
  } else {
    options.compression = compression_type_;
    options.compression_opts.max_dict_bytes = kDictLen;
    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
  }
  options.disable_auto_compactions = true;
  options.statistics = CreateDBStatistics();
  BlockBasedTableOptions bbto;
  bbto.block_size = kBlockLen;
  bbto.cache_index_and_filter_blocks = true;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  Reopen(options);

  Random rnd(301);
  for (size_t j = 0; j <= kKeysPerFile; ++j) {
    ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
  }
  ASSERT_OK(Flush());
  MoveFilesToLevel(2);

  for (int i = 0; i < 2; ++i) {
    for (size_t j = 0; j <= kKeysPerFile; ++j) {
      ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
    }
    ASSERT_OK(Flush());
  }
#ifndef ROCKSDB_LITE
  ASSERT_EQ("2,0,1", FilesPerLevel(0));
#endif  // ROCKSDB_LITE

  uint64_t prev_compression_dict_bytes_inserted =
      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
  // This L0->L1 compaction merges the two L0 files into L1. The produced L1
  // file is not bottommost due to the existing L2 file covering the same key-
  // range.
  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
#ifndef ROCKSDB_LITE
  ASSERT_EQ("0,1,1", FilesPerLevel(0));
#endif  // ROCKSDB_LITE
  // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
  // compression dictionary exists since dictionaries would be preloaded when
  // the compaction finishes.
  if (bottommost_) {
    ASSERT_EQ(
        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
        prev_compression_dict_bytes_inserted);
  } else {
    ASSERT_GT(
        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
        prev_compression_dict_bytes_inserted);
    // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
    // number of bytes needs to be adjusted in case the cached block is in
    // ZSTD's digested dictionary format.
    if (compression_type_ != kZSTD &&
        compression_type_ != kZSTDNotFinalCompression) {
      // Although we limited buffering to `kBlockLen`, there may be up to two
      // blocks of data included in the dictionary since we only check limit
      // after each block is built.
      ASSERT_LE(TestGetTickerCount(options,
                                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
                prev_compression_dict_bytes_inserted + 2 * kBlockLen);
    }
  }
}

TEST_P(PresetCompressionDictTest, CompactBottommost) {
  // Verifies that dictionary is generated and written during compaction to the
  // bottommost level only when either `ColumnFamilyOptions::compression` or
  // `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also
  // verifies the size of the dictionary is within expectations according to the
  // limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`.
  const size_t kValueLen = 256;
  const size_t kKeysPerFile = 1 << 10;
  const size_t kDictLen = 16 << 10;
  const size_t kBlockLen = 4 << 10;

  Options options = CurrentOptions();
  if (bottommost_) {
    options.bottommost_compression = compression_type_;
    options.bottommost_compression_opts.enabled = true;
    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
  } else {
    options.compression = compression_type_;
    options.compression_opts.max_dict_bytes = kDictLen;
    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
  }
  options.disable_auto_compactions = true;
  options.statistics = CreateDBStatistics();
  BlockBasedTableOptions bbto;
  bbto.block_size = kBlockLen;
  bbto.cache_index_and_filter_blocks = true;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  Reopen(options);

  Random rnd(301);
  for (int i = 0; i < 2; ++i) {
    for (size_t j = 0; j <= kKeysPerFile; ++j) {
      ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
    }
    ASSERT_OK(Flush());
  }
#ifndef ROCKSDB_LITE
  ASSERT_EQ("2", FilesPerLevel(0));
#endif  // ROCKSDB_LITE

  uint64_t prev_compression_dict_bytes_inserted =
      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
  CompactRangeOptions cro;
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
#ifndef ROCKSDB_LITE
  ASSERT_EQ("0,1", FilesPerLevel(0));
#endif  // ROCKSDB_LITE
  ASSERT_GT(
      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
      prev_compression_dict_bytes_inserted);
  // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
  // number of bytes needs to be adjusted in case the cached block is in ZSTD's
  // digested dictionary format.
  if (compression_type_ != kZSTD &&
      compression_type_ != kZSTDNotFinalCompression) {
    // Although we limited buffering to `kBlockLen`, there may be up to two
    // blocks of data included in the dictionary since we only check limit after
    // each block is built.
    ASSERT_LE(
        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
        prev_compression_dict_bytes_inserted + 2 * kBlockLen);
  }
}

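// Note the `bottommost_compression_opts.enabled = true` opt-in used in the
// three tests above: bottommost-specific `CompressionOptions` are ignored
// unless explicitly enabled. A minimal sketch (values are illustrative):
//
//   Options bm_opts;
//   bm_opts.bottommost_compression = kZSTD;
//   bm_opts.bottommost_compression_opts.max_dict_bytes = 16 << 10;
//   bm_opts.bottommost_compression_opts.enabled = true;  // required opt-in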
class CompactionCompressionListener : public EventListener {
 public:
  explicit CompactionCompressionListener(Options* db_options)
      : db_options_(db_options) {}

  void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
    // Figure out last level with files
    int bottommost_level = 0;
    for (int level = 0; level < db->NumberLevels(); level++) {
      std::string files_at_level;
      ASSERT_TRUE(db->GetProperty(
          "rocksdb.num-files-at-level" + ROCKSDB_NAMESPACE::ToString(level),
          &files_at_level));
      if (files_at_level != "0") {
        bottommost_level = level;
      }
    }

    if (db_options_->bottommost_compression != kDisableCompressionOption &&
        ci.output_level == bottommost_level) {
      ASSERT_EQ(ci.compression, db_options_->bottommost_compression);
    } else if (db_options_->compression_per_level.size() != 0) {
      ASSERT_EQ(ci.compression,
                db_options_->compression_per_level[ci.output_level]);
    } else {
      ASSERT_EQ(ci.compression, db_options_->compression);
    }
    max_level_checked = std::max(max_level_checked, ci.output_level);
  }

  int max_level_checked = 0;
  const Options* db_options_;
};

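// CompactionCompressionListener is registered through `Options::listeners`
// (see the CompressionOptions test below). A minimal registration sketch:
//
//   Options opts = CurrentOptions();
//   opts.listeners.emplace_back(new CompactionCompressionListener(&opts));
//   // OnCompactionCompleted() then fires once per finished compaction job.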
enum CompressionFailureType {
  kTestCompressionFail,
  kTestDecompressionFail,
  kTestDecompressionCorruption
};

class CompressionFailuresTest
    : public DBTest2,
      public testing::WithParamInterface<std::tuple<
          CompressionFailureType, CompressionType, uint32_t, uint32_t>> {
 public:
  CompressionFailuresTest() {
    std::tie(compression_failure_type_, compression_type_,
             compression_max_dict_bytes_, compression_parallel_threads_) =
        GetParam();
  }

  CompressionFailureType compression_failure_type_ = kTestCompressionFail;
  CompressionType compression_type_ = kNoCompression;
  uint32_t compression_max_dict_bytes_ = 0;
  uint32_t compression_parallel_threads_ = 0;
};

INSTANTIATE_TEST_CASE_P(
    DBTest2, CompressionFailuresTest,
    ::testing::Combine(::testing::Values(kTestCompressionFail,
                                         kTestDecompressionFail,
                                         kTestDecompressionCorruption),
                       ::testing::ValuesIn(GetSupportedCompressions()),
                       ::testing::Values(0, 10), ::testing::Values(1, 4)));

TEST_P(CompressionFailuresTest, CompressionFailures) {
  if (compression_type_ == kNoCompression) {
    return;
  }

  Options options = CurrentOptions();
  options.level0_file_num_compaction_trigger = 2;
  options.max_bytes_for_level_base = 1024;
  options.max_bytes_for_level_multiplier = 2;
  options.num_levels = 7;
  options.max_background_compactions = 1;
  options.target_file_size_base = 512;

  BlockBasedTableOptions table_options;
  table_options.block_size = 512;
  table_options.verify_compression = true;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  options.compression = compression_type_;
  options.compression_opts.parallel_threads = compression_parallel_threads_;
  options.compression_opts.max_dict_bytes = compression_max_dict_bytes_;
  options.bottommost_compression_opts.parallel_threads =
      compression_parallel_threads_;
  options.bottommost_compression_opts.max_dict_bytes =
      compression_max_dict_bytes_;

  if (compression_failure_type_ == kTestCompressionFail) {
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "CompressData:TamperWithReturnValue", [](void* arg) {
          bool* ret = static_cast<bool*>(arg);
          *ret = false;
        });
  } else if (compression_failure_type_ == kTestDecompressionFail) {
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "UncompressBlockContentsForCompressionType:TamperWithReturnValue",
        [](void* arg) {
          Status* ret = static_cast<Status*>(arg);
          ASSERT_OK(*ret);
          *ret = Status::Corruption("kTestDecompressionFail");
        });
  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "UncompressBlockContentsForCompressionType:"
        "TamperWithDecompressionOutput",
        [](void* arg) {
          BlockContents* contents = static_cast<BlockContents*>(arg);
          // Ensure uncompressed data != original data
          const size_t len = contents->data.size() + 1;
          std::unique_ptr<char[]> fake_data(new char[len]());
          *contents = BlockContents(std::move(fake_data), len);
        });
  }

  std::map<std::string, std::string> key_value_written;

  const int kKeySize = 5;
  const int kValUnitSize = 16;
  const int kValSize = 256;
  Random rnd(405);

  Status s = Status::OK();

  DestroyAndReopen(options);
  // Write 10 random files
  for (int i = 0; i < 10; i++) {
    for (int j = 0; j < 5; j++) {
      std::string key = rnd.RandomString(kKeySize);
      // Ensure a good compression ratio by repeating a short random unit
      std::string valueUnit = rnd.RandomString(kValUnitSize);
      std::string value;
      for (int k = 0; k < kValSize; k += kValUnitSize) {
        value += valueUnit;
      }
      s = Put(key, value);
      if (compression_failure_type_ == kTestCompressionFail) {
        key_value_written[key] = value;
        ASSERT_OK(s);
      }
    }
    s = Flush();
    if (compression_failure_type_ == kTestCompressionFail) {
      ASSERT_OK(s);
    }
    s = dbfull()->TEST_WaitForCompact();
    if (compression_failure_type_ == kTestCompressionFail) {
      ASSERT_OK(s);
    }
    if (i == 4) {
      // Make compression fail in the middle of table building
      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
    }
  }
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();

  if (compression_failure_type_ == kTestCompressionFail) {
    // Writes should have fallen back to kNoCompression; check content
    // consistency
    std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
    for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
      std::string key = db_iter->key().ToString();
      std::string value = db_iter->value().ToString();
      ASSERT_NE(key_value_written.find(key), key_value_written.end());
      ASSERT_EQ(key_value_written[key], value);
      key_value_written.erase(key);
    }
    ASSERT_EQ(0, key_value_written.size());
  } else if (compression_failure_type_ == kTestDecompressionFail) {
    ASSERT_EQ(std::string(s.getState()),
              "Could not decompress: kTestDecompressionFail");
  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
    ASSERT_EQ(std::string(s.getState()),
              "Decompressed block did not match raw block");
  }
}

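// The failure injection above follows the usual SyncPoint pattern: a
// callback receives a void* to an in-flight value and tampers with it before
// processing is enabled. A condensed sketch of the compression case:
//
//   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
//       "CompressData:TamperWithReturnValue",
//       [](void* arg) { *static_cast<bool*>(arg) = false; });
//   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
//   // Every compression attempt now reports failure, so blocks are stored
//   // uncompressed (the kNoCompression fallback).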
TEST_F(DBTest2, CompressionOptions) {
  if (!Zlib_Supported() || !Snappy_Supported()) {
    return;
  }

  Options options = CurrentOptions();
  options.level0_file_num_compaction_trigger = 2;
  options.max_bytes_for_level_base = 100;
  options.max_bytes_for_level_multiplier = 2;
  options.num_levels = 7;
  options.max_background_compactions = 1;

  CompactionCompressionListener* listener =
      new CompactionCompressionListener(&options);
  options.listeners.emplace_back(listener);

  const int kKeySize = 5;
  const int kValSize = 20;
  Random rnd(301);

  std::vector<uint32_t> compression_parallel_threads = {1, 4};

  std::map<std::string, std::string> key_value_written;

  for (int iter = 0; iter <= 2; iter++) {
    listener->max_level_checked = 0;

    if (iter == 0) {
      // Use different compression algorithms for different levels but
      // always use Zlib for bottommost level
      options.compression_per_level = {kNoCompression,     kNoCompression,
                                       kNoCompression,     kSnappyCompression,
                                       kSnappyCompression, kSnappyCompression,
                                       kZlibCompression};
      options.compression = kNoCompression;
      options.bottommost_compression = kZlibCompression;
    } else if (iter == 1) {
      // Use Snappy everywhere except the bottommost level, which uses Zlib
      options.compression_per_level = {};
      options.compression = kSnappyCompression;
      options.bottommost_compression = kZlibCompression;
    } else if (iter == 2) {
      // Use Snappy everywhere
      options.compression_per_level = {};
      options.compression = kSnappyCompression;
      options.bottommost_compression = kDisableCompressionOption;
    }

    for (auto num_threads : compression_parallel_threads) {
      options.compression_opts.parallel_threads = num_threads;
      options.bottommost_compression_opts.parallel_threads = num_threads;

      DestroyAndReopen(options);
      // Write 10 random files
      for (int i = 0; i < 10; i++) {
        for (int j = 0; j < 5; j++) {
          std::string key = rnd.RandomString(kKeySize);
          std::string value = rnd.RandomString(kValSize);
          key_value_written[key] = value;
          ASSERT_OK(Put(key, value));
        }
        ASSERT_OK(Flush());
        ASSERT_OK(dbfull()->TEST_WaitForCompact());
      }

      // Make sure that we wrote enough to check all 7 levels
      ASSERT_EQ(listener->max_level_checked, 6);

      // Make sure database content is the same as key_value_written
      std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
      for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
        std::string key = db_iter->key().ToString();
        std::string value = db_iter->value().ToString();
        ASSERT_NE(key_value_written.find(key), key_value_written.end());
        ASSERT_EQ(key_value_written[key], value);
        key_value_written.erase(key);
      }
      ASSERT_OK(db_iter->status());
      ASSERT_EQ(0, key_value_written.size());
    }
  }
}

class CompactionStallTestListener : public EventListener {
 public:
  CompactionStallTestListener()
      : compacting_files_cnt_(0), compacted_files_cnt_(0) {}

  void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
    ASSERT_EQ(ci.cf_name, "default");
    ASSERT_EQ(ci.base_input_level, 0);
    ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
    compacting_files_cnt_ += ci.input_files.size();
  }

  void OnCompactionCompleted(DB* /*db*/,
                             const CompactionJobInfo& ci) override {
    ASSERT_EQ(ci.cf_name, "default");
    ASSERT_EQ(ci.base_input_level, 0);
    ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
    compacted_files_cnt_ += ci.input_files.size();
  }

  std::atomic<size_t> compacting_files_cnt_;
  std::atomic<size_t> compacted_files_cnt_;
};

TEST_F(DBTest2, CompactionStall) {
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:0"},
       {"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:1"},
       {"DBTest2::CompactionStall:2",
        "DBImpl::NotifyOnCompactionBegin::UnlockMutex"},
       {"DBTest2::CompactionStall:3",
        "DBImpl::NotifyOnCompactionCompleted::UnlockMutex"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  Options options = CurrentOptions();
  options.level0_file_num_compaction_trigger = 4;
  options.max_background_compactions = 40;
  CompactionStallTestListener* listener = new CompactionStallTestListener();
  options.listeners.emplace_back(listener);
  DestroyAndReopen(options);
  // make sure all background compaction jobs can be scheduled
  auto stop_token =
      dbfull()->TEST_write_controler().GetCompactionPressureToken();

  Random rnd(301);

  // 4 Files in L0
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 10; j++) {
      ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
    }
    ASSERT_OK(Flush());
  }

  // Wait for compaction to be triggered
  TEST_SYNC_POINT("DBTest2::CompactionStall:0");

  // Clear the "DBImpl::BGWorkCompaction" sync point since we want to hold it
  // again at DBTest2::CompactionStall:1
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();

  // Another 6 L0 files to trigger compaction again
  for (int i = 0; i < 6; i++) {
    for (int j = 0; j < 10; j++) {
      ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
    }
    ASSERT_OK(Flush());
  }

  // Wait for another compaction to be triggered
  TEST_SYNC_POINT("DBTest2::CompactionStall:1");

  // Hold NotifyOnCompactionBegin in the unlock mutex section
  TEST_SYNC_POINT("DBTest2::CompactionStall:2");

  // Hold NotifyOnCompactionCompleted in the unlock mutex section
  TEST_SYNC_POINT("DBTest2::CompactionStall:3");

  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_LT(NumTableFilesAtLevel(0),
            options.level0_file_num_compaction_trigger);
  ASSERT_GT(listener->compacted_files_cnt_.load(),
            10 - options.level0_file_num_compaction_trigger);
  ASSERT_EQ(listener->compacting_files_cnt_.load(),
            listener->compacted_files_cnt_.load());

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

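// LoadDependency({{"A", "B"}}) as used above makes any thread reaching sync
// point "B" block until some thread has passed "A". A two-point sketch with
// hypothetical point names:
//
//   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
//       {{"Producer:Done", "Consumer:Start"}});
//   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
//   // TEST_SYNC_POINT("Consumer:Start") now waits until some thread has
//   // executed TEST_SYNC_POINT("Producer:Done").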
#endif  // ROCKSDB_LITE

TEST_F(DBTest2, FirstSnapshotTest) {
  Options options;
  options.write_buffer_size = 100000;  // Small write buffer
  options = CurrentOptions(options);
  CreateAndReopenWithCF({"pikachu"}, options);

  // This snapshot will have sequence number 0, which is the expected behavior.
  const Snapshot* s1 = db_->GetSnapshot();

  ASSERT_OK(Put(1, "k1", std::string(100000, 'x')));  // Fill memtable
  ASSERT_OK(Put(1, "k2", std::string(100000, 'y')));  // Trigger flush

  db_->ReleaseSnapshot(s1);
}

#ifndef ROCKSDB_LITE
TEST_F(DBTest2, DuplicateSnapshot) {
  Options options;
  options = CurrentOptions(options);
  std::vector<const Snapshot*> snapshots;
  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
  SequenceNumber oldest_ww_snap, first_ww_snap;

  ASSERT_OK(Put("k", "v"));  // inc seq
  snapshots.push_back(db_->GetSnapshot());
  snapshots.push_back(db_->GetSnapshot());
  ASSERT_OK(Put("k", "v"));  // inc seq
  snapshots.push_back(db_->GetSnapshot());
  snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
  first_ww_snap = snapshots.back()->GetSequenceNumber();
  ASSERT_OK(Put("k", "v"));  // inc seq
  snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
  snapshots.push_back(db_->GetSnapshot());
  ASSERT_OK(Put("k", "v"));  // inc seq
  snapshots.push_back(db_->GetSnapshot());

  {
    InstrumentedMutexLock l(dbi->mutex());
    auto seqs = dbi->snapshots().GetAll(&oldest_ww_snap);
    ASSERT_EQ(seqs.size(), 4);  // duplicates are not counted
    ASSERT_EQ(oldest_ww_snap, first_ww_snap);
  }

  for (auto s : snapshots) {
    db_->ReleaseSnapshot(s);
  }
}
#endif  // ROCKSDB_LITE

class PinL0IndexAndFilterBlocksTest
    : public DBTestBase,
      public testing::WithParamInterface<std::tuple<bool, bool>> {
 public:
  PinL0IndexAndFilterBlocksTest()
      : DBTestBase("db_pin_l0_index_bloom_test", /*env_do_fsync=*/true) {}
  void SetUp() override {
    infinite_max_files_ = std::get<0>(GetParam());
    disallow_preload_ = std::get<1>(GetParam());
  }

  void CreateTwoLevels(Options* options, bool close_afterwards) {
    if (infinite_max_files_) {
      options->max_open_files = -1;
    }
    options->create_if_missing = true;
    options->statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
    BlockBasedTableOptions table_options;
    table_options.cache_index_and_filter_blocks = true;
    table_options.pin_l0_filter_and_index_blocks_in_cache = true;
    table_options.filter_policy.reset(NewBloomFilterPolicy(20));
    options->table_factory.reset(NewBlockBasedTableFactory(table_options));
    CreateAndReopenWithCF({"pikachu"}, *options);

    ASSERT_OK(Put(1, "a", "begin"));
    ASSERT_OK(Put(1, "z", "end"));
    ASSERT_OK(Flush(1));
    // move this table to L1
    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));

    // reset block cache
    table_options.block_cache = NewLRUCache(64 * 1024);
    options->table_factory.reset(NewBlockBasedTableFactory(table_options));
    TryReopenWithColumnFamilies({"default", "pikachu"}, *options);
    // create new table at L0
    ASSERT_OK(Put(1, "a2", "begin2"));
    ASSERT_OK(Put(1, "z2", "end2"));
    ASSERT_OK(Flush(1));

    if (close_afterwards) {
      Close();  // This ensures that there is no ref to block cache entries
    }
    table_options.block_cache->EraseUnRefEntries();
  }

  bool infinite_max_files_;
  bool disallow_preload_;
};

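// A condensed sketch of the pinning configuration the helper above relies on
// (same knobs, minimal form):
//
//   BlockBasedTableOptions bbto;
//   bbto.cache_index_and_filter_blocks = true;            // metadata in cache
//   bbto.pin_l0_filter_and_index_blocks_in_cache = true;  // but pin L0's
//   options.table_factory.reset(NewBlockBasedTableFactory(bbto));
//   // L0 index/filter blocks then stay cached for the life of the table
//   // reader, so repeated point reads add no further misses or hits.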
TEST_P(PinL0IndexAndFilterBlocksTest,
       IndexAndFilterBlocksOfNewTableAddedToCacheWithPinning) {
  Options options = CurrentOptions();
  if (infinite_max_files_) {
    options.max_open_files = -1;
  }
  options.create_if_missing = true;
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  BlockBasedTableOptions table_options;
  table_options.cache_index_and_filter_blocks = true;
  table_options.pin_l0_filter_and_index_blocks_in_cache = true;
  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  CreateAndReopenWithCF({"pikachu"}, options);

  ASSERT_OK(Put(1, "key", "val"));
  // Create a new table.
  ASSERT_OK(Flush(1));

  // index/filter blocks added to block cache right after table creation.
  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));

  // only index/filter were added
  ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD));
  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));

  std::string value;
  // Miss and hit count should remain the same, they're all pinned.
  ASSERT_TRUE(db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value));
  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));

  // Miss and hit count should remain the same, they're all pinned.
  value = Get(1, "key");
  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
}

TEST_P(PinL0IndexAndFilterBlocksTest,
       MultiLevelIndexAndFilterBlocksCachedWithPinning) {
  Options options = CurrentOptions();
  PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, false);
  // get base cache values
  uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
  uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
  uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
  uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);

  std::string value;
  // this should be read from L0
  // so cache values don't change
  value = Get(1, "a2");
  ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
  ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
  ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
  ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));

  // this should be read from L1
  // the file is opened, and prefetching results in a cache filter miss;
  // the block is loaded and added to the cache,
  // then the get results in a cache hit for L1
  // When we have infinite max_open_files, there is still a cache miss because
  // we have reset the block cache
  value = Get(1, "a");
  ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
  ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
}

TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
  Options options = CurrentOptions();
  // This ensures that db does not ref anything in the block cache, so
  // EraseUnRefEntries can clear them out.
  bool close_afterwards = true;
  PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, close_afterwards);

  // Get base cache values
  uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
  uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
  uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
  uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);

  if (disallow_preload_) {
    // Now we have two files. We narrow the max open files to allow 3 entries
    // so that preloading SST files won't happen.
    options.max_open_files = 13;
    // RocksDB sanitizes max_open_files to at least 20. Modify it back.
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
          int* max_open_files = static_cast<int*>(arg);
          *max_open_files = 13;
        });
  }
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  // Reopen database. If max_open_files is set as -1, table readers will be
  // preloaded. This will trigger a BlockBasedTable::Open() and prefetch
  // L0 index and filter. Level 1's prefetching is disabled in DB::Open()
  TryReopenWithColumnFamilies({"default", "pikachu"}, options);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();

  if (!disallow_preload_) {
    // After reopen, cache misses are increased by one because we read (and
    // only read) filter and index on L0
    ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
    ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  } else {
    // If max_open_files is not -1, we do not preload table readers, so there
    // is no change.
    ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
    ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  }
  std::string value;
  // this should be read from L0
  value = Get(1, "a2");
  // If max_open_files is -1, we have pinned index and filter in Rep, so there
  // will not be changes in index and filter misses or hits. If max_open_files
  // is not -1, Get() will open a TableReader and prefetch index and filter.
  ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
  ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
  ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
  ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));

  // this should be read from L1
  value = Get(1, "a");
  if (!disallow_preload_) {
    // In the infinite max_open_files case, there's a cache miss in executing
    // Get() because index and filter are not prefetched beforehand.
    ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
    ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  } else {
    // In this case, cache miss will be increased by one in
    // BlockBasedTable::Open() because this is not in DB::Open() code path so we
    // will prefetch L1's index and filter. Cache hit will also be increased by
    // one because Get() will read index and filter from the block cache
    // prefetched in previous Open() call.
    ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
    ASSERT_EQ(ih + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  }

  // Force a full compaction to one single file. There will be a block
  // cache read for both of index and filter. If prefetch doesn't explicitly
  // happen, it will happen when verifying the file.
  Compact(1, "a", "zzzzz");
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  if (!disallow_preload_) {
    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
    ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  } else {
    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  }

  // Bloom and index hit will happen when a Get() happens.
  value = Get(1, "a");
  if (!disallow_preload_) {
    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  } else {
    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
    ASSERT_EQ(ih + 5, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  }
}

INSTANTIATE_TEST_CASE_P(PinL0IndexAndFilterBlocksTest,
                        PinL0IndexAndFilterBlocksTest,
                        ::testing::Values(std::make_tuple(true, false),
                                          std::make_tuple(false, false),
                                          std::make_tuple(false, true)));

#ifndef ROCKSDB_LITE
TEST_F(DBTest2, MaxCompactionBytesTest) {
  Options options = CurrentOptions();
  options.memtable_factory.reset(test::NewSpecialSkipListFactory(
      DBTestBase::kNumKeysByGenerateNewRandomFile));
  options.compaction_style = kCompactionStyleLevel;
  options.write_buffer_size = 200 << 10;
  options.arena_block_size = 4 << 10;
  options.level0_file_num_compaction_trigger = 4;
  options.num_levels = 4;
  options.compression = kNoCompression;
  options.max_bytes_for_level_base = 450 << 10;
  options.target_file_size_base = 100 << 10;
  // Infinite for full compaction.
  options.max_compaction_bytes = options.target_file_size_base * 100;

  Reopen(options);

  Random rnd(301);

  for (int num = 0; num < 8; num++) {
    GenerateNewRandomFile(&rnd);
  }
  CompactRangeOptions cro;
  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
  ASSERT_EQ("0,0,8", FilesPerLevel(0));

  // When compacting from Ln -> Ln+1, cut a file if it would overlap with more
  // than three files in Ln+1.
  options.max_compaction_bytes = options.target_file_size_base * 3;
  Reopen(options);

  GenerateNewRandomFile(&rnd);
  // Add three more small files that overlap with the previous file
  for (int i = 0; i < 3; i++) {
    ASSERT_OK(Put("a", "z"));
    ASSERT_OK(Flush());
  }
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  // Output files to L1 are cut to three pieces, according to
  // options.max_compaction_bytes
  ASSERT_EQ("0,3,8", FilesPerLevel(0));
}

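// Worked numbers for the test above (approximate, since file sizes vary):
// target_file_size_base is 100KB, so max_compaction_bytes caps each L0->L1
// compaction output at roughly 300KB of overlapped data. The new L0 data
// overlaps all eight ~100KB L2 files, so each L1 output file may overlap at
// most about three of them, and the output is cut into three pieces,
// matching the "0,3,8" shape asserted above.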
static void UniqueIdCallback(void* arg) {
  int* result = reinterpret_cast<int*>(arg);
  if (*result == -1) {
    *result = 0;
  }

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
}

class MockPersistentCache : public PersistentCache {
 public:
  explicit MockPersistentCache(const bool is_compressed, const size_t max_size)
      : is_compressed_(is_compressed), max_size_(max_size) {
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
  }

  ~MockPersistentCache() override {}

  PersistentCache::StatsType Stats() override {
    return PersistentCache::StatsType();
  }

  uint64_t NewId() override {
    return last_id_.fetch_add(1, std::memory_order_relaxed);
  }

  Status Insert(const Slice& page_key, const char* data,
                const size_t size) override {
    MutexLock _(&lock_);

    if (size_ > max_size_) {
      size_ -= data_.begin()->second.size();
      data_.erase(data_.begin());
    }

    data_.insert(std::make_pair(page_key.ToString(), std::string(data, size)));
    size_ += size;
    return Status::OK();
  }

  Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
                size_t* size) override {
    MutexLock _(&lock_);
    auto it = data_.find(page_key.ToString());
    if (it == data_.end()) {
      return Status::NotFound();
    }

    assert(page_key.ToString() == it->first);
    data->reset(new char[it->second.size()]);
    memcpy(data->get(), it->second.c_str(), it->second.size());
    *size = it->second.size();
    return Status::OK();
  }

  bool IsCompressed() override { return is_compressed_; }

  std::string GetPrintableOptions() const override {
    return "MockPersistentCache";
  }

  port::Mutex lock_;
  std::map<std::string, std::string> data_;
  const bool is_compressed_ = true;
  size_t size_ = 0;
  const size_t max_size_ = 10 * 1024;  // 10KiB
  std::atomic<uint64_t> last_id_{1};
};

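// MockPersistentCache above implements just enough of the PersistentCache
// interface to back a table: Insert() stores a page, Lookup() hands back a
// heap copy, and IsCompressed() tells the reader whether pages hold
// compressed blocks. It is wired in the same way the PersistentCache test
// below does it:
//
//   BlockBasedTableOptions bbto;
//   bbto.persistent_cache.reset(
//       new MockPersistentCache(/*is_compressed=*/true, 10 * 1024));
//   options.table_factory.reset(NewBlockBasedTableFactory(bbto));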
#ifdef OS_LINUX
// Make sure that in CPU time perf context counters, Env::NowCPUNanos()
// is used, rather than Env::NowNanos();
TEST_F(DBTest2, TestPerfContextGetCpuTime) {
  // force resizing table cache so table handle is not preloaded so that
  // we can measure find_table_nanos during Get().
  dbfull()->TEST_table_cache()->SetCapacity(0);
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Flush());
  env_->now_cpu_count_.store(0);
  env_->SetMockSleep();

  // NOTE: Presumed unnecessary and removed: resetting mock time in env

  // CPU timing is not enabled with kEnableTimeExceptForMutex
  SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
  ASSERT_EQ("bar", Get("foo"));
  ASSERT_EQ(0, get_perf_context()->get_cpu_nanos);
  ASSERT_EQ(0, env_->now_cpu_count_.load());

  constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000};
  constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds;

  // Add time to NowNanos() reading.
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "TableCache::FindTable:0",
      [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
  ASSERT_EQ("bar", Get("foo"));
  ASSERT_GT(env_->now_cpu_count_.load(), 2);
  ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonNanos);
  ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos);

  SetPerfLevel(PerfLevel::kDisable);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, TestPerfContextIterCpuTime) {
  DestroyAndReopen(CurrentOptions());
  // force resizing table cache so table handle is not preloaded so that
  // we can measure find_table_nanos during iteration
  dbfull()->TEST_table_cache()->SetCapacity(0);

  const size_t kNumEntries = 10;
  for (size_t i = 0; i < kNumEntries; ++i) {
    ASSERT_OK(Put("k" + ToString(i), "v" + ToString(i)));
  }
  ASSERT_OK(Flush());
  for (size_t i = 0; i < kNumEntries; ++i) {
    ASSERT_EQ("v" + ToString(i), Get("k" + ToString(i)));
  }
  std::string last_key = "k" + ToString(kNumEntries - 1);
  std::string last_value = "v" + ToString(kNumEntries - 1);
  env_->now_cpu_count_.store(0);
  env_->SetMockSleep();

  // NOTE: Presumed unnecessary and removed: resetting mock time in env

  // CPU timing is not enabled with kEnableTimeExceptForMutex
  SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
  Iterator* iter = db_->NewIterator(ReadOptions());
  iter->Seek("k0");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("v0", iter->value().ToString());
  iter->SeekForPrev(last_key);
  ASSERT_TRUE(iter->Valid());
  iter->SeekToLast();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(last_value, iter->value().ToString());
  iter->SeekToFirst();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("v0", iter->value().ToString());
  ASSERT_EQ(0, get_perf_context()->iter_seek_cpu_nanos);
  iter->Next();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("v1", iter->value().ToString());
  ASSERT_EQ(0, get_perf_context()->iter_next_cpu_nanos);
  iter->Prev();
  ASSERT_TRUE(iter->Valid());
  ASSERT_OK(iter->status());
  ASSERT_EQ("v0", iter->value().ToString());
  ASSERT_EQ(0, get_perf_context()->iter_prev_cpu_nanos);
  ASSERT_EQ(0, env_->now_cpu_count_.load());
  delete iter;

  constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000};
  constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds;

  // Add time to NowNanos() reading.
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "TableCache::FindTable:0",
      [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
  iter = db_->NewIterator(ReadOptions());
  iter->Seek("k0");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("v0", iter->value().ToString());
  iter->SeekForPrev(last_key);
  ASSERT_TRUE(iter->Valid());
  iter->SeekToLast();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(last_value, iter->value().ToString());
  iter->SeekToFirst();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("v0", iter->value().ToString());
  ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, 0);
  ASSERT_LT(get_perf_context()->iter_seek_cpu_nanos, kDummyAddonNanos);
  iter->Next();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("v1", iter->value().ToString());
  ASSERT_GT(get_perf_context()->iter_next_cpu_nanos, 0);
  ASSERT_LT(get_perf_context()->iter_next_cpu_nanos, kDummyAddonNanos);
  iter->Prev();
  ASSERT_TRUE(iter->Valid());
  ASSERT_OK(iter->status());
  ASSERT_EQ("v0", iter->value().ToString());
  ASSERT_GT(get_perf_context()->iter_prev_cpu_nanos, 0);
  ASSERT_LT(get_perf_context()->iter_prev_cpu_nanos, kDummyAddonNanos);
  ASSERT_GE(env_->now_cpu_count_.load(), 12);
  ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos);

  SetPerfLevel(PerfLevel::kDisable);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
  delete iter;
}
#endif  // OS_LINUX

2570 #if !defined OS_SOLARIS
TEST_F(DBTest2,PersistentCache)2571 TEST_F(DBTest2, PersistentCache) {
2572   int num_iter = 80;
2573 
2574   Options options;
2575   options.write_buffer_size = 64 * 1024;  // small write buffer
2576   options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
2577   options = CurrentOptions(options);
2578 
2579   auto bsizes = {/*no block cache*/ 0, /*1M*/ 1 * 1024 * 1024};
2580   auto types = {/*compressed*/ 1, /*uncompressed*/ 0};
2581   for (auto bsize : bsizes) {
2582     for (auto type : types) {
2583       BlockBasedTableOptions table_options;
2584       table_options.persistent_cache.reset(
2585           new MockPersistentCache(type, 10 * 1024));
2586       table_options.no_block_cache = true;
2587       table_options.block_cache = bsize ? NewLRUCache(bsize) : nullptr;
2588       table_options.block_cache_compressed = nullptr;
2589       options.table_factory.reset(NewBlockBasedTableFactory(table_options));
2590 
2591       DestroyAndReopen(options);
2592       CreateAndReopenWithCF({"pikachu"}, options);
2593       // default column family doesn't have block cache
2594       Options no_block_cache_opts;
2595       no_block_cache_opts.statistics = options.statistics;
2596       no_block_cache_opts = CurrentOptions(no_block_cache_opts);
2597       BlockBasedTableOptions table_options_no_bc;
2598       table_options_no_bc.no_block_cache = true;
2599       no_block_cache_opts.table_factory.reset(
2600           NewBlockBasedTableFactory(table_options_no_bc));
2601       ReopenWithColumnFamilies(
2602           {"default", "pikachu"},
2603           std::vector<Options>({no_block_cache_opts, options}));
2604 
2605       Random rnd(301);
2606 
2607       // Write 8MB (80 values, each 100K)
2608       ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
2609       std::vector<std::string> values;
2610       std::string str;
2611       for (int i = 0; i < num_iter; i++) {
2612         if (i % 4 == 0) {  // high compression ratio
2613           str = rnd.RandomString(1000);
2614         }
2615         values.push_back(str);
2616         ASSERT_OK(Put(1, Key(i), values[i]));
2617       }
2618 
      // Flush all data from the memtable so that reads go to the SST files
      // (and thus the persistent cache) rather than the memtable.
2620       ASSERT_OK(Flush(1));
2621 
2622       for (int i = 0; i < num_iter; i++) {
2623         ASSERT_EQ(Get(1, Key(i)), values[i]);
2624       }
2625 
2626       auto hit = options.statistics->getTickerCount(PERSISTENT_CACHE_HIT);
2627       auto miss = options.statistics->getTickerCount(PERSISTENT_CACHE_MISS);
2628 
2629       ASSERT_GT(hit, 0);
2630       ASSERT_GT(miss, 0);
2631     }
2632   }
2633 }
2634 #endif  // !defined OS_SOLARIS
2635 
2636 namespace {
void CountSyncPoint() {
2638   TEST_SYNC_POINT_CALLBACK("DBTest2::MarkedPoint", nullptr /* arg */);
2639 }
2640 }  // namespace
2641 
TEST_F(DBTest2, SyncPointMarker) {
2643   std::atomic<int> sync_point_called(0);
2644   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
2645       "DBTest2::MarkedPoint",
2646       [&](void* /*arg*/) { sync_point_called.fetch_add(1); });
2647 
  // The dependency forces thread 2's MarkedPoint to wait for Thread1First,
  // so thread 1 runs its MarkedPoint (and Thread1First) before thread 2's
  // MarkedPoint. The marker pair restricts the MarkedPoint callback to the
  // thread that passed Marker (thread 2), so thread 1's MarkedPoint callback
  // is disabled.
2650   // Execution order:
2651   // |   Thread 1    |  Thread 2   |
2652   // |               |   Marker    |
2653   // |  MarkedPoint  |             |
2654   // | Thread1First  |             |
2655   // |               | MarkedPoint |
2656   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependencyAndMarkers(
2657       {{"DBTest2::SyncPointMarker:Thread1First", "DBTest2::MarkedPoint"}},
2658       {{"DBTest2::SyncPointMarker:Marker", "DBTest2::MarkedPoint"}});
2659 
2660   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
2661 
2662   std::function<void()> func1 = [&]() {
2663     CountSyncPoint();
2664     TEST_SYNC_POINT("DBTest2::SyncPointMarker:Thread1First");
2665   };
2666 
2667   std::function<void()> func2 = [&]() {
2668     TEST_SYNC_POINT("DBTest2::SyncPointMarker:Marker");
2669     CountSyncPoint();
2670   };
2671 
2672   auto thread1 = port::Thread(func1);
2673   auto thread2 = port::Thread(func2);
2674   thread1.join();
2675   thread2.join();
2676 
2677   // Callback is only executed once
2678   ASSERT_EQ(sync_point_called.load(), 1);
2679   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
2680 }
#endif  // ROCKSDB_LITE
2682 
size_t GetEncodedEntrySize(size_t key_size, size_t value_size) {
2684   std::string buffer;
2685 
2686   PutVarint32(&buffer, static_cast<uint32_t>(0));
2687   PutVarint32(&buffer, static_cast<uint32_t>(key_size));
2688   PutVarint32(&buffer, static_cast<uint32_t>(value_size));
2689 
2690   return buffer.size() + key_size + value_size;
2691 }
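
// A rough sketch of the entry layout the helper above mirrors (block-based
// table with delta encoding disabled, so shared_key_len is always 0):
//
//   varint32(shared_key_len = 0) | varint32(key_size) | varint32(value_size)
//   | key bytes | value bytes
//
// For example, key_size = 16 and value_size = 100 encode as three 1-byte
// varints plus the 116 payload bytes, i.e. 119 bytes total.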
2692 
TEST_F(DBTest2, ReadAmpBitmap) {
2694   Options options = CurrentOptions();
2695   BlockBasedTableOptions bbto;
2696   uint32_t bytes_per_bit[2] = {1, 16};
2697   for (size_t k = 0; k < 2; k++) {
2698     // Disable delta encoding to make it easier to calculate read amplification
2699     bbto.use_delta_encoding = false;
2700     // Huge block cache to make it easier to calculate read amplification
2701     bbto.block_cache = NewLRUCache(1024 * 1024 * 1024);
2702     bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
2703     options.table_factory.reset(NewBlockBasedTableFactory(bbto));
2704     options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
2705     DestroyAndReopen(options);
2706 
2707     const size_t kNumEntries = 10000;
2708 
2709     Random rnd(301);
2710     for (size_t i = 0; i < kNumEntries; i++) {
2711       ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(100)));
2712     }
2713     ASSERT_OK(Flush());
2714 
2715     Close();
2716     Reopen(options);
2717 
2718     // Read keys/values randomly and verify that reported read amp error
2719     // is less than 2%
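    // Here read amp = READ_AMP_ESTIMATE_USEFUL_BYTES / READ_AMP_TOTAL_READ_BYTES,
    // i.e. bytes actually consumed by reads over bytes loaded into the block
    // cache. With read_amp_bytes_per_bit = 1 usefulness is tracked exactly;
    // with 16 it is tracked at 16-byte granularity, hence the error tolerance.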
2720     uint64_t total_useful_bytes = 0;
2721     std::set<int> read_keys;
2722     std::string value;
2723     for (size_t i = 0; i < kNumEntries * 5; i++) {
2724       int key_idx = rnd.Next() % kNumEntries;
2725       std::string key = Key(key_idx);
2726       ASSERT_OK(db_->Get(ReadOptions(), key, &value));
2727 
2728       if (read_keys.find(key_idx) == read_keys.end()) {
2729         auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
2730         total_useful_bytes +=
2731             GetEncodedEntrySize(internal_key.size(), value.size());
2732         read_keys.insert(key_idx);
2733       }
2734 
2735       double expected_read_amp =
2736           static_cast<double>(total_useful_bytes) /
2737           options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
2738 
2739       double read_amp =
2740           static_cast<double>(options.statistics->getTickerCount(
2741               READ_AMP_ESTIMATE_USEFUL_BYTES)) /
2742           options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
2743 
2744       double error_pct = fabs(expected_read_amp - read_amp) * 100;
2745       // Error between reported read amp and real read amp should be less than
2746       // 2%
2747       EXPECT_LE(error_pct, 2);
2748     }
2749 
    // Make sure we read everything in the DB (which is smaller than our cache)
2751     Iterator* iter = db_->NewIterator(ReadOptions());
2752     for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
2753       ASSERT_EQ(iter->value().ToString(), Get(iter->key().ToString()));
2754     }
2755     ASSERT_OK(iter->status());
2756     delete iter;
2757 
    // Read amp is on average 100% since we read everything we loaded into memory
2759     if (k == 0) {
2760       ASSERT_EQ(
2761           options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES),
2762           options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES));
2763     } else {
2764       ASSERT_NEAR(
2765           options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES) *
2766               1.0f /
2767               options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES),
2768           1, .01);
2769     }
2770   }
2771 }
2772 
2773 #ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented
TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) {
2775   {
2776     const int kIdBufLen = 100;
2777     char id_buf[kIdBufLen];
2778     Status s = Status::NotSupported();
2779 #ifndef OS_WIN
    // You can't open a directory on Windows using a random access file
2781     std::unique_ptr<RandomAccessFile> file;
2782     s = env_->NewRandomAccessFile(dbname_, &file, EnvOptions());
2783     if (s.ok()) {
2784       if (file->GetUniqueId(id_buf, kIdBufLen) == 0) {
        // The fs holding the db directory doesn't support unique file IDs.
        // The test would fail because lru_cache would reload blocks that are
        // already cached, so skip it.
2788         return;
2789       }
2790     }
2791 #endif
2792     if (!s.ok()) {
2793       std::unique_ptr<Directory> dir;
2794       ASSERT_OK(env_->NewDirectory(dbname_, &dir));
2795       if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) {
        // The fs holding the db directory doesn't support unique file IDs.
        // The test would fail because lru_cache would reload blocks that are
        // already cached, so skip it.
2799         return;
2800       }
2801     }
2802   }
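  // From here on the test relies on block cache keys being stable across a
  // reopen (they are derived from the file's unique ID, which is why support
  // for it was probed above), so blocks loaded before Close() are found
  // again after Reopen().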
2803   uint32_t bytes_per_bit[2] = {1, 16};
2804   for (size_t k = 0; k < 2; k++) {
2805     std::shared_ptr<Cache> lru_cache = NewLRUCache(1024 * 1024 * 1024);
2806     std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
2807 
2808     Options options = CurrentOptions();
2809     BlockBasedTableOptions bbto;
2810     // Disable delta encoding to make it easier to calculate read amplification
2811     bbto.use_delta_encoding = false;
2812     // Huge block cache to make it easier to calculate read amplification
2813     bbto.block_cache = lru_cache;
2814     bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
2815     options.table_factory.reset(NewBlockBasedTableFactory(bbto));
2816     options.statistics = stats;
2817     DestroyAndReopen(options);
2818 
2819     const int kNumEntries = 10000;
2820 
2821     Random rnd(301);
2822     for (int i = 0; i < kNumEntries; i++) {
2823       ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
2824     }
2825     ASSERT_OK(Flush());
2826 
2827     Close();
2828     Reopen(options);
2829 
2830     uint64_t total_useful_bytes = 0;
2831     std::set<int> read_keys;
2832     std::string value;
2833     // Iter1: Read half the DB, Read even keys
2834     // Key(0), Key(2), Key(4), Key(6), Key(8), ...
2835     for (int i = 0; i < kNumEntries; i += 2) {
2836       std::string key = Key(i);
2837       ASSERT_OK(db_->Get(ReadOptions(), key, &value));
2838 
2839       if (read_keys.find(i) == read_keys.end()) {
2840         auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
2841         total_useful_bytes +=
2842             GetEncodedEntrySize(internal_key.size(), value.size());
2843         read_keys.insert(i);
2844       }
2845     }
2846 
2847     size_t total_useful_bytes_iter1 =
2848         options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
2849     size_t total_loaded_bytes_iter1 =
2850         options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
2851 
2852     Close();
2853     std::shared_ptr<Statistics> new_statistics =
2854         ROCKSDB_NAMESPACE::CreateDBStatistics();
2855     // Destroy old statistics obj that the blocks in lru_cache are pointing to
2856     options.statistics.reset();
2857     // Use the statistics object that we just created
2858     options.statistics = new_statistics;
2859     Reopen(options);
2860 
2861     // Iter2: Read half the DB, Read odd keys
2862     // Key(1), Key(3), Key(5), Key(7), Key(9), ...
2863     for (int i = 1; i < kNumEntries; i += 2) {
2864       std::string key = Key(i);
2865       ASSERT_OK(db_->Get(ReadOptions(), key, &value));
2866 
2867       if (read_keys.find(i) == read_keys.end()) {
2868         auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
2869         total_useful_bytes +=
2870             GetEncodedEntrySize(internal_key.size(), value.size());
2871         read_keys.insert(i);
2872       }
2873     }
2874 
2875     size_t total_useful_bytes_iter2 =
2876         options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
2877     size_t total_loaded_bytes_iter2 =
2878         options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
2879 
2880 
    // Read amp is on average 100% since we read everything we loaded into memory
2882     if (k == 0) {
2883       ASSERT_EQ(total_useful_bytes_iter1 + total_useful_bytes_iter2,
2884                 total_loaded_bytes_iter1 + total_loaded_bytes_iter2);
2885     } else {
2886       ASSERT_NEAR((total_useful_bytes_iter1 + total_useful_bytes_iter2) * 1.0f /
2887                       (total_loaded_bytes_iter1 + total_loaded_bytes_iter2),
2888                   1, .01);
2889     }
2890   }
2891 }
2892 #endif // !OS_SOLARIS
2893 
2894 #ifndef ROCKSDB_LITE
TEST_F(DBTest2, AutomaticCompactionOverlapManualCompaction) {
2896   Options options = CurrentOptions();
2897   options.num_levels = 3;
2898   options.IncreaseParallelism(20);
2899   DestroyAndReopen(options);
2900 
2901   ASSERT_OK(Put(Key(0), "a"));
2902   ASSERT_OK(Put(Key(5), "a"));
2903   ASSERT_OK(Flush());
2904 
2905   ASSERT_OK(Put(Key(10), "a"));
2906   ASSERT_OK(Put(Key(15), "a"));
2907   ASSERT_OK(Flush());
2908 
2909   CompactRangeOptions cro;
2910   cro.change_level = true;
2911   cro.target_level = 2;
2912   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
2913 
2914   auto get_stat = [](std::string level_str, LevelStatType type,
2915                      std::map<std::string, std::string> props) {
2916     auto prop_str =
2917         "compaction." + level_str + "." +
2918         InternalStats::compaction_level_stats.at(type).property_name.c_str();
2919     auto prop_item = props.find(prop_str);
2920     return prop_item == props.end() ? 0 : std::stod(prop_item->second);
2921   };
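  // This builds property keys of the form "compaction.<level>.<stat>", e.g.
  // "compaction.L2.NumFiles" (assuming NUM_FILES maps to the property name
  // "NumFiles" in InternalStats).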
2922 
2923   // Trivial move 2 files to L2
2924   ASSERT_EQ("0,0,2", FilesPerLevel());
  // Also test that the stats GetMapProperty API reports the same result
2926   {
2927     std::map<std::string, std::string> prop;
2928     ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
2929     ASSERT_EQ(0, get_stat("L0", LevelStatType::NUM_FILES, prop));
2930     ASSERT_EQ(0, get_stat("L1", LevelStatType::NUM_FILES, prop));
2931     ASSERT_EQ(2, get_stat("L2", LevelStatType::NUM_FILES, prop));
2932     ASSERT_EQ(2, get_stat("Sum", LevelStatType::NUM_FILES, prop));
2933   }
2934 
  // While the compaction is running, create 2 new files that fit in L2.
  // If they were trivially moved to L2 they would overlap with the running
  // compaction and break the LSM consistency.
2938   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
2939       "CompactionJob::Run():Start", [&](void* /*arg*/) {
2940         ASSERT_OK(
2941             dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
2942                                   {"max_bytes_for_level_base", "1"}}));
2943         ASSERT_OK(Put(Key(6), "a"));
2944         ASSERT_OK(Put(Key(7), "a"));
2945         ASSERT_OK(Flush());
2946 
2947         ASSERT_OK(Put(Key(8), "a"));
2948         ASSERT_OK(Put(Key(9), "a"));
2949         ASSERT_OK(Flush());
2950       });
2951   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
2952 
2953   // Run a manual compaction that will compact the 2 files in L2
2954   // into 1 file in L2
2955   cro.exclusive_manual_compaction = false;
2956   cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
2957   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
2958 
2959   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
2960 
  // Test that the stats GetMapProperty API reports 1 file in L2
2962   {
2963     std::map<std::string, std::string> prop;
2964     ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
2965     ASSERT_EQ(1, get_stat("L2", LevelStatType::NUM_FILES, prop));
2966   }
2967 }
2968 
TEST_F(DBTest2, ManualCompactionOverlapManualCompaction) {
2970   Options options = CurrentOptions();
2971   options.num_levels = 2;
2972   options.IncreaseParallelism(20);
2973   options.disable_auto_compactions = true;
2974   DestroyAndReopen(options);
2975 
2976   ASSERT_OK(Put(Key(0), "a"));
2977   ASSERT_OK(Put(Key(5), "a"));
2978   ASSERT_OK(Flush());
2979 
2980   ASSERT_OK(Put(Key(10), "a"));
2981   ASSERT_OK(Put(Key(15), "a"));
2982   ASSERT_OK(Flush());
2983 
2984   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
2985 
2986   // Trivial move 2 files to L1
2987   ASSERT_EQ("0,2", FilesPerLevel());
2988 
2989   std::function<void()> bg_manual_compact = [&]() {
2990     std::string k1 = Key(6);
2991     std::string k2 = Key(9);
2992     Slice k1s(k1);
2993     Slice k2s(k2);
2994     CompactRangeOptions cro;
2995     cro.exclusive_manual_compaction = false;
2996     ASSERT_OK(db_->CompactRange(cro, &k1s, &k2s));
2997   };
2998   ROCKSDB_NAMESPACE::port::Thread bg_thread;
2999 
  // While the compaction is running, create 2 new files that fit in L1.
  // If they were trivially moved to L1 they would overlap with the running
  // compaction and break the LSM consistency.
3003   std::atomic<bool> flag(false);
3004   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3005       "CompactionJob::Run():Start", [&](void* /*arg*/) {
3006         if (flag.exchange(true)) {
3007           // We want to make sure to call this callback only once
3008           return;
3009         }
3010         ASSERT_OK(Put(Key(6), "a"));
3011         ASSERT_OK(Put(Key(7), "a"));
3012         ASSERT_OK(Flush());
3013 
3014         ASSERT_OK(Put(Key(8), "a"));
3015         ASSERT_OK(Put(Key(9), "a"));
3016         ASSERT_OK(Flush());
3017 
3018         // Start a non-exclusive manual compaction in a bg thread
3019         bg_thread = port::Thread(bg_manual_compact);
        // This manual compaction conflicts with the other manual compaction,
        // so it should wait until the first compaction finishes
3022         env_->SleepForMicroseconds(1000000);
3023       });
3024   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3025 
3026   // Run a manual compaction that will compact the 2 files in L1
3027   // into 1 file in L1
3028   CompactRangeOptions cro;
3029   cro.exclusive_manual_compaction = false;
3030   cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
3031   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
3032   bg_thread.join();
3033 
3034   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3035 }
3036 
TEST_F(DBTest2, PausingManualCompaction1) {
3038   Options options = CurrentOptions();
3039   options.disable_auto_compactions = true;
3040   options.num_levels = 7;
3041 
3042   DestroyAndReopen(options);
3043   Random rnd(301);
3044   // Generate a file containing 10 keys.
3045   for (int i = 0; i < 10; i++) {
3046     ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
3047   }
3048   ASSERT_OK(Flush());
3049 
  // Generate another file containing the same keys
3051   for (int i = 0; i < 10; i++) {
3052     ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
3053   }
3054   ASSERT_OK(Flush());
3055 
3056   int manual_compactions_paused = 0;
3057   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3058       "CompactionJob::Run():PausingManualCompaction:1", [&](void* arg) {
3059         auto paused = static_cast<std::atomic<int>*>(arg);
3060         ASSERT_EQ(0, paused->load(std::memory_order_acquire));
3061         paused->fetch_add(1, std::memory_order_release);
3062         manual_compactions_paused += 1;
3063       });
3064   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3065 
3066   std::vector<std::string> files_before_compact, files_after_compact;
3067   // Remember file name before compaction is triggered
3068   std::vector<LiveFileMetaData> files_meta;
3069   dbfull()->GetLiveFilesMetaData(&files_meta);
3070   for (auto file : files_meta) {
3071     files_before_compact.push_back(file.name);
3072   }
3073 
3074   // OK, now trigger a manual compaction
3075   ASSERT_TRUE(dbfull()
3076                   ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
3077                   .IsManualCompactionPaused());
3078 
3079   // Wait for compactions to get scheduled and stopped
3080   ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3081 
3082   // Get file names after compaction is stopped
3083   files_meta.clear();
3084   dbfull()->GetLiveFilesMetaData(&files_meta);
3085   for (auto file : files_meta) {
3086     files_after_compact.push_back(file.name);
3087   }
3088 
  // It should look as if nothing happened
3090   ASSERT_EQ(files_before_compact, files_after_compact);
3091   ASSERT_EQ(manual_compactions_paused, 1);
3092 
3093   manual_compactions_paused = 0;
  // Now make sure CompactFiles does not run either
3095   ASSERT_TRUE(dbfull()
3096                   ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
3097                                  files_before_compact, 0)
3098                   .IsManualCompactionPaused());
3099   // Wait for manual compaction to get scheduled and finish
3100   ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3101 
3102   files_meta.clear();
3103   files_after_compact.clear();
3104   dbfull()->GetLiveFilesMetaData(&files_meta);
3105   for (auto file : files_meta) {
3106     files_after_compact.push_back(file.name);
3107   }
3108 
3109   ASSERT_EQ(files_before_compact, files_after_compact);
3110   // CompactFiles returns at entry point
3111   ASSERT_EQ(manual_compactions_paused, 0);
3112 
3113   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3114 }
3115 
3116 // PausingManualCompaction does not affect auto compaction
TEST_F(DBTest2, PausingManualCompaction2) {
3118   Options options = CurrentOptions();
3119   options.level0_file_num_compaction_trigger = 2;
3120   options.disable_auto_compactions = false;
3121 
3122   DestroyAndReopen(options);
3123   dbfull()->DisableManualCompaction();
3124 
3125   Random rnd(301);
3126   for (int i = 0; i < 2; i++) {
    // Generate a file containing 100 keys.
3128     for (int j = 0; j < 100; j++) {
3129       ASSERT_OK(Put(Key(j), rnd.RandomString(50)));
3130     }
3131     ASSERT_OK(Flush());
3132   }
3133   ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3134 
3135   std::vector<LiveFileMetaData> files_meta;
3136   dbfull()->GetLiveFilesMetaData(&files_meta);
3137   ASSERT_EQ(files_meta.size(), 1);
3138 }
3139 
TEST_F(DBTest2, PausingManualCompaction3) {
3141   CompactRangeOptions compact_options;
3142   Options options = CurrentOptions();
3143   options.disable_auto_compactions = true;
3144   options.num_levels = 7;
3145 
3146   Random rnd(301);
3147   auto generate_files = [&]() {
3148     for (int i = 0; i < options.num_levels; i++) {
3149       for (int j = 0; j < options.num_levels - i + 1; j++) {
3150         for (int k = 0; k < 1000; k++) {
3151           ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
3152         }
3153         ASSERT_OK(Flush());
3154       }
3155 
3156       for (int l = 1; l < options.num_levels - i; l++) {
3157         MoveFilesToLevel(l);
3158       }
3159     }
3160   };
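  // Resulting shape: for i = 0..6 this flushes (8 - i) files and moves them
  // up to level (6 - i), so levels L0..L6 end up with 2,3,4,5,6,7,8 files,
  // matching the FilesPerLevel() check below.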
3161 
3162   DestroyAndReopen(options);
3163   generate_files();
3164 #ifndef ROCKSDB_LITE
3165   ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
3166 #endif  // !ROCKSDB_LITE
3167   int run_manual_compactions = 0;
3168   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3169       "CompactionJob::Run():PausingManualCompaction:1",
3170       [&](void* /*arg*/) { run_manual_compactions++; });
3171   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3172 
3173   dbfull()->DisableManualCompaction();
3174   ASSERT_TRUE(dbfull()
3175                   ->CompactRange(compact_options, nullptr, nullptr)
3176                   .IsManualCompactionPaused());
3177   ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
  // Since manual compaction is disabled, the sync point is never reached
3179   ASSERT_EQ(run_manual_compactions, 0);
3180 #ifndef ROCKSDB_LITE
3181   ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
3182 #endif  // !ROCKSDB_LITE
3183 
3184   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3185       "CompactionJob::Run():PausingManualCompaction:1");
3186   dbfull()->EnableManualCompaction();
3187   ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
3188   ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3189 #ifndef ROCKSDB_LITE
3190   ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
3191 #endif  // !ROCKSDB_LITE
3192 
3193   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3194 }
3195 
TEST_F(DBTest2, PausingManualCompaction4) {
3197   CompactRangeOptions compact_options;
3198   Options options = CurrentOptions();
3199   options.disable_auto_compactions = true;
3200   options.num_levels = 7;
3201 
3202   Random rnd(301);
3203   auto generate_files = [&]() {
3204     for (int i = 0; i < options.num_levels; i++) {
3205       for (int j = 0; j < options.num_levels - i + 1; j++) {
3206         for (int k = 0; k < 1000; k++) {
3207           ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
3208         }
3209         ASSERT_OK(Flush());
3210       }
3211 
3212       for (int l = 1; l < options.num_levels - i; l++) {
3213         MoveFilesToLevel(l);
3214       }
3215     }
3216   };
3217 
3218   DestroyAndReopen(options);
3219   generate_files();
3220 #ifndef ROCKSDB_LITE
3221   ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
3222 #endif  // !ROCKSDB_LITE
3223   int run_manual_compactions = 0;
3224   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3225       "CompactionJob::Run():PausingManualCompaction:2", [&](void* arg) {
3226         auto paused = static_cast<std::atomic<int>*>(arg);
3227         ASSERT_EQ(0, paused->load(std::memory_order_acquire));
3228         paused->fetch_add(1, std::memory_order_release);
3229         run_manual_compactions++;
3230       });
3231   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3232 
3233   ASSERT_TRUE(dbfull()
3234                   ->CompactRange(compact_options, nullptr, nullptr)
3235                   .IsManualCompactionPaused());
3236   ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3237   ASSERT_EQ(run_manual_compactions, 1);
3238 #ifndef ROCKSDB_LITE
3239   ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
3240 #endif  // !ROCKSDB_LITE
3241 
3242   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3243       "CompactionJob::Run():PausingManualCompaction:2");
3244   dbfull()->EnableManualCompaction();
3245   ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
3246   ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3247 #ifndef ROCKSDB_LITE
3248   ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
3249 #endif  // !ROCKSDB_LITE
3250 
3251   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3252 }
3253 
TEST_F(DBTest2, CancelManualCompaction1) {
3255   CompactRangeOptions compact_options;
3256   auto canceledPtr =
3257       std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
3258   compact_options.canceled = canceledPtr.get();
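  // compact_options.canceled points at a caller-owned atomic<bool>.
  // Initializing it to true means the first CompactRange() below is canceled
  // at its entry point, before any work is done.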
3259 
3260   Options options = CurrentOptions();
3261   options.disable_auto_compactions = true;
3262   options.num_levels = 7;
3263 
3264   Random rnd(301);
3265   auto generate_files = [&]() {
3266     for (int i = 0; i < options.num_levels; i++) {
3267       for (int j = 0; j < options.num_levels - i + 1; j++) {
3268         for (int k = 0; k < 1000; k++) {
3269           ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
3270         }
3271         ASSERT_OK(Flush());
3272       }
3273 
3274       for (int l = 1; l < options.num_levels - i; l++) {
3275         MoveFilesToLevel(l);
3276       }
3277     }
3278   };
3279 
3280   DestroyAndReopen(options);
3281   generate_files();
3282 #ifndef ROCKSDB_LITE
3283   ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
3284 #endif  // !ROCKSDB_LITE
3285 
3286   int run_manual_compactions = 0;
3287   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3288       "CompactionJob::Run():PausingManualCompaction:1",
3289       [&](void* /*arg*/) { run_manual_compactions++; });
3290   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3291 
  // Set up a callback to count how many manual compaction rounds run
3294   int compactions_run = 0;
3295   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3296       "DBImpl::RunManualCompaction()::1",
3297       [&](void* /*arg*/) { ++compactions_run; });
3298 
3299   ASSERT_TRUE(dbfull()
3300                   ->CompactRange(compact_options, nullptr, nullptr)
3301                   .IsManualCompactionPaused());
3302   ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3303 
  // compact_options.canceled starts out true, so the compaction is canceled
  // at its entry point and the compaction function never runs.
3306   ASSERT_EQ(compactions_run, 0);
3307   ASSERT_EQ(run_manual_compactions, 0);
3308 #ifndef ROCKSDB_LITE
3309   ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
3310 #endif  // !ROCKSDB_LITE
3311 
3312   compactions_run = 0;
3313   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3314       "DBImpl::RunManualCompaction()::1");
3315   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3316       "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) {
3317         ++compactions_run;
        // Cancel the manual compaction after 3 rounds
3319         if (compactions_run == 3) {
3320           compact_options.canceled->store(true, std::memory_order_release);
3321         }
3322       });
3323 
3324   compact_options.canceled->store(false, std::memory_order_release);
3325   ASSERT_TRUE(dbfull()
3326                   ->CompactRange(compact_options, nullptr, nullptr)
3327                   .IsManualCompactionPaused());
3328   ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3329 
3330   ASSERT_EQ(compactions_run, 3);
3331 
3332   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3333       "DBImpl::RunManualCompaction()::1");
3334   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3335       "CompactionJob::Run():PausingManualCompaction:1");
3336 
  // Compactions should work again if we re-enable them.
3338   compact_options.canceled->store(false, std::memory_order_relaxed);
3339   ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
3340   ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3341 #ifndef ROCKSDB_LITE
3342   ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
3343 #endif  // !ROCKSDB_LITE
3344 
3345   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3346 }
3347 
TEST_F(DBTest2, CancelManualCompaction2) {
3349   CompactRangeOptions compact_options;
3350   auto canceledPtr =
3351       std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
3352   compact_options.canceled = canceledPtr.get();
3353   compact_options.max_subcompactions = 1;
3354 
3355   Options options = CurrentOptions();
3356   options.disable_auto_compactions = true;
3357   options.num_levels = 7;
3358 
3359   Random rnd(301);
3360   auto generate_files = [&]() {
3361     for (int i = 0; i < options.num_levels; i++) {
3362       for (int j = 0; j < options.num_levels - i + 1; j++) {
3363         for (int k = 0; k < 1000; k++) {
3364           ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
3365         }
3366         ASSERT_OK(Flush());
3367       }
3368 
3369       for (int l = 1; l < options.num_levels - i; l++) {
3370         MoveFilesToLevel(l);
3371       }
3372     }
3373   };
3374 
3375   DestroyAndReopen(options);
3376   generate_files();
3377 #ifndef ROCKSDB_LITE
3378   ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
3379 #endif  // !ROCKSDB_LITE
3380 
3381   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3382 
3383   int compactions_run = 0;
3384   std::atomic<int> kv_compactions{0};
3385   int compactions_stopped_at = 0;
3386   int kv_compactions_stopped_at = 0;
3387   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3388       "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) {
3389         ++compactions_run;
        // This callback only counts rounds; the cancellation happens in the
        // ProcessKV callback below.
3391       });
3392 
3393   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3394       "CompactionIterator:ProcessKV", [&](void* /*arg*/) {
3395         int kv_compactions_run =
3396             kv_compactions.fetch_add(1, std::memory_order_release);
3397         if (kv_compactions_run == 5) {
3398           compact_options.canceled->store(true, std::memory_order_release);
3399           kv_compactions_stopped_at = kv_compactions_run;
3400           compactions_stopped_at = compactions_run;
3401         }
3402       });
3403 
3404   compact_options.canceled->store(false, std::memory_order_release);
3405   ASSERT_TRUE(dbfull()
3406                   ->CompactRange(compact_options, nullptr, nullptr)
3407                   .IsManualCompactionPaused());
3408   ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3409 
  // NOTE: as we set compact_options.max_subcompactions = 1, and store true to
  // the canceled variable from the single compacting thread (via callback),
  // this value is deterministically kv_compactions_stopped_at + 1.
3413   ASSERT_EQ(kv_compactions, kv_compactions_stopped_at + 1);
3414   ASSERT_EQ(compactions_run, compactions_stopped_at);
3415 
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
      "CompactionIterator:ProcessKV");
3418   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3419       "DBImpl::RunManualCompaction()::1");
3420   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3421       "CompactionJob::Run():PausingManualCompaction:1");
3422 
  // Compactions should work again if we re-enable them.
3424   compact_options.canceled->store(false, std::memory_order_relaxed);
3425   ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
3426   ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3427 #ifndef ROCKSDB_LITE
3428   ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
3429 #endif  // !ROCKSDB_LITE
3430 
3431   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3432 }
3433 
3434 class CancelCompactionListener : public EventListener {
3435  public:
  CancelCompactionListener()
3437       : num_compaction_started_(0), num_compaction_ended_(0) {}
3438 
  void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
3440     ASSERT_EQ(ci.cf_name, "default");
3441     ASSERT_EQ(ci.base_input_level, 0);
3442     num_compaction_started_++;
3443   }
3444 
  void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
3446     ASSERT_EQ(ci.cf_name, "default");
3447     ASSERT_EQ(ci.base_input_level, 0);
3448     ASSERT_EQ(ci.status.code(), code_);
3449     ASSERT_EQ(ci.status.subcode(), subcode_);
3450     num_compaction_ended_++;
3451   }
3452 
3453   std::atomic<size_t> num_compaction_started_;
3454   std::atomic<size_t> num_compaction_ended_;
3455   Status::Code code_;
3456   Status::SubCode subcode_;
3457 };
3458 
TEST_F(DBTest2, CancelManualCompactionWithListener) {
3460   CompactRangeOptions compact_options;
3461   auto canceledPtr =
3462       std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
3463   compact_options.canceled = canceledPtr.get();
3464   compact_options.max_subcompactions = 1;
3465 
3466   Options options = CurrentOptions();
3467   options.disable_auto_compactions = true;
3468   CancelCompactionListener* listener = new CancelCompactionListener();
3469   options.listeners.emplace_back(listener);
3470 
3471   DestroyAndReopen(options);
3472 
3473   Random rnd(301);
3474   for (int i = 0; i < 10; i++) {
3475     for (int j = 0; j < 10; j++) {
3476       ASSERT_OK(Put(Key(i + j * 10), rnd.RandomString(50)));
3477     }
3478     ASSERT_OK(Flush());
3479   }
3480 
3481   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3482 
3483   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3484       "CompactionIterator:ProcessKV", [&](void* /*arg*/) {
3485         compact_options.canceled->store(true, std::memory_order_release);
3486       });
3487 
3488   int running_compaction = 0;
3489   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3490       "CompactionJob::FinishCompactionOutputFile1",
3491       [&](void* /*arg*/) { running_compaction++; });
3492 
3493   // Case I: 1 Notify begin compaction, 2 DisableManualCompaction, 3 Compaction
3494   // not run, 4 Notify compaction end.
3495   listener->code_ = Status::kIncomplete;
3496   listener->subcode_ = Status::SubCode::kManualCompactionPaused;
3497 
3498   compact_options.canceled->store(false, std::memory_order_release);
3499   ASSERT_TRUE(dbfull()
3500                   ->CompactRange(compact_options, nullptr, nullptr)
3501                   .IsManualCompactionPaused());
3502   ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3503 
3504   ASSERT_GT(listener->num_compaction_started_, 0);
3505   ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
3506   ASSERT_EQ(running_compaction, 0);
3507 
3508   listener->num_compaction_started_ = 0;
3509   listener->num_compaction_ended_ = 0;
3510 
3511   // Case II: 1 DisableManualCompaction, 2 Notify begin compaction (return
3512   // without notifying), 3 Notify compaction end (return without notifying).
3513   ASSERT_TRUE(dbfull()
3514                   ->CompactRange(compact_options, nullptr, nullptr)
3515                   .IsManualCompactionPaused());
3516   ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3517 
3518   ASSERT_EQ(listener->num_compaction_started_, 0);
3519   ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
3520   ASSERT_EQ(running_compaction, 0);
3521 
  // Case III: 1 Notify begin compaction, 2 Compaction runs in between,
  // 3 DisableManualCompaction, 4 Notify compaction end.
3524   // compact_options.canceled->store(false, std::memory_order_release);
3525   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3526       "CompactionIterator:ProcessKV");
3527 
3528   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3529       "CompactionJob::Run:BeforeVerify", [&](void* /*arg*/) {
3530         compact_options.canceled->store(true, std::memory_order_release);
3531       });
3532 
3533   listener->code_ = Status::kOk;
3534   listener->subcode_ = Status::SubCode::kNone;
3535 
3536   compact_options.canceled->store(false, std::memory_order_release);
3537   ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
3538   ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3539 
3540   ASSERT_GT(listener->num_compaction_started_, 0);
3541   ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
3542 
3543   // Compaction job will succeed.
3544   ASSERT_GT(running_compaction, 0);
3545 
3546   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
3547   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3548 }
3549 
TEST_F(DBTest2, CompactionOnBottomPriorityWithListener) {
3551   int num_levels = 3;
3552   const int kNumFilesTrigger = 4;
3553 
3554   Options options = CurrentOptions();
3555   env_->SetBackgroundThreads(0, Env::Priority::HIGH);
3556   env_->SetBackgroundThreads(0, Env::Priority::LOW);
3557   env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
3558   options.env = env_;
3559   options.compaction_style = kCompactionStyleUniversal;
3560   options.num_levels = num_levels;
3561   options.write_buffer_size = 100 << 10;     // 100KB
3562   options.target_file_size_base = 32 << 10;  // 32KB
3563   options.level0_file_num_compaction_trigger = kNumFilesTrigger;
3564   // Trigger compaction if size amplification exceeds 110%
3565   options.compaction_options_universal.max_size_amplification_percent = 110;
3566 
3567   CancelCompactionListener* listener = new CancelCompactionListener();
3568   options.listeners.emplace_back(listener);
3569 
3570   DestroyAndReopen(options);
3571 
3572   int num_bottom_thread_compaction_scheduled = 0;
3573   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3574 
3575   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3576       "DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
3577       [&](void* /*arg*/) { num_bottom_thread_compaction_scheduled++; });
3578 
3579   int num_compaction_jobs = 0;
3580   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3581       "CompactionJob::Run():End",
3582       [&](void* /*arg*/) { num_compaction_jobs++; });
3583 
3584   listener->code_ = Status::kOk;
3585   listener->subcode_ = Status::SubCode::kNone;
3586 
3587   Random rnd(301);
3588   for (int i = 0; i < 1; ++i) {
3589     for (int num = 0; num < kNumFilesTrigger; num++) {
3590       int key_idx = 0;
3591       GenerateNewFile(&rnd, &key_idx, true /* no_wait */);
      // Use no_wait above because GenerateNewFile() otherwise waits for both
      // flush and compaction. We don't want to wait for compaction because
      // the full compaction is intentionally blocked while more files are
      // flushed.
3595       ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
3596     }
3597   }
3598   ASSERT_OK(dbfull()->TEST_WaitForCompact());
3599   ASSERT_GT(num_bottom_thread_compaction_scheduled, 0);
3600   ASSERT_EQ(num_compaction_jobs, 1);
3601   ASSERT_GT(listener->num_compaction_started_, 0);
3602   ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
3603 
3604   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
3605   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3606 }
3607 
TEST_F(DBTest2, OptimizeForPointLookup) {
3609   Options options = CurrentOptions();
3610   Close();
3611   options.OptimizeForPointLookup(2);
3612   ASSERT_OK(DB::Open(options, dbname_, &db_));
3613 
3614   ASSERT_OK(Put("foo", "v1"));
3615   ASSERT_EQ("v1", Get("foo"));
3616   ASSERT_OK(Flush());
3617   ASSERT_EQ("v1", Get("foo"));
3618 }
3619 
TEST_F(DBTest2, OptimizeForSmallDB) {
3621   Options options = CurrentOptions();
3622   Close();
3623   options.OptimizeForSmallDb();
3624 
3625   // Find the cache object
3626   ASSERT_TRUE(options.table_factory->IsInstanceOf(
3627       TableFactory::kBlockBasedTableName()));
3628   auto table_options =
3629       options.table_factory->GetOptions<BlockBasedTableOptions>();
3630 
3631   ASSERT_TRUE(table_options != nullptr);
3632   std::shared_ptr<Cache> cache = table_options->block_cache;
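  // OptimizeForSmallDb() (with no cache argument) creates this block cache
  // itself and, presumably via a WriteBufferManager, also charges memtable
  // memory to it; the usage checks below rely on that charging.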
3633 
3634   ASSERT_EQ(0, cache->GetUsage());
3635   ASSERT_OK(DB::Open(options, dbname_, &db_));
3636   ASSERT_OK(Put("foo", "v1"));
3637 
  // The memtable's memory is charged to the block cache
3639   ASSERT_NE(0, cache->GetUsage());
3640 
3641   ASSERT_EQ("v1", Get("foo"));
3642   ASSERT_OK(Flush());
3643 
3644   size_t prev_size = cache->GetUsage();
  // Remember the block cache size so that we can verify it grows after
  // the Get() below.
  // Use a PinnableSlice so that the block stays pinned and is not
  // evicted before we check the size.
3649   PinnableSlice value;
3650   ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), "foo", &value));
3651   ASSERT_GT(cache->GetUsage(), prev_size);
3652   value.Reset();
3653 }
3654 
3655 #endif  // ROCKSDB_LITE
3656 
TEST_F(DBTest2, IterRaceFlush1) {
3658   ASSERT_OK(Put("foo", "v1"));
3659 
3660   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
3661       {{"DBImpl::NewIterator:1", "DBTest2::IterRaceFlush:1"},
3662        {"DBTest2::IterRaceFlush:2", "DBImpl::NewIterator:2"}});
3663 
3664   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3665 
3666   ROCKSDB_NAMESPACE::port::Thread t1([&] {
3667     TEST_SYNC_POINT("DBTest2::IterRaceFlush:1");
3668     ASSERT_OK(Put("foo", "v2"));
3669     ASSERT_OK(Flush());
3670     TEST_SYNC_POINT("DBTest2::IterRaceFlush:2");
3671   });
3672 
  // The iterator is created after the first Put(), and its snapshot sequence
  // is assigned after the second Put(), so it must see v2.
3675   {
3676     std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
3677     it->Seek("foo");
3678     ASSERT_TRUE(it->Valid());
3679     ASSERT_OK(it->status());
3680     ASSERT_EQ("foo", it->key().ToString());
3681     ASSERT_EQ("v2", it->value().ToString());
3682   }
3683 
3684   t1.join();
3685   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3686 }
3687 
TEST_F(DBTest2, IterRaceFlush2) {
3689   ASSERT_OK(Put("foo", "v1"));
3690 
3691   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
3692       {{"DBImpl::NewIterator:3", "DBTest2::IterRaceFlush2:1"},
3693        {"DBTest2::IterRaceFlush2:2", "DBImpl::NewIterator:4"}});
3694 
3695   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3696 
3697   ROCKSDB_NAMESPACE::port::Thread t1([&] {
3698     TEST_SYNC_POINT("DBTest2::IterRaceFlush2:1");
3699     ASSERT_OK(Put("foo", "v2"));
3700     ASSERT_OK(Flush());
3701     TEST_SYNC_POINT("DBTest2::IterRaceFlush2:2");
3702   });
3703 
  // The iterator is created after the first Put(), and its snapshot sequence
  // is assigned before the second Put(), thus it must see v1.
3706   {
3707     std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
3708     it->Seek("foo");
3709     ASSERT_TRUE(it->Valid());
3710     ASSERT_OK(it->status());
3711     ASSERT_EQ("foo", it->key().ToString());
3712     ASSERT_EQ("v1", it->value().ToString());
3713   }
3714 
3715   t1.join();
3716   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3717 }
3718 
TEST_F(DBTest2, IterRefreshRaceFlush) {
3720   ASSERT_OK(Put("foo", "v1"));
3721 
3722   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
3723       {{"ArenaWrappedDBIter::Refresh:1", "DBTest2::IterRefreshRaceFlush:1"},
3724        {"DBTest2::IterRefreshRaceFlush:2", "ArenaWrappedDBIter::Refresh:2"}});
3725 
3726   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3727 
3728   ROCKSDB_NAMESPACE::port::Thread t1([&] {
3729     TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:1");
3730     ASSERT_OK(Put("foo", "v2"));
3731     ASSERT_OK(Flush());
3732     TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:2");
3733   });
3734 
  // The iterator is refreshed after the first Put(), and its sequence number
  // is assigned after the second Put(), thus it must see v2.
3737   {
3738     std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
3739     ASSERT_OK(it->status());
3740     ASSERT_OK(it->Refresh());
3741     it->Seek("foo");
3742     ASSERT_TRUE(it->Valid());
3743     ASSERT_OK(it->status());
3744     ASSERT_EQ("foo", it->key().ToString());
3745     ASSERT_EQ("v2", it->value().ToString());
3746   }
3747 
3748   t1.join();
3749   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3750 }
3751 
TEST_F(DBTest2, GetRaceFlush1) {
3753   ASSERT_OK(Put("foo", "v1"));
3754 
3755   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
3756       {{"DBImpl::GetImpl:1", "DBTest2::GetRaceFlush:1"},
3757        {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:2"}});
3758 
3759   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3760 
3761   ROCKSDB_NAMESPACE::port::Thread t1([&] {
3762     TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
3763     ASSERT_OK(Put("foo", "v2"));
3764     ASSERT_OK(Flush());
3765     TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
3766   });
3767 
3768   // Get() is issued after the first Put(), so it should see either
3769   // "v1" or "v2".
3770   ASSERT_NE("NOT_FOUND", Get("foo"));
3771   t1.join();
3772   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3773 }
3774 
TEST_F(DBTest2, GetRaceFlush2) {
3776   ASSERT_OK(Put("foo", "v1"));
3777 
3778   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
3779       {{"DBImpl::GetImpl:3", "DBTest2::GetRaceFlush:1"},
3780        {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:4"}});
3781 
3782   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3783 
3784   port::Thread t1([&] {
3785     TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
3786     ASSERT_OK(Put("foo", "v2"));
3787     ASSERT_OK(Flush());
3788     TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
3789   });
3790 
3791   // Get() is issued after the first Put(), so it should see either
3792   // "v1" or "v2".
3793   ASSERT_NE("NOT_FOUND", Get("foo"));
3794   t1.join();
3795   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3796 }
3797 
TEST_F(DBTest2, DirectIO) {
3799   if (!IsDirectIOSupported()) {
3800     return;
3801   }
3802   Options options = CurrentOptions();
3803   options.use_direct_reads = options.use_direct_io_for_flush_and_compaction =
3804       true;
3805   options.allow_mmap_reads = options.allow_mmap_writes = false;
3806   DestroyAndReopen(options);
3807 
3808   ASSERT_OK(Put(Key(0), "a"));
3809   ASSERT_OK(Put(Key(5), "a"));
3810   ASSERT_OK(Flush());
3811 
3812   ASSERT_OK(Put(Key(10), "a"));
3813   ASSERT_OK(Put(Key(15), "a"));
3814   ASSERT_OK(Flush());
3815 
3816   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
3817   Reopen(options);
3818 }
3819 
TEST_F(DBTest2, MemtableOnlyIterator) {
3821   Options options = CurrentOptions();
3822   CreateAndReopenWithCF({"pikachu"}, options);
3823 
3824   ASSERT_OK(Put(1, "foo", "first"));
3825   ASSERT_OK(Put(1, "bar", "second"));
3826 
3827   ReadOptions ropt;
3828   ropt.read_tier = kMemtableTier;
3829   std::string value;
3830   Iterator* it = nullptr;
3831 
3832   // Before flushing
3833   // point lookups
3834   ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
3835   ASSERT_EQ("first", value);
3836   ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
3837   ASSERT_EQ("second", value);
3838 
3839   // Memtable-only iterator (read_tier=kMemtableTier); data not flushed yet.
3840   it = db_->NewIterator(ropt, handles_[1]);
3841   int count = 0;
3842   for (it->SeekToFirst(); it->Valid(); it->Next()) {
3843     ASSERT_TRUE(it->Valid());
3844     count++;
3845   }
3846   ASSERT_TRUE(!it->Valid());
3847   ASSERT_EQ(2, count);
3848   delete it;
3849 
  ASSERT_OK(Flush(1));
3851 
3852   // After flushing
3853   // point lookups
3854   ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
3855   ASSERT_EQ("first", value);
3856   ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
3857   ASSERT_EQ("second", value);
  // Nothing should be returned using the memtable-only iterator after flushing.
3859   it = db_->NewIterator(ropt, handles_[1]);
3860   ASSERT_OK(it->status());
3861   count = 0;
3862   for (it->SeekToFirst(); it->Valid(); it->Next()) {
3863     ASSERT_TRUE(it->Valid());
3864     count++;
3865   }
3866   ASSERT_TRUE(!it->Valid());
3867   ASSERT_EQ(0, count);
3868   ASSERT_OK(it->status());
3869   delete it;
3870 
3871   // Add a key to memtable
3872   ASSERT_OK(Put(1, "foobar", "third"));
3873   it = db_->NewIterator(ropt, handles_[1]);
3874   ASSERT_OK(it->status());
3875   count = 0;
3876   for (it->SeekToFirst(); it->Valid(); it->Next()) {
3877     ASSERT_TRUE(it->Valid());
3878     ASSERT_EQ("foobar", it->key().ToString());
3879     ASSERT_EQ("third", it->value().ToString());
3880     count++;
3881   }
3882   ASSERT_TRUE(!it->Valid());
3883   ASSERT_EQ(1, count);
3884   ASSERT_OK(it->status());
3885   delete it;
3886 }
3887 
TEST_F(DBTest2, LowPriWrite) {
3889   Options options = CurrentOptions();
  // With a trigger of 4, the 6 L0 files written below create compaction
  // pressure.
3891   options.level0_file_num_compaction_trigger = 4;
3892   options.level0_slowdown_writes_trigger = 12;
3893   options.level0_stop_writes_trigger = 30;
3894   options.delayed_write_rate = 8 * 1024 * 1024;
3895   Reopen(options);
3896 
3897   std::atomic<int> rate_limit_count(0);
3898 
3899   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3900       "GenericRateLimiter::Request:1", [&](void* arg) {
3901         rate_limit_count.fetch_add(1);
3902         int64_t* rate_bytes_per_sec = static_cast<int64_t*>(arg);
3903         ASSERT_EQ(1024 * 1024, *rate_bytes_per_sec);
3904       });
3905   // Block compaction
3906   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
3907       {"DBTest.LowPriWrite:0", "DBImpl::BGWorkCompaction"},
3908   });
3909   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3910   WriteOptions wo;
3911   for (int i = 0; i < 6; i++) {
3912     wo.low_pri = false;
3913     ASSERT_OK(Put("", "", wo));
3914     wo.low_pri = true;
3915     ASSERT_OK(Put("", "", wo));
3916     ASSERT_OK(Flush());
3917   }
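  // By now there are 6 L0 files, above the compaction trigger of 4, so the
  // DB is under compaction pressure: the low-pri Put below should be
  // rate-limited, while normal-priority Puts never are.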
3918   ASSERT_EQ(0, rate_limit_count.load());
3919   wo.low_pri = true;
3920   ASSERT_OK(Put("", "", wo));
3921   ASSERT_EQ(1, rate_limit_count.load());
3922   wo.low_pri = false;
3923   ASSERT_OK(Put("", "", wo));
3924   ASSERT_EQ(1, rate_limit_count.load());
3925 
3926   TEST_SYNC_POINT("DBTest.LowPriWrite:0");
3927   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3928 
3929   ASSERT_OK(dbfull()->TEST_WaitForCompact());
3930   wo.low_pri = true;
3931   ASSERT_OK(Put("", "", wo));
3932   ASSERT_EQ(1, rate_limit_count.load());
3933   wo.low_pri = false;
3934   ASSERT_OK(Put("", "", wo));
3935   ASSERT_EQ(1, rate_limit_count.load());
3936 }
3937 
3938 #ifndef ROCKSDB_LITE
TEST_F(DBTest2, RateLimitedCompactionReads) {
  // The compaction input is roughly 512KB of data
3941   const int kNumKeysPerFile = 128;
3942   const int kBytesPerKey = 1024;
3943   const int kNumL0Files = 4;
3944 
3945   for (auto use_direct_io : {false, true}) {
3946     if (use_direct_io && !IsDirectIOSupported()) {
3947       continue;
3948     }
3949     Options options = CurrentOptions();
3950     options.compression = kNoCompression;
3951     options.level0_file_num_compaction_trigger = kNumL0Files;
3952     options.memtable_factory.reset(
3953         test::NewSpecialSkipListFactory(kNumKeysPerFile));
3954     options.new_table_reader_for_compaction_inputs = true;
3955     // takes roughly one second, split into 100 x 10ms intervals. Each interval
3956     // permits 5.12KB, which is smaller than the block size, so this test
3957     // exercises the code for chunking reads.
3958     options.rate_limiter.reset(NewGenericRateLimiter(
3959         static_cast<int64_t>(kNumL0Files * kNumKeysPerFile *
3960                              kBytesPerKey) /* rate_bytes_per_sec */,
3961         10 * 1000 /* refill_period_us */, 10 /* fairness */,
3962         RateLimiter::Mode::kReadsOnly));
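    // Arithmetic behind the comment above: rate = 4 files * 128 keys * 1KB
    // = 512KiB/s, and one 10ms refill grants 512KiB / 100 ~= 5.12KB, which
    // is smaller than the 16KiB block size configured below.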
3963     options.use_direct_reads = options.use_direct_io_for_flush_and_compaction =
3964         use_direct_io;
3965     BlockBasedTableOptions bbto;
3966     bbto.block_size = 16384;
3967     bbto.no_block_cache = true;
3968     options.table_factory.reset(NewBlockBasedTableFactory(bbto));
3969     DestroyAndReopen(options);
3970 
3971     for (int i = 0; i < kNumL0Files; ++i) {
3972       for (int j = 0; j <= kNumKeysPerFile; ++j) {
3973         ASSERT_OK(Put(Key(j), DummyString(kBytesPerKey)));
3974       }
3975       ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
3976       ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
3977     }
3978     ASSERT_OK(dbfull()->TEST_WaitForCompact());
3979     ASSERT_EQ(0, NumTableFilesAtLevel(0));
3980 
3981     ASSERT_EQ(0, options.rate_limiter->GetTotalBytesThrough(Env::IO_HIGH));
    // Should be slightly above 512KB due to non-data blocks read. 1MB was
    // arbitrarily chosen as the upper bound on the total bytes read.
3984     size_t rate_limited_bytes =
3985         options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW);
3986     // Include the explicit prefetch of the footer in direct I/O case.
3987     size_t direct_io_extra = use_direct_io ? 512 * 1024 : 0;
3988     ASSERT_GE(
3989         rate_limited_bytes,
3990         static_cast<size_t>(kNumKeysPerFile * kBytesPerKey * kNumL0Files));
3991     ASSERT_LT(
3992         rate_limited_bytes,
3993         static_cast<size_t>(2 * kNumKeysPerFile * kBytesPerKey * kNumL0Files +
3994                             direct_io_extra));
3995 
3996     Iterator* iter = db_->NewIterator(ReadOptions());
3997     ASSERT_OK(iter->status());
3998     for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
3999       ASSERT_EQ(iter->value().ToString(), DummyString(kBytesPerKey));
4000     }
4001     delete iter;
4002     // bytes read for user iterator shouldn't count against the rate limit.
4003     ASSERT_EQ(rate_limited_bytes,
4004               static_cast<size_t>(
4005                   options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW)));
4006   }
4007 }
4008 #endif  // ROCKSDB_LITE
4009 
// Make sure the DB can be reopened with a reduced number of levels, given
// that no file is on a level higher than the new num_levels.
TEST_F(DBTest2, ReduceLevel) {
4013   Options options;
4014   options.env = env_;
4015   options.disable_auto_compactions = true;
4016   options.num_levels = 7;
4017   Reopen(options);
4018   ASSERT_OK(Put("foo", "bar"));
4019   ASSERT_OK(Flush());
4020   MoveFilesToLevel(6);
4021 #ifndef ROCKSDB_LITE
4022   ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
4023 #endif  // !ROCKSDB_LITE
4024   CompactRangeOptions compact_options;
4025   compact_options.change_level = true;
4026   compact_options.target_level = 1;
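  // Moving the lone L6 file down to L1 ensures no file remains above the new
  // num_levels, so the reopen with num_levels = 3 below can succeed.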
4027   ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
4028 #ifndef ROCKSDB_LITE
4029   ASSERT_EQ("0,1", FilesPerLevel());
4030 #endif  // !ROCKSDB_LITE
4031   options.num_levels = 3;
4032   Reopen(options);
4033 #ifndef ROCKSDB_LITE
4034   ASSERT_EQ("0,1", FilesPerLevel());
4035 #endif  // !ROCKSDB_LITE
4036 }
4037 
4038 // Test that ReadCallback is actually used in both memtable and SST tables
4039 TEST_F(DBTest2, ReadCallbackTest) {
4040   Options options;
4041   options.disable_auto_compactions = true;
4042   options.num_levels = 7;
4043   options.env = env_;
4044   Reopen(options);
4045   std::vector<const Snapshot*> snapshots;
4046   // Try to create a DB with multiple levels and a memtable
4047   const std::string key = "foo";
4048   const std::string value = "bar";
4049   // This test assumes that the seq starts at 1 and increases by 1 after each
4050   // write batch of size 1. If that behavior changes, the test needs to be
4051   // updated as well.
4052   // TODO(myabandeh): update this test to use the seq number that is returned by
4053   // the DB instead of assuming what seq the DB used.
4054   int i = 1;
4055   for (; i < 10; i++) {
4056     ASSERT_OK(Put(key, value + std::to_string(i)));
4057     // Take a snapshot to avoid the value being removed during compaction
4058     auto snapshot = dbfull()->GetSnapshot();
4059     snapshots.push_back(snapshot);
4060   }
4061   ASSERT_OK(Flush());
4062   for (; i < 20; i++) {
4063     ASSERT_OK(Put(key, value + std::to_string(i)));
4064     // Take a snapshot to avoid the value being removed during compaction
4065     auto snapshot = dbfull()->GetSnapshot();
4066     snapshots.push_back(snapshot);
4067   }
4068   ASSERT_OK(Flush());
4069   MoveFilesToLevel(6);
4070 #ifndef ROCKSDB_LITE
4071   ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
4072 #endif  // !ROCKSDB_LITE
4073   for (; i < 30; i++) {
4074     ASSERT_OK(Put(key, value + std::to_string(i)));
4075     auto snapshot = dbfull()->GetSnapshot();
4076     snapshots.push_back(snapshot);
4077   }
4078   ASSERT_OK(Flush());
4079 #ifndef ROCKSDB_LITE
4080   ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel());
4081 #endif  // !ROCKSDB_LITE
4082   // And also add some values to the memtable
4083   for (; i < 40; i++) {
4084     ASSERT_OK(Put(key, value + std::to_string(i)));
4085     auto snapshot = dbfull()->GetSnapshot();
4086     snapshots.push_back(snapshot);
4087   }
4088 
4089   class TestReadCallback : public ReadCallback {
4090    public:
4091     explicit TestReadCallback(SequenceNumber snapshot)
4092         : ReadCallback(snapshot), snapshot_(snapshot) {}
4093     bool IsVisibleFullCheck(SequenceNumber seq) override {
4094       return seq <= snapshot_;
4095     }
4096 
4097    private:
4098     SequenceNumber snapshot_;
4099   };
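  // The callback emulates a snapshot read: only sequence numbers up to and
  // including snapshot_ are visible, so each lookup below must observe the
  // value written at exactly that sequence number.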
4100 
4101   for (int seq = 1; seq < i; seq++) {
4102     PinnableSlice pinnable_val;
4103     ReadOptions roptions;
4104     TestReadCallback callback(seq);
4105     bool dont_care = true;
4106     DBImpl::GetImplOptions get_impl_options;
4107     get_impl_options.column_family = dbfull()->DefaultColumnFamily();
4108     get_impl_options.value = &pinnable_val;
4109     get_impl_options.value_found = &dont_care;
4110     get_impl_options.callback = &callback;
4111     Status s = dbfull()->GetImpl(roptions, key, get_impl_options);
4112     ASSERT_TRUE(s.ok());
4113     // Assuming the DB increases seq by one after each Put, the value suffix
4114     // and the seq number must match since we also increment the value per Put.
4115     ASSERT_EQ(value + std::to_string(seq), pinnable_val.ToString());
4116   }
4117 
4118   for (auto snapshot : snapshots) {
4119     dbfull()->ReleaseSnapshot(snapshot);
4120   }
4121 }
4122 
4123 #ifndef ROCKSDB_LITE
4124 
4125 TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) {
4126   // Regression test for a race condition where an obsolete file is returned
4127   // to the user as a "live file" but then deleted, all while file deletions
4128   // are disabled.
4129   //
4130   // It happened like this:
4131   //
4132   // 1. [flush thread] Log file "x.log" found by FindObsoleteFiles
4133   // 2. [user thread] DisableFileDeletions, GetSortedWalFiles are called and the
4134   //    latter returned "x.log"
4135   // 3. [flush thread] PurgeObsoleteFiles deleted "x.log"
4136   // 4. [user thread] Reading "x.log" failed
4137   //
4138   // Unfortunately the only regression test I can come up with involves sleep.
4139   // We cannot set SyncPoints to repro since, once the fix is applied, the
4140   // SyncPoints would cause a deadlock as the repro's sequence of events is now
4141   // prohibited.
4142   //
4143   // Instead, if we sleep for a second between Find and Purge, and ensure the
4144   // read attempt happens after purge, then the sequence of events will almost
4145   // certainly happen on the old code.
4146   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
4147       {"DBImpl::BackgroundCallFlush:FilesFound",
4148        "DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"},
4149       {"DBImpl::PurgeObsoleteFiles:End",
4150        "DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured"},
4151   });
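  // Sleep for a second between FindObsoleteFiles and PurgeObsoleteFiles, as
  // described in the comment above, to widen the race window.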
4152   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
4153       "DBImpl::PurgeObsoleteFiles:Begin",
4154       [&](void* /*arg*/) { env_->SleepForMicroseconds(1000000); });
4155   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
4156 
4157   ASSERT_OK(Put("key", "val"));
4158   FlushOptions flush_opts;
4159   flush_opts.wait = false;
4160   db_->Flush(flush_opts);
4161   TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered");
4162 
4163   ASSERT_OK(db_->DisableFileDeletions());
4164   VectorLogPtr log_files;
4165   ASSERT_OK(db_->GetSortedWalFiles(log_files));
4166   TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured");
4167   for (const auto& log_file : log_files) {
4168     ASSERT_OK(env_->FileExists(LogFileName(dbname_, log_file->LogNumber())));
4169   }
4170 
4171   ASSERT_OK(db_->EnableFileDeletions());
4172   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
4173 }
4174 
4175 TEST_F(DBTest2, TestNumPread) {
4176   Options options = CurrentOptions();
4177   bool prefetch_supported =
4178       test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
4179   // disable block cache
4180   BlockBasedTableOptions table_options;
4181   table_options.no_block_cache = true;
4182   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
4183   Reopen(options);
4184   env_->count_random_reads_ = true;
4185   env_->random_file_open_counter_.store(0);
4186   ASSERT_OK(Put("bar", "foo"));
4187   ASSERT_OK(Put("foo", "bar"));
4188   ASSERT_OK(Flush());
4189   if (prefetch_supported) {
4190     // After flush, we'll open the file and read footer, meta block,
4191     // property block and index block.
4192     ASSERT_EQ(4, env_->random_read_counter_.Read());
4193   } else {
4194     // With prefetch not supported, we will do a single read into a buffer
4195     ASSERT_EQ(1, env_->random_read_counter_.Read());
4196   }
4197   ASSERT_EQ(1, env_->random_file_open_counter_.load());
4198 
4199   // One pread per normal data block read
4200   env_->random_file_open_counter_.store(0);
4201   env_->random_read_counter_.Reset();
4202   ASSERT_EQ("bar", Get("foo"));
4203   ASSERT_EQ(1, env_->random_read_counter_.Read());
4204   // All files are already opened.
4205   ASSERT_EQ(0, env_->random_file_open_counter_.load());
4206 
4207   env_->random_file_open_counter_.store(0);
4208   env_->random_read_counter_.Reset();
4209   ASSERT_OK(Put("bar2", "foo2"));
4210   ASSERT_OK(Put("foo2", "bar2"));
4211   ASSERT_OK(Flush());
4212   if (prefetch_supported) {
4213     // After flush, we'll open the file and read footer, meta block,
4214     // property block and index block.
4215     ASSERT_EQ(4, env_->random_read_counter_.Read());
4216   } else {
4217     // With prefetch not supported, we will do a single read into a buffer
4218     ASSERT_EQ(1, env_->random_read_counter_.Read());
4219   }
4220   ASSERT_EQ(1, env_->random_file_open_counter_.load());
4221 
4222   env_->random_file_open_counter_.store(0);
4223   env_->random_read_counter_.Reset();
4224   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
4225   if (prefetch_supported) {
4226     // Compaction needs two input blocks, which requires 2 preads, and
4227     // generates a new SST file which needs 4 preads (footer, meta block,
4228     // property block and index block). In total 6.
4229     ASSERT_EQ(6, env_->random_read_counter_.Read());
4230   } else {
4231     // With prefetch off, compaction needs two input blocks,
4232     // followed by a single buffered read.  In total 3.
4233     ASSERT_EQ(3, env_->random_read_counter_.Read());
4234   }
4235   // All compaction input files should have already been opened.
4236   ASSERT_EQ(1, env_->random_file_open_counter_.load());
4237 
4238   // One pread per normal data block read
4239   env_->random_file_open_counter_.store(0);
4240   env_->random_read_counter_.Reset();
4241   ASSERT_EQ("foo2", Get("bar2"));
4242   ASSERT_EQ(1, env_->random_read_counter_.Read());
4243   // SST files are already opened.
4244   ASSERT_EQ(0, env_->random_file_open_counter_.load());
4245 }
4246 
4247 class TraceExecutionResultHandler : public TraceRecordResult::Handler {
4248  public:
4249   TraceExecutionResultHandler() {}
4250   ~TraceExecutionResultHandler() override {}
4251 
4252   virtual Status Handle(const StatusOnlyTraceExecutionResult& result) override {
4253     if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
4254       return Status::InvalidArgument("Invalid timestamps.");
4255     }
4256     result.GetStatus().PermitUncheckedError();
4257     switch (result.GetTraceType()) {
4258       case kTraceWrite: {
4259         total_latency_ += result.GetLatency();
4260         cnt_++;
4261         writes_++;
4262         break;
4263       }
4264       default:
4265         return Status::Corruption("Type mismatch.");
4266     }
4267     return Status::OK();
4268   }
4269 
4270   virtual Status Handle(
4271       const SingleValueTraceExecutionResult& result) override {
4272     if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
4273       return Status::InvalidArgument("Invalid timestamps.");
4274     }
4275     result.GetStatus().PermitUncheckedError();
4276     switch (result.GetTraceType()) {
4277       case kTraceGet: {
4278         total_latency_ += result.GetLatency();
4279         cnt_++;
4280         gets_++;
4281         break;
4282       }
4283       default:
4284         return Status::Corruption("Type mismatch.");
4285     }
4286     return Status::OK();
4287   }
4288 
4289   virtual Status Handle(
4290       const MultiValuesTraceExecutionResult& result) override {
4291     if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
4292       return Status::InvalidArgument("Invalid timestamps.");
4293     }
4294     for (const Status& s : result.GetMultiStatus()) {
4295       s.PermitUncheckedError();
4296     }
4297     switch (result.GetTraceType()) {
4298       case kTraceMultiGet: {
4299         total_latency_ += result.GetLatency();
4300         cnt_++;
4301         multigets_++;
4302         break;
4303       }
4304       default:
4305         return Status::Corruption("Type mismatch.");
4306     }
4307     return Status::OK();
4308   }
4309 
4310   virtual Status Handle(const IteratorTraceExecutionResult& result) override {
4311     if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
4312       return Status::InvalidArgument("Invalid timestamps.");
4313     }
4314     result.GetStatus().PermitUncheckedError();
4315     switch (result.GetTraceType()) {
4316       case kTraceIteratorSeek:
4317       case kTraceIteratorSeekForPrev: {
4318         total_latency_ += result.GetLatency();
4319         cnt_++;
4320         seeks_++;
4321         break;
4322       }
4323       default:
4324         return Status::Corruption("Type mismatch.");
4325     }
4326     return Status::OK();
4327   }
4328 
4329   void Reset() {
4330     total_latency_ = 0;
4331     cnt_ = 0;
4332     writes_ = 0;
4333     gets_ = 0;
4334     seeks_ = 0;
4335     multigets_ = 0;
4336   }
4337 
4338   double GetAvgLatency() const {
4339     return cnt_ == 0 ? 0.0 : 1.0 * total_latency_ / cnt_;
4340   }
4341 
4342   int GetNumWrites() const { return writes_; }
4343 
4344   int GetNumGets() const { return gets_; }
4345 
4346   int GetNumIterSeeks() const { return seeks_; }
4347 
4348   int GetNumMultiGets() const { return multigets_; }
4349 
4350  private:
4351   std::atomic<uint64_t> total_latency_{0};
4352   std::atomic<uint32_t> cnt_{0};
4353   std::atomic<int> writes_{0};
4354   std::atomic<int> gets_{0};
4355   std::atomic<int> seeks_{0};
4356   std::atomic<int> multigets_{0};
4357 };
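// TraceExecutionResultHandler is driven through TraceRecordResult::Accept()
// in the replay tests below, both from the Replayer::Replay() result callback
// and from the manual Replayer::Next()/Execute() loops.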
4358 
4359 TEST_F(DBTest2, TraceAndReplay) {
4360   Options options = CurrentOptions();
4361   options.merge_operator = MergeOperators::CreatePutOperator();
4362   ReadOptions ro;
4363   WriteOptions wo;
4364   TraceOptions trace_opts;
4365   EnvOptions env_opts;
4366   CreateAndReopenWithCF({"pikachu"}, options);
4367   Random rnd(301);
4368   Iterator* single_iter = nullptr;
4369 
4370   ASSERT_TRUE(db_->EndTrace().IsIOError());
4371 
4372   std::string trace_filename = dbname_ + "/rocksdb.trace";
4373   std::unique_ptr<TraceWriter> trace_writer;
4374   ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
4375   ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
4376 
4377   // 5 Writes
4378   ASSERT_OK(Put(0, "a", "1"));
4379   ASSERT_OK(Merge(0, "b", "2"));
4380   ASSERT_OK(Delete(0, "c"));
4381   ASSERT_OK(SingleDelete(0, "d"));
4382   ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
4383 
4384   // 6th Write
4385   WriteBatch batch;
4386   ASSERT_OK(batch.Put("f", "11"));
4387   ASSERT_OK(batch.Merge("g", "12"));
4388   ASSERT_OK(batch.Delete("h"));
4389   ASSERT_OK(batch.SingleDelete("i"));
4390   ASSERT_OK(batch.DeleteRange("j", "k"));
4391   ASSERT_OK(db_->Write(wo, &batch));
4392 
4393   // 2 Seek(ForPrev)s
4394   single_iter = db_->NewIterator(ro);
4395   single_iter->Seek("f");  // Seek 1
4396   single_iter->SeekForPrev("g");
4397   ASSERT_OK(single_iter->status());
4398   delete single_iter;
4399 
4400   // 2 Gets
4401   ASSERT_EQ("1", Get(0, "a"));
4402   ASSERT_EQ("12", Get(0, "g"));
4403 
4404   // 7th and 8th Write, 3rd Get
4405   ASSERT_OK(Put(1, "foo", "bar"));
4406   ASSERT_OK(Put(1, "rocksdb", "rocks"));
4407   ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
4408 
4409   // Total Write x 8, Get x 3, Seek x 2.
4410   ASSERT_OK(db_->EndTrace());
4411   // These should not get into the trace file, as they occur after EndTrace.
4412   ASSERT_OK(Put("hello", "world"));
4413   ASSERT_OK(Merge("foo", "bar"));
4414 
4415   // Open another db, replay, and verify the data
4416   std::string value;
4417   std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay");
4418   ASSERT_OK(DestroyDB(dbname2, options));
4419 
4420   // Using a different name than db2, to pacify infer's use-after-lifetime
4421   // warnings (http://fbinfer.com).
4422   DB* db2_init = nullptr;
4423   options.create_if_missing = true;
4424   ASSERT_OK(DB::Open(options, dbname2, &db2_init));
4425   ColumnFamilyHandle* cf;
4426   ASSERT_OK(
4427       db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
4428   delete cf;
4429   delete db2_init;
4430 
4431   DB* db2 = nullptr;
4432   std::vector<ColumnFamilyDescriptor> column_families;
4433   ColumnFamilyOptions cf_options;
4434   cf_options.merge_operator = MergeOperators::CreatePutOperator();
4435   column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
4436   column_families.push_back(
4437       ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
4438   std::vector<ColumnFamilyHandle*> handles;
4439   DBOptions db_opts;
4440   db_opts.env = env_;
4441   ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
4442 
4443   env_->SleepForMicroseconds(100);
4444   // Verify that the keys don't already exist
4445   ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
4446   ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
4447 
4448   std::unique_ptr<TraceReader> trace_reader;
4449   ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
4450   std::unique_ptr<Replayer> replayer;
4451   ASSERT_OK(
4452       db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
4453 
4454   TraceExecutionResultHandler res_handler;
4455   std::function<void(Status, std::unique_ptr<TraceRecordResult> &&)> res_cb =
4456       [&res_handler](Status exec_s, std::unique_ptr<TraceRecordResult>&& res) {
4457         ASSERT_TRUE(exec_s.ok() || exec_s.IsNotSupported());
4458         if (res != nullptr) {
4459           ASSERT_OK(res->Accept(&res_handler));
4460           res.reset();
4461         }
4462       };
4463 
4464   // Unprepared replay should fail with Status::Incomplete()
4465   ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete());
4466   ASSERT_OK(replayer->Prepare());
4467   // Ok to repeatedly Prepare().
4468   ASSERT_OK(replayer->Prepare());
4469   // Replay using 1 thread, 1x speed.
4470   ASSERT_OK(replayer->Replay(ReplayOptions(1, 1.0), res_cb));
4471   ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
4472   ASSERT_EQ(res_handler.GetNumWrites(), 8);
4473   ASSERT_EQ(res_handler.GetNumGets(), 3);
4474   ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
4475   ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
4476   res_handler.Reset();
4477 
4478   ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
4479   ASSERT_EQ("1", value);
4480   ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
4481   ASSERT_EQ("12", value);
4482   ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
4483   ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
4484 
4485   ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
4486   ASSERT_EQ("bar", value);
4487   ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
4488   ASSERT_EQ("rocks", value);
4489 
4490   // Re-replay should fail with Status::Incomplete() if Prepare() was not
4491   // called. Currently we don't distinguish between unprepared and trace end.
4492   ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete());
4493 
4494   // Re-replay using 2 threads, 2x speed.
4495   ASSERT_OK(replayer->Prepare());
4496   ASSERT_OK(replayer->Replay(ReplayOptions(2, 2.0), res_cb));
4497   ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
4498   ASSERT_EQ(res_handler.GetNumWrites(), 8);
4499   ASSERT_EQ(res_handler.GetNumGets(), 3);
4500   ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
4501   ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
4502   res_handler.Reset();
4503 
4504   // Re-replay using 2 threads, 1/2 speed.
4505   ASSERT_OK(replayer->Prepare());
4506   ASSERT_OK(replayer->Replay(ReplayOptions(2, 0.5), res_cb));
4507   ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
4508   ASSERT_EQ(res_handler.GetNumWrites(), 8);
4509   ASSERT_EQ(res_handler.GetNumGets(), 3);
4510   ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
4511   ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
4512   res_handler.Reset();
4513 
4514   replayer.reset();
4515 
4516   for (auto handle : handles) {
4517     delete handle;
4518   }
4519   delete db2;
4520   ASSERT_OK(DestroyDB(dbname2, options));
4521 }
4522 
4523 TEST_F(DBTest2, TraceAndManualReplay) {
4524   Options options = CurrentOptions();
4525   options.merge_operator = MergeOperators::CreatePutOperator();
4526   ReadOptions ro;
4527   WriteOptions wo;
4528   TraceOptions trace_opts;
4529   EnvOptions env_opts;
4530   CreateAndReopenWithCF({"pikachu"}, options);
4531   Random rnd(301);
4532   Iterator* single_iter = nullptr;
4533 
4534   ASSERT_TRUE(db_->EndTrace().IsIOError());
4535 
4536   std::string trace_filename = dbname_ + "/rocksdb.trace";
4537   std::unique_ptr<TraceWriter> trace_writer;
4538   ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
4539   ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
4540 
4541   ASSERT_OK(Put(0, "a", "1"));
4542   ASSERT_OK(Merge(0, "b", "2"));
4543   ASSERT_OK(Delete(0, "c"));
4544   ASSERT_OK(SingleDelete(0, "d"));
4545   ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
4546 
4547   WriteBatch batch;
4548   ASSERT_OK(batch.Put("f", "11"));
4549   ASSERT_OK(batch.Merge("g", "12"));
4550   ASSERT_OK(batch.Delete("h"));
4551   ASSERT_OK(batch.SingleDelete("i"));
4552   ASSERT_OK(batch.DeleteRange("j", "k"));
4553   ASSERT_OK(db_->Write(wo, &batch));
4554 
4555   single_iter = db_->NewIterator(ro);
4556   single_iter->Seek("f");
4557   single_iter->SeekForPrev("g");
4558   ASSERT_OK(single_iter->status());
4559   delete single_iter;
4560 
4561   // Write some sequenced keys for testing lower/upper bounds of iterator.
4562   batch.Clear();
4563   ASSERT_OK(batch.Put("iter-0", "iter-0"));
4564   ASSERT_OK(batch.Put("iter-1", "iter-1"));
4565   ASSERT_OK(batch.Put("iter-2", "iter-2"));
4566   ASSERT_OK(batch.Put("iter-3", "iter-3"));
4567   ASSERT_OK(batch.Put("iter-4", "iter-4"));
4568   ASSERT_OK(db_->Write(wo, &batch));
4569 
4570   ReadOptions bounded_ro = ro;
4571   Slice lower_bound("iter-1");
4572   Slice upper_bound("iter-3");
4573   bounded_ro.iterate_lower_bound = &lower_bound;
4574   bounded_ro.iterate_upper_bound = &upper_bound;
4575   single_iter = db_->NewIterator(bounded_ro);
4576   single_iter->Seek("iter-0");
4577   ASSERT_EQ(single_iter->key().ToString(), "iter-1");
4578   single_iter->Seek("iter-2");
4579   ASSERT_EQ(single_iter->key().ToString(), "iter-2");
4580   single_iter->Seek("iter-4");
4581   ASSERT_FALSE(single_iter->Valid());
4582   single_iter->SeekForPrev("iter-0");
4583   ASSERT_FALSE(single_iter->Valid());
4584   single_iter->SeekForPrev("iter-2");
4585   ASSERT_EQ(single_iter->key().ToString(), "iter-2");
4586   single_iter->SeekForPrev("iter-4");
4587   ASSERT_EQ(single_iter->key().ToString(), "iter-2");
4588   ASSERT_OK(single_iter->status());
4589   delete single_iter;
4590 
4591   ASSERT_EQ("1", Get(0, "a"));
4592   ASSERT_EQ("12", Get(0, "g"));
4593 
4594   ASSERT_OK(Put(1, "foo", "bar"));
4595   ASSERT_OK(Put(1, "rocksdb", "rocks"));
4596   ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
4597 
4598   // Same as TraceAndReplay, Write x 8, Get x 3, Seek x 2.
4599   // Plus 1 WriteBatch for iterator with lower/upper bounds, and 6
4600   // Seek(ForPrev)s.
4601   // Total Write x 9, Get x 3, Seek x 8
4602   ASSERT_OK(db_->EndTrace());
4603   // These should not get into the trace file, as they occur after EndTrace.
4604   ASSERT_OK(Put("hello", "world"));
4605   ASSERT_OK(Merge("foo", "bar"));
4606 
4607   // Open another db, replay, and verify the data
4608   std::string value;
4609   std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay");
4610   ASSERT_OK(DestroyDB(dbname2, options));
4611 
4612   // Using a different name than db2, to pacify infer's use-after-lifetime
4613   // warnings (http://fbinfer.com).
4614   DB* db2_init = nullptr;
4615   options.create_if_missing = true;
4616   ASSERT_OK(DB::Open(options, dbname2, &db2_init));
4617   ColumnFamilyHandle* cf;
4618   ASSERT_OK(
4619       db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
4620   delete cf;
4621   delete db2_init;
4622 
4623   DB* db2 = nullptr;
4624   std::vector<ColumnFamilyDescriptor> column_families;
4625   ColumnFamilyOptions cf_options;
4626   cf_options.merge_operator = MergeOperators::CreatePutOperator();
4627   column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
4628   column_families.push_back(
4629       ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
4630   std::vector<ColumnFamilyHandle*> handles;
4631   DBOptions db_opts;
4632   db_opts.env = env_;
4633   ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
4634 
4635   env_->SleepForMicroseconds(100);
4636   // Verify that the keys don't already exist
4637   ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
4638   ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
4639 
4640   std::unique_ptr<TraceReader> trace_reader;
4641   ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
4642   std::unique_ptr<Replayer> replayer;
4643   ASSERT_OK(
4644       db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
4645 
4646   TraceExecutionResultHandler res_handler;
4647 
4648   // Manually replay twice. The 2nd run checks that the replay can restart.
4649   std::unique_ptr<TraceRecord> record;
4650   std::unique_ptr<TraceRecordResult> result;
4651   for (int i = 0; i < 2; i++) {
4652     // Next should fail if unprepared.
4653     ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete());
4654     ASSERT_OK(replayer->Prepare());
4655     Status s = Status::OK();
4656     // Loop until the end of the trace.
4657     while (s.ok()) {
4658       s = replayer->Next(&record);
4659       // Skip unsupported operations.
4660       if (s.IsNotSupported()) {
4661         continue;
4662       }
4663       if (s.ok()) {
4664         ASSERT_OK(replayer->Execute(record, &result));
4665         if (result != nullptr) {
4666           ASSERT_OK(result->Accept(&res_handler));
4667           if (record->GetTraceType() == kTraceIteratorSeek ||
4668               record->GetTraceType() == kTraceIteratorSeekForPrev) {
4669             IteratorSeekQueryTraceRecord* iter_rec =
4670                 dynamic_cast<IteratorSeekQueryTraceRecord*>(record.get());
4671             IteratorTraceExecutionResult* iter_res =
4672                 dynamic_cast<IteratorTraceExecutionResult*>(result.get());
4673             // Check if lower/upper bounds are correctly saved and decoded.
4674             std::string lower_str = iter_rec->GetLowerBound().ToString();
4675             std::string upper_str = iter_rec->GetUpperBound().ToString();
4676             std::string iter_key = iter_res->GetKey().ToString();
4677             std::string iter_value = iter_res->GetValue().ToString();
4678             if (!lower_str.empty() && !upper_str.empty()) {
4679               ASSERT_EQ(lower_str, "iter-1");
4680               ASSERT_EQ(upper_str, "iter-3");
4681               if (iter_res->GetValid()) {
4682                 // If iterator is valid, then lower_bound <= key < upper_bound.
4683                 ASSERT_GE(iter_key, lower_str);
4684                 ASSERT_LT(iter_key, upper_str);
4685               } else {
4686                 // If iterator is invalid, then
4687                 //   key < lower_bound or key >= upper_bound.
4688                 ASSERT_TRUE(iter_key < lower_str || iter_key >= upper_str);
4689               }
4690             }
4691             // If iterator is invalid, the key and value should be empty.
4692             if (!iter_res->GetValid()) {
4693               ASSERT_TRUE(iter_key.empty());
4694               ASSERT_TRUE(iter_value.empty());
4695             }
4696           }
4697           result.reset();
4698         }
4699       }
4700     }
4701     // Status::Incomplete() will be returned when manually reading past the
4702     // trace end, or when Prepare() was not called.
4703     ASSERT_TRUE(s.IsIncomplete());
4704     ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete());
4705     ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
4706     ASSERT_EQ(res_handler.GetNumWrites(), 9);
4707     ASSERT_EQ(res_handler.GetNumGets(), 3);
4708     ASSERT_EQ(res_handler.GetNumIterSeeks(), 8);
4709     ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
4710     res_handler.Reset();
4711   }
4712 
4713   ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
4714   ASSERT_EQ("1", value);
4715   ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
4716   ASSERT_EQ("12", value);
4717   ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
4718   ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
4719 
4720   ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
4721   ASSERT_EQ("bar", value);
4722   ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
4723   ASSERT_EQ("rocks", value);
4724 
4725   // Test execution of artificially created TraceRecords.
4726   uint64_t fake_ts = 1U;
4727   // Write
4728   batch.Clear();
4729   ASSERT_OK(batch.Put("trace-record-write1", "write1"));
4730   ASSERT_OK(batch.Put("trace-record-write2", "write2"));
4731   record.reset(new WriteQueryTraceRecord(batch.Data(), fake_ts++));
4732   ASSERT_OK(replayer->Execute(record, &result));
4733   ASSERT_TRUE(result != nullptr);
4734   ASSERT_OK(result->Accept(&res_handler));  // Write x 1
4735   ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write1", &value));
4736   ASSERT_EQ("write1", value);
4737   ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write2", &value));
4738   ASSERT_EQ("write2", value);
4739   ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
4740   ASSERT_EQ(res_handler.GetNumWrites(), 1);
4741   ASSERT_EQ(res_handler.GetNumGets(), 0);
4742   ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
4743   ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
4744   res_handler.Reset();
4745 
4746   // Get related
4747   // Get an existing key.
4748   record.reset(new GetQueryTraceRecord(handles[0]->GetID(),
4749                                        "trace-record-write1", fake_ts++));
4750   ASSERT_OK(replayer->Execute(record, &result));
4751   ASSERT_TRUE(result != nullptr);
4752   ASSERT_OK(result->Accept(&res_handler));  // Get x 1
4753   // Get a non-existing key, should still return Status::OK().
4754   record.reset(new GetQueryTraceRecord(handles[0]->GetID(), "trace-record-get",
4755                                        fake_ts++));
4756   ASSERT_OK(replayer->Execute(record, &result));
4757   ASSERT_TRUE(result != nullptr);
4758   ASSERT_OK(result->Accept(&res_handler));  // Get x 2
4759   // Get from an invalid (non-existing) cf_id.
4760   uint32_t invalid_cf_id = handles[1]->GetID() + 1;
4761   record.reset(new GetQueryTraceRecord(invalid_cf_id, "whatever", fake_ts++));
4762   ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
4763   ASSERT_TRUE(result == nullptr);
4764   ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
4765   ASSERT_EQ(res_handler.GetNumWrites(), 0);
4766   ASSERT_EQ(res_handler.GetNumGets(), 2);
4767   ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
4768   ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
4769   res_handler.Reset();
4770 
4771   // Iteration related
4772   for (IteratorSeekQueryTraceRecord::SeekType seekType :
4773        {IteratorSeekQueryTraceRecord::kSeek,
4774         IteratorSeekQueryTraceRecord::kSeekForPrev}) {
4775     // Seek to an existing key.
4776     record.reset(new IteratorSeekQueryTraceRecord(
4777         seekType, handles[0]->GetID(), "trace-record-write1", fake_ts++));
4778     ASSERT_OK(replayer->Execute(record, &result));
4779     ASSERT_TRUE(result != nullptr);
4780     ASSERT_OK(result->Accept(&res_handler));  // Seek x 1 in one iteration
4781     // Seek to a non-existing key, should still return Status::OK().
4782     record.reset(new IteratorSeekQueryTraceRecord(
4783         seekType, handles[0]->GetID(), "trace-record-get", fake_ts++));
4784     ASSERT_OK(replayer->Execute(record, &result));
4785     ASSERT_TRUE(result != nullptr);
4786     ASSERT_OK(result->Accept(&res_handler));  // Seek x 2 in one iteration
4787     // Seek from an invalid cf_id.
4788     record.reset(new IteratorSeekQueryTraceRecord(seekType, invalid_cf_id,
4789                                                   "whatever", fake_ts++));
4790     ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
4791     ASSERT_TRUE(result == nullptr);
4792   }
4793   ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
4794   ASSERT_EQ(res_handler.GetNumWrites(), 0);
4795   ASSERT_EQ(res_handler.GetNumGets(), 0);
4796   ASSERT_EQ(res_handler.GetNumIterSeeks(), 4);  // Seek x 2 in two iterations
4797   ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
4798   res_handler.Reset();
4799 
4800   // MultiGet related
4801   // Get existing keys.
4802   record.reset(new MultiGetQueryTraceRecord(
4803       std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
4804       std::vector<std::string>({"a", "foo"}), fake_ts++));
4805   ASSERT_OK(replayer->Execute(record, &result));
4806   ASSERT_TRUE(result != nullptr);
4807   ASSERT_OK(result->Accept(&res_handler));  // MultiGet x 1
4808   // Get all non-existing keys, should still return Status::OK().
4809   record.reset(new MultiGetQueryTraceRecord(
4810       std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
4811       std::vector<std::string>({"no1", "no2"}), fake_ts++));
4812   ASSERT_OK(replayer->Execute(record, &result));
4813   ASSERT_TRUE(result != nullptr);
4814   ASSERT_OK(result->Accept(&res_handler));  // MultiGet x 2
4815   // Get a mix of existing and non-existing keys; this should still return
4816   // Status::OK().
4817   record.reset(new MultiGetQueryTraceRecord(
4818       std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
4819       std::vector<std::string>({"a", "no2"}), fake_ts++));
4820   ASSERT_OK(replayer->Execute(record, &result));
4821   ASSERT_TRUE(result != nullptr);
4822   MultiValuesTraceExecutionResult* mvr =
4823       dynamic_cast<MultiValuesTraceExecutionResult*>(result.get());
4824   ASSERT_TRUE(mvr != nullptr);
4825   ASSERT_OK(mvr->GetMultiStatus()[0]);
4826   ASSERT_TRUE(mvr->GetMultiStatus()[1].IsNotFound());
4827   ASSERT_EQ(mvr->GetValues()[0], "1");
4828   ASSERT_EQ(mvr->GetValues()[1], "");
4829   ASSERT_OK(result->Accept(&res_handler));  // MultiGet x 3
4830   // Get from an invalid (non-existing) cf_id.
4831   record.reset(new MultiGetQueryTraceRecord(
4832       std::vector<uint32_t>(
4833           {handles[0]->GetID(), handles[1]->GetID(), invalid_cf_id}),
4834       std::vector<std::string>({"a", "foo", "whatever"}), fake_ts++));
4835   ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
4836   ASSERT_TRUE(result == nullptr);
4837   // Empty MultiGet
4838   record.reset(new MultiGetQueryTraceRecord(
4839       std::vector<uint32_t>(), std::vector<std::string>(), fake_ts++));
4840   ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument());
4841   ASSERT_TRUE(result == nullptr);
4842   // MultiGet size mismatch
4843   record.reset(new MultiGetQueryTraceRecord(
4844       std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
4845       std::vector<std::string>({"a"}), fake_ts++));
4846   ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument());
4847   ASSERT_TRUE(result == nullptr);
4848   ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
4849   ASSERT_EQ(res_handler.GetNumWrites(), 0);
4850   ASSERT_EQ(res_handler.GetNumGets(), 0);
4851   ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
4852   ASSERT_EQ(res_handler.GetNumMultiGets(), 3);
4853   res_handler.Reset();
4854 
4855   replayer.reset();
4856 
4857   for (auto handle : handles) {
4858     delete handle;
4859   }
4860   delete db2;
4861   ASSERT_OK(DestroyDB(dbname2, options));
4862 }
4863 
4864 TEST_F(DBTest2, TraceWithLimit) {
4865   Options options = CurrentOptions();
4866   options.merge_operator = MergeOperators::CreatePutOperator();
4867   ReadOptions ro;
4868   WriteOptions wo;
4869   TraceOptions trace_opts;
4870   EnvOptions env_opts;
4871   CreateAndReopenWithCF({"pikachu"}, options);
4872   Random rnd(301);
4873 
4874   // test the max trace file size options
4875   trace_opts.max_trace_file_size = 5;
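  // The 5-byte cap is smaller than any single trace record, so none of the
  // writes below should make it into the trace file; the replay further down
  // verifies that they stay absent.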
4876   std::string trace_filename = dbname_ + "/rocksdb.trace1";
4877   std::unique_ptr<TraceWriter> trace_writer;
4878   ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
4879   ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
4880   ASSERT_OK(Put(0, "a", "1"));
4881   ASSERT_OK(Put(0, "b", "1"));
4882   ASSERT_OK(Put(0, "c", "1"));
4883   ASSERT_OK(db_->EndTrace());
4884 
4885   std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay2");
4886   std::string value;
4887   ASSERT_OK(DestroyDB(dbname2, options));
4888 
4889   // Using a different name than db2, to pacify infer's use-after-lifetime
4890   // warnings (http://fbinfer.com).
4891   DB* db2_init = nullptr;
4892   options.create_if_missing = true;
4893   ASSERT_OK(DB::Open(options, dbname2, &db2_init));
4894   ColumnFamilyHandle* cf;
4895   ASSERT_OK(
4896       db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
4897   delete cf;
4898   delete db2_init;
4899 
4900   DB* db2 = nullptr;
4901   std::vector<ColumnFamilyDescriptor> column_families;
4902   ColumnFamilyOptions cf_options;
4903   cf_options.merge_operator = MergeOperators::CreatePutOperator();
4904   column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
4905   column_families.push_back(
4906       ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
4907   std::vector<ColumnFamilyHandle*> handles;
4908   DBOptions db_opts;
4909   db_opts.env = env_;
4910   ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
4911 
4912   env_->SleepForMicroseconds(100);
4913   // Verify that the keys don't already exist
4914   ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
4915   ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
4916   ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
4917 
4918   std::unique_ptr<TraceReader> trace_reader;
4919   ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
4920   std::unique_ptr<Replayer> replayer;
4921   ASSERT_OK(
4922       db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
4923   ASSERT_OK(replayer->Prepare());
4924   ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
4925   replayer.reset();
4926 
4927   ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
4928   ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
4929   ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
4930 
4931   for (auto handle : handles) {
4932     delete handle;
4933   }
4934   delete db2;
4935   ASSERT_OK(DestroyDB(dbname2, options));
4936 }
4937 
4938 TEST_F(DBTest2, TraceWithSampling) {
4939   Options options = CurrentOptions();
4940   ReadOptions ro;
4941   WriteOptions wo;
4942   TraceOptions trace_opts;
4943   EnvOptions env_opts;
4944   CreateAndReopenWithCF({"pikachu"}, options);
4945   Random rnd(301);
4946 
4947   // test the trace file sampling options
4948   trace_opts.sampling_frequency = 2;
4949   std::string trace_filename = dbname_ + "/rocksdb.trace_sampling";
4950   std::unique_ptr<TraceWriter> trace_writer;
4951   ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
4952   ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
4953   ASSERT_OK(Put(0, "a", "1"));
4954   ASSERT_OK(Put(0, "b", "2"));
4955   ASSERT_OK(Put(0, "c", "3"));
4956   ASSERT_OK(Put(0, "d", "4"));
4957   ASSERT_OK(Put(0, "e", "5"));
4958   ASSERT_OK(db_->EndTrace());
4959 
4960   std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay_sampling");
4961   std::string value;
4962   ASSERT_OK(DestroyDB(dbname2, options));
4963 
4964   // Using a different name than db2, to pacify infer's use-after-lifetime
4965   // warnings (http://fbinfer.com).
4966   DB* db2_init = nullptr;
4967   options.create_if_missing = true;
4968   ASSERT_OK(DB::Open(options, dbname2, &db2_init));
4969   ColumnFamilyHandle* cf;
4970   ASSERT_OK(
4971       db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
4972   delete cf;
4973   delete db2_init;
4974 
4975   DB* db2 = nullptr;
4976   std::vector<ColumnFamilyDescriptor> column_families;
4977   ColumnFamilyOptions cf_options;
4978   column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
4979   column_families.push_back(
4980       ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
4981   std::vector<ColumnFamilyHandle*> handles;
4982   DBOptions db_opts;
4983   db_opts.env = env_;
4984   ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
4985 
4986   env_->SleepForMicroseconds(100);
4987   ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
4988   ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
4989   ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
4990   ASSERT_TRUE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
4991   ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
4992 
4993   std::unique_ptr<TraceReader> trace_reader;
4994   ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
4995   std::unique_ptr<Replayer> replayer;
4996   ASSERT_OK(
4997       db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
4998   ASSERT_OK(replayer->Prepare());
4999   ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
5000   replayer.reset();
5001 
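  // With sampling_frequency = 2 only every other operation was traced, so
  // after replay "b" and "d" should exist while "a", "c" and "e" should not.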
5002   ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
5003   ASSERT_FALSE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
5004   ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
5005   ASSERT_FALSE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
5006   ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
5007 
5008   for (auto handle : handles) {
5009     delete handle;
5010   }
5011   delete db2;
5012   ASSERT_OK(DestroyDB(dbname2, options));
5013 }
5014 
5015 TEST_F(DBTest2, TraceWithFilter) {
5016   Options options = CurrentOptions();
5017   options.merge_operator = MergeOperators::CreatePutOperator();
5018   ReadOptions ro;
5019   WriteOptions wo;
5020   TraceOptions trace_opts;
5021   EnvOptions env_opts;
5022   CreateAndReopenWithCF({"pikachu"}, options);
5023   Random rnd(301);
5024   Iterator* single_iter = nullptr;
5025 
5026   trace_opts.filter = TraceFilterType::kTraceFilterWrite;
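  // kTraceFilterWrite excludes all write operations from the trace, so the
  // replay below should not reproduce any of the written key-values.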
5027 
5028   std::string trace_filename = dbname_ + "/rocksdb.trace";
5029   std::unique_ptr<TraceWriter> trace_writer;
5030   ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
5031   ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
5032 
5033   ASSERT_OK(Put(0, "a", "1"));
5034   ASSERT_OK(Merge(0, "b", "2"));
5035   ASSERT_OK(Delete(0, "c"));
5036   ASSERT_OK(SingleDelete(0, "d"));
5037   ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
5038 
5039   WriteBatch batch;
5040   ASSERT_OK(batch.Put("f", "11"));
5041   ASSERT_OK(batch.Merge("g", "12"));
5042   ASSERT_OK(batch.Delete("h"));
5043   ASSERT_OK(batch.SingleDelete("i"));
5044   ASSERT_OK(batch.DeleteRange("j", "k"));
5045   ASSERT_OK(db_->Write(wo, &batch));
5046 
5047   single_iter = db_->NewIterator(ro);
5048   single_iter->Seek("f");
5049   single_iter->SeekForPrev("g");
5050   delete single_iter;
5051 
5052   ASSERT_EQ("1", Get(0, "a"));
5053   ASSERT_EQ("12", Get(0, "g"));
5054 
5055   ASSERT_OK(Put(1, "foo", "bar"));
5056   ASSERT_OK(Put(1, "rocksdb", "rocks"));
5057   ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
5058 
5059   ASSERT_OK(db_->EndTrace());
5060   // These should not get into the trace file, as they occur after EndTrace.
5061   ASSERT_OK(Put("hello", "world"));
5062   ASSERT_OK(Merge("foo", "bar"));
5063 
5064   // Open another db, replay, and verify the data
5065   std::string value;
5066   std::string dbname2 = test::PerThreadDBPath(env_, "db_replay");
5067   ASSERT_OK(DestroyDB(dbname2, options));
5068 
5069   // Using a different name than db2, to pacify infer's use-after-lifetime
5070   // warnings (http://fbinfer.com).
5071   DB* db2_init = nullptr;
5072   options.create_if_missing = true;
5073   ASSERT_OK(DB::Open(options, dbname2, &db2_init));
5074   ColumnFamilyHandle* cf;
5075   ASSERT_OK(
5076       db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
5077   delete cf;
5078   delete db2_init;
5079 
5080   DB* db2 = nullptr;
5081   std::vector<ColumnFamilyDescriptor> column_families;
5082   ColumnFamilyOptions cf_options;
5083   cf_options.merge_operator = MergeOperators::CreatePutOperator();
5084   column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
5085   column_families.push_back(
5086       ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
5087   std::vector<ColumnFamilyHandle*> handles;
5088   DBOptions db_opts;
5089   db_opts.env = env_;
5090   ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
5091 
5092   env_->SleepForMicroseconds(100);
5093   // Verify that the keys don't already exist
5094   ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
5095   ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
5096 
5097   std::unique_ptr<TraceReader> trace_reader;
5098   ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
5099   std::unique_ptr<Replayer> replayer;
5100   ASSERT_OK(
5101       db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
5102   ASSERT_OK(replayer->Prepare());
5103   ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
5104   replayer.reset();
5105 
5106   // None of the key-values should be present since we filter out the WRITE ops.
5107   ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
5108   ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
5109   ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
5110   ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
5111   ASSERT_TRUE(db2->Get(ro, handles[0], "foo", &value).IsNotFound());
5112   ASSERT_TRUE(db2->Get(ro, handles[0], "rocksdb", &value).IsNotFound());
5113 
5114   for (auto handle : handles) {
5115     delete handle;
5116   }
5117   delete db2;
5118   ASSERT_OK(DestroyDB(dbname2, options));
5119 
5120   // Set up a new db.
5121   std::string dbname3 = test::PerThreadDBPath(env_, "db_not_trace_read");
5122   ASSERT_OK(DestroyDB(dbname3, options));
5123 
5124   DB* db3_init = nullptr;
5125   options.create_if_missing = true;
5126   ColumnFamilyHandle* cf3;
5127   ASSERT_OK(DB::Open(options, dbname3, &db3_init));
5128   ASSERT_OK(
5129       db3_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf3));
5130   delete cf3;
5131   delete db3_init;
5132 
5133   column_families.clear();
5134   column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
5135   column_families.push_back(
5136       ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
5137   handles.clear();
5138 
5139   DB* db3 = nullptr;
5140   ASSERT_OK(DB::Open(db_opts, dbname3, column_families, &handles, &db3));
5141 
5142   env_->SleepForMicroseconds(100);
5143   // Verify that the keys don't already exist
5144   ASSERT_TRUE(db3->Get(ro, handles[0], "a", &value).IsNotFound());
5145   ASSERT_TRUE(db3->Get(ro, handles[0], "g", &value).IsNotFound());
5146 
5147   // The tracer will not record the READ ops.
5148   trace_opts.filter = TraceFilterType::kTraceFilterGet;
5149   std::string trace_filename3 = dbname_ + "/rocksdb.trace_3";
5150   std::unique_ptr<TraceWriter> trace_writer3;
5151   ASSERT_OK(
5152     NewFileTraceWriter(env_, env_opts, trace_filename3, &trace_writer3));
5153   ASSERT_OK(db3->StartTrace(trace_opts, std::move(trace_writer3)));
5154 
5155   ASSERT_OK(db3->Put(wo, handles[0], "a", "1"));
5156   ASSERT_OK(db3->Merge(wo, handles[0], "b", "2"));
5157   ASSERT_OK(db3->Delete(wo, handles[0], "c"));
5158   ASSERT_OK(db3->SingleDelete(wo, handles[0], "d"));
5159 
5160   ASSERT_OK(db3->Get(ro, handles[0], "a", &value));
5161   ASSERT_EQ(value, "1");
5162   ASSERT_TRUE(db3->Get(ro, handles[0], "c", &value).IsNotFound());
5163 
5164   ASSERT_OK(db3->EndTrace());
5165 
5166   for (auto handle : handles) {
5167     delete handle;
5168   }
5169   delete db3;
5170   ASSERT_OK(DestroyDB(dbname3, options));
5171 
5172   std::unique_ptr<TraceReader> trace_reader3;
5173   ASSERT_OK(
5174     NewFileTraceReader(env_, env_opts, trace_filename3, &trace_reader3));
5175 
5176   // Count the number of records in the trace file.
5177   int count = 0;
5178   std::string data;
5179   Status s;
5180   while (true) {
5181     s = trace_reader3->Read(&data);
5182     if (!s.ok()) {
5183       break;
5184     }
5185     count += 1;
5186   }
5187   // We also need to count the header and footer
5188   // 4 WRITE + HEADER + FOOTER = 6
5189   ASSERT_EQ(count, 6);
5190 }
5191 
5192 #endif  // ROCKSDB_LITE
5193 
5194 TEST_F(DBTest2, PinnableSliceAndMmapReads) {
5195   Options options = CurrentOptions();
5196   options.env = env_;
5197   if (!IsMemoryMappedAccessSupported()) {
5198     ROCKSDB_GTEST_SKIP("Test requires default environment");
5199     return;
5200   }
5201   options.allow_mmap_reads = true;
5202   options.max_open_files = 100;
5203   options.compression = kNoCompression;
5204   Reopen(options);
5205 
5206   ASSERT_OK(Put("foo", "bar"));
5207   ASSERT_OK(Flush());
5208 
5209   PinnableSlice pinned_value;
5210   ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
5211   // It is not safe to pin mmap files as they might be removed by compaction
5212   ASSERT_FALSE(pinned_value.IsPinned());
5213   ASSERT_EQ(pinned_value.ToString(), "bar");
5214 
5215   ASSERT_OK(dbfull()->TEST_CompactRange(
5216       0 /* level */, nullptr /* begin */, nullptr /* end */,
5217       nullptr /* column_family */, true /* disallow_trivial_move */));
5218 
5219   // Ensure pinned_value doesn't rely on memory munmap'd by the above
5220   // compaction. It crashes if it does.
5221   ASSERT_EQ(pinned_value.ToString(), "bar");
5222 
5223 #ifndef ROCKSDB_LITE
5224   pinned_value.Reset();
5225   // Unsafe to pin mmap files when they could be kicked out of table cache
5226   Close();
5227   ASSERT_OK(ReadOnlyReopen(options));
5228   ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
5229   ASSERT_FALSE(pinned_value.IsPinned());
5230   ASSERT_EQ(pinned_value.ToString(), "bar");
5231 
5232   pinned_value.Reset();
5233   // In read-only mode with infinite capacity on table cache it should pin the
5234   // value and avoid the memcpy
5235   Close();
5236   options.max_open_files = -1;
5237   ASSERT_OK(ReadOnlyReopen(options));
5238   ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
5239   ASSERT_TRUE(pinned_value.IsPinned());
5240   ASSERT_EQ(pinned_value.ToString(), "bar");
5241 #endif
5242 }
5243 
5244 TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) {
5245   Options options = CurrentOptions();
5246   options.create_if_missing = true;
5247   options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
5248   BlockBasedTableOptions bbto;
5249   bbto.no_block_cache = false;
5250   bbto.cache_index_and_filter_blocks = false;
5251   bbto.block_cache = NewLRUCache(100000);
5252   bbto.block_size = 400;  // small block size
5253   options.table_factory.reset(NewBlockBasedTableFactory(bbto));
5254   Reopen(options);
5255 
5256   Random rnd(301);
5257   std::string v = rnd.RandomString(400);
5258 
5259   // Since v is the size of a block, each key should take a block
5260   // of 400+ bytes.
5261   ASSERT_OK(Put("1", v));
5262   ASSERT_OK(Put("3", v));
5263   ASSERT_OK(Put("5", v));
5264   ASSERT_OK(Put("7", v));
5265   ASSERT_OK(Flush());
5266 
5267   ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage());
5268 
5269   // Verify that iterators don't pin more than one data block in block cache
5270   // at a time.
5271   {
5272     std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
5273     iter->SeekToFirst();
5274 
5275     for (int i = 0; i < 4; i++) {
5276       ASSERT_TRUE(iter->Valid());
5277       // Block cache should contain exactly one block.
5278       ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
5279       ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
5280       iter->Next();
5281     }
5282     ASSERT_FALSE(iter->Valid());
5283 
5284     iter->Seek("4");
5285     ASSERT_TRUE(iter->Valid());
5286 
5287     ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
5288     ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
5289 
5290     iter->Seek("3");
5291     ASSERT_TRUE(iter->Valid());
5292 
5293     ASSERT_OK(iter->status());
5294 
5295     ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
5296     ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
5297   }
5298   ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage());
5299 
5300   // Test compaction case
5301   ASSERT_OK(Put("2", v));
5302   ASSERT_OK(Put("5", v));
5303   ASSERT_OK(Put("6", v));
5304   ASSERT_OK(Put("8", v));
5305   ASSERT_OK(Flush());
5306 
5307   // Clear existing data in block cache
5308   bbto.block_cache->SetCapacity(0);
5309   bbto.block_cache->SetCapacity(100000);
5310 
5311   // Verify that compaction input iterators don't hold more than one data
5312   // block at a time.
5313   std::atomic<bool> finished(false);
5314   std::atomic<int> block_newed(0);
5315   std::atomic<int> block_destroyed(0);
5316   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5317       "Block::Block:0", [&](void* /*arg*/) {
5318         if (finished) {
5319           return;
5320         }
5321         // Two iterators. At most 2 outstanding blocks.
5322         EXPECT_GE(block_newed.load(), block_destroyed.load());
5323         EXPECT_LE(block_newed.load(), block_destroyed.load() + 1);
5324         block_newed.fetch_add(1);
5325       });
5326   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5327       "Block::~Block", [&](void* /*arg*/) {
5328         if (finished) {
5329           return;
5330         }
5331         // Two iterators. At most 2 outstanding blocks.
5332         EXPECT_GE(block_newed.load(), block_destroyed.load() + 1);
5333         EXPECT_LE(block_newed.load(), block_destroyed.load() + 2);
5334         block_destroyed.fetch_add(1);
5335       });
5336   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5337       "CompactionJob::Run:BeforeVerify",
5338       [&](void* /*arg*/) { finished = true; });
5339   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
5340 
5341   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
5342 
5343   // Two input files. Each of them has 4 data blocks.
5344   ASSERT_EQ(8, block_newed.load());
5345   ASSERT_EQ(8, block_destroyed.load());
5346 
5347   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
5348 }
5349 
5350 TEST_F(DBTest2, TestBBTTailPrefetch) {
5351   std::atomic<bool> called(false);
5352   size_t expected_lower_bound = 512 * 1024;
5353   size_t expected_higher_bound = 512 * 1024;
5354   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5355       "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
5356         size_t* prefetch_size = static_cast<size_t*>(arg);
5357         EXPECT_LE(expected_lower_bound, *prefetch_size);
5358         EXPECT_GE(expected_higher_bound, *prefetch_size);
5359         called = true;
5360       });
5361   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
5362 
5363   ASSERT_OK(Put("1", "1"));
5364   ASSERT_OK(Put("9", "1"));
5365   ASSERT_OK(Flush());
5366 
5367   expected_lower_bound = 0;
5368   expected_higher_bound = 8 * 1024;
5369 
5370   ASSERT_OK(Put("1", "1"));
5371   ASSERT_OK(Put("9", "1"));
5372   ASSERT_OK(Flush());
5373 
5374   ASSERT_OK(Put("1", "1"));
5375   ASSERT_OK(Put("9", "1"));
5376   ASSERT_OK(Flush());
5377 
5378   // Full compaction to make sure there is no L0 file after the open.
5379   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
5380 
5381   ASSERT_TRUE(called.load());
5382   called = false;
5383 
5384   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
5385   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
5386 
5387   std::atomic<bool> first_call(true);
5388   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5389       "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
5390         size_t* prefetch_size = static_cast<size_t*>(arg);
5391         if (first_call) {
5392           EXPECT_EQ(4 * 1024, *prefetch_size);
5393           first_call = false;
5394         } else {
5395           EXPECT_GE(4 * 1024, *prefetch_size);
5396         }
5397         called = true;
5398       });
5399   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
5400 
5401   Options options = CurrentOptions();
5402   options.max_file_opening_threads = 1;  // one thread
5403   BlockBasedTableOptions table_options;
5404   table_options.cache_index_and_filter_blocks = true;
5405   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
5406   options.max_open_files = -1;
5407   Reopen(options);
5408 
5409   ASSERT_OK(Put("1", "1"));
5410   ASSERT_OK(Put("9", "1"));
5411   ASSERT_OK(Flush());
5412 
5413   ASSERT_OK(Put("1", "1"));
5414   ASSERT_OK(Put("9", "1"));
5415   ASSERT_OK(Flush());
5416 
5417   ASSERT_TRUE(called.load());
5418   called = false;
5419 
5420   // Parallel loading SST files
5421   options.max_file_opening_threads = 16;
5422   Reopen(options);
5423 
5424   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
5425 
5426   ASSERT_TRUE(called.load());
5427 
5428   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
5429   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
5430 }
5431 
TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) {
  // Setup sync point dependency to reproduce the race condition of
  // DBImpl::GetColumnFamilyHandleUnlocked
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
      {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1",
       "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2"},
      {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2",
       "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1"},
  });
  SyncPoint::GetInstance()->EnableProcessing();

  CreateColumnFamilies({"test1", "test2"}, Options());
  ASSERT_EQ(handles_.size(), 2);

  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
  port::Thread user_thread1([&]() {
    auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID());
    ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
    TEST_SYNC_POINT(
        "TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1");
    TEST_SYNC_POINT(
        "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1");
    ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
  });

  port::Thread user_thread2([&]() {
    TEST_SYNC_POINT(
        "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2");
    auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[1]->GetID());
    ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
    TEST_SYNC_POINT(
        "TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2");
    ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
  });

  user_thread1.join();
  user_thread2.join();

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}

#ifndef ROCKSDB_LITE
TEST_F(DBTest2, TestCompactFiles) {
  // Setup sync point dependency to reproduce the race condition between
  // CompactFiles() and IngestExternalFile().
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
      {"TestCompactFiles::IngestExternalFile1",
       "TestCompactFiles::IngestExternalFile2"},
  });
  SyncPoint::GetInstance()->EnableProcessing();

  Options options;
  options.env = env_;
  options.num_levels = 2;
  options.disable_auto_compactions = true;
  Reopen(options);
  auto* handle = db_->DefaultColumnFamily();
  ASSERT_EQ(db_->NumberLevels(handle), 2);

  ROCKSDB_NAMESPACE::SstFileWriter sst_file_writer{
      ROCKSDB_NAMESPACE::EnvOptions(), options};
  std::string external_file1 = dbname_ + "/test_compact_files1.sst_t";
  std::string external_file2 = dbname_ + "/test_compact_files2.sst_t";
  std::string external_file3 = dbname_ + "/test_compact_files3.sst_t";

  ASSERT_OK(sst_file_writer.Open(external_file1));
  ASSERT_OK(sst_file_writer.Put("1", "1"));
  ASSERT_OK(sst_file_writer.Put("2", "2"));
  ASSERT_OK(sst_file_writer.Finish());

  ASSERT_OK(sst_file_writer.Open(external_file2));
  ASSERT_OK(sst_file_writer.Put("3", "3"));
  ASSERT_OK(sst_file_writer.Put("4", "4"));
  ASSERT_OK(sst_file_writer.Finish());

  ASSERT_OK(sst_file_writer.Open(external_file3));
  ASSERT_OK(sst_file_writer.Put("5", "5"));
  ASSERT_OK(sst_file_writer.Put("6", "6"));
  ASSERT_OK(sst_file_writer.Finish());

  ASSERT_OK(db_->IngestExternalFile(handle, {external_file1, external_file3},
                                    IngestExternalFileOptions()));
  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2);
  std::vector<std::string> files;
  GetSstFiles(env_, dbname_, &files);
  ASSERT_EQ(files.size(), 2);

  Status user_thread1_status;
  port::Thread user_thread1([&]() {
    user_thread1_status =
        db_->CompactFiles(CompactionOptions(), handle, files, 1);
  });

  Status user_thread2_status;
  port::Thread user_thread2([&]() {
    user_thread2_status = db_->IngestExternalFile(handle, {external_file2},
                                                  IngestExternalFileOptions());
    TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile1");
  });

  user_thread1.join();
  user_thread2.join();

  ASSERT_OK(user_thread1_status);
  ASSERT_OK(user_thread2_status);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}
#endif  // ROCKSDB_LITE

TEST_F(DBTest2, MultiDBParallelOpenTest) {
  const int kNumDbs = 2;
  Options options = CurrentOptions();
  std::vector<std::string> dbnames;
  for (int i = 0; i < kNumDbs; ++i) {
    dbnames.emplace_back(test::PerThreadDBPath(env_, "db" + ToString(i)));
    ASSERT_OK(DestroyDB(dbnames.back(), options));
  }

  // Verify empty DBs can be created in parallel
  std::vector<std::thread> open_threads;
  std::vector<DB*> dbs{static_cast<unsigned int>(kNumDbs), nullptr};
  options.create_if_missing = true;
  for (int i = 0; i < kNumDbs; ++i) {
    open_threads.emplace_back(
        [&](int dbnum) {
          ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
        },
        i);
  }

  // Now add some data and close, so next we can verify non-empty DBs can be
  // recovered in parallel
  for (int i = 0; i < kNumDbs; ++i) {
    open_threads[i].join();
    ASSERT_OK(dbs[i]->Put(WriteOptions(), "xi", "gua"));
    delete dbs[i];
  }

  // Verify non-empty DBs can be recovered in parallel
  open_threads.clear();
  for (int i = 0; i < kNumDbs; ++i) {
    open_threads.emplace_back(
        [&](int dbnum) {
          ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
        },
        i);
  }

  // Wait and cleanup
  for (int i = 0; i < kNumDbs; ++i) {
    open_threads[i].join();
    delete dbs[i];
    ASSERT_OK(DestroyDB(dbnames[i], options));
  }
}

namespace {
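// A Statistics implementation that overrides only the legacy virtual
// interface (recordTick()/measureTime()), used to check that statistics
// objects written against the old API still receive callbacks.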
class DummyOldStats : public Statistics {
 public:
  const char* Name() const override { return "DummyOldStats"; }
  uint64_t getTickerCount(uint32_t /*ticker_type*/) const override {
    return 0;
  }
  void recordTick(uint32_t /* ticker_type */, uint64_t /* count */) override {
    num_rt++;
  }
  void setTickerCount(uint32_t /*ticker_type*/, uint64_t /*count*/) override {}
  uint64_t getAndResetTickerCount(uint32_t /*ticker_type*/) override {
    return 0;
  }
  void measureTime(uint32_t /*histogram_type*/, uint64_t /*count*/) override {
    num_mt++;
  }
  void histogramData(
      uint32_t /*histogram_type*/,
      ROCKSDB_NAMESPACE::HistogramData* const /*data*/) const override {}
  std::string getHistogramString(uint32_t /*type*/) const override {
    return "";
  }
  bool HistEnabledForType(uint32_t /*type*/) const override { return false; }
  std::string ToString() const override { return ""; }
  std::atomic<int> num_rt{0};
  std::atomic<int> num_mt{0};
};
}  // namespace

TEST_F(DBTest2, OldStatsInterface) {
  DummyOldStats* dos = new DummyOldStats();
  std::shared_ptr<Statistics> stats(dos);
  Options options = CurrentOptions();
  options.create_if_missing = true;
  options.statistics = stats;
  Reopen(options);

  ASSERT_OK(Put("foo", "bar"));
  ASSERT_EQ("bar", Get("foo"));
  ASSERT_OK(Flush());
  ASSERT_EQ("bar", Get("foo"));

  ASSERT_GT(dos->num_rt, 0);
  ASSERT_GT(dos->num_mt, 0);
}

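// Close() returns non-OK while a snapshot is still unreleased; once the
// snapshot is released, Close() succeeds.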
TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
  const Snapshot* ss = db_->GetSnapshot();

  for (auto h : handles_) {
    db_->DestroyColumnFamilyHandle(h);
  }
  handles_.clear();

  ASSERT_NOK(db_->Close());
  db_->ReleaseSnapshot(ss);
  ASSERT_OK(db_->Close());
  delete db_;
  db_ = nullptr;
}

TEST_F(DBTest2, PrefixBloomReseek) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
  BlockBasedTableOptions bbto;
  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
  bbto.whole_key_filtering = false;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  DestroyAndReopen(options);

  // Construct two L1 files with keys:
  // f1:[aaa1 ccc1] f2:[ddd0]
  ASSERT_OK(Put("aaa1", ""));
  ASSERT_OK(Put("ccc1", ""));
  ASSERT_OK(Flush());
  ASSERT_OK(Put("ddd0", ""));
  ASSERT_OK(Flush());
  CompactRangeOptions cro;
  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));

  ASSERT_OK(Put("bbb1", ""));

  Iterator* iter = db_->NewIterator(ReadOptions());
  ASSERT_OK(iter->status());

  // When seeking into f1, the iterator checks the bloom filter, which
  // reports the file iterator as invalid, so the cursor is placed in f2,
  // with the next key being "ddd0".
  iter->Seek("bbb1");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("bbb1", iter->key().ToString());

  // Reseek ccc1, the L1 iterator needs to go back to f1 and reseek.
  iter->Seek("ccc1");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("ccc1", iter->key().ToString());

  delete iter;
}

TEST_F(DBTest2, PrefixBloomFilteredOut) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
  BlockBasedTableOptions bbto;
  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
  bbto.whole_key_filtering = false;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  DestroyAndReopen(options);

  // Construct two L1 files with keys:
  // f1:[aaa1 ccc1] f2:[ddd0]
  ASSERT_OK(Put("aaa1", ""));
  ASSERT_OK(Put("ccc1", ""));
  ASSERT_OK(Flush());
  ASSERT_OK(Put("ddd0", ""));
  ASSERT_OK(Flush());
  CompactRangeOptions cro;
  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));

  Iterator* iter = db_->NewIterator(ReadOptions());
  ASSERT_OK(iter->status());

  // The seek key is filtered out by f1's prefix bloom filter.
  // Ending up invalid is just one of several positions valid under the
  // iterator contract; positioning to ccc1 or ddd0 would also be valid.
  // This only validates the behavior of the current implementation; if the
  // underlying implementation changes, the test might fail here.
  iter->Seek("bbb1");
  ASSERT_OK(iter->status());
  ASSERT_FALSE(iter->Valid());

  delete iter;
}

#ifndef ROCKSDB_LITE
TEST_F(DBTest2, RowCacheSnapshot) {
  Options options = CurrentOptions();
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  options.row_cache = NewLRUCache(8 * 8192);
  DestroyAndReopen(options);

  ASSERT_OK(Put("foo", "bar1"));

  const Snapshot* s1 = db_->GetSnapshot();

  ASSERT_OK(Put("foo", "bar2"));
  ASSERT_OK(Flush());

  ASSERT_OK(Put("foo2", "bar"));
  const Snapshot* s2 = db_->GetSnapshot();
  ASSERT_OK(Put("foo3", "bar"));
  const Snapshot* s3 = db_->GetSnapshot();

  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
  ASSERT_EQ(Get("foo"), "bar2");
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
  ASSERT_EQ(Get("foo"), "bar2");
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
  ASSERT_EQ(Get("foo", s1), "bar1");
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
  ASSERT_EQ(Get("foo", s2), "bar2");
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
  ASSERT_EQ(Get("foo", s1), "bar1");
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
  ASSERT_EQ(Get("foo", s3), "bar2");
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);

  db_->ReleaseSnapshot(s1);
  db_->ReleaseSnapshot(s2);
  db_->ReleaseSnapshot(s3);
}
#endif  // ROCKSDB_LITE

// When a DB is reopened with multiple column families, the manifest file
// is written after the first CF is flushed, and it is written again after
// each subsequent flush. If the DB crashes between the flushes, the
// already-flushed CF will have passed the latest log file, and we then
// require that file not to be corrupted, otherwise a corruption is
// reported. That bug needs to be fixed before this test can be fully
// enabled.
TEST_F(DBTest2, CrashInRecoveryMultipleCF) {
  const std::vector<std::string> sync_points = {
      "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable",
      "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0"};
  for (const auto& test_sync_point : sync_points) {
    Options options = CurrentOptions();
    // First destroy original db to ensure a clean start.
    DestroyAndReopen(options);
    options.create_if_missing = true;
    options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
    CreateAndReopenWithCF({"pikachu"}, options);
    ASSERT_OK(Put("foo", "bar"));
    ASSERT_OK(Flush());
    ASSERT_OK(Put(1, "foo", "bar"));
    ASSERT_OK(Flush(1));
    ASSERT_OK(Put("foo", "bar"));
    ASSERT_OK(Put(1, "foo", "bar"));
    // The value is large enough to be split across two blocks.
    std::string large_value(400, ' ');
    ASSERT_OK(Put("foo1", large_value));
    ASSERT_OK(Put("foo2", large_value));
    Close();

    // Corrupt the log file in the middle, so that it is not corrupted
    // in the tail.
    std::vector<std::string> filenames;
    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
    for (const auto& f : filenames) {
      uint64_t number;
      FileType type;
      if (ParseFileName(f, &number, &type) && type == FileType::kWalFile) {
        std::string fname = dbname_ + "/" + f;
        std::string file_content;
        ASSERT_OK(ReadFileToString(env_, fname, &file_content));
        file_content[400] = 'h';
        file_content[401] = 'a';
        ASSERT_OK(WriteStringToFile(env_, file_content, fname));
        break;
      }
    }

    // Reopen and freeze the file system after the first manifest write.
    FaultInjectionTestEnv fit_env(options.env);
    options.env = &fit_env;
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        test_sync_point,
        [&](void* /*arg*/) { fit_env.SetFilesystemActive(false); });
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
    ASSERT_NOK(TryReopenWithColumnFamilies(
        {kDefaultColumnFamilyName, "pikachu"}, options));
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();

    fit_env.SetFilesystemActive(true);
    // If we continue using the fault injection Env, it complains when
    // renaming the CURRENT file, which is not expected. This still needs
    // to be investigated.
    options.env = env_;
    ASSERT_OK(TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"},
                                          options));
  }
}

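// A range tombstone that covers the tail of one file must not hide keys in
// the next file: seeking to a deleted key should land on the first live key
// after the deleted range.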
TEST_F(DBTest2, SeekFileRangeDeleteTail) {
  Options options = CurrentOptions();
  options.prefix_extractor.reset(NewCappedPrefixTransform(1));
  options.num_levels = 3;
  DestroyAndReopen(options);

  ASSERT_OK(Put("a", "a"));
  const Snapshot* s1 = db_->GetSnapshot();
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "f"));
  ASSERT_OK(Put("b", "a"));
  ASSERT_OK(Flush());

  ASSERT_OK(Put("x", "a"));
  ASSERT_OK(Put("z", "a"));
  ASSERT_OK(Flush());

  CompactRangeOptions cro;
  cro.change_level = true;
  cro.target_level = 2;
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));

  {
    ReadOptions ro;
    ro.total_order_seek = true;
    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
    ASSERT_OK(iter->status());
    iter->Seek("e");
    ASSERT_TRUE(iter->Valid());
    ASSERT_EQ("x", iter->key().ToString());
  }
  db_->ReleaseSnapshot(s1);
}

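// With avoid_unnecessary_blocking_io set, deleting an iterator must not free
// the pinned memtable inline; the cleanup is scheduled to a background
// thread, so memory usage only drops back to the base value after the
// background queue has drained.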
TEST_F(DBTest2, BackgroundPurgeTest) {
  Options options = CurrentOptions();
  options.write_buffer_manager =
      std::make_shared<ROCKSDB_NAMESPACE::WriteBufferManager>(1 << 20);
  options.avoid_unnecessary_blocking_io = true;
  DestroyAndReopen(options);
  size_t base_value = options.write_buffer_manager->memory_usage();

  ASSERT_OK(Put("a", "a"));
  Iterator* iter = db_->NewIterator(ReadOptions());
  ASSERT_OK(iter->status());
  ASSERT_OK(Flush());
  size_t value = options.write_buffer_manager->memory_usage();
  ASSERT_GT(value, base_value);

  db_->GetEnv()->SetBackgroundThreads(1, Env::Priority::HIGH);
  test::SleepingBackgroundTask sleeping_task_after;
  db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
                          &sleeping_task_after, Env::Priority::HIGH);
  delete iter;

  Env::Default()->SleepForMicroseconds(100000);
  value = options.write_buffer_manager->memory_usage();
  ASSERT_GT(value, base_value);

  sleeping_task_after.WakeUp();
  sleeping_task_after.WaitUntilDone();

  test::SleepingBackgroundTask sleeping_task_after2;
  db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
                          &sleeping_task_after2, Env::Priority::HIGH);
  sleeping_task_after2.WakeUp();
  sleeping_task_after2.WaitUntilDone();

  value = options.write_buffer_manager->memory_usage();
  ASSERT_EQ(base_value, value);
}

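// Exercise the race between switching the memtable (triggered by the flush
// running in a separate thread) and creating a new manifest file (forced by
// the tiny max_manifest_file_size).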
TEST_F(DBTest2, SwitchMemtableRaceWithNewManifest) {
  Options options = CurrentOptions();
  DestroyAndReopen(options);
  options.max_manifest_file_size = 10;
  options.create_if_missing = true;
  CreateAndReopenWithCF({"pikachu"}, options);
  ASSERT_EQ(2, handles_.size());

  ASSERT_OK(Put("foo", "value"));
  const int kL0Files = options.level0_file_num_compaction_trigger;
  for (int i = 0; i < kL0Files; ++i) {
    ASSERT_OK(Put(/*cf=*/1, "a", std::to_string(i)));
    ASSERT_OK(Flush(/*cf=*/1));
  }

  port::Thread thread([&]() { ASSERT_OK(Flush()); });
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  thread.join();
}

TEST_F(DBTest2, SameSmallestInSameLevel) {
  // This test validates the fractional cascading logic when several files at
  // one level contain only the same user key.
  Options options = CurrentOptions();
  options.merge_operator = MergeOperators::CreateStringAppendOperator();
  DestroyAndReopen(options);

  ASSERT_OK(Put("key", "1"));
  ASSERT_OK(Put("key", "2"));
  ASSERT_OK(db_->Merge(WriteOptions(), "key", "3"));
  ASSERT_OK(db_->Merge(WriteOptions(), "key", "4"));
  ASSERT_OK(Flush());
  CompactRangeOptions cro;
  cro.change_level = true;
  cro.target_level = 2;
  ASSERT_OK(dbfull()->CompactRange(cro, db_->DefaultColumnFamily(), nullptr,
                                   nullptr));

  ASSERT_OK(db_->Merge(WriteOptions(), "key", "5"));
  ASSERT_OK(Flush());
  ASSERT_OK(db_->Merge(WriteOptions(), "key", "6"));
  ASSERT_OK(Flush());
  ASSERT_OK(db_->Merge(WriteOptions(), "key", "7"));
  ASSERT_OK(Flush());
  ASSERT_OK(db_->Merge(WriteOptions(), "key", "8"));
  ASSERT_OK(Flush());
  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
#ifndef ROCKSDB_LITE
  ASSERT_EQ("0,4,1", FilesPerLevel());
#endif  // ROCKSDB_LITE

  ASSERT_EQ("2,3,4,5,6,7,8", Get("key"));
}

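// Inject a consistency-check failure while the version is rebuilt during
// open; with force_consistency_checks enabled, the open must fail.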
TEST_F(DBTest2, FileConsistencyCheckInOpen) {
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Flush());

  SyncPoint::GetInstance()->SetCallBack(
      "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
        Status* ret_s = static_cast<Status*>(arg);
        *ret_s = Status::Corruption("fcc");
      });
  SyncPoint::GetInstance()->EnableProcessing();

  Options options = CurrentOptions();
  options.force_consistency_checks = true;
  ASSERT_NOK(TryReopen(options));

  SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) {
  // create a DB with block prefix index
  BlockBasedTableOptions table_options;
  Options options = CurrentOptions();
  table_options.block_size = 300;
  table_options.index_type = BlockBasedTableOptions::kHashSearch;
  table_options.index_shortening =
      BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  options.prefix_extractor.reset(NewFixedPrefixTransform(1));

  Reopen(options);

  Random rnd(301);
  std::string large_value = rnd.RandomString(500);

  ASSERT_OK(Put("a1", large_value));
  ASSERT_OK(Put("x1", large_value));
  ASSERT_OK(Put("y1", large_value));
  ASSERT_OK(Flush());

  {
    std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
    ASSERT_OK(iterator->status());
    iterator->SeekForPrev("x3");
    ASSERT_TRUE(iterator->Valid());
    ASSERT_EQ("x1", iterator->key().ToString());

    iterator->SeekForPrev("a3");
    ASSERT_TRUE(iterator->Valid());
    ASSERT_EQ("a1", iterator->key().ToString());

    iterator->SeekForPrev("y3");
    ASSERT_TRUE(iterator->Valid());
    ASSERT_EQ("y1", iterator->key().ToString());

    // Query more than one non-existing prefix to cover both the empty hash
    // bucket case and the hash bucket conflict case.
    iterator->SeekForPrev("b1");
    // Result should be not valid or "a1".
    if (iterator->Valid()) {
      ASSERT_EQ("a1", iterator->key().ToString());
    }

    iterator->SeekForPrev("c1");
    // Result should be not valid or "a1".
    if (iterator->Valid()) {
      ASSERT_EQ("a1", iterator->key().ToString());
    }

    iterator->SeekForPrev("d1");
    // Result should be not valid or "a1".
    if (iterator->Valid()) {
      ASSERT_EQ("a1", iterator->key().ToString());
    }

    iterator->SeekForPrev("y3");
    ASSERT_TRUE(iterator->Valid());
    ASSERT_EQ("y1", iterator->key().ToString());
  }
}

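// Random read failures injected while opening a table with a partitioned
// index must surface as non-OK statuses on the read path rather than
// crashes or wrong results.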
TEST_F(DBTest2, PartitionedIndexPrefetchFailure) {
  Options options = last_options_;
  options.env = env_;
  options.max_open_files = 20;
  BlockBasedTableOptions bbto;
  bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
  bbto.metadata_block_size = 128;
  bbto.block_size = 128;
  bbto.block_cache = NewLRUCache(16777216);
  bbto.cache_index_and_filter_blocks = true;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  DestroyAndReopen(options);

  // Force no table cache so every read will preload the SST file.
  dbfull()->TEST_table_cache()->SetCapacity(0);
  bbto.block_cache->SetCapacity(0);

  Random rnd(301);
  for (int i = 0; i < 4096; i++) {
    ASSERT_OK(Put(Key(i), rnd.RandomString(32)));
  }
  ASSERT_OK(Flush());

  // Try different random failures in table open for 300 times.
  for (int i = 0; i < 300; i++) {
    env_->num_reads_fails_ = 0;
    env_->rand_reads_fail_odd_ = 8;

    std::string value;
    Status s = dbfull()->Get(ReadOptions(), Key(1), &value);
    if (env_->num_reads_fails_ > 0) {
      ASSERT_NOK(s);
    } else {
      ASSERT_OK(s);
    }
  }

  env_->rand_reads_fail_odd_ = 0;
}

TEST_F(DBTest2, ChangePrefixExtractor) {
  for (bool use_partitioned_filter : {true, false}) {
    // create a DB with block prefix index
    BlockBasedTableOptions table_options;
    Options options = CurrentOptions();

    // Sometimes filter is checked based on upper bound. Assert counters
    // for that case. Otherwise, only check data correctness.
#ifndef ROCKSDB_LITE
    bool expect_filter_check = !use_partitioned_filter;
#else
    bool expect_filter_check = false;
#endif
    table_options.partition_filters = use_partitioned_filter;
    if (use_partitioned_filter) {
      table_options.index_type =
          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
    }
    table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));

    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
    options.statistics = CreateDBStatistics();

    options.prefix_extractor.reset(NewFixedPrefixTransform(2));
    DestroyAndReopen(options);

    Random rnd(301);

    ASSERT_OK(Put("aa", ""));
    ASSERT_OK(Put("xb", ""));
    ASSERT_OK(Put("xx1", ""));
    ASSERT_OK(Put("xz1", ""));
    ASSERT_OK(Put("zz", ""));
    ASSERT_OK(Flush());

    // After reopening the DB with the prefix size changed from 2 to 1, the
    // prefix extractor won't take effect unless using it cannot change the
    // result, given the upper bound and the seek key.
    options.prefix_extractor.reset(NewFixedPrefixTransform(1));
    Reopen(options);

    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
      ASSERT_OK(iterator->status());
      iterator->Seek("xa");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xb", iterator->key().ToString());
      // It's a bug that the counter BLOOM_FILTER_PREFIX_CHECKED is not
      // correct in this case. So don't check counters in this case.
      if (expect_filter_check) {
        ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
      }

      iterator->Seek("xz");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xz1", iterator->key().ToString());
      if (expect_filter_check) {
        ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
      }
    }

    std::string ub_str = "xg9";
    Slice ub(ub_str);
    ReadOptions ro;
    ro.iterate_upper_bound = &ub;

    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
      ASSERT_OK(iterator->status());

      // SeekForPrev() never uses the prefix bloom filter if the prefix
      // extractor has changed.
      iterator->SeekForPrev("xg0");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xb", iterator->key().ToString());
      if (expect_filter_check) {
        ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
      }
    }

    ub_str = "xx9";
    ub = Slice(ub_str);
    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
      ASSERT_OK(iterator->status());

      iterator->Seek("x");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xb", iterator->key().ToString());
      if (expect_filter_check) {
        ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
      }

      iterator->Seek("xx0");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xx1", iterator->key().ToString());
      if (expect_filter_check) {
        ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
      }
    }

    CompactRangeOptions compact_range_opts;
    compact_range_opts.bottommost_level_compaction =
        BottommostLevelCompaction::kForce;
    ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
    ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));

    // Re-execute similar queries after a full compaction
    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));

      iterator->Seek("x");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xb", iterator->key().ToString());
      if (expect_filter_check) {
        ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
      }

      iterator->Seek("xg");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xx1", iterator->key().ToString());
      if (expect_filter_check) {
        ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
      }

      iterator->Seek("xz");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xz1", iterator->key().ToString());
      if (expect_filter_check) {
        ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
      }

      ASSERT_OK(iterator->status());
    }
    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));

      iterator->SeekForPrev("xx0");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xb", iterator->key().ToString());
      if (expect_filter_check) {
        ASSERT_EQ(5, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
      }

      iterator->Seek("xx0");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xx1", iterator->key().ToString());
      if (expect_filter_check) {
        ASSERT_EQ(6, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
      }

      ASSERT_OK(iterator->status());
    }

    ub_str = "xg9";
    ub = Slice(ub_str);
    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
      iterator->SeekForPrev("xg0");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xb", iterator->key().ToString());
      if (expect_filter_check) {
        ASSERT_EQ(7, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
      }
      ASSERT_OK(iterator->status());
    }
  }
}

TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) {
  // create a DB with block prefix index
  BlockBasedTableOptions table_options;
  Options options = CurrentOptions();
  table_options.block_size = 300;
  table_options.index_type = BlockBasedTableOptions::kHashSearch;
  table_options.index_shortening =
      BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
  options.level0_file_num_compaction_trigger = 8;

  Reopen(options);

  ASSERT_OK(Put("b1", "ok"));
  ASSERT_OK(Flush());

  // Flush several files so that there is a high chance that the hash bucket
  // for "b" is empty in at least one of the files.
  ASSERT_OK(Put("a1", ""));
  ASSERT_OK(Put("c1", ""));
  ASSERT_OK(Flush());

  ASSERT_OK(Put("a2", ""));
  ASSERT_OK(Put("c2", ""));
  ASSERT_OK(Flush());

  ASSERT_OK(Put("a3", ""));
  ASSERT_OK(Put("c3", ""));
  ASSERT_OK(Flush());

  ASSERT_OK(Put("a4", ""));
  ASSERT_OK(Put("c4", ""));
  ASSERT_OK(Flush());

  ASSERT_OK(Put("a5", ""));
  ASSERT_OK(Put("c5", ""));
  ASSERT_OK(Flush());

  ASSERT_EQ("ok", Get("b1"));
}

#ifndef ROCKSDB_LITE
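// auto_prefix_mode lets an otherwise total-order iterator use the prefix
// bloom filter when the seek key and the iterate_upper_bound make that safe;
// the BLOOM_FILTER_PREFIX_CHECKED ticker below shows in which cases the
// filter was actually consulted.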
TEST_F(DBTest2, AutoPrefixMode1) {
  // create a DB with block prefix index
  BlockBasedTableOptions table_options;
  Options options = CurrentOptions();
  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
  options.statistics = CreateDBStatistics();

  Reopen(options);

  Random rnd(301);
  std::string large_value = rnd.RandomString(500);

  ASSERT_OK(Put("a1", large_value));
  ASSERT_OK(Put("x1", large_value));
  ASSERT_OK(Put("y1", large_value));
  ASSERT_OK(Flush());

  ReadOptions ro;
  ro.total_order_seek = false;
  ro.auto_prefix_mode = true;
  {
    std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
    iterator->Seek("b1");
    ASSERT_TRUE(iterator->Valid());
    ASSERT_EQ("x1", iterator->key().ToString());
    ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
    ASSERT_OK(iterator->status());
  }

  std::string ub_str = "b9";
  Slice ub(ub_str);
  ro.iterate_upper_bound = &ub;

  {
    std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
    iterator->Seek("b1");
    ASSERT_FALSE(iterator->Valid());
    ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
    ASSERT_OK(iterator->status());
  }

  ub_str = "z";
  ub = Slice(ub_str);
  {
    std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
    iterator->Seek("b1");
    ASSERT_TRUE(iterator->Valid());
    ASSERT_EQ("x1", iterator->key().ToString());
    ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
    ASSERT_OK(iterator->status());
  }

  ub_str = "c";
  ub = Slice(ub_str);
  {
    std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
    iterator->Seek("b1");
    ASSERT_FALSE(iterator->Valid());
    ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
    ASSERT_OK(iterator->status());
  }

  // The same queries without recreating iterator
  {
    ub_str = "b9";
    ub = Slice(ub_str);
    ro.iterate_upper_bound = &ub;

    std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
    iterator->Seek("b1");
    ASSERT_FALSE(iterator->Valid());
    ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
    ASSERT_OK(iterator->status());

    ub_str = "z";
    ub = Slice(ub_str);

    iterator->Seek("b1");
    ASSERT_TRUE(iterator->Valid());
    ASSERT_EQ("x1", iterator->key().ToString());
    ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));

    ub_str = "c";
    ub = Slice(ub_str);

    iterator->Seek("b1");
    ASSERT_FALSE(iterator->Valid());
    ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));

    ub_str = "b9";
    ub = Slice(ub_str);
    ro.iterate_upper_bound = &ub;
    iterator->SeekForPrev("b1");
    ASSERT_TRUE(iterator->Valid());
    ASSERT_EQ("a1", iterator->key().ToString());
    ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));

    ub_str = "zz";
    ub = Slice(ub_str);
    ro.iterate_upper_bound = &ub;
    iterator->SeekToLast();
    ASSERT_TRUE(iterator->Valid());
    ASSERT_EQ("y1", iterator->key().ToString());

    iterator->SeekToFirst();
    ASSERT_TRUE(iterator->Valid());
    ASSERT_EQ("a1", iterator->key().ToString());
  }
}

class RenameCurrentTest : public DBTestBase,
                          public testing::WithParamInterface<std::string> {
 public:
  RenameCurrentTest()
      : DBTestBase("rename_current_test", /*env_do_fsync=*/true),
        sync_point_(GetParam()) {}

  ~RenameCurrentTest() override {}

  void SetUp() override {
    env_->no_file_overwrite_.store(true, std::memory_order_release);
  }

  void TearDown() override {
    env_->no_file_overwrite_.store(false, std::memory_order_release);
  }

  void SetupSyncPoints() {
    SyncPoint::GetInstance()->DisableProcessing();
    SyncPoint::GetInstance()->SetCallBack(sync_point_, [&](void* arg) {
      Status* s = reinterpret_cast<Status*>(arg);
      assert(s);
      *s = Status::IOError("Injected IO error.");
    });
  }

  const std::string sync_point_;
};

INSTANTIATE_TEST_CASE_P(DistributedFS, RenameCurrentTest,
                        ::testing::Values("SetCurrentFile:BeforeRename",
                                          "SetCurrentFile:AfterRename"));

TEST_P(RenameCurrentTest, Open) {
  Destroy(last_options_);
  Options options = GetDefaultOptions();
  options.create_if_missing = true;
  SetupSyncPoints();
  SyncPoint::GetInstance()->EnableProcessing();
  Status s = TryReopen(options);
  ASSERT_NOK(s);

  SyncPoint::GetInstance()->DisableProcessing();
  Reopen(options);
}

TEST_P(RenameCurrentTest, Flush) {
  Destroy(last_options_);
  Options options = GetDefaultOptions();
  options.max_manifest_file_size = 1;
  options.create_if_missing = true;
  Reopen(options);
  ASSERT_OK(Put("key", "value"));
  SetupSyncPoints();
  SyncPoint::GetInstance()->EnableProcessing();
  ASSERT_NOK(Flush());

  ASSERT_NOK(Put("foo", "value"));

  SyncPoint::GetInstance()->DisableProcessing();
  Reopen(options);
  ASSERT_EQ("value", Get("key"));
  ASSERT_EQ("NOT_FOUND", Get("foo"));
}

TEST_P(RenameCurrentTest, Compaction) {
  Destroy(last_options_);
  Options options = GetDefaultOptions();
  options.max_manifest_file_size = 1;
  options.create_if_missing = true;
  Reopen(options);
  ASSERT_OK(Put("a", "a_value"));
  ASSERT_OK(Put("c", "c_value"));
  ASSERT_OK(Flush());

  ASSERT_OK(Put("b", "b_value"));
  ASSERT_OK(Put("d", "d_value"));
  ASSERT_OK(Flush());

  SetupSyncPoints();
  SyncPoint::GetInstance()->EnableProcessing();
  ASSERT_NOK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
                               /*end=*/nullptr));

  ASSERT_NOK(Put("foo", "value"));

  SyncPoint::GetInstance()->DisableProcessing();
  Reopen(options);
  ASSERT_EQ("NOT_FOUND", Get("foo"));
  ASSERT_EQ("d_value", Get("d"));
}

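// Files compacted into the bottommost level should be tagged with the
// configured bottommost_temperature, and reads from them should be
// accounted under the matching per-temperature IO statistics.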
TEST_F(DBTest2, BottommostTemperature) {
  Options options = CurrentOptions();
  options.bottommost_temperature = Temperature::kWarm;
  options.level0_file_num_compaction_trigger = 2;
  Reopen(options);

  auto size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kHot);
  ASSERT_EQ(size, 0);

  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("bar", "bar"));
  ASSERT_OK(Flush());
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("bar", "bar"));
  ASSERT_OK(Flush());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  get_iostats_context()->Reset();
  IOStatsContext* iostats = get_iostats_context();

  ColumnFamilyMetaData metadata;
  db_->GetColumnFamilyMetaData(&metadata);
  ASSERT_EQ(1, metadata.file_count);
  ASSERT_EQ(Temperature::kWarm, metadata.levels[1].files[0].temperature);
  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_GT(size, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);

  ASSERT_EQ("bar", Get("foo"));

  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
  ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);

  // non-bottommost file still has unknown temperature
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("bar", "bar"));
  ASSERT_OK(Flush());
  ASSERT_EQ("bar", Get("bar"));
  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
  ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);

  db_->GetColumnFamilyMetaData(&metadata);
  ASSERT_EQ(2, metadata.file_count);
  ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_GT(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_GT(size, 0);

  // reopen and check the information is persisted
  Reopen(options);
  db_->GetColumnFamilyMetaData(&metadata);
  ASSERT_EQ(2, metadata.file_count);
  ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
  ASSERT_EQ(Temperature::kWarm, metadata.levels[1].files[0].temperature);
  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_GT(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_GT(size, 0);

  // check other nonexistent temperatures
  size = GetSstSizeHelper(Temperature::kHot);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kCold);
  ASSERT_EQ(size, 0);
  std::string prop;
  ASSERT_TRUE(dbfull()->GetProperty(
      DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
      &prop));
  ASSERT_EQ(std::atoi(prop.c_str()), 0);
}

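// Universal compaction variant: existing bottommost files keep their
// recorded temperature across a reopen; only files newly generated in the
// bottommost level pick up an updated bottommost_temperature setting.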
TEST_F(DBTest2, BottommostTemperatureUniversal) {
  const int kTriggerNum = 3;
  const int kNumLevels = 5;
  const int kBottommostLevel = kNumLevels - 1;
  Options options = CurrentOptions();
  options.compaction_style = kCompactionStyleUniversal;
  options.level0_file_num_compaction_trigger = kTriggerNum;
  options.num_levels = kNumLevels;

  DestroyAndReopen(options);

  auto size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kHot);
  ASSERT_EQ(size, 0);
  get_iostats_context()->Reset();
  IOStatsContext* iostats = get_iostats_context();

  for (int i = 0; i < kTriggerNum; i++) {
    ASSERT_OK(Put("foo", "bar"));
    ASSERT_OK(Put("bar", "bar"));
    ASSERT_OK(Flush());
  }
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  ColumnFamilyMetaData metadata;
  db_->GetColumnFamilyMetaData(&metadata);
  ASSERT_EQ(1, metadata.file_count);
  ASSERT_EQ(Temperature::kUnknown,
            metadata.levels[kBottommostLevel].files[0].temperature);
  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_GT(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_EQ(size, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
  ASSERT_EQ("bar", Get("foo"));

  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);

  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("bar", "bar"));
  ASSERT_OK(Flush());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  db_->GetColumnFamilyMetaData(&metadata);
  ASSERT_EQ(2, metadata.file_count);
  ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_GT(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_EQ(size, 0);

  // Update bottommost temperature
  options.bottommost_temperature = Temperature::kWarm;
  Reopen(options);
  db_->GetColumnFamilyMetaData(&metadata);
  // Should not impact existing ones
  ASSERT_EQ(Temperature::kUnknown,
            metadata.levels[kBottommostLevel].files[0].temperature);
  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_GT(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_EQ(size, 0);

  // A newly generated file should pick up the new setting.
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
  db_->GetColumnFamilyMetaData(&metadata);
  ASSERT_EQ(1, metadata.file_count);
  ASSERT_EQ(Temperature::kWarm,
            metadata.levels[kBottommostLevel].files[0].temperature);
  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_GT(size, 0);

  // non-bottommost file still has unknown temperature
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("bar", "bar"));
  ASSERT_OK(Flush());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  db_->GetColumnFamilyMetaData(&metadata);
  ASSERT_EQ(2, metadata.file_count);
  ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_GT(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_GT(size, 0);

  // check other nonexistent temperatures
  size = GetSstSizeHelper(Temperature::kHot);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kCold);
  ASSERT_EQ(size, 0);
  std::string prop;
  ASSERT_TRUE(dbfull()->GetProperty(
      DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
      &prop));
  ASSERT_EQ(std::atoi(prop.c_str()), 0);
}
#endif  // ROCKSDB_LITE

// WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery.
TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) {
  Options options = CurrentOptions();
  DestroyAndReopen(options);
  ASSERT_OK(Put("foo", "value0"));
  Close();
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
  bool should_inject_error = false;
  SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::RecoverLogFiles:BeforeReadWal",
      [&](void* /*arg*/) { should_inject_error = true; });
  SyncPoint::GetInstance()->SetCallBack(
      "LogReader::ReadMore:AfterReadFile", [&](void* arg) {
        if (should_inject_error) {
          ASSERT_NE(nullptr, arg);
          *reinterpret_cast<Status*>(arg) = Status::IOError("Injected IOError");
        }
      });
  SyncPoint::GetInstance()->EnableProcessing();
  options.avoid_flush_during_recovery = true;
  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
  Status s = TryReopen(options);
  ASSERT_TRUE(s.IsIOError());
}

TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) {
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBImpl::BackgroundCallFlush:Start:1",
        "PointInTimeRecoveryWithSyncFailureInCFCreation:1"},
       {"PointInTimeRecoveryWithSyncFailureInCFCreation:2",
        "DBImpl::BackgroundCallFlush:Start:2"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  CreateColumnFamilies({"test1"}, Options());
  ASSERT_OK(Put("foo", "bar"));

  // Create a CF while a flush is in progress: the current log is synced,
  // but the closed log file's sync fails and leaves it corrupted.
  port::Thread flush_thread([&]() { ASSERT_NOK(Flush()); });
  TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:1");
  CreateColumnFamilies({"test2"}, Options());
  env_->corrupt_in_sync_ = true;
  TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:2");
  flush_thread.join();
  env_->corrupt_in_sync_ = false;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();

  // Reopening the DB should not corrupt anything
  Options options = CurrentOptions();
  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
  ReopenWithColumnFamilies({"default", "test1", "test2"}, options);
}

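// The whole DB directory can be renamed on the file system and the DB
// reopened from the new path.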
TEST_F(DBTest2, RenameDirectory) {
  Options options = CurrentOptions();
  DestroyAndReopen(options);
  ASSERT_OK(Put("foo", "value0"));
  Close();
  auto old_dbname = dbname_;
  auto new_dbname = dbname_ + "_2";
  EXPECT_OK(env_->RenameFile(dbname_, new_dbname));
  options.create_if_missing = false;
  dbname_ = new_dbname;
  ASSERT_OK(TryReopen(options));
  ASSERT_EQ("value0", Get("foo"));
  Destroy(options);
  dbname_ = old_dbname;
}
}  // namespace ROCKSDB_NAMESPACE

#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
extern "C" {
void RegisterCustomObjects(int argc, char** argv);
}
#else
void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
#endif  // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS

int main(int argc, char** argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  ::testing::InitGoogleTest(&argc, argv);
  RegisterCustomObjects(argc, argv);
  return RUN_ALL_TESTS();
}