1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #include <atomic>
11 #include <cstdlib>
12 #include <functional>
13 #include <memory>
14
15 #include "db/db_test_util.h"
16 #include "db/read_callback.h"
17 #include "options/options_helper.h"
18 #include "port/port.h"
19 #include "port/stack_trace.h"
20 #include "rocksdb/iostats_context.h"
21 #include "rocksdb/persistent_cache.h"
22 #include "rocksdb/trace_record.h"
23 #include "rocksdb/trace_record_result.h"
24 #include "rocksdb/utilities/replayer.h"
25 #include "rocksdb/wal_filter.h"
26 #include "test_util/testutil.h"
27 #include "util/random.h"
28 #include "utilities/fault_injection_env.h"
29
30 namespace ROCKSDB_NAMESPACE {
31
32 class DBTest2 : public DBTestBase {
33 public:
34   DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {}
35
36 protected:
37 #ifndef ROCKSDB_LITE
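  // Returns the total size of live SST files kept at the given temperature,
  // as reported by the kLiveSstFilesSizeAtTemperature DB property.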
38   uint64_t GetSstSizeHelper(Temperature temperature) {
39 std::string prop;
40 EXPECT_TRUE(
41 dbfull()->GetProperty(DB::Properties::kLiveSstFilesSizeAtTemperature +
42 ToString(static_cast<uint8_t>(temperature)),
43 &prop));
44 return static_cast<uint64_t>(std::atoi(prop.c_str()));
45 }
46 #endif // ROCKSDB_LITE
47 };
48
49 #ifndef ROCKSDB_LITE
50 TEST_F(DBTest2, OpenForReadOnly) {
51 DB* db_ptr = nullptr;
52 std::string dbname = test::PerThreadDBPath("db_readonly");
53 Options options = CurrentOptions();
54 options.create_if_missing = true;
55 // OpenForReadOnly should fail but will create <dbname> in the file system
56 ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr));
57 // Since <dbname> is created, we should be able to delete the dir
58   // We first get the list of files under <dbname>
59 // There should not be any subdirectories -- this is not checked here
60 std::vector<std::string> files;
61 ASSERT_OK(env_->GetChildren(dbname, &files));
62 for (auto& f : files) {
63 ASSERT_OK(env_->DeleteFile(dbname + "/" + f));
64 }
65 // <dbname> should be empty now and we should be able to delete it
66 ASSERT_OK(env_->DeleteDir(dbname));
67 options.create_if_missing = false;
68 // OpenForReadOnly should fail since <dbname> was successfully deleted
69 ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr));
70 // With create_if_missing false, there should not be a dir in the file system
71 ASSERT_NOK(env_->FileExists(dbname));
72 }
73
74 TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) {
75 DB* db_ptr = nullptr;
76 std::string dbname = test::PerThreadDBPath("db_readonly");
77 Options options = CurrentOptions();
78 options.create_if_missing = true;
79
80 ColumnFamilyOptions cf_options(options);
81 std::vector<ColumnFamilyDescriptor> column_families;
82 column_families.push_back(
83 ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
84 column_families.push_back(ColumnFamilyDescriptor("goku", cf_options));
85 std::vector<ColumnFamilyHandle*> handles;
86 // OpenForReadOnly should fail but will create <dbname> in the file system
87 ASSERT_NOK(
88 DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr));
89 // Since <dbname> is created, we should be able to delete the dir
90   // We first get the list of files under <dbname>
91 // There should not be any subdirectories -- this is not checked here
92 std::vector<std::string> files;
93 ASSERT_OK(env_->GetChildren(dbname, &files));
94 for (auto& f : files) {
95 ASSERT_OK(env_->DeleteFile(dbname + "/" + f));
96 }
97 // <dbname> should be empty now and we should be able to delete it
98 ASSERT_OK(env_->DeleteDir(dbname));
99 options.create_if_missing = false;
100 // OpenForReadOnly should fail since <dbname> was successfully deleted
101 ASSERT_NOK(
102 DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr));
103 // With create_if_missing false, there should not be a dir in the file system
104 ASSERT_NOK(env_->FileExists(dbname));
105 }
106
107 class TestReadOnlyWithCompressedCache
108 : public DBTestBase,
109 public testing::WithParamInterface<std::tuple<int, bool>> {
110 public:
111   TestReadOnlyWithCompressedCache()
112 : DBTestBase("test_readonly_with_compressed_cache",
113 /*env_do_fsync=*/true) {
114 max_open_files_ = std::get<0>(GetParam());
115 use_mmap_ = std::get<1>(GetParam());
116 }
117 int max_open_files_;
118 bool use_mmap_;
119 };
120
121 TEST_P(TestReadOnlyWithCompressedCache, ReadOnlyWithCompressedCache) {
122 if (use_mmap_ && !IsMemoryMappedAccessSupported()) {
123 ROCKSDB_GTEST_SKIP("Test requires MMAP support");
124 return;
125 }
126 ASSERT_OK(Put("foo", "bar"));
127 ASSERT_OK(Put("foo2", "barbarbarbarbarbarbarbar"));
128 ASSERT_OK(Flush());
129
130 DB* db_ptr = nullptr;
131 Options options = CurrentOptions();
132 options.allow_mmap_reads = use_mmap_;
133 options.max_open_files = max_open_files_;
134 options.compression = kSnappyCompression;
135 BlockBasedTableOptions table_options;
136 table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
137 table_options.no_block_cache = true;
138 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
139 options.statistics = CreateDBStatistics();
140
141 ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db_ptr));
142
143 std::string v;
144 ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v));
145 ASSERT_EQ("bar", v);
146 ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
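  // The second read of "foo" should be served from the compressed block
  // cache, except with mmap reads, which do not populate it (checked below).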
147 ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v));
148 ASSERT_EQ("bar", v);
149 if (Snappy_Supported()) {
150 if (use_mmap_) {
151 ASSERT_EQ(0,
152 options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
153 } else {
154 ASSERT_EQ(1,
155 options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
156 }
157 }
158
159 delete db_ptr;
160 }
161
162 INSTANTIATE_TEST_CASE_P(TestReadOnlyWithCompressedCache,
163 TestReadOnlyWithCompressedCache,
164 ::testing::Combine(::testing::Values(-1, 100),
165 ::testing::Bool()));
166
167 class PartitionedIndexTestListener : public EventListener {
168 public:
169   void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
170 ASSERT_GT(info.table_properties.index_partitions, 1);
171 ASSERT_EQ(info.table_properties.index_key_is_user_key, 0);
172 }
173 };
174
175 TEST_F(DBTest2, PartitionedIndexUserToInternalKey) {
176 const int kValueSize = 10500;
177 const int kNumEntriesPerFile = 1000;
178 const int kNumFiles = 3;
179 const int kNumDistinctKeys = 30;
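  // Repeatedly overwriting a small set of user keys while holding a snapshot
  // per write keeps many versions of each key alive, so index entries cannot
  // be shortened to user keys; the listener checks index_key_is_user_key == 0.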
180
181 BlockBasedTableOptions table_options;
182 Options options = CurrentOptions();
183 options.disable_auto_compactions = true;
184 table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
185 PartitionedIndexTestListener* listener = new PartitionedIndexTestListener();
186 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
187 options.listeners.emplace_back(listener);
188 std::vector<const Snapshot*> snapshots;
189 Reopen(options);
190 Random rnd(301);
191
192 for (int i = 0; i < kNumFiles; i++) {
193 for (int j = 0; j < kNumEntriesPerFile; j++) {
194 int key_id = (i * kNumEntriesPerFile + j) % kNumDistinctKeys;
195 std::string value = rnd.RandomString(kValueSize);
196 ASSERT_OK(Put("keykey_" + std::to_string(key_id), value));
197 snapshots.push_back(db_->GetSnapshot());
198 }
199 ASSERT_OK(Flush());
200 }
201
202 for (auto s : snapshots) {
203 db_->ReleaseSnapshot(s);
204 }
205 }
206
207 #endif // ROCKSDB_LITE
208
209 class PrefixFullBloomWithReverseComparator
210 : public DBTestBase,
211 public ::testing::WithParamInterface<bool> {
212 public:
213   PrefixFullBloomWithReverseComparator()
214 : DBTestBase("prefix_bloom_reverse", /*env_do_fsync=*/true) {}
215   void SetUp() override { if_cache_filter_ = GetParam(); }
216 bool if_cache_filter_;
217 };
218
219 TEST_P(PrefixFullBloomWithReverseComparator,
220 PrefixFullBloomWithReverseComparator) {
221 Options options = last_options_;
222 options.comparator = ReverseBytewiseComparator();
223 options.prefix_extractor.reset(NewCappedPrefixTransform(3));
224 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
225 BlockBasedTableOptions bbto;
226 if (if_cache_filter_) {
227 bbto.no_block_cache = false;
228 bbto.cache_index_and_filter_blocks = true;
229 bbto.block_cache = NewLRUCache(1);
230 }
231 bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
232 bbto.whole_key_filtering = false;
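  // NewBloomFilterPolicy(10, false) selects a full (not block-based) filter;
  // with whole_key_filtering off, only the capped 3-byte prefixes are added.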
233 options.table_factory.reset(NewBlockBasedTableFactory(bbto));
234 DestroyAndReopen(options);
235
236 ASSERT_OK(dbfull()->Put(WriteOptions(), "bar123", "foo"));
237 ASSERT_OK(dbfull()->Put(WriteOptions(), "bar234", "foo2"));
238 ASSERT_OK(dbfull()->Put(WriteOptions(), "foo123", "foo3"));
239
240 ASSERT_OK(dbfull()->Flush(FlushOptions()));
241
242 if (bbto.block_cache) {
243 bbto.block_cache->EraseUnRefEntries();
244 }
245
246 std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
247 iter->Seek("bar345");
248 ASSERT_OK(iter->status());
249 ASSERT_TRUE(iter->Valid());
250 ASSERT_EQ("bar234", iter->key().ToString());
251 ASSERT_EQ("foo2", iter->value().ToString());
252 iter->Next();
253 ASSERT_TRUE(iter->Valid());
254 ASSERT_EQ("bar123", iter->key().ToString());
255 ASSERT_EQ("foo", iter->value().ToString());
256
257 iter->Seek("foo234");
258 ASSERT_OK(iter->status());
259 ASSERT_TRUE(iter->Valid());
260 ASSERT_EQ("foo123", iter->key().ToString());
261 ASSERT_EQ("foo3", iter->value().ToString());
262
263 iter->Seek("bar");
264 ASSERT_OK(iter->status());
265 ASSERT_TRUE(!iter->Valid());
266 }
267
268 INSTANTIATE_TEST_CASE_P(PrefixFullBloomWithReverseComparator,
269 PrefixFullBloomWithReverseComparator, testing::Bool());
270
271 TEST_F(DBTest2, IteratorPropertyVersionNumber) {
272 ASSERT_OK(Put("", ""));
273 Iterator* iter1 = db_->NewIterator(ReadOptions());
274 ASSERT_OK(iter1->status());
275 std::string prop_value;
276 ASSERT_OK(
277 iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
278 uint64_t version_number1 =
279 static_cast<uint64_t>(std::atoi(prop_value.c_str()));
280
281 ASSERT_OK(Put("", ""));
282 ASSERT_OK(Flush());
283
284 Iterator* iter2 = db_->NewIterator(ReadOptions());
285 ASSERT_OK(iter2->status());
286 ASSERT_OK(
287 iter2->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
288 uint64_t version_number2 =
289 static_cast<uint64_t>(std::atoi(prop_value.c_str()));
290
291 ASSERT_GT(version_number2, version_number1);
292
293 ASSERT_OK(Put("", ""));
294
295 Iterator* iter3 = db_->NewIterator(ReadOptions());
296 ASSERT_OK(iter3->status());
297 ASSERT_OK(
298 iter3->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
299 uint64_t version_number3 =
300 static_cast<uint64_t>(std::atoi(prop_value.c_str()));
301
302 ASSERT_EQ(version_number2, version_number3);
303
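  // An iterator pins its super version at creation, so re-seeking iter1 must
  // still report its original super-version number.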
304 iter1->SeekToFirst();
305 ASSERT_OK(
306 iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
307 uint64_t version_number1_new =
308 static_cast<uint64_t>(std::atoi(prop_value.c_str()));
309 ASSERT_EQ(version_number1, version_number1_new);
310
311 delete iter1;
312 delete iter2;
313 delete iter3;
314 }
315
316 TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) {
317 Options options = CurrentOptions();
318 options.create_if_missing = true;
319 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
320 BlockBasedTableOptions table_options;
321 table_options.cache_index_and_filter_blocks = true;
322 table_options.filter_policy.reset(NewBloomFilterPolicy(20));
323 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
324 CreateAndReopenWithCF({"pikachu"}, options);
325
326 ASSERT_OK(Put(1, "a", "begin"));
327 ASSERT_OK(Put(1, "z", "end"));
328 ASSERT_OK(Flush(1));
329 TryReopenWithColumnFamilies({"default", "pikachu"}, options);
330
331 std::string value;
332 value = Get(1, "a");
333 }
334
335 TEST_F(DBTest2, MaxSuccessiveMergesChangeWithDBRecovery) {
336 Options options = CurrentOptions();
337 options.create_if_missing = true;
338 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
339 options.max_successive_merges = 3;
340 options.merge_operator = MergeOperators::CreatePutOperator();
341 options.disable_auto_compactions = true;
342 DestroyAndReopen(options);
343 ASSERT_OK(Put("poi", "Finch"));
344 ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Reese"));
345 ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Shaw"));
346 ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Root"));
347 options.max_successive_merges = 2;
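  // Reopen with a smaller max_successive_merges; recovery replays the merges
  // above from the WAL and must apply them cleanly under the new limit.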
348 Reopen(options);
349 }
350
351 #ifndef ROCKSDB_LITE
352 class DBTestSharedWriteBufferAcrossCFs
353 : public DBTestBase,
354 public testing::WithParamInterface<std::tuple<bool, bool>> {
355 public:
356   DBTestSharedWriteBufferAcrossCFs()
357 : DBTestBase("db_test_shared_write_buffer", /*env_do_fsync=*/true) {}
358   void SetUp() override {
359 use_old_interface_ = std::get<0>(GetParam());
360 cost_cache_ = std::get<1>(GetParam());
361 }
362 bool use_old_interface_;
363 bool cost_cache_;
364 };
365
366 TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
367 Options options = CurrentOptions();
368 options.arena_block_size = 4096;
369 auto flush_listener = std::make_shared<FlushCounterListener>();
370 options.listeners.push_back(flush_listener);
371 // Don't trip the listener at shutdown.
372 options.avoid_flush_during_shutdown = true;
373
374   // Avoid nondeterministic values from malloc_usable_size();
375   // force the arena block size to 1.
376 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
377 "Arena::Arena:0", [&](void* arg) {
378 size_t* block_size = static_cast<size_t*>(arg);
379 *block_size = 1;
380 });
381
382 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
383 "Arena::AllocateNewBlock:0", [&](void* arg) {
384 std::pair<size_t*, size_t*>* pair =
385 static_cast<std::pair<size_t*, size_t*>*>(arg);
386 *std::get<0>(*pair) = *std::get<1>(*pair);
387 });
388 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
389
390 // The total soft write buffer size is about 105000
391 std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
392 ASSERT_LT(cache->GetUsage(), 256 * 1024);
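  // When cost_cache_ is true, the write buffer manager charges the cache in
  // 256KB dummy entries, which is what the 256 * 1024 comparisons below check.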
393
394 if (use_old_interface_) {
395 options.db_write_buffer_size = 120000; // this is the real limit
396 } else if (!cost_cache_) {
397 options.write_buffer_manager.reset(new WriteBufferManager(114285));
398 } else {
399 options.write_buffer_manager.reset(new WriteBufferManager(114285, cache));
400 }
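  // The write buffer manager begins scheduling flushes at roughly 7/8 of its
  // configured size, so the limits above correspond to a soft limit of about
  // 100000 to 105000, as noted in the comment above.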
401 options.write_buffer_size = 500000; // this is never hit
402 CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
403
404 WriteOptions wo;
405 wo.disableWAL = true;
406
407 std::function<void()> wait_flush = [&]() {
408 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
409 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
410 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
411 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3]));
412 };
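  // wait_flush() is called after each write so that any flush scheduled by the
  // write buffer manager finishes before SST file counts are checked.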
413
414   // Create some data and flush "default" and "nikitich" so that they are the
415   // column families with the most recently created memtables.
416 flush_listener->expected_flush_reason = FlushReason::kManualFlush;
417 ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
418 Flush(3);
419 ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
420 ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
421 Flush(0);
422 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
423 static_cast<uint64_t>(1));
424 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
425 static_cast<uint64_t>(1));
426
427 flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
428 ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
429 if (cost_cache_) {
430 ASSERT_GE(cache->GetUsage(), 256 * 1024);
431 ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
432 }
433 wait_flush();
434 ASSERT_OK(Put(0, Key(1), DummyString(60000), wo));
435 if (cost_cache_) {
436 ASSERT_GE(cache->GetUsage(), 256 * 1024);
437 ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
438 }
439 wait_flush();
440 ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
441 // No flush should trigger
442 wait_flush();
443 {
444 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
445 static_cast<uint64_t>(1));
446 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
447 static_cast<uint64_t>(0));
448 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
449 static_cast<uint64_t>(0));
450 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
451 static_cast<uint64_t>(1));
452 }
453
454 // Trigger a flush. Flushing "nikitich".
455 ASSERT_OK(Put(3, Key(2), DummyString(30000), wo));
456 wait_flush();
457 ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
458 wait_flush();
459 {
460 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
461 static_cast<uint64_t>(1));
462 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
463 static_cast<uint64_t>(0));
464 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
465 static_cast<uint64_t>(0));
466 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
467 static_cast<uint64_t>(2));
468 }
469
470 // Without hitting the threshold, no flush should trigger.
471 ASSERT_OK(Put(2, Key(1), DummyString(30000), wo));
472 wait_flush();
473 ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
474 wait_flush();
475 ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
476 wait_flush();
477 {
478 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
479 static_cast<uint64_t>(1));
480 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
481 static_cast<uint64_t>(0));
482 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
483 static_cast<uint64_t>(0));
484 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
485 static_cast<uint64_t>(2));
486 }
487
488 // Hit the write buffer limit again. "default"
489 // will have been flushed.
490 ASSERT_OK(Put(2, Key(2), DummyString(10000), wo));
491 wait_flush();
492 ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
493 wait_flush();
494 ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
495 wait_flush();
496 ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
497 wait_flush();
498 ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
499 wait_flush();
500 {
501 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
502 static_cast<uint64_t>(2));
503 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
504 static_cast<uint64_t>(0));
505 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
506 static_cast<uint64_t>(0));
507 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
508 static_cast<uint64_t>(2));
509 }
510
511   // Trigger another flush. This time "dobrynia" is flushed. "pikachu" should
512   // not be flushed, even though it has never been flushed.
513 ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
514 wait_flush();
515 ASSERT_OK(Put(2, Key(1), DummyString(80000), wo));
516 wait_flush();
517 ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
518 wait_flush();
519 ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
520 wait_flush();
521
522 {
523 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
524 static_cast<uint64_t>(2));
525 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
526 static_cast<uint64_t>(0));
527 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
528 static_cast<uint64_t>(1));
529 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
530 static_cast<uint64_t>(2));
531 }
532 if (cost_cache_) {
533 ASSERT_GE(cache->GetUsage(), 256 * 1024);
534 Close();
535 options.write_buffer_manager.reset();
536 last_options_.write_buffer_manager.reset();
537 ASSERT_LT(cache->GetUsage(), 256 * 1024);
538 }
539 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
540 }
541
542 INSTANTIATE_TEST_CASE_P(DBTestSharedWriteBufferAcrossCFs,
543 DBTestSharedWriteBufferAcrossCFs,
544 ::testing::Values(std::make_tuple(true, false),
545 std::make_tuple(false, false),
546 std::make_tuple(false, true)));
547
548 TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
549 std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2");
550 Options options = CurrentOptions();
551 options.arena_block_size = 4096;
552 auto flush_listener = std::make_shared<FlushCounterListener>();
553 options.listeners.push_back(flush_listener);
554 // Don't trip the listener at shutdown.
555 options.avoid_flush_during_shutdown = true;
556   // Avoid nondeterministic values from malloc_usable_size();
557   // force the arena block size to 1.
558 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
559 "Arena::Arena:0", [&](void* arg) {
560 size_t* block_size = static_cast<size_t*>(arg);
561 *block_size = 1;
562 });
563
564 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
565 "Arena::AllocateNewBlock:0", [&](void* arg) {
566 std::pair<size_t*, size_t*>* pair =
567 static_cast<std::pair<size_t*, size_t*>*>(arg);
568 *std::get<0>(*pair) = *std::get<1>(*pair);
569 });
570 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
571
572 options.write_buffer_size = 500000; // this is never hit
573 // Use a write buffer total size so that the soft limit is about
574 // 105000.
575 options.write_buffer_manager.reset(new WriteBufferManager(120000));
576 CreateAndReopenWithCF({"cf1", "cf2"}, options);
577
578 ASSERT_OK(DestroyDB(dbname2, options));
579 DB* db2 = nullptr;
580 ASSERT_OK(DB::Open(options, dbname2, &db2));
581
582 WriteOptions wo;
583 wo.disableWAL = true;
584
585 std::function<void()> wait_flush = [&]() {
586 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
587 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
588 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
589 ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
590 };
591
592 // Trigger a flush on cf2
593 flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
594 ASSERT_OK(Put(2, Key(1), DummyString(70000), wo));
595 wait_flush();
596 ASSERT_OK(Put(0, Key(1), DummyString(20000), wo));
597 wait_flush();
598
599 // Insert to DB2
600 ASSERT_OK(db2->Put(wo, Key(2), DummyString(20000)));
601 wait_flush();
602
603 ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
604 wait_flush();
605 ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
606 {
607 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") +
608 GetNumberOfSstFilesForColumnFamily(db_, "cf1") +
609 GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
610 static_cast<uint64_t>(1));
611 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
612 static_cast<uint64_t>(0));
613 }
614
615   // Trigger a flush of another CF in DB1
616 ASSERT_OK(db2->Put(wo, Key(2), DummyString(70000)));
617 wait_flush();
618 ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
619 wait_flush();
620 {
621 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
622 static_cast<uint64_t>(1));
623 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
624 static_cast<uint64_t>(0));
625 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
626 static_cast<uint64_t>(1));
627 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
628 static_cast<uint64_t>(0));
629 }
630
631 // Triggering flush in DB2.
632 ASSERT_OK(db2->Put(wo, Key(3), DummyString(40000)));
633 wait_flush();
634 ASSERT_OK(db2->Put(wo, Key(1), DummyString(1)));
635 wait_flush();
636 ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
637 {
638 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
639 static_cast<uint64_t>(1));
640 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
641 static_cast<uint64_t>(0));
642 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
643 static_cast<uint64_t>(1));
644 ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
645 static_cast<uint64_t>(1));
646 }
647
648 delete db2;
649 ASSERT_OK(DestroyDB(dbname2, options));
650
651 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
652 }
653
654 TEST_F(DBTest2, TestWriteBufferNoLimitWithCache) {
655 Options options = CurrentOptions();
656 options.arena_block_size = 4096;
657 std::shared_ptr<Cache> cache =
658 NewLRUCache(LRUCacheOptions(10000000, 1, false, 0.0));
659 options.write_buffer_size = 50000; // this is never hit
660   // A write buffer manager size of zero imposes no flush limit, but memtable
661   // memory is still charged against the cache.
662 options.write_buffer_manager.reset(new WriteBufferManager(0, cache));
663 Reopen(options);
664
665 ASSERT_OK(Put("foo", "bar"));
666 // One dummy entry is 256KB.
667 ASSERT_GT(cache->GetUsage(), 128000);
668 }
669
670 namespace {
671 void ValidateKeyExistence(DB* db, const std::vector<Slice>& keys_must_exist,
672 const std::vector<Slice>& keys_must_not_exist) {
673 // Ensure that expected keys exist
674 std::vector<std::string> values;
675 if (keys_must_exist.size() > 0) {
676 std::vector<Status> status_list =
677 db->MultiGet(ReadOptions(), keys_must_exist, &values);
678 for (size_t i = 0; i < keys_must_exist.size(); i++) {
679 ASSERT_OK(status_list[i]);
680 }
681 }
682
683 // Ensure that given keys don't exist
684 if (keys_must_not_exist.size() > 0) {
685 std::vector<Status> status_list =
686 db->MultiGet(ReadOptions(), keys_must_not_exist, &values);
687 for (size_t i = 0; i < keys_must_not_exist.size(); i++) {
688 ASSERT_TRUE(status_list[i].IsNotFound());
689 }
690 }
691 }
692
693 } // namespace
694
695 TEST_F(DBTest2, WalFilterTest) {
696 class TestWalFilter : public WalFilter {
697 private:
698 // Processing option that is requested to be applied at the given index
699 WalFilter::WalProcessingOption wal_processing_option_;
700 // Index at which to apply wal_processing_option_
701     // At other indexes the default WalProcessingOption::kContinueProcessing
702     // is returned.
703 size_t apply_option_at_record_index_;
704 // Current record index, incremented with each record encountered.
705 size_t current_record_index_;
706
707 public:
708 TestWalFilter(WalFilter::WalProcessingOption wal_processing_option,
709 size_t apply_option_for_record_index)
710 : wal_processing_option_(wal_processing_option),
711 apply_option_at_record_index_(apply_option_for_record_index),
712 current_record_index_(0) {}
713
714 WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
715 WriteBatch* /*new_batch*/,
716 bool* /*batch_changed*/) const override {
717 WalFilter::WalProcessingOption option_to_return;
718
719 if (current_record_index_ == apply_option_at_record_index_) {
720 option_to_return = wal_processing_option_;
721 }
722 else {
723 option_to_return = WalProcessingOption::kContinueProcessing;
724 }
725
726       // The filter is passed as a const object so that RocksDB does not
727       // modify it; however, we modify it for our own purposes here and hence
728       // cast the constness away.
729 (const_cast<TestWalFilter*>(this)->current_record_index_)++;
730
731 return option_to_return;
732 }
733
734 const char* Name() const override { return "TestWalFilter"; }
735 };
736
737 // Create 3 batches with two keys each
738 std::vector<std::vector<std::string>> batch_keys(3);
739
740 batch_keys[0].push_back("key1");
741 batch_keys[0].push_back("key2");
742 batch_keys[1].push_back("key3");
743 batch_keys[1].push_back("key4");
744 batch_keys[2].push_back("key5");
745 batch_keys[2].push_back("key6");
746
747 // Test with all WAL processing options
748 for (int option = 0;
749 option < static_cast<int>(
750 WalFilter::WalProcessingOption::kWalProcessingOptionMax);
751 option++) {
752 Options options = OptionsForLogIterTest();
753 DestroyAndReopen(options);
754 CreateAndReopenWithCF({ "pikachu" }, options);
755
756 // Write given keys in given batches
757 for (size_t i = 0; i < batch_keys.size(); i++) {
758 WriteBatch batch;
759 for (size_t j = 0; j < batch_keys[i].size(); j++) {
760 ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
761 }
762 ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
763 }
764
765 WalFilter::WalProcessingOption wal_processing_option =
766 static_cast<WalFilter::WalProcessingOption>(option);
767
768     // Create a test filter that applies wal_processing_option at the record
769     // with index apply_option_for_record_index (here 1, the second record)
770 size_t apply_option_for_record_index = 1;
771 TestWalFilter test_wal_filter(wal_processing_option,
772 apply_option_for_record_index);
773
774 // Reopen database with option to use WAL filter
775 options = OptionsForLogIterTest();
776 options.wal_filter = &test_wal_filter;
777 Status status =
778 TryReopenWithColumnFamilies({ "default", "pikachu" }, options);
779 if (wal_processing_option ==
780 WalFilter::WalProcessingOption::kCorruptedRecord) {
781 ASSERT_NOK(status);
782       // In case of corruption we can turn off paranoid_checks to reopen the
783       // database
784 options.paranoid_checks = false;
785 ReopenWithColumnFamilies({ "default", "pikachu" }, options);
786 }
787 else {
788 ASSERT_OK(status);
789 }
790
791 // Compute which keys we expect to be found
792 // and which we expect not to be found after recovery.
793 std::vector<Slice> keys_must_exist;
794 std::vector<Slice> keys_must_not_exist;
795 switch (wal_processing_option) {
796 case WalFilter::WalProcessingOption::kCorruptedRecord:
797 case WalFilter::WalProcessingOption::kContinueProcessing: {
798 fprintf(stderr, "Testing with complete WAL processing\n");
799 // we expect all records to be processed
800 for (size_t i = 0; i < batch_keys.size(); i++) {
801 for (size_t j = 0; j < batch_keys[i].size(); j++) {
802 keys_must_exist.push_back(Slice(batch_keys[i][j]));
803 }
804 }
805 break;
806 }
807 case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: {
808 fprintf(stderr,
809 "Testing with ignoring record %" ROCKSDB_PRIszt " only\n",
810 apply_option_for_record_index);
811 // We expect the record with apply_option_for_record_index to be not
812 // found.
813 for (size_t i = 0; i < batch_keys.size(); i++) {
814 for (size_t j = 0; j < batch_keys[i].size(); j++) {
815 if (i == apply_option_for_record_index) {
816 keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
817 }
818 else {
819 keys_must_exist.push_back(Slice(batch_keys[i][j]));
820 }
821 }
822 }
823 break;
824 }
825 case WalFilter::WalProcessingOption::kStopReplay: {
826 fprintf(stderr,
827 "Testing with stopping replay from record %" ROCKSDB_PRIszt
828 "\n",
829 apply_option_for_record_index);
830 // We expect records beyond apply_option_for_record_index to be not
831 // found.
832 for (size_t i = 0; i < batch_keys.size(); i++) {
833 for (size_t j = 0; j < batch_keys[i].size(); j++) {
834 if (i >= apply_option_for_record_index) {
835 keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
836 }
837 else {
838 keys_must_exist.push_back(Slice(batch_keys[i][j]));
839 }
840 }
841 }
842 break;
843 }
844 default:
845 FAIL(); // unhandled case
846 }
847
848 bool checked_after_reopen = false;
849
850 while (true) {
851       // Ensure that expected keys exist and unexpected keys do not exist
852       // after recovery
853 ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
854
855 if (checked_after_reopen) {
856 break;
857 }
858
859       // Reopen the database again to make sure the previous log(s) are not
860       // used (even if they were skipped). This time reopen without the WAL
861       // filter.
862 options = OptionsForLogIterTest();
863 ReopenWithColumnFamilies({ "default", "pikachu" }, options);
864
865 checked_after_reopen = true;
866 }
867 }
868 }
869
870 TEST_F(DBTest2, WalFilterTestWithChangeBatch) {
871 class ChangeBatchHandler : public WriteBatch::Handler {
872 private:
873 // Batch to insert keys in
874 WriteBatch* new_write_batch_;
875 // Number of keys to add in the new batch
876 size_t num_keys_to_add_in_new_batch_;
877 // Number of keys added to new batch
878 size_t num_keys_added_;
879
880 public:
881 ChangeBatchHandler(WriteBatch* new_write_batch,
882 size_t num_keys_to_add_in_new_batch)
883 : new_write_batch_(new_write_batch),
884 num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
885 num_keys_added_(0) {}
886 void Put(const Slice& key, const Slice& value) override {
887 if (num_keys_added_ < num_keys_to_add_in_new_batch_) {
888 ASSERT_OK(new_write_batch_->Put(key, value));
889 ++num_keys_added_;
890 }
891 }
892 };
893
894 class TestWalFilterWithChangeBatch : public WalFilter {
895 private:
896 // Index at which to start changing records
897 size_t change_records_from_index_;
898 // Number of keys to add in the new batch
899 size_t num_keys_to_add_in_new_batch_;
900 // Current record index, incremented with each record encountered.
901 size_t current_record_index_;
902
903 public:
904 TestWalFilterWithChangeBatch(size_t change_records_from_index,
905 size_t num_keys_to_add_in_new_batch)
906 : change_records_from_index_(change_records_from_index),
907 num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
908 current_record_index_(0) {}
909
910 WalProcessingOption LogRecord(const WriteBatch& batch,
911 WriteBatch* new_batch,
912 bool* batch_changed) const override {
913 if (current_record_index_ >= change_records_from_index_) {
914 ChangeBatchHandler handler(new_batch, num_keys_to_add_in_new_batch_);
915 Status s = batch.Iterate(&handler);
916 if (s.ok()) {
917 *batch_changed = true;
918 } else {
919 assert(false);
920 }
921 }
922
923       // The filter is passed as a const object so that RocksDB does not
924       // modify it; however, we modify it for our own purposes here and hence
925       // cast the constness away.
926 (const_cast<TestWalFilterWithChangeBatch*>(this)
927 ->current_record_index_)++;
928
929 return WalProcessingOption::kContinueProcessing;
930 }
931
932 const char* Name() const override { return "TestWalFilterWithChangeBatch"; }
933 };
934
935 std::vector<std::vector<std::string>> batch_keys(3);
936
937 batch_keys[0].push_back("key1");
938 batch_keys[0].push_back("key2");
939 batch_keys[1].push_back("key3");
940 batch_keys[1].push_back("key4");
941 batch_keys[2].push_back("key5");
942 batch_keys[2].push_back("key6");
943
944 Options options = OptionsForLogIterTest();
945 DestroyAndReopen(options);
946 CreateAndReopenWithCF({ "pikachu" }, options);
947
948 // Write given keys in given batches
949 for (size_t i = 0; i < batch_keys.size(); i++) {
950 WriteBatch batch;
951 for (size_t j = 0; j < batch_keys[i].size(); j++) {
952 ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
953 }
954 ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
955 }
956
957   // Create a test filter that rewrites batches starting at record index
958   // change_records_from_index, keeping num_keys_to_add_in_new_batch keys
959 size_t change_records_from_index = 1;
960 size_t num_keys_to_add_in_new_batch = 1;
961 TestWalFilterWithChangeBatch test_wal_filter_with_change_batch(
962 change_records_from_index, num_keys_to_add_in_new_batch);
963
964 // Reopen database with option to use WAL filter
965 options = OptionsForLogIterTest();
966 options.wal_filter = &test_wal_filter_with_change_batch;
967 ReopenWithColumnFamilies({ "default", "pikachu" }, options);
968
969   // Ensure that all keys exist before change_records_from_index_, and that
970   // after that index only a single key exists per batch,
971   // as our filter adds only a single key for each batch
972 std::vector<Slice> keys_must_exist;
973 std::vector<Slice> keys_must_not_exist;
974
975 for (size_t i = 0; i < batch_keys.size(); i++) {
976 for (size_t j = 0; j < batch_keys[i].size(); j++) {
977 if (i >= change_records_from_index && j >= num_keys_to_add_in_new_batch) {
978 keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
979 }
980 else {
981 keys_must_exist.push_back(Slice(batch_keys[i][j]));
982 }
983 }
984 }
985
986 bool checked_after_reopen = false;
987
988 while (true) {
989     // Ensure that expected keys exist and unexpected keys do not exist
990     // after recovery
991 ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
992
993 if (checked_after_reopen) {
994 break;
995 }
996
997     // Reopen the database again to make sure the previous log(s) are not
998     // used (even if they were skipped). This time reopen without the WAL
999     // filter.
1000 options = OptionsForLogIterTest();
1001 ReopenWithColumnFamilies({ "default", "pikachu" }, options);
1002
1003 checked_after_reopen = true;
1004 }
1005 }
1006
1007 TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) {
1008 class TestWalFilterWithChangeBatchAddExtraKeys : public WalFilter {
1009 public:
1010 WalProcessingOption LogRecord(const WriteBatch& batch, WriteBatch* new_batch,
1011 bool* batch_changed) const override {
1012 *new_batch = batch;
1013 Status s = new_batch->Put("key_extra", "value_extra");
1014 if (s.ok()) {
1015 *batch_changed = true;
1016 } else {
1017 assert(false);
1018 }
1019 return WalProcessingOption::kContinueProcessing;
1020 }
1021
1022 const char* Name() const override {
1023 return "WalFilterTestWithChangeBatchExtraKeys";
1024 }
1025 };
1026
1027 std::vector<std::vector<std::string>> batch_keys(3);
1028
1029 batch_keys[0].push_back("key1");
1030 batch_keys[0].push_back("key2");
1031 batch_keys[1].push_back("key3");
1032 batch_keys[1].push_back("key4");
1033 batch_keys[2].push_back("key5");
1034 batch_keys[2].push_back("key6");
1035
1036 Options options = OptionsForLogIterTest();
1037 DestroyAndReopen(options);
1038 CreateAndReopenWithCF({ "pikachu" }, options);
1039
1040 // Write given keys in given batches
1041 for (size_t i = 0; i < batch_keys.size(); i++) {
1042 WriteBatch batch;
1043 for (size_t j = 0; j < batch_keys[i].size(); j++) {
1044 ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
1045 }
1046 ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
1047 }
1048
1049 // Create a test filter that would add extra keys
1050 TestWalFilterWithChangeBatchAddExtraKeys test_wal_filter_extra_keys;
1051
1052 // Reopen database with option to use WAL filter
1053 options = OptionsForLogIterTest();
1054 options.wal_filter = &test_wal_filter_extra_keys;
1055 Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
1056 ASSERT_TRUE(status.IsNotSupported());
1057
1058 // Reopen without filter, now reopen should succeed - previous
1059 // attempt to open must not have altered the db.
1060 options = OptionsForLogIterTest();
1061 ReopenWithColumnFamilies({ "default", "pikachu" }, options);
1062
1063 std::vector<Slice> keys_must_exist;
1064 std::vector<Slice> keys_must_not_exist; // empty vector
1065
1066 for (size_t i = 0; i < batch_keys.size(); i++) {
1067 for (size_t j = 0; j < batch_keys[i].size(); j++) {
1068 keys_must_exist.push_back(Slice(batch_keys[i][j]));
1069 }
1070 }
1071
1072 ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
1073 }
1074
1075 TEST_F(DBTest2, WalFilterTestWithColumnFamilies) {
1076 class TestWalFilterWithColumnFamilies : public WalFilter {
1077 private:
1078 // column_family_id -> log_number map (provided to WALFilter)
1079 std::map<uint32_t, uint64_t> cf_log_number_map_;
1080 // column_family_name -> column_family_id map (provided to WALFilter)
1081 std::map<std::string, uint32_t> cf_name_id_map_;
1082 // column_family_name -> keys_found_in_wal map
1083 // We store keys that are applicable to the column_family
1084 // during recovery (i.e. aren't already flushed to SST file(s))
1085 // for verification against the keys we expect.
1086 std::map<uint32_t, std::vector<std::string>> cf_wal_keys_;
1087 public:
1088 void ColumnFamilyLogNumberMap(
1089 const std::map<uint32_t, uint64_t>& cf_lognumber_map,
1090 const std::map<std::string, uint32_t>& cf_name_id_map) override {
1091 cf_log_number_map_ = cf_lognumber_map;
1092 cf_name_id_map_ = cf_name_id_map;
1093 }
1094
1095 WalProcessingOption LogRecordFound(unsigned long long log_number,
1096 const std::string& /*log_file_name*/,
1097 const WriteBatch& batch,
1098 WriteBatch* /*new_batch*/,
1099 bool* /*batch_changed*/) override {
1100 class LogRecordBatchHandler : public WriteBatch::Handler {
1101 private:
1102 const std::map<uint32_t, uint64_t> & cf_log_number_map_;
1103 std::map<uint32_t, std::vector<std::string>> & cf_wal_keys_;
1104 unsigned long long log_number_;
1105 public:
1106 LogRecordBatchHandler(unsigned long long current_log_number,
1107 const std::map<uint32_t, uint64_t> & cf_log_number_map,
1108 std::map<uint32_t, std::vector<std::string>> & cf_wal_keys) :
1109 cf_log_number_map_(cf_log_number_map),
1110 cf_wal_keys_(cf_wal_keys),
1111 log_number_(current_log_number){}
1112
1113 Status PutCF(uint32_t column_family_id, const Slice& key,
1114 const Slice& /*value*/) override {
1115 auto it = cf_log_number_map_.find(column_family_id);
1116 assert(it != cf_log_number_map_.end());
1117 unsigned long long log_number_for_cf = it->second;
1118 // If the current record is applicable for column_family_id
1119 // (i.e. isn't flushed to SST file(s) for column_family_id)
1120 // add it to the cf_wal_keys_ map for verification.
1121 if (log_number_ >= log_number_for_cf) {
1122 cf_wal_keys_[column_family_id].push_back(std::string(key.data(),
1123 key.size()));
1124 }
1125 return Status::OK();
1126 }
1127 } handler(log_number, cf_log_number_map_, cf_wal_keys_);
1128
1129 Status s = batch.Iterate(&handler);
1130 if (!s.ok()) {
1131 // TODO(AR) is this ok?
1132 return WalProcessingOption::kCorruptedRecord;
1133 }
1134
1135 return WalProcessingOption::kContinueProcessing;
1136 }
1137
1138 const char* Name() const override {
1139 return "WalFilterTestWithColumnFamilies";
1140 }
1141
1142 const std::map<uint32_t, std::vector<std::string>>& GetColumnFamilyKeys() {
1143 return cf_wal_keys_;
1144 }
1145
1146 const std::map<std::string, uint32_t> & GetColumnFamilyNameIdMap() {
1147 return cf_name_id_map_;
1148 }
1149 };
1150
1151 std::vector<std::vector<std::string>> batch_keys_pre_flush(3);
1152
1153 batch_keys_pre_flush[0].push_back("key1");
1154 batch_keys_pre_flush[0].push_back("key2");
1155 batch_keys_pre_flush[1].push_back("key3");
1156 batch_keys_pre_flush[1].push_back("key4");
1157 batch_keys_pre_flush[2].push_back("key5");
1158 batch_keys_pre_flush[2].push_back("key6");
1159
1160 Options options = OptionsForLogIterTest();
1161 DestroyAndReopen(options);
1162 CreateAndReopenWithCF({ "pikachu" }, options);
1163
1164 // Write given keys in given batches
1165 for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
1166 WriteBatch batch;
1167 for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
1168 ASSERT_OK(batch.Put(handles_[0], batch_keys_pre_flush[i][j],
1169 DummyString(1024)));
1170 ASSERT_OK(batch.Put(handles_[1], batch_keys_pre_flush[i][j],
1171 DummyString(1024)));
1172 }
1173 ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
1174 }
1175
1176   // Flush default column-family
1177 ASSERT_OK(db_->Flush(FlushOptions(), handles_[0]));
1178
1179 // Do some more writes
1180 std::vector<std::vector<std::string>> batch_keys_post_flush(3);
1181
1182 batch_keys_post_flush[0].push_back("key7");
1183 batch_keys_post_flush[0].push_back("key8");
1184 batch_keys_post_flush[1].push_back("key9");
1185 batch_keys_post_flush[1].push_back("key10");
1186 batch_keys_post_flush[2].push_back("key11");
1187 batch_keys_post_flush[2].push_back("key12");
1188
1189 // Write given keys in given batches
1190 for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
1191 WriteBatch batch;
1192 for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
1193 ASSERT_OK(batch.Put(handles_[0], batch_keys_post_flush[i][j],
1194 DummyString(1024)));
1195 ASSERT_OK(batch.Put(handles_[1], batch_keys_post_flush[i][j],
1196 DummyString(1024)));
1197 }
1198 ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
1199 }
1200
1201   // On recovery we should only find the second batch applicable to the
1202   // default CF, but both batches applicable to the pikachu CF
1203
1204   // Create a test filter that records which WAL keys were applied to each CF
1205 TestWalFilterWithColumnFamilies test_wal_filter_column_families;
1206
1207 // Reopen database with option to use WAL filter
1208 options = OptionsForLogIterTest();
1209 options.wal_filter = &test_wal_filter_column_families;
1210 Status status =
1211 TryReopenWithColumnFamilies({ "default", "pikachu" }, options);
1212 ASSERT_TRUE(status.ok());
1213
1214 // verify that handles_[0] only has post_flush keys
1215 // while handles_[1] has pre and post flush keys
1216 auto cf_wal_keys = test_wal_filter_column_families.GetColumnFamilyKeys();
1217 auto name_id_map = test_wal_filter_column_families.GetColumnFamilyNameIdMap();
1218 size_t index = 0;
1219 auto keys_cf = cf_wal_keys[name_id_map[kDefaultColumnFamilyName]];
1220   // default column-family, only post_flush keys are expected
1221 for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
1222 for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
1223 Slice key_from_the_log(keys_cf[index++]);
1224 Slice batch_key(batch_keys_post_flush[i][j]);
1225 ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
1226 }
1227 }
1228 ASSERT_EQ(index, keys_cf.size());
1229
1230 index = 0;
1231 keys_cf = cf_wal_keys[name_id_map["pikachu"]];
1232   // pikachu column-family, all keys are expected
1233 for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
1234 for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
1235 Slice key_from_the_log(keys_cf[index++]);
1236 Slice batch_key(batch_keys_pre_flush[i][j]);
1237 ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
1238 }
1239 }
1240
1241 for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
1242 for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
1243 Slice key_from_the_log(keys_cf[index++]);
1244 Slice batch_key(batch_keys_post_flush[i][j]);
1245 ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
1246 }
1247 }
1248 ASSERT_EQ(index, keys_cf.size());
1249 }
1250
1251 TEST_F(DBTest2, PresetCompressionDict) {
1252 // Verifies that compression ratio improves when dictionary is enabled, and
1253 // improves even further when the dictionary is trained by ZSTD.
1254 const size_t kBlockSizeBytes = 4 << 10;
1255 const size_t kL0FileBytes = 128 << 10;
1256 const size_t kApproxPerBlockOverheadBytes = 50;
1257 const int kNumL0Files = 5;
1258
1259 Options options;
1260 // Make sure to use any custom env that the test is configured with.
1261 options.env = CurrentOptions().env;
1262 options.allow_concurrent_memtable_write = false;
1263 options.arena_block_size = kBlockSizeBytes;
1264 options.create_if_missing = true;
1265 options.disable_auto_compactions = true;
1266 options.level0_file_num_compaction_trigger = kNumL0Files;
1267 options.memtable_factory.reset(
1268 test::NewSpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes));
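  // The special skip-list factory makes the memtable report itself full after
  // kL0FileBytes / kBlockSizeBytes entries, so each flush yields an L0 file of
  // roughly kL0FileBytes.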
1269 options.num_levels = 2;
1270 options.target_file_size_base = kL0FileBytes;
1271 options.target_file_size_multiplier = 2;
1272 options.write_buffer_size = kL0FileBytes;
1273 BlockBasedTableOptions table_options;
1274 table_options.block_size = kBlockSizeBytes;
1275 std::vector<CompressionType> compression_types;
1276 if (Zlib_Supported()) {
1277 compression_types.push_back(kZlibCompression);
1278 }
1279 #if LZ4_VERSION_NUMBER >= 10400 // r124+
1280 compression_types.push_back(kLZ4Compression);
1281 compression_types.push_back(kLZ4HCCompression);
1282 #endif // LZ4_VERSION_NUMBER >= 10400
1283 if (ZSTD_Supported()) {
1284 compression_types.push_back(kZSTD);
1285 }
1286
1287 enum DictionaryTypes : int {
1288 kWithoutDict,
1289 kWithDict,
1290 kWithZSTDTrainedDict,
1291 kDictEnd,
1292 };
1293
1294 for (auto compression_type : compression_types) {
1295 options.compression = compression_type;
1296 size_t bytes_without_dict = 0;
1297 size_t bytes_with_dict = 0;
1298 size_t bytes_with_zstd_trained_dict = 0;
1299 for (int i = kWithoutDict; i < kDictEnd; i++) {
1300 // First iteration: compress without preset dictionary
1301 // Second iteration: compress with preset dictionary
1302 // Third iteration (zstd only): compress with zstd-trained dictionary
1303 //
1304 // To make sure the compression dictionary has the intended effect, we
1305 // verify the compressed size is smaller in successive iterations. Also in
1306 // the non-first iterations, verify the data we get out is the same data
1307 // we put in.
1308 switch (i) {
1309 case kWithoutDict:
1310 options.compression_opts.max_dict_bytes = 0;
1311 options.compression_opts.zstd_max_train_bytes = 0;
1312 break;
1313 case kWithDict:
1314 options.compression_opts.max_dict_bytes = kBlockSizeBytes;
1315 options.compression_opts.zstd_max_train_bytes = 0;
1316 break;
1317 case kWithZSTDTrainedDict:
1318 if (compression_type != kZSTD) {
1319 continue;
1320 }
1321 options.compression_opts.max_dict_bytes = kBlockSizeBytes;
1322 options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
1323 break;
1324 default:
1325 assert(false);
1326 }
1327
1328 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
1329 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
1330 CreateAndReopenWithCF({"pikachu"}, options);
1331 Random rnd(301);
1332 std::string seq_datas[10];
1333 for (int j = 0; j < 10; ++j) {
1334 seq_datas[j] =
1335 rnd.RandomString(kBlockSizeBytes - kApproxPerBlockOverheadBytes);
1336 }
1337
1338 ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
1339 for (int j = 0; j < kNumL0Files; ++j) {
1340 for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) {
1341 auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k;
1342 ASSERT_OK(Put(1, Key(static_cast<int>(key_num)),
1343 seq_datas[(key_num / 10) % 10]));
1344 }
1345 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
1346 ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1));
1347 }
1348 ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
1349 true /* disallow_trivial_move */));
1350 ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
1351 ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
1352
1353 // Get the live sst files size
1354 size_t total_sst_bytes = TotalSize(1);
1355 if (i == kWithoutDict) {
1356 bytes_without_dict = total_sst_bytes;
1357 } else if (i == kWithDict) {
1358 bytes_with_dict = total_sst_bytes;
1359 } else if (i == kWithZSTDTrainedDict) {
1360 bytes_with_zstd_trained_dict = total_sst_bytes;
1361 }
1362
1363 for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes);
1364 j++) {
1365 ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast<int>(j))));
1366 }
1367 if (i == kWithDict) {
1368 ASSERT_GT(bytes_without_dict, bytes_with_dict);
1369 } else if (i == kWithZSTDTrainedDict) {
1370 // In zstd compression, it is sometimes possible that using a trained
1371 // dictionary does not get as good a compression ratio as without
1372 // training.
1373 // But using a dictionary (with or without training) should always get
1374 // better compression ratio than not using one.
1375 ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
1376 bytes_without_dict > bytes_with_zstd_trained_dict);
1377 }
1378
1379 DestroyAndReopen(options);
1380 }
1381 }
1382 }
1383
1384 TEST_F(DBTest2, PresetCompressionDictLocality) {
1385 if (!ZSTD_Supported()) {
1386 return;
1387 }
1388 // Verifies that compression dictionary is generated from local data. The
1389 // verification simply checks all output SSTs have different compression
1390 // dictionaries. We do not verify effectiveness as that'd likely be flaky in
1391 // the future.
1392 const int kNumEntriesPerFile = 1 << 10; // 1KB
1393 const int kNumBytesPerEntry = 1 << 10; // 1KB
1394 const int kNumFiles = 4;
1395 Options options = CurrentOptions();
1396 options.compression = kZSTD;
1397 options.compression_opts.max_dict_bytes = 1 << 14; // 16KB
1398 options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB
1399 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
1400 options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
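  // With target_file_size_base equal to one input file's worth of data, the
  // full compaction below should produce several output files, each expected
  // to carry its own locally trained dictionary.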
1401 BlockBasedTableOptions table_options;
1402 table_options.cache_index_and_filter_blocks = true;
1403 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
1404 Reopen(options);
1405
1406 Random rnd(301);
1407 for (int i = 0; i < kNumFiles; ++i) {
1408 for (int j = 0; j < kNumEntriesPerFile; ++j) {
1409 ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j),
1410 rnd.RandomString(kNumBytesPerEntry)));
1411 }
1412 ASSERT_OK(Flush());
1413 MoveFilesToLevel(1);
1414 ASSERT_EQ(NumTableFilesAtLevel(1), i + 1);
1415 }
1416
1417 // Store all the dictionaries generated during a full compaction.
1418 std::vector<std::string> compression_dicts;
1419 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
1420 "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
1421 [&](void* arg) {
1422 compression_dicts.emplace_back(static_cast<Slice*>(arg)->ToString());
1423 });
1424 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
1425 CompactRangeOptions compact_range_opts;
1426 compact_range_opts.bottommost_level_compaction =
1427 BottommostLevelCompaction::kForceOptimized;
1428 ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
1429
1430 // Dictionary compression should not be so good as to compress four totally
1431 // random files into one. If it does then there's probably something wrong
1432 // with the test.
1433 ASSERT_GT(NumTableFilesAtLevel(1), 1);
1434
1435 // Furthermore, there should be one compression dictionary generated per file.
1436 // And they should all be different from each other.
1437 ASSERT_EQ(NumTableFilesAtLevel(1),
1438 static_cast<int>(compression_dicts.size()));
1439 for (size_t i = 1; i < compression_dicts.size(); ++i) {
1440 std::string& a = compression_dicts[i - 1];
1441 std::string& b = compression_dicts[i];
1442 size_t alen = a.size();
1443 size_t blen = b.size();
1444 ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0);
1445 }
1446 }
1447
1448 class PresetCompressionDictTest
1449 : public DBTestBase,
1450 public testing::WithParamInterface<std::tuple<CompressionType, bool>> {
1451 public:
1452   PresetCompressionDictTest()
1453 : DBTestBase("db_test2", false /* env_do_fsync */),
1454 compression_type_(std::get<0>(GetParam())),
1455 bottommost_(std::get<1>(GetParam())) {}
1456
1457 protected:
1458 const CompressionType compression_type_;
1459 const bool bottommost_;
1460 };
1461
1462 INSTANTIATE_TEST_CASE_P(
1463 DBTest2, PresetCompressionDictTest,
1464 ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()),
1465 ::testing::Bool()));
1466
1467 TEST_P(PresetCompressionDictTest, Flush) {
1468 // Verifies that dictionary is generated and written during flush only when
1469 // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the
1470 // size of the dictionary is within expectations according to the limit on
1471 // buffering set by `CompressionOptions::max_dict_buffer_bytes`.
1472 const size_t kValueLen = 256;
1473 const size_t kKeysPerFile = 1 << 10;
1474 const size_t kDictLen = 16 << 10;
1475 const size_t kBlockLen = 4 << 10;
1476
1477 Options options = CurrentOptions();
1478 if (bottommost_) {
1479 options.bottommost_compression = compression_type_;
1480 options.bottommost_compression_opts.enabled = true;
1481 options.bottommost_compression_opts.max_dict_bytes = kDictLen;
1482 options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
1483 } else {
1484 options.compression = compression_type_;
1485 options.compression_opts.max_dict_bytes = kDictLen;
1486 options.compression_opts.max_dict_buffer_bytes = kBlockLen;
1487 }
1488 options.memtable_factory.reset(test::NewSpecialSkipListFactory(kKeysPerFile));
1489 options.statistics = CreateDBStatistics();
1490 BlockBasedTableOptions bbto;
1491 bbto.block_size = kBlockLen;
1492 bbto.cache_index_and_filter_blocks = true;
1493 options.table_factory.reset(NewBlockBasedTableFactory(bbto));
1494 Reopen(options);
1495
1496 Random rnd(301);
1497 for (size_t i = 0; i <= kKeysPerFile; ++i) {
1498 ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(kValueLen)));
1499 }
1500 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
1501
1502 // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
1503 // compression dictionary exists since dictionaries would be preloaded when
1504 // the flush finishes.
1505 if (bottommost_) {
1506 // Flush is never considered bottommost. This should change in the future
1507 // since flushed files may have nothing underneath them, like the one in
1508 // this test case.
1509 ASSERT_EQ(
1510 TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
1511 0);
1512 } else {
1513 ASSERT_GT(
1514 TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
1515 0);
1516 // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
1517 // number of bytes needs to be adjusted in case the cached block is in
1518 // ZSTD's digested dictionary format.
1519 if (compression_type_ != kZSTD &&
1520 compression_type_ != kZSTDNotFinalCompression) {
1521 // Although we limited buffering to `kBlockLen`, there may be up to two
1522 // blocks of data included in the dictionary since we only check the limit
1523 // after each block is built.
1524 ASSERT_LE(TestGetTickerCount(options,
1525 BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
1526 2 * kBlockLen);
1527 }
1528 }
1529 }
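// Minimal sketch (illustrative only, not called by any test) of the detection
// idiom used in the assertions above and below: dictionary blocks that get
// preloaded into the block cache are visible through the
// BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT ticker of the Statistics object
// attached to the DB options.
namespace {
[[maybe_unused]] uint64_t ExampleCompressionDictBytesInserted(
    const Options& options) {
  // Cumulative number of compression-dictionary bytes inserted into the block
  // cache since the Statistics object was created.
  return options.statistics->getTickerCount(
      BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
}
}  // namespace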
1530
1531 TEST_P(PresetCompressionDictTest, CompactNonBottommost) {
1532 // Verifies that dictionary is generated and written during compaction to
1533 // non-bottommost level only when `ColumnFamilyOptions::compression` enables
1534 // dictionary. Also verifies the size of the dictionary is within expectations
1535 // according to the limit on buffering set by
1536 // `CompressionOptions::max_dict_buffer_bytes`.
1537 const size_t kValueLen = 256;
1538 const size_t kKeysPerFile = 1 << 10;
1539 const size_t kDictLen = 16 << 10;
1540 const size_t kBlockLen = 4 << 10;
1541
1542 Options options = CurrentOptions();
1543 if (bottommost_) {
1544 options.bottommost_compression = compression_type_;
1545 options.bottommost_compression_opts.enabled = true;
1546 options.bottommost_compression_opts.max_dict_bytes = kDictLen;
1547 options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
1548 } else {
1549 options.compression = compression_type_;
1550 options.compression_opts.max_dict_bytes = kDictLen;
1551 options.compression_opts.max_dict_buffer_bytes = kBlockLen;
1552 }
1553 options.disable_auto_compactions = true;
1554 options.statistics = CreateDBStatistics();
1555 BlockBasedTableOptions bbto;
1556 bbto.block_size = kBlockLen;
1557 bbto.cache_index_and_filter_blocks = true;
1558 options.table_factory.reset(NewBlockBasedTableFactory(bbto));
1559 Reopen(options);
1560
1561 Random rnd(301);
1562 for (size_t j = 0; j <= kKeysPerFile; ++j) {
1563 ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
1564 }
1565 ASSERT_OK(Flush());
1566 MoveFilesToLevel(2);
1567
1568 for (int i = 0; i < 2; ++i) {
1569 for (size_t j = 0; j <= kKeysPerFile; ++j) {
1570 ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
1571 }
1572 ASSERT_OK(Flush());
1573 }
1574 #ifndef ROCKSDB_LITE
1575 ASSERT_EQ("2,0,1", FilesPerLevel(0));
1576 #endif // ROCKSDB_LITE
1577
1578 uint64_t prev_compression_dict_bytes_inserted =
1579 TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
1580 // This L0->L1 compaction merges the two L0 files into L1. The produced L1
1581 // file is not bottommost due to the existing L2 file covering the same key-
1582 // range.
1583 ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
1584 #ifndef ROCKSDB_LITE
1585 ASSERT_EQ("0,1,1", FilesPerLevel(0));
1586 #endif // ROCKSDB_LITE
1587 // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
1588 // compression dictionary exists since dictionaries would be preloaded when
1589 // the compaction finishes.
1590 if (bottommost_) {
1591 ASSERT_EQ(
1592 TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
1593 prev_compression_dict_bytes_inserted);
1594 } else {
1595 ASSERT_GT(
1596 TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
1597 prev_compression_dict_bytes_inserted);
1598 // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
1599 // number of bytes needs to be adjusted in case the cached block is in
1600 // ZSTD's digested dictionary format.
1601 if (compression_type_ != kZSTD &&
1602 compression_type_ != kZSTDNotFinalCompression) {
1603 // Although we limited buffering to `kBlockLen`, there may be up to two
1604 // blocks of data included in the dictionary since we only check the limit
1605 // after each block is built.
1606 ASSERT_LE(TestGetTickerCount(options,
1607 BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
1608 prev_compression_dict_bytes_inserted + 2 * kBlockLen);
1609 }
1610 }
1611 }
1612
1613 TEST_P(PresetCompressionDictTest, CompactBottommost) {
1614 // Verifies that dictionary is generated and written during compaction to
1615 // the bottommost level only when either `ColumnFamilyOptions::compression` or
1616 // `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also
1617 // verifies the size of the dictionary is within expectations according to the
1618 // limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`.
1619 const size_t kValueLen = 256;
1620 const size_t kKeysPerFile = 1 << 10;
1621 const size_t kDictLen = 16 << 10;
1622 const size_t kBlockLen = 4 << 10;
1623
1624 Options options = CurrentOptions();
1625 if (bottommost_) {
1626 options.bottommost_compression = compression_type_;
1627 options.bottommost_compression_opts.enabled = true;
1628 options.bottommost_compression_opts.max_dict_bytes = kDictLen;
1629 options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
1630 } else {
1631 options.compression = compression_type_;
1632 options.compression_opts.max_dict_bytes = kDictLen;
1633 options.compression_opts.max_dict_buffer_bytes = kBlockLen;
1634 }
1635 options.disable_auto_compactions = true;
1636 options.statistics = CreateDBStatistics();
1637 BlockBasedTableOptions bbto;
1638 bbto.block_size = kBlockLen;
1639 bbto.cache_index_and_filter_blocks = true;
1640 options.table_factory.reset(NewBlockBasedTableFactory(bbto));
1641 Reopen(options);
1642
1643 Random rnd(301);
1644 for (int i = 0; i < 2; ++i) {
1645 for (size_t j = 0; j <= kKeysPerFile; ++j) {
1646 ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
1647 }
1648 ASSERT_OK(Flush());
1649 }
1650 #ifndef ROCKSDB_LITE
1651 ASSERT_EQ("2", FilesPerLevel(0));
1652 #endif // ROCKSDB_LITE
1653
1654 uint64_t prev_compression_dict_bytes_inserted =
1655 TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
1656 CompactRangeOptions cro;
1657 ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
1658 #ifndef ROCKSDB_LITE
1659 ASSERT_EQ("0,1", FilesPerLevel(0));
1660 #endif // ROCKSDB_LITE
1661 ASSERT_GT(
1662 TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
1663 prev_compression_dict_bytes_inserted);
1664 // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
1665 // number of bytes needs to be adjusted in case the cached block is in ZSTD's
1666 // digested dictionary format.
1667 if (compression_type_ != kZSTD &&
1668 compression_type_ != kZSTDNotFinalCompression) {
1669 // Although we limited buffering to `kBlockLen`, there may be up to two
1670 // blocks of data included in the dictionary since we only check the limit after
1671 // each block is built.
1672 ASSERT_LE(
1673 TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
1674 prev_compression_dict_bytes_inserted + 2 * kBlockLen);
1675 }
1676 }
1677
1678 class CompactionCompressionListener : public EventListener {
1679 public:
1680 explicit CompactionCompressionListener(Options* db_options)
1681 : db_options_(db_options) {}
1682
1683 void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
1684 // Figure out last level with files
1685 int bottommost_level = 0;
1686 for (int level = 0; level < db->NumberLevels(); level++) {
1687 std::string files_at_level;
1688 ASSERT_TRUE(db->GetProperty(
1689 "rocksdb.num-files-at-level" + ROCKSDB_NAMESPACE::ToString(level),
1690 &files_at_level));
1691 if (files_at_level != "0") {
1692 bottommost_level = level;
1693 }
1694 }
1695
1696 if (db_options_->bottommost_compression != kDisableCompressionOption &&
1697 ci.output_level == bottommost_level) {
1698 ASSERT_EQ(ci.compression, db_options_->bottommost_compression);
1699 } else if (db_options_->compression_per_level.size() != 0) {
1700 ASSERT_EQ(ci.compression,
1701 db_options_->compression_per_level[ci.output_level]);
1702 } else {
1703 ASSERT_EQ(ci.compression, db_options_->compression);
1704 }
1705 max_level_checked = std::max(max_level_checked, ci.output_level);
1706 }
1707
1708 int max_level_checked = 0;
1709 const Options* db_options_;
1710 };
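// A listener like the one above is installed through `Options::listeners`
// before the DB is opened, e.g.
//   options.listeners.emplace_back(new CompactionCompressionListener(&options));
// as done in the CompressionOptions test below.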
1711
1712 enum CompressionFailureType {
1713 kTestCompressionFail,
1714 kTestDecompressionFail,
1715 kTestDecompressionCorruption
1716 };
1717
1718 class CompressionFailuresTest
1719 : public DBTest2,
1720 public testing::WithParamInterface<std::tuple<
1721 CompressionFailureType, CompressionType, uint32_t, uint32_t>> {
1722 public:
1723 CompressionFailuresTest() {
1724 std::tie(compression_failure_type_, compression_type_,
1725 compression_max_dict_bytes_, compression_parallel_threads_) =
1726 GetParam();
1727 }
1728
1729 CompressionFailureType compression_failure_type_ = kTestCompressionFail;
1730 CompressionType compression_type_ = kNoCompression;
1731 uint32_t compression_max_dict_bytes_ = 0;
1732 uint32_t compression_parallel_threads_ = 0;
1733 };
1734
1735 INSTANTIATE_TEST_CASE_P(
1736 DBTest2, CompressionFailuresTest,
1737 ::testing::Combine(::testing::Values(kTestCompressionFail,
1738 kTestDecompressionFail,
1739 kTestDecompressionCorruption),
1740 ::testing::ValuesIn(GetSupportedCompressions()),
1741 ::testing::Values(0, 10), ::testing::Values(1, 4)));
1742
1743 TEST_P(CompressionFailuresTest, CompressionFailures) {
1744 if (compression_type_ == kNoCompression) {
1745 return;
1746 }
1747
1748 Options options = CurrentOptions();
1749 options.level0_file_num_compaction_trigger = 2;
1750 options.max_bytes_for_level_base = 1024;
1751 options.max_bytes_for_level_multiplier = 2;
1752 options.num_levels = 7;
1753 options.max_background_compactions = 1;
1754 options.target_file_size_base = 512;
1755
1756 BlockBasedTableOptions table_options;
1757 table_options.block_size = 512;
1758 table_options.verify_compression = true;
1759 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
1760
1761 options.compression = compression_type_;
1762 options.compression_opts.parallel_threads = compression_parallel_threads_;
1763 options.compression_opts.max_dict_bytes = compression_max_dict_bytes_;
1764 options.bottommost_compression_opts.parallel_threads =
1765 compression_parallel_threads_;
1766 options.bottommost_compression_opts.max_dict_bytes =
1767 compression_max_dict_bytes_;
1768
1769 if (compression_failure_type_ == kTestCompressionFail) {
1770 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
1771 "CompressData:TamperWithReturnValue", [](void* arg) {
1772 bool* ret = static_cast<bool*>(arg);
1773 *ret = false;
1774 });
1775 } else if (compression_failure_type_ == kTestDecompressionFail) {
1776 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
1777 "UncompressBlockContentsForCompressionType:TamperWithReturnValue",
1778 [](void* arg) {
1779 Status* ret = static_cast<Status*>(arg);
1780 ASSERT_OK(*ret);
1781 *ret = Status::Corruption("kTestDecompressionFail");
1782 });
1783 } else if (compression_failure_type_ == kTestDecompressionCorruption) {
1784 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
1785 "UncompressBlockContentsForCompressionType:"
1786 "TamperWithDecompressionOutput",
1787 [](void* arg) {
1788 BlockContents* contents = static_cast<BlockContents*>(arg);
1789 // Ensure uncompressed data != original data
1790 const size_t len = contents->data.size() + 1;
1791 std::unique_ptr<char[]> fake_data(new char[len]());
1792 *contents = BlockContents(std::move(fake_data), len);
1793 });
1794 }
1795
1796 std::map<std::string, std::string> key_value_written;
1797
1798 const int kKeySize = 5;
1799 const int kValUnitSize = 16;
1800 const int kValSize = 256;
1801 Random rnd(405);
1802
1803 Status s = Status::OK();
1804
1805 DestroyAndReopen(options);
1806 // Write 10 random files
1807 for (int i = 0; i < 10; i++) {
1808 for (int j = 0; j < 5; j++) {
1809 std::string key = rnd.RandomString(kKeySize);
1810 // Ensure good compression ratio
1811 std::string valueUnit = rnd.RandomString(kValUnitSize);
1812 std::string value;
1813 for (int k = 0; k < kValSize; k += kValUnitSize) {
1814 value += valueUnit;
1815 }
1816 s = Put(key, value);
1817 if (compression_failure_type_ == kTestCompressionFail) {
1818 key_value_written[key] = value;
1819 ASSERT_OK(s);
1820 }
1821 }
1822 s = Flush();
1823 if (compression_failure_type_ == kTestCompressionFail) {
1824 ASSERT_OK(s);
1825 }
1826 s = dbfull()->TEST_WaitForCompact();
1827 if (compression_failure_type_ == kTestCompressionFail) {
1828 ASSERT_OK(s);
1829 }
1830 if (i == 4) {
1831 // Make compression fail in the middle of table building
1832 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
1833 }
1834 }
1835 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
1836
1837 if (compression_failure_type_ == kTestCompressionFail) {
1838 // Compression should have fallen back to kNoCompression; check content consistency
1839 std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
1840 for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
1841 std::string key = db_iter->key().ToString();
1842 std::string value = db_iter->value().ToString();
1843 ASSERT_NE(key_value_written.find(key), key_value_written.end());
1844 ASSERT_EQ(key_value_written[key], value);
1845 key_value_written.erase(key);
1846 }
1847 ASSERT_EQ(0, key_value_written.size());
1848 } else if (compression_failure_type_ == kTestDecompressionFail) {
1849 ASSERT_EQ(std::string(s.getState()),
1850 "Could not decompress: kTestDecompressionFail");
1851 } else if (compression_failure_type_ == kTestDecompressionCorruption) {
1852 ASSERT_EQ(std::string(s.getState()),
1853 "Decompressed block did not match raw block");
1854 }
1855 }
1856
1857 TEST_F(DBTest2, CompressionOptions) {
1858 if (!Zlib_Supported() || !Snappy_Supported()) {
1859 return;
1860 }
1861
1862 Options options = CurrentOptions();
1863 options.level0_file_num_compaction_trigger = 2;
1864 options.max_bytes_for_level_base = 100;
1865 options.max_bytes_for_level_multiplier = 2;
1866 options.num_levels = 7;
1867 options.max_background_compactions = 1;
1868
1869 CompactionCompressionListener* listener =
1870 new CompactionCompressionListener(&options);
1871 options.listeners.emplace_back(listener);
1872
1873 const int kKeySize = 5;
1874 const int kValSize = 20;
1875 Random rnd(301);
1876
1877 std::vector<uint32_t> compression_parallel_threads = {1, 4};
1878
1879 std::map<std::string, std::string> key_value_written;
1880
1881 for (int iter = 0; iter <= 2; iter++) {
1882 listener->max_level_checked = 0;
1883
1884 if (iter == 0) {
1885 // Use different compression algorithms for different levels but
1886 // always use Zlib for bottommost level
1887 options.compression_per_level = {kNoCompression, kNoCompression,
1888 kNoCompression, kSnappyCompression,
1889 kSnappyCompression, kSnappyCompression,
1890 kZlibCompression};
1891 options.compression = kNoCompression;
1892 options.bottommost_compression = kZlibCompression;
1893 } else if (iter == 1) {
1894 // Use Snappy everywhere except the bottommost level, which uses Zlib
1895 options.compression_per_level = {};
1896 options.compression = kSnappyCompression;
1897 options.bottommost_compression = kZlibCompression;
1898 } else if (iter == 2) {
1899 // Use Snappy everywhere
1900 options.compression_per_level = {};
1901 options.compression = kSnappyCompression;
1902 options.bottommost_compression = kDisableCompressionOption;
1903 }
1904
1905 for (auto num_threads : compression_parallel_threads) {
1906 options.compression_opts.parallel_threads = num_threads;
1907 options.bottommost_compression_opts.parallel_threads = num_threads;
1908
1909 DestroyAndReopen(options);
1910 // Write 10 random files
1911 for (int i = 0; i < 10; i++) {
1912 for (int j = 0; j < 5; j++) {
1913 std::string key = rnd.RandomString(kKeySize);
1914 std::string value = rnd.RandomString(kValSize);
1915 key_value_written[key] = value;
1916 ASSERT_OK(Put(key, value));
1917 }
1918 ASSERT_OK(Flush());
1919 ASSERT_OK(dbfull()->TEST_WaitForCompact());
1920 }
1921
1922 // Make sure that we wrote enough to check all 7 levels
1923 ASSERT_EQ(listener->max_level_checked, 6);
1924
1925 // Make sure database content is the same as key_value_written
1926 std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
1927 for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
1928 std::string key = db_iter->key().ToString();
1929 std::string value = db_iter->value().ToString();
1930 ASSERT_NE(key_value_written.find(key), key_value_written.end());
1931 ASSERT_EQ(key_value_written[key], value);
1932 key_value_written.erase(key);
1933 }
1934 ASSERT_OK(db_iter->status());
1935 ASSERT_EQ(0, key_value_written.size());
1936 }
1937 }
1938 }
1939
1940 class CompactionStallTestListener : public EventListener {
1941 public:
1942 CompactionStallTestListener() : compacting_files_cnt_(0), compacted_files_cnt_(0) {}
1943
1944 void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
1945 ASSERT_EQ(ci.cf_name, "default");
1946 ASSERT_EQ(ci.base_input_level, 0);
1947 ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
1948 compacting_files_cnt_ += ci.input_files.size();
1949 }
1950
1951 void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
1952 ASSERT_EQ(ci.cf_name, "default");
1953 ASSERT_EQ(ci.base_input_level, 0);
1954 ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
1955 compacted_files_cnt_ += ci.input_files.size();
1956 }
1957
1958 std::atomic<size_t> compacting_files_cnt_;
1959 std::atomic<size_t> compacted_files_cnt_;
1960 };
1961
1962 TEST_F(DBTest2, CompactionStall) {
1963 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
1964 {{"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:0"},
1965 {"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:1"},
1966 {"DBTest2::CompactionStall:2",
1967 "DBImpl::NotifyOnCompactionBegin::UnlockMutex"},
1968 {"DBTest2::CompactionStall:3",
1969 "DBImpl::NotifyOnCompactionCompleted::UnlockMutex"}});
1970 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
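  // Note on the dependencies loaded above: for each {"A", "B"} pair, a thread
  // that reaches sync point "B" blocks until some other thread has passed sync
  // point "A". That is how this test waits for background compactions (via
  // "DBImpl::BGWorkCompaction") and holds the begin/completed notifications
  // until the CompactionStall:2/:3 points below are hit.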
1971
1972 Options options = CurrentOptions();
1973 options.level0_file_num_compaction_trigger = 4;
1974 options.max_background_compactions = 40;
1975 CompactionStallTestListener* listener = new CompactionStallTestListener();
1976 options.listeners.emplace_back(listener);
1977 DestroyAndReopen(options);
1978 // make sure all background compaction jobs can be scheduled
1979 auto stop_token =
1980 dbfull()->TEST_write_controler().GetCompactionPressureToken();
1981
1982 Random rnd(301);
1983
1984 // 4 Files in L0
1985 for (int i = 0; i < 4; i++) {
1986 for (int j = 0; j < 10; j++) {
1987 ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
1988 }
1989 ASSERT_OK(Flush());
1990 }
1991
1992 // Wait for compaction to be triggered
1993 TEST_SYNC_POINT("DBTest2::CompactionStall:0");
1994
1995 // Clear the executed trace of "DBImpl::BGWorkCompaction" since we want to
1996 // block on it again at DBTest2::CompactionStall:1
1997 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
1998
1999 // Another 6 L0 files to trigger compaction again
2000 for (int i = 0; i < 6; i++) {
2001 for (int j = 0; j < 10; j++) {
2002 ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
2003 }
2004 ASSERT_OK(Flush());
2005 }
2006
2007 // Wait for another compaction to be triggered
2008 TEST_SYNC_POINT("DBTest2::CompactionStall:1");
2009
2010 // Hold NotifyOnCompactionBegin in the unlock mutex section
2011 TEST_SYNC_POINT("DBTest2::CompactionStall:2");
2012
2013 // Hold NotifyOnCompactionCompleted in the unlock mutex section
2014 TEST_SYNC_POINT("DBTest2::CompactionStall:3");
2015
2016 ASSERT_OK(dbfull()->TEST_WaitForCompact());
2017 ASSERT_LT(NumTableFilesAtLevel(0),
2018 options.level0_file_num_compaction_trigger);
2019 ASSERT_GT(listener->compacted_files_cnt_.load(),
2020 10 - options.level0_file_num_compaction_trigger);
2021 ASSERT_EQ(listener->compacting_files_cnt_.load(), listener->compacted_files_cnt_.load());
2022
2023 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
2024 }
2025
2026 #endif // ROCKSDB_LITE
2027
2028 TEST_F(DBTest2, FirstSnapshotTest) {
2029 Options options;
2030 options.write_buffer_size = 100000; // Small write buffer
2031 options = CurrentOptions(options);
2032 CreateAndReopenWithCF({"pikachu"}, options);
2033
2034 // This snapshot will have sequence number 0, which is the expected behaviour.
2035 const Snapshot* s1 = db_->GetSnapshot();
2036
2037 ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable
2038 ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger flush
2039
2040 db_->ReleaseSnapshot(s1);
2041 }
2042
2043 #ifndef ROCKSDB_LITE
2044 TEST_F(DBTest2, DuplicateSnapshot) {
2045 Options options;
2046 options = CurrentOptions(options);
2047 std::vector<const Snapshot*> snapshots;
2048 DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
2049 SequenceNumber oldest_ww_snap, first_ww_snap;
2050
2051 ASSERT_OK(Put("k", "v")); // inc seq
2052 snapshots.push_back(db_->GetSnapshot());
2053 snapshots.push_back(db_->GetSnapshot());
2054 ASSERT_OK(Put("k", "v")); // inc seq
2055 snapshots.push_back(db_->GetSnapshot());
2056 snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
2057 first_ww_snap = snapshots.back()->GetSequenceNumber();
2058 ASSERT_OK(Put("k", "v")); // inc seq
2059 snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
2060 snapshots.push_back(db_->GetSnapshot());
2061 ASSERT_OK(Put("k", "v")); // inc seq
2062 snapshots.push_back(db_->GetSnapshot());
2063
2064 {
2065 InstrumentedMutexLock l(dbi->mutex());
2066 auto seqs = dbi->snapshots().GetAll(&oldest_ww_snap);
2067 ASSERT_EQ(seqs.size(), 4); // duplicates are not counted
2068 ASSERT_EQ(oldest_ww_snap, first_ww_snap);
2069 }
2070
2071 for (auto s : snapshots) {
2072 db_->ReleaseSnapshot(s);
2073 }
2074 }
2075 #endif // ROCKSDB_LITE
2076
2077 class PinL0IndexAndFilterBlocksTest
2078 : public DBTestBase,
2079 public testing::WithParamInterface<std::tuple<bool, bool>> {
2080 public:
2081 PinL0IndexAndFilterBlocksTest()
2082 : DBTestBase("db_pin_l0_index_bloom_test", /*env_do_fsync=*/true) {}
2083 void SetUp() override {
2084 infinite_max_files_ = std::get<0>(GetParam());
2085 disallow_preload_ = std::get<1>(GetParam());
2086 }
2087
2088 void CreateTwoLevels(Options* options, bool close_afterwards) {
2089 if (infinite_max_files_) {
2090 options->max_open_files = -1;
2091 }
2092 options->create_if_missing = true;
2093 options->statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
2094 BlockBasedTableOptions table_options;
2095 table_options.cache_index_and_filter_blocks = true;
2096 table_options.pin_l0_filter_and_index_blocks_in_cache = true;
2097 table_options.filter_policy.reset(NewBloomFilterPolicy(20));
2098 options->table_factory.reset(NewBlockBasedTableFactory(table_options));
2099 CreateAndReopenWithCF({"pikachu"}, *options);
2100
2101 ASSERT_OK(Put(1, "a", "begin"));
2102 ASSERT_OK(Put(1, "z", "end"));
2103 ASSERT_OK(Flush(1));
2104 // move this table to L1
2105 ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
2106
2107 // reset block cache
2108 table_options.block_cache = NewLRUCache(64 * 1024);
2109 options->table_factory.reset(NewBlockBasedTableFactory(table_options));
2110 TryReopenWithColumnFamilies({"default", "pikachu"}, *options);
2111 // create new table at L0
2112 ASSERT_OK(Put(1, "a2", "begin2"));
2113 ASSERT_OK(Put(1, "z2", "end2"));
2114 ASSERT_OK(Flush(1));
2115
2116 if (close_afterwards) {
2117 Close(); // This ensures that there is no ref to block cache entries
2118 }
2119 table_options.block_cache->EraseUnRefEntries();
2120 }
2121
2122 bool infinite_max_files_;
2123 bool disallow_preload_;
2124 };
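// Illustrative sketch (not used by the tests): the two BlockBasedTableOptions
// knobs the fixture above exercises. `cache_index_and_filter_blocks` makes
// index and filter blocks live in the block cache instead of being held
// outside it, and `pin_l0_filter_and_index_blocks_in_cache` additionally pins
// the entries belonging to L0 files so they are not evicted.
namespace {
[[maybe_unused]] BlockBasedTableOptions ExamplePinL0TableOptions() {
  BlockBasedTableOptions table_options;
  table_options.cache_index_and_filter_blocks = true;
  table_options.pin_l0_filter_and_index_blocks_in_cache = true;
  // A Bloom filter policy so that there are filter blocks to pin at all.
  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
  return table_options;
}
}  // namespace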
2125
2126 TEST_P(PinL0IndexAndFilterBlocksTest,
2127 IndexAndFilterBlocksOfNewTableAddedToCacheWithPinning) {
2128 Options options = CurrentOptions();
2129 if (infinite_max_files_) {
2130 options.max_open_files = -1;
2131 }
2132 options.create_if_missing = true;
2133 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
2134 BlockBasedTableOptions table_options;
2135 table_options.cache_index_and_filter_blocks = true;
2136 table_options.pin_l0_filter_and_index_blocks_in_cache = true;
2137 table_options.filter_policy.reset(NewBloomFilterPolicy(20));
2138 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
2139 CreateAndReopenWithCF({"pikachu"}, options);
2140
2141 ASSERT_OK(Put(1, "key", "val"));
2142 // Create a new table.
2143 ASSERT_OK(Flush(1));
2144
2145 // index/filter blocks added to block cache right after table creation.
2146 ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
2147 ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
2148 ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
2149 ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
2150
2151 // only index/filter were added
2152 ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD));
2153 ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
2154
2155 std::string value;
2156 // Miss and hit counts should remain the same; the blocks are all pinned.
2157 ASSERT_TRUE(db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value));
2158 ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
2159 ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
2160 ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
2161 ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
2162
2163 // Miss and hit counts should remain the same; the blocks are all pinned.
2164 value = Get(1, "key");
2165 ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
2166 ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
2167 ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
2168 ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
2169 }
2170
2171 TEST_P(PinL0IndexAndFilterBlocksTest,
2172 MultiLevelIndexAndFilterBlocksCachedWithPinning) {
2173 Options options = CurrentOptions();
2174 PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, false);
2175 // get base cache values
2176 uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
2177 uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
2178 uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
2179 uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
2180
2181 std::string value;
2182 // this should be read from L0
2183 // so cache values don't change
2184 value = Get(1, "a2");
2185 ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
2186 ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
2187 ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
2188 ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
2189
2190 // this should be read from L1
2191 // the file is opened, prefetching results in a cache filter miss
2192 // the block is loaded and added to the cache,
2193 // then the get results in a cache hit for L1
2194 // When we have infinite max_open_files, there is still a cache miss because
2195 // we have reset the block cache
2196 value = Get(1, "a");
2197 ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
2198 ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
2199 }
2200
2201 TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
2202 Options options = CurrentOptions();
2203 // This ensures that db does not ref anything in the block cache, so
2204 // EraseUnRefEntries could clear them up.
2205 bool close_afterwards = true;
2206 PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, close_afterwards);
2207
2208 // Get base cache values
2209 uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
2210 uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
2211 uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
2212 uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
2213
2214 if (disallow_preload_) {
2215 // Now we have two files. We narrow the max open files to allow 3 entries
2216 // so that preloading SST files won't happen.
2217 options.max_open_files = 13;
2218 // RocksDB sanitizes max open files to at least 20. Modify it back.
2219 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
2220 "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
2221 int* max_open_files = static_cast<int*>(arg);
2222 *max_open_files = 13;
2223 });
2224 }
2225 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
2226
2227 // Reopen database. If max_open_files is set as -1, table readers will be
2228 // preloaded. This will trigger a BlockBasedTable::Open() and prefetch
2229 // L0 index and filter. Level 1's prefetching is disabled in DB::Open()
2230 TryReopenWithColumnFamilies({"default", "pikachu"}, options);
2231
2232 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
2233
2234 if (!disallow_preload_) {
2235 // After reopen, cache misses are increased by one because we read (and only
2236 // read) filter and index on L0
2237 ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
2238 ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
2239 ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
2240 ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
2241 } else {
2242 // If max_open_files is not -1, we do not preload table readers, so there is
2243 // no change.
2244 ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
2245 ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
2246 ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
2247 ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
2248 }
2249 std::string value;
2250 // this should be read from L0
2251 value = Get(1, "a2");
2252 // If max_open_files is -1, we have pinned index and filter in Rep, so there
2253 // will not be changes in index and filter misses or hits. If max_open_files
2254 // is not -1, Get() will open a TableReader and prefetch index and filter.
2255 ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
2256 ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
2257 ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
2258 ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
2259
2260 // this should be read from L1
2261 value = Get(1, "a");
2262 if (!disallow_preload_) {
2263 // In the infinite max files case, there's a cache miss when executing Get()
2264 // because the index and filter were not prefetched beforehand.
2265 ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
2266 ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
2267 ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
2268 ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
2269 } else {
2270 // In this case, cache misses will be increased by one in
2271 // BlockBasedTable::Open() because this is not in the DB::Open() code path,
2272 // so we will prefetch L1's index and filter. Cache hits will also be
2273 // increased by one because Get() will read the index and filter from the
2274 // block cache prefetched in the previous Open() call.
2275 ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
2276 ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
2277 ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
2278 ASSERT_EQ(ih + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
2279 }
2280
2281 // Force a full compaction to one single file. There will be a block
2282 // cache read for both of index and filter. If prefetch doesn't explicitly
2283 // happen, it will happen when verifying the file.
2284 Compact(1, "a", "zzzzz");
2285 ASSERT_OK(dbfull()->TEST_WaitForCompact());
2286
2287 if (!disallow_preload_) {
2288 ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
2289 ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
2290 ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
2291 ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
2292 } else {
2293 ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
2294 ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
2295 ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
2296 ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
2297 }
2298
2299 // Bloom filter and index hits will happen when a Get() happens.
2300 value = Get(1, "a");
2301 if (!disallow_preload_) {
2302 ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
2303 ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
2304 ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
2305 ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
2306 } else {
2307 ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
2308 ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
2309 ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
2310 ASSERT_EQ(ih + 5, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
2311 }
2312 }
2313
2314 INSTANTIATE_TEST_CASE_P(PinL0IndexAndFilterBlocksTest,
2315 PinL0IndexAndFilterBlocksTest,
2316 ::testing::Values(std::make_tuple(true, false),
2317 std::make_tuple(false, false),
2318 std::make_tuple(false, true)));
2319
2320 #ifndef ROCKSDB_LITE
2321 TEST_F(DBTest2, MaxCompactionBytesTest) {
2322 Options options = CurrentOptions();
2323 options.memtable_factory.reset(test::NewSpecialSkipListFactory(
2324 DBTestBase::kNumKeysByGenerateNewRandomFile));
2325 options.compaction_style = kCompactionStyleLevel;
2326 options.write_buffer_size = 200 << 10;
2327 options.arena_block_size = 4 << 10;
2328 options.level0_file_num_compaction_trigger = 4;
2329 options.num_levels = 4;
2330 options.compression = kNoCompression;
2331 options.max_bytes_for_level_base = 450 << 10;
2332 options.target_file_size_base = 100 << 10;
2333 // Infinite for full compaction.
2334 options.max_compaction_bytes = options.target_file_size_base * 100;
2335
2336 Reopen(options);
2337
2338 Random rnd(301);
2339
2340 for (int num = 0; num < 8; num++) {
2341 GenerateNewRandomFile(&rnd);
2342 }
2343 CompactRangeOptions cro;
2344 cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
2345 ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
2346 ASSERT_EQ("0,0,8", FilesPerLevel(0));
2347
2348 // When compacting from Ln -> Ln+1, cut an output file if it overlaps with
2349 // more than three files in Ln+2 (the grandparent level).
2350 options.max_compaction_bytes = options.target_file_size_base * 3;
2351 Reopen(options);
2352
2353 GenerateNewRandomFile(&rnd);
2354 // Add three more small files that overlap with the previous file
2355 for (int i = 0; i < 3; i++) {
2356 ASSERT_OK(Put("a", "z"));
2357 ASSERT_OK(Flush());
2358 }
2359 ASSERT_OK(dbfull()->TEST_WaitForCompact());
2360
2361 // Output files to L1 are cut into three pieces, according to
2362 // options.max_compaction_bytes
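  // Rough arithmetic behind the "three pieces": the earlier full compaction
  // left 8 files of roughly target_file_size_base (~100KB) each in L2, and
  // max_compaction_bytes is 3 * target_file_size_base, so each L1 output file
  // may overlap with at most about three of those grandparent files, cutting
  // the single key range into ceil(8 / 3) = 3 output files.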
2363 ASSERT_EQ("0,3,8", FilesPerLevel(0));
2364 }
2365
2366 static void UniqueIdCallback(void* arg) {
2367 int* result = reinterpret_cast<int*>(arg);
2368 if (*result == -1) {
2369 *result = 0;
2370 }
2371
2372 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
2373 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
2374 "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
2375 }
2376
2377 class MockPersistentCache : public PersistentCache {
2378 public:
2379 explicit MockPersistentCache(const bool is_compressed, const size_t max_size)
2380 : is_compressed_(is_compressed), max_size_(max_size) {
2381 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
2382 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
2383 "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
2384 }
2385
2386 ~MockPersistentCache() override {}
2387
2388 PersistentCache::StatsType Stats() override {
2389 return PersistentCache::StatsType();
2390 }
2391
2392 uint64_t NewId() override {
2393 return last_id_.fetch_add(1, std::memory_order_relaxed);
2394 }
2395
2396 Status Insert(const Slice& page_key, const char* data,
2397 const size_t size) override {
2398 MutexLock _(&lock_);
2399
2400 if (size_ > max_size_) {
2401 size_ -= data_.begin()->second.size();
2402 data_.erase(data_.begin());
2403 }
2404
2405 data_.insert(std::make_pair(page_key.ToString(), std::string(data, size)));
2406 size_ += size;
2407 return Status::OK();
2408 }
2409
2410 Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
2411 size_t* size) override {
2412 MutexLock _(&lock_);
2413 auto it = data_.find(page_key.ToString());
2414 if (it == data_.end()) {
2415 return Status::NotFound();
2416 }
2417
2418 assert(page_key.ToString() == it->first);
2419 data->reset(new char[it->second.size()]);
2420 memcpy(data->get(), it->second.c_str(), it->second.size());
2421 *size = it->second.size();
2422 return Status::OK();
2423 }
2424
2425 bool IsCompressed() override { return is_compressed_; }
2426
2427 std::string GetPrintableOptions() const override {
2428 return "MockPersistentCache";
2429 }
2430
2431 port::Mutex lock_;
2432 std::map<std::string, std::string> data_;
2433 const bool is_compressed_ = true;
2434 size_t size_ = 0;
2435 const size_t max_size_ = 10 * 1024; // 10KiB
2436 std::atomic<uint64_t> last_id_{1};
2437 };
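// A PersistentCache implementation like the one above is plugged in through
// `BlockBasedTableOptions::persistent_cache`; the PersistentCache test below
// installs a MockPersistentCache that way and then checks the
// PERSISTENT_CACHE_HIT / PERSISTENT_CACHE_MISS tickers.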
2438
2439 #ifdef OS_LINUX
2440 // Make sure that in CPU time perf context counters, Env::NowCPUNanos()
2441 // is used, rather than Env::NowNanos().
2442 TEST_F(DBTest2, TestPerfContextGetCpuTime) {
2443 // Force resizing the table cache so the table handle is not preloaded; that
2444 // way we can measure find_table_nanos during Get().
2445 dbfull()->TEST_table_cache()->SetCapacity(0);
2446 ASSERT_OK(Put("foo", "bar"));
2447 ASSERT_OK(Flush());
2448 env_->now_cpu_count_.store(0);
2449 env_->SetMockSleep();
2450
2451 // NOTE: Presumed unnecessary and removed: resetting mock time in env
2452
2453 // CPU timing is not enabled with kEnableTimeExceptForMutex
2454 SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
2455 ASSERT_EQ("bar", Get("foo"));
2456 ASSERT_EQ(0, get_perf_context()->get_cpu_nanos);
2457 ASSERT_EQ(0, env_->now_cpu_count_.load());
2458
2459 constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000};
2460 constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds;
2461
2462 // Add time to NowNanos() reading.
2463 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
2464 "TableCache::FindTable:0",
2465 [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); });
2466 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
2467
2468 SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
2469 ASSERT_EQ("bar", Get("foo"));
2470 ASSERT_GT(env_->now_cpu_count_.load(), 2);
2471 ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonNanos);
2472 ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos);
2473
2474 SetPerfLevel(PerfLevel::kDisable);
2475 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
2476 }
2477
2478 TEST_F(DBTest2, TestPerfContextIterCpuTime) {
2479 DestroyAndReopen(CurrentOptions());
2480 // Force resizing the table cache so the table handle is not preloaded; that
2481 // way we can measure find_table_nanos during iteration.
2482 dbfull()->TEST_table_cache()->SetCapacity(0);
2483
2484 const size_t kNumEntries = 10;
2485 for (size_t i = 0; i < kNumEntries; ++i) {
2486 ASSERT_OK(Put("k" + ToString(i), "v" + ToString(i)));
2487 }
2488 ASSERT_OK(Flush());
2489 for (size_t i = 0; i < kNumEntries; ++i) {
2490 ASSERT_EQ("v" + ToString(i), Get("k" + ToString(i)));
2491 }
2492 std::string last_key = "k" + ToString(kNumEntries - 1);
2493 std::string last_value = "v" + ToString(kNumEntries - 1);
2494 env_->now_cpu_count_.store(0);
2495 env_->SetMockSleep();
2496
2497 // NOTE: Presumed unnecessary and removed: resetting mock time in env
2498
2499 // CPU timing is not enabled with kEnableTimeExceptForMutex
2500 SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
2501 Iterator* iter = db_->NewIterator(ReadOptions());
2502 iter->Seek("k0");
2503 ASSERT_TRUE(iter->Valid());
2504 ASSERT_EQ("v0", iter->value().ToString());
2505 iter->SeekForPrev(last_key);
2506 ASSERT_TRUE(iter->Valid());
2507 iter->SeekToLast();
2508 ASSERT_TRUE(iter->Valid());
2509 ASSERT_EQ(last_value, iter->value().ToString());
2510 iter->SeekToFirst();
2511 ASSERT_TRUE(iter->Valid());
2512 ASSERT_EQ("v0", iter->value().ToString());
2513 ASSERT_EQ(0, get_perf_context()->iter_seek_cpu_nanos);
2514 iter->Next();
2515 ASSERT_TRUE(iter->Valid());
2516 ASSERT_EQ("v1", iter->value().ToString());
2517 ASSERT_EQ(0, get_perf_context()->iter_next_cpu_nanos);
2518 iter->Prev();
2519 ASSERT_TRUE(iter->Valid());
2520 ASSERT_OK(iter->status());
2521 ASSERT_EQ("v0", iter->value().ToString());
2522 ASSERT_EQ(0, get_perf_context()->iter_prev_cpu_nanos);
2523 ASSERT_EQ(0, env_->now_cpu_count_.load());
2524 delete iter;
2525
2526 constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000};
2527 constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds;
2528
2529 // Add time to NowNanos() reading.
2530 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
2531 "TableCache::FindTable:0",
2532 [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); });
2533 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
2534
2535 SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
2536 iter = db_->NewIterator(ReadOptions());
2537 iter->Seek("k0");
2538 ASSERT_TRUE(iter->Valid());
2539 ASSERT_EQ("v0", iter->value().ToString());
2540 iter->SeekForPrev(last_key);
2541 ASSERT_TRUE(iter->Valid());
2542 iter->SeekToLast();
2543 ASSERT_TRUE(iter->Valid());
2544 ASSERT_EQ(last_value, iter->value().ToString());
2545 iter->SeekToFirst();
2546 ASSERT_TRUE(iter->Valid());
2547 ASSERT_EQ("v0", iter->value().ToString());
2548 ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, 0);
2549 ASSERT_LT(get_perf_context()->iter_seek_cpu_nanos, kDummyAddonNanos);
2550 iter->Next();
2551 ASSERT_TRUE(iter->Valid());
2552 ASSERT_EQ("v1", iter->value().ToString());
2553 ASSERT_GT(get_perf_context()->iter_next_cpu_nanos, 0);
2554 ASSERT_LT(get_perf_context()->iter_next_cpu_nanos, kDummyAddonNanos);
2555 iter->Prev();
2556 ASSERT_TRUE(iter->Valid());
2557 ASSERT_OK(iter->status());
2558 ASSERT_EQ("v0", iter->value().ToString());
2559 ASSERT_GT(get_perf_context()->iter_prev_cpu_nanos, 0);
2560 ASSERT_LT(get_perf_context()->iter_prev_cpu_nanos, kDummyAddonNanos);
2561 ASSERT_GE(env_->now_cpu_count_.load(), 12);
2562 ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos);
2563
2564 SetPerfLevel(PerfLevel::kDisable);
2565 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
2566 delete iter;
2567 }
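// Minimal sketch (illustrative only, not called by any test) of the
// perf-context pattern the two tests above rely on: raise the perf level,
// reset the thread-local counters, perform the operation, then read the
// counters. "some_key" is just a placeholder.
namespace {
[[maybe_unused]] uint64_t ExampleMeasureGetCpuNanos(DB* db) {
  SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
  get_perf_context()->Reset();
  std::string value;
  if (!db->Get(ReadOptions(), "some_key", &value).ok()) {
    // A missing key is fine for this sketch; only the counters matter.
  }
  uint64_t cpu_nanos = get_perf_context()->get_cpu_nanos;
  SetPerfLevel(PerfLevel::kDisable);
  return cpu_nanos;
}
}  // namespace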
2568 #endif // OS_LINUX
2569
2570 #if !defined OS_SOLARIS
2571 TEST_F(DBTest2, PersistentCache) {
2572 int num_iter = 80;
2573
2574 Options options;
2575 options.write_buffer_size = 64 * 1024; // small write buffer
2576 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
2577 options = CurrentOptions(options);
2578
2579 auto bsizes = {/*no block cache*/ 0, /*1M*/ 1 * 1024 * 1024};
2580 auto types = {/*compressed*/ 1, /*uncompressed*/ 0};
2581 for (auto bsize : bsizes) {
2582 for (auto type : types) {
2583 BlockBasedTableOptions table_options;
2584 table_options.persistent_cache.reset(
2585 new MockPersistentCache(type, 10 * 1024));
2586 table_options.no_block_cache = true;
2587 table_options.block_cache = bsize ? NewLRUCache(bsize) : nullptr;
2588 table_options.block_cache_compressed = nullptr;
2589 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
2590
2591 DestroyAndReopen(options);
2592 CreateAndReopenWithCF({"pikachu"}, options);
2593 // default column family doesn't have block cache
2594 Options no_block_cache_opts;
2595 no_block_cache_opts.statistics = options.statistics;
2596 no_block_cache_opts = CurrentOptions(no_block_cache_opts);
2597 BlockBasedTableOptions table_options_no_bc;
2598 table_options_no_bc.no_block_cache = true;
2599 no_block_cache_opts.table_factory.reset(
2600 NewBlockBasedTableFactory(table_options_no_bc));
2601 ReopenWithColumnFamilies(
2602 {"default", "pikachu"},
2603 std::vector<Options>({no_block_cache_opts, options}));
2604
2605 Random rnd(301);
2606
2607 // Write ~80KB (80 values, each ~1KB)
2608 ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
2609 std::vector<std::string> values;
2610 std::string str;
2611 for (int i = 0; i < num_iter; i++) {
2612 if (i % 4 == 0) { // high compression ratio
2613 str = rnd.RandomString(1000);
2614 }
2615 values.push_back(str);
2616 ASSERT_OK(Put(1, Key(i), values[i]));
2617 }
2618
2619 // flush all data from memtable so that reads are from block cache
2620 ASSERT_OK(Flush(1));
2621
2622 for (int i = 0; i < num_iter; i++) {
2623 ASSERT_EQ(Get(1, Key(i)), values[i]);
2624 }
2625
2626 auto hit = options.statistics->getTickerCount(PERSISTENT_CACHE_HIT);
2627 auto miss = options.statistics->getTickerCount(PERSISTENT_CACHE_MISS);
2628
2629 ASSERT_GT(hit, 0);
2630 ASSERT_GT(miss, 0);
2631 }
2632 }
2633 }
2634 #endif // !defined OS_SOLARIS
2635
2636 namespace {
2637 void CountSyncPoint() {
2638 TEST_SYNC_POINT_CALLBACK("DBTest2::MarkedPoint", nullptr /* arg */);
2639 }
2640 } // namespace
2641
2642 TEST_F(DBTest2, SyncPointMarker) {
2643 std::atomic<int> sync_point_called(0);
2644 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
2645 "DBTest2::MarkedPoint",
2646 [&](void* /*arg*/) { sync_point_called.fetch_add(1); });
2647
2648 // The first dependency enforces Marker can be loaded before MarkedPoint.
2649 // The second checks that thread 1's MarkedPoint should be disabled here.
2650 // Execution order:
2651 // | Thread 1 | Thread 2 |
2652 // | | Marker |
2653 // | MarkedPoint | |
2654 // | Thread1First | |
2655 // | | MarkedPoint |
2656 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependencyAndMarkers(
2657 {{"DBTest2::SyncPointMarker:Thread1First", "DBTest2::MarkedPoint"}},
2658 {{"DBTest2::SyncPointMarker:Marker", "DBTest2::MarkedPoint"}});
2659
2660 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
2661
2662 std::function<void()> func1 = [&]() {
2663 CountSyncPoint();
2664 TEST_SYNC_POINT("DBTest2::SyncPointMarker:Thread1First");
2665 };
2666
2667 std::function<void()> func2 = [&]() {
2668 TEST_SYNC_POINT("DBTest2::SyncPointMarker:Marker");
2669 CountSyncPoint();
2670 };
2671
2672 auto thread1 = port::Thread(func1);
2673 auto thread2 = port::Thread(func2);
2674 thread1.join();
2675 thread2.join();
2676
2677 // Callback is only executed once
2678 ASSERT_EQ(sync_point_called.load(), 1);
2679 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
2680 }
2681 #endif
2682
2683 size_t GetEncodedEntrySize(size_t key_size, size_t value_size) {
2684 std::string buffer;
2685
2686 PutVarint32(&buffer, static_cast<uint32_t>(0));
2687 PutVarint32(&buffer, static_cast<uint32_t>(key_size));
2688 PutVarint32(&buffer, static_cast<uint32_t>(value_size));
2689
2690 return buffer.size() + key_size + value_size;
2691 }
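// Worked example (illustrative only): with delta encoding disabled, a block
// entry is laid out as <shared=0><key_size><value_size><key><value>, with each
// length stored as a varint. For a 17-byte internal key and a 100-byte value
// all three varints fit in one byte each, so
// GetEncodedEntrySize(17, 100) == 3 + 17 + 100 == 120.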
2692
2693 TEST_F(DBTest2, ReadAmpBitmap) {
2694 Options options = CurrentOptions();
2695 BlockBasedTableOptions bbto;
2696 uint32_t bytes_per_bit[2] = {1, 16};
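  // `read_amp_bytes_per_bit` is the granularity of the read-amp bitmap: each
  // bit accounts for that many bytes of a data block. With 1 byte per bit the
  // useful-bytes estimate is exact; with 16 bytes per bit it is rounded up to
  // 16-byte chunks, which is why the checks at the end of the loop use exact
  // equality for the first configuration and a ~1% tolerance for the second.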
2697 for (size_t k = 0; k < 2; k++) {
2698 // Disable delta encoding to make it easier to calculate read amplification
2699 bbto.use_delta_encoding = false;
2700 // Huge block cache to make it easier to calculate read amplification
2701 bbto.block_cache = NewLRUCache(1024 * 1024 * 1024);
2702 bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
2703 options.table_factory.reset(NewBlockBasedTableFactory(bbto));
2704 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
2705 DestroyAndReopen(options);
2706
2707 const size_t kNumEntries = 10000;
2708
2709 Random rnd(301);
2710 for (size_t i = 0; i < kNumEntries; i++) {
2711 ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(100)));
2712 }
2713 ASSERT_OK(Flush());
2714
2715 Close();
2716 Reopen(options);
2717
2718 // Read keys/values randomly and verify that reported read amp error
2719 // is less than 2%
2720 uint64_t total_useful_bytes = 0;
2721 std::set<int> read_keys;
2722 std::string value;
2723 for (size_t i = 0; i < kNumEntries * 5; i++) {
2724 int key_idx = rnd.Next() % kNumEntries;
2725 std::string key = Key(key_idx);
2726 ASSERT_OK(db_->Get(ReadOptions(), key, &value));
2727
2728 if (read_keys.find(key_idx) == read_keys.end()) {
2729 auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
2730 total_useful_bytes +=
2731 GetEncodedEntrySize(internal_key.size(), value.size());
2732 read_keys.insert(key_idx);
2733 }
2734
2735 double expected_read_amp =
2736 static_cast<double>(total_useful_bytes) /
2737 options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
2738
2739 double read_amp =
2740 static_cast<double>(options.statistics->getTickerCount(
2741 READ_AMP_ESTIMATE_USEFUL_BYTES)) /
2742 options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
2743
2744 double error_pct = fabs(expected_read_amp - read_amp) * 100;
2745 // Error between reported read amp and real read amp should be less than
2746 // 2%
2747 EXPECT_LE(error_pct, 2);
2748 }
2749
2750 // Make sure we read everything in the DB (which is smaller than our cache)
2751 Iterator* iter = db_->NewIterator(ReadOptions());
2752 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
2753 ASSERT_EQ(iter->value().ToString(), Get(iter->key().ToString()));
2754 }
2755 ASSERT_OK(iter->status());
2756 delete iter;
2757
2758 // Read amp is on average 100% since we read everything we loaded into memory
2759 if (k == 0) {
2760 ASSERT_EQ(
2761 options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES),
2762 options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES));
2763 } else {
2764 ASSERT_NEAR(
2765 options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES) *
2766 1.0f /
2767 options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES),
2768 1, .01);
2769 }
2770 }
2771 }
2772
2773 #ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented
2774 TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) {
2775 {
2776 const int kIdBufLen = 100;
2777 char id_buf[kIdBufLen];
2778 Status s = Status::NotSupported();
2779 #ifndef OS_WIN
2780 // You can't open a directory on Windows using a random access file
2781 std::unique_ptr<RandomAccessFile> file;
2782 s = env_->NewRandomAccessFile(dbname_, &file, EnvOptions());
2783 if (s.ok()) {
2784 if (file->GetUniqueId(id_buf, kIdBufLen) == 0) {
2785 // The fs holding the db directory doesn't support getting a unique file id;
2786 // this means that running this test would fail because lru_cache would
2787 // reload the blocks even though they are already in the cache
2788 return;
2789 }
2790 }
2791 #endif
2792 if (!s.ok()) {
2793 std::unique_ptr<Directory> dir;
2794 ASSERT_OK(env_->NewDirectory(dbname_, &dir));
2795 if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) {
2796 // The fs holding the db directory doesn't support getting a unique file id;
2797 // this means that running this test would fail because lru_cache would
2798 // reload the blocks even though they are already in the cache
2799 return;
2800 }
2801 }
2802 }
2803 uint32_t bytes_per_bit[2] = {1, 16};
2804 for (size_t k = 0; k < 2; k++) {
2805 std::shared_ptr<Cache> lru_cache = NewLRUCache(1024 * 1024 * 1024);
2806 std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
2807
2808 Options options = CurrentOptions();
2809 BlockBasedTableOptions bbto;
2810 // Disable delta encoding to make it easier to calculate read amplification
2811 bbto.use_delta_encoding = false;
2812 // Huge block cache to make it easier to calculate read amplification
2813 bbto.block_cache = lru_cache;
2814 bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
2815 options.table_factory.reset(NewBlockBasedTableFactory(bbto));
2816 options.statistics = stats;
2817 DestroyAndReopen(options);
2818
2819 const int kNumEntries = 10000;
2820
2821 Random rnd(301);
2822 for (int i = 0; i < kNumEntries; i++) {
2823 ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
2824 }
2825 ASSERT_OK(Flush());
2826
2827 Close();
2828 Reopen(options);
2829
2830 uint64_t total_useful_bytes = 0;
2831 std::set<int> read_keys;
2832 std::string value;
2833 // Iter1: Read half the DB, Read even keys
2834 // Key(0), Key(2), Key(4), Key(6), Key(8), ...
2835 for (int i = 0; i < kNumEntries; i += 2) {
2836 std::string key = Key(i);
2837 ASSERT_OK(db_->Get(ReadOptions(), key, &value));
2838
2839 if (read_keys.find(i) == read_keys.end()) {
2840 auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
2841 total_useful_bytes +=
2842 GetEncodedEntrySize(internal_key.size(), value.size());
2843 read_keys.insert(i);
2844 }
2845 }
2846
2847 size_t total_useful_bytes_iter1 =
2848 options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
2849 size_t total_loaded_bytes_iter1 =
2850 options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
2851
2852 Close();
2853 std::shared_ptr<Statistics> new_statistics =
2854 ROCKSDB_NAMESPACE::CreateDBStatistics();
2855 // Destroy old statistics obj that the blocks in lru_cache are pointing to
2856 options.statistics.reset();
2857 // Use the statistics object that we just created
2858 options.statistics = new_statistics;
2859 Reopen(options);
2860
2861 // Iter2: Read half the DB, Read odd keys
2862 // Key(1), Key(3), Key(5), Key(7), Key(9), ...
2863 for (int i = 1; i < kNumEntries; i += 2) {
2864 std::string key = Key(i);
2865 ASSERT_OK(db_->Get(ReadOptions(), key, &value));
2866
2867 if (read_keys.find(i) == read_keys.end()) {
2868 auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
2869 total_useful_bytes +=
2870 GetEncodedEntrySize(internal_key.size(), value.size());
2871 read_keys.insert(i);
2872 }
2873 }
2874
2875 size_t total_useful_bytes_iter2 =
2876 options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
2877 size_t total_loaded_bytes_iter2 =
2878 options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
2879
2880
2881 // Read amp is on average 100% since we read everything we loaded into memory
2882 if (k == 0) {
2883 ASSERT_EQ(total_useful_bytes_iter1 + total_useful_bytes_iter2,
2884 total_loaded_bytes_iter1 + total_loaded_bytes_iter2);
2885 } else {
2886 ASSERT_NEAR((total_useful_bytes_iter1 + total_useful_bytes_iter2) * 1.0f /
2887 (total_loaded_bytes_iter1 + total_loaded_bytes_iter2),
2888 1, .01);
2889 }
2890 }
2891 }
2892 #endif // !OS_SOLARIS
2893
2894 #ifndef ROCKSDB_LITE
2895 TEST_F(DBTest2, AutomaticCompactionOverlapManualCompaction) {
2896 Options options = CurrentOptions();
2897 options.num_levels = 3;
2898 options.IncreaseParallelism(20);
2899 DestroyAndReopen(options);
2900
2901 ASSERT_OK(Put(Key(0), "a"));
2902 ASSERT_OK(Put(Key(5), "a"));
2903 ASSERT_OK(Flush());
2904
2905 ASSERT_OK(Put(Key(10), "a"));
2906 ASSERT_OK(Put(Key(15), "a"));
2907 ASSERT_OK(Flush());
2908
2909 CompactRangeOptions cro;
2910 cro.change_level = true;
2911 cro.target_level = 2;
2912 ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
2913
2914 auto get_stat = [](std::string level_str, LevelStatType type,
2915 std::map<std::string, std::string> props) {
2916 auto prop_str =
2917 "compaction." + level_str + "." +
2918 InternalStats::compaction_level_stats.at(type).property_name.c_str();
2919 auto prop_item = props.find(prop_str);
2920 return prop_item == props.end() ? 0 : std::stod(prop_item->second);
2921 };
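  // get_stat() looks up one per-level compaction counter from the flat map
  // returned by GetMapProperty("rocksdb.cfstats"). For example (assuming the
  // usual property naming), get_stat("L2", LevelStatType::NUM_FILES, prop)
  // reads something like the "compaction.L2.NumFiles" entry and returns 0 if
  // the key is absent.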
2922
2923 // Trivial move 2 files to L2
2924 ASSERT_EQ("0,0,2", FilesPerLevel());
2925 // Also test that the stats GetMapProperty API reports the same result
2926 {
2927 std::map<std::string, std::string> prop;
2928 ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
2929 ASSERT_EQ(0, get_stat("L0", LevelStatType::NUM_FILES, prop));
2930 ASSERT_EQ(0, get_stat("L1", LevelStatType::NUM_FILES, prop));
2931 ASSERT_EQ(2, get_stat("L2", LevelStatType::NUM_FILES, prop));
2932 ASSERT_EQ(2, get_stat("Sum", LevelStatType::NUM_FILES, prop));
2933 }
2934
2935 // While the compaction is running, we will create 2 new files that
2936 // can fit in L2; these 2 files will be moved to L2, overlapping with
2937 // the running compaction and breaking the LSM consistency.
2938 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
2939 "CompactionJob::Run():Start", [&](void* /*arg*/) {
2940 ASSERT_OK(
2941 dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
2942 {"max_bytes_for_level_base", "1"}}));
2943 ASSERT_OK(Put(Key(6), "a"));
2944 ASSERT_OK(Put(Key(7), "a"));
2945 ASSERT_OK(Flush());
2946
2947 ASSERT_OK(Put(Key(8), "a"));
2948 ASSERT_OK(Put(Key(9), "a"));
2949 ASSERT_OK(Flush());
2950 });
2951 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
2952
2953 // Run a manual compaction that will compact the 2 files in L2
2954 // into 1 file in L2
2955 cro.exclusive_manual_compaction = false;
2956 cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
2957 ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
2958
2959 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
2960
2961 // Test that the stats GetMapProperty API reports 1 file in L2
2962 {
2963 std::map<std::string, std::string> prop;
2964 ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
2965 ASSERT_EQ(1, get_stat("L2", LevelStatType::NUM_FILES, prop));
2966 }
2967 }
2968
2969 TEST_F(DBTest2, ManualCompactionOverlapManualCompaction) {
2970 Options options = CurrentOptions();
2971 options.num_levels = 2;
2972 options.IncreaseParallelism(20);
2973 options.disable_auto_compactions = true;
2974 DestroyAndReopen(options);
2975
2976 ASSERT_OK(Put(Key(0), "a"));
2977 ASSERT_OK(Put(Key(5), "a"));
2978 ASSERT_OK(Flush());
2979
2980 ASSERT_OK(Put(Key(10), "a"));
2981 ASSERT_OK(Put(Key(15), "a"));
2982 ASSERT_OK(Flush());
2983
2984 ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
2985
2986 // Trivial move 2 files to L1
2987 ASSERT_EQ("0,2", FilesPerLevel());
2988
2989 std::function<void()> bg_manual_compact = [&]() {
2990 std::string k1 = Key(6);
2991 std::string k2 = Key(9);
2992 Slice k1s(k1);
2993 Slice k2s(k2);
2994 CompactRangeOptions cro;
2995 cro.exclusive_manual_compaction = false;
2996 ASSERT_OK(db_->CompactRange(cro, &k1s, &k2s));
2997 };
2998 ROCKSDB_NAMESPACE::port::Thread bg_thread;
2999
3000 // While the compaction is running, we will create 2 new files that
3001 // can fit in L1; these 2 files will be moved to L1, overlapping with
3002 // the running compaction and breaking the LSM consistency.
3003 std::atomic<bool> flag(false);
3004 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3005 "CompactionJob::Run():Start", [&](void* /*arg*/) {
3006 if (flag.exchange(true)) {
3007 // We want to make sure to call this callback only once
3008 return;
3009 }
3010 ASSERT_OK(Put(Key(6), "a"));
3011 ASSERT_OK(Put(Key(7), "a"));
3012 ASSERT_OK(Flush());
3013
3014 ASSERT_OK(Put(Key(8), "a"));
3015 ASSERT_OK(Put(Key(9), "a"));
3016 ASSERT_OK(Flush());
3017
3018 // Start a non-exclusive manual compaction in a bg thread
3019 bg_thread = port::Thread(bg_manual_compact);
3020 // This manual compaction conflicts with the other manual compaction,
3021 // so it should wait until the first compaction finishes
3022 env_->SleepForMicroseconds(1000000);
3023 });
3024 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3025
3026 // Run a manual compaction that will compact the 2 files in L1
3027 // into 1 file in L1
3028 CompactRangeOptions cro;
3029 cro.exclusive_manual_compaction = false;
3030 cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
3031 ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
3032 bg_thread.join();
3033
3034 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3035 }
3036
3037 TEST_F(DBTest2, PausingManualCompaction1) {
3038 Options options = CurrentOptions();
3039 options.disable_auto_compactions = true;
3040 options.num_levels = 7;
3041
3042 DestroyAndReopen(options);
3043 Random rnd(301);
3044 // Generate a file containing 10 keys.
3045 for (int i = 0; i < 10; i++) {
3046 ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
3047 }
3048 ASSERT_OK(Flush());
3049
3050 // Generate another file containing the same keys
3051 for (int i = 0; i < 10; i++) {
3052 ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
3053 }
3054 ASSERT_OK(Flush());
3055
3056 int manual_compactions_paused = 0;
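  // The sync point below fires inside CompactionJob::Run() and (as used here)
  // hands back a pointer to what appears to be the pause counter the
  // compaction checks. Bumping it from the callback is effectively the same as
  // calling DisableManualCompaction() while the compaction is in flight, so
  // the manual compaction should stop and report IsManualCompactionPaused().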
3057 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3058 "CompactionJob::Run():PausingManualCompaction:1", [&](void* arg) {
3059 auto paused = static_cast<std::atomic<int>*>(arg);
3060 ASSERT_EQ(0, paused->load(std::memory_order_acquire));
3061 paused->fetch_add(1, std::memory_order_release);
3062 manual_compactions_paused += 1;
3063 });
3064 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3065
3066 std::vector<std::string> files_before_compact, files_after_compact;
3067 // Remember file name before compaction is triggered
3068 std::vector<LiveFileMetaData> files_meta;
3069 dbfull()->GetLiveFilesMetaData(&files_meta);
3070 for (auto file : files_meta) {
3071 files_before_compact.push_back(file.name);
3072 }
3073
3074 // OK, now trigger a manual compaction
3075 ASSERT_TRUE(dbfull()
3076 ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
3077 .IsManualCompactionPaused());
3078
3079 // Wait for compactions to get scheduled and stopped
3080 ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3081
3082 // Get file names after compaction is stopped
3083 files_meta.clear();
3084 dbfull()->GetLiveFilesMetaData(&files_meta);
3085 for (auto file : files_meta) {
3086 files_after_compact.push_back(file.name);
3087 }
3088
3089 // Like nothing happened
3090 ASSERT_EQ(files_before_compact, files_after_compact);
3091 ASSERT_EQ(manual_compactions_paused, 1);
3092
3093 manual_compactions_paused = 0;
3094 // Now make sure CompactFiles also does not run
3095 ASSERT_TRUE(dbfull()
3096 ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
3097 files_before_compact, 0)
3098 .IsManualCompactionPaused());
3099 // Wait for manual compaction to get scheduled and finish
3100 ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3101
3102 files_meta.clear();
3103 files_after_compact.clear();
3104 dbfull()->GetLiveFilesMetaData(&files_meta);
3105 for (auto file : files_meta) {
3106 files_after_compact.push_back(file.name);
3107 }
3108
3109 ASSERT_EQ(files_before_compact, files_after_compact);
3110 // CompactFiles returns at entry point
3111 ASSERT_EQ(manual_compactions_paused, 0);
3112
3113 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3114 }
3115
3116 // PausingManualCompaction does not affect auto compaction
3117 TEST_F(DBTest2, PausingManualCompaction2) {
3118 Options options = CurrentOptions();
3119 options.level0_file_num_compaction_trigger = 2;
3120 options.disable_auto_compactions = false;
3121
3122 DestroyAndReopen(options);
3123 dbfull()->DisableManualCompaction();
3124
3125 Random rnd(301);
3126 for (int i = 0; i < 2; i++) {
3127 // Generate a file containing 100 keys.
3128 for (int j = 0; j < 100; j++) {
3129 ASSERT_OK(Put(Key(j), rnd.RandomString(50)));
3130 }
3131 ASSERT_OK(Flush());
3132 }
3133 ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3134
3135 std::vector<LiveFileMetaData> files_meta;
3136 dbfull()->GetLiveFilesMetaData(&files_meta);
3137 ASSERT_EQ(files_meta.size(), 1);
3138 }
3139
3140 TEST_F(DBTest2, PausingManualCompaction3) {
3141 CompactRangeOptions compact_options;
3142 Options options = CurrentOptions();
3143 options.disable_auto_compactions = true;
3144 options.num_levels = 7;
3145
3146 Random rnd(301);
3147 auto generate_files = [&]() {
3148 for (int i = 0; i < options.num_levels; i++) {
3149 for (int j = 0; j < options.num_levels - i + 1; j++) {
3150 for (int k = 0; k < 1000; k++) {
3151 ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
3152 }
3153 ASSERT_OK(Flush());
3154 }
3155
3156 for (int l = 1; l < options.num_levels - i; l++) {
3157 MoveFilesToLevel(l);
3158 }
3159 }
3160 };
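  // A sketch of the LSM shape generate_files() is expected to build: for each
  // i it flushes (num_levels - i + 1) files of 1000 keys and then pushes them
  // down (num_levels - i - 1) levels, so L0 ends up with 2 files, L1 with 3,
  // ... and L6 with 8 -- matching the "2,3,4,5,6,7,8" assertion below.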
3161
3162 DestroyAndReopen(options);
3163 generate_files();
3164 #ifndef ROCKSDB_LITE
3165 ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
3166 #endif // !ROCKSDB_LITE
3167 int run_manual_compactions = 0;
3168 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3169 "CompactionJob::Run():PausingManualCompaction:1",
3170 [&](void* /*arg*/) { run_manual_compactions++; });
3171 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3172
3173 dbfull()->DisableManualCompaction();
3174 ASSERT_TRUE(dbfull()
3175 ->CompactRange(compact_options, nullptr, nullptr)
3176 .IsManualCompactionPaused());
3177 ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3178 // As manual compaction is disabled, the sync point is never reached
3179 ASSERT_EQ(run_manual_compactions, 0);
3180 #ifndef ROCKSDB_LITE
3181 ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
3182 #endif // !ROCKSDB_LITE
3183
3184 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3185 "CompactionJob::Run():PausingManualCompaction:1");
3186 dbfull()->EnableManualCompaction();
3187 ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
3188 ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3189 #ifndef ROCKSDB_LITE
3190 ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
3191 #endif // !ROCKSDB_LITE
3192
3193 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3194 }
3195
3196 TEST_F(DBTest2, PausingManualCompaction4) {
3197 CompactRangeOptions compact_options;
3198 Options options = CurrentOptions();
3199 options.disable_auto_compactions = true;
3200 options.num_levels = 7;
3201
3202 Random rnd(301);
3203 auto generate_files = [&]() {
3204 for (int i = 0; i < options.num_levels; i++) {
3205 for (int j = 0; j < options.num_levels - i + 1; j++) {
3206 for (int k = 0; k < 1000; k++) {
3207 ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
3208 }
3209 ASSERT_OK(Flush());
3210 }
3211
3212 for (int l = 1; l < options.num_levels - i; l++) {
3213 MoveFilesToLevel(l);
3214 }
3215 }
3216 };
3217
3218 DestroyAndReopen(options);
3219 generate_files();
3220 #ifndef ROCKSDB_LITE
3221 ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
3222 #endif // !ROCKSDB_LITE
3223 int run_manual_compactions = 0;
3224 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3225 "CompactionJob::Run():PausingManualCompaction:2", [&](void* arg) {
3226 auto paused = static_cast<std::atomic<int>*>(arg);
3227 ASSERT_EQ(0, paused->load(std::memory_order_acquire));
3228 paused->fetch_add(1, std::memory_order_release);
3229 run_manual_compactions++;
3230 });
3231 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3232
3233 ASSERT_TRUE(dbfull()
3234 ->CompactRange(compact_options, nullptr, nullptr)
3235 .IsManualCompactionPaused());
3236 ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3237 ASSERT_EQ(run_manual_compactions, 1);
3238 #ifndef ROCKSDB_LITE
3239 ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
3240 #endif // !ROCKSDB_LITE
3241
3242 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3243 "CompactionJob::Run():PausingManualCompaction:2");
3244 dbfull()->EnableManualCompaction();
3245 ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
3246 ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3247 #ifndef ROCKSDB_LITE
3248 ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
3249 #endif // !ROCKSDB_LITE
3250
3251 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3252 }
3253
3254 TEST_F(DBTest2, CancelManualCompaction1) {
3255 CompactRangeOptions compact_options;
3256 auto canceledPtr =
3257 std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
3258 compact_options.canceled = canceledPtr.get();
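  // CompactRangeOptions::canceled points at a caller-owned std::atomic<bool>.
  // While it holds true, CompactRange() bails out and returns a status for
  // which IsManualCompactionPaused() is true; the test flips it back to false
  // right before each CompactRange() call that is actually supposed to run.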
3259
3260 Options options = CurrentOptions();
3261 options.disable_auto_compactions = true;
3262 options.num_levels = 7;
3263
3264 Random rnd(301);
3265 auto generate_files = [&]() {
3266 for (int i = 0; i < options.num_levels; i++) {
3267 for (int j = 0; j < options.num_levels - i + 1; j++) {
3268 for (int k = 0; k < 1000; k++) {
3269 ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
3270 }
3271 ASSERT_OK(Flush());
3272 }
3273
3274 for (int l = 1; l < options.num_levels - i; l++) {
3275 MoveFilesToLevel(l);
3276 }
3277 }
3278 };
3279
3280 DestroyAndReopen(options);
3281 generate_files();
3282 #ifndef ROCKSDB_LITE
3283 ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
3284 #endif // !ROCKSDB_LITE
3285
3286 int run_manual_compactions = 0;
3287 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3288 "CompactionJob::Run():PausingManualCompaction:1",
3289 [&](void* /*arg*/) { run_manual_compactions++; });
3290 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3291
3292 // Setup a callback to disable compactions after a couple of levels are
3293 // compacted
3294 int compactions_run = 0;
3295 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3296 "DBImpl::RunManualCompaction()::1",
3297 [&](void* /*arg*/) { ++compactions_run; });
3298
3299 ASSERT_TRUE(dbfull()
3300 ->CompactRange(compact_options, nullptr, nullptr)
3301 .IsManualCompactionPaused());
3302 ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3303
3304 // Since the compaction was canceled before it started, the compaction
3305 // callbacks should not have been invoked at all.
3306 ASSERT_EQ(compactions_run, 0);
3307 ASSERT_EQ(run_manual_compactions, 0);
3308 #ifndef ROCKSDB_LITE
3309 ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
3310 #endif // !ROCKSDB_LITE
3311
3312 compactions_run = 0;
3313 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3314 "DBImpl::RunManualCompaction()::1");
3315 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3316 "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) {
3317 ++compactions_run;
3318 // Cancel the manual compaction after 3 compactions have run
3319 if (compactions_run == 3) {
3320 compact_options.canceled->store(true, std::memory_order_release);
3321 }
3322 });
3323
3324 compact_options.canceled->store(false, std::memory_order_release);
3325 ASSERT_TRUE(dbfull()
3326 ->CompactRange(compact_options, nullptr, nullptr)
3327 .IsManualCompactionPaused());
3328 ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3329
3330 ASSERT_EQ(compactions_run, 3);
3331
3332 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3333 "DBImpl::RunManualCompaction()::1");
3334 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3335 "CompactionJob::Run():PausingManualCompaction:1");
3336
3337 // Compactions should work again if we re-enable them.
3338 compact_options.canceled->store(false, std::memory_order_relaxed);
3339 ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
3340 ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3341 #ifndef ROCKSDB_LITE
3342 ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
3343 #endif // !ROCKSDB_LITE
3344
3345 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3346 }
3347
3348 TEST_F(DBTest2, CancelManualCompaction2) {
3349 CompactRangeOptions compact_options;
3350 auto canceledPtr =
3351 std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
3352 compact_options.canceled = canceledPtr.get();
3353 compact_options.max_subcompactions = 1;
3354
3355 Options options = CurrentOptions();
3356 options.disable_auto_compactions = true;
3357 options.num_levels = 7;
3358
3359 Random rnd(301);
3360 auto generate_files = [&]() {
3361 for (int i = 0; i < options.num_levels; i++) {
3362 for (int j = 0; j < options.num_levels - i + 1; j++) {
3363 for (int k = 0; k < 1000; k++) {
3364 ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
3365 }
3366 ASSERT_OK(Flush());
3367 }
3368
3369 for (int l = 1; l < options.num_levels - i; l++) {
3370 MoveFilesToLevel(l);
3371 }
3372 }
3373 };
3374
3375 DestroyAndReopen(options);
3376 generate_files();
3377 #ifndef ROCKSDB_LITE
3378 ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
3379 #endif // !ROCKSDB_LITE
3380
3381 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3382
3383 int compactions_run = 0;
3384 std::atomic<int> kv_compactions{0};
3385 int compactions_stopped_at = 0;
3386 int kv_compactions_stopped_at = 0;
3387 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3388 "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) {
3389 ++compactions_run;
3391 });
3392
3393 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3394 "CompactionIterator:ProcessKV", [&](void* /*arg*/) {
3395 int kv_compactions_run =
3396 kv_compactions.fetch_add(1, std::memory_order_release);
3397 if (kv_compactions_run == 5) {
3398 compact_options.canceled->store(true, std::memory_order_release);
3399 kv_compactions_stopped_at = kv_compactions_run;
3400 compactions_stopped_at = compactions_run;
3401 }
3402 });
3403
3404 compact_options.canceled->store(false, std::memory_order_release);
3405 ASSERT_TRUE(dbfull()
3406 ->CompactRange(compact_options, nullptr, nullptr)
3407 .IsManualCompactionPaused());
3408 ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3409
3410 // NOTE: since we set compact_options.max_subcompactions = 1 and store true
3411 // into the canceled variable from the single compacting thread (via the
3412 // callback), this value is deterministically kv_compactions_stopped_at + 1.
3413 ASSERT_EQ(kv_compactions, kv_compactions_stopped_at + 1);
3414 ASSERT_EQ(compactions_run, compactions_stopped_at);
3415
3416 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3417 "CompactionIterator:ProcessKV");
3418 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3419 "DBImpl::RunManualCompaction()::1");
3420 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3421 "CompactionJob::Run():PausingManualCompaction:1");
3422
3423 // Compactions should work again if we re-enable them.
3424 compact_options.canceled->store(false, std::memory_order_relaxed);
3425 ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
3426 ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3427 #ifndef ROCKSDB_LITE
3428 ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
3429 #endif // !ROCKSDB_LITE
3430
3431 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3432 }
3433
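// Listener used by the cancellation tests below: it counts how many
// compactions were reported as started/ended and checks that every completed
// compaction finished with the status code/subcode the test expects
// (e.g. Incomplete/kManualCompactionPaused for a canceled compaction).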
3434 class CancelCompactionListener : public EventListener {
3435 public:
3436 CancelCompactionListener()
3437 : num_compaction_started_(0), num_compaction_ended_(0) {}
3438
3439 void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
3440 ASSERT_EQ(ci.cf_name, "default");
3441 ASSERT_EQ(ci.base_input_level, 0);
3442 num_compaction_started_++;
3443 }
3444
3445 void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
3446 ASSERT_EQ(ci.cf_name, "default");
3447 ASSERT_EQ(ci.base_input_level, 0);
3448 ASSERT_EQ(ci.status.code(), code_);
3449 ASSERT_EQ(ci.status.subcode(), subcode_);
3450 num_compaction_ended_++;
3451 }
3452
3453 std::atomic<size_t> num_compaction_started_;
3454 std::atomic<size_t> num_compaction_ended_;
3455 Status::Code code_;
3456 Status::SubCode subcode_;
3457 };
3458
3459 TEST_F(DBTest2, CancelManualCompactionWithListener) {
3460 CompactRangeOptions compact_options;
3461 auto canceledPtr =
3462 std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
3463 compact_options.canceled = canceledPtr.get();
3464 compact_options.max_subcompactions = 1;
3465
3466 Options options = CurrentOptions();
3467 options.disable_auto_compactions = true;
3468 CancelCompactionListener* listener = new CancelCompactionListener();
3469 options.listeners.emplace_back(listener);
3470
3471 DestroyAndReopen(options);
3472
3473 Random rnd(301);
3474 for (int i = 0; i < 10; i++) {
3475 for (int j = 0; j < 10; j++) {
3476 ASSERT_OK(Put(Key(i + j * 10), rnd.RandomString(50)));
3477 }
3478 ASSERT_OK(Flush());
3479 }
3480
3481 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3482
3483 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3484 "CompactionIterator:ProcessKV", [&](void* /*arg*/) {
3485 compact_options.canceled->store(true, std::memory_order_release);
3486 });
3487
3488 int running_compaction = 0;
3489 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3490 "CompactionJob::FinishCompactionOutputFile1",
3491 [&](void* /*arg*/) { running_compaction++; });
3492
3493 // Case I: 1 Notify begin compaction, 2 DisableManualCompaction, 3 Compaction
3494 // not run, 4 Notify compaction end.
3495 listener->code_ = Status::kIncomplete;
3496 listener->subcode_ = Status::SubCode::kManualCompactionPaused;
3497
3498 compact_options.canceled->store(false, std::memory_order_release);
3499 ASSERT_TRUE(dbfull()
3500 ->CompactRange(compact_options, nullptr, nullptr)
3501 .IsManualCompactionPaused());
3502 ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3503
3504 ASSERT_GT(listener->num_compaction_started_, 0);
3505 ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
3506 ASSERT_EQ(running_compaction, 0);
3507
3508 listener->num_compaction_started_ = 0;
3509 listener->num_compaction_ended_ = 0;
3510
3511 // Case II: 1 DisableManualCompaction, 2 Notify begin compaction (return
3512 // without notifying), 3 Notify compaction end (return without notifying).
3513 ASSERT_TRUE(dbfull()
3514 ->CompactRange(compact_options, nullptr, nullptr)
3515 .IsManualCompactionPaused());
3516 ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3517
3518 ASSERT_EQ(listener->num_compaction_started_, 0);
3519 ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
3520 ASSERT_EQ(running_compaction, 0);
3521
3522 // Case III: 1 Notify begin compaction, 2 Compaction in between,
3523 // 3 DisableManualCompaction, 4 Notify compaction end.
3524 // compact_options.canceled->store(false, std::memory_order_release);
3525 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
3526 "CompactionIterator:ProcessKV");
3527
3528 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3529 "CompactionJob::Run:BeforeVerify", [&](void* /*arg*/) {
3530 compact_options.canceled->store(true, std::memory_order_release);
3531 });
3532
3533 listener->code_ = Status::kOk;
3534 listener->subcode_ = Status::SubCode::kNone;
3535
3536 compact_options.canceled->store(false, std::memory_order_release);
3537 ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
3538 ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
3539
3540 ASSERT_GT(listener->num_compaction_started_, 0);
3541 ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
3542
3543 // Compaction job will succeed.
3544 ASSERT_GT(running_compaction, 0);
3545
3546 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
3547 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3548 }
3549
3550 TEST_F(DBTest2, CompactionOnBottomPriorityWithListener) {
3551 int num_levels = 3;
3552 const int kNumFilesTrigger = 4;
3553
3554 Options options = CurrentOptions();
3555 env_->SetBackgroundThreads(0, Env::Priority::HIGH);
3556 env_->SetBackgroundThreads(0, Env::Priority::LOW);
3557 env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
3558 options.env = env_;
3559 options.compaction_style = kCompactionStyleUniversal;
3560 options.num_levels = num_levels;
3561 options.write_buffer_size = 100 << 10; // 100KB
3562 options.target_file_size_base = 32 << 10; // 32KB
3563 options.level0_file_num_compaction_trigger = kNumFilesTrigger;
3564 // Trigger compaction if size amplification exceeds 110%
3565 options.compaction_options_universal.max_size_amplification_percent = 110;
3566
3567 CancelCompactionListener* listener = new CancelCompactionListener();
3568 options.listeners.emplace_back(listener);
3569
3570 DestroyAndReopen(options);
3571
3572 int num_bottom_thread_compaction_scheduled = 0;
3573 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3574
3575 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3576 "DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
3577 [&](void* /*arg*/) { num_bottom_thread_compaction_scheduled++; });
3578
3579 int num_compaction_jobs = 0;
3580 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3581 "CompactionJob::Run():End",
3582 [&](void* /*arg*/) { num_compaction_jobs++; });
3583
3584 listener->code_ = Status::kOk;
3585 listener->subcode_ = Status::SubCode::kNone;
3586
3587 Random rnd(301);
3588 for (int i = 0; i < 1; ++i) {
3589 for (int num = 0; num < kNumFilesTrigger; num++) {
3590 int key_idx = 0;
3591 GenerateNewFile(&rnd, &key_idx, true /* no_wait */);
3592 // Pass no_wait above because GenerateNewFile otherwise waits for both flush
3593 // and compaction. We don't want to wait for compaction here because the full
3594 // compaction is intentionally blocked while more files are flushed.
3595 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
3596 }
3597 }
3598 ASSERT_OK(dbfull()->TEST_WaitForCompact());
3599 ASSERT_GT(num_bottom_thread_compaction_scheduled, 0);
3600 ASSERT_EQ(num_compaction_jobs, 1);
3601 ASSERT_GT(listener->num_compaction_started_, 0);
3602 ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
3603
3604 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
3605 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3606 }
3607
3608 TEST_F(DBTest2, OptimizeForPointLookup) {
3609 Options options = CurrentOptions();
3610 Close();
3611 options.OptimizeForPointLookup(2);
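  // OptimizeForPointLookup(2) switches the options to a point-lookup-oriented
  // configuration with a block cache of roughly 2 MB; the exact knobs it sets
  // (e.g. filtering / index layout) are an implementation detail, so this test
  // only checks that Get() keeps working before and after a flush.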
3612 ASSERT_OK(DB::Open(options, dbname_, &db_));
3613
3614 ASSERT_OK(Put("foo", "v1"));
3615 ASSERT_EQ("v1", Get("foo"));
3616 ASSERT_OK(Flush());
3617 ASSERT_EQ("v1", Get("foo"));
3618 }
3619
3620 TEST_F(DBTest2, OptimizeForSmallDB) {
3621 Options options = CurrentOptions();
3622 Close();
3623 options.OptimizeForSmallDb();
3624
3625 // Find the cache object
3626 ASSERT_TRUE(options.table_factory->IsInstanceOf(
3627 TableFactory::kBlockBasedTableName()));
3628 auto table_options =
3629 options.table_factory->GetOptions<BlockBasedTableOptions>();
3630
3631 ASSERT_TRUE(table_options != nullptr);
3632 std::shared_ptr<Cache> cache = table_options->block_cache;
3633
3634 ASSERT_EQ(0, cache->GetUsage());
3635 ASSERT_OK(DB::Open(options, dbname_, &db_));
3636 ASSERT_OK(Put("foo", "v1"));
3637
3638 // memtable size is charged to the block cache
3639 ASSERT_NE(0, cache->GetUsage());
3640
3641 ASSERT_EQ("v1", Get("foo"));
3642 ASSERT_OK(Flush());
3643
3644 size_t prev_size = cache->GetUsage();
3645 // Remember the block cache size, so that we can verify that
3646 // it is filled after Get().
3647 // Use a pinnable slice so that it pins the block, ensuring that
3648 // it is not evicted when we check the size.
3649 PinnableSlice value;
3650 ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), "foo", &value));
3651 ASSERT_GT(cache->GetUsage(), prev_size);
3652 value.Reset();
3653 }
3654
3655 #endif // ROCKSDB_LITE
3656
3657 TEST_F(DBTest2, IterRaceFlush1) {
3658 ASSERT_OK(Put("foo", "v1"));
3659
3660 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
3661 {{"DBImpl::NewIterator:1", "DBTest2::IterRaceFlush:1"},
3662 {"DBTest2::IterRaceFlush:2", "DBImpl::NewIterator:2"}});
3663
3664 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3665
3666 ROCKSDB_NAMESPACE::port::Thread t1([&] {
3667 TEST_SYNC_POINT("DBTest2::IterRaceFlush:1");
3668 ASSERT_OK(Put("foo", "v2"));
3669 ASSERT_OK(Flush());
3670 TEST_SYNC_POINT("DBTest2::IterRaceFlush:2");
3671 });
3672
3673 // The iterator is created after the first Put(), and its snapshot sequence is
3674 // assigned after the second Put(), so it must see v2.
3675 {
3676 std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
3677 it->Seek("foo");
3678 ASSERT_TRUE(it->Valid());
3679 ASSERT_OK(it->status());
3680 ASSERT_EQ("foo", it->key().ToString());
3681 ASSERT_EQ("v2", it->value().ToString());
3682 }
3683
3684 t1.join();
3685 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3686 }
3687
3688 TEST_F(DBTest2, IterRaceFlush2) {
3689 ASSERT_OK(Put("foo", "v1"));
3690
3691 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
3692 {{"DBImpl::NewIterator:3", "DBTest2::IterRaceFlush2:1"},
3693 {"DBTest2::IterRaceFlush2:2", "DBImpl::NewIterator:4"}});
3694
3695 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3696
3697 ROCKSDB_NAMESPACE::port::Thread t1([&] {
3698 TEST_SYNC_POINT("DBTest2::IterRaceFlush2:1");
3699 ASSERT_OK(Put("foo", "v2"));
3700 ASSERT_OK(Flush());
3701 TEST_SYNC_POINT("DBTest2::IterRaceFlush2:2");
3702 });
3703
3704 // The iterator is created after the first Put(), and its snapshot sequence is
3705 // assigned before the second Put(), thus it must see v1.
3706 {
3707 std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
3708 it->Seek("foo");
3709 ASSERT_TRUE(it->Valid());
3710 ASSERT_OK(it->status());
3711 ASSERT_EQ("foo", it->key().ToString());
3712 ASSERT_EQ("v1", it->value().ToString());
3713 }
3714
3715 t1.join();
3716 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3717 }
3718
3719 TEST_F(DBTest2, IterRefreshRaceFlush) {
3720 ASSERT_OK(Put("foo", "v1"));
3721
3722 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
3723 {{"ArenaWrappedDBIter::Refresh:1", "DBTest2::IterRefreshRaceFlush:1"},
3724 {"DBTest2::IterRefreshRaceFlush:2", "ArenaWrappedDBIter::Refresh:2"}});
3725
3726 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3727
3728 ROCKSDB_NAMESPACE::port::Thread t1([&] {
3729 TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:1");
3730 ASSERT_OK(Put("foo", "v2"));
3731 ASSERT_OK(Flush());
3732 TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:2");
3733 });
3734
3735 // The iterator is refreshed after the first Put(), and its sequence number is
3736 // assigned after the second Put(), thus it must see v2.
3737 {
3738 std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
3739 ASSERT_OK(it->status());
3740 ASSERT_OK(it->Refresh());
3741 it->Seek("foo");
3742 ASSERT_TRUE(it->Valid());
3743 ASSERT_OK(it->status());
3744 ASSERT_EQ("foo", it->key().ToString());
3745 ASSERT_EQ("v2", it->value().ToString());
3746 }
3747
3748 t1.join();
3749 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3750 }
3751
3752 TEST_F(DBTest2, GetRaceFlush1) {
3753 ASSERT_OK(Put("foo", "v1"));
3754
3755 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
3756 {{"DBImpl::GetImpl:1", "DBTest2::GetRaceFlush:1"},
3757 {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:2"}});
3758
3759 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3760
3761 ROCKSDB_NAMESPACE::port::Thread t1([&] {
3762 TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
3763 ASSERT_OK(Put("foo", "v2"));
3764 ASSERT_OK(Flush());
3765 TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
3766 });
3767
3768 // Get() is issued after the first Put(), so it should see either
3769 // "v1" or "v2".
3770 ASSERT_NE("NOT_FOUND", Get("foo"));
3771 t1.join();
3772 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3773 }
3774
3775 TEST_F(DBTest2, GetRaceFlush2) {
3776 ASSERT_OK(Put("foo", "v1"));
3777
3778 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
3779 {{"DBImpl::GetImpl:3", "DBTest2::GetRaceFlush:1"},
3780 {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:4"}});
3781
3782 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3783
3784 port::Thread t1([&] {
3785 TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
3786 ASSERT_OK(Put("foo", "v2"));
3787 ASSERT_OK(Flush());
3788 TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
3789 });
3790
3791 // Get() is issued after the first Put(), so it should see either
3792 // "v1" or "v2".
3793 ASSERT_NE("NOT_FOUND", Get("foo"));
3794 t1.join();
3795 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3796 }
3797
3798 TEST_F(DBTest2, DirectIO) {
3799 if (!IsDirectIOSupported()) {
3800 return;
3801 }
3802 Options options = CurrentOptions();
3803 options.use_direct_reads = options.use_direct_io_for_flush_and_compaction =
3804 true;
3805 options.allow_mmap_reads = options.allow_mmap_writes = false;
3806 DestroyAndReopen(options);
3807
3808 ASSERT_OK(Put(Key(0), "a"));
3809 ASSERT_OK(Put(Key(5), "a"));
3810 ASSERT_OK(Flush());
3811
3812 ASSERT_OK(Put(Key(10), "a"));
3813 ASSERT_OK(Put(Key(15), "a"));
3814 ASSERT_OK(Flush());
3815
3816 ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
3817 Reopen(options);
3818 }
3819
3820 TEST_F(DBTest2, MemtableOnlyIterator) {
3821 Options options = CurrentOptions();
3822 CreateAndReopenWithCF({"pikachu"}, options);
3823
3824 ASSERT_OK(Put(1, "foo", "first"));
3825 ASSERT_OK(Put(1, "bar", "second"));
3826
3827 ReadOptions ropt;
3828 ropt.read_tier = kMemtableTier;
3829 std::string value;
3830 Iterator* it = nullptr;
3831
3832 // Before flushing
3833 // point lookups
3834 ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
3835 ASSERT_EQ("first", value);
3836 ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
3837 ASSERT_EQ("second", value);
3838
3839 // Memtable-only iterator (read_tier=kMemtableTier); data not flushed yet.
3840 it = db_->NewIterator(ropt, handles_[1]);
3841 int count = 0;
3842 for (it->SeekToFirst(); it->Valid(); it->Next()) {
3843 ASSERT_TRUE(it->Valid());
3844 count++;
3845 }
3846 ASSERT_TRUE(!it->Valid());
3847 ASSERT_EQ(2, count);
3848 delete it;
3849
3850 ASSERT_OK(Flush(1));
3851
3852 // After flushing
3853 // point lookups
3854 ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
3855 ASSERT_EQ("first", value);
3856 ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
3857 ASSERT_EQ("second", value);
3858 // nothing should be returned using memtable-only iterator after flushing.
3859 it = db_->NewIterator(ropt, handles_[1]);
3860 ASSERT_OK(it->status());
3861 count = 0;
3862 for (it->SeekToFirst(); it->Valid(); it->Next()) {
3863 ASSERT_TRUE(it->Valid());
3864 count++;
3865 }
3866 ASSERT_TRUE(!it->Valid());
3867 ASSERT_EQ(0, count);
3868 ASSERT_OK(it->status());
3869 delete it;
3870
3871 // Add a key to memtable
3872 ASSERT_OK(Put(1, "foobar", "third"));
3873 it = db_->NewIterator(ropt, handles_[1]);
3874 ASSERT_OK(it->status());
3875 count = 0;
3876 for (it->SeekToFirst(); it->Valid(); it->Next()) {
3877 ASSERT_TRUE(it->Valid());
3878 ASSERT_EQ("foobar", it->key().ToString());
3879 ASSERT_EQ("third", it->value().ToString());
3880 count++;
3881 }
3882 ASSERT_TRUE(!it->Valid());
3883 ASSERT_EQ(1, count);
3884 ASSERT_OK(it->status());
3885 delete it;
3886 }
3887
3888 TEST_F(DBTest2, LowPriWrite) {
3889 Options options = CurrentOptions();
3890 // Compaction pressure should build up after 6 files, since the L0 trigger is 4
3891 options.level0_file_num_compaction_trigger = 4;
3892 options.level0_slowdown_writes_trigger = 12;
3893 options.level0_stop_writes_trigger = 30;
3894 options.delayed_write_rate = 8 * 1024 * 1024;
3895 Reopen(options);
3896
3897 std::atomic<int> rate_limit_count(0);
3898
3899 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
3900 "GenericRateLimiter::Request:1", [&](void* arg) {
3901 rate_limit_count.fetch_add(1);
3902 int64_t* rate_bytes_per_sec = static_cast<int64_t*>(arg);
3903 ASSERT_EQ(1024 * 1024, *rate_bytes_per_sec);
3904 });
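  // Once compaction pressure builds up, low_pri writes are expected to be
  // throttled through an internal rate limiter; the callback above counts the
  // throttle requests and checks the requested rate (1 MB/s in this setup).
  // Writes with low_pri = false should never hit that path.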
3905 // Block compaction
3906 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
3907 {"DBTest.LowPriWrite:0", "DBImpl::BGWorkCompaction"},
3908 });
3909 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
3910 WriteOptions wo;
3911 for (int i = 0; i < 6; i++) {
3912 wo.low_pri = false;
3913 ASSERT_OK(Put("", "", wo));
3914 wo.low_pri = true;
3915 ASSERT_OK(Put("", "", wo));
3916 ASSERT_OK(Flush());
3917 }
3918 ASSERT_EQ(0, rate_limit_count.load());
3919 wo.low_pri = true;
3920 ASSERT_OK(Put("", "", wo));
3921 ASSERT_EQ(1, rate_limit_count.load());
3922 wo.low_pri = false;
3923 ASSERT_OK(Put("", "", wo));
3924 ASSERT_EQ(1, rate_limit_count.load());
3925
3926 TEST_SYNC_POINT("DBTest.LowPriWrite:0");
3927 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
3928
3929 ASSERT_OK(dbfull()->TEST_WaitForCompact());
3930 wo.low_pri = true;
3931 ASSERT_OK(Put("", "", wo));
3932 ASSERT_EQ(1, rate_limit_count.load());
3933 wo.low_pri = false;
3934 ASSERT_OK(Put("", "", wo));
3935 ASSERT_EQ(1, rate_limit_count.load());
3936 }
3937
3938 #ifndef ROCKSDB_LITE
3939 TEST_F(DBTest2, RateLimitedCompactionReads) {
3940 // compaction input has 512KB data
3941 const int kNumKeysPerFile = 128;
3942 const int kBytesPerKey = 1024;
3943 const int kNumL0Files = 4;
3944
3945 for (auto use_direct_io : {false, true}) {
3946 if (use_direct_io && !IsDirectIOSupported()) {
3947 continue;
3948 }
3949 Options options = CurrentOptions();
3950 options.compression = kNoCompression;
3951 options.level0_file_num_compaction_trigger = kNumL0Files;
3952 options.memtable_factory.reset(
3953 test::NewSpecialSkipListFactory(kNumKeysPerFile));
3954 options.new_table_reader_for_compaction_inputs = true;
3955 // takes roughly one second, split into 100 x 10ms intervals. Each interval
3956 // permits 5.12KB, which is smaller than the block size, so this test
3957 // exercises the code for chunking reads.
3958 options.rate_limiter.reset(NewGenericRateLimiter(
3959 static_cast<int64_t>(kNumL0Files * kNumKeysPerFile *
3960 kBytesPerKey) /* rate_bytes_per_sec */,
3961 10 * 1000 /* refill_period_us */, 10 /* fairness */,
3962 RateLimiter::Mode::kReadsOnly));
3963 options.use_direct_reads = options.use_direct_io_for_flush_and_compaction =
3964 use_direct_io;
3965 BlockBasedTableOptions bbto;
3966 bbto.block_size = 16384;
3967 bbto.no_block_cache = true;
3968 options.table_factory.reset(NewBlockBasedTableFactory(bbto));
3969 DestroyAndReopen(options);
3970
3971 for (int i = 0; i < kNumL0Files; ++i) {
3972 for (int j = 0; j <= kNumKeysPerFile; ++j) {
3973 ASSERT_OK(Put(Key(j), DummyString(kBytesPerKey)));
3974 }
3975 ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
3976 ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
3977 }
3978 ASSERT_OK(dbfull()->TEST_WaitForCompact());
3979 ASSERT_EQ(0, NumTableFilesAtLevel(0));
3980
3981 ASSERT_EQ(0, options.rate_limiter->GetTotalBytesThrough(Env::IO_HIGH));
3982 // should be slightly above 512KB due to non-data blocks read. Arbitrarily
3983 // chose 1MB as the upper bound on the total bytes read.
3984 size_t rate_limited_bytes =
3985 options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW);
3986 // Include the explicit prefetch of the footer in direct I/O case.
3987 size_t direct_io_extra = use_direct_io ? 512 * 1024 : 0;
3988 ASSERT_GE(
3989 rate_limited_bytes,
3990 static_cast<size_t>(kNumKeysPerFile * kBytesPerKey * kNumL0Files));
3991 ASSERT_LT(
3992 rate_limited_bytes,
3993 static_cast<size_t>(2 * kNumKeysPerFile * kBytesPerKey * kNumL0Files +
3994 direct_io_extra));
3995
3996 Iterator* iter = db_->NewIterator(ReadOptions());
3997 ASSERT_OK(iter->status());
3998 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
3999 ASSERT_EQ(iter->value().ToString(), DummyString(kBytesPerKey));
4000 }
4001 delete iter;
4002 // bytes read for user iterator shouldn't count against the rate limit.
4003 ASSERT_EQ(rate_limited_bytes,
4004 static_cast<size_t>(
4005 options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW)));
4006 }
4007 }
4008 #endif // ROCKSDB_LITE
4009
4010 // Make sure DB can be reopen with reduced number of levels, given no file
4011 // is on levels higher than the new num_levels.
4012 TEST_F(DBTest2, ReduceLevel) {
4013 Options options;
4014 options.env = env_;
4015 options.disable_auto_compactions = true;
4016 options.num_levels = 7;
4017 Reopen(options);
4018 ASSERT_OK(Put("foo", "bar"));
4019 ASSERT_OK(Flush());
4020 MoveFilesToLevel(6);
4021 #ifndef ROCKSDB_LITE
4022 ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
4023 #endif // !ROCKSDB_LITE
4024 CompactRangeOptions compact_options;
4025 compact_options.change_level = true;
4026 compact_options.target_level = 1;
4027 ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
4028 #ifndef ROCKSDB_LITE
4029 ASSERT_EQ("0,1", FilesPerLevel());
4030 #endif // !ROCKSDB_LITE
4031 options.num_levels = 3;
4032 Reopen(options);
4033 #ifndef ROCKSDB_LITE
4034 ASSERT_EQ("0,1", FilesPerLevel());
4035 #endif // !ROCKSDB_LITE
4036 }
4037
4038 // Test that ReadCallback is actually used in both memtable and sst tables
4039 TEST_F(DBTest2, ReadCallbackTest) {
4040 Options options;
4041 options.disable_auto_compactions = true;
4042 options.num_levels = 7;
4043 options.env = env_;
4044 Reopen(options);
4045 std::vector<const Snapshot*> snapshots;
4046 // Try to create a db with multiple layers and a memtable
4047 const std::string key = "foo";
4048 const std::string value = "bar";
4049 // This test assumes that the seq starts with 1 and is increased by 1 after
4050 // each write batch of size 1. If that behavior changes, the test needs to be
4051 // updated as well.
4052 // TODO(myabandeh): update this test to use the seq number that is returned by
4053 // the DB instead of assuming what seq the DB used.
4054 int i = 1;
4055 for (; i < 10; i++) {
4056 ASSERT_OK(Put(key, value + std::to_string(i)));
4057 // Take a snapshot to avoid the value being removed during compaction
4058 auto snapshot = dbfull()->GetSnapshot();
4059 snapshots.push_back(snapshot);
4060 }
4061 ASSERT_OK(Flush());
4062 for (; i < 20; i++) {
4063 ASSERT_OK(Put(key, value + std::to_string(i)));
4064 // Take a snapshot to avoid the value being removed during compaction
4065 auto snapshot = dbfull()->GetSnapshot();
4066 snapshots.push_back(snapshot);
4067 }
4068 ASSERT_OK(Flush());
4069 MoveFilesToLevel(6);
4070 #ifndef ROCKSDB_LITE
4071 ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
4072 #endif // !ROCKSDB_LITE
4073 for (; i < 30; i++) {
4074 ASSERT_OK(Put(key, value + std::to_string(i)));
4075 auto snapshot = dbfull()->GetSnapshot();
4076 snapshots.push_back(snapshot);
4077 }
4078 ASSERT_OK(Flush());
4079 #ifndef ROCKSDB_LITE
4080 ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel());
4081 #endif // !ROCKSDB_LITE
4082 // And also add some values to the memtable
4083 for (; i < 40; i++) {
4084 ASSERT_OK(Put(key, value + std::to_string(i)));
4085 auto snapshot = dbfull()->GetSnapshot();
4086 snapshots.push_back(snapshot);
4087 }
4088
4089 class TestReadCallback : public ReadCallback {
4090 public:
4091 explicit TestReadCallback(SequenceNumber snapshot)
4092 : ReadCallback(snapshot), snapshot_(snapshot) {}
4093 bool IsVisibleFullCheck(SequenceNumber seq) override {
4094 return seq <= snapshot_;
4095 }
4096
4097 private:
4098 SequenceNumber snapshot_;
4099 };
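  // GetImpl() consults callback->IsVisibleFullCheck(seq) for each candidate
  // version of the key, so passing TestReadCallback(seq) below is a sketch of
  // reading "as of" an arbitrary sequence number without taking a snapshot;
  // the loop then checks that the returned value matches that sequence.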
4100
4101 for (int seq = 1; seq < i; seq++) {
4102 PinnableSlice pinnable_val;
4103 ReadOptions roptions;
4104 TestReadCallback callback(seq);
4105 bool dont_care = true;
4106 DBImpl::GetImplOptions get_impl_options;
4107 get_impl_options.column_family = dbfull()->DefaultColumnFamily();
4108 get_impl_options.value = &pinnable_val;
4109 get_impl_options.value_found = &dont_care;
4110 get_impl_options.callback = &callback;
4111 Status s = dbfull()->GetImpl(roptions, key, get_impl_options);
4112 ASSERT_TRUE(s.ok());
4113 // Assuming that after each Put the DB increases the seq by one, the value and
4114 // seq number must be equal since we also increment the value by 1 after each Put.
4115 ASSERT_EQ(value + std::to_string(seq), pinnable_val.ToString());
4116 }
4117
4118 for (auto snapshot : snapshots) {
4119 dbfull()->ReleaseSnapshot(snapshot);
4120 }
4121 }
4122
4123 #ifndef ROCKSDB_LITE
4124
4125 TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) {
4126 // Regression test for race condition where an obsolete file is returned to
4127 // user as a "live file" but then deleted, all while file deletions are
4128 // disabled.
4129 //
4130 // It happened like this:
4131 //
4132 // 1. [flush thread] Log file "x.log" found by FindObsoleteFiles
4133 // 2. [user thread] DisableFileDeletions, GetSortedWalFiles are called and the
4134 // latter returned "x.log"
4135 // 3. [flush thread] PurgeObsoleteFiles deleted "x.log"
4136 // 4. [user thread] Reading "x.log" failed
4137 //
4138 // Unfortunately the only regression test I can come up with involves sleep.
4139 // We cannot set SyncPoints to repro since, once the fix is applied, the
4140 // SyncPoints would cause a deadlock as the repro's sequence of events is now
4141 // prohibited.
4142 //
4143 // Instead, if we sleep for a second between Find and Purge, and ensure the
4144 // read attempt happens after purge, then the sequence of events will almost
4145 // certainly happen on the old code.
4146 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
4147 {"DBImpl::BackgroundCallFlush:FilesFound",
4148 "DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"},
4149 {"DBImpl::PurgeObsoleteFiles:End",
4150 "DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured"},
4151 });
4152 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
4153 "DBImpl::PurgeObsoleteFiles:Begin",
4154 [&](void* /*arg*/) { env_->SleepForMicroseconds(1000000); });
4155 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
4156
4157 ASSERT_OK(Put("key", "val"));
4158 FlushOptions flush_opts;
4159 flush_opts.wait = false;
4160 db_->Flush(flush_opts);
4161 TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered");
4162
4163 ASSERT_OK(db_->DisableFileDeletions());
4164 VectorLogPtr log_files;
4165 ASSERT_OK(db_->GetSortedWalFiles(log_files));
4166 TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured");
4167 for (const auto& log_file : log_files) {
4168 ASSERT_OK(env_->FileExists(LogFileName(dbname_, log_file->LogNumber())));
4169 }
4170
4171 ASSERT_OK(db_->EnableFileDeletions());
4172 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
4173 }
4174
4175 TEST_F(DBTest2, TestNumPread) {
4176 Options options = CurrentOptions();
4177 bool prefetch_supported =
4178 test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
4179 // disable block cache
4180 BlockBasedTableOptions table_options;
4181 table_options.no_block_cache = true;
4182 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
4183 Reopen(options);
4184 env_->count_random_reads_ = true;
4185 env_->random_file_open_counter_.store(0);
4186 ASSERT_OK(Put("bar", "foo"));
4187 ASSERT_OK(Put("foo", "bar"));
4188 ASSERT_OK(Flush());
4189 if (prefetch_supported) {
4190 // After flush, we'll open the file and read footer, meta block,
4191 // property block and index block.
4192 ASSERT_EQ(4, env_->random_read_counter_.Read());
4193 } else {
4194 // With prefetch not supported, we will do a single read into a buffer
4195 ASSERT_EQ(1, env_->random_read_counter_.Read());
4196 }
4197 ASSERT_EQ(1, env_->random_file_open_counter_.load());
4198
4199 // One pread per normal data block read
4200 env_->random_file_open_counter_.store(0);
4201 env_->random_read_counter_.Reset();
4202 ASSERT_EQ("bar", Get("foo"));
4203 ASSERT_EQ(1, env_->random_read_counter_.Read());
4204 // All files are already opened.
4205 ASSERT_EQ(0, env_->random_file_open_counter_.load());
4206
4207 env_->random_file_open_counter_.store(0);
4208 env_->random_read_counter_.Reset();
4209 ASSERT_OK(Put("bar2", "foo2"));
4210 ASSERT_OK(Put("foo2", "bar2"));
4211 ASSERT_OK(Flush());
4212 if (prefetch_supported) {
4213 // After flush, we'll open the file and read footer, meta block,
4214 // property block and index block.
4215 ASSERT_EQ(4, env_->random_read_counter_.Read());
4216 } else {
4217 // With prefetch not supported, we will do a single read into a buffer
4218 ASSERT_EQ(1, env_->random_read_counter_.Read());
4219 }
4220 ASSERT_EQ(1, env_->random_file_open_counter_.load());
4221
4222 env_->random_file_open_counter_.store(0);
4223 env_->random_read_counter_.Reset();
4224 ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
4225 if (prefetch_supported) {
4226 // Compaction needs two input blocks, which requires 2 preads, and
4227 // generate a new SST file which needs 4 preads (footer, meta block,
4228 // property block and index block). In total 6.
4229 ASSERT_EQ(6, env_->random_read_counter_.Read());
4230 } else {
4231 // With prefetch off, compaction needs two input blocks,
4232 // followed by a single buffered read. In total 3.
4233 ASSERT_EQ(3, env_->random_read_counter_.Read());
4234 }
4235 // All compaction input files should have already been opened.
4236 ASSERT_EQ(1, env_->random_file_open_counter_.load());
4237
4238 // One pread per normal data block read
4239 env_->random_file_open_counter_.store(0);
4240 env_->random_read_counter_.Reset();
4241 ASSERT_EQ("foo2", Get("bar2"));
4242 ASSERT_EQ(1, env_->random_read_counter_.Read());
4243 // SST files are already opened.
4244 ASSERT_EQ(0, env_->random_file_open_counter_.load());
4245 }
4246
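// Aggregates the results produced while replaying a trace: it counts how many
// writes / gets / multigets / iterator seeks were executed and accumulates
// their latencies so the tests below can assert on the replayed workload.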
4247 class TraceExecutionResultHandler : public TraceRecordResult::Handler {
4248 public:
4249 TraceExecutionResultHandler() {}
4250 ~TraceExecutionResultHandler() override {}
4251
4252 virtual Status Handle(const StatusOnlyTraceExecutionResult& result) override {
4253 if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
4254 return Status::InvalidArgument("Invalid timestamps.");
4255 }
4256 result.GetStatus().PermitUncheckedError();
4257 switch (result.GetTraceType()) {
4258 case kTraceWrite: {
4259 total_latency_ += result.GetLatency();
4260 cnt_++;
4261 writes_++;
4262 break;
4263 }
4264 default:
4265 return Status::Corruption("Type mismatch.");
4266 }
4267 return Status::OK();
4268 }
4269
4270 virtual Status Handle(
4271 const SingleValueTraceExecutionResult& result) override {
4272 if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
4273 return Status::InvalidArgument("Invalid timestamps.");
4274 }
4275 result.GetStatus().PermitUncheckedError();
4276 switch (result.GetTraceType()) {
4277 case kTraceGet: {
4278 total_latency_ += result.GetLatency();
4279 cnt_++;
4280 gets_++;
4281 break;
4282 }
4283 default:
4284 return Status::Corruption("Type mismatch.");
4285 }
4286 return Status::OK();
4287 }
4288
4289 virtual Status Handle(
4290 const MultiValuesTraceExecutionResult& result) override {
4291 if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
4292 return Status::InvalidArgument("Invalid timestamps.");
4293 }
4294 for (const Status& s : result.GetMultiStatus()) {
4295 s.PermitUncheckedError();
4296 }
4297 switch (result.GetTraceType()) {
4298 case kTraceMultiGet: {
4299 total_latency_ += result.GetLatency();
4300 cnt_++;
4301 multigets_++;
4302 break;
4303 }
4304 default:
4305 return Status::Corruption("Type mismatch.");
4306 }
4307 return Status::OK();
4308 }
4309
4310 virtual Status Handle(const IteratorTraceExecutionResult& result) override {
4311 if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
4312 return Status::InvalidArgument("Invalid timestamps.");
4313 }
4314 result.GetStatus().PermitUncheckedError();
4315 switch (result.GetTraceType()) {
4316 case kTraceIteratorSeek:
4317 case kTraceIteratorSeekForPrev: {
4318 total_latency_ += result.GetLatency();
4319 cnt_++;
4320 seeks_++;
4321 break;
4322 }
4323 default:
4324 return Status::Corruption("Type mismatch.");
4325 }
4326 return Status::OK();
4327 }
4328
4329 void Reset() {
4330 total_latency_ = 0;
4331 cnt_ = 0;
4332 writes_ = 0;
4333 gets_ = 0;
4334 seeks_ = 0;
4335 multigets_ = 0;
4336 }
4337
4338 double GetAvgLatency() const {
4339 return cnt_ == 0 ? 0.0 : 1.0 * total_latency_ / cnt_;
4340 }
4341
4342 int GetNumWrites() const { return writes_; }
4343
4344 int GetNumGets() const { return gets_; }
4345
4346 int GetNumIterSeeks() const { return seeks_; }
4347
4348 int GetNumMultiGets() const { return multigets_; }
4349
4350 private:
4351 std::atomic<uint64_t> total_latency_{0};
4352 std::atomic<uint32_t> cnt_{0};
4353 std::atomic<int> writes_{0};
4354 std::atomic<int> gets_{0};
4355 std::atomic<int> seeks_{0};
4356 std::atomic<int> multigets_{0};
4357 };
4358
4359 TEST_F(DBTest2, TraceAndReplay) {
4360 Options options = CurrentOptions();
4361 options.merge_operator = MergeOperators::CreatePutOperator();
4362 ReadOptions ro;
4363 WriteOptions wo;
4364 TraceOptions trace_opts;
4365 EnvOptions env_opts;
4366 CreateAndReopenWithCF({"pikachu"}, options);
4367 Random rnd(301);
4368 Iterator* single_iter = nullptr;
4369
4370 ASSERT_TRUE(db_->EndTrace().IsIOError());
4371
4372 std::string trace_filename = dbname_ + "/rocksdb.trace";
4373 std::unique_ptr<TraceWriter> trace_writer;
4374 ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
4375 ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
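  // Everything issued between StartTrace() and EndTrace() is recorded into the
  // trace file through the TraceWriter; later a Replayer built from a
  // TraceReader re-executes those records against a second DB (Prepare()
  // rewinds the trace, Replay(ReplayOptions(threads, speed), callback) runs
  // it), which is how the counts asserted further down are produced.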
4376
4377 // 5 Writes
4378 ASSERT_OK(Put(0, "a", "1"));
4379 ASSERT_OK(Merge(0, "b", "2"));
4380 ASSERT_OK(Delete(0, "c"));
4381 ASSERT_OK(SingleDelete(0, "d"));
4382 ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
4383
4384 // 6th Write
4385 WriteBatch batch;
4386 ASSERT_OK(batch.Put("f", "11"));
4387 ASSERT_OK(batch.Merge("g", "12"));
4388 ASSERT_OK(batch.Delete("h"));
4389 ASSERT_OK(batch.SingleDelete("i"));
4390 ASSERT_OK(batch.DeleteRange("j", "k"));
4391 ASSERT_OK(db_->Write(wo, &batch));
4392
4393 // 2 Seek(ForPrev)s
4394 single_iter = db_->NewIterator(ro);
4395 single_iter->Seek("f"); // Seek 1
4396 single_iter->SeekForPrev("g");
4397 ASSERT_OK(single_iter->status());
4398 delete single_iter;
4399
4400 // 2 Gets
4401 ASSERT_EQ("1", Get(0, "a"));
4402 ASSERT_EQ("12", Get(0, "g"));
4403
4404 // 7th and 8th Write, 3rd Get
4405 ASSERT_OK(Put(1, "foo", "bar"));
4406 ASSERT_OK(Put(1, "rocksdb", "rocks"));
4407 ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
4408
4409 // Total Write x 8, Get x 3, Seek x 2.
4410 ASSERT_OK(db_->EndTrace());
4411 // These should not get into the trace file as it is after EndTrace.
4412 ASSERT_OK(Put("hello", "world"));
4413 ASSERT_OK(Merge("foo", "bar"));
4414
4415 // Open another db, replay, and verify the data
4416 std::string value;
4417 std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay");
4418 ASSERT_OK(DestroyDB(dbname2, options));
4419
4420 // Using a different name than db2, to pacify infer's use-after-lifetime
4421 // warnings (http://fbinfer.com).
4422 DB* db2_init = nullptr;
4423 options.create_if_missing = true;
4424 ASSERT_OK(DB::Open(options, dbname2, &db2_init));
4425 ColumnFamilyHandle* cf;
4426 ASSERT_OK(
4427 db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
4428 delete cf;
4429 delete db2_init;
4430
4431 DB* db2 = nullptr;
4432 std::vector<ColumnFamilyDescriptor> column_families;
4433 ColumnFamilyOptions cf_options;
4434 cf_options.merge_operator = MergeOperators::CreatePutOperator();
4435 column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
4436 column_families.push_back(
4437 ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
4438 std::vector<ColumnFamilyHandle*> handles;
4439 DBOptions db_opts;
4440 db_opts.env = env_;
4441 ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
4442
4443 env_->SleepForMicroseconds(100);
4444 // Verify that the keys don't already exist
4445 ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
4446 ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
4447
4448 std::unique_ptr<TraceReader> trace_reader;
4449 ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
4450 std::unique_ptr<Replayer> replayer;
4451 ASSERT_OK(
4452 db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
4453
4454 TraceExecutionResultHandler res_handler;
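// Callback invoked for every replayed record; successful results are
// forwarded to res_handler so their latencies and types can be aggregated.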
4455 std::function<void(Status, std::unique_ptr<TraceRecordResult> &&)> res_cb =
4456 [&res_handler](Status exec_s, std::unique_ptr<TraceRecordResult>&& res) {
4457 ASSERT_TRUE(exec_s.ok() || exec_s.IsNotSupported());
4458 if (res != nullptr) {
4459 ASSERT_OK(res->Accept(&res_handler));
4460 res.reset();
4461 }
4462 };
4463
4464 // Unprepared replay should fail with Status::Incomplete()
4465 ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete());
4466 ASSERT_OK(replayer->Prepare());
4467 // Ok to repeatedly Prepare().
4468 ASSERT_OK(replayer->Prepare());
4469 // Replay using 1 thread, 1x speed.
4470 ASSERT_OK(replayer->Replay(ReplayOptions(1, 1.0), res_cb));
4471 ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
4472 ASSERT_EQ(res_handler.GetNumWrites(), 8);
4473 ASSERT_EQ(res_handler.GetNumGets(), 3);
4474 ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
4475 ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
4476 res_handler.Reset();
4477
4478 ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
4479 ASSERT_EQ("1", value);
4480 ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
4481 ASSERT_EQ("12", value);
4482 ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
4483 ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
4484
4485 ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
4486 ASSERT_EQ("bar", value);
4487 ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
4488 ASSERT_EQ("rocks", value);
4489
4490 // Re-replay should fail with Status::Incomplete() if Prepare() was not
4491 // called. Currently we don't distinguish between unprepared and trace end.
4492 ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete());
4493
4494 // Re-replay using 2 threads, 2x speed.
4495 ASSERT_OK(replayer->Prepare());
4496 ASSERT_OK(replayer->Replay(ReplayOptions(2, 2.0), res_cb));
4497 ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
4498 ASSERT_EQ(res_handler.GetNumWrites(), 8);
4499 ASSERT_EQ(res_handler.GetNumGets(), 3);
4500 ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
4501 ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
4502 res_handler.Reset();
4503
4504 // Re-replay using 2 threads, 1/2 speed.
4505 ASSERT_OK(replayer->Prepare());
4506 ASSERT_OK(replayer->Replay(ReplayOptions(2, 0.5), res_cb));
4507 ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
4508 ASSERT_EQ(res_handler.GetNumWrites(), 8);
4509 ASSERT_EQ(res_handler.GetNumGets(), 3);
4510 ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
4511 ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
4512 res_handler.Reset();
4513
4514 replayer.reset();
4515
4516 for (auto handle : handles) {
4517 delete handle;
4518 }
4519 delete db2;
4520 ASSERT_OK(DestroyDB(dbname2, options));
4521 }
4522
4523 TEST_F(DBTest2, TraceAndManualReplay) {
4524 Options options = CurrentOptions();
4525 options.merge_operator = MergeOperators::CreatePutOperator();
4526 ReadOptions ro;
4527 WriteOptions wo;
4528 TraceOptions trace_opts;
4529 EnvOptions env_opts;
4530 CreateAndReopenWithCF({"pikachu"}, options);
4531 Random rnd(301);
4532 Iterator* single_iter = nullptr;
4533
4534 ASSERT_TRUE(db_->EndTrace().IsIOError());
4535
4536 std::string trace_filename = dbname_ + "/rocksdb.trace";
4537 std::unique_ptr<TraceWriter> trace_writer;
4538 ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
4539 ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
4540
4541 ASSERT_OK(Put(0, "a", "1"));
4542 ASSERT_OK(Merge(0, "b", "2"));
4543 ASSERT_OK(Delete(0, "c"));
4544 ASSERT_OK(SingleDelete(0, "d"));
4545 ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
4546
4547 WriteBatch batch;
4548 ASSERT_OK(batch.Put("f", "11"));
4549 ASSERT_OK(batch.Merge("g", "12"));
4550 ASSERT_OK(batch.Delete("h"));
4551 ASSERT_OK(batch.SingleDelete("i"));
4552 ASSERT_OK(batch.DeleteRange("j", "k"));
4553 ASSERT_OK(db_->Write(wo, &batch));
4554
4555 single_iter = db_->NewIterator(ro);
4556 single_iter->Seek("f");
4557 single_iter->SeekForPrev("g");
4558 ASSERT_OK(single_iter->status());
4559 delete single_iter;
4560
4561 // Write some sequenced keys for testing lower/upper bounds of iterator.
4562 batch.Clear();
4563 ASSERT_OK(batch.Put("iter-0", "iter-0"));
4564 ASSERT_OK(batch.Put("iter-1", "iter-1"));
4565 ASSERT_OK(batch.Put("iter-2", "iter-2"));
4566 ASSERT_OK(batch.Put("iter-3", "iter-3"));
4567 ASSERT_OK(batch.Put("iter-4", "iter-4"));
4568 ASSERT_OK(db_->Write(wo, &batch));
4569
4570 ReadOptions bounded_ro = ro;
4571 Slice lower_bound("iter-1");
4572 Slice upper_bound("iter-3");
4573 bounded_ro.iterate_lower_bound = &lower_bound;
4574 bounded_ro.iterate_upper_bound = &upper_bound;
4575 single_iter = db_->NewIterator(bounded_ro);
4576 single_iter->Seek("iter-0");
4577 ASSERT_EQ(single_iter->key().ToString(), "iter-1");
4578 single_iter->Seek("iter-2");
4579 ASSERT_EQ(single_iter->key().ToString(), "iter-2");
4580 single_iter->Seek("iter-4");
4581 ASSERT_FALSE(single_iter->Valid());
4582 single_iter->SeekForPrev("iter-0");
4583 ASSERT_FALSE(single_iter->Valid());
4584 single_iter->SeekForPrev("iter-2");
4585 ASSERT_EQ(single_iter->key().ToString(), "iter-2");
4586 single_iter->SeekForPrev("iter-4");
4587 ASSERT_EQ(single_iter->key().ToString(), "iter-2");
4588 ASSERT_OK(single_iter->status());
4589 delete single_iter;
4590
4591 ASSERT_EQ("1", Get(0, "a"));
4592 ASSERT_EQ("12", Get(0, "g"));
4593
4594 ASSERT_OK(Put(1, "foo", "bar"));
4595 ASSERT_OK(Put(1, "rocksdb", "rocks"));
4596 ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
4597
4598 // Same as TraceAndReplay, Write x 8, Get x 3, Seek x 2.
4599 // Plus 1 WriteBatch for iterator with lower/upper bounds, and 6
4600 // Seek(ForPrev)s.
4601 // Total Write x 9, Get x 3, Seek x 8
4602 ASSERT_OK(db_->EndTrace());
4603 // These should not get into the trace file as it is after EndTrace.
4604 ASSERT_OK(Put("hello", "world"));
4605 ASSERT_OK(Merge("foo", "bar"));
4606
4607 // Open another db, replay, and verify the data
4608 std::string value;
4609 std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay");
4610 ASSERT_OK(DestroyDB(dbname2, options));
4611
4612 // Using a different name than db2, to pacify infer's use-after-lifetime
4613 // warnings (http://fbinfer.com).
4614 DB* db2_init = nullptr;
4615 options.create_if_missing = true;
4616 ASSERT_OK(DB::Open(options, dbname2, &db2_init));
4617 ColumnFamilyHandle* cf;
4618 ASSERT_OK(
4619 db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
4620 delete cf;
4621 delete db2_init;
4622
4623 DB* db2 = nullptr;
4624 std::vector<ColumnFamilyDescriptor> column_families;
4625 ColumnFamilyOptions cf_options;
4626 cf_options.merge_operator = MergeOperators::CreatePutOperator();
4627 column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
4628 column_families.push_back(
4629 ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
4630 std::vector<ColumnFamilyHandle*> handles;
4631 DBOptions db_opts;
4632 db_opts.env = env_;
4633 ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
4634
4635 env_->SleepForMicroseconds(100);
4636 // Verify that the keys don't already exist
4637 ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
4638 ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
4639
4640 std::unique_ptr<TraceReader> trace_reader;
4641 ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
4642 std::unique_ptr<Replayer> replayer;
4643 ASSERT_OK(
4644 db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
4645
4646 TraceExecutionResultHandler res_handler;
4647
4648 // Manually replay twice; the second pass checks that the replay can restart.
4649 std::unique_ptr<TraceRecord> record;
4650 std::unique_ptr<TraceRecordResult> result;
4651 for (int i = 0; i < 2; i++) {
4652 // Next should fail if unprepared.
4653 ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete());
4654 ASSERT_OK(replayer->Prepare());
4655 Status s = Status::OK();
4656 // Looping until trace end.
4657 while (s.ok()) {
4658 s = replayer->Next(&record);
4659 // Skip unsupported operations.
4660 if (s.IsNotSupported()) {
4661 continue;
4662 }
4663 if (s.ok()) {
4664 ASSERT_OK(replayer->Execute(record, &result));
4665 if (result != nullptr) {
4666 ASSERT_OK(result->Accept(&res_handler));
4667 if (record->GetTraceType() == kTraceIteratorSeek ||
4668 record->GetTraceType() == kTraceIteratorSeekForPrev) {
4669 IteratorSeekQueryTraceRecord* iter_rec =
4670 dynamic_cast<IteratorSeekQueryTraceRecord*>(record.get());
4671 IteratorTraceExecutionResult* iter_res =
4672 dynamic_cast<IteratorTraceExecutionResult*>(result.get());
4673 // Check if lower/upper bounds are correctly saved and decoded.
4674 std::string lower_str = iter_rec->GetLowerBound().ToString();
4675 std::string upper_str = iter_rec->GetUpperBound().ToString();
4676 std::string iter_key = iter_res->GetKey().ToString();
4677 std::string iter_value = iter_res->GetValue().ToString();
4678 if (!lower_str.empty() && !upper_str.empty()) {
4679 ASSERT_EQ(lower_str, "iter-1");
4680 ASSERT_EQ(upper_str, "iter-3");
4681 if (iter_res->GetValid()) {
4682 // If iterator is valid, then lower_bound <= key < upper_bound.
4683 ASSERT_GE(iter_key, lower_str);
4684 ASSERT_LT(iter_key, upper_str);
4685 } else {
4686 // If iterator is invalid, then
4687 // key < lower_bound or key >= upper_bound.
4688 ASSERT_TRUE(iter_key < lower_str || iter_key >= upper_str);
4689 }
4690 }
4691 // If iterator is invalid, the key and value should be empty.
4692 if (!iter_res->GetValid()) {
4693 ASSERT_TRUE(iter_key.empty());
4694 ASSERT_TRUE(iter_value.empty());
4695 }
4696 }
4697 result.reset();
4698 }
4699 }
4700 }
4701 // Status::Incomplete() will be returned when manually reading the trace
4702 // end, or Prepare() was not called.
4703 ASSERT_TRUE(s.IsIncomplete());
4704 ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete());
4705 ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
4706 ASSERT_EQ(res_handler.GetNumWrites(), 9);
4707 ASSERT_EQ(res_handler.GetNumGets(), 3);
4708 ASSERT_EQ(res_handler.GetNumIterSeeks(), 8);
4709 ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
4710 res_handler.Reset();
4711 }
4712
4713 ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
4714 ASSERT_EQ("1", value);
4715 ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
4716 ASSERT_EQ("12", value);
4717 ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
4718 ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
4719
4720 ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
4721 ASSERT_EQ("bar", value);
4722 ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
4723 ASSERT_EQ("rocks", value);
4724
4725 // Test execution of artificially created TraceRecords.
4726 uint64_t fake_ts = 1U;
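// fake_ts is only a placeholder, monotonically increasing timestamp for the
// hand-built records below.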
4727 // Write
4728 batch.Clear();
4729 ASSERT_OK(batch.Put("trace-record-write1", "write1"));
4730 ASSERT_OK(batch.Put("trace-record-write2", "write2"));
4731 record.reset(new WriteQueryTraceRecord(batch.Data(), fake_ts++));
4732 ASSERT_OK(replayer->Execute(record, &result));
4733 ASSERT_TRUE(result != nullptr);
4734 ASSERT_OK(result->Accept(&res_handler)); // Write x 1
4735 ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write1", &value));
4736 ASSERT_EQ("write1", value);
4737 ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write2", &value));
4738 ASSERT_EQ("write2", value);
4739 ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
4740 ASSERT_EQ(res_handler.GetNumWrites(), 1);
4741 ASSERT_EQ(res_handler.GetNumGets(), 0);
4742 ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
4743 ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
4744 res_handler.Reset();
4745
4746 // Get related
4747 // Get an existing key.
4748 record.reset(new GetQueryTraceRecord(handles[0]->GetID(),
4749 "trace-record-write1", fake_ts++));
4750 ASSERT_OK(replayer->Execute(record, &result));
4751 ASSERT_TRUE(result != nullptr);
4752 ASSERT_OK(result->Accept(&res_handler)); // Get x 1
4753 // Get a non-existing key; this should still return Status::OK().
4754 record.reset(new GetQueryTraceRecord(handles[0]->GetID(), "trace-record-get",
4755 fake_ts++));
4756 ASSERT_OK(replayer->Execute(record, &result));
4757 ASSERT_TRUE(result != nullptr);
4758 ASSERT_OK(result->Accept(&res_handler)); // Get x 2
4759 // Get from an invalid (non-existing) cf_id.
4760 uint32_t invalid_cf_id = handles[1]->GetID() + 1;
4761 record.reset(new GetQueryTraceRecord(invalid_cf_id, "whatever", fake_ts++));
4762 ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
4763 ASSERT_TRUE(result == nullptr);
4764 ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
4765 ASSERT_EQ(res_handler.GetNumWrites(), 0);
4766 ASSERT_EQ(res_handler.GetNumGets(), 2);
4767 ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
4768 ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
4769 res_handler.Reset();
4770
4771 // Iteration related
4772 for (IteratorSeekQueryTraceRecord::SeekType seekType :
4773 {IteratorSeekQueryTraceRecord::kSeek,
4774 IteratorSeekQueryTraceRecord::kSeekForPrev}) {
4775 // Seek to an existing key.
4776 record.reset(new IteratorSeekQueryTraceRecord(
4777 seekType, handles[0]->GetID(), "trace-record-write1", fake_ts++));
4778 ASSERT_OK(replayer->Execute(record, &result));
4779 ASSERT_TRUE(result != nullptr);
4780 ASSERT_OK(result->Accept(&res_handler)); // Seek x 1 in one iteration
4781 // Seek to a non-existing key; this should still return Status::OK().
4782 record.reset(new IteratorSeekQueryTraceRecord(
4783 seekType, handles[0]->GetID(), "trace-record-get", fake_ts++));
4784 ASSERT_OK(replayer->Execute(record, &result));
4785 ASSERT_TRUE(result != nullptr);
4786 ASSERT_OK(result->Accept(&res_handler)); // Seek x 2 in one iteration
4787 // Seek from an invalid cf_id.
4788 record.reset(new IteratorSeekQueryTraceRecord(seekType, invalid_cf_id,
4789 "whatever", fake_ts++));
4790 ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
4791 ASSERT_TRUE(result == nullptr);
4792 }
4793 ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
4794 ASSERT_EQ(res_handler.GetNumWrites(), 0);
4795 ASSERT_EQ(res_handler.GetNumGets(), 0);
4796 ASSERT_EQ(res_handler.GetNumIterSeeks(), 4); // Seek x 2 in two iterations
4797 ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
4798 res_handler.Reset();
4799
4800 // MultiGet related
4801 // Get existing keys.
4802 record.reset(new MultiGetQueryTraceRecord(
4803 std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
4804 std::vector<std::string>({"a", "foo"}), fake_ts++));
4805 ASSERT_OK(replayer->Execute(record, &result));
4806 ASSERT_TRUE(result != nullptr);
4807 ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 1
4808 // Get all non-existing keys; this should still return Status::OK().
4809 record.reset(new MultiGetQueryTraceRecord(
4810 std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
4811 std::vector<std::string>({"no1", "no2"}), fake_ts++));
4812 ASSERT_OK(replayer->Execute(record, &result));
4813 ASSERT_TRUE(result != nullptr);
4814 ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 2
4815 // Get a mix of existing and non-existing keys; this should still return
4816 // Status::OK().
4817 record.reset(new MultiGetQueryTraceRecord(
4818 std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
4819 std::vector<std::string>({"a", "no2"}), fake_ts++));
4820 ASSERT_OK(replayer->Execute(record, &result));
4821 ASSERT_TRUE(result != nullptr);
4822 MultiValuesTraceExecutionResult* mvr =
4823 dynamic_cast<MultiValuesTraceExecutionResult*>(result.get());
4824 ASSERT_TRUE(mvr != nullptr);
4825 ASSERT_OK(mvr->GetMultiStatus()[0]);
4826 ASSERT_TRUE(mvr->GetMultiStatus()[1].IsNotFound());
4827 ASSERT_EQ(mvr->GetValues()[0], "1");
4828 ASSERT_EQ(mvr->GetValues()[1], "");
4829 ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 3
4830 // Get from an invalid (non-existing) cf_id.
4831 record.reset(new MultiGetQueryTraceRecord(
4832 std::vector<uint32_t>(
4833 {handles[0]->GetID(), handles[1]->GetID(), invalid_cf_id}),
4834 std::vector<std::string>({"a", "foo", "whatever"}), fake_ts++));
4835 ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
4836 ASSERT_TRUE(result == nullptr);
4837 // Empty MultiGet
4838 record.reset(new MultiGetQueryTraceRecord(
4839 std::vector<uint32_t>(), std::vector<std::string>(), fake_ts++));
4840 ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument());
4841 ASSERT_TRUE(result == nullptr);
4842 // MultiGet size mismatch
4843 record.reset(new MultiGetQueryTraceRecord(
4844 std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
4845 std::vector<std::string>({"a"}), fake_ts++));
4846 ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument());
4847 ASSERT_TRUE(result == nullptr);
4848 ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
4849 ASSERT_EQ(res_handler.GetNumWrites(), 0);
4850 ASSERT_EQ(res_handler.GetNumGets(), 0);
4851 ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
4852 ASSERT_EQ(res_handler.GetNumMultiGets(), 3);
4853 res_handler.Reset();
4854
4855 replayer.reset();
4856
4857 for (auto handle : handles) {
4858 delete handle;
4859 }
4860 delete db2;
4861 ASSERT_OK(DestroyDB(dbname2, options));
4862 }
4863
4864 TEST_F(DBTest2, TraceWithLimit) {
4865 Options options = CurrentOptions();
4866 options.merge_operator = MergeOperators::CreatePutOperator();
4867 ReadOptions ro;
4868 WriteOptions wo;
4869 TraceOptions trace_opts;
4870 EnvOptions env_opts;
4871 CreateAndReopenWithCF({"pikachu"}, options);
4872 Random rnd(301);
4873
4874 // test the max trace file size options
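// With a 5-byte limit, none of the following writes fit into the trace, so
// replaying the trace below should not bring back any of the keys.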
4875 trace_opts.max_trace_file_size = 5;
4876 std::string trace_filename = dbname_ + "/rocksdb.trace1";
4877 std::unique_ptr<TraceWriter> trace_writer;
4878 ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
4879 ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
4880 ASSERT_OK(Put(0, "a", "1"));
4881 ASSERT_OK(Put(0, "b", "1"));
4882 ASSERT_OK(Put(0, "c", "1"));
4883 ASSERT_OK(db_->EndTrace());
4884
4885 std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay2");
4886 std::string value;
4887 ASSERT_OK(DestroyDB(dbname2, options));
4888
4889 // Using a different name than db2, to pacify infer's use-after-lifetime
4890 // warnings (http://fbinfer.com).
4891 DB* db2_init = nullptr;
4892 options.create_if_missing = true;
4893 ASSERT_OK(DB::Open(options, dbname2, &db2_init));
4894 ColumnFamilyHandle* cf;
4895 ASSERT_OK(
4896 db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
4897 delete cf;
4898 delete db2_init;
4899
4900 DB* db2 = nullptr;
4901 std::vector<ColumnFamilyDescriptor> column_families;
4902 ColumnFamilyOptions cf_options;
4903 cf_options.merge_operator = MergeOperators::CreatePutOperator();
4904 column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
4905 column_families.push_back(
4906 ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
4907 std::vector<ColumnFamilyHandle*> handles;
4908 DBOptions db_opts;
4909 db_opts.env = env_;
4910 ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
4911
4912 env_->SleepForMicroseconds(100);
4913 // Verify that the keys don't already exist
4914 ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
4915 ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
4916 ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
4917
4918 std::unique_ptr<TraceReader> trace_reader;
4919 ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
4920 std::unique_ptr<Replayer> replayer;
4921 ASSERT_OK(
4922 db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
4923 ASSERT_OK(replayer->Prepare());
4924 ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
4925 replayer.reset();
4926
4927 ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
4928 ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
4929 ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
4930
4931 for (auto handle : handles) {
4932 delete handle;
4933 }
4934 delete db2;
4935 ASSERT_OK(DestroyDB(dbname2, options));
4936 }
4937
4938 TEST_F(DBTest2, TraceWithSampling) {
4939 Options options = CurrentOptions();
4940 ReadOptions ro;
4941 WriteOptions wo;
4942 TraceOptions trace_opts;
4943 EnvOptions env_opts;
4944 CreateAndReopenWithCF({"pikachu"}, options);
4945 Random rnd(301);
4946
4947 // test the trace file sampling options
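// With sampling_frequency = 2, only every other operation is recorded; the
// assertions after the replay below reflect which writes were sampled.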
4948 trace_opts.sampling_frequency = 2;
4949 std::string trace_filename = dbname_ + "/rocksdb.trace_sampling";
4950 std::unique_ptr<TraceWriter> trace_writer;
4951 ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
4952 ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
4953 ASSERT_OK(Put(0, "a", "1"));
4954 ASSERT_OK(Put(0, "b", "2"));
4955 ASSERT_OK(Put(0, "c", "3"));
4956 ASSERT_OK(Put(0, "d", "4"));
4957 ASSERT_OK(Put(0, "e", "5"));
4958 ASSERT_OK(db_->EndTrace());
4959
4960 std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay_sampling");
4961 std::string value;
4962 ASSERT_OK(DestroyDB(dbname2, options));
4963
4964 // Using a different name than db2, to pacify infer's use-after-lifetime
4965 // warnings (http://fbinfer.com).
4966 DB* db2_init = nullptr;
4967 options.create_if_missing = true;
4968 ASSERT_OK(DB::Open(options, dbname2, &db2_init));
4969 ColumnFamilyHandle* cf;
4970 ASSERT_OK(
4971 db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
4972 delete cf;
4973 delete db2_init;
4974
4975 DB* db2 = nullptr;
4976 std::vector<ColumnFamilyDescriptor> column_families;
4977 ColumnFamilyOptions cf_options;
4978 column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
4979 column_families.push_back(
4980 ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
4981 std::vector<ColumnFamilyHandle*> handles;
4982 DBOptions db_opts;
4983 db_opts.env = env_;
4984 ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
4985
4986 env_->SleepForMicroseconds(100);
4987 ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
4988 ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
4989 ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
4990 ASSERT_TRUE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
4991 ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
4992
4993 std::unique_ptr<TraceReader> trace_reader;
4994 ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
4995 std::unique_ptr<Replayer> replayer;
4996 ASSERT_OK(
4997 db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
4998 ASSERT_OK(replayer->Prepare());
4999 ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
5000 replayer.reset();
5001
5002 ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
5003 ASSERT_FALSE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
5004 ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
5005 ASSERT_FALSE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
5006 ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
5007
5008 for (auto handle : handles) {
5009 delete handle;
5010 }
5011 delete db2;
5012 ASSERT_OK(DestroyDB(dbname2, options));
5013 }
5014
5015 TEST_F(DBTest2, TraceWithFilter) {
5016 Options options = CurrentOptions();
5017 options.merge_operator = MergeOperators::CreatePutOperator();
5018 ReadOptions ro;
5019 WriteOptions wo;
5020 TraceOptions trace_opts;
5021 EnvOptions env_opts;
5022 CreateAndReopenWithCF({"pikachu"}, options);
5023 Random rnd(301);
5024 Iterator* single_iter = nullptr;
5025
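// Exclude all WRITE ops from the trace; replaying it later therefore cannot
// recover any of the written data.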
5026 trace_opts.filter = TraceFilterType::kTraceFilterWrite;
5027
5028 std::string trace_filename = dbname_ + "/rocksdb.trace";
5029 std::unique_ptr<TraceWriter> trace_writer;
5030 ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
5031 ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
5032
5033 ASSERT_OK(Put(0, "a", "1"));
5034 ASSERT_OK(Merge(0, "b", "2"));
5035 ASSERT_OK(Delete(0, "c"));
5036 ASSERT_OK(SingleDelete(0, "d"));
5037 ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
5038
5039 WriteBatch batch;
5040 ASSERT_OK(batch.Put("f", "11"));
5041 ASSERT_OK(batch.Merge("g", "12"));
5042 ASSERT_OK(batch.Delete("h"));
5043 ASSERT_OK(batch.SingleDelete("i"));
5044 ASSERT_OK(batch.DeleteRange("j", "k"));
5045 ASSERT_OK(db_->Write(wo, &batch));
5046
5047 single_iter = db_->NewIterator(ro);
5048 single_iter->Seek("f");
5049 single_iter->SeekForPrev("g");
5050 delete single_iter;
5051
5052 ASSERT_EQ("1", Get(0, "a"));
5053 ASSERT_EQ("12", Get(0, "g"));
5054
5055 ASSERT_OK(Put(1, "foo", "bar"));
5056 ASSERT_OK(Put(1, "rocksdb", "rocks"));
5057 ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
5058
5059 ASSERT_OK(db_->EndTrace());
5060 // These should not get into the trace file as it is after EndTrace.
5061 ASSERT_OK(Put("hello", "world"));
5062 ASSERT_OK(Merge("foo", "bar"));
5063
5064 // Open another db, replay, and verify the data
5065 std::string value;
5066 std::string dbname2 = test::PerThreadDBPath(env_, "db_replay");
5067 ASSERT_OK(DestroyDB(dbname2, options));
5068
5069 // Using a different name than db2, to pacify infer's use-after-lifetime
5070 // warnings (http://fbinfer.com).
5071 DB* db2_init = nullptr;
5072 options.create_if_missing = true;
5073 ASSERT_OK(DB::Open(options, dbname2, &db2_init));
5074 ColumnFamilyHandle* cf;
5075 ASSERT_OK(
5076 db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
5077 delete cf;
5078 delete db2_init;
5079
5080 DB* db2 = nullptr;
5081 std::vector<ColumnFamilyDescriptor> column_families;
5082 ColumnFamilyOptions cf_options;
5083 cf_options.merge_operator = MergeOperators::CreatePutOperator();
5084 column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
5085 column_families.push_back(
5086 ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
5087 std::vector<ColumnFamilyHandle*> handles;
5088 DBOptions db_opts;
5089 db_opts.env = env_;
5090 ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
5091
5092 env_->SleepForMicroseconds(100);
5093 // Verify that the keys don't already exist
5094 ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
5095 ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
5096
5097 std::unique_ptr<TraceReader> trace_reader;
5098 ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
5099 std::unique_ptr<Replayer> replayer;
5100 ASSERT_OK(
5101 db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
5102 ASSERT_OK(replayer->Prepare());
5103 ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
5104 replayer.reset();
5105
5106 // None of the key-values should be present since we filtered out the WRITE ops.
5107 ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
5108 ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
5109 ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
5110 ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
5111 ASSERT_TRUE(db2->Get(ro, handles[0], "foo", &value).IsNotFound());
5112 ASSERT_TRUE(db2->Get(ro, handles[0], "rocksdb", &value).IsNotFound());
5113
5114 for (auto handle : handles) {
5115 delete handle;
5116 }
5117 delete db2;
5118 ASSERT_OK(DestroyDB(dbname2, options));
5119
5120 // Set up a new db.
5121 std::string dbname3 = test::PerThreadDBPath(env_, "db_not_trace_read");
5122 ASSERT_OK(DestroyDB(dbname3, options));
5123
5124 DB* db3_init = nullptr;
5125 options.create_if_missing = true;
5126 ColumnFamilyHandle* cf3;
5127 ASSERT_OK(DB::Open(options, dbname3, &db3_init));
5128 ASSERT_OK(
5129 db3_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf3));
5130 delete cf3;
5131 delete db3_init;
5132
5133 column_families.clear();
5134 column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
5135 column_families.push_back(
5136 ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
5137 handles.clear();
5138
5139 DB* db3 = nullptr;
5140 ASSERT_OK(DB::Open(db_opts, dbname3, column_families, &handles, &db3));
5141
5142 env_->SleepForMicroseconds(100);
5143 // Verify that the keys don't already exist
5144 ASSERT_TRUE(db3->Get(ro, handles[0], "a", &value).IsNotFound());
5145 ASSERT_TRUE(db3->Get(ro, handles[0], "g", &value).IsNotFound());
5146
5147 // The tracer will not record the READ ops.
5148 trace_opts.filter = TraceFilterType::kTraceFilterGet;
5149 std::string trace_filename3 = dbname_ + "/rocksdb.trace_3";
5150 std::unique_ptr<TraceWriter> trace_writer3;
5151 ASSERT_OK(
5152 NewFileTraceWriter(env_, env_opts, trace_filename3, &trace_writer3));
5153 ASSERT_OK(db3->StartTrace(trace_opts, std::move(trace_writer3)));
5154
5155 ASSERT_OK(db3->Put(wo, handles[0], "a", "1"));
5156 ASSERT_OK(db3->Merge(wo, handles[0], "b", "2"));
5157 ASSERT_OK(db3->Delete(wo, handles[0], "c"));
5158 ASSERT_OK(db3->SingleDelete(wo, handles[0], "d"));
5159
5160 ASSERT_OK(db3->Get(ro, handles[0], "a", &value));
5161 ASSERT_EQ(value, "1");
5162 ASSERT_TRUE(db3->Get(ro, handles[0], "c", &value).IsNotFound());
5163
5164 ASSERT_OK(db3->EndTrace());
5165
5166 for (auto handle : handles) {
5167 delete handle;
5168 }
5169 delete db3;
5170 ASSERT_OK(DestroyDB(dbname3, options));
5171
5172 std::unique_ptr<TraceReader> trace_reader3;
5173 ASSERT_OK(
5174 NewFileTraceReader(env_, env_opts, trace_filename3, &trace_reader3));
5175
5176 // Count the number of records in the trace file.
5177 int count = 0;
5178 std::string data;
5179 Status s;
5180 while (true) {
5181 s = trace_reader3->Read(&data);
5182 if (!s.ok()) {
5183 break;
5184 }
5185 count += 1;
5186 }
5187 // We also need to count the header and footer
5188 // 4 WRITE + HEADER + FOOTER = 6
5189 ASSERT_EQ(count, 6);
5190 }
5191
5192 #endif // ROCKSDB_LITE
5193
5194 TEST_F(DBTest2, PinnableSliceAndMmapReads) {
5195 Options options = CurrentOptions();
5196 options.env = env_;
5197 if (!IsMemoryMappedAccessSupported()) {
5198 ROCKSDB_GTEST_SKIP("Test requires default environment");
5199 return;
5200 }
5201 options.allow_mmap_reads = true;
5202 options.max_open_files = 100;
5203 options.compression = kNoCompression;
5204 Reopen(options);
5205
5206 ASSERT_OK(Put("foo", "bar"));
5207 ASSERT_OK(Flush());
5208
5209 PinnableSlice pinned_value;
5210 ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
5211 // It is not safe to pin mmap files as they might be removed by compaction
5212 ASSERT_FALSE(pinned_value.IsPinned());
5213 ASSERT_EQ(pinned_value.ToString(), "bar");
5214
5215 ASSERT_OK(dbfull()->TEST_CompactRange(
5216 0 /* level */, nullptr /* begin */, nullptr /* end */,
5217 nullptr /* column_family */, true /* disallow_trivial_move */));
5218
5219 // Ensure pinned_value doesn't rely on memory munmap'd by the above
5220 // compaction. It crashes if it does.
5221 ASSERT_EQ(pinned_value.ToString(), "bar");
5222
5223 #ifndef ROCKSDB_LITE
5224 pinned_value.Reset();
5225 // Unsafe to pin mmap files when they could be kicked out of table cache
5226 Close();
5227 ASSERT_OK(ReadOnlyReopen(options));
5228 ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
5229 ASSERT_FALSE(pinned_value.IsPinned());
5230 ASSERT_EQ(pinned_value.ToString(), "bar");
5231
5232 pinned_value.Reset();
5233 // In read-only mode with infinite capacity on table cache it should pin the
5234 // value and avoid the memcpy
5235 Close();
5236 options.max_open_files = -1;
5237 ASSERT_OK(ReadOnlyReopen(options));
5238 ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
5239 ASSERT_TRUE(pinned_value.IsPinned());
5240 ASSERT_EQ(pinned_value.ToString(), "bar");
5241 #endif
5242 }
5243
5244 TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) {
5245 Options options = CurrentOptions();
5246 options.create_if_missing = true;
5247 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
5248 BlockBasedTableOptions bbto;
5249 bbto.no_block_cache = false;
5250 bbto.cache_index_and_filter_blocks = false;
5251 bbto.block_cache = NewLRUCache(100000);
5252 bbto.block_size = 400; // small block size
5253 options.table_factory.reset(NewBlockBasedTableFactory(bbto));
5254 Reopen(options);
5255
5256 Random rnd(301);
5257 std::string v = rnd.RandomString(400);
5258
5259 // Since v is the size of a block, each key should take a block
5260 // of 400+ bytes.
5261 ASSERT_OK(Put("1", v));
5262 ASSERT_OK(Put("3", v));
5263 ASSERT_OK(Put("5", v));
5264 ASSERT_OK(Put("7", v));
5265 ASSERT_OK(Flush());
5266
5267 ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage());
5268
5269 // Verify that iterators don't pin more than one data block in block cache
5270 // at a time.
5271 {
5272 std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
5273 iter->SeekToFirst();
5274
5275 for (int i = 0; i < 4; i++) {
5276 ASSERT_TRUE(iter->Valid());
5277 // Block cache should contain exactly one block.
5278 ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
5279 ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
5280 iter->Next();
5281 }
5282 ASSERT_FALSE(iter->Valid());
5283
5284 iter->Seek("4");
5285 ASSERT_TRUE(iter->Valid());
5286
5287 ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
5288 ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
5289
5290 iter->Seek("3");
5291 ASSERT_TRUE(iter->Valid());
5292
5293 ASSERT_OK(iter->status());
5294
5295 ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
5296 ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
5297 }
5298 ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage());
5299
5300 // Test compaction case
5301 ASSERT_OK(Put("2", v));
5302 ASSERT_OK(Put("5", v));
5303 ASSERT_OK(Put("6", v));
5304 ASSERT_OK(Put("8", v));
5305 ASSERT_OK(Flush());
5306
5307 // Clear existing data in block cache
5308 bbto.block_cache->SetCapacity(0);
5309 bbto.block_cache->SetCapacity(100000);
5310
5311 // Verify compaction input iterators don't hold more than one data block at
5312 // a time.
5313 std::atomic<bool> finished(false);
5314 std::atomic<int> block_newed(0);
5315 std::atomic<int> block_destroyed(0);
5316 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5317 "Block::Block:0", [&](void* /*arg*/) {
5318 if (finished) {
5319 return;
5320 }
5321 // Two iterators. At most 2 outstanding blocks.
5322 EXPECT_GE(block_newed.load(), block_destroyed.load());
5323 EXPECT_LE(block_newed.load(), block_destroyed.load() + 1);
5324 block_newed.fetch_add(1);
5325 });
5326 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5327 "Block::~Block", [&](void* /*arg*/) {
5328 if (finished) {
5329 return;
5330 }
5331 // Two iterators. At most 2 outstanding blocks.
5332 EXPECT_GE(block_newed.load(), block_destroyed.load() + 1);
5333 EXPECT_LE(block_newed.load(), block_destroyed.load() + 2);
5334 block_destroyed.fetch_add(1);
5335 });
5336 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5337 "CompactionJob::Run:BeforeVerify",
5338 [&](void* /*arg*/) { finished = true; });
5339 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
5340
5341 ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
5342
5343 // Two input files. Each of them has 4 data blocks.
5344 ASSERT_EQ(8, block_newed.load());
5345 ASSERT_EQ(8, block_destroyed.load());
5346
5347 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
5348 }
5349
5350 TEST_F(DBTest2, TestBBTTailPrefetch) {
5351 std::atomic<bool> called(false);
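// The first table opens have no tail prefetch history, so a full 512KB tail
// prefetch is expected; subsequent opens are expected to use much smaller
// sizes learned from the earlier files.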
5352 size_t expected_lower_bound = 512 * 1024;
5353 size_t expected_higher_bound = 512 * 1024;
5354 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5355 "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
5356 size_t* prefetch_size = static_cast<size_t*>(arg);
5357 EXPECT_LE(expected_lower_bound, *prefetch_size);
5358 EXPECT_GE(expected_higher_bound, *prefetch_size);
5359 called = true;
5360 });
5361 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
5362
5363 ASSERT_OK(Put("1", "1"));
5364 ASSERT_OK(Put("9", "1"));
5365 ASSERT_OK(Flush());
5366
5367 expected_lower_bound = 0;
5368 expected_higher_bound = 8 * 1024;
5369
5370 ASSERT_OK(Put("1", "1"));
5371 ASSERT_OK(Put("9", "1"));
5372 ASSERT_OK(Flush());
5373
5374 ASSERT_OK(Put("1", "1"));
5375 ASSERT_OK(Put("9", "1"));
5376 ASSERT_OK(Flush());
5377
5378 // Full compaction to make sure there is no L0 file after the open.
5379 ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
5380
5381 ASSERT_TRUE(called.load());
5382 called = false;
5383
5384 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
5385 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
5386
5387 std::atomic<bool> first_call(true);
5388 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5389 "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
5390 size_t* prefetch_size = static_cast<size_t*>(arg);
5391 if (first_call) {
5392 EXPECT_EQ(4 * 1024, *prefetch_size);
5393 first_call = false;
5394 } else {
5395 EXPECT_GE(4 * 1024, *prefetch_size);
5396 }
5397 called = true;
5398 });
5399 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
5400
5401 Options options = CurrentOptions();
5402 options.max_file_opening_threads = 1; // one thread
5403 BlockBasedTableOptions table_options;
5404 table_options.cache_index_and_filter_blocks = true;
5405 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
5406 options.max_open_files = -1;
5407 Reopen(options);
5408
5409 ASSERT_OK(Put("1", "1"));
5410 ASSERT_OK(Put("9", "1"));
5411 ASSERT_OK(Flush());
5412
5413 ASSERT_OK(Put("1", "1"));
5414 ASSERT_OK(Put("9", "1"));
5415 ASSERT_OK(Flush());
5416
5417 ASSERT_TRUE(called.load());
5418 called = false;
5419
5420 // Parallel loading SST files
5421 options.max_file_opening_threads = 16;
5422 Reopen(options);
5423
5424 ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
5425
5426 ASSERT_TRUE(called.load());
5427
5428 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
5429 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
5430 }
5431
5432 TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) {
5433 // Setup sync point dependency to reproduce the race condition of
5434 // DBImpl::GetColumnFamilyHandleUnlocked
5435 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
5436 {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1",
5437 "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2"},
5438 {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2",
5439 "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1"},
5440 });
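// The dependencies force the second thread's lookup to start only after the
// first thread's lookup, and the first thread to re-read its handle only
// after the second lookup has finished, exercising concurrent access to the
// column family handle map.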
5441 SyncPoint::GetInstance()->EnableProcessing();
5442
5443 CreateColumnFamilies({"test1", "test2"}, Options());
5444 ASSERT_EQ(handles_.size(), 2);
5445
5446 DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
5447 port::Thread user_thread1([&]() {
5448 auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID());
5449 ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
5450 TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1");
5451 TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1");
5452 ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
5453 });
5454
5455 port::Thread user_thread2([&]() {
5456 TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2");
5457 auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[1]->GetID());
5458 ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
5459 TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2");
5460 ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
5461 });
5462
5463 user_thread1.join();
5464 user_thread2.join();
5465
5466 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
5467 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
5468 }
5469
5470 #ifndef ROCKSDB_LITE
5471 TEST_F(DBTest2, TestCompactFiles) {
5472 // Setup sync point dependency to reproduce the race condition between
5473 // CompactFiles() and IngestExternalFile().
5474 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
5475 {"TestCompactFiles::IngestExternalFile1",
5476 "TestCompactFiles::IngestExternalFile2"},
5477 });
5478 SyncPoint::GetInstance()->EnableProcessing();
5479
5480 Options options;
5481 options.env = env_;
5482 options.num_levels = 2;
5483 options.disable_auto_compactions = true;
5484 Reopen(options);
5485 auto* handle = db_->DefaultColumnFamily();
5486 ASSERT_EQ(db_->NumberLevels(handle), 2);
5487
5488 ROCKSDB_NAMESPACE::SstFileWriter sst_file_writer{
5489 ROCKSDB_NAMESPACE::EnvOptions(), options};
5490 std::string external_file1 = dbname_ + "/test_compact_files1.sst_t";
5491 std::string external_file2 = dbname_ + "/test_compact_files2.sst_t";
5492 std::string external_file3 = dbname_ + "/test_compact_files3.sst_t";
5493
5494 ASSERT_OK(sst_file_writer.Open(external_file1));
5495 ASSERT_OK(sst_file_writer.Put("1", "1"));
5496 ASSERT_OK(sst_file_writer.Put("2", "2"));
5497 ASSERT_OK(sst_file_writer.Finish());
5498
5499 ASSERT_OK(sst_file_writer.Open(external_file2));
5500 ASSERT_OK(sst_file_writer.Put("3", "3"));
5501 ASSERT_OK(sst_file_writer.Put("4", "4"));
5502 ASSERT_OK(sst_file_writer.Finish());
5503
5504 ASSERT_OK(sst_file_writer.Open(external_file3));
5505 ASSERT_OK(sst_file_writer.Put("5", "5"));
5506 ASSERT_OK(sst_file_writer.Put("6", "6"));
5507 ASSERT_OK(sst_file_writer.Finish());
5508
5509 ASSERT_OK(db_->IngestExternalFile(handle, {external_file1, external_file3},
5510 IngestExternalFileOptions()));
5511 ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2);
5512 std::vector<std::string> files;
5513 GetSstFiles(env_, dbname_, &files);
5514 ASSERT_EQ(files.size(), 2);
5515
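// Run CompactFiles() on the two ingested files while concurrently ingesting
// a third file whose key range falls between them.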
5516 Status user_thread1_status;
5517 port::Thread user_thread1([&]() {
5518 user_thread1_status =
5519 db_->CompactFiles(CompactionOptions(), handle, files, 1);
5520 });
5521
5522 Status user_thread2_status;
5523 port::Thread user_thread2([&]() {
5524 user_thread2_status = db_->IngestExternalFile(handle, {external_file2},
5525 IngestExternalFileOptions());
5526 TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile1");
5527 });
5528
5529 user_thread1.join();
5530 user_thread2.join();
5531
5532 ASSERT_OK(user_thread1_status);
5533 ASSERT_OK(user_thread2_status);
5534
5535 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
5536 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
5537 }
5538 #endif // ROCKSDB_LITE
5539
5540 TEST_F(DBTest2, MultiDBParallelOpenTest) {
5541 const int kNumDbs = 2;
5542 Options options = CurrentOptions();
5543 std::vector<std::string> dbnames;
5544 for (int i = 0; i < kNumDbs; ++i) {
5545 dbnames.emplace_back(test::PerThreadDBPath(env_, "db" + ToString(i)));
5546 ASSERT_OK(DestroyDB(dbnames.back(), options));
5547 }
5548
5549 // Verify empty DBs can be created in parallel
5550 std::vector<std::thread> open_threads;
5551 std::vector<DB*> dbs{static_cast<unsigned int>(kNumDbs), nullptr};
5552 options.create_if_missing = true;
5553 for (int i = 0; i < kNumDbs; ++i) {
5554 open_threads.emplace_back(
5555 [&](int dbnum) {
5556 ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
5557 },
5558 i);
5559 }
5560
5561 // Now add some data and close, so next we can verify non-empty DBs can be
5562 // recovered in parallel
5563 for (int i = 0; i < kNumDbs; ++i) {
5564 open_threads[i].join();
5565 ASSERT_OK(dbs[i]->Put(WriteOptions(), "xi", "gua"));
5566 delete dbs[i];
5567 }
5568
5569 // Verify non-empty DBs can be recovered in parallel
5570 open_threads.clear();
5571 for (int i = 0; i < kNumDbs; ++i) {
5572 open_threads.emplace_back(
5573 [&](int dbnum) {
5574 ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
5575 },
5576 i);
5577 }
5578
5579 // Wait and cleanup
5580 for (int i = 0; i < kNumDbs; ++i) {
5581 open_threads[i].join();
5582 delete dbs[i];
5583 ASSERT_OK(DestroyDB(dbnames[i], options));
5584 }
5585 }
5586
5587 namespace {
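// A minimal Statistics implementation that only uses the old interface and
// counts how many times recordTick() and measureTime() are called.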
5588 class DummyOldStats : public Statistics {
5589 public:
5590 const char* Name() const override { return "DummyOldStats"; }
5591 uint64_t getTickerCount(uint32_t /*ticker_type*/) const override { return 0; }
5592 void recordTick(uint32_t /* ticker_type */, uint64_t /* count */) override {
5593 num_rt++;
5594 }
5595 void setTickerCount(uint32_t /*ticker_type*/, uint64_t /*count*/) override {}
5596 uint64_t getAndResetTickerCount(uint32_t /*ticker_type*/) override {
5597 return 0;
5598 }
5599 void measureTime(uint32_t /*histogram_type*/, uint64_t /*count*/) override {
5600 num_mt++;
5601 }
5602 void histogramData(
5603 uint32_t /*histogram_type*/,
5604 ROCKSDB_NAMESPACE::HistogramData* const /*data*/) const override {}
5605 std::string getHistogramString(uint32_t /*type*/) const override {
5606 return "";
5607 }
5608 bool HistEnabledForType(uint32_t /*type*/) const override { return false; }
5609 std::string ToString() const override { return ""; }
5610 std::atomic<int> num_rt{0};
5611 std::atomic<int> num_mt{0};
5612 };
5613 } // namespace
5614
5615 TEST_F(DBTest2, OldStatsInterface) {
5616 DummyOldStats* dos = new DummyOldStats();
5617 std::shared_ptr<Statistics> stats(dos);
5618 Options options = CurrentOptions();
5619 options.create_if_missing = true;
5620 options.statistics = stats;
5621 Reopen(options);
5622
5623 ASSERT_OK(Put("foo", "bar"));
5624 ASSERT_EQ("bar", Get("foo"));
5625 ASSERT_OK(Flush());
5626 ASSERT_EQ("bar", Get("foo"));
5627
5628 ASSERT_GT(dos->num_rt, 0);
5629 ASSERT_GT(dos->num_mt, 0);
5630 }
5631
5632 TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
5633 const Snapshot* ss = db_->GetSnapshot();
5634
5635 for (auto h : handles_) {
5636 db_->DestroyColumnFamilyHandle(h);
5637 }
5638 handles_.clear();
5639
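// Close() should fail while the snapshot is still unreleased.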
5640 ASSERT_NOK(db_->Close());
5641 db_->ReleaseSnapshot(ss);
5642 ASSERT_OK(db_->Close());
5643 delete db_;
5644 db_ = nullptr;
5645 }
5646
5647 TEST_F(DBTest2, PrefixBloomReseek) {
5648 Options options = CurrentOptions();
5649 options.create_if_missing = true;
5650 options.prefix_extractor.reset(NewCappedPrefixTransform(3));
5651 BlockBasedTableOptions bbto;
5652 bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
5653 bbto.whole_key_filtering = false;
5654 options.table_factory.reset(NewBlockBasedTableFactory(bbto));
5655 DestroyAndReopen(options);
5656
5657 // Construct two L1 files with keys:
5658 // f1:[aaa1 ccc1] f2:[ddd0]
5659 ASSERT_OK(Put("aaa1", ""));
5660 ASSERT_OK(Put("ccc1", ""));
5661 ASSERT_OK(Flush());
5662 ASSERT_OK(Put("ddd0", ""));
5663 ASSERT_OK(Flush());
5664 CompactRangeOptions cro;
5665 cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
5666 ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
5667
5668 ASSERT_OK(Put("bbb1", ""));
5669
5670 Iterator* iter = db_->NewIterator(ReadOptions());
5671 ASSERT_OK(iter->status());
5672
5673 // Seeking into f1, the iterator will check the bloom filter, which reports
5674 // the file iterator to be invalid, and the cursor will be placed at f2, with
5675 // the next key being "ddd0".
5676 iter->Seek("bbb1");
5677 ASSERT_TRUE(iter->Valid());
5678 ASSERT_EQ("bbb1", iter->key().ToString());
5679
5680 // Reseek ccc1, the L1 iterator needs to go back to f1 and reseek.
5681 iter->Seek("ccc1");
5682 ASSERT_TRUE(iter->Valid());
5683 ASSERT_EQ("ccc1", iter->key().ToString());
5684
5685 delete iter;
5686 }
5687
5688 TEST_F(DBTest2, PrefixBloomFilteredOut) {
5689 Options options = CurrentOptions();
5690 options.create_if_missing = true;
5691 options.prefix_extractor.reset(NewCappedPrefixTransform(3));
5692 BlockBasedTableOptions bbto;
5693 bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
5694 bbto.whole_key_filtering = false;
5695 options.table_factory.reset(NewBlockBasedTableFactory(bbto));
5696 DestroyAndReopen(options);
5697
5698 // Construct two L1 files with keys:
5699 // f1:[aaa1 ccc1] f2:[ddd0]
5700 ASSERT_OK(Put("aaa1", ""));
5701 ASSERT_OK(Put("ccc1", ""));
5702 ASSERT_OK(Flush());
5703 ASSERT_OK(Put("ddd0", ""));
5704 ASSERT_OK(Flush());
5705 CompactRangeOptions cro;
5706 cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
5707 ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
5708
5709 Iterator* iter = db_->NewIterator(ReadOptions());
5710 ASSERT_OK(iter->status());
5711
5712 // The seek key is filtered out by f1's bloom filter.
5713 // This is just one of several valid positions following the contract.
5714 // Positioning to ccc1 or ddd0 is also valid. This is just to validate
5715 // the behavior of the current implementation. If the underlying
5716 // implementation changes, the test might fail here.
5717 iter->Seek("bbb1");
5718 ASSERT_OK(iter->status());
5719 ASSERT_FALSE(iter->Valid());
5720
5721 delete iter;
5722 }
5723
5724 #ifndef ROCKSDB_LITE
5725 TEST_F(DBTest2, RowCacheSnapshot) {
5726 Options options = CurrentOptions();
5727 options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
5728 options.row_cache = NewLRUCache(8 * 8192);
5729 DestroyAndReopen(options);
5730
5731 ASSERT_OK(Put("foo", "bar1"));
5732
5733 const Snapshot* s1 = db_->GetSnapshot();
5734
5735 ASSERT_OK(Put("foo", "bar2"));
5736 ASSERT_OK(Flush());
5737
5738 ASSERT_OK(Put("foo2", "bar"));
5739 const Snapshot* s2 = db_->GetSnapshot();
5740 ASSERT_OK(Put("foo3", "bar"));
5741 const Snapshot* s3 = db_->GetSnapshot();
5742
5743 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
5744 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
5745 ASSERT_EQ(Get("foo"), "bar2");
5746 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
5747 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
5748 ASSERT_EQ(Get("foo"), "bar2");
5749 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
5750 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
5751 ASSERT_EQ(Get("foo", s1), "bar1");
5752 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
5753 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
5754 ASSERT_EQ(Get("foo", s2), "bar2");
5755 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
5756 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
5757 ASSERT_EQ(Get("foo", s1), "bar1");
5758 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
5759 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
5760 ASSERT_EQ(Get("foo", s3), "bar2");
5761 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
5762 ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
5763
5764 db_->ReleaseSnapshot(s1);
5765 db_->ReleaseSnapshot(s2);
5766 db_->ReleaseSnapshot(s3);
5767 }
5768 #endif // ROCKSDB_LITE
5769
5770 // When the DB is reopened with multiple column families, the manifest file
5771 // is written after the first CF is flushed, and it is written again
5772 // after each flush. If the DB crashes between the flushes, the CF that was
5773 // already flushed will have advanced past the latest log file, which we then
5774 // require not to be corrupted, triggering a corruption report.
5775 // We need to fix the bug and enable the test.
5776 TEST_F(DBTest2, CrashInRecoveryMultipleCF) {
5777 const std::vector<std::string> sync_points = {
5778 "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable",
5779 "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0"};
5780 for (const auto& test_sync_point : sync_points) {
5781 Options options = CurrentOptions();
5782 // First destroy original db to ensure a clean start.
5783 DestroyAndReopen(options);
5784 options.create_if_missing = true;
5785 options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
5786 CreateAndReopenWithCF({"pikachu"}, options);
5787 ASSERT_OK(Put("foo", "bar"));
5788 ASSERT_OK(Flush());
5789 ASSERT_OK(Put(1, "foo", "bar"));
5790 ASSERT_OK(Flush(1));
5791 ASSERT_OK(Put("foo", "bar"));
5792 ASSERT_OK(Put(1, "foo", "bar"));
5793 // The value is large enough to be divided into two blocks.
5794 std::string large_value(400, ' ');
5795 ASSERT_OK(Put("foo1", large_value));
5796 ASSERT_OK(Put("foo2", large_value));
5797 Close();
5798
5799 // Corrupt the log file in the middle, so that it is not corrupted
5800 // in the tail.
5801 std::vector<std::string> filenames;
5802 ASSERT_OK(env_->GetChildren(dbname_, &filenames));
5803 for (const auto& f : filenames) {
5804 uint64_t number;
5805 FileType type;
5806 if (ParseFileName(f, &number, &type) && type == FileType::kWalFile) {
5807 std::string fname = dbname_ + "/" + f;
5808 std::string file_content;
5809 ASSERT_OK(ReadFileToString(env_, fname, &file_content));
5810 file_content[400] = 'h';
5811 file_content[401] = 'a';
5812 ASSERT_OK(WriteStringToFile(env_, file_content, fname));
5813 break;
5814 }
5815 }
5816
5817 // Reopen and freeze the file system after the first manifest write.
5818 FaultInjectionTestEnv fit_env(options.env);
5819 options.env = &fit_env;
5820 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
5821 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5822 test_sync_point,
5823 [&](void* /*arg*/) { fit_env.SetFilesystemActive(false); });
5824 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
5825 ASSERT_NOK(TryReopenWithColumnFamilies(
5826 {kDefaultColumnFamilyName, "pikachu"}, options));
5827 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
5828
5829 fit_env.SetFilesystemActive(true);
5830 // If we continue using the fault injection Env, it complains about something
5831 // when renaming the CURRENT file, which is not expected. Need to investigate
5832 // why.
5833 options.env = env_;
5834 ASSERT_OK(TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"},
5835 options));
5836 }
5837 }
5838
5839 TEST_F(DBTest2, SeekFileRangeDeleteTail) {
5840 Options options = CurrentOptions();
5841 options.prefix_extractor.reset(NewCappedPrefixTransform(1));
5842 options.num_levels = 3;
5843 DestroyAndReopen(options);
5844
5845 ASSERT_OK(Put("a", "a"));
5846 const Snapshot* s1 = db_->GetSnapshot();
5847 ASSERT_OK(
5848 db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "f"));
5849 ASSERT_OK(Put("b", "a"));
5850 ASSERT_OK(Flush());
5851
5852 ASSERT_OK(Put("x", "a"));
5853 ASSERT_OK(Put("z", "a"));
5854 ASSERT_OK(Flush());
5855
5856 CompactRangeOptions cro;
5857 cro.change_level = true;
5858 cro.target_level = 2;
5859 ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
5860
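// Key "e" is covered by the range deletion ["a", "f"), so the seek should
// land on "x" in the next file.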
5861 {
5862 ReadOptions ro;
5863 ro.total_order_seek = true;
5864 std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
5865 ASSERT_OK(iter->status());
5866 iter->Seek("e");
5867 ASSERT_TRUE(iter->Valid());
5868 ASSERT_EQ("x", iter->key().ToString());
5869 }
5870 db_->ReleaseSnapshot(s1);
5871 }
5872
5873 TEST_F(DBTest2, BackgroundPurgeTest) {
5874 Options options = CurrentOptions();
5875 options.write_buffer_manager =
5876 std::make_shared<ROCKSDB_NAMESPACE::WriteBufferManager>(1 << 20);
5877 options.avoid_unnecessary_blocking_io = true;
5878 DestroyAndReopen(options);
5879 size_t base_value = options.write_buffer_manager->memory_usage();
5880
5881 ASSERT_OK(Put("a", "a"));
5882 Iterator* iter = db_->NewIterator(ReadOptions());
5883 ASSERT_OK(iter->status());
5884 ASSERT_OK(Flush());
5885 size_t value = options.write_buffer_manager->memory_usage();
5886 ASSERT_GT(value, base_value);
5887
5888 db_->GetEnv()->SetBackgroundThreads(1, Env::Priority::HIGH);
5889 test::SleepingBackgroundTask sleeping_task_after;
5890 db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
5891 &sleeping_task_after, Env::Priority::HIGH);
5892 delete iter;
5893
5894 Env::Default()->SleepForMicroseconds(100000);
5895 value = options.write_buffer_manager->memory_usage();
5896 ASSERT_GT(value, base_value);
5897
5898 sleeping_task_after.WakeUp();
5899 sleeping_task_after.WaitUntilDone();
5900
5901 test::SleepingBackgroundTask sleeping_task_after2;
5902 db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
5903 &sleeping_task_after2, Env::Priority::HIGH);
5904 sleeping_task_after2.WakeUp();
5905 sleeping_task_after2.WaitUntilDone();
5906
5907 value = options.write_buffer_manager->memory_usage();
5908 ASSERT_EQ(base_value, value);
5909 }
5910
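// A tiny max_manifest_file_size forces the MANIFEST to roll over, while a
// flush of the default column family runs concurrently with flushes and
// compaction of the other column family, exercising the interaction between
// switching the memtable and writing a new MANIFEST.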
5911 TEST_F(DBTest2, SwitchMemtableRaceWithNewManifest) {
5912 Options options = CurrentOptions();
5913 DestroyAndReopen(options);
5914 options.max_manifest_file_size = 10;
5915 options.create_if_missing = true;
5916 CreateAndReopenWithCF({"pikachu"}, options);
5917 ASSERT_EQ(2, handles_.size());
5918
5919 ASSERT_OK(Put("foo", "value"));
5920 const int kL0Files = options.level0_file_num_compaction_trigger;
5921 for (int i = 0; i < kL0Files; ++i) {
5922 ASSERT_OK(Put(/*cf=*/1, "a", std::to_string(i)));
5923 ASSERT_OK(Flush(/*cf=*/1));
5924 }
5925
5926 port::Thread thread([&]() { ASSERT_OK(Flush()); });
5927 ASSERT_OK(dbfull()->TEST_WaitForCompact());
5928 thread.join();
5929 }
5930
5931 TEST_F(DBTest2, SameSmallestInSameLevel) {
5932 // This test validates fractional cascading logic when several files at
5933 // one level contain only the same user key.
5934 Options options = CurrentOptions();
5935 options.merge_operator = MergeOperators::CreateStringAppendOperator();
5936 DestroyAndReopen(options);
5937
5938 ASSERT_OK(Put("key", "1"));
5939 ASSERT_OK(Put("key", "2"));
5940 ASSERT_OK(db_->Merge(WriteOptions(), "key", "3"));
5941 ASSERT_OK(db_->Merge(WriteOptions(), "key", "4"));
5942 ASSERT_OK(Flush());
5943 CompactRangeOptions cro;
5944 cro.change_level = true;
5945 cro.target_level = 2;
5946 ASSERT_OK(dbfull()->CompactRange(cro, db_->DefaultColumnFamily(), nullptr,
5947 nullptr));
5948
5949 ASSERT_OK(db_->Merge(WriteOptions(), "key", "5"));
5950 ASSERT_OK(Flush());
5951 ASSERT_OK(db_->Merge(WriteOptions(), "key", "6"));
5952 ASSERT_OK(Flush());
5953 ASSERT_OK(db_->Merge(WriteOptions(), "key", "7"));
5954 ASSERT_OK(Flush());
5955 ASSERT_OK(db_->Merge(WriteOptions(), "key", "8"));
5956 ASSERT_OK(Flush());
5957 ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
5958 #ifndef ROCKSDB_LITE
5959 ASSERT_EQ("0,4,1", FilesPerLevel());
5960 #endif // ROCKSDB_LITE
5961
5962 ASSERT_EQ("2,3,4,5,6,7,8", Get("key"));
5963 }
5964
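// The sync point callback overrides the status returned by the version
// builder's consistency check with a Corruption status. With
// force_consistency_checks enabled, reopening the DB must fail.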
5965 TEST_F(DBTest2, FileConsistencyCheckInOpen) {
5966 ASSERT_OK(Put("foo", "bar"));
5967 ASSERT_OK(Flush());
5968
5969 SyncPoint::GetInstance()->SetCallBack(
5970 "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
5971 Status* ret_s = static_cast<Status*>(arg);
5972 *ret_s = Status::Corruption("fcc");
5973 });
5974 SyncPoint::GetInstance()->EnableProcessing();
5975
5976 Options options = CurrentOptions();
5977 options.force_consistency_checks = true;
5978 ASSERT_NOK(TryReopen(options));
5979
5980 SyncPoint::GetInstance()->DisableProcessing();
5981 }
5982
5983 TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) {
5984 // create a DB with block prefix index
5985 BlockBasedTableOptions table_options;
5986 Options options = CurrentOptions();
5987 table_options.block_size = 300;
5988 table_options.index_type = BlockBasedTableOptions::kHashSearch;
5989 table_options.index_shortening =
5990 BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
5991 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
5992 options.prefix_extractor.reset(NewFixedPrefixTransform(1));
5993
5994 Reopen(options);
5995
5996 Random rnd(301);
5997 std::string large_value = rnd.RandomString(500);
5998
5999 ASSERT_OK(Put("a1", large_value));
6000 ASSERT_OK(Put("x1", large_value));
6001 ASSERT_OK(Put("y1", large_value));
6002 ASSERT_OK(Flush());
6003
6004 {
6005 std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
6006 ASSERT_OK(iterator->status());
6007 iterator->SeekForPrev("x3");
6008 ASSERT_TRUE(iterator->Valid());
6009 ASSERT_EQ("x1", iterator->key().ToString());
6010
6011 iterator->SeekForPrev("a3");
6012 ASSERT_TRUE(iterator->Valid());
6013 ASSERT_EQ("a1", iterator->key().ToString());
6014
6015 iterator->SeekForPrev("y3");
6016 ASSERT_TRUE(iterator->Valid());
6017 ASSERT_EQ("y1", iterator->key().ToString());
6018
6019 // Query more than one non-existing prefix to cover both the case of an
6020 // empty hash bucket and the case of a hash bucket conflict.
6021 iterator->SeekForPrev("b1");
6022 // Result should be not valid or "a1".
6023 if (iterator->Valid()) {
6024 ASSERT_EQ("a1", iterator->key().ToString());
6025 }
6026
6027 iterator->SeekForPrev("c1");
6028 // Result should be not valid or "a1".
6029 if (iterator->Valid()) {
6030 ASSERT_EQ("a1", iterator->key().ToString());
6031 }
6032
6033 iterator->SeekForPrev("d1");
6034 // Result should be not valid or "a1".
6035 if (iterator->Valid()) {
6036 ASSERT_EQ("a1", iterator->key().ToString());
6037 }
6038
6039 iterator->SeekForPrev("y3");
6040 ASSERT_TRUE(iterator->Valid());
6041 ASSERT_EQ("y1", iterator->key().ToString());
6042 }
6043 }
6044
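// With a partitioned index and both the table cache and block cache shrunk
// to zero capacity, every Get() has to reopen the SST file. The test env
// makes random reads fail with the configured odds; whenever a read failure
// was injected during table open, Get() must report an error instead of
// silently succeeding.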
6045 TEST_F(DBTest2, PartitionedIndexPrefetchFailure) {
6046 Options options = last_options_;
6047 options.env = env_;
6048 options.max_open_files = 20;
6049 BlockBasedTableOptions bbto;
6050 bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
6051 bbto.metadata_block_size = 128;
6052 bbto.block_size = 128;
6053 bbto.block_cache = NewLRUCache(16777216);
6054 bbto.cache_index_and_filter_blocks = true;
6055 options.table_factory.reset(NewBlockBasedTableFactory(bbto));
6056 DestroyAndReopen(options);
6057
6058 // Force no table cache so every read will preload the SST file.
6059 dbfull()->TEST_table_cache()->SetCapacity(0);
6060 bbto.block_cache->SetCapacity(0);
6061
6062 Random rnd(301);
6063 for (int i = 0; i < 4096; i++) {
6064 ASSERT_OK(Put(Key(i), rnd.RandomString(32)));
6065 }
6066 ASSERT_OK(Flush());
6067
6068 // Try random failures during table open 300 times.
6069 for (int i = 0; i < 300; i++) {
6070 env_->num_reads_fails_ = 0;
6071 env_->rand_reads_fail_odd_ = 8;
6072
6073 std::string value;
6074 Status s = dbfull()->Get(ReadOptions(), Key(1), &value);
6075 if (env_->num_reads_fails_ > 0) {
6076 ASSERT_NOK(s);
6077 } else {
6078 ASSERT_OK(s);
6079 }
6080 }
6081
6082 env_->rand_reads_fail_odd_ = 0;
6083 }
6084
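// The DB is first built with a fixed prefix length of 2 and then reopened
// with a length of 1. Iterator results must stay correct, and where filter
// checks are expected (non-partitioned filters), BLOOM_FILTER_PREFIX_CHECKED
// verifies that the prefix bloom filter is only consulted when doing so
// cannot change the result for the given seek key and upper bound.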
6085 TEST_F(DBTest2, ChangePrefixExtractor) {
6086 for (bool use_partitioned_filter : {true, false}) {
6087 // create a DB with block prefix index
6088 BlockBasedTableOptions table_options;
6089 Options options = CurrentOptions();
6090
6091 // Sometimes the filter is checked based on the upper bound. Assert
6092 // counters for that case. Otherwise, only check data correctness.
6093 #ifndef ROCKSDB_LITE
6094 bool expect_filter_check = !use_partitioned_filter;
6095 #else
6096 bool expect_filter_check = false;
6097 #endif
6098 table_options.partition_filters = use_partitioned_filter;
6099 if (use_partitioned_filter) {
6100 table_options.index_type =
6101 BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
6102 }
6103 table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
6104
6105 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
6106 options.statistics = CreateDBStatistics();
6107
6108 options.prefix_extractor.reset(NewFixedPrefixTransform(2));
6109 DestroyAndReopen(options);
6110
6111 Random rnd(301);
6112
6113 ASSERT_OK(Put("aa", ""));
6114 ASSERT_OK(Put("xb", ""));
6115 ASSERT_OK(Put("xx1", ""));
6116 ASSERT_OK(Put("xz1", ""));
6117 ASSERT_OK(Put("zz", ""));
6118 ASSERT_OK(Flush());
6119
6120 // After reopening the DB with the prefix size changed from 2 to 1, the
6121 // new prefix extractor won't take effect unless using it cannot change
6122 // the results for the given upper bound and seek key.
6123 options.prefix_extractor.reset(NewFixedPrefixTransform(1));
6124 Reopen(options);
6125
6126 {
6127 std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
6128 ASSERT_OK(iterator->status());
6129 iterator->Seek("xa");
6130 ASSERT_TRUE(iterator->Valid());
6131 ASSERT_EQ("xb", iterator->key().ToString());
6132 // It's a bug that the counter BLOOM_FILTER_PREFIX_CHECKED is not
6133 // correct in this case. So don't check counters in this case.
6134 if (expect_filter_check) {
6135 ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6136 }
6137
6138 iterator->Seek("xz");
6139 ASSERT_TRUE(iterator->Valid());
6140 ASSERT_EQ("xz1", iterator->key().ToString());
6141 if (expect_filter_check) {
6142 ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6143 }
6144 }
6145
6146 std::string ub_str = "xg9";
6147 Slice ub(ub_str);
6148 ReadOptions ro;
6149 ro.iterate_upper_bound = &ub;
6150
6151 {
6152 std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6153 ASSERT_OK(iterator->status());
6154
6155 // SeekForPrev() never uses the prefix bloom filter if the extractor changed.
6156 iterator->SeekForPrev("xg0");
6157 ASSERT_TRUE(iterator->Valid());
6158 ASSERT_EQ("xb", iterator->key().ToString());
6159 if (expect_filter_check) {
6160 ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6161 }
6162 }
6163
6164 ub_str = "xx9";
6165 ub = Slice(ub_str);
6166 {
6167 std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6168 ASSERT_OK(iterator->status());
6169
6170 iterator->Seek("x");
6171 ASSERT_TRUE(iterator->Valid());
6172 ASSERT_EQ("xb", iterator->key().ToString());
6173 if (expect_filter_check) {
6174 ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6175 }
6176
6177 iterator->Seek("xx0");
6178 ASSERT_TRUE(iterator->Valid());
6179 ASSERT_EQ("xx1", iterator->key().ToString());
6180 if (expect_filter_check) {
6181 ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6182 }
6183 }
6184
6185 CompactRangeOptions compact_range_opts;
6186 compact_range_opts.bottommost_level_compaction =
6187 BottommostLevelCompaction::kForce;
6188 ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
6189 ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
6190
6191 // Re-execute similar queries after a full compaction
6192 {
6193 std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
6194
6195 iterator->Seek("x");
6196 ASSERT_TRUE(iterator->Valid());
6197 ASSERT_EQ("xb", iterator->key().ToString());
6198 if (expect_filter_check) {
6199 ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6200 }
6201
6202 iterator->Seek("xg");
6203 ASSERT_TRUE(iterator->Valid());
6204 ASSERT_EQ("xx1", iterator->key().ToString());
6205 if (expect_filter_check) {
6206 ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6207 }
6208
6209 iterator->Seek("xz");
6210 ASSERT_TRUE(iterator->Valid());
6211 ASSERT_EQ("xz1", iterator->key().ToString());
6212 if (expect_filter_check) {
6213 ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6214 }
6215
6216 ASSERT_OK(iterator->status());
6217 }
6218 {
6219 std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6220
6221 iterator->SeekForPrev("xx0");
6222 ASSERT_TRUE(iterator->Valid());
6223 ASSERT_EQ("xb", iterator->key().ToString());
6224 if (expect_filter_check) {
6225 ASSERT_EQ(5, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6226 }
6227
6228 iterator->Seek("xx0");
6229 ASSERT_TRUE(iterator->Valid());
6230 ASSERT_EQ("xx1", iterator->key().ToString());
6231 if (expect_filter_check) {
6232 ASSERT_EQ(6, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6233 }
6234
6235 ASSERT_OK(iterator->status());
6236 }
6237
6238 ub_str = "xg9";
6239 ub = Slice(ub_str);
6240 {
6241 std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6242 iterator->SeekForPrev("xg0");
6243 ASSERT_TRUE(iterator->Valid());
6244 ASSERT_EQ("xb", iterator->key().ToString());
6245 if (expect_filter_check) {
6246 ASSERT_EQ(7, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6247 }
6248 ASSERT_OK(iterator->status());
6249 }
6250 }
6251 }
6252
6253 TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) {
6254 // create a DB with block prefix index
6255 BlockBasedTableOptions table_options;
6256 Options options = CurrentOptions();
6257 table_options.block_size = 300;
6258 table_options.index_type = BlockBasedTableOptions::kHashSearch;
6259 table_options.index_shortening =
6260 BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
6261 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
6262 options.prefix_extractor.reset(NewFixedPrefixTransform(1));
6263 options.level0_file_num_compaction_trigger = 8;
6264
6265 Reopen(options);
6266
6267 ASSERT_OK(Put("b1", "ok"));
6268 ASSERT_OK(Flush());
6269
6270 // Flush several files so that there is a high chance that the hash
6271 // bucket for "b" is empty in at least one of the files.
6272 ASSERT_OK(Put("a1", ""));
6273 ASSERT_OK(Put("c1", ""));
6274 ASSERT_OK(Flush());
6275
6276 ASSERT_OK(Put("a2", ""));
6277 ASSERT_OK(Put("c2", ""));
6278 ASSERT_OK(Flush());
6279
6280 ASSERT_OK(Put("a3", ""));
6281 ASSERT_OK(Put("c3", ""));
6282 ASSERT_OK(Flush());
6283
6284 ASSERT_OK(Put("a4", ""));
6285 ASSERT_OK(Put("c4", ""));
6286 ASSERT_OK(Flush());
6287
6288 ASSERT_OK(Put("a5", ""));
6289 ASSERT_OK(Put("c5", ""));
6290 ASSERT_OK(Flush());
6291
6292 ASSERT_EQ("ok", Get("b1"));
6293 }
6294
6295 #ifndef ROCKSDB_LITE
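// With auto_prefix_mode, the iterator may use the prefix bloom filter only
// when the seek key and iterate_upper_bound guarantee that doing so cannot
// change the result. BLOOM_FILTER_PREFIX_CHECKED tracks how often the
// filter is actually consulted.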
6296 TEST_F(DBTest2, AutoPrefixMode1) {
6297 // create a DB with block prefix index
6298 BlockBasedTableOptions table_options;
6299 Options options = CurrentOptions();
6300 table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
6301 options.table_factory.reset(NewBlockBasedTableFactory(table_options));
6302 options.prefix_extractor.reset(NewFixedPrefixTransform(1));
6303 options.statistics = CreateDBStatistics();
6304
6305 Reopen(options);
6306
6307 Random rnd(301);
6308 std::string large_value = rnd.RandomString(500);
6309
6310 ASSERT_OK(Put("a1", large_value));
6311 ASSERT_OK(Put("x1", large_value));
6312 ASSERT_OK(Put("y1", large_value));
6313 ASSERT_OK(Flush());
6314
6315 ReadOptions ro;
6316 ro.total_order_seek = false;
6317 ro.auto_prefix_mode = true;
6318 {
6319 std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6320 iterator->Seek("b1");
6321 ASSERT_TRUE(iterator->Valid());
6322 ASSERT_EQ("x1", iterator->key().ToString());
6323 ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6324 ASSERT_OK(iterator->status());
6325 }
6326
6327 std::string ub_str = "b9";
6328 Slice ub(ub_str);
6329 ro.iterate_upper_bound = &ub;
6330
6331 {
6332 std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6333 iterator->Seek("b1");
6334 ASSERT_FALSE(iterator->Valid());
6335 ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6336 ASSERT_OK(iterator->status());
6337 }
6338
6339 ub_str = "z";
6340 ub = Slice(ub_str);
6341 {
6342 std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6343 iterator->Seek("b1");
6344 ASSERT_TRUE(iterator->Valid());
6345 ASSERT_EQ("x1", iterator->key().ToString());
6346 ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6347 ASSERT_OK(iterator->status());
6348 }
6349
6350 ub_str = "c";
6351 ub = Slice(ub_str);
6352 {
6353 std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6354 iterator->Seek("b1");
6355 ASSERT_FALSE(iterator->Valid());
6356 ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6357 ASSERT_OK(iterator->status());
6358 }
6359
6360 // The same queries without recreating the iterator.
6361 {
6362 ub_str = "b9";
6363 ub = Slice(ub_str);
6364 ro.iterate_upper_bound = &ub;
6365
6366 std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
6367 iterator->Seek("b1");
6368 ASSERT_FALSE(iterator->Valid());
6369 ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6370 ASSERT_OK(iterator->status());
6371
6372 ub_str = "z";
6373 ub = Slice(ub_str);
6374
6375 iterator->Seek("b1");
6376 ASSERT_TRUE(iterator->Valid());
6377 ASSERT_EQ("x1", iterator->key().ToString());
6378 ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6379
6380 ub_str = "c";
6381 ub = Slice(ub_str);
6382
6383 iterator->Seek("b1");
6384 ASSERT_FALSE(iterator->Valid());
6385 ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6386
6387 ub_str = "b9";
6388 ub = Slice(ub_str);
6389 ro.iterate_upper_bound = &ub;
6390 iterator->SeekForPrev("b1");
6391 ASSERT_TRUE(iterator->Valid());
6392 ASSERT_EQ("a1", iterator->key().ToString());
6393 ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
6394
6395 ub_str = "zz";
6396 ub = Slice(ub_str);
6397 ro.iterate_upper_bound = &ub;
6398 iterator->SeekToLast();
6399 ASSERT_TRUE(iterator->Valid());
6400 ASSERT_EQ("y1", iterator->key().ToString());
6401
6402 iterator->SeekToFirst();
6403 ASSERT_TRUE(iterator->Valid());
6404 ASSERT_EQ("a1", iterator->key().ToString());
6405 }
6406 }
6407
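// Parameterized on a sync point name, this fixture injects an IO error
// either right before or right after the rename that installs a new CURRENT
// file. Each test verifies that the operation fails cleanly and the DB can
// still be reopened afterwards.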
6408 class RenameCurrentTest : public DBTestBase,
6409 public testing::WithParamInterface<std::string> {
6410 public:
6411 RenameCurrentTest()
6412 : DBTestBase("rename_current_test", /*env_do_fsync=*/true),
6413 sync_point_(GetParam()) {}
6414
6415 ~RenameCurrentTest() override {}
6416
6417 void SetUp() override {
6418 env_->no_file_overwrite_.store(true, std::memory_order_release);
6419 }
6420
6421 void TearDown() override {
6422 env_->no_file_overwrite_.store(false, std::memory_order_release);
6423 }
6424
6425 void SetupSyncPoints() {
6426 SyncPoint::GetInstance()->DisableProcessing();
6427 SyncPoint::GetInstance()->SetCallBack(sync_point_, [&](void* arg) {
6428 Status* s = reinterpret_cast<Status*>(arg);
6429 assert(s);
6430 *s = Status::IOError("Injected IO error.");
6431 });
6432 }
6433
6434 const std::string sync_point_;
6435 };
6436
6437 INSTANTIATE_TEST_CASE_P(DistributedFS, RenameCurrentTest,
6438 ::testing::Values("SetCurrentFile:BeforeRename",
6439 "SetCurrentFile:AfterRename"));
6440
6441 TEST_P(RenameCurrentTest, Open) {
6442 Destroy(last_options_);
6443 Options options = GetDefaultOptions();
6444 options.create_if_missing = true;
6445 SetupSyncPoints();
6446 SyncPoint::GetInstance()->EnableProcessing();
6447 Status s = TryReopen(options);
6448 ASSERT_NOK(s);
6449
6450 SyncPoint::GetInstance()->DisableProcessing();
6451 Reopen(options);
6452 }
6453
6454 TEST_P(RenameCurrentTest, Flush) {
6455 Destroy(last_options_);
6456 Options options = GetDefaultOptions();
6457 options.max_manifest_file_size = 1;
6458 options.create_if_missing = true;
6459 Reopen(options);
6460 ASSERT_OK(Put("key", "value"));
6461 SetupSyncPoints();
6462 SyncPoint::GetInstance()->EnableProcessing();
6463 ASSERT_NOK(Flush());
6464
6465 ASSERT_NOK(Put("foo", "value"));
6466
6467 SyncPoint::GetInstance()->DisableProcessing();
6468 Reopen(options);
6469 ASSERT_EQ("value", Get("key"));
6470 ASSERT_EQ("NOT_FOUND", Get("foo"));
6471 }
6472
6473 TEST_P(RenameCurrentTest, Compaction) {
6474 Destroy(last_options_);
6475 Options options = GetDefaultOptions();
6476 options.max_manifest_file_size = 1;
6477 options.create_if_missing = true;
6478 Reopen(options);
6479 ASSERT_OK(Put("a", "a_value"));
6480 ASSERT_OK(Put("c", "c_value"));
6481 ASSERT_OK(Flush());
6482
6483 ASSERT_OK(Put("b", "b_value"));
6484 ASSERT_OK(Put("d", "d_value"));
6485 ASSERT_OK(Flush());
6486
6487 SetupSyncPoints();
6488 SyncPoint::GetInstance()->EnableProcessing();
6489 ASSERT_NOK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
6490 /*end=*/nullptr));
6491
6492 ASSERT_NOK(Put("foo", "value"));
6493
6494 SyncPoint::GetInstance()->DisableProcessing();
6495 Reopen(options);
6496 ASSERT_EQ("NOT_FOUND", Get("foo"));
6497 ASSERT_EQ("d_value", Get("d"));
6498 }
6499
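// With bottommost_temperature set to kWarm and leveled compaction, files
// compacted into the bottommost level should be tagged kWarm while L0 files
// stay kUnknown. The test checks the per-temperature live-SST-size property,
// the file metadata, the per-temperature read IO stats, and that the
// temperature survives a reopen.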
6500 TEST_F(DBTest2, BottommostTemperature) {
6501 Options options = CurrentOptions();
6502 options.bottommost_temperature = Temperature::kWarm;
6503 options.level0_file_num_compaction_trigger = 2;
6504 Reopen(options);
6505
6506 auto size = GetSstSizeHelper(Temperature::kUnknown);
6507 ASSERT_EQ(size, 0);
6508 size = GetSstSizeHelper(Temperature::kWarm);
6509 ASSERT_EQ(size, 0);
6510 size = GetSstSizeHelper(Temperature::kHot);
6511 ASSERT_EQ(size, 0);
6512
6513 ASSERT_OK(Put("foo", "bar"));
6514 ASSERT_OK(Put("bar", "bar"));
6515 ASSERT_OK(Flush());
6516 ASSERT_OK(Put("foo", "bar"));
6517 ASSERT_OK(Put("bar", "bar"));
6518 ASSERT_OK(Flush());
6519 ASSERT_OK(dbfull()->TEST_WaitForCompact());
6520
6521 get_iostats_context()->Reset();
6522 IOStatsContext* iostats = get_iostats_context();
6523
6524 ColumnFamilyMetaData metadata;
6525 db_->GetColumnFamilyMetaData(&metadata);
6526 ASSERT_EQ(1, metadata.file_count);
6527 ASSERT_EQ(Temperature::kWarm, metadata.levels[1].files[0].temperature);
6528 size = GetSstSizeHelper(Temperature::kUnknown);
6529 ASSERT_EQ(size, 0);
6530 size = GetSstSizeHelper(Temperature::kWarm);
6531 ASSERT_GT(size, 0);
6532 ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
6533 ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
6534 ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
6535
6536 ASSERT_EQ("bar", Get("foo"));
6537
6538 ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
6539 ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1);
6540 ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
6541 ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
6542 ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
6543 ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
6544
6545 // non-bottommost file still has unknown temperature
6546 ASSERT_OK(Put("foo", "bar"));
6547 ASSERT_OK(Put("bar", "bar"));
6548 ASSERT_OK(Flush());
6549 ASSERT_EQ("bar", Get("bar"));
6550 ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
6551 ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1);
6552 ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
6553 ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
6554 ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
6555 ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
6556
6557 db_->GetColumnFamilyMetaData(&metadata);
6558 ASSERT_EQ(2, metadata.file_count);
6559 ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
6560 size = GetSstSizeHelper(Temperature::kUnknown);
6561 ASSERT_GT(size, 0);
6562 size = GetSstSizeHelper(Temperature::kWarm);
6563 ASSERT_GT(size, 0);
6564
6565 // Reopen and check that the temperature information is persisted.
6566 Reopen(options);
6567 db_->GetColumnFamilyMetaData(&metadata);
6568 ASSERT_EQ(2, metadata.file_count);
6569 ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
6570 ASSERT_EQ(Temperature::kWarm, metadata.levels[1].files[0].temperature);
6571 size = GetSstSizeHelper(Temperature::kUnknown);
6572 ASSERT_GT(size, 0);
6573 size = GetSstSizeHelper(Temperature::kWarm);
6574 ASSERT_GT(size, 0);
6575
6576 // Check other non-existent temperatures.
6577 size = GetSstSizeHelper(Temperature::kHot);
6578 ASSERT_EQ(size, 0);
6579 size = GetSstSizeHelper(Temperature::kCold);
6580 ASSERT_EQ(size, 0);
6581 std::string prop;
6582 ASSERT_TRUE(dbfull()->GetProperty(
6583 DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
6584 &prop));
6585 ASSERT_EQ(std::atoi(prop.c_str()), 0);
6586 }
6587
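// Same idea under universal compaction. In addition, changing
// bottommost_temperature on reopen must not re-tag existing files; only a
// file newly written to the bottommost level by a later compaction picks up
// the new temperature.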
6588 TEST_F(DBTest2, BottommostTemperatureUniversal) {
6589 const int kTriggerNum = 3;
6590 const int kNumLevels = 5;
6591 const int kBottommostLevel = kNumLevels - 1;
6592 Options options = CurrentOptions();
6593 options.compaction_style = kCompactionStyleUniversal;
6594 options.level0_file_num_compaction_trigger = kTriggerNum;
6595 options.num_levels = kNumLevels;
6596
6597 DestroyAndReopen(options);
6598
6599 auto size = GetSstSizeHelper(Temperature::kUnknown);
6600 ASSERT_EQ(size, 0);
6601 size = GetSstSizeHelper(Temperature::kWarm);
6602 ASSERT_EQ(size, 0);
6603 size = GetSstSizeHelper(Temperature::kHot);
6604 ASSERT_EQ(size, 0);
6605 get_iostats_context()->Reset();
6606 IOStatsContext* iostats = get_iostats_context();
6607
6608 for (int i = 0; i < kTriggerNum; i++) {
6609 ASSERT_OK(Put("foo", "bar"));
6610 ASSERT_OK(Put("bar", "bar"));
6611 ASSERT_OK(Flush());
6612 }
6613 ASSERT_OK(dbfull()->TEST_WaitForCompact());
6614
6615 ColumnFamilyMetaData metadata;
6616 db_->GetColumnFamilyMetaData(&metadata);
6617 ASSERT_EQ(1, metadata.file_count);
6618 ASSERT_EQ(Temperature::kUnknown,
6619 metadata.levels[kBottommostLevel].files[0].temperature);
6620 size = GetSstSizeHelper(Temperature::kUnknown);
6621 ASSERT_GT(size, 0);
6622 size = GetSstSizeHelper(Temperature::kWarm);
6623 ASSERT_EQ(size, 0);
6624 ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
6625 ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
6626 ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
6627 ASSERT_EQ("bar", Get("foo"));
6628
6629 ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
6630 ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
6631 ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
6632 ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
6633 ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
6634 ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
6635
6636 ASSERT_OK(Put("foo", "bar"));
6637 ASSERT_OK(Put("bar", "bar"));
6638 ASSERT_OK(Flush());
6639 ASSERT_OK(dbfull()->TEST_WaitForCompact());
6640 db_->GetColumnFamilyMetaData(&metadata);
6641 ASSERT_EQ(2, metadata.file_count);
6642 ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
6643 size = GetSstSizeHelper(Temperature::kUnknown);
6644 ASSERT_GT(size, 0);
6645 size = GetSstSizeHelper(Temperature::kWarm);
6646 ASSERT_EQ(size, 0);
6647
6648 // Update bottommost temperature
6649 options.bottommost_temperature = Temperature::kWarm;
6650 Reopen(options);
6651 db_->GetColumnFamilyMetaData(&metadata);
6652 // Should not impact existing ones
6653 ASSERT_EQ(Temperature::kUnknown,
6654 metadata.levels[kBottommostLevel].files[0].temperature);
6655 size = GetSstSizeHelper(Temperature::kUnknown);
6656 ASSERT_GT(size, 0);
6657 size = GetSstSizeHelper(Temperature::kWarm);
6658 ASSERT_EQ(size, 0);
6659
6660 // Newly generated files should have the new setting.
6661 ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
6662 db_->GetColumnFamilyMetaData(&metadata);
6663 ASSERT_EQ(1, metadata.file_count);
6664 ASSERT_EQ(Temperature::kWarm,
6665 metadata.levels[kBottommostLevel].files[0].temperature);
6666 size = GetSstSizeHelper(Temperature::kUnknown);
6667 ASSERT_EQ(size, 0);
6668 size = GetSstSizeHelper(Temperature::kWarm);
6669 ASSERT_GT(size, 0);
6670
6671 // non-bottommost file still has unknown temperature
6672 ASSERT_OK(Put("foo", "bar"));
6673 ASSERT_OK(Put("bar", "bar"));
6674 ASSERT_OK(Flush());
6675 ASSERT_OK(dbfull()->TEST_WaitForCompact());
6676 db_->GetColumnFamilyMetaData(&metadata);
6677 ASSERT_EQ(2, metadata.file_count);
6678 ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
6679 size = GetSstSizeHelper(Temperature::kUnknown);
6680 ASSERT_GT(size, 0);
6681 size = GetSstSizeHelper(Temperature::kWarm);
6682 ASSERT_GT(size, 0);
6683
6684 // Check other non-existent temperatures.
6685 size = GetSstSizeHelper(Temperature::kHot);
6686 ASSERT_EQ(size, 0);
6687 size = GetSstSizeHelper(Temperature::kCold);
6688 ASSERT_EQ(size, 0);
6689 std::string prop;
6690 ASSERT_TRUE(dbfull()->GetProperty(
6691 DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
6692 &prop));
6693 ASSERT_EQ(std::atoi(prop.c_str()), 0);
6694 }
6695 #endif // ROCKSDB_LITE
6696
6697 // WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery.
6698 TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) {
6699 Options options = CurrentOptions();
6700 DestroyAndReopen(options);
6701 ASSERT_OK(Put("foo", "value0"));
6702 Close();
6703 SyncPoint::GetInstance()->DisableProcessing();
6704 SyncPoint::GetInstance()->ClearAllCallBacks();
6705 bool should_inject_error = false;
6706 SyncPoint::GetInstance()->SetCallBack(
6707 "DBImpl::RecoverLogFiles:BeforeReadWal",
6708 [&](void* /*arg*/) { should_inject_error = true; });
6709 SyncPoint::GetInstance()->SetCallBack(
6710 "LogReader::ReadMore:AfterReadFile", [&](void* arg) {
6711 if (should_inject_error) {
6712 ASSERT_NE(nullptr, arg);
6713 *reinterpret_cast<Status*>(arg) = Status::IOError("Injected IOError");
6714 }
6715 });
6716 SyncPoint::GetInstance()->EnableProcessing();
6717 options.avoid_flush_during_recovery = true;
6718 options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
6719 Status s = TryReopen(options);
6720 ASSERT_TRUE(s.IsIOError());
6721 }
6722
6723 TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) {
6724 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
6725 {{"DBImpl::BackgroundCallFlush:Start:1",
6726 "PointInTimeRecoveryWithSyncFailureInCFCreation:1"},
6727 {"PointInTimeRecoveryWithSyncFailureInCFCreation:2",
6728 "DBImpl::BackgroundCallFlush:Start:2"}});
6729 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
6730
6731 CreateColumnFamilies({"test1"}, Options());
6732 ASSERT_OK(Put("foo", "bar"));
6733
6734 // Create a CF while a flush is going on; the log is synced, but the
6735 // closed log file is not synced and becomes corrupted.
6736 port::Thread flush_thread([&]() { ASSERT_NOK(Flush()); });
6737 TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:1");
6738 CreateColumnFamilies({"test2"}, Options());
6739 env_->corrupt_in_sync_ = true;
6740 TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:2");
6741 flush_thread.join();
6742 env_->corrupt_in_sync_ = false;
6743 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
6744
6745 // Reopening the DB should not corrupt anything
6746 Options options = CurrentOptions();
6747 options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
6748 ReopenWithColumnFamilies({"default", "test1", "test2"}, options);
6749 }
6750
6751 TEST_F(DBTest2, RenameDirectory) {
6752 Options options = CurrentOptions();
6753 DestroyAndReopen(options);
6754 ASSERT_OK(Put("foo", "value0"));
6755 Close();
6756 auto old_dbname = dbname_;
6757 auto new_dbname = dbname_ + "_2";
6758 EXPECT_OK(env_->RenameFile(dbname_, new_dbname));
6759 options.create_if_missing = false;
6760 dbname_ = new_dbname;
6761 ASSERT_OK(TryReopen(options));
6762 ASSERT_EQ("value0", Get("foo"));
6763 Destroy(options);
6764 dbname_ = old_dbname;
6765 }
6766 } // namespace ROCKSDB_NAMESPACE
6767
6768 #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
6769 extern "C" {
6770 void RegisterCustomObjects(int argc, char** argv);
6771 }
6772 #else
6773 void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
6774 #endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
6775
6776 int main(int argc, char** argv) {
6777 ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
6778 ::testing::InitGoogleTest(&argc, argv);
6779 RegisterCustomObjects(argc, argv);
6780 return RUN_ALL_TESTS();
6781 }
6782