//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifdef GFLAGS
#ifdef NUMA
#include <numa.h>
#endif
#ifndef OS_WIN
#include <unistd.h>
#endif
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#ifdef __APPLE__
#include <mach/host_info.h>
#include <mach/mach_host.h>
#include <sys/sysctl.h>
#endif
#ifdef __FreeBSD__
#include <sys/sysctl.h>
#endif
#include <atomic>
#include <cinttypes>
#include <condition_variable>
#include <cstddef>
#include <memory>
#include <mutex>
#include <thread>
#include <unordered_map>

#include "db/db_impl/db_impl.h"
#include "db/malloc_stats.h"
#include "db/version_set.h"
#include "hdfs/env_hdfs.h"
#include "monitoring/histogram.h"
#include "monitoring/statistics.h"
#include "options/cf_options.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/options.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/persistent_cache.h"
#include "rocksdb/rate_limiter.h"
#include "rocksdb/secondary_cache.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/stats_history.h"
#include "rocksdb/utilities/object_registry.h"
#include "rocksdb/utilities/optimistic_transaction_db.h"
#include "rocksdb/utilities/options_type.h"
#include "rocksdb/utilities/options_util.h"
#include "rocksdb/utilities/sim_cache.h"
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"
#include "rocksdb/write_batch.h"
#include "test_util/testutil.h"
#include "test_util/transaction_test_util.h"
#include "tools/simulated_hybrid_file_system.h"
#include "util/cast_util.h"
#include "util/compression.h"
#include "util/crc32c.h"
#include "util/gflags_compat.h"
#include "util/mutexlock.h"
#include "util/random.h"
#include "util/stderr_logger.h"
#include "util/string_util.h"
#include "util/xxhash.h"
#include "utilities/blob_db/blob_db.h"
#include "utilities/merge_operators.h"
#include "utilities/merge_operators/bytesxor.h"
#include "utilities/merge_operators/sortlist.h"
#include "utilities/persistent_cache/block_cache_tier.h"

#ifdef MEMKIND
#include "memory/memkind_kmem_allocator.h"
#endif

#ifdef OS_WIN
#include <io.h>  // open/close
#endif

using GFLAGS_NAMESPACE::ParseCommandLineFlags;
using GFLAGS_NAMESPACE::RegisterFlagValidator;
using GFLAGS_NAMESPACE::SetUsageMessage;

DEFINE_string(
    benchmarks,
    "fillseq,"
    "fillseqdeterministic,"
    "fillsync,"
    "fillrandom,"
    "filluniquerandomdeterministic,"
    "overwrite,"
    "readrandom,"
    "newiterator,"
    "newiteratorwhilewriting,"
    "seekrandom,"
    "seekrandomwhilewriting,"
    "seekrandomwhilemerging,"
    "readseq,"
    "readreverse,"
    "compact,"
    "compactall,"
    "flush,"
#ifndef ROCKSDB_LITE
    "compact0,"
    "compact1,"
    "waitforcompaction,"
#endif
    "multireadrandom,"
    "mixgraph,"
    "readseq,"
    "readtorowcache,"
    "readtocache,"
    "readreverse,"
    "readwhilewriting,"
    "readwhilemerging,"
    "readwhilescanning,"
    "readrandomwriterandom,"
    "updaterandom,"
    "xorupdaterandom,"
    "approximatesizerandom,"
    "randomwithverify,"
    "fill100K,"
    "crc32c,"
    "xxhash,"
    "compress,"
    "uncompress,"
    "acquireload,"
    "fillseekseq,"
    "randomtransaction,"
    "randomreplacekeys,"
    "timeseries,"
    "getmergeoperands",

    "Comma-separated list of operations to run in the specified"
    " order. Available benchmarks:\n"
    "\tfillseq       -- write N values in sequential key"
    " order in async mode\n"
    "\tfillseqdeterministic       -- write N values in the specified"
    " key order and keep the shape of the LSM tree\n"
    "\tfillrandom    -- write N values in random key order in async"
    " mode\n"
    "\tfilluniquerandomdeterministic       -- write N values in a random"
    " key order and keep the shape of the LSM tree\n"
    "\toverwrite     -- overwrite N values in random key order in"
    " async mode\n"
    "\tfillsync      -- write N/1000 values in random key order in "
    "sync mode\n"
    "\tfill100K      -- write N/1000 100K values in random order in"
    " async mode\n"
    "\tdeleteseq     -- delete N keys in sequential order\n"
    "\tdeleterandom  -- delete N keys in random order\n"
    "\treadseq       -- read N times sequentially\n"
    "\treadtocache   -- 1 thread reading database sequentially\n"
    "\treadreverse   -- read N times in reverse order\n"
    "\treadrandom    -- read N times in random order\n"
    "\treadmissing   -- read N missing keys in random order\n"
    "\treadwhilewriting      -- 1 writer, N threads doing random "
    "reads\n"
    "\treadwhilemerging      -- 1 merger, N threads doing random "
    "reads\n"
    "\treadwhilescanning     -- 1 thread doing full table scan, "
    "N threads doing random reads\n"
    "\treadrandomwriterandom -- N threads doing random-read, "
    "random-write\n"
    "\tupdaterandom  -- N threads doing read-modify-write for random "
    "keys\n"
    "\txorupdaterandom  -- N threads doing read-XOR-write for "
    "random keys\n"
    "\tappendrandom  -- N threads doing read-modify-write with "
    "growing values\n"
    "\tmergerandom   -- same as updaterandom/appendrandom using merge"
    " operator. "
    "Must be used with merge_operator\n"
    "\treadrandommergerandom -- perform N random read-or-merge "
    "operations. Must be used with merge_operator\n"
    "\tnewiterator   -- repeated iterator creation\n"
    "\tseekrandom    -- N random seeks, call Next seek_nexts times "
    "per seek\n"
    "\tseekrandomwhilewriting -- seekrandom and 1 thread doing "
    "overwrite\n"
    "\tseekrandomwhilemerging -- seekrandom and 1 thread doing "
    "merge\n"
    "\tcrc32c        -- repeated crc32c of 4K of data\n"
    "\txxhash        -- repeated xxHash of 4K of data\n"
    "\tacquireload   -- load N*1000 times\n"
    "\tfillseekseq   -- write N values in sequential key order, then read "
    "them by seeking to each key\n"
    "\trandomtransaction     -- execute N random transactions and "
    "verify correctness\n"
    "\trandomreplacekeys     -- randomly replaces N keys by deleting "
    "the old version and putting the new version\n\n"
    "\ttimeseries            -- 1 writer generates time series data "
    "and multiple readers doing random reads on id\n\n"
    "Meta operations:\n"
    "\tcompact     -- Compact the entire DB; if multiple, randomly choose one\n"
    "\tcompactall  -- Compact the entire DB\n"
#ifndef ROCKSDB_LITE
    "\tcompact0  -- compact L0 into L1\n"
    "\tcompact1  -- compact L1 into L2\n"
    "\twaitforcompaction -- pause until compaction is (probably) done\n"
#endif
    "\tflush -- flush the memtable\n"
    "\tstats       -- Print DB stats\n"
    "\tresetstats  -- Reset DB stats\n"
    "\tlevelstats  -- Print the number of files and bytes per level\n"
    "\tmemstats    -- Print memtable stats\n"
    "\tsstables    -- Print sstable info\n"
    "\theapprofile -- Dump a heap profile (if supported by this port)\n"
    "\treplay      -- replay the trace file specified with trace_file\n"
    "\tgetmergeoperands -- Insert lots of merge records, each a list of "
    "sorted ints for a key, then compare the performance of looking up a key "
    "by doing a Get followed by binary searching the large sorted list vs "
    "doing a GetMergeOperands and binary searching the operands, which are "
    "sorted sub-lists. The MergeOperator used is sortlist.h\n");

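// A minimal illustrative invocation (the flag values here are examples only,
// not recommendations):
//
//   ./db_bench --benchmarks=fillseq,readrandom --num=1000000
//
// Benchmarks run in the order listed, so a fill benchmark typically comes
// first to populate the database for the benchmarks that follow.
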
DEFINE_int64(num, 1000000, "Number of key/values to place in database");

DEFINE_int64(numdistinct, 1000,
             "Number of distinct keys to use. Used in RandomWithVerify to "
             "read/write on fewer keys so that gets are more likely to find the"
             " key and puts are more likely to update the same key");

DEFINE_int64(merge_keys, -1,
             "Number of distinct keys to use for MergeRandom and "
             "ReadRandomMergeRandom. "
             "If negative, there will be FLAGS_num keys.");
DEFINE_int32(num_column_families, 1, "Number of Column Families to use.");

DEFINE_int32(
    num_hot_column_families, 0,
    "Number of Hot Column Families. If more than 0, only write to this "
    "number of column families. After finishing all the writes to them, "
    "create a new set of column families and insert to them. Only used "
    "when num_column_families > 1.");

DEFINE_string(column_family_distribution, "",
              "Comma-separated list of percentages, where the ith element "
              "indicates the probability of an op using the ith column family. "
              "The number of elements must be `num_hot_column_families` if "
              "specified; otherwise, it must be `num_column_families`. The "
              "sum of elements must be 100. E.g., if `num_column_families=4`, "
              "and `num_hot_column_families=0`, a valid list could be "
              "\"10,20,30,40\".");

DEFINE_int64(reads, -1, "Number of read operations to do.  "
             "If negative, do FLAGS_num reads.");

DEFINE_int64(deletes, -1, "Number of delete operations to do.  "
             "If negative, do FLAGS_num deletions.");

DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");

DEFINE_int64(seed, 0, "Seed base for random number generators. "
             "When 0 it is deterministic.");

DEFINE_int32(threads, 1, "Number of concurrent threads to run.");

DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run."
             " When 0 then num & reads determine the test duration");

DEFINE_string(value_size_distribution_type, "fixed",
              "Value size distribution type: fixed, uniform, normal");

DEFINE_int32(value_size, 100, "Size of each value in fixed distribution");
static unsigned int value_size = 100;

DEFINE_int32(value_size_min, 100, "Min size of random value");

DEFINE_int32(value_size_max, 102400, "Max size of random value");

DEFINE_int32(seek_nexts, 0,
             "How many times to call Next() after Seek() in "
             "fillseekseq, seekrandom, seekrandomwhilewriting and "
             "seekrandomwhilemerging");

DEFINE_bool(reverse_iterator, false,
            "When true use Prev rather than Next for iterators that do "
            "Seek and then Next");

DEFINE_int64(max_scan_distance, 0,
             "Used to define iterate_upper_bound (or iterate_lower_bound "
             "if FLAGS_reverse_iterator is set to true) when value is nonzero");

DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");

DEFINE_int64(batch_size, 1, "Batch size");

static bool ValidateKeySize(const char* /*flagname*/, int32_t /*value*/) {
  return true;
}

static bool ValidateUint32Range(const char* flagname, uint64_t value) {
  if (value > std::numeric_limits<uint32_t>::max()) {
    fprintf(stderr, "Invalid value for --%s: %lu, overflow\n", flagname,
            (unsigned long)value);
    return false;
  }
  return true;
}
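
// Validators such as the two above take effect only when attached to a flag
// via gflags' RegisterFlagValidator(); see the FLAGS_subcompactions_dummy
// registration further below for the pattern used in this file.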

DEFINE_int32(key_size, 16, "size of each key");

DEFINE_int32(user_timestamp_size, 0,
             "number of bytes in a user-defined timestamp");

DEFINE_int32(num_multi_db, 0,
             "Number of DBs used in the benchmark. 0 means single DB.");

DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink"
              " to this fraction of their original size after compression");

DEFINE_double(read_random_exp_range, 0.0,
              "Read random's key will be generated using distribution of "
              "num * exp(-r) where r is uniform number from 0 to this value. "
              "The larger the number is, the more skewed the reads are. "
              "Only used in readrandom and multireadrandom benchmarks.");

DEFINE_bool(histogram, false, "Print histogram of operation timings");

DEFINE_bool(enable_numa, false,
            "Make operations aware of NUMA architecture and bind memory "
            "and cpus corresponding to nodes together. In NUMA, memory "
            "in the same node as the CPUs is closer than memory in "
            "other nodes. Reads can be faster when the process is bound to "
            "CPU and memory of the same node. Use \"$numactl --hardware\" "
            "command to see the NUMA memory architecture.");

DEFINE_int64(db_write_buffer_size,
             ROCKSDB_NAMESPACE::Options().db_write_buffer_size,
             "Number of bytes to buffer in all memtables before compacting");

DEFINE_bool(cost_write_buffer_to_cache, false,
            "The usage of memtable is costed to the block cache");

DEFINE_int64(arena_block_size, ROCKSDB_NAMESPACE::Options().arena_block_size,
             "The size, in bytes, of one block in arena memory allocation.");

DEFINE_int64(write_buffer_size, ROCKSDB_NAMESPACE::Options().write_buffer_size,
             "Number of bytes to buffer in memtable before compacting");

DEFINE_int32(max_write_buffer_number,
             ROCKSDB_NAMESPACE::Options().max_write_buffer_number,
             "The number of in-memory memtables. Each memtable is of size"
             " write_buffer_size bytes.");

DEFINE_int32(min_write_buffer_number_to_merge,
             ROCKSDB_NAMESPACE::Options().min_write_buffer_number_to_merge,
             "The minimum number of write buffers that will be merged together "
             "before writing to storage. This is cheap because it is an "
             "in-memory merge. If this feature is not enabled, then all these "
             "write buffers are flushed to L0 as separate files, and this "
             "increases read amplification because a get request has to check "
             "all of these files. Also, an in-memory merge may result in "
             "writing less data to storage if there are duplicate records "
             "in each of these individual write buffers.");

DEFINE_int32(max_write_buffer_number_to_maintain,
             ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain,
             "The total maximum number of write buffers to maintain in memory "
             "including copies of buffers that have already been flushed. "
             "Unlike max_write_buffer_number, this parameter does not affect "
             "flushing. This controls the minimum amount of write history "
             "that will be available in memory for conflict checking when "
             "Transactions are used. If this value is too low, some "
             "transactions may fail at commit time due to not being able to "
             "determine whether there were any write conflicts. Setting this "
             "value to 0 will cause write buffers to be freed immediately "
             "after they are flushed.  If this value is set to -1, "
             "'max_write_buffer_number' will be used.");

DEFINE_int64(max_write_buffer_size_to_maintain,
             ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain,
             "The total maximum size of write buffers to maintain in memory "
             "including copies of buffers that have already been flushed. "
             "Unlike max_write_buffer_number, this parameter does not affect "
             "flushing. This controls the minimum amount of write history "
             "that will be available in memory for conflict checking when "
             "Transactions are used. If this value is too low, some "
             "transactions may fail at commit time due to not being able to "
             "determine whether there were any write conflicts. Setting this "
             "value to 0 will cause write buffers to be freed immediately "
             "after they are flushed.  If this value is set to -1, "
             "'max_write_buffer_number' will be used.");

DEFINE_int32(max_background_jobs,
             ROCKSDB_NAMESPACE::Options().max_background_jobs,
             "The maximum number of concurrent background jobs that can occur "
             "in parallel.");

DEFINE_int32(num_bottom_pri_threads, 0,
             "The number of threads in the bottom-priority thread pool (used "
             "by universal compaction only).");

DEFINE_int32(num_high_pri_threads, 0,
             "The number of threads in the high-priority thread pool (used "
             "mainly for flushes).");

DEFINE_int32(num_low_pri_threads, 0,
             "The number of threads in the low-priority thread pool (used "
             "mainly for compactions).");

DEFINE_int32(max_background_compactions,
             ROCKSDB_NAMESPACE::Options().max_background_compactions,
             "The maximum number of concurrent background compactions"
             " that can occur in parallel.");

DEFINE_int32(base_background_compactions, -1, "DEPRECATED");

DEFINE_uint64(subcompactions, 1,
              "Maximum number of subcompactions to divide L0-L1 compactions "
              "into.");
static const bool FLAGS_subcompactions_dummy
    __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_subcompactions,
                                                        &ValidateUint32Range);

DEFINE_int32(max_background_flushes,
             ROCKSDB_NAMESPACE::Options().max_background_flushes,
             "The maximum number of concurrent background flushes"
             " that can occur in parallel.");

static ROCKSDB_NAMESPACE::CompactionStyle FLAGS_compaction_style_e;
DEFINE_int32(compaction_style,
             (int32_t)ROCKSDB_NAMESPACE::Options().compaction_style,
             "style of compaction: level-based, universal and fifo");

static ROCKSDB_NAMESPACE::CompactionPri FLAGS_compaction_pri_e;
DEFINE_int32(compaction_pri,
             (int32_t)ROCKSDB_NAMESPACE::Options().compaction_pri,
             "priority of files to compact: by size or by data age");

DEFINE_int32(universal_size_ratio, 0,
             "Percentage flexibility while comparing file size"
             " (for universal compaction only).");

DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files in a"
             " single compaction run (for universal compaction only).");

DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
             " in universal style compaction");

DEFINE_int32(universal_max_size_amplification_percent, 0,
             "The max size amplification for universal style compaction");

DEFINE_int32(universal_compression_size_percent, -1,
             "The percentage of the database to compress for universal "
             "compaction. -1 means compress everything.");

DEFINE_bool(universal_allow_trivial_move, false,
            "Allow trivial move in universal compaction.");

DEFINE_int64(cache_size, 8 << 20,  // 8MB
             "Number of bytes to use as a cache of uncompressed data");

DEFINE_int32(cache_numshardbits, 6,
             "Number of shards for the block cache"
             " is 2 ** cache_numshardbits. Negative means use default settings."
             " This is applied only if FLAGS_cache_size is non-negative.");

DEFINE_double(cache_high_pri_pool_ratio, 0.0,
              "Ratio of block cache reserved for high-pri blocks. "
              "If > 0.0, we also enable "
              "cache_index_and_filter_blocks_with_high_priority.");

DEFINE_bool(use_clock_cache, false,
            "Replace default LRU block cache with clock cache.");

DEFINE_int64(simcache_size, -1,
             "Number of bytes to use as a simcache of "
             "uncompressed data. Negative value disables simcache.");

DEFINE_bool(cache_index_and_filter_blocks, false,
            "Cache index/filter blocks in block cache.");

DEFINE_bool(use_cache_memkind_kmem_allocator, false,
            "Use memkind kmem allocator for block cache.");

DEFINE_bool(partition_index_and_filters, false,
            "Partition index and filter blocks.");

DEFINE_bool(partition_index, false, "Partition index blocks");

DEFINE_bool(index_with_first_key, false, "Include first key in the index");

DEFINE_bool(
    optimize_filters_for_memory,
    ROCKSDB_NAMESPACE::BlockBasedTableOptions().optimize_filters_for_memory,
    "Minimize memory footprint of filters");

DEFINE_int64(
    index_shortening_mode, 2,
    "mode to shorten index: 0 for no shortening; 1 for shortening "
    "separators only; 2 for shortening separators and successor");

DEFINE_int64(metadata_block_size,
             ROCKSDB_NAMESPACE::BlockBasedTableOptions().metadata_block_size,
             "Max partition size when partitioning index/filters");

// The default amortizes the overhead of reading the clock across many ops,
// which suits flash. With HDD, which offers much lower throughput, this is
// better set to 1.
DEFINE_int32(ops_between_duration_checks, 1000,
             "Check duration limit every x ops");

DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false,
            "Pin index/filter blocks of L0 files in block cache.");

DEFINE_bool(
    pin_top_level_index_and_filter, false,
    "Pin top-level index of partitioned index/filter blocks in block cache.");

DEFINE_int32(block_size,
             static_cast<int32_t>(
                 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_size),
             "Number of bytes in a block.");

DEFINE_int32(format_version,
             static_cast<int32_t>(
                 ROCKSDB_NAMESPACE::BlockBasedTableOptions().format_version),
             "Format version of SST files.");

DEFINE_int32(block_restart_interval,
             ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_restart_interval,
             "Number of keys between restart points "
             "for delta encoding of keys in data block.");

DEFINE_int32(
    index_block_restart_interval,
    ROCKSDB_NAMESPACE::BlockBasedTableOptions().index_block_restart_interval,
    "Number of keys between restart points "
    "for delta encoding of keys in index block.");

DEFINE_int32(read_amp_bytes_per_bit,
             ROCKSDB_NAMESPACE::BlockBasedTableOptions().read_amp_bytes_per_bit,
             "Number of bytes per bit to be used in block read-amp bitmap");

DEFINE_bool(
    enable_index_compression,
    ROCKSDB_NAMESPACE::BlockBasedTableOptions().enable_index_compression,
    "Compress the index block");

DEFINE_bool(block_align,
            ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align,
            "Align data blocks on page size");

DEFINE_bool(use_data_block_hash_index, false,
            "If true, use kDataBlockBinaryAndHash "
            "instead of kDataBlockBinarySearch. "
            "This is valid only if we use BlockTable");

DEFINE_double(data_block_hash_table_util_ratio, 0.75,
              "util ratio for data block hash index table. "
              "This is only valid if use_data_block_hash_index is "
              "set to true");

DEFINE_int64(compressed_cache_size, -1,
             "Number of bytes to use as a cache of compressed data.");

DEFINE_int64(row_cache_size, 0,
             "Number of bytes to use as a cache of individual rows"
             " (0 = disabled).");

DEFINE_int32(open_files, ROCKSDB_NAMESPACE::Options().max_open_files,
             "Maximum number of files to keep open at the same time"
             " (use default if == 0)");

DEFINE_int32(file_opening_threads,
             ROCKSDB_NAMESPACE::Options().max_file_opening_threads,
             "If open_files is set to -1, this option sets the number of "
             "threads that will be used to open files during DB::Open()");

DEFINE_bool(new_table_reader_for_compaction_inputs, true,
            "If true, uses a separate file handle for compaction inputs");

DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");

DEFINE_int32(log_readahead_size, 0, "WAL and manifest readahead size");

DEFINE_int32(random_access_max_buffer_size, 1024 * 1024,
             "Maximum Windows random access buffer size");

DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024,
             "Maximum write buffer for Writable File");

DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means"
             " use default settings.");

DEFINE_bool(use_ribbon_filter, false, "Use Ribbon instead of Bloom filter");

DEFINE_double(memtable_bloom_size_ratio, 0,
              "Ratio of memtable size used for bloom filter. 0 means no bloom "
              "filter.");
DEFINE_bool(memtable_whole_key_filtering, false,
            "Try to use whole key bloom filter in memtables.");
DEFINE_bool(memtable_use_huge_page, false,
            "Try to use huge page in memtables.");

DEFINE_bool(use_existing_db, false, "If true, do not destroy the existing"
            " database.  If you set this flag and also specify a benchmark that"
            " wants a fresh database, that benchmark will fail.");

DEFINE_bool(use_existing_keys, false,
            "If true, uses existing keys in the DB, "
            "rather than generating new ones. This involves some startup "
            "latency to load all keys into memory. It is supported for the "
            "same read/overwrite benchmarks as `-use_existing_db=true`, which "
            "must also be set for this flag to be enabled. When this flag is "
            "set, the value for `-num` will be ignored.");

DEFINE_bool(show_table_properties, false,
            "If true, then per-level table"
            " properties will be printed on every stats-interval when"
            " stats_interval is set and stats_per_interval is on.");

DEFINE_string(db, "", "Use the db with the following name.");

// Read cache flags

DEFINE_string(read_cache_path, "",
              "If not empty string, a read cache will be used in this path");

DEFINE_int64(read_cache_size, 4LL * 1024 * 1024 * 1024,
             "Maximum size of the read cache");

DEFINE_bool(read_cache_direct_write, true,
            "Whether to use Direct IO for writing to the read cache");

DEFINE_bool(read_cache_direct_read, true,
            "Whether to use Direct IO for reading from read cache");

DEFINE_bool(use_keep_filter, false, "Whether to use a noop compaction filter");

static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
  if (value >= 20) {
    fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n",
            flagname, value);
    return false;
  }
  return true;
}

DEFINE_bool(verify_checksum, true,
            "Verify checksum for every block read"
            " from storage");

DEFINE_bool(statistics, false, "Database statistics");
DEFINE_int32(stats_level, ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers,
             "stats level for statistics");
DEFINE_string(statistics_string, "", "Serialized statistics string");
static class std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats;

DEFINE_int64(writes, -1, "Number of write operations to do. If negative, do"
             " --num writes.");

DEFINE_bool(finish_after_writes, false,
            "Write thread terminates after all writes are finished");

DEFINE_bool(sync, false, "Sync all writes to disk");

DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");

DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");

DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");

DEFINE_string(truth_db, "/dev/shm/truth_db/dbbench",
              "Truth key/values used when using verify");

DEFINE_int32(num_levels, 7, "The total number of levels");

DEFINE_int64(target_file_size_base,
             ROCKSDB_NAMESPACE::Options().target_file_size_base,
             "Target file size at level-1");

DEFINE_int32(target_file_size_multiplier,
             ROCKSDB_NAMESPACE::Options().target_file_size_multiplier,
             "A multiplier to compute target level-N file size (N >= 2)");

DEFINE_uint64(max_bytes_for_level_base,
              ROCKSDB_NAMESPACE::Options().max_bytes_for_level_base,
              "Max bytes for level-1");

DEFINE_bool(level_compaction_dynamic_level_bytes, false,
            "Whether level size base is dynamic");

DEFINE_double(max_bytes_for_level_multiplier, 10,
              "A multiplier to compute max bytes for level-N (N >= 2)");

static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v;
DEFINE_string(max_bytes_for_level_multiplier_additional, "",
              "A vector that specifies additional fanout per level");

DEFINE_int32(level0_stop_writes_trigger,
             ROCKSDB_NAMESPACE::Options().level0_stop_writes_trigger,
             "Number of files in level-0"
             " that will trigger put stop.");

DEFINE_int32(level0_slowdown_writes_trigger,
             ROCKSDB_NAMESPACE::Options().level0_slowdown_writes_trigger,
             "Number of files in level-0"
             " that will slow down writes.");

DEFINE_int32(level0_file_num_compaction_trigger,
             ROCKSDB_NAMESPACE::Options().level0_file_num_compaction_trigger,
             "Number of files in level-0"
             " when compactions start");

DEFINE_uint64(periodic_compaction_seconds,
              ROCKSDB_NAMESPACE::Options().periodic_compaction_seconds,
              "Files older than this will be picked up for compaction and"
              " rewritten to the same level");

static bool ValidateInt32Percent(const char* flagname, int32_t value) {
  if (value <= 0 || value >= 100) {
    fprintf(stderr, "Invalid value for --%s: %d, must be 0 < pct < 100\n",
            flagname, value);
    return false;
  }
  return true;
}
DEFINE_int32(readwritepercent, 90, "Ratio of reads to reads/writes (expressed"
             " as percentage) for the ReadRandomWriteRandom workload. The "
             "default value of 90 means 90% of all read and write operations"
             " are reads. In other words, 9 gets for every 1 put.");

DEFINE_int32(mergereadpercent, 70, "Ratio of merges to merges&reads (expressed"
             " as percentage) for the ReadRandomMergeRandom workload. The"
             " default value of 70 means 70% of all read and merge operations"
             " are merges. In other words, 7 merges for every 3 gets.");

DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/"
             "deletes (used in RandomWithVerify only). RandomWithVerify "
             "calculates writepercent as (100 - FLAGS_readwritepercent - "
             "deletepercent), so deletepercent must be smaller than (100 - "
             "FLAGS_readwritepercent)");
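
// With the defaults above (readwritepercent=90, deletepercent=2),
// RandomWithVerify therefore issues roughly 90% gets, 8% puts, and 2%
// deletes, since writepercent = 100 - readwritepercent - deletepercent.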

DEFINE_bool(optimize_filters_for_hits, false,
            "Optimizes bloom filters for workloads where most lookups return "
            "a value. For now this doesn't create bloom filters for the max "
            "level of the LSM, to reduce metadata that should fit in RAM.");

DEFINE_uint64(delete_obsolete_files_period_micros, 0,
              "Ignored. Left here for backward compatibility");

DEFINE_int64(writes_before_delete_range, 0,
             "Number of writes before DeleteRange is called regularly.");

DEFINE_int64(writes_per_range_tombstone, 0,
             "Number of writes between range tombstones");

DEFINE_int64(range_tombstone_width, 100, "Number of keys in tombstone's range");

DEFINE_int64(max_num_range_tombstones, 0,
             "Maximum number of range tombstones "
             "to insert.");

DEFINE_bool(expand_range_tombstones, false,
            "Expand range tombstone into sequential regular tombstones.");

#ifndef ROCKSDB_LITE
// Transactions Options
DEFINE_bool(optimistic_transaction_db, false,
            "Open an OptimisticTransactionDB instance. "
            "Required for randomtransaction benchmark.");

DEFINE_bool(transaction_db, false,
            "Open a TransactionDB instance. "
            "Required for randomtransaction benchmark.");

DEFINE_uint64(transaction_sets, 2,
              "Number of keys each transaction will "
              "modify (used in RandomTransaction only).  Max: 9999");

DEFINE_bool(transaction_set_snapshot, false,
            "Setting to true will have each transaction call SetSnapshot()"
            " upon creation.");

DEFINE_int32(transaction_sleep, 0,
             "Max microseconds to sleep in between "
             "reading and writing a value (used in RandomTransaction only).");

DEFINE_uint64(transaction_lock_timeout, 100,
              "If using a transaction_db, specifies the lock wait timeout in"
              " milliseconds before failing a transaction waiting on a lock");
DEFINE_string(
    options_file, "",
    "The path to a RocksDB options file.  If specified, then db_bench will "
    "run with the RocksDB options in the default column family of the "
    "specified options file. "
    "Note that with this setting, db_bench will ONLY accept the following "
    "RocksDB options related command-line arguments; all other arguments "
    "that are related to RocksDB options will be ignored:\n"
    "\t--use_existing_db\n"
    "\t--use_existing_keys\n"
    "\t--statistics\n"
    "\t--row_cache_size\n"
    "\t--row_cache_numshardbits\n"
    "\t--enable_io_prio\n"
    "\t--dump_malloc_stats\n"
    "\t--num_multi_db\n");

// FIFO Compaction Options
DEFINE_uint64(fifo_compaction_max_table_files_size_mb, 0,
              "The limit of total table file sizes to trigger FIFO compaction");

DEFINE_bool(fifo_compaction_allow_compaction, true,
            "Allow compaction in FIFO compaction.");

DEFINE_uint64(fifo_compaction_ttl, 0, "TTL for the SST Files in seconds.");

// Stacked BlobDB Options
DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Open a BlobDB instance.");

DEFINE_bool(
    blob_db_enable_gc,
    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection,
    "[Stacked BlobDB] Enable BlobDB garbage collection.");

DEFINE_double(
    blob_db_gc_cutoff,
    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff,
    "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection.");

DEFINE_bool(blob_db_is_fifo,
            ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().is_fifo,
            "[Stacked BlobDB] Enable FIFO eviction strategy in BlobDB.");

DEFINE_uint64(blob_db_max_db_size,
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().max_db_size,
              "[Stacked BlobDB] Max size limit of the directory where blob "
              "files are stored.");

DEFINE_uint64(blob_db_max_ttl_range, 0,
              "[Stacked BlobDB] TTL range to generate BlobDB data (in "
              "seconds). 0 means no TTL.");

DEFINE_uint64(
    blob_db_ttl_range_secs,
    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().ttl_range_secs,
    "[Stacked BlobDB] TTL bucket size to use when creating blob files.");

DEFINE_uint64(
    blob_db_min_blob_size,
    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size,
    "[Stacked BlobDB] Smallest blob to store in a file. Blobs "
    "smaller than this will be inlined with the key in the LSM tree.");

DEFINE_uint64(blob_db_bytes_per_sync,
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
              "[Stacked BlobDB] Bytes to sync blob file at.");

DEFINE_uint64(blob_db_file_size,
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size,
              "[Stacked BlobDB] Target size of each blob file.");

DEFINE_string(
    blob_db_compression_type, "snappy",
    "[Stacked BlobDB] Algorithm to use to compress blobs in blob files.");
static enum ROCKSDB_NAMESPACE::CompressionType
    FLAGS_blob_db_compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression;

#endif  // ROCKSDB_LITE

// Integrated BlobDB options
DEFINE_bool(
    enable_blob_files,
    ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().enable_blob_files,
    "[Integrated BlobDB] Enable writing large values to separate blob files.");

DEFINE_uint64(min_blob_size,
              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().min_blob_size,
              "[Integrated BlobDB] The size of the smallest value to be stored "
              "separately in a blob file.");

DEFINE_uint64(blob_file_size,
              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_size,
              "[Integrated BlobDB] The size limit for blob files.");

DEFINE_string(blob_compression_type, "none",
              "[Integrated BlobDB] The compression algorithm to use for large "
              "values stored in blob files.");

DEFINE_bool(enable_blob_garbage_collection,
            ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
                .enable_blob_garbage_collection,
            "[Integrated BlobDB] Enable blob garbage collection.");

DEFINE_double(blob_garbage_collection_age_cutoff,
              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
                  .blob_garbage_collection_age_cutoff,
              "[Integrated BlobDB] The cutoff in terms of blob file age for "
              "garbage collection.");

#ifndef ROCKSDB_LITE

// Secondary DB instance Options
DEFINE_bool(use_secondary_db, false,
            "Open a RocksDB secondary instance. A primary instance can be "
            "running in another db_bench process.");

DEFINE_string(secondary_path, "",
              "Path to a directory used by the secondary instance to store "
              "private files, e.g. info log.");

DEFINE_int32(secondary_update_interval, 5,
             "Secondary instance attempts to catch up with the primary every "
             "secondary_update_interval seconds.");

#endif  // ROCKSDB_LITE

DEFINE_bool(report_bg_io_stats, false,
            "Measure time spent on I/O during compactions.");

DEFINE_bool(use_stderr_info_logger, false,
            "Write info logs to stderr instead of to the LOG file.");

DEFINE_string(trace_file, "", "Trace workload to a file.");

DEFINE_int32(trace_replay_fast_forward, 1,
             "Fast forward trace replay, must be >= 1.");
DEFINE_int32(block_cache_trace_sampling_frequency, 1,
             "Block cache trace sampling frequency, termed s. It uses spatial "
             "downsampling and samples accesses to one out of s blocks.");
DEFINE_int64(
    block_cache_trace_max_trace_file_size_in_bytes,
    uint64_t{64} * 1024 * 1024 * 1024,
    "The maximum block cache trace file size in bytes. Block cache accesses "
    "will not be logged if the trace file size exceeds this threshold. Default "
    "is 64 GB.");
DEFINE_string(block_cache_trace_file, "", "Block cache trace file path.");
DEFINE_int32(trace_replay_threads, 1,
             "The number of threads to replay, must be >= 1.");

static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
    const char* ctype) {
  assert(ctype);

  if (!strcasecmp(ctype, "none"))
    return ROCKSDB_NAMESPACE::kNoCompression;
  else if (!strcasecmp(ctype, "snappy"))
    return ROCKSDB_NAMESPACE::kSnappyCompression;
  else if (!strcasecmp(ctype, "zlib"))
    return ROCKSDB_NAMESPACE::kZlibCompression;
  else if (!strcasecmp(ctype, "bzip2"))
    return ROCKSDB_NAMESPACE::kBZip2Compression;
  else if (!strcasecmp(ctype, "lz4"))
    return ROCKSDB_NAMESPACE::kLZ4Compression;
  else if (!strcasecmp(ctype, "lz4hc"))
    return ROCKSDB_NAMESPACE::kLZ4HCCompression;
  else if (!strcasecmp(ctype, "xpress"))
    return ROCKSDB_NAMESPACE::kXpressCompression;
  else if (!strcasecmp(ctype, "zstd"))
    return ROCKSDB_NAMESPACE::kZSTD;

  fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
  return ROCKSDB_NAMESPACE::kSnappyCompression;  // default value
}
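
// Note that an unrecognized name is not a hard error: the function above
// logs a message and falls back to snappy, so a mistyped --compression_type
// value runs the benchmark with snappy compression.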

static std::string ColumnFamilyName(size_t i) {
  if (i == 0) {
    return ROCKSDB_NAMESPACE::kDefaultColumnFamilyName;
  } else {
    char name[100];
    snprintf(name, sizeof(name), "column_family_name_%06zu", i);
    return std::string(name);
  }
}
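
// For example, ColumnFamilyName(0) yields the default column family name
// ("default"), while ColumnFamilyName(3) yields "column_family_name_000003".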

DEFINE_string(compression_type, "snappy",
              "Algorithm to use to compress the database");
static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_compression_type_e =
    ROCKSDB_NAMESPACE::kSnappyCompression;

DEFINE_int64(sample_for_compression, 0, "Sample every N blocks for compression");

DEFINE_int32(compression_level, ROCKSDB_NAMESPACE::CompressionOptions().level,
             "Compression level. The meaning of this value is library-"
             "dependent. If unset, we try to use the default for the library "
             "specified in `--compression_type`");

DEFINE_int32(compression_max_dict_bytes,
             ROCKSDB_NAMESPACE::CompressionOptions().max_dict_bytes,
             "Maximum size of dictionary used to prime the compression "
             "library.");

DEFINE_int32(compression_zstd_max_train_bytes,
             ROCKSDB_NAMESPACE::CompressionOptions().zstd_max_train_bytes,
             "Maximum size of training data passed to zstd's dictionary "
             "trainer.");

DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts"
             " from this level. Levels with number < min_level_to_compress are"
             " not compressed. Otherwise, apply compression_type to "
             "all levels.");

DEFINE_int32(compression_parallel_threads, 1,
             "Number of threads for parallel compression.");

DEFINE_uint64(compression_max_dict_buffer_bytes,
              ROCKSDB_NAMESPACE::CompressionOptions().max_dict_buffer_bytes,
              "Maximum bytes to buffer to collect samples for dictionary.");

static bool ValidateTableCacheNumshardbits(const char* flagname,
                                           int32_t value) {
  if (0 >= value || value >= 20) {
    fprintf(stderr, "Invalid value for --%s: %d, must be 0 < val < 20\n",
            flagname, value);
    return false;
  }
  return true;
}
DEFINE_int32(table_cache_numshardbits, 4, "");

#ifndef ROCKSDB_LITE
DEFINE_string(env_uri, "",
              "URI for registry Env lookup. Mutually exclusive"
              " with --hdfs and --fs_uri");
DEFINE_string(fs_uri, "",
              "URI for registry Filesystem lookup. Mutually exclusive"
              " with --hdfs and --env_uri."
              " Creates a default environment with the specified filesystem.");
#endif  // ROCKSDB_LITE
DEFINE_string(hdfs, "",
              "Name of hdfs environment. Mutually exclusive with"
              " --env_uri and --fs_uri");
DEFINE_string(simulate_hybrid_fs_file, "",
              "File in which to store metadata for the simulated hybrid FS. "
              "Empty means the feature is disabled. Currently, if it is set, "
              "bottommost_temperature is set to kWarm.");

static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;

static ROCKSDB_NAMESPACE::Env* FLAGS_env = ROCKSDB_NAMESPACE::Env::Default();

DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when "
             "this is greater than zero. When 0 the interval grows over time.");

DEFINE_int64(stats_interval_seconds, 0, "Report stats every N seconds. This "
             "overrides stats_interval when both are > 0.");

DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when"
             " this is greater than 0.");

DEFINE_int64(report_interval_seconds, 0,
             "If greater than zero, it will write simple stats in CSV format "
             "to --report_file every N seconds");

DEFINE_string(report_file, "report.csv",
              "Filename where some simple stats are reported to (if "
              "--report_interval_seconds is bigger than 0)");

DEFINE_int32(thread_status_per_interval, 0,
             "Takes and reports a snapshot of the current status of each thread"
             " when this is greater than 0.");

DEFINE_int32(perf_level, ROCKSDB_NAMESPACE::PerfLevel::kDisable,
             "Level of perf collection");

#ifndef ROCKSDB_LITE
static ROCKSDB_NAMESPACE::Env* GetCompositeEnv(
    std::shared_ptr<ROCKSDB_NAMESPACE::FileSystem> fs) {
  static std::shared_ptr<ROCKSDB_NAMESPACE::Env> composite_env =
      ROCKSDB_NAMESPACE::NewCompositeEnv(fs);
  return composite_env.get();
}
#endif
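
// Because the composite Env above lives in a function-local static, the
// FileSystem passed on the first call wins for the lifetime of the process;
// later calls return the same composite Env regardless of their argument.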

static bool ValidateRateLimit(const char* flagname, double value) {
  const double EPSILON = 1e-10;
  if (value < -EPSILON) {
    fprintf(stderr, "Invalid value for --%s: %12.6f, must be >= 0.0\n",
            flagname, value);
    return false;
  }
  return true;
}
DEFINE_double(soft_rate_limit, 0.0, "DEPRECATED");

DEFINE_double(hard_rate_limit, 0.0, "DEPRECATED");

DEFINE_uint64(soft_pending_compaction_bytes_limit, 64ull * 1024 * 1024 * 1024,
              "Slowdown writes if pending compaction bytes exceed this number");

DEFINE_uint64(hard_pending_compaction_bytes_limit, 128ull * 1024 * 1024 * 1024,
              "Stop writes if pending compaction bytes exceed this number");

DEFINE_uint64(delayed_write_rate, 8388608u,
              "Limited bytes allowed to DB when soft_rate_limit or "
              "level0_slowdown_writes_trigger triggers");

DEFINE_bool(enable_pipelined_write, true,
            "Allow WAL and memtable writes to be pipelined");

DEFINE_bool(
    unordered_write, false,
    "Enable the unordered write feature, which provides higher throughput but "
    "relaxes the guarantees around atomic reads and immutable snapshots");

DEFINE_bool(allow_concurrent_memtable_write, true,
            "Allow multi-writers to update mem tables in parallel.");

DEFINE_bool(inplace_update_support,
            ROCKSDB_NAMESPACE::Options().inplace_update_support,
            "Support in-place memtable update for smaller or same-size values");

DEFINE_uint64(inplace_update_num_locks,
              ROCKSDB_NAMESPACE::Options().inplace_update_num_locks,
              "Number of RW locks to protect in-place memtable updates");

DEFINE_bool(enable_write_thread_adaptive_yield, true,
            "Use a yielding spin loop for brief writer thread waits.");

DEFINE_uint64(
    write_thread_max_yield_usec, 100,
    "Maximum microseconds for enable_write_thread_adaptive_yield operation.");

DEFINE_uint64(write_thread_slow_yield_usec, 3,
              "The threshold at which a slow yield is considered a signal that "
              "other processes or threads want the core.");

DEFINE_int32(rate_limit_delay_max_milliseconds, 1000,
             "When hard_rate_limit is set then this is the max time a put will"
             " be stalled.");

DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value.");

DEFINE_bool(rate_limiter_auto_tuned, false,
            "Enable dynamic adjustment of rate limit according to demand for "
            "background I/O");

DEFINE_bool(sine_write_rate, false,
            "Use a sine wave write_rate_limit");

DEFINE_uint64(sine_write_rate_interval_milliseconds, 10000,
              "Interval at which the sine wave write_rate_limit is "
              "recalculated");

DEFINE_double(sine_a, 1,
              "A in f(x) = A sin(bx + c) + d");

DEFINE_double(sine_b, 1,
              "b in f(x) = A sin(bx + c) + d");

DEFINE_double(sine_c, 0,
              "c in f(x) = A sin(bx + c) + d");

DEFINE_double(sine_d, 1,
              "d in f(x) = A sin(bx + c) + d");
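
// Taken together, the four sine_* flags parameterize the wave
//   f(x) = sine_a * sin(sine_b * x + sine_c) + sine_d
// used as the write rate limit: sine_d sets the baseline rate, sine_a the
// amplitude around it, and the interval flag above controls how often f(x)
// is re-evaluated.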

DEFINE_bool(rate_limit_bg_reads, false,
            "Use options.rate_limiter on compaction reads");

DEFINE_uint64(
    benchmark_write_rate_limit, 0,
    "If non-zero, db_bench will rate-limit the writes going into RocksDB. This "
    "is the global rate in bytes/second.");

// the parameters of mix_graph
DEFINE_double(keyrange_dist_a, 0.0,
              "The parameter 'a' of the prefix average access distribution "
              "f(x)=a*exp(b*x)+c*exp(d*x)");
DEFINE_double(keyrange_dist_b, 0.0,
              "The parameter 'b' of the prefix average access distribution "
              "f(x)=a*exp(b*x)+c*exp(d*x)");
DEFINE_double(keyrange_dist_c, 0.0,
              "The parameter 'c' of the prefix average access distribution "
              "f(x)=a*exp(b*x)+c*exp(d*x)");
DEFINE_double(keyrange_dist_d, 0.0,
              "The parameter 'd' of the prefix average access distribution "
              "f(x)=a*exp(b*x)+c*exp(d*x)");
DEFINE_int64(keyrange_num, 1,
             "The number of key ranges that are in the same prefix "
             "group; each prefix range has its own key access "
             "distribution");
DEFINE_double(key_dist_a, 0.0,
              "The parameter 'a' of the key access distribution model "
              "f(x)=a*x^b");
DEFINE_double(key_dist_b, 0.0,
              "The parameter 'b' of the key access distribution model "
              "f(x)=a*x^b");
DEFINE_double(value_theta, 0.0,
              "The parameter 'theta' of the Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(value_k, 0.0,
              "The parameter 'k' of the Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(value_sigma, 0.0,
              "The parameter 'sigma' of the Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(iter_theta, 0.0,
              "The parameter 'theta' of the Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(iter_k, 0.0,
              "The parameter 'k' of the Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(iter_sigma, 0.0,
              "The parameter 'sigma' of the Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(mix_get_ratio, 1.0,
              "The ratio of Get queries in the mix_graph workload");
DEFINE_double(mix_put_ratio, 0.0,
              "The ratio of Put queries in the mix_graph workload");
DEFINE_double(mix_seek_ratio, 0.0,
              "The ratio of Seek queries in the mix_graph workload");
DEFINE_int64(mix_max_scan_len, 10000, "The max scan length of Iterator");
DEFINE_int64(mix_ave_kv_size, 512,
             "The average key-value size of this workload");
DEFINE_int64(mix_max_value_size, 1024, "The max value size of this workload");
DEFINE_double(
    sine_mix_rate_noise, 0.0,
    "Add the noise ratio to the sine rate; it is between 0.0 and 1.0");
DEFINE_bool(sine_mix_rate, false,
            "Enable the sine QPS control on the mix workload");
DEFINE_uint64(
    sine_mix_rate_interval_milliseconds, 10000,
    "Interval at which the sine wave read_rate_limit is recalculated");
DEFINE_int64(mix_accesses, -1,
             "The total number of query accesses in the mix_graph workload");

DEFINE_uint64(
    benchmark_read_rate_limit, 0,
    "If non-zero, db_bench will rate-limit the reads from RocksDB. This "
    "is the global rate in ops/second.");

1237 DEFINE_uint64(max_compaction_bytes,
1238               ROCKSDB_NAMESPACE::Options().max_compaction_bytes,
1239               "Max bytes allowed in one compaction");
1240 
1241 #ifndef ROCKSDB_LITE
1242 DEFINE_bool(readonly, false, "Run read only benchmarks.");
1243 
1244 DEFINE_bool(print_malloc_stats, false,
1245             "Print malloc stats to stdout after benchmarks finish.");
1246 #endif  // ROCKSDB_LITE
1247 
1248 DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions");
1249 
1250 DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds.");
1251 DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files"
1252               " in MB.");
1253 DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size");
1254 
1255 DEFINE_bool(mmap_read, ROCKSDB_NAMESPACE::Options().allow_mmap_reads,
1256             "Allow reads to occur via mmap-ing files");
1257 
1258 DEFINE_bool(mmap_write, ROCKSDB_NAMESPACE::Options().allow_mmap_writes,
1259             "Allow writes to occur via mmap-ing files");
1260 
1261 DEFINE_bool(use_direct_reads, ROCKSDB_NAMESPACE::Options().use_direct_reads,
1262             "Use O_DIRECT for reading data");
1263 
1264 DEFINE_bool(use_direct_io_for_flush_and_compaction,
1265             ROCKSDB_NAMESPACE::Options().use_direct_io_for_flush_and_compaction,
1266             "Use O_DIRECT for background flush and compaction writes");
1267 
1268 DEFINE_bool(advise_random_on_open,
1269             ROCKSDB_NAMESPACE::Options().advise_random_on_open,
1270             "Advise random access on table file open");
1271 
1272 DEFINE_string(compaction_fadvice, "NORMAL",
1273               "Access pattern advice when a file is compacted");
1274 static auto FLAGS_compaction_fadvice_e =
1275     ROCKSDB_NAMESPACE::Options().access_hint_on_compaction_start;
1276 
1277 DEFINE_bool(use_tailing_iterator, false,
1278             "Use tailing iterator to access a series of keys instead of get");
1279 
1280 DEFINE_bool(use_adaptive_mutex, ROCKSDB_NAMESPACE::Options().use_adaptive_mutex,
1281             "Use adaptive mutex");
1282 
1283 DEFINE_uint64(bytes_per_sync, ROCKSDB_NAMESPACE::Options().bytes_per_sync,
1284               "Allows OS to incrementally sync SST files to disk while they are"
1285               " being written, in the background. Issue one request for every"
1286               " bytes_per_sync written. 0 turns it off.");
1287 
1288 DEFINE_uint64(wal_bytes_per_sync,
1289               ROCKSDB_NAMESPACE::Options().wal_bytes_per_sync,
1290               "Allows OS to incrementally sync WAL files to disk while they are"
1291               " being written, in the background. Issue one request for every"
1292               " wal_bytes_per_sync written. 0 turns it off.");
1293 
1294 DEFINE_bool(use_single_deletes, true,
1295             "Use single deletes (used in RandomReplaceKeys only).");
1296 
1297 DEFINE_double(stddev, 2000.0,
1298               "Standard deviation of normal distribution used for picking keys"
1299               " (used in RandomReplaceKeys only).");
1300 
1301 DEFINE_int32(key_id_range, 100000,
1302              "Range of possible value of key id (used in TimeSeries only).");
1303 
1304 DEFINE_string(expire_style, "none",
1305               "Style to remove expired time entries. Can be one of the options "
1306               "below: none (do not expire data), compaction_filter (use a "
1307               "compaction filter to remove expired data), delete (seek IDs and "
1308               "remove expired data) (used in TimeSeries only).");
1309 
1310 DEFINE_uint64(
1311     time_range, 100000,
1312     "Range of timestamps stored in the database (used in TimeSeries"
1313     " only).");
1314 
1315 DEFINE_int32(num_deletion_threads, 1,
1316              "Number of threads to do deletion (used in TimeSeries and delete "
1317              "expire_style only).");
1318 
1319 DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge"
1320              " operations on a key in the memtable");
1321 
1322 static bool ValidatePrefixSize(const char* flagname, int32_t value) {
1323   if (value < 0 || value >= 2000000000) {
1324     fprintf(stderr, "Invalid value for --%s: %d. 0 <= PrefixSize < 2000000000\n",
1325             flagname, value);
1326     return false;
1327   }
1328   return true;
1329 }
1330 
1331 DEFINE_int32(prefix_size, 0, "control the prefix size for HashSkipList and "
1332              "plain table");
1333 DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated "
1334              "per prefix; 0 means no special handling of the prefix, "
1335              "i.e. use the prefix that comes with the generated random number.");
1336 DEFINE_bool(total_order_seek, false,
1337             "Enable total order seek regardless of index format.");
1338 DEFINE_bool(prefix_same_as_start, false,
1339             "Enforce iterator to return keys with prefix same as seek key.");
1340 DEFINE_bool(
1341     seek_missing_prefix, false,
1342     "Iterator seeks to keys with non-existent prefixes. Requires prefix_size > 8");
1343 
1344 DEFINE_int32(memtable_insert_with_hint_prefix_size, 0,
1345              "If non-zero, enable "
1346              "memtable insert with hint with the given prefix size.");
1347 DEFINE_bool(enable_io_prio, false, "Lower the background flush/compaction "
1348             "threads' IO priority");
1349 DEFINE_bool(enable_cpu_prio, false, "Lower the background flush/compaction "
1350             "threads' CPU priority");
1351 DEFINE_bool(identity_as_first_hash, false, "The first hash function of the "
1352             "cuckoo table becomes an identity function. This is only valid "
1353             "when the key is 8 bytes");
1354 DEFINE_bool(dump_malloc_stats, true, "Dump malloc stats in LOG");
1355 DEFINE_uint64(stats_dump_period_sec,
1356               ROCKSDB_NAMESPACE::Options().stats_dump_period_sec,
1357               "Gap between printing stats to log in seconds");
1358 DEFINE_uint64(stats_persist_period_sec,
1359               ROCKSDB_NAMESPACE::Options().stats_persist_period_sec,
1360               "Gap between persisting stats in seconds");
1361 DEFINE_bool(persist_stats_to_disk,
1362             ROCKSDB_NAMESPACE::Options().persist_stats_to_disk,
1363             "whether to persist stats to disk");
1364 DEFINE_uint64(stats_history_buffer_size,
1365               ROCKSDB_NAMESPACE::Options().stats_history_buffer_size,
1366               "Max number of stats snapshots to keep in memory");
1367 DEFINE_int64(multiread_stride, 0,
1368              "Stride length for the keys in a MultiGet batch");
1369 DEFINE_bool(multiread_batched, false, "Use the new MultiGet API");
1370 
1371 enum RepFactory {
1372   kSkipList,
1373   kPrefixHash,
1374   kVectorRep,
1375   kHashLinkedList,
1376 };
1377 
1378 static enum RepFactory StringToRepFactory(const char* ctype) {
1379   assert(ctype);
1380 
1381   if (!strcasecmp(ctype, "skip_list"))
1382     return kSkipList;
1383   else if (!strcasecmp(ctype, "prefix_hash"))
1384     return kPrefixHash;
1385   else if (!strcasecmp(ctype, "vector"))
1386     return kVectorRep;
1387   else if (!strcasecmp(ctype, "hash_linkedlist"))
1388     return kHashLinkedList;
1389 
1390   fprintf(stdout, "Cannot parse memtablerep %s\n", ctype);
1391   return kSkipList;
1392 }
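// Usage sketch (hypothetical): the memtablerep flag below is typically
// translated once at startup, e.g.
//
//   FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());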
1393 
1394 static enum RepFactory FLAGS_rep_factory;
1395 DEFINE_string(memtablerep, "skip_list",
1395               "Memtablerep to use: skip_list, prefix_hash, vector, or "
1395               "hash_linkedlist");
1396 DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count");
1397 DEFINE_bool(use_plain_table, false, "Whether to use plain table "
1398             "instead of the block-based table format");
1399 DEFINE_bool(use_cuckoo_table, false, "Whether to use the cuckoo table format");
1400 DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
1401 DEFINE_bool(use_hash_search, false, "Whether to use kHashSearch "
1402             "instead of kBinarySearch. "
1403             "This is only valid when we use the block-based table");
1404 DEFINE_bool(use_block_based_filter, false, "Whether to use kBlockBasedFilter "
1405             "instead of kFullFilter for the filter block. "
1406             "This is only valid when we use the block-based table");
1407 DEFINE_string(merge_operator, "", "The merge operator to use with the database. "
1408               "If a new merge operator is specified, be sure to use a fresh "
1409               "database. The possible merge operators are defined in "
1410               "utilities/merge_operators.h");
1411 DEFINE_int32(skip_list_lookahead, 0, "Used with skip_list memtablerep; try "
1412              "linear search first for this many steps from the previous "
1413              "position");
1414 DEFINE_bool(report_file_operations, false, "Whether to report the number of "
1415             "file operations");
1416 DEFINE_int32(readahead_size, 0, "Iterator readahead size");
1417 
1418 DEFINE_bool(read_with_latest_user_timestamp, true,
1419             "If true, always use the current latest timestamp for read. If "
1420             "false, choose a random timestamp from the past.");
1421 
1422 #ifndef ROCKSDB_LITE
1423 DEFINE_string(secondary_cache_uri, "",
1424               "Full URI for creating a custom secondary cache object");
1425 static std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
1426 #endif  // ROCKSDB_LITE
1427 
1428 static const bool FLAGS_soft_rate_limit_dummy __attribute__((__unused__)) =
1429     RegisterFlagValidator(&FLAGS_soft_rate_limit, &ValidateRateLimit);
1430 
1431 static const bool FLAGS_hard_rate_limit_dummy __attribute__((__unused__)) =
1432     RegisterFlagValidator(&FLAGS_hard_rate_limit, &ValidateRateLimit);
1433 
1434 static const bool FLAGS_prefix_size_dummy __attribute__((__unused__)) =
1435     RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
1436 
1437 static const bool FLAGS_key_size_dummy __attribute__((__unused__)) =
1438     RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);
1439 
1440 static const bool FLAGS_cache_numshardbits_dummy __attribute__((__unused__)) =
1441     RegisterFlagValidator(&FLAGS_cache_numshardbits,
1442                           &ValidateCacheNumshardbits);
1443 
1444 static const bool FLAGS_readwritepercent_dummy __attribute__((__unused__)) =
1445     RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent);
1446 
1447 DEFINE_int32(disable_seek_compaction, false,
1448              "Not used, left here for backwards compatibility");
1449 
1450 static const bool FLAGS_deletepercent_dummy __attribute__((__unused__)) =
1451     RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent);
1452 static const bool FLAGS_table_cache_numshardbits_dummy __attribute__((__unused__)) =
1453     RegisterFlagValidator(&FLAGS_table_cache_numshardbits,
1454                           &ValidateTableCacheNumshardbits);
1455 
1456 namespace ROCKSDB_NAMESPACE {
1457 
1458 namespace {
1459 struct ReportFileOpCounters {
1460   std::atomic<int> open_counter_;
1461   std::atomic<int> read_counter_;
1462   std::atomic<int> append_counter_;
1463   std::atomic<uint64_t> bytes_read_;
1464   std::atomic<uint64_t> bytes_written_;
1465 };
1466 
1467 // A special Env that records and reports file operations in db_bench
1468 class ReportFileOpEnv : public EnvWrapper {
1469  public:
1470   explicit ReportFileOpEnv(Env* base) : EnvWrapper(base) { reset(); }
1471 
1472   void reset() {
1473     counters_.open_counter_ = 0;
1474     counters_.read_counter_ = 0;
1475     counters_.append_counter_ = 0;
1476     counters_.bytes_read_ = 0;
1477     counters_.bytes_written_ = 0;
1478   }
1479 
1480   Status NewSequentialFile(const std::string& f,
1481                            std::unique_ptr<SequentialFile>* r,
1482                            const EnvOptions& soptions) override {
1483     class CountingFile : public SequentialFile {
1484      private:
1485       std::unique_ptr<SequentialFile> target_;
1486       ReportFileOpCounters* counters_;
1487 
1488      public:
1489       CountingFile(std::unique_ptr<SequentialFile>&& target,
1490                    ReportFileOpCounters* counters)
1491           : target_(std::move(target)), counters_(counters) {}
1492 
1493       Status Read(size_t n, Slice* result, char* scratch) override {
1494         counters_->read_counter_.fetch_add(1, std::memory_order_relaxed);
1495         Status rv = target_->Read(n, result, scratch);
1496         counters_->bytes_read_.fetch_add(result->size(),
1497                                          std::memory_order_relaxed);
1498         return rv;
1499       }
1500 
1501       Status Skip(uint64_t n) override { return target_->Skip(n); }
1502     };
1503 
1504     Status s = target()->NewSequentialFile(f, r, soptions);
1505     if (s.ok()) {
1506       counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
1507       r->reset(new CountingFile(std::move(*r), counters()));
1508     }
1509     return s;
1510   }
1511 
1512   Status NewRandomAccessFile(const std::string& f,
1513                              std::unique_ptr<RandomAccessFile>* r,
1514                              const EnvOptions& soptions) override {
1515     class CountingFile : public RandomAccessFile {
1516      private:
1517       std::unique_ptr<RandomAccessFile> target_;
1518       ReportFileOpCounters* counters_;
1519 
1520      public:
1521       CountingFile(std::unique_ptr<RandomAccessFile>&& target,
1522                    ReportFileOpCounters* counters)
1523           : target_(std::move(target)), counters_(counters) {}
1524       Status Read(uint64_t offset, size_t n, Slice* result,
1525                   char* scratch) const override {
1526         counters_->read_counter_.fetch_add(1, std::memory_order_relaxed);
1527         Status rv = target_->Read(offset, n, result, scratch);
1528         counters_->bytes_read_.fetch_add(result->size(),
1529                                          std::memory_order_relaxed);
1530         return rv;
1531       }
1532     };
1533 
1534     Status s = target()->NewRandomAccessFile(f, r, soptions);
1535     if (s.ok()) {
1536       counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
1537       r->reset(new CountingFile(std::move(*r), counters()));
1538     }
1539     return s;
1540   }
1541 
1542   Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
1543                          const EnvOptions& soptions) override {
1544     class CountingFile : public WritableFile {
1545      private:
1546       std::unique_ptr<WritableFile> target_;
1547       ReportFileOpCounters* counters_;
1548 
1549      public:
1550       CountingFile(std::unique_ptr<WritableFile>&& target,
1551                    ReportFileOpCounters* counters)
1552           : target_(std::move(target)), counters_(counters) {}
1553 
1554       Status Append(const Slice& data) override {
1555         counters_->append_counter_.fetch_add(1, std::memory_order_relaxed);
1556         Status rv = target_->Append(data);
1557         counters_->bytes_written_.fetch_add(data.size(),
1558                                             std::memory_order_relaxed);
1559         return rv;
1560       }
1561 
1562       Status Append(
1563           const Slice& data,
1564           const DataVerificationInfo& /* verification_info */) override {
1565         return Append(data);
1566       }
1567 
1568       Status Truncate(uint64_t size) override { return target_->Truncate(size); }
1569       Status Close() override { return target_->Close(); }
1570       Status Flush() override { return target_->Flush(); }
1571       Status Sync() override { return target_->Sync(); }
1572     };
1573 
1574     Status s = target()->NewWritableFile(f, r, soptions);
1575     if (s.ok()) {
1576       counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
1577       r->reset(new CountingFile(std::move(*r), counters()));
1578     }
1579     return s;
1580   }
1581 
1582   // getter
1583   ReportFileOpCounters* counters() { return &counters_; }
1584 
1585  private:
1586   ReportFileOpCounters counters_;
1587 };
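// Usage sketch (hypothetical): wrap the base Env so every file opened through
// it is counted, then read the totals back after a run.
//
//   ReportFileOpEnv counting_env(Env::Default());
//   // point options.env at &counting_env and run the workload, then:
//   int opens =
//       counting_env.counters()->open_counter_.load(std::memory_order_relaxed);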
1588 
1589 }  // namespace
1590 
1591 enum DistributionType : unsigned char {
1592   kFixed = 0,
1593   kUniform,
1594   kNormal
1595 };
1596 
1597 static enum DistributionType FLAGS_value_size_distribution_type_e = kFixed;
1598 
1599 static enum DistributionType StringToDistributionType(const char* ctype) {
1600   assert(ctype);
1601 
1602   if (!strcasecmp(ctype, "fixed"))
1603     return kFixed;
1604   else if (!strcasecmp(ctype, "uniform"))
1605     return kUniform;
1606   else if (!strcasecmp(ctype, "normal"))
1607     return kNormal;
1608 
1609   fprintf(stdout, "Cannot parse distribution type '%s'\n", ctype);
1610   return kFixed;  // default value
1611 }
1612 
1613 class BaseDistribution {
1614  public:
1615   BaseDistribution(unsigned int _min, unsigned int _max)
1616       : min_value_size_(_min), max_value_size_(_max) {}
1617   virtual ~BaseDistribution() {}
1618 
1619   unsigned int Generate() {
1620     auto val = Get();
1621     if (NeedTruncate()) {
1622       val = std::max(min_value_size_, val);
1623       val = std::min(max_value_size_, val);
1624     }
1625     return val;
1626   }
1627  private:
1628   virtual unsigned int Get() = 0;
1629   virtual bool NeedTruncate() {
1630     return true;
1631   }
1632   unsigned int min_value_size_;
1633   unsigned int max_value_size_;
1634 };
1635 
1636 class FixedDistribution : public BaseDistribution
1637 {
1638  public:
1639   FixedDistribution(unsigned int size) :
1640     BaseDistribution(size, size),
1641     size_(size) {}
1642  private:
1643   virtual unsigned int Get() override {
1644     return size_;
1645   }
1646   virtual bool NeedTruncate() override {
1647     return false;
1648   }
1649   unsigned int size_;
1650 };
1651 
1652 class NormalDistribution
1653     : public BaseDistribution, public std::normal_distribution<double> {
1654  public:
1655   NormalDistribution(unsigned int _min, unsigned int _max)
1656       : BaseDistribution(_min, _max),
1657         // 99.7% values within the range [min, max].
1658         std::normal_distribution<double>(
1659             (double)(_min + _max) / 2.0 /*mean*/,
1660             (double)(_max - _min) / 6.0 /*stddev*/),
1661         gen_(rd_()) {}
1662 
1663  private:
1664   virtual unsigned int Get() override {
1665     return static_cast<unsigned int>((*this)(gen_));
1666   }
1667   std::random_device rd_;
1668   std::mt19937 gen_;
1669 };
1670 
1671 class UniformDistribution
1672     : public BaseDistribution,
1673       public std::uniform_int_distribution<unsigned int> {
1674  public:
1675   UniformDistribution(unsigned int _min, unsigned int _max)
1676       : BaseDistribution(_min, _max),
1677         std::uniform_int_distribution<unsigned int>(_min, _max),
1678         gen_(rd_()) {}
1679 
1680  private:
1681   virtual unsigned int Get() override {
1682     return (*this)(gen_);
1683   }
1684   virtual bool NeedTruncate() override {
1685     return false;
1686   }
1687   std::random_device rd_;
1688   std::mt19937 gen_;
1689 };
1690 
1691 // Helper for quickly generating random data.
1692 class RandomGenerator {
1693  private:
1694   std::string data_;
1695   unsigned int pos_;
1696   std::unique_ptr<BaseDistribution> dist_;
1697 
1698  public:
1699 
1700   RandomGenerator() {
1701     auto max_value_size = FLAGS_value_size_max;
1702     switch (FLAGS_value_size_distribution_type_e) {
1703       case kUniform:
1704         dist_.reset(new UniformDistribution(FLAGS_value_size_min,
1705                                             FLAGS_value_size_max));
1706         break;
1707       case kNormal:
1708         dist_.reset(new NormalDistribution(FLAGS_value_size_min,
1709                                            FLAGS_value_size_max));
1710         break;
1711       case kFixed:
1712       default:
1713         dist_.reset(new FixedDistribution(value_size));
1714         max_value_size = value_size;
1715     }
1716     // We use a limited amount of data over and over again and ensure
1717     // that it is larger than the compression window (32KB), and also
1718     // large enough to serve all typical value sizes we want to write.
1719     Random rnd(301);
1720     std::string piece;
1721     while (data_.size() < (unsigned)std::max(1048576, max_value_size)) {
1722       // Add a short fragment that is as compressible as specified
1723       // by FLAGS_compression_ratio.
1724       test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
1725       data_.append(piece);
1726     }
1727     pos_ = 0;
1728   }
1729 
1730   Slice Generate(unsigned int len) {
1731     assert(len <= data_.size());
1732     if (pos_ + len > data_.size()) {
1733       pos_ = 0;
1734     }
1735     pos_ += len;
1736     return Slice(data_.data() + pos_ - len, len);
1737   }
1738 
1739   Slice Generate() {
1740     auto len = dist_->Generate();
1741     return Generate(len);
1742   }
1743 };
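// Usage sketch (hypothetical): values are sliced out of one shared
// pre-generated buffer, so Generate() is cheap and allocation-free.
//
//   RandomGenerator gen;
//   Slice v1 = gen.Generate();     // size drawn from the configured distribution
//   Slice v2 = gen.Generate(100);  // exactly 100 bytes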
1744 
1745 static void AppendWithSpace(std::string* str, Slice msg) {
1746   if (msg.empty()) return;
1747   if (!str->empty()) {
1748     str->push_back(' ');
1749   }
1750   str->append(msg.data(), msg.size());
1751 }
1752 
1753 struct DBWithColumnFamilies {
1754   std::vector<ColumnFamilyHandle*> cfh;
1755   DB* db;
1756 #ifndef ROCKSDB_LITE
1757   OptimisticTransactionDB* opt_txn_db;
1758 #endif  // ROCKSDB_LITE
1759   std::atomic<size_t> num_created;  // Need to be updated after all the
1760                                     // new entries in cfh are set.
1761   size_t num_hot;  // Number of column families to be queried at each moment.
1762                    // After each CreateNewCf(), another num_hot number of new
1763                    // Column families will be created and used to be queried.
1764   port::Mutex create_cf_mutex;  // Only one thread can execute CreateNewCf()
1765   std::vector<int> cfh_idx_to_prob;  // ith index holds probability of operating
1766                                      // on cfh[i].
1767 
1768   DBWithColumnFamilies()
1769       : db(nullptr)
1770 #ifndef ROCKSDB_LITE
1771         , opt_txn_db(nullptr)
1772 #endif  // ROCKSDB_LITE
1773   {
1774     cfh.clear();
1775     num_created = 0;
1776     num_hot = 0;
1777   }
1778 
1779   DBWithColumnFamilies(const DBWithColumnFamilies& other)
1780       : cfh(other.cfh),
1781         db(other.db),
1782 #ifndef ROCKSDB_LITE
1783         opt_txn_db(other.opt_txn_db),
1784 #endif  // ROCKSDB_LITE
1785         num_created(other.num_created.load()),
1786         num_hot(other.num_hot),
1787         cfh_idx_to_prob(other.cfh_idx_to_prob) {
1788   }
1789 
1790   void DeleteDBs() {
1791     std::for_each(cfh.begin(), cfh.end(),
1792                   [](ColumnFamilyHandle* cfhi) { delete cfhi; });
1793     cfh.clear();
1794 #ifndef ROCKSDB_LITE
1795     if (opt_txn_db) {
1796       delete opt_txn_db;
1797       opt_txn_db = nullptr;
1798     } else {
1799       delete db;
1800       db = nullptr;
1801     }
1802 #else
1803     delete db;
1804     db = nullptr;
1805 #endif  // ROCKSDB_LITE
1806   }
1807 
1808   ColumnFamilyHandle* GetCfh(int64_t rand_num) {
1809     assert(num_hot > 0);
1810     size_t rand_offset = 0;
1811     if (!cfh_idx_to_prob.empty()) {
1812       assert(cfh_idx_to_prob.size() == num_hot);
1813       int sum = 0;
1814       while (sum + cfh_idx_to_prob[rand_offset] < rand_num % 100) {
1815         sum += cfh_idx_to_prob[rand_offset];
1816         ++rand_offset;
1817       }
1818       assert(rand_offset < cfh_idx_to_prob.size());
1819     } else {
1820       rand_offset = rand_num % num_hot;
1821     }
1822     return cfh[num_created.load(std::memory_order_acquire) - num_hot +
1823                rand_offset];
1824   }
1825 
1826   // stage: assume CFs 0 .. stage * num_hot - 1 have been created. Need to
1827   //        create CFs stage * num_hot .. (stage + 1) * num_hot - 1.
1828   void CreateNewCf(ColumnFamilyOptions options, int64_t stage) {
1829     MutexLock l(&create_cf_mutex);
1830     if ((stage + 1) * num_hot <= num_created) {
1831       // Already created.
1832       return;
1833     }
1834     auto new_num_created = num_created + num_hot;
1835     assert(new_num_created <= cfh.size());
1836     for (size_t i = num_created; i < new_num_created; i++) {
1837       Status s =
1838           db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i]));
1839       if (!s.ok()) {
1840         fprintf(stderr, "create column family error: %s\n",
1841                 s.ToString().c_str());
1842         abort();
1843       }
1844     }
1845     num_created.store(new_num_created, std::memory_order_release);
1846   }
1847 };
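// Worked example for GetCfh() above (illustrative): with num_hot = 3 and
// cfh_idx_to_prob = {50, 30, 20}, the three hot column families are picked
// with roughly 50%/30%/20% probability based on rand_num % 100; with an empty
// cfh_idx_to_prob they are picked uniformly via rand_num % num_hot.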
1848 
1849 // A class that reports stats to a CSV file
1850 class ReporterAgent {
1851  public:
1852   ReporterAgent(Env* env, const std::string& fname,
1853                 uint64_t report_interval_secs)
1854       : env_(env),
1855         total_ops_done_(0),
1856         last_report_(0),
1857         report_interval_secs_(report_interval_secs),
1858         stop_(false) {
1859     auto s = env_->NewWritableFile(fname, &report_file_, EnvOptions());
1860     if (s.ok()) {
1861       s = report_file_->Append(Header() + "\n");
1862     }
1863     if (s.ok()) {
1864       s = report_file_->Flush();
1865     }
1866     if (!s.ok()) {
1867       fprintf(stderr, "Can't open %s: %s\n", fname.c_str(),
1868               s.ToString().c_str());
1869       abort();
1870     }
1871 
1872     reporting_thread_ = port::Thread([&]() { SleepAndReport(); });
1873   }
1874 
1875   ~ReporterAgent() {
1876     {
1877       std::unique_lock<std::mutex> lk(mutex_);
1878       stop_ = true;
1879       stop_cv_.notify_all();
1880     }
1881     reporting_thread_.join();
1882   }
1883 
1884   // thread safe
1885   void ReportFinishedOps(int64_t num_ops) {
1886     total_ops_done_.fetch_add(num_ops);
1887   }
1888 
1889  private:
1890   std::string Header() const { return "secs_elapsed,interval_qps"; }
1891   void SleepAndReport() {
1892     auto* clock = env_->GetSystemClock().get();
1893     auto time_started = clock->NowMicros();
1894     while (true) {
1895       {
1896         std::unique_lock<std::mutex> lk(mutex_);
1897         if (stop_ ||
1898             stop_cv_.wait_for(lk, std::chrono::seconds(report_interval_secs_),
1899                               [&]() { return stop_; })) {
1900           // stopping
1901           break;
1902         }
1903         // else -> timeout, which means time for a report!
1904       }
1905       auto total_ops_done_snapshot = total_ops_done_.load();
1906       // round the seconds elapsed
1907       auto secs_elapsed =
1908           (clock->NowMicros() - time_started + kMicrosInSecond / 2) /
1909           kMicrosInSecond;
1910       std::string report = ToString(secs_elapsed) + "," +
1911                            ToString(total_ops_done_snapshot - last_report_) +
1912                            "\n";
1913       auto s = report_file_->Append(report);
1914       if (s.ok()) {
1915         s = report_file_->Flush();
1916       }
1917       if (!s.ok()) {
1918         fprintf(stderr,
1919                 "Can't write to report file (%s), stopping the reporting\n",
1920                 s.ToString().c_str());
1921         break;
1922       }
1923       last_report_ = total_ops_done_snapshot;
1924     }
1925   }
1926 
1927   Env* env_;
1928   std::unique_ptr<WritableFile> report_file_;
1929   std::atomic<int64_t> total_ops_done_;
1930   int64_t last_report_;
1931   const uint64_t report_interval_secs_;
1932   ROCKSDB_NAMESPACE::port::Thread reporting_thread_;
1933   std::mutex mutex_;
1934   // will notify on stop
1935   std::condition_variable stop_cv_;
1936   bool stop_;
1937 };
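// Usage sketch (hypothetical file name): the agent appends one
// "secs_elapsed,interval_qps" CSV row per interval until it is destroyed.
//
//   {
//     ReporterAgent agent(FLAGS_env, "/tmp/db_bench_report.csv", /*secs=*/5);
//     agent.ReportFinishedOps(128);  // called from worker threads
//   }  // destructor stops and joins the reporting thread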
1938 
1939 enum OperationType : unsigned char {
1940   kRead = 0,
1941   kWrite,
1942   kDelete,
1943   kSeek,
1944   kMerge,
1945   kUpdate,
1946   kCompress,
1947   kUncompress,
1948   kCrc,
1949   kHash,
1950   kOthers
1951 };
1952 
1953 static std::unordered_map<OperationType, std::string, std::hash<unsigned char>>
1954                           OperationTypeString = {
1955   {kRead, "read"},
1956   {kWrite, "write"},
1957   {kDelete, "delete"},
1958   {kSeek, "seek"},
1959   {kMerge, "merge"},
1960   {kUpdate, "update"},
1961   {kCompress, "compress"},
1962   {kUncompress, "uncompress"},
1963   {kCrc, "crc"},
1964   {kHash, "hash"},
1965   {kOthers, "op"}
1966 };
1967 
1968 class CombinedStats;
1969 class Stats {
1970  private:
1971   SystemClock* clock_;
1972   int id_;
1973   uint64_t start_ = 0;
1974   uint64_t sine_interval_;
1975   uint64_t finish_;
1976   double seconds_;
1977   uint64_t done_;
1978   uint64_t last_report_done_;
1979   uint64_t next_report_;
1980   uint64_t bytes_;
1981   uint64_t last_op_finish_;
1982   uint64_t last_report_finish_;
1983   std::unordered_map<OperationType, std::shared_ptr<HistogramImpl>,
1984                      std::hash<unsigned char>> hist_;
1985   std::string message_;
1986   bool exclude_from_merge_;
1987   ReporterAgent* reporter_agent_;  // does not own
1988   friend class CombinedStats;
1989 
1990  public:
1991   Stats() : clock_(FLAGS_env->GetSystemClock().get()) { Start(-1); }
1992 
1993   void SetReporterAgent(ReporterAgent* reporter_agent) {
1994     reporter_agent_ = reporter_agent;
1995   }
1996 
1997   void Start(int id) {
1998     id_ = id;
1999     next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
2000     last_op_finish_ = start_;
2001     hist_.clear();
2002     done_ = 0;
2003     last_report_done_ = 0;
2004     bytes_ = 0;
2005     seconds_ = 0;
2006     start_ = clock_->NowMicros();
2007     sine_interval_ = clock_->NowMicros();
2008     finish_ = start_;
2009     last_report_finish_ = start_;
2010     message_.clear();
2011     // When set, stats from this thread won't be merged with others.
2012     exclude_from_merge_ = false;
2013   }
2014 
2015   void Merge(const Stats& other) {
2016     if (other.exclude_from_merge_)
2017       return;
2018 
2019     for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) {
2020       auto this_it = hist_.find(it->first);
2021       if (this_it != hist_.end()) {
2022         this_it->second->Merge(*(other.hist_.at(it->first)));
2023       } else {
2024         hist_.insert({ it->first, it->second });
2025       }
2026     }
2027 
2028     done_ += other.done_;
2029     bytes_ += other.bytes_;
2030     seconds_ += other.seconds_;
2031     if (other.start_ < start_) start_ = other.start_;
2032     if (other.finish_ > finish_) finish_ = other.finish_;
2033 
2034     // Just keep the messages from one thread
2035     if (message_.empty()) message_ = other.message_;
2036   }
2037 
2038   void Stop() {
2039     finish_ = clock_->NowMicros();
2040     seconds_ = (finish_ - start_) * 1e-6;
2041   }
2042 
2043   void AddMessage(Slice msg) {
2044     AppendWithSpace(&message_, msg);
2045   }
2046 
2047   void SetId(int id) { id_ = id; }
2048   void SetExcludeFromMerge() { exclude_from_merge_ = true; }
2049 
2050   void PrintThreadStatus() {
2051     std::vector<ThreadStatus> thread_list;
2052     FLAGS_env->GetThreadList(&thread_list);
2053 
2054     fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n",
2055         "ThreadID", "ThreadType", "cfName", "Operation",
2056         "ElapsedTime", "Stage", "State", "OperationProperties");
2057 
2058     int64_t current_time = 0;
2059     clock_->GetCurrentTime(&current_time).PermitUncheckedError();
2060     for (auto ts : thread_list) {
2061       fprintf(stderr, "%18" PRIu64 " %10s %12s %20s %13s %45s %12s",
2062           ts.thread_id,
2063           ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(),
2064           ts.cf_name.c_str(),
2065           ThreadStatus::GetOperationName(ts.operation_type).c_str(),
2066           ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(),
2067           ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(),
2068           ThreadStatus::GetStateName(ts.state_type).c_str());
2069 
2070       auto op_properties = ThreadStatus::InterpretOperationProperties(
2071           ts.operation_type, ts.op_properties);
2072       for (const auto& op_prop : op_properties) {
2073         fprintf(stderr, " %s %" PRIu64" |",
2074             op_prop.first.c_str(), op_prop.second);
2075       }
2076       fprintf(stderr, "\n");
2077     }
2078   }
2079 
2080   void ResetSineInterval() { sine_interval_ = clock_->NowMicros(); }
2081 
2082   uint64_t GetSineInterval() {
2083     return sine_interval_;
2084   }
2085 
2086   uint64_t GetStart() {
2087     return start_;
2088   }
2089 
2090   void ResetLastOpTime() {
2091     // Set to now to avoid latency from calls to SleepForMicroseconds
2092     last_op_finish_ = clock_->NowMicros();
2093   }
2094 
2095   void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops,
2096                    enum OperationType op_type = kOthers) {
2097     if (reporter_agent_) {
2098       reporter_agent_->ReportFinishedOps(num_ops);
2099     }
2100     if (FLAGS_histogram) {
2101       uint64_t now = clock_->NowMicros();
2102       uint64_t micros = now - last_op_finish_;
2103 
2104       if (hist_.find(op_type) == hist_.end())
2105       {
2106         auto hist_temp = std::make_shared<HistogramImpl>();
2107         hist_.insert({op_type, std::move(hist_temp)});
2108       }
2109       hist_[op_type]->Add(micros);
2110 
2111       if (micros > 20000 && !FLAGS_stats_interval) {
2112         fprintf(stderr, "long op: %" PRIu64 " micros%30s\r", micros, "");
2113         fflush(stderr);
2114       }
2115       last_op_finish_ = now;
2116     }
2117 
2118     done_ += num_ops;
2119     if (done_ >= next_report_) {
2120       if (!FLAGS_stats_interval) {
2121         if      (next_report_ < 1000)   next_report_ += 100;
2122         else if (next_report_ < 5000)   next_report_ += 500;
2123         else if (next_report_ < 10000)  next_report_ += 1000;
2124         else if (next_report_ < 50000)  next_report_ += 5000;
2125         else if (next_report_ < 100000) next_report_ += 10000;
2126         else if (next_report_ < 500000) next_report_ += 50000;
2127         else                            next_report_ += 100000;
2128         fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, "");
2129       } else {
2130         uint64_t now = clock_->NowMicros();
2131         int64_t usecs_since_last = now - last_report_finish_;
2132 
2133         // Determine whether to print status where interval is either
2134         // each N operations or each N seconds.
2135 
2136         if (FLAGS_stats_interval_seconds &&
2137             usecs_since_last < (FLAGS_stats_interval_seconds * 1000000)) {
2138           // Don't check again for this many operations
2139           next_report_ += FLAGS_stats_interval;
2140 
2141         } else {
2142           fprintf(stderr,
2143                   "%s ... thread %d: (%" PRIu64 ",%" PRIu64
2144                   ") ops and "
2145                   "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
2146                   clock_->TimeToString(now / 1000000).c_str(), id_,
2147                   done_ - last_report_done_, done_,
2148                   (done_ - last_report_done_) / (usecs_since_last / 1000000.0),
2149                   done_ / ((now - start_) / 1000000.0),
2150                   (now - last_report_finish_) / 1000000.0,
2151                   (now - start_) / 1000000.0);
2152 
2153           if (id_ == 0 && FLAGS_stats_per_interval) {
2154             std::string stats;
2155 
2156             if (db_with_cfh && db_with_cfh->num_created.load()) {
2157               for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) {
2158                 if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats",
2159                                     &stats))
2160                   fprintf(stderr, "%s\n", stats.c_str());
2161                 if (FLAGS_show_table_properties) {
2162                   for (int level = 0; level < FLAGS_num_levels; ++level) {
2163                     if (db->GetProperty(
2164                             db_with_cfh->cfh[i],
2165                             "rocksdb.aggregated-table-properties-at-level" +
2166                                 ToString(level),
2167                             &stats)) {
2168                       if (stats.find("# entries=0") == std::string::npos) {
2169                         fprintf(stderr, "Level[%d]: %s\n", level,
2170                                 stats.c_str());
2171                       }
2172                     }
2173                   }
2174                 }
2175               }
2176             } else if (db) {
2177               if (db->GetProperty("rocksdb.stats", &stats)) {
2178                 fprintf(stderr, "%s\n", stats.c_str());
2179               }
2180               if (FLAGS_show_table_properties) {
2181                 for (int level = 0; level < FLAGS_num_levels; ++level) {
2182                   if (db->GetProperty(
2183                           "rocksdb.aggregated-table-properties-at-level" +
2184                               ToString(level),
2185                           &stats)) {
2186                     if (stats.find("# entries=0") == std::string::npos) {
2187                       fprintf(stderr, "Level[%d]: %s\n", level, stats.c_str());
2188                     }
2189                   }
2190                 }
2191               }
2192             }
2193           }
2194 
2195           next_report_ += FLAGS_stats_interval;
2196           last_report_finish_ = now;
2197           last_report_done_ = done_;
2198         }
2199       }
2200       if (id_ == 0 && FLAGS_thread_status_per_interval) {
2201         PrintThreadStatus();
2202       }
2203       fflush(stderr);
2204     }
2205   }
2206 
2207   void AddBytes(int64_t n) {
2208     bytes_ += n;
2209   }
2210 
2211   void Report(const Slice& name) {
2212     // Pretend at least one op was done in case we are running a benchmark
2213     // that does not call FinishedOps().
2214     if (done_ < 1) done_ = 1;
2215 
2216     std::string extra;
2217     if (bytes_ > 0) {
2218       // Rate is computed on actual elapsed time, not the sum of per-thread
2219       // elapsed times.
2220       double elapsed = (finish_ - start_) * 1e-6;
2221       char rate[100];
2222       snprintf(rate, sizeof(rate), "%6.1f MB/s",
2223                (bytes_ / 1048576.0) / elapsed);
2224       extra = rate;
2225     }
2226     AppendWithSpace(&extra, message_);
2227     double elapsed = (finish_ - start_) * 1e-6;
2228     double throughput = (double)done_/elapsed;
2229 
2230     fprintf(stdout, "%-12s : %11.3f micros/op %ld ops/sec;%s%s\n",
2231             name.ToString().c_str(),
2232             seconds_ * 1e6 / done_,
2233             (long)throughput,
2234             (extra.empty() ? "" : " "),
2235             extra.c_str());
2236     if (FLAGS_histogram) {
2237       for (auto it = hist_.begin(); it != hist_.end(); ++it) {
2238         fprintf(stdout, "Microseconds per %s:\n%s\n",
2239                 OperationTypeString[it->first].c_str(),
2240                 it->second->ToString().c_str());
2241       }
2242     }
2243     if (FLAGS_report_file_operations) {
2244       ReportFileOpEnv* env = static_cast<ReportFileOpEnv*>(FLAGS_env);
2245       ReportFileOpCounters* counters = env->counters();
2246       fprintf(stdout, "Num files opened: %d\n",
2247               counters->open_counter_.load(std::memory_order_relaxed));
2248       fprintf(stdout, "Num Read(): %d\n",
2249               counters->read_counter_.load(std::memory_order_relaxed));
2250       fprintf(stdout, "Num Append(): %d\n",
2251               counters->append_counter_.load(std::memory_order_relaxed));
2252       fprintf(stdout, "Num bytes read: %" PRIu64 "\n",
2253               counters->bytes_read_.load(std::memory_order_relaxed));
2254       fprintf(stdout, "Num bytes written: %" PRIu64 "\n",
2255               counters->bytes_written_.load(std::memory_order_relaxed));
2256       env->reset();
2257     }
2258     fflush(stdout);
2259   }
2260 };
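// Usage sketch (hypothetical): one Stats object per worker thread, merged
// before the final report.
//
//   Stats stats;
//   stats.Start(/*id=*/0);
//   stats.FinishedOps(nullptr, nullptr, /*num_ops=*/1, kWrite);
//   stats.Stop();
//   stats.Report("fillseq");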
2261 
2262 class CombinedStats {
2263  public:
2264   void AddStats(const Stats& stat) {
2265     uint64_t total_ops = stat.done_;
2266     uint64_t total_bytes_ = stat.bytes_;
2267     double elapsed;
2268 
2269     if (total_ops < 1) {
2270       total_ops = 1;
2271     }
2272 
2273     elapsed = (stat.finish_ - stat.start_) * 1e-6;
2274     throughput_ops_.emplace_back(total_ops / elapsed);
2275 
2276     if (total_bytes_ > 0) {
2277       double mbs = (total_bytes_ / 1048576.0);
2278       throughput_mbs_.emplace_back(mbs / elapsed);
2279     }
2280   }
2281 
2282   void Report(const std::string& bench_name) {
2283     const char* name = bench_name.c_str();
2284     int num_runs = static_cast<int>(throughput_ops_.size());
2285 
2286     if (throughput_mbs_.size() == throughput_ops_.size()) {
2287       fprintf(stdout,
2288               "%s [AVG    %d runs] : %d ops/sec; %6.1f MB/sec\n"
2289               "%s [MEDIAN %d runs] : %d ops/sec; %6.1f MB/sec\n",
2290               name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
2291               CalcAvg(throughput_mbs_), name, num_runs,
2292               static_cast<int>(CalcMedian(throughput_ops_)),
2293               CalcMedian(throughput_mbs_));
2294     } else {
2295       fprintf(stdout,
2296               "%s [AVG    %d runs] : %d ops/sec\n"
2297               "%s [MEDIAN %d runs] : %d ops/sec\n",
2298               name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)), name,
2299               num_runs, static_cast<int>(CalcMedian(throughput_ops_)));
2300     }
2301   }
2302 
2303  private:
2304   double CalcAvg(std::vector<double> data) {
2305     double avg = 0;
2306     for (double x : data) {
2307       avg += x;
2308     }
2309     avg = avg / data.size();
2310     return avg;
2311   }
2312 
2313   double CalcMedian(std::vector<double> data) {
2314     assert(data.size() > 0);
2315     std::sort(data.begin(), data.end());
2316 
2317     size_t mid = data.size() / 2;
2318     if (data.size() % 2 == 1) {
2319       // Odd number of entries
2320       return data[mid];
2321     } else {
2322       // Even number of entries
2323       return (data[mid] + data[mid - 1]) / 2;
2324     }
2325   }
2326 
2327   std::vector<double> throughput_ops_;
2328   std::vector<double> throughput_mbs_;
2329 };
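// Worked example (illustrative): for per-run throughputs {100, 200, 400}
// ops/sec, CalcAvg() returns ~233.3 and CalcMedian() returns 200; for the
// even-sized {100, 200, 300, 400}, the median is (200 + 300) / 2 = 250.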
2330 
2331 class TimestampEmulator {
2332  private:
2333   std::atomic<uint64_t> timestamp_;
2334 
2335  public:
2336   TimestampEmulator() : timestamp_(0) {}
2337   uint64_t Get() const { return timestamp_.load(); }
2338   void Inc() { timestamp_++; }
2339   Slice Allocate(char* scratch) {
2340     // TODO: support larger timestamp sizes
2341     assert(FLAGS_user_timestamp_size == 8);
2342     assert(scratch);
2343     uint64_t ts = timestamp_.fetch_add(1);
2344     EncodeFixed64(scratch, ts);
2345     return Slice(scratch, FLAGS_user_timestamp_size);
2346   }
2347   Slice GetTimestampForRead(Random64& rand, char* scratch) {
2348     assert(FLAGS_user_timestamp_size == 8);
2349     assert(scratch);
2350     if (FLAGS_read_with_latest_user_timestamp) {
2351       return Allocate(scratch);
2352     }
2353     // Choose a random timestamp from the past.
2354     uint64_t ts = rand.Next() % Get();
2355     EncodeFixed64(scratch, ts);
2356     return Slice(scratch, FLAGS_user_timestamp_size);
2357   }
2358 };
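// Usage sketch (hypothetical): the emulator hands out monotonically
// increasing 8-byte timestamps encoded into a caller-provided buffer.
//
//   TimestampEmulator ts_emu;
//   char scratch[8];
//   Slice write_ts = ts_emu.Allocate(scratch);  // encodes 0, bumps counter
//   Random64 rand(301);
//   Slice read_ts = ts_emu.GetTimestampForRead(rand, scratch);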
2359 
2360 // State shared by all concurrent executions of the same benchmark.
2361 struct SharedState {
2362   port::Mutex mu;
2363   port::CondVar cv;
2364   int total;
2365   int perf_level;
2366   std::shared_ptr<RateLimiter> write_rate_limiter;
2367   std::shared_ptr<RateLimiter> read_rate_limiter;
2368 
2369   // Each thread goes through the following states:
2370   //    (1) initializing
2371   //    (2) waiting for others to be initialized
2372   //    (3) running
2373   //    (4) done
2374 
2375   long num_initialized;
2376   long num_done;
2377   bool start;
2378 
2379   SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { }
2380 };
2381 
2382 // Per-thread state for concurrent executions of the same benchmark.
2383 struct ThreadState {
2384   int tid;             // 0..n-1 when running in n threads
2385   Random64 rand;         // Has different seeds for different threads
2386   Stats stats;
2387   SharedState* shared;
2388 
2389   explicit ThreadState(int index)
2390       : tid(index), rand((FLAGS_seed ? FLAGS_seed : 1000) + index) {}
2391 };
2392 
2393 class Duration {
2394  public:
2395   Duration(uint64_t max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) {
2396     max_seconds_ = max_seconds;
2397     max_ops_= max_ops;
2398     ops_per_stage_ = (ops_per_stage > 0) ? ops_per_stage : max_ops;
2399     ops_ = 0;
2400     start_at_ = FLAGS_env->NowMicros();
2401   }
2402 
2403   int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; }
2404 
2405   bool Done(int64_t increment) {
2406     if (increment <= 0) increment = 1;    // avoid Done(0) and infinite loops
2407     ops_ += increment;
2408 
2409     if (max_seconds_) {
2410       // Recheck every approx. 1000 ops (exact iff increment is a factor of 1000)
2411       auto granularity = FLAGS_ops_between_duration_checks;
2412       if ((ops_ / granularity) != ((ops_ - increment) / granularity)) {
2413         uint64_t now = FLAGS_env->NowMicros();
2414         return ((now - start_at_) / 1000000) >= max_seconds_;
2415       } else {
2416         return false;
2417       }
2418     } else {
2419       return ops_ > max_ops_;
2420     }
2421   }
2422 
2423  private:
2424   uint64_t max_seconds_;
2425   int64_t max_ops_;
2426   int64_t ops_per_stage_;
2427   int64_t ops_;
2428   uint64_t start_at_;
2429 };
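// Usage sketch (hypothetical): a benchmark loop bounded by either a wall-clock
// budget (max_seconds) or a fixed op count, whichever is configured.
//
//   Duration duration(/*max_seconds=*/FLAGS_duration, /*max_ops=*/num_);
//   while (!duration.Done(/*increment=*/1)) {
//     // ... perform one operation ...
//   }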
2430 
2431 class Benchmark {
2432  private:
2433   std::shared_ptr<Cache> cache_;
2434   std::shared_ptr<Cache> compressed_cache_;
2435   std::shared_ptr<const FilterPolicy> filter_policy_;
2436   const SliceTransform* prefix_extractor_;
2437   DBWithColumnFamilies db_;
2438   std::vector<DBWithColumnFamilies> multi_dbs_;
2439   int64_t num_;
2440   int key_size_;
2441   int user_timestamp_size_;
2442   int prefix_size_;
2443   int64_t keys_per_prefix_;
2444   int64_t entries_per_batch_;
2445   int64_t writes_before_delete_range_;
2446   int64_t writes_per_range_tombstone_;
2447   int64_t range_tombstone_width_;
2448   int64_t max_num_range_tombstones_;
2449   WriteOptions write_options_;
2450   Options open_options_;  // keep options around to properly destroy db later
2451 #ifndef ROCKSDB_LITE
2452   TraceOptions trace_options_;
2453   TraceOptions block_cache_trace_options_;
2454 #endif
2455   int64_t reads_;
2456   int64_t deletes_;
2457   double read_random_exp_range_;
2458   int64_t writes_;
2459   int64_t readwrites_;
2460   int64_t merge_keys_;
2461   bool report_file_operations_;
2462   bool use_blob_db_;  // Stacked BlobDB
2463   std::vector<std::string> keys_;
2464 
2465   class ErrorHandlerListener : public EventListener {
2466    public:
2467 #ifndef ROCKSDB_LITE
2468     ErrorHandlerListener()
2469         : mutex_(),
2470           cv_(&mutex_),
2471           no_auto_recovery_(false),
2472           recovery_complete_(false) {}
2473 
2474     ~ErrorHandlerListener() override {}
2475 
2476     void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/,
2477                               Status /*bg_error*/,
2478                               bool* auto_recovery) override {
2479       if (*auto_recovery && no_auto_recovery_) {
2480         *auto_recovery = false;
2481       }
2482     }
2483 
2484     void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override {
2485       InstrumentedMutexLock l(&mutex_);
2486       recovery_complete_ = true;
2487       cv_.SignalAll();
2488     }
2489 
2490     bool WaitForRecovery(uint64_t abs_time_us) {
2491       InstrumentedMutexLock l(&mutex_);
2492       if (!recovery_complete_) {
2493         cv_.TimedWait(abs_time_us);
2494       }
2495       if (recovery_complete_) {
2496         recovery_complete_ = false;
2497         return true;
2498       }
2499       return false;
2500     }
2501 
2502     void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }
2503 
2504    private:
2505     InstrumentedMutex mutex_;
2506     InstrumentedCondVar cv_;
2507     bool no_auto_recovery_;
2508     bool recovery_complete_;
2509 #else   // ROCKSDB_LITE
2510     bool WaitForRecovery(uint64_t /*abs_time_us*/) { return true; }
2511     void EnableAutoRecovery(bool /*enable*/) {}
2512 #endif  // ROCKSDB_LITE
2513   };
2514 
2515   std::shared_ptr<ErrorHandlerListener> listener_;
2516 
2517   std::unique_ptr<TimestampEmulator> mock_app_clock_;
2518 
2519   bool SanityCheck() {
2520     if (FLAGS_compression_ratio > 1) {
2521       fprintf(stderr, "compression_ratio should be between 0 and 1\n");
2522       return false;
2523     }
2524     return true;
2525   }
2526 
2527   inline bool CompressSlice(const CompressionInfo& compression_info,
2528                             const Slice& input, std::string* compressed) {
2529     constexpr uint32_t compress_format_version = 2;
2530 
2531     return CompressData(input, compression_info, compress_format_version,
2532                         compressed);
2533   }
2534 
2535   void PrintHeader() {
2536     PrintEnvironment();
2537     fprintf(stdout,
2538             "Keys:       %d bytes each (+ %d bytes user-defined timestamp)\n",
2539             FLAGS_key_size, FLAGS_user_timestamp_size);
2540     auto avg_value_size = FLAGS_value_size;
2541     if (FLAGS_value_size_distribution_type_e == kFixed) {
2542       fprintf(stdout, "Values:     %d bytes each (%d bytes after compression)\n",
2543               avg_value_size,
2544               static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
2545     } else {
2546       avg_value_size = (FLAGS_value_size_min + FLAGS_value_size_max) / 2;
2547       fprintf(stdout, "Values:     %d avg bytes each (%d bytes after compression)\n",
2548               avg_value_size,
2549               static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
2550       fprintf(stdout, "Values Distribution: %s (min: %d, max: %d)\n",
2551               FLAGS_value_size_distribution_type.c_str(),
2552               FLAGS_value_size_min, FLAGS_value_size_max);
2553     }
2554     fprintf(stdout, "Entries:    %" PRIu64 "\n", num_);
2555     fprintf(stdout, "Prefix:    %d bytes\n", FLAGS_prefix_size);
2556     fprintf(stdout, "Keys per prefix:    %" PRIu64 "\n", keys_per_prefix_);
2557     fprintf(stdout, "RawSize:    %.1f MB (estimated)\n",
2558             ((static_cast<int64_t>(FLAGS_key_size + avg_value_size) * num_)
2559              / 1048576.0));
2560     fprintf(stdout, "FileSize:   %.1f MB (estimated)\n",
2561             (((FLAGS_key_size + avg_value_size * FLAGS_compression_ratio)
2562               * num_)
2563              / 1048576.0));
2564     fprintf(stdout, "Write rate: %" PRIu64 " bytes/second\n",
2565             FLAGS_benchmark_write_rate_limit);
2566     fprintf(stdout, "Read rate: %" PRIu64 " ops/second\n",
2567             FLAGS_benchmark_read_rate_limit);
2568     if (FLAGS_enable_numa) {
2569       fprintf(stderr, "Running in NUMA enabled mode.\n");
2570 #ifndef NUMA
2571       fprintf(stderr, "NUMA is not defined in the system.\n");
2572       exit(1);
2573 #else
2574       if (numa_available() == -1) {
2575         fprintf(stderr, "NUMA is not supported by the system.\n");
2576         exit(1);
2577       }
2578 #endif
2579     }
2580 
2581     auto compression = CompressionTypeToString(FLAGS_compression_type_e);
2582     fprintf(stdout, "Compression: %s\n", compression.c_str());
2583     fprintf(stdout, "Compression sampling rate: %" PRId64 "\n",
2584             FLAGS_sample_for_compression);
2585 
2586     switch (FLAGS_rep_factory) {
2587       case kPrefixHash:
2588         fprintf(stdout, "Memtablerep: prefix_hash\n");
2589         break;
2590       case kSkipList:
2591         fprintf(stdout, "Memtablerep: skip_list\n");
2592         break;
2593       case kVectorRep:
2594         fprintf(stdout, "Memtablerep: vector\n");
2595         break;
2596       case kHashLinkedList:
2597         fprintf(stdout, "Memtablerep: hash_linkedlist\n");
2598         break;
2599     }
2600     fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);
2601 
2602     PrintWarnings(compression.c_str());
2603     fprintf(stdout, "------------------------------------------------\n");
2604   }
2605 
2606   void PrintWarnings(const char* compression) {
2607 #if defined(__GNUC__) && !defined(__OPTIMIZE__)
2608     fprintf(stdout,
2609             "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
2610             );
2611 #endif
2612 #ifndef NDEBUG
2613     fprintf(stdout,
2614             "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
2615 #endif
2616     if (FLAGS_compression_type_e != ROCKSDB_NAMESPACE::kNoCompression) {
2617       // The test string should not be too small.
2618       const int len = FLAGS_block_size;
2619       std::string input_str(len, 'y');
2620       std::string compressed;
2621       CompressionOptions opts;
2622       CompressionContext context(FLAGS_compression_type_e);
2623       CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
2624                            FLAGS_compression_type_e,
2625                            FLAGS_sample_for_compression);
2626       bool result = CompressSlice(info, Slice(input_str), &compressed);
2627 
2628       if (!result) {
2629         fprintf(stdout, "WARNING: %s compression is not enabled\n",
2630                 compression);
2631       } else if (compressed.size() >= input_str.size()) {
2632         fprintf(stdout, "WARNING: %s compression is not effective\n",
2633                 compression);
2634       }
2635     }
2636   }
2637 
2638 // Currently the following isn't equivalent to OS_LINUX.
2639 #if defined(__linux)
2640   static Slice TrimSpace(Slice s) {
2641     unsigned int start = 0;
2642     while (start < s.size() && isspace(s[start])) {
2643       start++;
2644     }
2645     unsigned int limit = static_cast<unsigned int>(s.size());
2646     while (limit > start && isspace(s[limit-1])) {
2647       limit--;
2648     }
2649     return Slice(s.data() + start, limit - start);
2650   }
2651 #endif
2652 
2653   void PrintEnvironment() {
2654     fprintf(stderr, "RocksDB:    version %d.%d\n",
2655             kMajorVersion, kMinorVersion);
2656 
2657 #if defined(__linux) || defined(__APPLE__) || defined(__FreeBSD__)
2658     time_t now = time(nullptr);
2659     char buf[52];
2660     // Lint complains about ctime() usage, so replace it with ctime_r(). The
2661     // requirement is to provide a buffer which is at least 26 bytes.
2662     fprintf(stderr, "Date:       %s",
2663             ctime_r(&now, buf));  // ctime_r() adds newline
2664 
2665 #if defined(__linux)
2666     FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
2667     if (cpuinfo != nullptr) {
2668       char line[1000];
2669       int num_cpus = 0;
2670       std::string cpu_type;
2671       std::string cache_size;
2672       while (fgets(line, sizeof(line), cpuinfo) != nullptr) {
2673         const char* sep = strchr(line, ':');
2674         if (sep == nullptr) {
2675           continue;
2676         }
2677         Slice key = TrimSpace(Slice(line, sep - 1 - line));
2678         Slice val = TrimSpace(Slice(sep + 1));
2679         if (key == "model name") {
2680           ++num_cpus;
2681           cpu_type = val.ToString();
2682         } else if (key == "cache size") {
2683           cache_size = val.ToString();
2684         }
2685       }
2686       fclose(cpuinfo);
2687       fprintf(stderr, "CPU:        %d * %s\n", num_cpus, cpu_type.c_str());
2688       fprintf(stderr, "CPUCache:   %s\n", cache_size.c_str());
2689     }
2690 #elif defined(__APPLE__)
2691     struct host_basic_info h;
2692     size_t hlen = HOST_BASIC_INFO_COUNT;
2693     if (host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&h,
2694                   (uint32_t*)&hlen) == KERN_SUCCESS) {
2695       std::string cpu_type;
2696       std::string cache_size;
2697       size_t hcache_size;
2698       hlen = sizeof(hcache_size);
2699       if (sysctlbyname("hw.cachelinesize", &hcache_size, &hlen, NULL, 0) == 0) {
2700         cache_size = std::to_string(hcache_size);
2701       }
2702       switch (h.cpu_type) {
2703         case CPU_TYPE_X86_64:
2704           cpu_type = "x86_64";
2705           break;
2706         case CPU_TYPE_ARM64:
2707           cpu_type = "arm64";
2708           break;
2709         default:
2710           break;
2711       }
2712       fprintf(stderr, "CPU:        %d * %s\n", h.max_cpus, cpu_type.c_str());
2713       fprintf(stderr, "CPUCache:   %s\n", cache_size.c_str());
2714     }
2715 #elif defined(__FreeBSD__)
2716     int ncpus;
2717     size_t len = sizeof(ncpus);
2718     int mib[2] = {CTL_HW, HW_NCPU};
2719     if (sysctl(mib, 2, &ncpus, &len, nullptr, 0) == 0) {
2720       char cpu_type[16];
2721       len = sizeof(cpu_type) - 1;
2722       mib[1] = HW_MACHINE;
2723       if (sysctl(mib, 2, cpu_type, &len, nullptr, 0) == 0) cpu_type[len] = 0;
2724 
2725       fprintf(stderr, "CPU:        %d * %s\n", ncpus, cpu_type);
2726       // no programmatic way to get the cache line size except on PPC
2727     }
2728 #endif
2729 #endif
2730   }
2731 
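  // Decodes the 8-byte big-endian timestamp stored at byte offset 8 of the
  // key (as laid out by the timeseries benchmark) and reports whether it is
  // older than --time_range according to the emulated clock.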
  static bool KeyExpired(const TimestampEmulator* timestamp_emulator,
                         const Slice& key) {
    const char* pos = key.data();
    pos += 8;
    uint64_t timestamp = 0;
    if (port::kLittleEndian) {
      int bytes_to_fill = 8;
      for (int i = 0; i < bytes_to_fill; ++i) {
        timestamp |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i]))
                      << ((bytes_to_fill - i - 1) << 3));
      }
    } else {
      memcpy(&timestamp, pos, sizeof(timestamp));
    }
    return timestamp_emulator->Get() - timestamp > FLAGS_time_range;
  }

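  // Compaction filter that drops any key whose embedded timestamp has
  // expired according to KeyExpired(); installed by the timeseries benchmark
  // when --expire_style=compaction_filter.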
  class ExpiredTimeFilter : public CompactionFilter {
   public:
    explicit ExpiredTimeFilter(
        const std::shared_ptr<TimestampEmulator>& timestamp_emulator)
        : timestamp_emulator_(timestamp_emulator) {}
    bool Filter(int /*level*/, const Slice& key,
                const Slice& /*existing_value*/, std::string* /*new_value*/,
                bool* /*value_changed*/) const override {
      return KeyExpired(timestamp_emulator_.get(), key);
    }
    const char* Name() const override { return "ExpiredTimeFilter"; }

   private:
    std::shared_ptr<TimestampEmulator> timestamp_emulator_;
  };

  class KeepFilter : public CompactionFilter {
   public:
    bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
                std::string* /*new_value*/,
                bool* /*value_changed*/) const override {
      return false;
    }

    const char* Name() const override { return "KeepFilter"; }
  };

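  // Builds the block cache from command-line flags. A non-positive capacity
  // disables the cache (callers treat nullptr as "no block cache").
  // --use_clock_cache selects ClockCache; otherwise an LRUCache is built,
  // optionally backed by a memkind allocator and/or a secondary cache.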
  std::shared_ptr<Cache> NewCache(int64_t capacity) {
    if (capacity <= 0) {
      return nullptr;
    }
    if (FLAGS_use_clock_cache) {
      auto cache = NewClockCache(static_cast<size_t>(capacity),
                                 FLAGS_cache_numshardbits);
      if (!cache) {
        fprintf(stderr, "Clock cache not supported.");
        exit(1);
      }
      return cache;
    } else {
      LRUCacheOptions opts(
          static_cast<size_t>(capacity), FLAGS_cache_numshardbits,
          false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio,
#ifdef MEMKIND
          FLAGS_use_cache_memkind_kmem_allocator
              ? std::make_shared<MemkindKmemAllocator>()
              : nullptr
#else
          nullptr
#endif
      );
      if (FLAGS_use_cache_memkind_kmem_allocator) {
#ifndef MEMKIND
        fprintf(stderr, "Memkind library is not linked with the binary.");
        exit(1);
#endif
      }
#ifndef ROCKSDB_LITE
      if (!FLAGS_secondary_cache_uri.empty()) {
        Status s =
            ObjectRegistry::NewInstance()->NewSharedObject<SecondaryCache>(
                FLAGS_secondary_cache_uri, &secondary_cache);
        if (secondary_cache == nullptr) {
          fprintf(
              stderr,
              "No secondary cache registered matching string: %s status=%s\n",
              FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str());
          exit(1);
        }
        opts.secondary_cache = secondary_cache;
      }
#endif  // ROCKSDB_LITE
      return NewLRUCache(opts);
    }
  }

 public:
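  // The filter policy is chosen from flags: Ribbon when --use_ribbon_filter
  // is set, otherwise Bloom when --bloom_bits >= 0, otherwise none.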
  Benchmark()
      : cache_(NewCache(FLAGS_cache_size)),
        compressed_cache_(NewCache(FLAGS_compressed_cache_size)),
        filter_policy_(
            FLAGS_use_ribbon_filter
                ? NewExperimentalRibbonFilterPolicy(FLAGS_bloom_bits)
                : FLAGS_bloom_bits >= 0
                      ? NewBloomFilterPolicy(FLAGS_bloom_bits,
                                             FLAGS_use_block_based_filter)
                      : nullptr),
        prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)),
        num_(FLAGS_num),
        key_size_(FLAGS_key_size),
        user_timestamp_size_(FLAGS_user_timestamp_size),
        prefix_size_(FLAGS_prefix_size),
        keys_per_prefix_(FLAGS_keys_per_prefix),
        entries_per_batch_(1),
        reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
        read_random_exp_range_(0.0),
        writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
        readwrites_(
            (FLAGS_writes < 0 && FLAGS_reads < 0)
                ? FLAGS_num
                : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)),
        merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
        report_file_operations_(FLAGS_report_file_operations),
#ifndef ROCKSDB_LITE
        use_blob_db_(FLAGS_use_blob_db)  // Stacked BlobDB
#else
        use_blob_db_(false)  // Stacked BlobDB
#endif  // !ROCKSDB_LITE
  {
    // use simcache instead of cache
    if (FLAGS_simcache_size >= 0) {
      if (FLAGS_cache_numshardbits >= 1) {
        cache_ =
            NewSimCache(cache_, FLAGS_simcache_size, FLAGS_cache_numshardbits);
      } else {
        cache_ = NewSimCache(cache_, FLAGS_simcache_size, 0);
      }
    }

    if (report_file_operations_) {
      if (!FLAGS_hdfs.empty()) {
        fprintf(stderr,
                "--hdfs and --report_file_operations cannot be enabled "
                "at the same time");
        exit(1);
      }
      FLAGS_env = new ReportFileOpEnv(FLAGS_env);
    }

    if (FLAGS_prefix_size > FLAGS_key_size) {
      fprintf(stderr, "prefix size is larger than key size");
      exit(1);
    }

    std::vector<std::string> files;
    FLAGS_env->GetChildren(FLAGS_db, &files);
    for (size_t i = 0; i < files.size(); i++) {
      if (Slice(files[i]).starts_with("heap-")) {
        FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
      }
    }
    if (!FLAGS_use_existing_db) {
      Options options;
      options.env = FLAGS_env;
      if (!FLAGS_wal_dir.empty()) {
        options.wal_dir = FLAGS_wal_dir;
      }
#ifndef ROCKSDB_LITE
      if (use_blob_db_) {
        // Stacked BlobDB
        blob_db::DestroyBlobDB(FLAGS_db, options, blob_db::BlobDBOptions());
      }
#endif  // !ROCKSDB_LITE
      DestroyDB(FLAGS_db, options);
      if (!FLAGS_wal_dir.empty()) {
        FLAGS_env->DeleteDir(FLAGS_wal_dir);
      }

      if (FLAGS_num_multi_db > 1) {
        FLAGS_env->CreateDir(FLAGS_db);
        if (!FLAGS_wal_dir.empty()) {
          FLAGS_env->CreateDir(FLAGS_wal_dir);
        }
      }
    }

    listener_.reset(new ErrorHandlerListener());
    if (user_timestamp_size_ > 0) {
      mock_app_clock_.reset(new TimestampEmulator());
    }
  }

  ~Benchmark() {
    db_.DeleteDBs();
    for (auto db : multi_dbs_) {
      db.DeleteDBs();
    }
    delete prefix_extractor_;
    if (cache_.get() != nullptr) {
      // Clear cache reference first
      open_options_.write_buffer_manager.reset();
      // this will leak, but we're shutting down so nobody cares
      cache_->DisownData();
    }
  }

  Slice AllocateKey(std::unique_ptr<const char[]>* key_guard) {
    char* data = new char[key_size_];
    const char* const_data = data;
    key_guard->reset(const_data);
    return Slice(key_guard->get(), key_size_);
  }

  // Generate key according to the given specification and random number.
  // The resulting key will have the following format:
  //   - If keys_per_prefix_ is positive, extra trailing bytes are either cut
  //     off or padded with '0'.
  //     The prefix value is derived from key value.
  //     ----------------------------
  //     | prefix 00000 | key 00000 |
  //     ----------------------------
  //
  //   - If keys_per_prefix_ is 0, the key is simply a binary representation of
  //     random number followed by trailing '0's
  //     ----------------------------
  //     |        key 00000         |
  //     ----------------------------
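  //
  // Both the prefix and the key body are written most-significant byte first
  // (big-endian), so the lexicographic order of generated keys matches the
  // numeric order of v.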
  void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
    if (!keys_.empty()) {
      assert(FLAGS_use_existing_keys);
      assert(keys_.size() == static_cast<size_t>(num_keys));
      assert(v < static_cast<uint64_t>(num_keys));
      *key = keys_[v];
      return;
    }
    char* start = const_cast<char*>(key->data());
    char* pos = start;
    if (keys_per_prefix_ > 0) {
      int64_t num_prefix = num_keys / keys_per_prefix_;
      int64_t prefix = v % num_prefix;
      int bytes_to_fill = std::min(prefix_size_, 8);
      if (port::kLittleEndian) {
        for (int i = 0; i < bytes_to_fill; ++i) {
          pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
        }
      } else {
        memcpy(pos, static_cast<void*>(&prefix), bytes_to_fill);
      }
      if (prefix_size_ > 8) {
        // fill the rest with 0s
        memset(pos + 8, '0', prefix_size_ - 8);
      }
      pos += prefix_size_;
    }

    int bytes_to_fill = std::min(key_size_ - static_cast<int>(pos - start), 8);
    if (port::kLittleEndian) {
      for (int i = 0; i < bytes_to_fill; ++i) {
        pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
      }
    } else {
      memcpy(pos, static_cast<void*>(&v), bytes_to_fill);
    }
    pos += bytes_to_fill;
    if (key_size_ > pos - start) {
      memset(pos, '0', key_size_ - (pos - start));
    }
  }

  void GenerateKeyFromIntForSeek(uint64_t v, int64_t num_keys, Slice* key) {
    GenerateKeyFromInt(v, num_keys, key);
    if (FLAGS_seek_missing_prefix) {
      assert(prefix_size_ > 8);
      char* key_ptr = const_cast<char*>(key->data());
      // This relies on GenerateKeyFromInt filling padding with '0's.
      // Putting a '1' will create a non-existing prefix.
      key_ptr[8] = '1';
    }
  }

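  // Appends a path separator (if one is missing) and the numeric id, e.g.
  // GetPathForMultiple("/tmp/db", 2) returns "/tmp/db/2" on POSIX.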
  std::string GetPathForMultiple(std::string base_name, size_t id) {
    if (!base_name.empty()) {
#ifndef OS_WIN
      if (base_name.back() != '/') {
        base_name += '/';
      }
#else
      if (base_name.back() != '\\') {
        base_name += '\\';
      }
#endif
    }
    return base_name + ToString(id);
  }

  void VerifyDBFromDB(std::string& truth_db_name) {
    DBWithColumnFamilies truth_db;
    auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db);
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
      exit(1);
    }
    ReadOptions ro;
    ro.total_order_seek = true;
    std::unique_ptr<Iterator> truth_iter(truth_db.db->NewIterator(ro));
    std::unique_ptr<Iterator> db_iter(db_.db->NewIterator(ro));
    // Verify that all the key/values in truth_db are retrievable in db with
    // ::Get
    fprintf(stderr, "Verifying db >= truth_db with ::Get...\n");
    for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) {
      std::string value;
      s = db_.db->Get(ro, truth_iter->key(), &value);
      assert(s.ok());
      // TODO(myabandeh): provide debugging hints
      assert(Slice(value) == truth_iter->value());
    }
    // Verify that the db iterator does not give any extra key/value
    fprintf(stderr, "Verifying db == truth_db...\n");
    for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid();
         db_iter->Next(), truth_iter->Next()) {
      assert(truth_iter->Valid());
      assert(truth_iter->value() == db_iter->value());
    }
    // No more keys should be left unchecked in truth_db
    assert(!truth_iter->Valid());
    fprintf(stderr, "...Verified\n");
  }

  void ErrorExit() {
    db_.DeleteDBs();
    for (size_t i = 0; i < multi_dbs_.size(); i++) {
      delete multi_dbs_[i].db;
    }
    exit(1);
  }

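  // Main driver: opens the DB, then walks the comma-separated --benchmarks
  // list, re-sanitizing per-run parameters and dispatching each name to its
  // Benchmark method (optionally on a fresh DB and with tracing enabled).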
  void Run() {
    if (!SanityCheck()) {
      ErrorExit();
    }
    Open(&open_options_);
    PrintHeader();
    std::stringstream benchmark_stream(FLAGS_benchmarks);
    std::string name;
    std::unique_ptr<ExpiredTimeFilter> filter;
    while (std::getline(benchmark_stream, name, ',')) {
      // Sanitize parameters
      num_ = FLAGS_num;
      reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
      writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes);
      deletes_ = (FLAGS_deletes < 0 ? FLAGS_num : FLAGS_deletes);
      value_size = FLAGS_value_size;
      key_size_ = FLAGS_key_size;
      entries_per_batch_ = FLAGS_batch_size;
      writes_before_delete_range_ = FLAGS_writes_before_delete_range;
      writes_per_range_tombstone_ = FLAGS_writes_per_range_tombstone;
      range_tombstone_width_ = FLAGS_range_tombstone_width;
      max_num_range_tombstones_ = FLAGS_max_num_range_tombstones;
      write_options_ = WriteOptions();
      read_random_exp_range_ = FLAGS_read_random_exp_range;
      if (FLAGS_sync) {
        write_options_.sync = true;
      }
      write_options_.disableWAL = FLAGS_disable_wal;

      void (Benchmark::*method)(ThreadState*) = nullptr;
      void (Benchmark::*post_process_method)() = nullptr;

      bool fresh_db = false;
      int num_threads = FLAGS_threads;

      int num_repeat = 1;
      int num_warmup = 0;
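      // A benchmark name may carry a bracketed argument list, e.g.
      // "readrandom[X3-W1]" runs one warm-up pass (W1) followed by three
      // measured repetitions (X3).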
      if (!name.empty() && *name.rbegin() == ']') {
        auto it = name.find('[');
        if (it == std::string::npos) {
          fprintf(stderr, "unknown benchmark arguments '%s'\n", name.c_str());
          ErrorExit();
        }
        std::string args = name.substr(it + 1);
        args.resize(args.size() - 1);
        name.resize(it);

        std::string bench_arg;
        std::stringstream args_stream(args);
        while (std::getline(args_stream, bench_arg, '-')) {
          if (bench_arg.empty()) {
            continue;
          }
          if (bench_arg[0] == 'X') {
            // Repeat the benchmark n times
            std::string num_str = bench_arg.substr(1);
            num_repeat = std::stoi(num_str);
          } else if (bench_arg[0] == 'W') {
            // Warm up the benchmark n times
            std::string num_str = bench_arg.substr(1);
            num_warmup = std::stoi(num_str);
          }
        }
      }

      // Both fillseqdeterministic and filluniquerandomdeterministic
      // fill the levels except the max level with UNIQUE_RANDOM
      // and fill the max level with fillseq and filluniquerandom, respectively
      if (name == "fillseqdeterministic" ||
          name == "filluniquerandomdeterministic") {
        if (!FLAGS_disable_auto_compactions) {
          fprintf(stderr,
                  "Please disable_auto_compactions in FillDeterministic "
                  "benchmark\n");
          ErrorExit();
        }
        if (num_threads > 1) {
          fprintf(stderr,
                  "filldeterministic multithreaded not supported"
                  ", use 1 thread\n");
          num_threads = 1;
        }
        fresh_db = true;
        if (name == "fillseqdeterministic") {
          method = &Benchmark::WriteSeqDeterministic;
        } else {
          method = &Benchmark::WriteUniqueRandomDeterministic;
        }
      } else if (name == "fillseq") {
        fresh_db = true;
        method = &Benchmark::WriteSeq;
      } else if (name == "fillbatch") {
        fresh_db = true;
        entries_per_batch_ = 1000;
        method = &Benchmark::WriteSeq;
      } else if (name == "fillrandom") {
        fresh_db = true;
        method = &Benchmark::WriteRandom;
      } else if (name == "filluniquerandom") {
        fresh_db = true;
        if (num_threads > 1) {
          fprintf(stderr,
                  "filluniquerandom multithreaded not supported"
                  ", use 1 thread");
          num_threads = 1;
        }
        method = &Benchmark::WriteUniqueRandom;
      } else if (name == "overwrite") {
        method = &Benchmark::WriteRandom;
      } else if (name == "fillsync") {
        fresh_db = true;
        num_ /= 1000;
        write_options_.sync = true;
        method = &Benchmark::WriteRandom;
      } else if (name == "fill100K") {
        fresh_db = true;
        num_ /= 1000;
        value_size = 100 * 1000;
        method = &Benchmark::WriteRandom;
      } else if (name == "readseq") {
        method = &Benchmark::ReadSequential;
      } else if (name == "readtorowcache") {
        if (!FLAGS_use_existing_keys || !FLAGS_row_cache_size) {
          fprintf(stderr,
                  "Please set use_existing_keys to true and specify a "
                  "row cache size in readtorowcache benchmark\n");
          ErrorExit();
        }
        method = &Benchmark::ReadToRowCache;
      } else if (name == "readtocache") {
        method = &Benchmark::ReadSequential;
        num_threads = 1;
        reads_ = num_;
      } else if (name == "readreverse") {
        method = &Benchmark::ReadReverse;
      } else if (name == "readrandom") {
        if (FLAGS_multiread_stride) {
          fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                  entries_per_batch_);
        }
        method = &Benchmark::ReadRandom;
      } else if (name == "readrandomfast") {
        method = &Benchmark::ReadRandomFast;
      } else if (name == "multireadrandom") {
        fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                entries_per_batch_);
        method = &Benchmark::MultiReadRandom;
      } else if (name == "approximatesizerandom") {
        fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                entries_per_batch_);
        method = &Benchmark::ApproximateSizeRandom;
      } else if (name == "mixgraph") {
        method = &Benchmark::MixGraph;
      } else if (name == "readmissing") {
        ++key_size_;
        method = &Benchmark::ReadRandom;
      } else if (name == "newiterator") {
        method = &Benchmark::IteratorCreation;
      } else if (name == "newiteratorwhilewriting") {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::IteratorCreationWhileWriting;
      } else if (name == "seekrandom") {
        method = &Benchmark::SeekRandom;
      } else if (name == "seekrandomwhilewriting") {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::SeekRandomWhileWriting;
      } else if (name == "seekrandomwhilemerging") {
        num_threads++;  // Add extra thread for merging
        method = &Benchmark::SeekRandomWhileMerging;
      } else if (name == "readrandomsmall") {
        reads_ /= 1000;
        method = &Benchmark::ReadRandom;
      } else if (name == "deleteseq") {
        method = &Benchmark::DeleteSeq;
      } else if (name == "deleterandom") {
        method = &Benchmark::DeleteRandom;
      } else if (name == "readwhilewriting") {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::ReadWhileWriting;
      } else if (name == "readwhilemerging") {
        num_threads++;  // Add extra thread for merging
        method = &Benchmark::ReadWhileMerging;
      } else if (name == "readwhilescanning") {
        num_threads++;  // Add extra thread for scanning
        method = &Benchmark::ReadWhileScanning;
      } else if (name == "readrandomwriterandom") {
        method = &Benchmark::ReadRandomWriteRandom;
      } else if (name == "readrandommergerandom") {
        if (FLAGS_merge_operator.empty()) {
          fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
                  name.c_str());
          ErrorExit();
        }
        method = &Benchmark::ReadRandomMergeRandom;
      } else if (name == "updaterandom") {
        method = &Benchmark::UpdateRandom;
      } else if (name == "xorupdaterandom") {
        method = &Benchmark::XORUpdateRandom;
      } else if (name == "appendrandom") {
        method = &Benchmark::AppendRandom;
      } else if (name == "mergerandom") {
        if (FLAGS_merge_operator.empty()) {
          fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
                  name.c_str());
          exit(1);
        }
        method = &Benchmark::MergeRandom;
      } else if (name == "randomwithverify") {
        method = &Benchmark::RandomWithVerify;
      } else if (name == "fillseekseq") {
        method = &Benchmark::WriteSeqSeekSeq;
      } else if (name == "compact") {
        method = &Benchmark::Compact;
      } else if (name == "compactall") {
        CompactAll();
#ifndef ROCKSDB_LITE
      } else if (name == "compact0") {
        CompactLevel(0);
      } else if (name == "compact1") {
        CompactLevel(1);
      } else if (name == "waitforcompaction") {
        WaitForCompaction();
#endif
      } else if (name == "flush") {
        Flush();
      } else if (name == "crc32c") {
        method = &Benchmark::Crc32c;
      } else if (name == "xxhash") {
        method = &Benchmark::xxHash;
      } else if (name == "acquireload") {
        method = &Benchmark::AcquireLoad;
      } else if (name == "compress") {
        method = &Benchmark::Compress;
      } else if (name == "uncompress") {
        method = &Benchmark::Uncompress;
#ifndef ROCKSDB_LITE
      } else if (name == "randomtransaction") {
        method = &Benchmark::RandomTransaction;
        post_process_method = &Benchmark::RandomTransactionVerify;
#endif  // ROCKSDB_LITE
      } else if (name == "randomreplacekeys") {
        fresh_db = true;
        method = &Benchmark::RandomReplaceKeys;
      } else if (name == "timeseries") {
        timestamp_emulator_.reset(new TimestampEmulator());
        if (FLAGS_expire_style == "compaction_filter") {
          filter.reset(new ExpiredTimeFilter(timestamp_emulator_));
          fprintf(stdout, "Compaction filter is used to remove expired data");
          open_options_.compaction_filter = filter.get();
        }
        fresh_db = true;
        method = &Benchmark::TimeSeries;
      } else if (name == "stats") {
        PrintStats("rocksdb.stats");
      } else if (name == "resetstats") {
        ResetStats();
      } else if (name == "verify") {
        VerifyDBFromDB(FLAGS_truth_db);
      } else if (name == "levelstats") {
        PrintStats("rocksdb.levelstats");
      } else if (name == "memstats") {
        std::vector<std::string> keys{"rocksdb.num-immutable-mem-table",
                                      "rocksdb.cur-size-active-mem-table",
                                      "rocksdb.cur-size-all-mem-tables",
                                      "rocksdb.size-all-mem-tables",
                                      "rocksdb.num-entries-active-mem-table",
                                      "rocksdb.num-entries-imm-mem-tables"};
        PrintStats(keys);
      } else if (name == "sstables") {
        PrintStats("rocksdb.sstables");
      } else if (name == "stats_history") {
        PrintStatsHistory();
      } else if (name == "replay") {
        if (num_threads > 1) {
          fprintf(stderr, "Multi-threaded replay is not yet supported\n");
          ErrorExit();
        }
        if (FLAGS_trace_file == "") {
          fprintf(stderr, "Please set --trace_file to be replayed from\n");
          ErrorExit();
        }
        method = &Benchmark::Replay;
      } else if (name == "getmergeoperands") {
        method = &Benchmark::GetMergeOperands;
      } else if (!name.empty()) {  // No error message for empty name
        fprintf(stderr, "unknown benchmark '%s'\n", name.c_str());
        ErrorExit();
      }

      if (fresh_db) {
        if (FLAGS_use_existing_db) {
          fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
                  name.c_str());
          method = nullptr;
        } else {
          if (db_.db != nullptr) {
            db_.DeleteDBs();
            DestroyDB(FLAGS_db, open_options_);
          }
          Options options = open_options_;
          for (size_t i = 0; i < multi_dbs_.size(); i++) {
            delete multi_dbs_[i].db;
            if (!open_options_.wal_dir.empty()) {
              options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i);
            }
            DestroyDB(GetPathForMultiple(FLAGS_db, i), options);
          }
          multi_dbs_.clear();
        }
        Open(&open_options_);  // use open_options for the last accessed
      }

      if (method != nullptr) {
        fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());

#ifndef ROCKSDB_LITE
        // A trace_file option can be provided both for trace and replay
        // operations. But db_bench does not support tracing and replaying at
        // the same time, for now. So, start tracing only when it is not a
        // replay.
        if (FLAGS_trace_file != "" && name != "replay") {
          std::unique_ptr<TraceWriter> trace_writer;
          Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
                                        FLAGS_trace_file, &trace_writer);
          if (!s.ok()) {
            fprintf(stderr, "Encountered an error starting a trace, %s\n",
                    s.ToString().c_str());
            ErrorExit();
          }
          s = db_.db->StartTrace(trace_options_, std::move(trace_writer));
          if (!s.ok()) {
            fprintf(stderr, "Encountered an error starting a trace, %s\n",
                    s.ToString().c_str());
            ErrorExit();
          }
          fprintf(stdout, "Tracing the workload to: [%s]\n",
                  FLAGS_trace_file.c_str());
        }
        // Start block cache tracing.
        if (!FLAGS_block_cache_trace_file.empty()) {
          // Sanity checks.
          if (FLAGS_block_cache_trace_sampling_frequency <= 0) {
            fprintf(stderr,
                    "Block cache trace sampling frequency must be higher than "
                    "0.\n");
            ErrorExit();
          }
          if (FLAGS_block_cache_trace_max_trace_file_size_in_bytes <= 0) {
            fprintf(stderr,
                    "The maximum file size for block cache tracing must be "
                    "higher than 0.\n");
            ErrorExit();
          }
          block_cache_trace_options_.max_trace_file_size =
              FLAGS_block_cache_trace_max_trace_file_size_in_bytes;
          block_cache_trace_options_.sampling_frequency =
              FLAGS_block_cache_trace_sampling_frequency;
          std::unique_ptr<TraceWriter> block_cache_trace_writer;
          Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
                                        FLAGS_block_cache_trace_file,
                                        &block_cache_trace_writer);
          if (!s.ok()) {
            fprintf(stderr,
                    "Encountered an error when creating trace writer, %s\n",
                    s.ToString().c_str());
            ErrorExit();
          }
          s = db_.db->StartBlockCacheTrace(block_cache_trace_options_,
                                           std::move(block_cache_trace_writer));
          if (!s.ok()) {
            fprintf(
                stderr,
                "Encountered an error when starting block cache tracing, %s\n",
                s.ToString().c_str());
            ErrorExit();
          }
          fprintf(stdout, "Tracing block cache accesses to: [%s]\n",
                  FLAGS_block_cache_trace_file.c_str());
        }
#endif  // ROCKSDB_LITE

        if (num_warmup > 0) {
          printf("Warming up benchmark by running %d times\n", num_warmup);
        }

        for (int i = 0; i < num_warmup; i++) {
          RunBenchmark(num_threads, name, method);
        }

        if (num_repeat > 1) {
          printf("Running benchmark for %d times\n", num_repeat);
        }

        CombinedStats combined_stats;
        for (int i = 0; i < num_repeat; i++) {
          Stats stats = RunBenchmark(num_threads, name, method);
          combined_stats.AddStats(stats);
        }
        if (num_repeat > 1) {
          combined_stats.Report(name);
        }
      }
      if (post_process_method != nullptr) {
        (this->*post_process_method)();
      }
    }

    if (secondary_update_thread_) {
      secondary_update_stopped_.store(1, std::memory_order_relaxed);
      secondary_update_thread_->join();
      secondary_update_thread_.reset();
    }

#ifndef ROCKSDB_LITE
    if (name != "replay" && FLAGS_trace_file != "") {
      Status s = db_.db->EndTrace();
      if (!s.ok()) {
        fprintf(stderr, "Encountered an error ending the trace, %s\n",
                s.ToString().c_str());
      }
    }
    if (!FLAGS_block_cache_trace_file.empty()) {
      Status s = db_.db->EndBlockCacheTrace();
      if (!s.ok()) {
        fprintf(stderr,
                "Encountered an error ending the block cache tracing, %s\n",
                s.ToString().c_str());
      }
    }
#endif  // ROCKSDB_LITE

    if (FLAGS_statistics) {
      fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
    }
    if (FLAGS_simcache_size >= 0) {
      fprintf(
          stdout, "SIMULATOR CACHE STATISTICS:\n%s\n",
          static_cast_with_check<SimCache>(cache_.get())->ToString().c_str());
    }

#ifndef ROCKSDB_LITE
    if (FLAGS_use_secondary_db) {
      fprintf(stdout, "Secondary instance updated  %" PRIu64 " times.\n",
              secondary_db_updates_);
    }
#endif  // ROCKSDB_LITE
  }

 private:
  std::shared_ptr<TimestampEmulator> timestamp_emulator_;
  std::unique_ptr<port::Thread> secondary_update_thread_;
  std::atomic<int> secondary_update_stopped_{0};
#ifndef ROCKSDB_LITE
  uint64_t secondary_db_updates_ = 0;
#endif  // ROCKSDB_LITE
  struct ThreadArg {
    Benchmark* bm;
    SharedState* shared;
    ThreadState* thread;
    void (Benchmark::*method)(ThreadState*);
  };

  static void ThreadBody(void* v) {
    ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
    SharedState* shared = arg->shared;
    ThreadState* thread = arg->thread;
    {
      MutexLock l(&shared->mu);
      shared->num_initialized++;
      if (shared->num_initialized >= shared->total) {
        shared->cv.SignalAll();
      }
      while (!shared->start) {
        shared->cv.Wait();
      }
    }

    SetPerfLevel(static_cast<PerfLevel>(shared->perf_level));
    perf_context.EnablePerLevelPerfContext();
    thread->stats.Start(thread->tid);
    (arg->bm->*(arg->method))(thread);
    thread->stats.Stop();

    {
      MutexLock l(&shared->mu);
      shared->num_done++;
      if (shared->num_done >= shared->total) {
        shared->cv.SignalAll();
      }
    }
  }

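  // Spawns n threads running `method`, synchronizing them on SharedState so
  // they all start measuring together: each thread bumps num_initialized and
  // blocks on the condition variable until the main thread flips
  // shared.start; the per-thread stats are then merged and returned.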
  Stats RunBenchmark(int n, Slice name,
                     void (Benchmark::*method)(ThreadState*)) {
    SharedState shared;
    shared.total = n;
    shared.num_initialized = 0;
    shared.num_done = 0;
    shared.start = false;
    if (FLAGS_benchmark_write_rate_limit > 0) {
      shared.write_rate_limiter.reset(
          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
    }
    if (FLAGS_benchmark_read_rate_limit > 0) {
      shared.read_rate_limiter.reset(NewGenericRateLimiter(
          FLAGS_benchmark_read_rate_limit, 100000 /* refill_period_us */,
          10 /* fairness */, RateLimiter::Mode::kReadsOnly));
    }

    std::unique_ptr<ReporterAgent> reporter_agent;
    if (FLAGS_report_interval_seconds > 0) {
      reporter_agent.reset(new ReporterAgent(FLAGS_env, FLAGS_report_file,
                                             FLAGS_report_interval_seconds));
    }

    ThreadArg* arg = new ThreadArg[n];

    for (int i = 0; i < n; i++) {
#ifdef NUMA
      if (FLAGS_enable_numa) {
        // Performs a local allocation of memory to threads in the NUMA node.
        int n_nodes = numa_num_task_nodes();  // Number of nodes in NUMA.
        numa_exit_on_error = 1;
        int numa_node = i % n_nodes;
        bitmask* nodes = numa_allocate_nodemask();
        numa_bitmask_clearall(nodes);
        numa_bitmask_setbit(nodes, numa_node);
        // The numa_bind() call binds the process to the node, and these
        // properties are passed on to the thread created by the StartThread
        // method later in the loop.
        numa_bind(nodes);
        numa_set_strict(1);
        numa_free_nodemask(nodes);
      }
#endif
      arg[i].bm = this;
      arg[i].method = method;
      arg[i].shared = &shared;
      arg[i].thread = new ThreadState(i);
      arg[i].thread->stats.SetReporterAgent(reporter_agent.get());
      arg[i].thread->shared = &shared;
      FLAGS_env->StartThread(ThreadBody, &arg[i]);
    }

    shared.mu.Lock();
    while (shared.num_initialized < n) {
      shared.cv.Wait();
    }

    shared.start = true;
    shared.cv.SignalAll();
    while (shared.num_done < n) {
      shared.cv.Wait();
    }
    shared.mu.Unlock();

    // Stats for some threads can be excluded.
    Stats merge_stats;
    for (int i = 0; i < n; i++) {
      merge_stats.Merge(arg[i].thread->stats);
    }
    merge_stats.Report(name);

    for (int i = 0; i < n; i++) {
      delete arg[i].thread;
    }
    delete[] arg;

    return merge_stats;
  }

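  // Pure CPU benchmark: repeatedly computes the CRC32C of one --block_size
  // buffer until roughly 500MB of input has been checksummed, so the
  // reported throughput reflects checksum speed rather than I/O.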
  void Crc32c(ThreadState* thread) {
    // Checksum about 500MB of data total
    const int size = FLAGS_block_size;  // use --block_size option for db_bench
    std::string labels = "(" + ToString(FLAGS_block_size) + " per op)";
    const char* label = labels.c_str();

    std::string data(size, 'x');
    int64_t bytes = 0;
    uint32_t crc = 0;
    while (bytes < 500 * 1048576) {
      crc = crc32c::Value(data.data(), size);
      thread->stats.FinishedOps(nullptr, nullptr, 1, kCrc);
      bytes += size;
    }
    // Print so the result is not dead (optimized away)
    fprintf(stderr, "... crc=0x%x\r", static_cast<unsigned int>(crc));

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(label);
  }

  void xxHash(ThreadState* thread) {
    // Checksum about 500MB of data total
    const int size = 4096;
    const char* label = "(4K per op)";
    std::string data(size, 'x');
    int64_t bytes = 0;
    unsigned int xxh32 = 0;
    while (bytes < 500 * 1048576) {
      xxh32 = XXH32(data.data(), size, 0);
      thread->stats.FinishedOps(nullptr, nullptr, 1, kHash);
      bytes += size;
    }
    // Print so the result is not dead (optimized away)
    fprintf(stderr, "... xxh32=0x%x\r", static_cast<unsigned int>(xxh32));

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(label);
  }

  void AcquireLoad(ThreadState* thread) {
    int dummy;
    std::atomic<void*> ap(&dummy);
    int count = 0;
    void* ptr = nullptr;
    thread->stats.AddMessage("(each op is 1000 loads)");
    while (count < 100000) {
      for (int i = 0; i < 1000; i++) {
        ptr = ap.load(std::memory_order_acquire);
      }
      count++;
      thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers);
    }
    if (ptr == nullptr) exit(1);  // Disable unused variable warning.
  }

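  // Compresses the same --block_size block of generated data over and over
  // until 1GB of input has been consumed, then reports the output/input
  // ratio as a percentage.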
  void Compress(ThreadState* thread) {
    RandomGenerator gen;
    Slice input = gen.Generate(FLAGS_block_size);
    int64_t bytes = 0;
    int64_t produced = 0;
    bool ok = true;
    std::string compressed;
    CompressionOptions opts;
    CompressionContext context(FLAGS_compression_type_e);
    CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
                         FLAGS_compression_type_e,
                         FLAGS_sample_for_compression);
    // Compress 1G
    while (ok && bytes < int64_t(1) << 30) {
      compressed.clear();
      ok = CompressSlice(info, input, &compressed);
      produced += compressed.size();
      bytes += input.size();
      thread->stats.FinishedOps(nullptr, nullptr, 1, kCompress);
    }

    if (!ok) {
      thread->stats.AddMessage("(compression failure)");
    } else {
      char buf[340];
      snprintf(buf, sizeof(buf), "(output: %.1f%%)",
               (produced * 100.0) / bytes);
      thread->stats.AddMessage(buf);
      thread->stats.AddBytes(bytes);
    }
  }

  void Uncompress(ThreadState* thread) {
    RandomGenerator gen;
    Slice input = gen.Generate(FLAGS_block_size);
    std::string compressed;

    CompressionContext compression_ctx(FLAGS_compression_type_e);
    CompressionOptions compression_opts;
    CompressionInfo compression_info(
        compression_opts, compression_ctx, CompressionDict::GetEmptyDict(),
        FLAGS_compression_type_e, FLAGS_sample_for_compression);
    UncompressionContext uncompression_ctx(FLAGS_compression_type_e);
    UncompressionInfo uncompression_info(uncompression_ctx,
                                         UncompressionDict::GetEmptyDict(),
                                         FLAGS_compression_type_e);

    bool ok = CompressSlice(compression_info, input, &compressed);
    int64_t bytes = 0;
    size_t uncompressed_size = 0;
    while (ok && bytes < 1024 * 1048576) {
      constexpr uint32_t compress_format_version = 2;

      CacheAllocationPtr uncompressed = UncompressData(
          uncompression_info, compressed.data(), compressed.size(),
          &uncompressed_size, compress_format_version);

      ok = uncompressed.get() != nullptr;
      bytes += input.size();
      thread->stats.FinishedOps(nullptr, nullptr, 1, kUncompress);
    }

    if (!ok) {
      thread->stats.AddMessage("(compression failure)");
    } else {
      thread->stats.AddBytes(bytes);
    }
  }

  // Returns true if the Options were initialized from the specified
  // options file.
  bool InitializeOptionsFromFile(Options* opts) {
#ifndef ROCKSDB_LITE
    printf("Initializing RocksDB Options from the specified file\n");
    DBOptions db_opts;
    std::vector<ColumnFamilyDescriptor> cf_descs;
    if (FLAGS_options_file != "") {
      auto s = LoadOptionsFromFile(FLAGS_options_file, FLAGS_env, &db_opts,
                                   &cf_descs);
      db_opts.env = FLAGS_env;
      if (s.ok()) {
        *opts = Options(db_opts, cf_descs[0].options);
        return true;
      }
      fprintf(stderr, "Unable to load options file %s --- %s\n",
              FLAGS_options_file.c_str(), s.ToString().c_str());
      exit(1);
    }
#else
    (void)opts;
#endif
    return false;
  }

  void InitializeOptionsFromFlags(Options* opts) {
    printf("Initializing RocksDB Options from command-line flags\n");
    Options& options = *opts;

    assert(db_.db == nullptr);

    options.env = FLAGS_env;
    options.max_open_files = FLAGS_open_files;
    if (FLAGS_cost_write_buffer_to_cache || FLAGS_db_write_buffer_size != 0) {
      options.write_buffer_manager.reset(
          new WriteBufferManager(FLAGS_db_write_buffer_size, cache_));
    }
    options.arena_block_size = FLAGS_arena_block_size;
    options.write_buffer_size = FLAGS_write_buffer_size;
    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
    options.min_write_buffer_number_to_merge =
        FLAGS_min_write_buffer_number_to_merge;
    options.max_write_buffer_number_to_maintain =
        FLAGS_max_write_buffer_number_to_maintain;
    options.max_write_buffer_size_to_maintain =
        FLAGS_max_write_buffer_size_to_maintain;
    options.max_background_jobs = FLAGS_max_background_jobs;
    options.max_background_compactions = FLAGS_max_background_compactions;
    options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
    options.max_background_flushes = FLAGS_max_background_flushes;
    options.compaction_style = FLAGS_compaction_style_e;
    options.compaction_pri = FLAGS_compaction_pri_e;
    options.allow_mmap_reads = FLAGS_mmap_read;
    options.allow_mmap_writes = FLAGS_mmap_write;
    options.use_direct_reads = FLAGS_use_direct_reads;
    options.use_direct_io_for_flush_and_compaction =
        FLAGS_use_direct_io_for_flush_and_compaction;
#ifndef ROCKSDB_LITE
    options.ttl = FLAGS_fifo_compaction_ttl;
    options.compaction_options_fifo = CompactionOptionsFIFO(
        FLAGS_fifo_compaction_max_table_files_size_mb * 1024 * 1024,
        FLAGS_fifo_compaction_allow_compaction);
#endif  // ROCKSDB_LITE
    if (FLAGS_prefix_size != 0) {
      options.prefix_extractor.reset(
          NewFixedPrefixTransform(FLAGS_prefix_size));
    }
    if (FLAGS_use_uint64_comparator) {
      options.comparator = test::Uint64Comparator();
      if (FLAGS_key_size != 8) {
        fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n");
        exit(1);
      }
    }
    if (FLAGS_use_stderr_info_logger) {
      options.info_log.reset(new StderrLogger());
    }
    options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 2048 : 0;
    options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio;
    options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
    if (FLAGS_memtable_insert_with_hint_prefix_size > 0) {
      options.memtable_insert_with_hint_prefix_extractor.reset(
          NewCappedPrefixTransform(
              FLAGS_memtable_insert_with_hint_prefix_size));
    }
    options.bloom_locality = FLAGS_bloom_locality;
    options.max_file_opening_threads = FLAGS_file_opening_threads;
    options.new_table_reader_for_compaction_inputs =
        FLAGS_new_table_reader_for_compaction_inputs;
    options.compaction_readahead_size = FLAGS_compaction_readahead_size;
    options.log_readahead_size = FLAGS_log_readahead_size;
    options.random_access_max_buffer_size = FLAGS_random_access_max_buffer_size;
    options.writable_file_max_buffer_size = FLAGS_writable_file_max_buffer_size;
    options.use_fsync = FLAGS_use_fsync;
    options.num_levels = FLAGS_num_levels;
    options.target_file_size_base = FLAGS_target_file_size_base;
    options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
    options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
    options.level_compaction_dynamic_level_bytes =
        FLAGS_level_compaction_dynamic_level_bytes;
    options.max_bytes_for_level_multiplier =
        FLAGS_max_bytes_for_level_multiplier;
    if ((FLAGS_prefix_size == 0) && (FLAGS_rep_factory == kPrefixHash ||
                                     FLAGS_rep_factory == kHashLinkedList)) {
      fprintf(stderr, "prefix_size should be non-zero if PrefixHash or "
                      "HashLinkedList memtablerep is used\n");
      exit(1);
    }
    switch (FLAGS_rep_factory) {
      case kSkipList:
        options.memtable_factory.reset(new SkipListFactory(
            FLAGS_skip_list_lookahead));
        break;
#ifndef ROCKSDB_LITE
      case kPrefixHash:
        options.memtable_factory.reset(
            NewHashSkipListRepFactory(FLAGS_hash_bucket_count));
        break;
      case kHashLinkedList:
        options.memtable_factory.reset(NewHashLinkListRepFactory(
            FLAGS_hash_bucket_count));
        break;
      case kVectorRep:
        options.memtable_factory.reset(new VectorRepFactory);
        break;
#else
      default:
        fprintf(stderr, "Only skip list is supported in lite mode\n");
        exit(1);
#endif  // ROCKSDB_LITE
    }
    if (FLAGS_use_plain_table) {
#ifndef ROCKSDB_LITE
      if (FLAGS_rep_factory != kPrefixHash &&
          FLAGS_rep_factory != kHashLinkedList) {
        fprintf(stderr, "Warning: plain table is used with skip list\n");
      }

      int bloom_bits_per_key = FLAGS_bloom_bits;
      if (bloom_bits_per_key < 0) {
        bloom_bits_per_key = 0;
      }

      PlainTableOptions plain_table_options;
      plain_table_options.user_key_len = FLAGS_key_size;
      plain_table_options.bloom_bits_per_key = bloom_bits_per_key;
      plain_table_options.hash_table_ratio = 0.75;
      options.table_factory = std::shared_ptr<TableFactory>(
          NewPlainTableFactory(plain_table_options));
#else
      fprintf(stderr, "Plain table is not supported in lite mode\n");
      exit(1);
#endif  // ROCKSDB_LITE
    } else if (FLAGS_use_cuckoo_table) {
#ifndef ROCKSDB_LITE
      if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) {
        fprintf(stderr, "Invalid cuckoo_hash_ratio\n");
        exit(1);
      }

      if (!FLAGS_mmap_read) {
        fprintf(stderr, "cuckoo table format requires mmap read to operate\n");
        exit(1);
      }

      ROCKSDB_NAMESPACE::CuckooTableOptions table_options;
      table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio;
      table_options.identity_as_first_hash = FLAGS_identity_as_first_hash;
      options.table_factory = std::shared_ptr<TableFactory>(
          NewCuckooTableFactory(table_options));
#else
      fprintf(stderr, "Cuckoo table is not supported in lite mode\n");
      exit(1);
#endif  // ROCKSDB_LITE
    } else {
      BlockBasedTableOptions block_based_options;
      if (FLAGS_use_hash_search) {
        if (FLAGS_prefix_size == 0) {
          fprintf(stderr,
3942               "prefix_size not assigned when enable use_hash_search \n");
          exit(1);
        }
        block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
      } else {
        block_based_options.index_type = BlockBasedTableOptions::kBinarySearch;
      }
      if (FLAGS_partition_index_and_filters || FLAGS_partition_index) {
        if (FLAGS_index_with_first_key) {
          fprintf(stderr,
                  "--index_with_first_key is not compatible with"
                  " partition index.");
        }
        if (FLAGS_use_hash_search) {
          fprintf(stderr,
                  "use_hash_search is incompatible with "
                  "partition index and is ignored");
        }
        block_based_options.index_type =
            BlockBasedTableOptions::kTwoLevelIndexSearch;
        block_based_options.metadata_block_size = FLAGS_metadata_block_size;
        if (FLAGS_partition_index_and_filters) {
          block_based_options.partition_filters = true;
        }
      } else if (FLAGS_index_with_first_key) {
        block_based_options.index_type =
            BlockBasedTableOptions::kBinarySearchWithFirstKey;
      }
      BlockBasedTableOptions::IndexShorteningMode index_shortening =
          block_based_options.index_shortening;
      switch (FLAGS_index_shortening_mode) {
        case 0:
          index_shortening =
              BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
          break;
        case 1:
          index_shortening =
              BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators;
          break;
        case 2:
          index_shortening = BlockBasedTableOptions::IndexShorteningMode::
              kShortenSeparatorsAndSuccessor;
          break;
        default:
          fprintf(stderr, "Unknown key shortening mode\n");
      }
      block_based_options.optimize_filters_for_memory =
          FLAGS_optimize_filters_for_memory;
      block_based_options.index_shortening = index_shortening;
      if (cache_ == nullptr) {
        block_based_options.no_block_cache = true;
      }
      block_based_options.cache_index_and_filter_blocks =
          FLAGS_cache_index_and_filter_blocks;
      block_based_options.pin_l0_filter_and_index_blocks_in_cache =
          FLAGS_pin_l0_filter_and_index_blocks_in_cache;
      block_based_options.pin_top_level_index_and_filter =
          FLAGS_pin_top_level_index_and_filter;
      if (FLAGS_cache_high_pri_pool_ratio > 1e-6) {  // > 0.0 + eps
        block_based_options.cache_index_and_filter_blocks_with_high_priority =
            true;
      }
      block_based_options.block_cache = cache_;
      block_based_options.block_cache_compressed = compressed_cache_;
      block_based_options.block_size = FLAGS_block_size;
      block_based_options.block_restart_interval = FLAGS_block_restart_interval;
      block_based_options.index_block_restart_interval =
          FLAGS_index_block_restart_interval;
      block_based_options.filter_policy = filter_policy_;
      block_based_options.format_version =
          static_cast<uint32_t>(FLAGS_format_version);
      block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit;
      block_based_options.enable_index_compression =
          FLAGS_enable_index_compression;
      block_based_options.block_align = FLAGS_block_align;
      if (FLAGS_use_data_block_hash_index) {
        block_based_options.data_block_index_type =
            ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinaryAndHash;
      } else {
        block_based_options.data_block_index_type =
            ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinarySearch;
      }
      block_based_options.data_block_hash_table_util_ratio =
          FLAGS_data_block_hash_table_util_ratio;
      if (FLAGS_read_cache_path != "") {
#ifndef ROCKSDB_LITE
        Status rc_status;

4030         // Read cache need to be provided with a the Logger, we will put all
4031         // reac cache logs in the read cache path in a file named rc_LOG
4032         rc_status = FLAGS_env->CreateDirIfMissing(FLAGS_read_cache_path);
4033         std::shared_ptr<Logger> read_cache_logger;
4034         if (rc_status.ok()) {
4035           rc_status = FLAGS_env->NewLogger(FLAGS_read_cache_path + "/rc_LOG",
4036                                            &read_cache_logger);
4037         }
4038 
4039         if (rc_status.ok()) {
4040           PersistentCacheConfig rc_cfg(FLAGS_env, FLAGS_read_cache_path,
4041                                        FLAGS_read_cache_size,
4042                                        read_cache_logger);
4043 
4044           rc_cfg.enable_direct_reads = FLAGS_read_cache_direct_read;
4045           rc_cfg.enable_direct_writes = FLAGS_read_cache_direct_write;
4046           rc_cfg.writer_qdepth = 4;
4047           rc_cfg.writer_dispatch_size = 4 * 1024;
4048 
4049           auto pcache = std::make_shared<BlockCacheTier>(rc_cfg);
4050           block_based_options.persistent_cache = pcache;
4051           rc_status = pcache->Open();
4052         }
4053 
4054         if (!rc_status.ok()) {
4055           fprintf(stderr, "Error initializing read cache, %s\n",
4056                   rc_status.ToString().c_str());
4057           exit(1);
4058         }
4059 #else
4060         fprintf(stderr, "Read cache is not supported in LITE\n");
4061         exit(1);
4062 
4063 #endif
4064       }
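      // Illustrative example (flag values here are hypothetical): the
      // persistent read cache path above could be exercised with
      //   ./db_bench --benchmarks=readrandom --use_existing_db=1 \
      //              --read_cache_path=/tmp/db_bench_rc \
      //              --read_cache_size=4294967296 --read_cache_direct_read=1
      // which places a 4 GiB BlockCacheTier (and its rc_LOG file) under
      // /tmp/db_bench_rc.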
      options.table_factory.reset(
          NewBlockBasedTableFactory(block_based_options));
    }
    if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) {
      if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() !=
          static_cast<unsigned int>(FLAGS_num_levels)) {
        fprintf(stderr,
                "Number of fanouts specified (%d) must match num_levels\n",
                static_cast<int>(
                    FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
        exit(1);
      }
      options.max_bytes_for_level_multiplier_additional =
          FLAGS_max_bytes_for_level_multiplier_additional_v;
    }
    options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
    options.level0_file_num_compaction_trigger =
        FLAGS_level0_file_num_compaction_trigger;
    options.level0_slowdown_writes_trigger =
        FLAGS_level0_slowdown_writes_trigger;
    options.compression = FLAGS_compression_type_e;
    if (FLAGS_simulate_hybrid_fs_file != "") {
      options.bottommost_temperature = Temperature::kWarm;
    }
    options.sample_for_compression = FLAGS_sample_for_compression;
    options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
    options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
    options.max_total_wal_size = FLAGS_max_total_wal_size;

    if (FLAGS_min_level_to_compress >= 0) {
      assert(FLAGS_min_level_to_compress <= FLAGS_num_levels);
      options.compression_per_level.resize(FLAGS_num_levels);
      for (int i = 0; i < FLAGS_min_level_to_compress; i++) {
        options.compression_per_level[i] = kNoCompression;
      }
      for (int i = FLAGS_min_level_to_compress; i < FLAGS_num_levels; i++) {
        options.compression_per_level[i] = FLAGS_compression_type_e;
      }
    }
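    // For example, with --num_levels=6 and --min_level_to_compress=2, the
    // loops above leave L0 and L1 uncompressed while L2 through L5 use the
    // configured compression type.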
    options.soft_rate_limit = FLAGS_soft_rate_limit;
    options.hard_rate_limit = FLAGS_hard_rate_limit;
    options.soft_pending_compaction_bytes_limit =
        FLAGS_soft_pending_compaction_bytes_limit;
    options.hard_pending_compaction_bytes_limit =
        FLAGS_hard_pending_compaction_bytes_limit;
    options.delayed_write_rate = FLAGS_delayed_write_rate;
    options.allow_concurrent_memtable_write =
        FLAGS_allow_concurrent_memtable_write;
    options.inplace_update_support = FLAGS_inplace_update_support;
    options.inplace_update_num_locks = FLAGS_inplace_update_num_locks;
    options.enable_write_thread_adaptive_yield =
        FLAGS_enable_write_thread_adaptive_yield;
    options.enable_pipelined_write = FLAGS_enable_pipelined_write;
    options.unordered_write = FLAGS_unordered_write;
    options.write_thread_max_yield_usec = FLAGS_write_thread_max_yield_usec;
    options.write_thread_slow_yield_usec = FLAGS_write_thread_slow_yield_usec;
    options.rate_limit_delay_max_milliseconds =
        FLAGS_rate_limit_delay_max_milliseconds;
    options.table_cache_numshardbits = FLAGS_table_cache_numshardbits;
    options.max_compaction_bytes = FLAGS_max_compaction_bytes;
    options.disable_auto_compactions = FLAGS_disable_auto_compactions;
    options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits;
    options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds;

    // fill storage options
    options.advise_random_on_open = FLAGS_advise_random_on_open;
    options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
    options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;
    options.bytes_per_sync = FLAGS_bytes_per_sync;
    options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync;

    // merge operator options
    options.merge_operator =
        MergeOperators::CreateFromStringId(FLAGS_merge_operator);
    if (options.merge_operator == nullptr && !FLAGS_merge_operator.empty()) {
      fprintf(stderr, "invalid merge operator: %s\n",
              FLAGS_merge_operator.c_str());
      exit(1);
    }
    options.max_successive_merges = FLAGS_max_successive_merges;
    options.report_bg_io_stats = FLAGS_report_bg_io_stats;

    // set universal style compaction configurations, if applicable
    if (FLAGS_universal_size_ratio != 0) {
      options.compaction_options_universal.size_ratio =
          FLAGS_universal_size_ratio;
    }
    if (FLAGS_universal_min_merge_width != 0) {
      options.compaction_options_universal.min_merge_width =
          FLAGS_universal_min_merge_width;
    }
    if (FLAGS_universal_max_merge_width != 0) {
      options.compaction_options_universal.max_merge_width =
          FLAGS_universal_max_merge_width;
    }
    if (FLAGS_universal_max_size_amplification_percent != 0) {
      options.compaction_options_universal.max_size_amplification_percent =
          FLAGS_universal_max_size_amplification_percent;
    }
    if (FLAGS_universal_compression_size_percent != -1) {
      options.compaction_options_universal.compression_size_percent =
          FLAGS_universal_compression_size_percent;
    }
    options.compaction_options_universal.allow_trivial_move =
        FLAGS_universal_allow_trivial_move;
    if (FLAGS_thread_status_per_interval > 0) {
      options.enable_thread_tracking = true;
    }

    if (FLAGS_user_timestamp_size > 0) {
      if (FLAGS_user_timestamp_size != 8) {
        fprintf(stderr, "Only 64-bit timestamps are supported.\n");
        exit(1);
      }
      options.comparator = ROCKSDB_NAMESPACE::test::ComparatorWithU64Ts();
    }

    // Integrated BlobDB
    options.enable_blob_files = FLAGS_enable_blob_files;
    options.min_blob_size = FLAGS_min_blob_size;
    options.blob_file_size = FLAGS_blob_file_size;
    options.blob_compression_type =
        StringToCompressionType(FLAGS_blob_compression_type.c_str());
    options.enable_blob_garbage_collection =
        FLAGS_enable_blob_garbage_collection;
    options.blob_garbage_collection_age_cutoff =
        FLAGS_blob_garbage_collection_age_cutoff;

#ifndef ROCKSDB_LITE
    if (FLAGS_readonly && FLAGS_transaction_db) {
      fprintf(stderr, "Cannot use readonly flag with transaction_db\n");
      exit(1);
    }
    if (FLAGS_use_secondary_db &&
        (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) {
      fprintf(stderr,
              "Cannot use use_secondary_db flag with transaction_db or "
              "optimistic_transaction_db\n");
      exit(1);
    }
#endif  // ROCKSDB_LITE
  }

  void InitializeOptionsGeneral(Options* opts) {
    Options& options = *opts;

    options.create_missing_column_families = FLAGS_num_column_families > 1;
    options.statistics = dbstats;
    options.wal_dir = FLAGS_wal_dir;
    options.create_if_missing = !FLAGS_use_existing_db;
    options.dump_malloc_stats = FLAGS_dump_malloc_stats;
    options.stats_dump_period_sec =
        static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
    options.stats_persist_period_sec =
        static_cast<unsigned int>(FLAGS_stats_persist_period_sec);
    options.persist_stats_to_disk = FLAGS_persist_stats_to_disk;
    options.stats_history_buffer_size =
        static_cast<size_t>(FLAGS_stats_history_buffer_size);

    options.compression_opts.level = FLAGS_compression_level;
    options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes;
    options.compression_opts.zstd_max_train_bytes =
        FLAGS_compression_zstd_max_train_bytes;
    options.compression_opts.parallel_threads =
        FLAGS_compression_parallel_threads;
    options.compression_opts.max_dict_buffer_bytes =
        FLAGS_compression_max_dict_buffer_bytes;
    // If this is a block based table, set some related options
    auto table_options =
        options.table_factory->GetOptions<BlockBasedTableOptions>();
    if (table_options != nullptr) {
      if (FLAGS_cache_size) {
        table_options->block_cache = cache_;
      }
      if (FLAGS_bloom_bits >= 0) {
        table_options->filter_policy.reset(
            FLAGS_use_ribbon_filter
                ? NewExperimentalRibbonFilterPolicy(FLAGS_bloom_bits)
                : NewBloomFilterPolicy(FLAGS_bloom_bits,
                                       FLAGS_use_block_based_filter));
      }
    }
    if (FLAGS_row_cache_size) {
      if (FLAGS_cache_numshardbits >= 1) {
        options.row_cache =
            NewLRUCache(FLAGS_row_cache_size, FLAGS_cache_numshardbits);
      } else {
        options.row_cache = NewLRUCache(FLAGS_row_cache_size);
      }
    }
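    // Illustrative example (sizes are hypothetical): --row_cache_size=8388608
    // enables an 8 MiB LRU row cache above; adding --cache_numshardbits=6
    // shards it 2^6 = 64 ways to reduce mutex contention.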
    if (FLAGS_enable_io_prio) {
      FLAGS_env->LowerThreadPoolIOPriority(Env::LOW);
      FLAGS_env->LowerThreadPoolIOPriority(Env::HIGH);
    }
    if (FLAGS_enable_cpu_prio) {
      FLAGS_env->LowerThreadPoolCPUPriority(Env::LOW);
      FLAGS_env->LowerThreadPoolCPUPriority(Env::HIGH);
    }
    options.env = FLAGS_env;
    if (FLAGS_sine_write_rate) {
      FLAGS_benchmark_write_rate_limit = static_cast<uint64_t>(SineRate(0));
    }

    if (FLAGS_rate_limiter_bytes_per_sec > 0) {
      if (FLAGS_rate_limit_bg_reads &&
          !FLAGS_new_table_reader_for_compaction_inputs) {
        fprintf(stderr,
                "rate limit compaction reads must have "
                "new_table_reader_for_compaction_inputs set\n");
        exit(1);
      }
      options.rate_limiter.reset(NewGenericRateLimiter(
          FLAGS_rate_limiter_bytes_per_sec, 100 * 1000 /* refill_period_us */,
          10 /* fairness */,
          FLAGS_rate_limit_bg_reads ? RateLimiter::Mode::kReadsOnly
                                    : RateLimiter::Mode::kWritesOnly,
          FLAGS_rate_limiter_auto_tuned));
    }
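    // Illustrative example (values are hypothetical): to cap background write
    // I/O at 10 MiB/s one might pass --rate_limiter_bytes_per_sec=10485760;
    // adding --rate_limit_bg_reads=1 (together with
    // --new_table_reader_for_compaction_inputs=1, as checked above) makes the
    // limiter throttle compaction reads instead of writes.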

    options.listeners.emplace_back(listener_);

    if (FLAGS_num_multi_db <= 1) {
      OpenDb(options, FLAGS_db, &db_);
    } else {
      multi_dbs_.clear();
      multi_dbs_.resize(FLAGS_num_multi_db);
      auto wal_dir = options.wal_dir;
      for (int i = 0; i < FLAGS_num_multi_db; i++) {
        if (!wal_dir.empty()) {
          options.wal_dir = GetPathForMultiple(wal_dir, i);
        }
        OpenDb(options, GetPathForMultiple(FLAGS_db, i), &multi_dbs_[i]);
      }
      options.wal_dir = wal_dir;
    }

    // KeepFilter is a noop filter; it can be used to exercise the compaction
    // filter path without changing any data.
    if (FLAGS_use_keep_filter) {
      options.compaction_filter = new KeepFilter();
      fprintf(stdout, "A noop compaction filter is used\n");
    }

    if (FLAGS_use_existing_keys) {
      // Only works on a single database
      assert(db_.db != nullptr);
      ReadOptions read_opts;
      read_opts.total_order_seek = true;
      Iterator* iter = db_.db->NewIterator(read_opts);
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        keys_.emplace_back(iter->key().ToString());
      }
      delete iter;
      FLAGS_num = keys_.size();
    }
  }

  void Open(Options* opts) {
    if (!InitializeOptionsFromFile(opts)) {
      InitializeOptionsFromFlags(opts);
    }

    InitializeOptionsGeneral(opts);
  }

  void OpenDb(Options options, const std::string& db_name,
              DBWithColumnFamilies* db) {
    Status s;
    // Open with column families if necessary.
    if (FLAGS_num_column_families > 1) {
      size_t num_hot = FLAGS_num_column_families;
      if (FLAGS_num_hot_column_families > 0 &&
          FLAGS_num_hot_column_families < FLAGS_num_column_families) {
        num_hot = FLAGS_num_hot_column_families;
      } else {
        FLAGS_num_hot_column_families = FLAGS_num_column_families;
      }
      std::vector<ColumnFamilyDescriptor> column_families;
      for (size_t i = 0; i < num_hot; i++) {
        column_families.push_back(ColumnFamilyDescriptor(
            ColumnFamilyName(i), ColumnFamilyOptions(options)));
      }
      std::vector<int> cfh_idx_to_prob;
      if (!FLAGS_column_family_distribution.empty()) {
        std::stringstream cf_prob_stream(FLAGS_column_family_distribution);
        std::string cf_prob;
        int sum = 0;
        while (std::getline(cf_prob_stream, cf_prob, ',')) {
          cfh_idx_to_prob.push_back(std::stoi(cf_prob));
          sum += cfh_idx_to_prob.back();
        }
        if (sum != 100) {
          fprintf(stderr, "column_family_distribution items must sum to 100\n");
          exit(1);
        }
        if (cfh_idx_to_prob.size() != num_hot) {
          fprintf(stderr,
                  "got %" ROCKSDB_PRIszt
                  " column_family_distribution items; expected "
                  "%" ROCKSDB_PRIszt "\n",
                  cfh_idx_to_prob.size(), num_hot);
          exit(1);
        }
      }
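      // Illustrative example (values are hypothetical): with
      //   --num_column_families=4 --column_family_distribution=40,30,20,10
      // the parser above records one weight per (hot) column family, intended
      // as percentages of operations; the entries must sum to 100 and their
      // count must equal the number of hot column families.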
#ifndef ROCKSDB_LITE
      if (FLAGS_readonly) {
        s = DB::OpenForReadOnly(options, db_name, column_families, &db->cfh,
                                &db->db);
      } else if (FLAGS_optimistic_transaction_db) {
        s = OptimisticTransactionDB::Open(options, db_name, column_families,
                                          &db->cfh, &db->opt_txn_db);
        if (s.ok()) {
          db->db = db->opt_txn_db->GetBaseDB();
        }
      } else if (FLAGS_transaction_db) {
        TransactionDB* ptr;
        TransactionDBOptions txn_db_options;
        if (options.unordered_write) {
          options.two_write_queues = true;
          txn_db_options.skip_concurrency_control = true;
          txn_db_options.write_policy = WRITE_PREPARED;
        }
        s = TransactionDB::Open(options, txn_db_options, db_name,
                                column_families, &db->cfh, &ptr);
        if (s.ok()) {
          db->db = ptr;
        }
      } else {
        s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
      }
#else
      s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
#endif  // ROCKSDB_LITE
      db->cfh.resize(FLAGS_num_column_families);
      db->num_created = num_hot;
      db->num_hot = num_hot;
      db->cfh_idx_to_prob = std::move(cfh_idx_to_prob);
#ifndef ROCKSDB_LITE
    } else if (FLAGS_readonly) {
      s = DB::OpenForReadOnly(options, db_name, &db->db);
    } else if (FLAGS_optimistic_transaction_db) {
      s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db);
      if (s.ok()) {
        db->db = db->opt_txn_db->GetBaseDB();
      }
    } else if (FLAGS_transaction_db) {
      TransactionDB* ptr = nullptr;
      TransactionDBOptions txn_db_options;
      if (options.unordered_write) {
        options.two_write_queues = true;
        txn_db_options.skip_concurrency_control = true;
        txn_db_options.write_policy = WRITE_PREPARED;
      }
      s = CreateLoggerFromOptions(db_name, options, &options.info_log);
      if (s.ok()) {
        s = TransactionDB::Open(options, txn_db_options, db_name, &ptr);
      }
      if (s.ok()) {
        db->db = ptr;
      }
    } else if (FLAGS_use_blob_db) {
      // Stacked BlobDB
      blob_db::BlobDBOptions blob_db_options;
      blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
      blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
      blob_db_options.is_fifo = FLAGS_blob_db_is_fifo;
      blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
      blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
      blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
      blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
      blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
      blob_db_options.compression = FLAGS_blob_db_compression_type_e;
      blob_db::BlobDB* ptr = nullptr;
      s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr);
      if (s.ok()) {
        db->db = ptr;
      }
    } else if (FLAGS_use_secondary_db) {
      if (FLAGS_secondary_path.empty()) {
        std::string default_secondary_path;
        FLAGS_env->GetTestDirectory(&default_secondary_path);
        default_secondary_path += "/dbbench_secondary";
        FLAGS_secondary_path = default_secondary_path;
      }
      s = DB::OpenAsSecondary(options, db_name, FLAGS_secondary_path, &db->db);
      if (s.ok() && FLAGS_secondary_update_interval > 0) {
        secondary_update_thread_.reset(new port::Thread(
            [this](int interval, DBWithColumnFamilies* _db) {
              while (0 == secondary_update_stopped_.load(
                              std::memory_order_relaxed)) {
                Status secondary_update_status =
                    _db->db->TryCatchUpWithPrimary();
                if (!secondary_update_status.ok()) {
                  fprintf(stderr, "Failed to catch up with primary: %s\n",
                          secondary_update_status.ToString().c_str());
                  break;
                }
                ++secondary_db_updates_;
                FLAGS_env->SleepForMicroseconds(interval * 1000000);
              }
            },
            FLAGS_secondary_update_interval, db));
      }
#endif  // ROCKSDB_LITE
    } else {
      s = DB::Open(options, db_name, &db->db);
    }
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
      exit(1);
    }
  }

  enum WriteMode { RANDOM, SEQUENTIAL, UNIQUE_RANDOM };

  void WriteSeqDeterministic(ThreadState* thread) {
    DoDeterministicCompact(thread, open_options_.compaction_style, SEQUENTIAL);
  }

  void WriteUniqueRandomDeterministic(ThreadState* thread) {
    DoDeterministicCompact(thread, open_options_.compaction_style,
                           UNIQUE_RANDOM);
  }

  void WriteSeq(ThreadState* thread) { DoWrite(thread, SEQUENTIAL); }

  void WriteRandom(ThreadState* thread) { DoWrite(thread, RANDOM); }

  void WriteUniqueRandom(ThreadState* thread) {
    DoWrite(thread, UNIQUE_RANDOM);
  }

  class KeyGenerator {
   public:
    KeyGenerator(Random64* rand, WriteMode mode, uint64_t num,
                 uint64_t /*num_per_set*/ = 64 * 1024)
        : rand_(rand), mode_(mode), num_(num), next_(0) {
      if (mode_ == UNIQUE_RANDOM) {
        // NOTE: if memory consumption of this approach becomes a concern,
        // we can either break it into pieces and only random-shuffle a
        // section each time, or use a bitmap implementation
        // (https://reviews.facebook.net/differential/diff/54627/)
        values_.resize(num_);
        for (uint64_t i = 0; i < num_; ++i) {
          values_[i] = i;
        }
        RandomShuffle(values_.begin(), values_.end(),
                      static_cast<uint32_t>(FLAGS_seed));
      }
    }

    uint64_t Next() {
      switch (mode_) {
        case SEQUENTIAL:
          return next_++;
        case RANDOM:
          return rand_->Next() % num_;
        case UNIQUE_RANDOM:
          assert(next_ < num_);
          return values_[next_++];
      }
      assert(false);
      return std::numeric_limits<uint64_t>::max();
    }

   private:
    Random64* rand_;
    WriteMode mode_;
    const uint64_t num_;
    uint64_t next_;
    std::vector<uint64_t> values_;
  };
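  // Usage sketch (illustrative only): in UNIQUE_RANDOM mode the generator
  // hands out every key index in [0, num) exactly once, in shuffled order:
  //   Random64 rng(FLAGS_seed);
  //   KeyGenerator gen(&rng, UNIQUE_RANDOM, /*num=*/1000);
  //   for (int i = 0; i < 1000; i++) {
  //     uint64_t k = gen.Next();  // a permutation of 0..999
  //   }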

  DB* SelectDB(ThreadState* thread) { return SelectDBWithCfh(thread)->db; }

  DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) {
    return SelectDBWithCfh(thread->rand.Next());
  }

  DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) {
    if (db_.db != nullptr) {
      return &db_;
    } else {
      return &multi_dbs_[rand_int % multi_dbs_.size()];
    }
  }

  double SineRate(double x) {
    return FLAGS_sine_a * sin((FLAGS_sine_b * x) + FLAGS_sine_c) +
           FLAGS_sine_d;
  }
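  // The sine-modulated write rate above is rate(x) = sine_a * sin(sine_b * x
  // + sine_c) + sine_d, so --sine_d sets the midline, --sine_a the amplitude,
  // and --sine_b/--sine_c the period and phase. For example (hypothetical
  // values), --sine_a=1000 --sine_d=5000 oscillates between roughly 4000 and
  // 6000.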

  void DoWrite(ThreadState* thread, WriteMode write_mode) {
    const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
    const int64_t num_ops = writes_ == 0 ? num_ : writes_;

    size_t num_key_gens = 1;
    if (db_.db == nullptr) {
      num_key_gens = multi_dbs_.size();
    }
    std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
    int64_t max_ops = num_ops * num_key_gens;
    int64_t ops_per_stage = max_ops;
    if (FLAGS_num_column_families > 1 && FLAGS_num_hot_column_families > 0) {
      ops_per_stage = (max_ops - 1) / (FLAGS_num_column_families /
                                       FLAGS_num_hot_column_families) +
                      1;
    }
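    // Writes are split into stages, one per group of hot column families.
    // For example (hypothetical values), with --num_column_families=8 and
    // --num_hot_column_families=2 there are 4 stages, and the formula above
    // gives each stage ceil(max_ops / 4) operations before rotating to the
    // next group.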

    Duration duration(test_duration, max_ops, ops_per_stage);
    for (size_t i = 0; i < num_key_gens; i++) {
      key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode,
                                         num_ + max_num_range_tombstones_,
                                         ops_per_stage));
    }

    if (num_ != FLAGS_num) {
      char msg[100];
      snprintf(msg, sizeof(msg), "(%" PRIu64 " ops)", num_);
      thread->stats.AddMessage(msg);
    }

    RandomGenerator gen;
    WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
                     user_timestamp_size_);
    Status s;
    int64_t bytes = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<const char[]> begin_key_guard;
    Slice begin_key = AllocateKey(&begin_key_guard);
    std::unique_ptr<const char[]> end_key_guard;
    Slice end_key = AllocateKey(&end_key_guard);
    std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
    std::vector<Slice> expanded_keys;
    if (FLAGS_expand_range_tombstones) {
      expanded_key_guards.resize(range_tombstone_width_);
      for (auto& expanded_key_guard : expanded_key_guards) {
        expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
      }
    }

    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }

    int64_t stage = 0;
    int64_t num_written = 0;
    while (!duration.Done(entries_per_batch_)) {
      if (duration.GetStage() != stage) {
        stage = duration.GetStage();
        if (db_.db != nullptr) {
          db_.CreateNewCf(open_options_, stage);
        } else {
          for (auto& db : multi_dbs_) {
            db.CreateNewCf(open_options_, stage);
          }
        }
      }

      size_t id = thread->rand.Next() % num_key_gens;
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id);
      batch.Clear();
      int64_t batch_bytes = 0;

      for (int64_t j = 0; j < entries_per_batch_; j++) {
        int64_t rand_num = key_gens[id]->Next();
        GenerateKeyFromInt(rand_num, FLAGS_num, &key);
        Slice val = gen.Generate();
        if (use_blob_db_) {
#ifndef ROCKSDB_LITE
          // Stacked BlobDB
          blob_db::BlobDB* blobdb =
              static_cast<blob_db::BlobDB*>(db_with_cfh->db);
          if (FLAGS_blob_db_max_ttl_range > 0) {
            int ttl = rand() % FLAGS_blob_db_max_ttl_range;
            s = blobdb->PutWithTTL(write_options_, key, val, ttl);
          } else {
            s = blobdb->Put(write_options_, key, val);
          }
#endif  //  ROCKSDB_LITE
        } else if (FLAGS_num_column_families <= 1) {
          batch.Put(key, val);
        } else {
          // We use the same rand_num as the seed for both the key and the
          // column family so that we can deterministically find the cfh
          // corresponding to a particular key while reading the key.
          batch.Put(db_with_cfh->GetCfh(rand_num), key, val);
        }
        batch_bytes += val.size() + key_size_ + user_timestamp_size_;
        bytes += val.size() + key_size_ + user_timestamp_size_;
        ++num_written;
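        // A DeleteRange (or its expanded point deletes) is interleaved every
        // writes_per_range_tombstone_ writes once writes_before_delete_range_
        // writes have happened, capped at max_num_range_tombstones_ in total.
        // For example (hypothetical values), with writes_before_delete_range_
        // = 100 and writes_per_range_tombstone_ = 50, tombstones are emitted
        // at writes 150, 200, 250, and so on.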
        if (writes_per_range_tombstone_ > 0 &&
            num_written > writes_before_delete_range_ &&
            (num_written - writes_before_delete_range_) /
                    writes_per_range_tombstone_ <=
                max_num_range_tombstones_ &&
            (num_written - writes_before_delete_range_) %
                    writes_per_range_tombstone_ ==
                0) {
          int64_t begin_num = key_gens[id]->Next();
          if (FLAGS_expand_range_tombstones) {
            for (int64_t offset = 0; offset < range_tombstone_width_;
                 ++offset) {
              GenerateKeyFromInt(begin_num + offset, FLAGS_num,
                                 &expanded_keys[offset]);
              if (use_blob_db_) {
#ifndef ROCKSDB_LITE
                // Stacked BlobDB
                s = db_with_cfh->db->Delete(write_options_,
                                            expanded_keys[offset]);
#endif  //  ROCKSDB_LITE
              } else if (FLAGS_num_column_families <= 1) {
                batch.Delete(expanded_keys[offset]);
              } else {
                batch.Delete(db_with_cfh->GetCfh(rand_num),
                             expanded_keys[offset]);
              }
            }
          } else {
            GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
            GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
                               &end_key);
            if (use_blob_db_) {
#ifndef ROCKSDB_LITE
              // Stacked BlobDB
              s = db_with_cfh->db->DeleteRange(
                  write_options_, db_with_cfh->db->DefaultColumnFamily(),
                  begin_key, end_key);
#endif  //  ROCKSDB_LITE
            } else if (FLAGS_num_column_families <= 1) {
              batch.DeleteRange(begin_key, end_key);
            } else {
              batch.DeleteRange(db_with_cfh->GetCfh(rand_num), begin_key,
                                end_key);
            }
          }
        }
      }
      if (thread->shared->write_rate_limiter.get() != nullptr) {
        thread->shared->write_rate_limiter->Request(
            batch_bytes, Env::IO_HIGH, nullptr /* stats */,
            RateLimiter::OpType::kWrite);
        // Set time at which last op finished to Now() to hide latency and
        // sleep from rate limiter. Also, do the check once per batch, not
        // once per write.
        thread->stats.ResetLastOpTime();
      }
      if (user_timestamp_size_ > 0) {
        Slice user_ts = mock_app_clock_->Allocate(ts_guard.get());
        s = batch.AssignTimestamp(user_ts);
        if (!s.ok()) {
          fprintf(stderr, "assign timestamp to write batch: %s\n",
                  s.ToString().c_str());
          ErrorExit();
        }
      }
      if (!use_blob_db_) {
        // Not stacked BlobDB
        s = db_with_cfh->db->Write(write_options_, &batch);
      }
      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db,
                                entries_per_batch_, kWrite);
      if (FLAGS_sine_write_rate) {
        uint64_t now = FLAGS_env->NowMicros();

        uint64_t usecs_since_last;
        if (now > thread->stats.GetSineInterval()) {
          usecs_since_last = now - thread->stats.GetSineInterval();
        } else {
          usecs_since_last = 0;
        }

        if (usecs_since_last >
            (FLAGS_sine_write_rate_interval_milliseconds * uint64_t{1000})) {
          double usecs_since_start =
              static_cast<double>(now - thread->stats.GetStart());
          thread->stats.ResetSineInterval();
          uint64_t write_rate =
              static_cast<uint64_t>(SineRate(usecs_since_start / 1000000.0));
          thread->shared->write_rate_limiter.reset(
              NewGenericRateLimiter(write_rate));
        }
      }
      if (!s.ok()) {
        s = listener_->WaitForRecovery(600000000) ? Status::OK() : s;
      }

      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        ErrorExit();
      }
    }
    thread->stats.AddBytes(bytes);
  }

  Status DoDeterministicCompact(ThreadState* thread,
                                CompactionStyle compaction_style,
                                WriteMode write_mode) {
#ifndef ROCKSDB_LITE
    ColumnFamilyMetaData meta;
    std::vector<DB*> db_list;
    if (db_.db != nullptr) {
      db_list.push_back(db_.db);
    } else {
      for (auto& db : multi_dbs_) {
        db_list.push_back(db.db);
      }
    }
    std::vector<Options> options_list;
    for (auto db : db_list) {
      options_list.push_back(db->GetOptions());
      if (compaction_style != kCompactionStyleFIFO) {
        db->SetOptions({{"disable_auto_compactions", "1"},
                        {"level0_slowdown_writes_trigger", "400000000"},
                        {"level0_stop_writes_trigger", "400000000"}});
      } else {
        db->SetOptions({{"disable_auto_compactions", "1"}});
      }
    }

    assert(!db_list.empty());
    auto num_db = db_list.size();
    size_t num_levels = static_cast<size_t>(open_options_.num_levels);
    size_t output_level = open_options_.num_levels - 1;
    std::vector<std::vector<std::vector<SstFileMetaData>>> sorted_runs(num_db);
    std::vector<size_t> num_files_at_level0(num_db, 0);
    if (compaction_style == kCompactionStyleLevel) {
      if (num_levels <= 1) {
        return Status::InvalidArgument("num_levels should be larger than 1");
      }
      bool should_stop = false;
      while (!should_stop) {
        if (sorted_runs[0].empty()) {
          DoWrite(thread, write_mode);
        } else {
          DoWrite(thread, UNIQUE_RANDOM);
        }
        for (size_t i = 0; i < num_db; i++) {
          auto db = db_list[i];
          db->Flush(FlushOptions());
          db->GetColumnFamilyMetaData(&meta);
          if (num_files_at_level0[i] == meta.levels[0].files.size() ||
              writes_ == 0) {
            should_stop = true;
            continue;
          }
          sorted_runs[i].emplace_back(
              meta.levels[0].files.begin(),
              meta.levels[0].files.end() - num_files_at_level0[i]);
          num_files_at_level0[i] = meta.levels[0].files.size();
          if (sorted_runs[i].back().size() == 1) {
            should_stop = true;
            continue;
          }
          if (sorted_runs[i].size() == output_level) {
            auto& L1 = sorted_runs[i].back();
            L1.erase(L1.begin(), L1.begin() + L1.size() / 3);
            should_stop = true;
            continue;
          }
        }
        writes_ /=
            static_cast<int64_t>(open_options_.max_bytes_for_level_multiplier);
      }
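      // In the loop above, each pass writes 1/max_bytes_for_level_multiplier
      // as much data as the previous one, so successive sorted runs shrink
      // roughly like the level size targets they will be compacted into
      // (e.g. with the default multiplier of 10, runs of N, N/10, N/100, ...
      // entries). This is a sketch of the intent, not a guarantee.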
      for (size_t i = 0; i < num_db; i++) {
        if (sorted_runs[i].size() < num_levels - 1) {
          fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
                  num_levels);
          exit(1);
        }
      }
      for (size_t i = 0; i < num_db; i++) {
        auto db = db_list[i];
        auto compactionOptions = CompactionOptions();
        compactionOptions.compression = FLAGS_compression_type_e;
        auto options = db->GetOptions();
        MutableCFOptions mutable_cf_options(options);
        for (size_t j = 0; j < sorted_runs[i].size(); j++) {
          compactionOptions.output_file_size_limit = MaxFileSizeForLevel(
              mutable_cf_options, static_cast<int>(output_level),
              compaction_style);
          std::cout << sorted_runs[i][j].size() << std::endl;
          db->CompactFiles(compactionOptions,
                           {sorted_runs[i][j].back().name,
                            sorted_runs[i][j].front().name},
                           static_cast<int>(output_level - j) /*level*/);
        }
      }
    } else if (compaction_style == kCompactionStyleUniversal) {
      auto ratio = open_options_.compaction_options_universal.size_ratio;
      bool should_stop = false;
      while (!should_stop) {
        if (sorted_runs[0].empty()) {
          DoWrite(thread, write_mode);
        } else {
          DoWrite(thread, UNIQUE_RANDOM);
        }
        for (size_t i = 0; i < num_db; i++) {
          auto db = db_list[i];
          db->Flush(FlushOptions());
          db->GetColumnFamilyMetaData(&meta);
          if (num_files_at_level0[i] == meta.levels[0].files.size() ||
              writes_ == 0) {
            should_stop = true;
            continue;
          }
          sorted_runs[i].emplace_back(
              meta.levels[0].files.begin(),
              meta.levels[0].files.end() - num_files_at_level0[i]);
          num_files_at_level0[i] = meta.levels[0].files.size();
          if (sorted_runs[i].back().size() == 1) {
            should_stop = true;
            continue;
          }
        }
        writes_ = static_cast<int64_t>(writes_ * static_cast<double>(100) /
                                       (ratio + 200));
      }
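      // In the loop above, each pass writes writes_ * 100 / (ratio + 200)
      // entries; e.g. with size_ratio = 0 every new sorted run is half the
      // size of the previous one, so the runs keep strictly decreasing in
      // size, which is the shape universal compaction expects (a sketch of
      // the intent, not a guarantee).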
      for (size_t i = 0; i < num_db; i++) {
        if (sorted_runs[i].size() < num_levels) {
          fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
                  num_levels);
          exit(1);
        }
      }
      for (size_t i = 0; i < num_db; i++) {
        auto db = db_list[i];
        auto compactionOptions = CompactionOptions();
        compactionOptions.compression = FLAGS_compression_type_e;
        auto options = db->GetOptions();
        MutableCFOptions mutable_cf_options(options);
        for (size_t j = 0; j < sorted_runs[i].size(); j++) {
          compactionOptions.output_file_size_limit = MaxFileSizeForLevel(
              mutable_cf_options, static_cast<int>(output_level),
              compaction_style);
          db->CompactFiles(
              compactionOptions,
              {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name},
              (output_level > j ? static_cast<int>(output_level - j)
                                : 0) /*level*/);
        }
      }
    } else if (compaction_style == kCompactionStyleFIFO) {
      if (num_levels != 1) {
        return Status::InvalidArgument(
            "num_levels should be 1 for FIFO compaction");
      }
      if (FLAGS_num_multi_db != 0) {
        return Status::InvalidArgument("Doesn't support multiDB");
      }
      auto db = db_list[0];
      std::vector<std::string> file_names;
      while (true) {
        if (sorted_runs[0].empty()) {
          DoWrite(thread, write_mode);
        } else {
          DoWrite(thread, UNIQUE_RANDOM);
        }
        db->Flush(FlushOptions());
        db->GetColumnFamilyMetaData(&meta);
        auto total_size = meta.levels[0].size;
        if (total_size >=
            db->GetOptions().compaction_options_fifo.max_table_files_size) {
          for (auto file_meta : meta.levels[0].files) {
            file_names.emplace_back(file_meta.name);
          }
          break;
        }
      }
      // TODO(shuzhang1989): Investigate why CompactFiles not working
      // auto compactionOptions = CompactionOptions();
      // db->CompactFiles(compactionOptions, file_names, 0);
      auto compactionOptions = CompactRangeOptions();
      db->CompactRange(compactionOptions, nullptr, nullptr);
    } else {
      fprintf(stdout,
              "%-12s : skipped (-compaction_style=kCompactionStyleNone)\n",
              "filldeterministic");
      return Status::InvalidArgument("None compaction is not supported");
    }

// Verify seqno and key range
// Note: the seqno gets changed at the max level by an implementation
// optimization, so skip the check at the max level.
#ifndef NDEBUG
    for (size_t k = 0; k < num_db; k++) {
      auto db = db_list[k];
      db->GetColumnFamilyMetaData(&meta);
      // verify the number of sorted runs
      if (compaction_style == kCompactionStyleLevel) {
        assert(num_levels - 1 == sorted_runs[k].size());
      } else if (compaction_style == kCompactionStyleUniversal) {
        assert(meta.levels[0].files.size() + num_levels - 1 ==
               sorted_runs[k].size());
      } else if (compaction_style == kCompactionStyleFIFO) {
        // TODO(gzh): FIFO compaction
        db->GetColumnFamilyMetaData(&meta);
        auto total_size = meta.levels[0].size;
        assert(total_size <=
               db->GetOptions().compaction_options_fifo.max_table_files_size);
        break;
      }

      // verify smallest/largest seqno and key range of each sorted run
      auto max_level = num_levels - 1;
      int level;
      for (size_t i = 0; i < sorted_runs[k].size(); i++) {
        level = static_cast<int>(max_level - i);
        SequenceNumber sorted_run_smallest_seqno = kMaxSequenceNumber;
        SequenceNumber sorted_run_largest_seqno = 0;
        std::string sorted_run_smallest_key, sorted_run_largest_key;
        bool first_key = true;
        for (auto fileMeta : sorted_runs[k][i]) {
          sorted_run_smallest_seqno =
              std::min(sorted_run_smallest_seqno, fileMeta.smallest_seqno);
          sorted_run_largest_seqno =
              std::max(sorted_run_largest_seqno, fileMeta.largest_seqno);
          if (first_key ||
              db->DefaultColumnFamily()->GetComparator()->Compare(
                  fileMeta.smallestkey, sorted_run_smallest_key) < 0) {
            sorted_run_smallest_key = fileMeta.smallestkey;
          }
          if (first_key ||
              db->DefaultColumnFamily()->GetComparator()->Compare(
                  fileMeta.largestkey, sorted_run_largest_key) > 0) {
            sorted_run_largest_key = fileMeta.largestkey;
          }
          first_key = false;
        }
        if (compaction_style == kCompactionStyleLevel ||
            (compaction_style == kCompactionStyleUniversal && level > 0)) {
          SequenceNumber level_smallest_seqno = kMaxSequenceNumber;
          SequenceNumber level_largest_seqno = 0;
          for (auto fileMeta : meta.levels[level].files) {
            level_smallest_seqno =
                std::min(level_smallest_seqno, fileMeta.smallest_seqno);
            level_largest_seqno =
                std::max(level_largest_seqno, fileMeta.largest_seqno);
          }
          assert(sorted_run_smallest_key ==
                 meta.levels[level].files.front().smallestkey);
          assert(sorted_run_largest_key ==
                 meta.levels[level].files.back().largestkey);
          if (level != static_cast<int>(max_level)) {
            // compaction at max_level would change sequence number
            assert(sorted_run_smallest_seqno == level_smallest_seqno);
            assert(sorted_run_largest_seqno == level_largest_seqno);
          }
        } else if (compaction_style == kCompactionStyleUniversal) {
          // level <= 0 means sorted runs on level 0
          auto level0_file =
              meta.levels[0].files[sorted_runs[k].size() - 1 - i];
          assert(sorted_run_smallest_key == level0_file.smallestkey);
          assert(sorted_run_largest_key == level0_file.largestkey);
          if (level != static_cast<int>(max_level)) {
            assert(sorted_run_smallest_seqno == level0_file.smallest_seqno);
            assert(sorted_run_largest_seqno == level0_file.largest_seqno);
          }
        }
      }
    }
#endif
    // print the size of each sorted_run
    for (size_t k = 0; k < num_db; k++) {
      auto db = db_list[k];
      fprintf(stdout,
              "---------------------- DB %" ROCKSDB_PRIszt
              " LSM ---------------------\n",
              k);
      db->GetColumnFamilyMetaData(&meta);
      for (auto& levelMeta : meta.levels) {
        if (levelMeta.files.empty()) {
          continue;
        }
        if (levelMeta.level == 0) {
          for (auto& fileMeta : levelMeta.files) {
            fprintf(stdout, "Level[%d]: %s(size: %" ROCKSDB_PRIszt " bytes)\n",
                    levelMeta.level, fileMeta.name.c_str(), fileMeta.size);
          }
        } else {
          fprintf(stdout, "Level[%d]: %s - %s(total size: %" PRIu64 " bytes)\n",
                  levelMeta.level, levelMeta.files.front().name.c_str(),
                  levelMeta.files.back().name.c_str(), levelMeta.size);
        }
      }
    }
    for (size_t i = 0; i < num_db; i++) {
      db_list[i]->SetOptions(
          {{"disable_auto_compactions",
            std::to_string(options_list[i].disable_auto_compactions)},
           {"level0_slowdown_writes_trigger",
            std::to_string(options_list[i].level0_slowdown_writes_trigger)},
           {"level0_stop_writes_trigger",
            std::to_string(options_list[i].level0_stop_writes_trigger)}});
    }
    return Status::OK();
#else
    (void)thread;
    (void)compaction_style;
    (void)write_mode;
    fprintf(stderr, "Rocksdb Lite doesn't support filldeterministic\n");
    return Status::NotSupported(
        "Rocksdb Lite doesn't support filldeterministic");
#endif  // ROCKSDB_LITE
  }

  void ReadSequential(ThreadState* thread) {
    if (db_.db != nullptr) {
      ReadSequential(thread, db_.db);
    } else {
      for (const auto& db_with_cfh : multi_dbs_) {
        ReadSequential(thread, db_with_cfh.db);
      }
    }
  }

  void ReadSequential(ThreadState* thread, DB* db) {
    ReadOptions options(FLAGS_verify_checksum, true);
    options.tailing = FLAGS_use_tailing_iterator;
    std::unique_ptr<char[]> ts_guard;
    Slice ts;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
      ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
      options.timestamp = &ts;
    }

    Iterator* iter = db->NewIterator(options);
    int64_t i = 0;
    int64_t bytes = 0;
    for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
      bytes += iter->key().size() + iter->value().size();
      thread->stats.FinishedOps(nullptr, db, 1, kRead);
      ++i;

      if (thread->shared->read_rate_limiter.get() != nullptr &&
          i % 1024 == 1023) {
        thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
                                                   nullptr /* stats */,
                                                   RateLimiter::OpType::kRead);
      }
    }

    delete iter;
    thread->stats.AddBytes(bytes);
    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }

  void ReadToRowCache(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;
    int64_t key_rand = 0;
    ReadOptions options(FLAGS_verify_checksum, true);
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;

    while (key_rand < FLAGS_num) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      // We use the same key_rand as the seed for the key and column family so
      // that we can deterministically find the cfh corresponding to a
      // particular key, as is done in the DoWrite method.
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      read++;
      Status s;
      if (FLAGS_num_column_families > 1) {
        s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_rand), key,
                                 &pinnable_val);
      } else {
        pinnable_val.Reset();
        s = db_with_cfh->db->Get(options,
                                 db_with_cfh->db->DefaultColumnFamily(), key,
                                 &pinnable_val);
      }
      key_rand++;

      if (s.ok()) {
        found++;
        bytes += key.size() + pinnable_val.size();
      } else if (!s.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
        abort();
      }

      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
             read);

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);

    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }

  void ReadReverse(ThreadState* thread) {
    if (db_.db != nullptr) {
      ReadReverse(thread, db_.db);
    } else {
      for (const auto& db_with_cfh : multi_dbs_) {
        ReadReverse(thread, db_with_cfh.db);
      }
    }
  }

  void ReadReverse(ThreadState* thread, DB* db) {
    Iterator* iter = db->NewIterator(ReadOptions(FLAGS_verify_checksum, true));
    int64_t i = 0;
    int64_t bytes = 0;
    for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
      bytes += iter->key().size() + iter->value().size();
      thread->stats.FinishedOps(nullptr, db, 1, kRead);
      ++i;
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          i % 1024 == 1023) {
        thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
                                                   nullptr /* stats */,
                                                   RateLimiter::OpType::kRead);
      }
    }
    delete iter;
    thread->stats.AddBytes(bytes);
  }

  void ReadRandomFast(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t nonexist = 0;
    ReadOptions options(FLAGS_verify_checksum, true);
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::string value;
    Slice ts;
    std::unique_ptr<char[]> ts_guard;
    if (user_timestamp_size_ > 0) {
      ts_guard.reset(new char[user_timestamp_size_]);
    }
    DB* db = SelectDBWithCfh(thread)->db;

    int64_t pot = 1;
    while (pot < FLAGS_num) {
      pot <<= 1;
    }
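    // pot is the smallest power of two >= FLAGS_num, so masking with (pot - 1)
    // below produces uniform keys in [0, pot); keys at or above FLAGS_num are
    // intentional misses. For example, FLAGS_num = 1000 gives pot = 1024 and
    // roughly 2.3% non-existent lookups.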

    Duration duration(FLAGS_duration, reads_);
    do {
      for (int i = 0; i < 100; ++i) {
        int64_t key_rand = thread->rand.Next() & (pot - 1);
        GenerateKeyFromInt(key_rand, FLAGS_num, &key);
        ++read;
        std::string ts_ret;
        std::string* ts_ptr = nullptr;
        if (user_timestamp_size_ > 0) {
          ts = mock_app_clock_->GetTimestampForRead(thread->rand,
                                                    ts_guard.get());
          options.timestamp = &ts;
          ts_ptr = &ts_ret;
        }
        auto status = db->Get(options, key, &value, ts_ptr);
        if (status.ok()) {
          ++found;
        } else if (!status.IsNotFound()) {
          fprintf(stderr, "Get returned an error: %s\n",
                  status.ToString().c_str());
          abort();
        }
        if (key_rand >= FLAGS_num) {
          ++nonexist;
        }
      }
      if (thread->shared->read_rate_limiter.get() != nullptr) {
        thread->shared->read_rate_limiter->Request(
            100, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(nullptr, db, 100, kRead);
    } while (!duration.Done(100));

    char msg[100];
    snprintf(msg, sizeof(msg),
             "(%" PRIu64 " of %" PRIu64 " found, "
             "issued %" PRIu64 " non-exist keys)\n",
             found, read, nonexist);

    thread->stats.AddMessage(msg);

    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }

  int64_t GetRandomKey(Random64* rand) {
    uint64_t rand_int = rand->Next();
    int64_t key_rand;
    if (read_random_exp_range_ == 0) {
      key_rand = rand_int % FLAGS_num;
    } else {
      const uint64_t kBigInt = static_cast<uint64_t>(1U) << 62;
      long double order = -static_cast<long double>(rand_int % kBigInt) /
                          static_cast<long double>(kBigInt) *
                          read_random_exp_range_;
      long double exp_ran = std::exp(order);
      uint64_t rand_num =
          static_cast<uint64_t>(exp_ran * static_cast<long double>(FLAGS_num));
      // Map to a different number to avoid locality.
      const uint64_t kBigPrime = 0x5bd1e995;
      // Overflow wraps around like % (2^64) and has little impact on results.
      key_rand = static_cast<int64_t>((rand_num * kBigPrime) % FLAGS_num);
    }
    return key_rand;
  }
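  // With --read_random_exp_range=r, the keys above are drawn as
  //   FLAGS_num * e^(-u * r), u uniform in [0, 1),
  // which skews reads toward small key indices; multiplying by kBigPrime
  // (0x5bd1e995, the MurmurHash2 multiplier) modulo FLAGS_num then scatters
  // those hot indices across the key space to avoid physical locality.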
5294 
ReadRandom(ThreadState * thread)5295   void ReadRandom(ThreadState* thread) {
5296     int64_t read = 0;
5297     int64_t found = 0;
5298     int64_t bytes = 0;
5299     int num_keys = 0;
5300     int64_t key_rand = 0;
5301     ReadOptions options(FLAGS_verify_checksum, true);
5302     std::unique_ptr<const char[]> key_guard;
5303     Slice key = AllocateKey(&key_guard);
5304     PinnableSlice pinnable_val;
5305     std::unique_ptr<char[]> ts_guard;
5306     Slice ts;
5307     if (user_timestamp_size_ > 0) {
5308       ts_guard.reset(new char[user_timestamp_size_]);
5309     }
5310 
5311     Duration duration(FLAGS_duration, reads_);
5312     while (!duration.Done(1)) {
5313       DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
5314       // We use the same key_rand as the seed for the key and the column
5315       // family so that we can deterministically find the cfh corresponding
5316       // to a particular key, as is done in the DoWrite method.
5317       if (entries_per_batch_ > 1 && FLAGS_multiread_stride) {
5318         if (++num_keys == entries_per_batch_) {
5319           num_keys = 0;
5320           key_rand = GetRandomKey(&thread->rand);
5321           if ((key_rand + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
5322               FLAGS_num) {
5323             key_rand = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
5324           }
5325         } else {
5326           key_rand += FLAGS_multiread_stride;
5327         }
5328       } else {
5329         key_rand = GetRandomKey(&thread->rand);
5330       }
5331       GenerateKeyFromInt(key_rand, FLAGS_num, &key);
5332       read++;
5333       std::string ts_ret;
5334       std::string* ts_ptr = nullptr;
5335       if (user_timestamp_size_ > 0) {
5336         ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
5337         options.timestamp = &ts;
5338         ts_ptr = &ts_ret;
5339       }
5340       Status s;
5341       pinnable_val.Reset();
5342       if (FLAGS_num_column_families > 1) {
5343         s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_rand), key,
5344                                  &pinnable_val, ts_ptr);
5345       } else {
5346         s = db_with_cfh->db->Get(options,
5347                                  db_with_cfh->db->DefaultColumnFamily(), key,
5348                                  &pinnable_val, ts_ptr);
5349       }
5350       if (s.ok()) {
5351         found++;
5352         bytes += key.size() + pinnable_val.size() + user_timestamp_size_;
5353       } else if (!s.IsNotFound()) {
5354         fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
5355         abort();
5356       }
5357 
5358       if (thread->shared->read_rate_limiter.get() != nullptr &&
5359           read % 256 == 255) {
5360         thread->shared->read_rate_limiter->Request(
5361             256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
5362       }
5363 
5364       thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
5365     }
5366 
5367     char msg[100];
5368     snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
5369              found, read);
5370 
5371     thread->stats.AddBytes(bytes);
5372     thread->stats.AddMessage(msg);
5373 
5374     if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
5375       thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
5376                                get_perf_context()->ToString());
5377     }
5378   }
5379 
5380   // Calls MultiGet over a list of keys from a random distribution.
5381   // Reports the number of keys found via the thread's stats message.
5382   void MultiReadRandom(ThreadState* thread) {
5383     int64_t read = 0;
5384     int64_t num_multireads = 0;
5385     int64_t found = 0;
5386     ReadOptions options(FLAGS_verify_checksum, true);
5387     std::vector<Slice> keys;
5388     std::vector<std::unique_ptr<const char[]> > key_guards;
5389     std::vector<std::string> values(entries_per_batch_);
5390     PinnableSlice* pin_values = new PinnableSlice[entries_per_batch_];
5391     std::unique_ptr<PinnableSlice[]> pin_values_guard(pin_values);
5392     std::vector<Status> stat_list(entries_per_batch_);
5393     while (static_cast<int64_t>(keys.size()) < entries_per_batch_) {
5394       key_guards.push_back(std::unique_ptr<const char[]>());
5395       keys.push_back(AllocateKey(&key_guards.back()));
5396     }
5397 
5398     std::unique_ptr<char[]> ts_guard;
5399     if (user_timestamp_size_ > 0) {
5400       ts_guard.reset(new char[user_timestamp_size_]);
5401     }
5402 
5403     Duration duration(FLAGS_duration, reads_);
5404     while (!duration.Done(entries_per_batch_)) {
5405       DB* db = SelectDB(thread);
5406       if (FLAGS_multiread_stride) {
5407         int64_t key = GetRandomKey(&thread->rand);
5408         if ((key + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
5409             static_cast<int64_t>(FLAGS_num)) {
5410           key = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
5411         }
5412         for (int64_t i = 0; i < entries_per_batch_; ++i) {
5413           GenerateKeyFromInt(key, FLAGS_num, &keys[i]);
5414           key += FLAGS_multiread_stride;
5415         }
5416       } else {
5417         for (int64_t i = 0; i < entries_per_batch_; ++i) {
5418           GenerateKeyFromInt(GetRandomKey(&thread->rand), FLAGS_num, &keys[i]);
5419         }
5420       }
5421       Slice ts;
5422       if (user_timestamp_size_ > 0) {
5423         ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
5424         options.timestamp = &ts;
5425       }
5426       if (!FLAGS_multiread_batched) {
5427         std::vector<Status> statuses = db->MultiGet(options, keys, &values);
5428         assert(static_cast<int64_t>(statuses.size()) == entries_per_batch_);
5429 
5430         read += entries_per_batch_;
5431         num_multireads++;
5432         for (int64_t i = 0; i < entries_per_batch_; ++i) {
5433           if (statuses[i].ok()) {
5434             ++found;
5435           } else if (!statuses[i].IsNotFound()) {
5436             fprintf(stderr, "MultiGet returned an error: %s\n",
5437                     statuses[i].ToString().c_str());
5438             abort();
5439           }
5440         }
5441       } else {
5442         db->MultiGet(options, db->DefaultColumnFamily(), keys.size(),
5443                      keys.data(), pin_values, stat_list.data());
5444 
5445         read += entries_per_batch_;
5446         num_multireads++;
5447         for (int64_t i = 0; i < entries_per_batch_; ++i) {
5448           if (stat_list[i].ok()) {
5449             ++found;
5450           } else if (!stat_list[i].IsNotFound()) {
5451             fprintf(stderr, "MultiGet returned an error: %s\n",
5452                     stat_list[i].ToString().c_str());
5453             abort();
5454           }
5455           stat_list[i] = Status::OK();
5456           pin_values[i].Reset();
5457         }
5458       }
5459       if (thread->shared->read_rate_limiter.get() != nullptr &&
5460           num_multireads % 256 == 255) {
5461         thread->shared->read_rate_limiter->Request(
5462             256 * entries_per_batch_, Env::IO_HIGH, nullptr /* stats */,
5463             RateLimiter::OpType::kRead);
5464       }
5465       thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kRead);
5466     }
5467 
5468     char msg[100];
5469     snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
5470              found, read);
5471     thread->stats.AddMessage(msg);
5472   }
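
  // Illustrative note on the stride mode above (hypothetical numbers): with
  // entries_per_batch_ = 4, FLAGS_multiread_stride = 2 and a base key k from
  // GetRandomKey(), one MultiGet batch requests the keys {k, k+2, k+4, k+6};
  // the base is clamped beforehand so the last key still falls below
  // FLAGS_num.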
5473 
5474   // Calls ApproximateSize over random key ranges.
5475   void ApproximateSizeRandom(ThreadState* thread) {
5476     int64_t size_sum = 0;
5477     int64_t num_sizes = 0;
5478     const size_t batch_size = entries_per_batch_;
5479     std::vector<Range> ranges;
5480     std::vector<Slice> lkeys;
5481     std::vector<std::unique_ptr<const char[]>> lkey_guards;
5482     std::vector<Slice> rkeys;
5483     std::vector<std::unique_ptr<const char[]>> rkey_guards;
5484     std::vector<uint64_t> sizes;
5485     while (ranges.size() < batch_size) {
5486       // Ugly without C++17's emplace_back, which returns a reference
5487       lkey_guards.emplace_back();
5488       rkey_guards.emplace_back();
5489       lkeys.emplace_back(AllocateKey(&lkey_guards.back()));
5490       rkeys.emplace_back(AllocateKey(&rkey_guards.back()));
5491       ranges.emplace_back(lkeys.back(), rkeys.back());
5492       sizes.push_back(0);
5493     }
5494     Duration duration(FLAGS_duration, reads_);
5495     while (!duration.Done(1)) {
5496       DB* db = SelectDB(thread);
5497       for (size_t i = 0; i < batch_size; ++i) {
5498         int64_t lkey = GetRandomKey(&thread->rand);
5499         int64_t rkey = GetRandomKey(&thread->rand);
5500         if (lkey > rkey) {
5501           std::swap(lkey, rkey);
5502         }
5503         GenerateKeyFromInt(lkey, FLAGS_num, &lkeys[i]);
5504         GenerateKeyFromInt(rkey, FLAGS_num, &rkeys[i]);
5505       }
5506       db->GetApproximateSizes(&ranges[0], static_cast<int>(entries_per_batch_),
5507                               &sizes[0]);
5508       num_sizes += entries_per_batch_;
5509       for (int64_t size : sizes) {
5510         size_sum += size;
5511       }
5512       thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kOthers);
5513     }
5514 
5515     char msg[100];
5516     snprintf(msg, sizeof(msg), "(Avg approx size=%g)",
5517              static_cast<double>(size_sum) / static_cast<double>(num_sizes));
5518     thread->stats.AddMessage(msg);
5519   }
5520 
5521   // The inverse CDF of the Pareto distribution
5522   int64_t ParetoCdfInversion(double u, double theta, double k, double sigma) {
5523     double ret;
5524     if (k == 0.0) {
5525       ret = theta - sigma * std::log(u);
5526     } else {
5527       ret = theta + sigma * (std::pow(u, -1 * k) - 1) / k;
5528     }
5529     return static_cast<int64_t>(ceil(ret));
5530   }
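
  // Worked example (illustrative values only): with u = 0.5, theta = 0,
  // k = 0 and sigma = 2, the k == 0 branch gives ret = -2 * ln(0.5) ~= 1.386,
  // so the function returns ceil(1.386) = 2. With k = 0.5 instead,
  // ret = 0 + 2 * (0.5^-0.5 - 1) / 0.5 ~= 1.657, which also yields 2.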
5531   // The inverse CDF of the power distribution (y = a * x^b)
5532   int64_t PowerCdfInversion(double u, double a, double b) {
5533     double ret;
5534     ret = std::pow((u / a), (1 / b));
5535     return static_cast<int64_t>(ceil(ret));
5536   }
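
  // Worked example (illustrative values only): inverting y = a*x^b with a = 1
  // and b = 2 gives x = (u / 1)^(1/2), so u = 4 maps to ceil(sqrt(4)) = 2 and
  // u = 10 maps to ceil(3.162...) = 4.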
5537 
5538   // Add noise to the QPS
5539   double AddNoise(double origin, double noise_ratio) {
5540     if (noise_ratio < 0.0 || noise_ratio > 1.0) {
5541       return origin;
5542     }
5543     int band_int = static_cast<int>(FLAGS_sine_a);
5544     double delta = (rand() % band_int - band_int / 2) * noise_ratio;
5545     if (origin + delta < 0) {
5546       return origin;
5547     } else {
5548       return (origin + delta);
5549     }
5550   }
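
  // Worked example (illustrative values only): with FLAGS_sine_a = 1000 and
  // noise_ratio = 0.5, rand() % 1000 lies in [0, 999], so delta lies in
  // [-250, 249.5], i.e. at most noise_ratio / 2 of the sine amplitude; if the
  // perturbed QPS would go negative, origin is returned unchanged.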
5551 
5552   // Decides the ratio among the different query types:
5553   // 0 Get, 1 Put, 2 Seek, 3 SeekForPrev, 4 Delete, 5 SingleDelete, 6 Merge
5554   class QueryDecider {
5555    public:
5556     std::vector<int> type_;
5557     std::vector<double> ratio_;
5558     int range_;
5559 
5560     QueryDecider() {}
5561     ~QueryDecider() {}
5562 
5563     Status Initiate(std::vector<double> ratio_input) {
5564       int range_max = 1000;
5565       double sum = 0.0;
5566       for (auto& ratio : ratio_input) {
5567         sum += ratio;
5568       }
5569       range_ = 0;
5570       for (auto& ratio : ratio_input) {
5571         range_ += static_cast<int>(ceil(range_max * (ratio / sum)));
5572         type_.push_back(range_);
5573         ratio_.push_back(ratio / sum);
5574       }
5575       return Status::OK();
5576     }
5577 
5578     int GetType(int64_t rand_num) {
5579       if (rand_num < 0) {
5580         rand_num = rand_num * (-1);
5581       }
5582       assert(range_ != 0);
5583       int pos = static_cast<int>(rand_num % range_);
5584       for (int i = 0; i < static_cast<int>(type_.size()); i++) {
5585         if (pos < type_[i]) {
5586           return i;
5587         }
5588       }
5589       return 0;
5590     }
5591   };
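
  // Usage sketch for QueryDecider (hypothetical ratios): Initiate({0.8, 0.15,
  // 0.05}) normalizes the input and builds type_ = {800, 950, 1000} with
  // range_ = 1000. GetType(rand) then maps rand % 1000 to a query type:
  // values in [0, 800) return 0 (Get), values in [800, 950) return 1 (Put),
  // and values in [950, 1000) return 2 (Seek).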
5592 
5593   // KeyrangeUnit describes a single key-range. A vector of KeyrangeUnits
5594   // is used to map a random value to one key-range based on its hotness.
5595   struct KeyrangeUnit {
5596     int64_t keyrange_start;
5597     int64_t keyrange_access;
5598     int64_t keyrange_keys;
5599   };
5600 
5601   // From our observations, the prefix hotness (key-range hotness) follows
5602   // the two-term exponential distribution: f(x) = a*exp(b*x) + c*exp(d*x).
5603   // However, we cannot directly use the inverse function to pick a
5604   // key-range from a random distribution. To achieve it, we create a list
5605   // of KeyrangeUnits; each KeyrangeUnit occupies a range of integers whose
5606   // size is decided by the hotness of its key-range. When a random value
5607   // is generated from a uniform distribution, we map it onto the
5608   // KeyrangeUnit vector and one KeyrangeUnit is selected; the probability
5609   // of selecting a KeyrangeUnit equals its hotness. After that, the key
5610   // can be allocated uniformly within the key-range of this KeyrangeUnit,
5611   // or we can use the power distribution (y = a*x^b) to generate the key's
5612   // offset within the selected key-range. In this way, we generate the
5613   // keyID from both the prefix hotness and the key hotness distribution.
5614   class GenerateTwoTermExpKeys {
5615    public:
5616     // Avoid uninitialized warning-as-error in some compilers
5617     int64_t keyrange_rand_max_ = 0;
5618     int64_t keyrange_size_ = 0;
5619     int64_t keyrange_num_ = 0;
5620     std::vector<KeyrangeUnit> keyrange_set_;
5621 
5622     // Initiate the KeyrangeUnit vector and calculate the size of each
5623     // KeyrangeUnit.
5624     Status InitiateExpDistribution(int64_t total_keys, double prefix_a,
5625                                    double prefix_b, double prefix_c,
5626                                    double prefix_d) {
5627       int64_t amplify = 0;
5628       int64_t keyrange_start = 0;
5629       if (FLAGS_keyrange_num <= 0) {
5630         keyrange_num_ = 1;
5631       } else {
5632         keyrange_num_ = FLAGS_keyrange_num;
5633       }
5634       keyrange_size_ = total_keys / keyrange_num_;
5635 
5636       // Calculate the key-range share sizes based on the input parameters
5637       for (int64_t pfx = keyrange_num_; pfx >= 1; pfx--) {
5638         // Step 1. Calculate the probability that this key-range will be
5639         // accessed in a query. It is based on the two-term exponential
5640         // distribution.
5641         double keyrange_p = prefix_a * std::exp(prefix_b * pfx) +
5642                             prefix_c * std::exp(prefix_d * pfx);
5643         if (keyrange_p < std::pow(10.0, -16.0)) {
5644           keyrange_p = 0.0;
5645         }
5646         // Step 2. Calculate the amplification factor.
5647         // In order to allocate a query to a key-range based on the random
5648         // number generated for this query, we need to extend the
5649         // probability of each key-range from [0,1] to [0, amplify], where
5650         // amplify = 1/(smallest non-zero key-range probability). In this
5651         // way, we ensure that every key-range is assigned an integer >= 0.
5652         if (amplify == 0 && keyrange_p > 0) {
5653           amplify = static_cast<int64_t>(std::floor(1 / keyrange_p)) + 1;
5654         }
5655 
5656         // Step 3. For each key-range, we calculate its position in the
5657         // [0, amplify] range, including the start, the size (keyrange_access)
5658         KeyrangeUnit p_unit;
5659         p_unit.keyrange_start = keyrange_start;
5660         if (0.0 >= keyrange_p) {
5661           p_unit.keyrange_access = 0;
5662         } else {
5663           p_unit.keyrange_access =
5664               static_cast<int64_t>(std::floor(amplify * keyrange_p));
5665         }
5666         p_unit.keyrange_keys = keyrange_size_;
5667         keyrange_set_.push_back(p_unit);
5668         keyrange_start += p_unit.keyrange_access;
5669       }
5670       keyrange_rand_max_ = keyrange_start;
5671 
5672       // Step 4. Shuffle the key-ranges randomly.
5673       // Since the access probability is calculated from small to large,
5674       // if we did not re-allocate them, hot key-ranges would always sit at
5675       // the end and cold key-ranges at the beginning of the key space.
5676       // Therefore, the key-ranges are shuffled, and the rand seed is
5677       // decided only by the key-range hotness distribution. With the same
5678       // distribution parameters, the shuffle results are the same.
5679       Random64 rand_loca(keyrange_rand_max_);
5680       for (int64_t i = 0; i < FLAGS_keyrange_num; i++) {
5681         int64_t pos = rand_loca.Next() % FLAGS_keyrange_num;
5682         assert(i >= 0 && i < static_cast<int64_t>(keyrange_set_.size()) &&
5683                pos >= 0 && pos < static_cast<int64_t>(keyrange_set_.size()));
5684         std::swap(keyrange_set_[i], keyrange_set_[pos]);
5685       }
5686 
5687       // Step 5. Recalculate the prefix start position after shuffling
5688       int64_t offset = 0;
5689       for (auto& p_unit : keyrange_set_) {
5690         p_unit.keyrange_start = offset;
5691         offset += p_unit.keyrange_access;
5692       }
5693 
5694       return Status::OK();
5695     }
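
    // Worked example of Steps 1-3 (hypothetical probabilities): suppose
    // keyrange_num_ = 3 and the loop computes keyrange_p = 0.02, 0.1 and 0.5.
    // The first non-zero probability fixes amplify = floor(1/0.02) + 1 = 51,
    // so the three ranges get keyrange_access = floor(51 * p) = 1, 5 and 25
    // and keyrange_rand_max_ = 31; a uniform value in [0, 31) then lands in
    // each range proportionally to its hotness.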
5696 
5697     // Generate the Key ID according to the input ini_rand and key distribution
5698     int64_t DistGetKeyID(int64_t ini_rand, double key_dist_a,
5699                          double key_dist_b) {
5700       int64_t keyrange_rand = ini_rand % keyrange_rand_max_;
5701 
5702       // Calculate and select one key-range that contains the new key
5703       int64_t start = 0, end = static_cast<int64_t>(keyrange_set_.size());
5704       while (start + 1 < end) {
5705         int64_t mid = start + (end - start) / 2;
5706         assert(mid >= 0 && mid < static_cast<int64_t>(keyrange_set_.size()));
5707         if (keyrange_rand < keyrange_set_[mid].keyrange_start) {
5708           end = mid;
5709         } else {
5710           start = mid;
5711         }
5712       }
5713       int64_t keyrange_id = start;
5714 
5715       // Select one key in the key-range and compose the keyID
5716       int64_t key_offset = 0, key_seed;
5717       if (key_dist_a == 0.0 || key_dist_b == 0.0) {
5718         key_offset = ini_rand % keyrange_size_;
5719       } else {
5720         double u =
5721             static_cast<double>(ini_rand % keyrange_size_) / keyrange_size_;
5722         key_seed = static_cast<int64_t>(
5723             ceil(std::pow((u / key_dist_a), (1 / key_dist_b))));
5724         Random64 rand_key(key_seed);
5725         key_offset = rand_key.Next() % keyrange_size_;
5726       }
5727       return keyrange_size_ * keyrange_id + key_offset;
5728     }
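
    // Continuing the worked example above and ignoring the shuffle step:
    // with keyrange_access = {1, 5, 25} the starts are {0, 1, 6}, so a
    // keyrange_rand of 10 binary-searches to the unit starting at 6 (the
    // hottest one), and the key offset is then drawn within that unit's
    // keyrange_size_ keys, either uniformly or via the power distribution.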
5729   };
5730 
5731   // The social-graph workload mixes Get, Put, and Iterator queries.
5732   // The value size and iterator length follow a Pareto distribution.
5733   // The overall key access follows a power distribution. If the user
5734   // models the workload based on different key-ranges (or prefixes), a
5735   // two-term exponential distribution can be used to fit the workload.
5736   // The user needs to decide the ratio among Get, Put, and Iterator
5737   // queries before starting the benchmark.
5738   void MixGraph(ThreadState* thread) {
5739     int64_t read = 0;  // including single gets and Next of iterators
5740     int64_t gets = 0;
5741     int64_t puts = 0;
5742     int64_t found = 0;
5743     int64_t seek = 0;
5744     int64_t seek_found = 0;
5745     int64_t bytes = 0;
5746     const int64_t default_value_max = 1 * 1024 * 1024;
5747     int64_t value_max = default_value_max;
5748     int64_t scan_len_max = FLAGS_mix_max_scan_len;
5749     double write_rate = 1000000.0;
5750     double read_rate = 1000000.0;
5751     bool use_prefix_modeling = false;
5752     bool use_random_modeling = false;
5753     GenerateTwoTermExpKeys gen_exp;
5754     std::vector<double> ratio{FLAGS_mix_get_ratio, FLAGS_mix_put_ratio,
5755                               FLAGS_mix_seek_ratio};
5756     char value_buffer[default_value_max];
5757     QueryDecider query;
5758     RandomGenerator gen;
5759     Status s;
5760     if (value_max > FLAGS_mix_max_value_size) {
5761       value_max = FLAGS_mix_max_value_size;
5762     }
5763 
5764     ReadOptions options(FLAGS_verify_checksum, true);
5765     std::unique_ptr<const char[]> key_guard;
5766     Slice key = AllocateKey(&key_guard);
5767     PinnableSlice pinnable_val;
5768     query.Initiate(ratio);
5769 
5770     // Initialize the QPS rate limiters
5771     if (FLAGS_sine_a != 0 || FLAGS_sine_d != 0) {
5772       thread->shared->read_rate_limiter.reset(NewGenericRateLimiter(
5773           static_cast<int64_t>(read_rate), 100000 /* refill_period_us */, 10 /* fairness */,
5774           RateLimiter::Mode::kReadsOnly));
5775       thread->shared->write_rate_limiter.reset(
5776           NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
5777     }
5778 
5779     // Decide whether the user wants prefix-based key generation
5780     if (FLAGS_keyrange_dist_a != 0.0 || FLAGS_keyrange_dist_b != 0.0 ||
5781         FLAGS_keyrange_dist_c != 0.0 || FLAGS_keyrange_dist_d != 0.0) {
5782       use_prefix_modeling = true;
5783       gen_exp.InitiateExpDistribution(
5784           FLAGS_num, FLAGS_keyrange_dist_a, FLAGS_keyrange_dist_b,
5785           FLAGS_keyrange_dist_c, FLAGS_keyrange_dist_d);
5786     }
5787     if (FLAGS_key_dist_a == 0 || FLAGS_key_dist_b == 0) {
5788       use_random_modeling = true;
5789     }
5790 
5791     Duration duration(FLAGS_duration, reads_);
5792     while (!duration.Done(1)) {
5793       DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
5794       int64_t ini_rand, rand_v, key_rand, key_seed;
5795       ini_rand = GetRandomKey(&thread->rand);
5796       rand_v = ini_rand % FLAGS_num;
5797       double u = static_cast<double>(rand_v) / FLAGS_num;
5798 
5799       // Generate the keyID based on the key hotness and prefix hotness
5800       if (use_random_modeling) {
5801         key_rand = ini_rand;
5802       } else if (use_prefix_modeling) {
5803         key_rand =
5804             gen_exp.DistGetKeyID(ini_rand, FLAGS_key_dist_a, FLAGS_key_dist_b);
5805       } else {
5806         key_seed = PowerCdfInversion(u, FLAGS_key_dist_a, FLAGS_key_dist_b);
5807         Random64 rand(key_seed);
5808         key_rand = static_cast<int64_t>(rand.Next()) % FLAGS_num;
5809       }
5810       GenerateKeyFromInt(key_rand, FLAGS_num, &key);
5811       int query_type = query.GetType(rand_v);
5812 
5813       // Periodically adjust the QPS
5814       uint64_t now = FLAGS_env->NowMicros();
5815       uint64_t usecs_since_last;
5816       if (now > thread->stats.GetSineInterval()) {
5817         usecs_since_last = now - thread->stats.GetSineInterval();
5818       } else {
5819         usecs_since_last = 0;
5820       }
5821 
5822       if (usecs_since_last >
5823           (FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000})) {
5824         double usecs_since_start =
5825             static_cast<double>(now - thread->stats.GetStart());
5826         thread->stats.ResetSineInterval();
5827         double mix_rate_with_noise = AddNoise(
5828             SineRate(usecs_since_start / 1000000.0), FLAGS_sine_mix_rate_noise);
5829         read_rate = mix_rate_with_noise * (query.ratio_[0] + query.ratio_[2]);
5830         write_rate =
5831             mix_rate_with_noise * query.ratio_[1] * FLAGS_mix_ave_kv_size;
5832 
5833         thread->shared->write_rate_limiter.reset(
5834             NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
5835         thread->shared->read_rate_limiter.reset(NewGenericRateLimiter(
5836             static_cast<int64_t>(read_rate),
5837             FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000}, 10,
5838             RateLimiter::Mode::kReadsOnly));
5839       }
5840       // Start the query
5841       if (query_type == 0) {
5842         // the Get query
5843         gets++;
5844         read++;
5845         if (FLAGS_num_column_families > 1) {
5846           s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_rand), key,
5847                                    &pinnable_val);
5848         } else {
5849           pinnable_val.Reset();
5850           s = db_with_cfh->db->Get(options,
5851                                    db_with_cfh->db->DefaultColumnFamily(), key,
5852                                    &pinnable_val);
5853         }
5854 
5855         if (s.ok()) {
5856           found++;
5857           bytes += key.size() + pinnable_val.size();
5858         } else if (!s.IsNotFound()) {
5859           fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
5860           abort();
5861         }
5862 
5863         if (thread->shared->read_rate_limiter.get() != nullptr &&
5864             read % 256 == 255) {
5865           thread->shared->read_rate_limiter->Request(
5866               256, Env::IO_HIGH, nullptr /* stats */,
5867               RateLimiter::OpType::kRead);
5868         }
5869         thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
5870       } else if (query_type == 1) {
5871         // the Put query
5872         puts++;
5873         int64_t val_size = ParetoCdfInversion(
5874             u, FLAGS_value_theta, FLAGS_value_k, FLAGS_value_sigma);
5875         if (val_size < 0) {
5876           val_size = 10;
5877         } else if (val_size > value_max) {
5878           val_size = val_size % value_max;
5879         }
5880         s = db_with_cfh->db->Put(
5881             write_options_, key,
5882             gen.Generate(static_cast<unsigned int>(val_size)));
5883         if (!s.ok()) {
5884           fprintf(stderr, "put error: %s\n", s.ToString().c_str());
5885           ErrorExit();
5886         }
5887 
5888         if (thread->shared->write_rate_limiter) {
5889           thread->shared->write_rate_limiter->Request(
5890               key.size() + val_size, Env::IO_HIGH, nullptr /*stats*/,
5891               RateLimiter::OpType::kWrite);
5892         }
5893         thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kWrite);
5894       } else if (query_type == 2) {
5895         // Seek query
5896         if (db_with_cfh->db != nullptr) {
5897           Iterator* single_iter = nullptr;
5898           single_iter = db_with_cfh->db->NewIterator(options);
5899           if (single_iter != nullptr) {
5900             single_iter->Seek(key);
5901             seek++;
5902             read++;
5903             if (single_iter->Valid() && single_iter->key().compare(key) == 0) {
5904               seek_found++;
5905             }
5906             int64_t scan_length =
5907                 ParetoCdfInversion(u, FLAGS_iter_theta, FLAGS_iter_k,
5908                                    FLAGS_iter_sigma) %
5909                 scan_len_max;
5910             for (int64_t j = 0; j < scan_length && single_iter->Valid(); j++) {
5911               Slice value = single_iter->value();
5912               memcpy(value_buffer, value.data(),
5913                      std::min(value.size(), sizeof(value_buffer)));
5914               bytes += single_iter->key().size() + single_iter->value().size();
5915               single_iter->Next();
5916               assert(single_iter->status().ok());
5917             }
5918           }
5919           delete single_iter;
5920         }
5921         thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kSeek);
5922       }
5923     }
5924     char msg[256];
5925     snprintf(msg, sizeof(msg),
5926              "( Gets:%" PRIu64 " Puts:%" PRIu64 " Seek:%" PRIu64 " of %" PRIu64
5927              " in %" PRIu64 " found)\n",
5928              gets, puts, seek, found, read);
5929 
5930     thread->stats.AddBytes(bytes);
5931     thread->stats.AddMessage(msg);
5932 
5933     if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
5934       thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
5935                                get_perf_context()->ToString());
5936     }
5937   }
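
  // A hypothetical invocation of MixGraph (assuming it is registered under
  // the benchmark name "mixgraph"; the flag values are illustrative only,
  // not a recommended model fit):
  //   ./db_bench --benchmarks=mixgraph --num=50000000 \
  //     --mix_get_ratio=0.83 --mix_put_ratio=0.14 --mix_seek_ratio=0.03 \
  //     --key_dist_a=0.002312 --key_dist_b=0.3467 \
  //     --keyrange_num=30 --keyrange_dist_a=14.18 --keyrange_dist_b=-2.917 \
  //     --keyrange_dist_c=0.0164 --keyrange_dist_d=-0.08082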
5938 
5939   void IteratorCreation(ThreadState* thread) {
5940     Duration duration(FLAGS_duration, reads_);
5941     ReadOptions options(FLAGS_verify_checksum, true);
5942     std::unique_ptr<char[]> ts_guard;
5943     if (user_timestamp_size_ > 0) {
5944       ts_guard.reset(new char[user_timestamp_size_]);
5945     }
5946     while (!duration.Done(1)) {
5947       DB* db = SelectDB(thread);
5948       Slice ts;
5949       if (user_timestamp_size_ > 0) {
5950         ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
5951         options.timestamp = &ts;
5952       }
5953       Iterator* iter = db->NewIterator(options);
5954       delete iter;
5955       thread->stats.FinishedOps(nullptr, db, 1, kOthers);
5956     }
5957   }
5958 
5959   void IteratorCreationWhileWriting(ThreadState* thread) {
5960     if (thread->tid > 0) {
5961       IteratorCreation(thread);
5962     } else {
5963       BGWriter(thread, kWrite);
5964     }
5965   }
5966 
5967   void SeekRandom(ThreadState* thread) {
5968     int64_t read = 0;
5969     int64_t found = 0;
5970     int64_t bytes = 0;
5971     ReadOptions options(FLAGS_verify_checksum, true);
5972     options.total_order_seek = FLAGS_total_order_seek;
5973     options.prefix_same_as_start = FLAGS_prefix_same_as_start;
5974     options.tailing = FLAGS_use_tailing_iterator;
5975     options.readahead_size = FLAGS_readahead_size;
5976     std::unique_ptr<char[]> ts_guard;
5977     Slice ts;
5978     if (user_timestamp_size_ > 0) {
5979       ts_guard.reset(new char[user_timestamp_size_]);
5980       ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
5981       options.timestamp = &ts;
5982     }
5983 
5984     Iterator* single_iter = nullptr;
5985     std::vector<Iterator*> multi_iters;
5986     if (db_.db != nullptr) {
5987       single_iter = db_.db->NewIterator(options);
5988     } else {
5989       for (const auto& db_with_cfh : multi_dbs_) {
5990         multi_iters.push_back(db_with_cfh.db->NewIterator(options));
5991       }
5992     }
5993 
5994     std::unique_ptr<const char[]> key_guard;
5995     Slice key = AllocateKey(&key_guard);
5996 
5997     std::unique_ptr<const char[]> upper_bound_key_guard;
5998     Slice upper_bound = AllocateKey(&upper_bound_key_guard);
5999     std::unique_ptr<const char[]> lower_bound_key_guard;
6000     Slice lower_bound = AllocateKey(&lower_bound_key_guard);
6001 
6002     Duration duration(FLAGS_duration, reads_);
6003     char value_buffer[256];
6004     while (!duration.Done(1)) {
6005       int64_t seek_pos = thread->rand.Next() % FLAGS_num;
6006       GenerateKeyFromIntForSeek(static_cast<uint64_t>(seek_pos), FLAGS_num,
6007                                 &key);
6008       if (FLAGS_max_scan_distance != 0) {
6009         if (FLAGS_reverse_iterator) {
6010           GenerateKeyFromInt(
6011               static_cast<uint64_t>(std::max(
6012                   static_cast<int64_t>(0), seek_pos - FLAGS_max_scan_distance)),
6013               FLAGS_num, &lower_bound);
6014           options.iterate_lower_bound = &lower_bound;
6015         } else {
6016           auto min_num =
6017               std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
6018           GenerateKeyFromInt(static_cast<uint64_t>(min_num), FLAGS_num,
6019                              &upper_bound);
6020           options.iterate_upper_bound = &upper_bound;
6021         }
6022       }
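
      // Illustrative example of the bounds above (hypothetical numbers): with
      // FLAGS_num = 1000, seek_pos = 990 and FLAGS_max_scan_distance = 50, a
      // forward iterator gets iterate_upper_bound = key(min(1000, 1040)) =
      // key(1000), while a reverse iterator gets iterate_lower_bound =
      // key(max(0, 940)) = key(940).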
6023 
6024       if (!FLAGS_use_tailing_iterator) {
6025         if (db_.db != nullptr) {
6026           delete single_iter;
6027           single_iter = db_.db->NewIterator(options);
6028         } else {
6029           for (auto iter : multi_iters) {
6030             delete iter;
6031           }
6032           multi_iters.clear();
6033           for (const auto& db_with_cfh : multi_dbs_) {
6034             multi_iters.push_back(db_with_cfh.db->NewIterator(options));
6035           }
6036         }
6037       }
6038       // Pick an iterator to use
6039       Iterator* iter_to_use = single_iter;
6040       if (single_iter == nullptr) {
6041         iter_to_use = multi_iters[thread->rand.Next() % multi_iters.size()];
6042       }
6043 
6044       iter_to_use->Seek(key);
6045       read++;
6046       if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
6047         found++;
6048       }
6049 
6050       for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) {
6051         // Copy out the iterator's value to make sure we read it.
6052         Slice value = iter_to_use->value();
6053         memcpy(value_buffer, value.data(),
6054                std::min(value.size(), sizeof(value_buffer)));
6055         bytes += iter_to_use->key().size() + iter_to_use->value().size();
6056 
6057         if (!FLAGS_reverse_iterator) {
6058           iter_to_use->Next();
6059         } else {
6060           iter_to_use->Prev();
6061         }
6062         assert(iter_to_use->status().ok());
6063       }
6064 
6065       if (thread->shared->read_rate_limiter.get() != nullptr &&
6066           read % 256 == 255) {
6067         thread->shared->read_rate_limiter->Request(
6068             256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
6069       }
6070 
6071       thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
6072     }
6073     delete single_iter;
6074     for (auto iter : multi_iters) {
6075       delete iter;
6076     }
6077 
6078     char msg[100];
6079     snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
6080              found, read);
6081     thread->stats.AddBytes(bytes);
6082     thread->stats.AddMessage(msg);
6083     if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
6084       thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
6085                                get_perf_context()->ToString());
6086     }
6087   }
6088 
6089   void SeekRandomWhileWriting(ThreadState* thread) {
6090     if (thread->tid > 0) {
6091       SeekRandom(thread);
6092     } else {
6093       BGWriter(thread, kWrite);
6094     }
6095   }
6096 
6097   void SeekRandomWhileMerging(ThreadState* thread) {
6098     if (thread->tid > 0) {
6099       SeekRandom(thread);
6100     } else {
6101       BGWriter(thread, kMerge);
6102     }
6103   }
6104 
6105   void DoDelete(ThreadState* thread, bool seq) {
6106     WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
6107                      user_timestamp_size_);
6108     Duration duration(seq ? 0 : FLAGS_duration, deletes_);
6109     int64_t i = 0;
6110     std::unique_ptr<const char[]> key_guard;
6111     Slice key = AllocateKey(&key_guard);
6112     std::unique_ptr<char[]> ts_guard;
6113     Slice ts;
6114     if (user_timestamp_size_ > 0) {
6115       ts_guard.reset(new char[user_timestamp_size_]);
6116     }
6117 
6118     while (!duration.Done(entries_per_batch_)) {
6119       DB* db = SelectDB(thread);
6120       batch.Clear();
6121       for (int64_t j = 0; j < entries_per_batch_; ++j) {
6122         const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
6123         GenerateKeyFromInt(k, FLAGS_num, &key);
6124         batch.Delete(key);
6125       }
6126       Status s;
6127       if (user_timestamp_size_ > 0) {
6128         ts = mock_app_clock_->Allocate(ts_guard.get());
6129         s = batch.AssignTimestamp(ts);
6130         if (!s.ok()) {
6131           fprintf(stderr, "assign timestamp: %s\n", s.ToString().c_str());
6132           ErrorExit();
6133         }
6134       }
6135       s = db->Write(write_options_, &batch);
6136       thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kDelete);
6137       if (!s.ok()) {
6138         fprintf(stderr, "del error: %s\n", s.ToString().c_str());
6139         exit(1);
6140       }
6141       i += entries_per_batch_;
6142     }
6143   }
6144 
6145   void DeleteSeq(ThreadState* thread) {
6146     DoDelete(thread, true);
6147   }
6148 
6149   void DeleteRandom(ThreadState* thread) {
6150     DoDelete(thread, false);
6151   }
6152 
6153   void ReadWhileWriting(ThreadState* thread) {
6154     if (thread->tid > 0) {
6155       ReadRandom(thread);
6156     } else {
6157       BGWriter(thread, kWrite);
6158     }
6159   }
6160 
6161   void ReadWhileMerging(ThreadState* thread) {
6162     if (thread->tid > 0) {
6163       ReadRandom(thread);
6164     } else {
6165       BGWriter(thread, kMerge);
6166     }
6167   }
6168 
6169   void BGWriter(ThreadState* thread, enum OperationType write_merge) {
6170     // Special thread that keeps writing until other threads are done.
6171     RandomGenerator gen;
6172     int64_t bytes = 0;
6173 
6174     std::unique_ptr<RateLimiter> write_rate_limiter;
6175     if (FLAGS_benchmark_write_rate_limit > 0) {
6176       write_rate_limiter.reset(
6177           NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
6178     }
6179 
6180     // Don't merge stats from this thread with the readers.
6181     thread->stats.SetExcludeFromMerge();
6182 
6183     std::unique_ptr<const char[]> key_guard;
6184     Slice key = AllocateKey(&key_guard);
6185     std::unique_ptr<char[]> ts_guard;
6186     if (user_timestamp_size_ > 0) {
6187       ts_guard.reset(new char[user_timestamp_size_]);
6188     }
6189     uint32_t written = 0;
6190     bool hint_printed = false;
6191 
6192     while (true) {
6193       DB* db = SelectDB(thread);
6194       {
6195         MutexLock l(&thread->shared->mu);
6196         if (FLAGS_finish_after_writes && written == writes_) {
6197           fprintf(stderr, "Exiting the writer after %u writes...\n", written);
6198           break;
6199         }
6200         if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
6201           // Other threads have finished
6202           if (FLAGS_finish_after_writes) {
6203             // Wait for the writes to be finished
6204             if (!hint_printed) {
6205               fprintf(stderr, "Reads are finished. Have %d more writes to do\n",
6206                       static_cast<int>(writes_) - written);
6207               hint_printed = true;
6208             }
6209           } else {
6210             // Finish the write immediately
6211             break;
6212           }
6213         }
6214       }
6215 
6216       GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
6217       Status s;
6218 
6219       Slice val = gen.Generate();
6220       Slice ts;
6221       if (user_timestamp_size_ > 0) {
6222         ts = mock_app_clock_->Allocate(ts_guard.get());
6223         write_options_.timestamp = &ts;
6224       }
6225       if (write_merge == kWrite) {
6226         s = db->Put(write_options_, key, val);
6227       } else {
6228         s = db->Merge(write_options_, key, val);
6229       }
6230       // Restore write_options_
6231       if (user_timestamp_size_ > 0) {
6232         write_options_.timestamp = nullptr;
6233       }
6234       written++;
6235 
6236       if (!s.ok()) {
6237         fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
6238         exit(1);
6239       }
6240       bytes += key.size() + val.size() + user_timestamp_size_;
6241       thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
6242 
6243       if (FLAGS_benchmark_write_rate_limit > 0) {
6244         write_rate_limiter->Request(
6245             key.size() + val.size(), Env::IO_HIGH,
6246             nullptr /* stats */, RateLimiter::OpType::kWrite);
6247       }
6248     }
6249     thread->stats.AddBytes(bytes);
6250   }
6251 
6252   void ReadWhileScanning(ThreadState* thread) {
6253     if (thread->tid > 0) {
6254       ReadRandom(thread);
6255     } else {
6256       BGScan(thread);
6257     }
6258   }
6259 
6260   void BGScan(ThreadState* thread) {
6261     if (FLAGS_num_multi_db > 0) {
6262       fprintf(stderr, "Not supporting multiple DBs.\n");
6263       abort();
6264     }
6265     assert(db_.db != nullptr);
6266     ReadOptions read_options;
6267     std::unique_ptr<char[]> ts_guard;
6268     Slice ts;
6269     if (user_timestamp_size_ > 0) {
6270       ts_guard.reset(new char[user_timestamp_size_]);
6271       ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
6272       read_options.timestamp = &ts;
6273     }
6274     Iterator* iter = db_.db->NewIterator(read_options);
6275 
6276     fprintf(stderr, "num reads to do %" PRIu64 "\n", reads_);
6277     Duration duration(FLAGS_duration, reads_);
6278     uint64_t num_seek_to_first = 0;
6279     uint64_t num_next = 0;
6280     while (!duration.Done(1)) {
6281       if (!iter->Valid()) {
6282         iter->SeekToFirst();
6283         num_seek_to_first++;
6284       } else if (!iter->status().ok()) {
6285         fprintf(stderr, "Iterator error: %s\n",
6286                 iter->status().ToString().c_str());
6287         abort();
6288       } else {
6289         iter->Next();
6290         num_next++;
6291       }
6292 
6293       thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
6294     }
6295     delete iter;
6296   }
6297 
6298   // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
6299   // in the DB atomically, i.e. in a single batch. See also GetMany.
6300   Status PutMany(DB* db, const WriteOptions& writeoptions, const Slice& key,
6301                  const Slice& value) {
6302     std::string suffixes[3] = {"2", "1", "0"};
6303     std::string keys[3];
6304 
6305     WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
6306                      user_timestamp_size_);
6307     Status s;
6308     for (int i = 0; i < 3; i++) {
6309       keys[i] = key.ToString() + suffixes[i];
6310       batch.Put(keys[i], value);
6311     }
6312 
6313     std::unique_ptr<char[]> ts_guard;
6314     if (user_timestamp_size_ > 0) {
6315       ts_guard.reset(new char[user_timestamp_size_]);
6316       Slice ts = mock_app_clock_->Allocate(ts_guard.get());
6317       s = batch.AssignTimestamp(ts);
6318       if (!s.ok()) {
6319         fprintf(stderr, "assign timestamp to batch: %s\n",
6320                 s.ToString().c_str());
6321         ErrorExit();
6322       }
6323     }
6324 
6325     s = db->Write(writeoptions, &batch);
6326     return s;
6327   }
6328 
6330   // Given a key K, this deletes the keys K+"0", K+"1" and K+"2" from the
6331   // DB atomically, i.e. in a single batch. See also GetMany.
6332   Status DeleteMany(DB* db, const WriteOptions& writeoptions,
6333                     const Slice& key) {
6334     std::string suffixes[3] = {"1", "2", "0"};
6335     std::string keys[3];
6336 
6337     WriteBatch batch(0, 0, user_timestamp_size_);
6338     Status s;
6339     for (int i = 0; i < 3; i++) {
6340       keys[i] = key.ToString() + suffixes[i];
6341       batch.Delete(keys[i]);
6342     }
6343 
6344     std::unique_ptr<char[]> ts_guard;
6345     if (user_timestamp_size_ > 0) {
6346       ts_guard.reset(new char[user_timestamp_size_]);
6347       Slice ts = mock_app_clock_->Allocate(ts_guard.get());
6348       s = batch.AssignTimestamp(ts);
6349       if (!s.ok()) {
6350         fprintf(stderr, "assign timestamp to batch: %s\n",
6351                 s.ToString().c_str());
6352         ErrorExit();
6353       }
6354     }
6355 
6356     s = db->Write(writeoptions, &batch);
6357     return s;
6358   }
6359 
6360   // Given a key K and value V, this gets values for K+"0", K+"1" and K+"2"
6361   // in the same snapshot, and verifies that all the values are identical.
6362   // ASSUMES that PutMany was used to put (K, V) into the DB.
6363   Status GetMany(DB* db, const ReadOptions& readoptions, const Slice& key,
6364                  std::string* value) {
6365     std::string suffixes[3] = {"0", "1", "2"};
6366     std::string keys[3];
6367     Slice key_slices[3];
6368     std::string values[3];
6369     ReadOptions readoptionscopy = readoptions;
6370 
6371     std::unique_ptr<char[]> ts_guard;
6372     Slice ts;
6373     if (user_timestamp_size_ > 0) {
6374       ts_guard.reset(new char[user_timestamp_size_]);
6375       ts = mock_app_clock_->Allocate(ts_guard.get());
6376       readoptionscopy.timestamp = &ts;
6377     }
6378 
6379     readoptionscopy.snapshot = db->GetSnapshot();
6380     Status s;
6381     for (int i = 0; i < 3; i++) {
6382       keys[i] = key.ToString() + suffixes[i];
6383       key_slices[i] = keys[i];
6384       s = db->Get(readoptionscopy, key_slices[i], value);
6385       if (!s.ok() && !s.IsNotFound()) {
6386         fprintf(stderr, "get error: %s\n", s.ToString().c_str());
6387         values[i] = "";
6388         // we continue after error rather than exiting so that we can
6389         // find more errors if any
6390       } else if (s.IsNotFound()) {
6391         values[i] = "";
6392       } else {
6393         values[i] = *value;
6394       }
6395     }
6396     db->ReleaseSnapshot(readoptionscopy.snapshot);
6397 
6398     if ((values[0] != values[1]) || (values[1] != values[2])) {
6399       fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n",
6400               key.ToString().c_str(), values[0].c_str(), values[1].c_str(),
6401               values[2].c_str());
6402       // we continue after error rather than exiting so that we can
6403       // find more errors if any
6404     }
6405 
6406     return s;
6407   }
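
  // Note (illustrative): because PutMany and DeleteMany apply their three
  // updates in a single WriteBatch, and GetMany reads K+"0", K+"1" and K+"2"
  // under one snapshot, a GetMany racing with them must observe either all
  // three values or none of them; seeing e.g. {"v", "v", ""} would indicate a
  // snapshot-consistency bug, which is what RandomWithVerify looks for.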
6408 
6409   // Differs from readrandomwriterandom in the following ways:
6410   // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs.
6411   // (b) Does deletes as well (per FLAGS_deletepercent)
6412   // (c) In order to achieve high % of 'found' during lookups, and to do
6413   //     multiple writes (including puts and deletes) it uses up to
6414   //     FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys.
6415   // (d) Does not have a MultiGet option.
6416   void RandomWithVerify(ThreadState* thread) {
6417     ReadOptions options(FLAGS_verify_checksum, true);
6418     RandomGenerator gen;
6419     std::string value;
6420     int64_t found = 0;
6421     int get_weight = 0;
6422     int put_weight = 0;
6423     int delete_weight = 0;
6424     int64_t gets_done = 0;
6425     int64_t puts_done = 0;
6426     int64_t deletes_done = 0;
6427 
6428     std::unique_ptr<const char[]> key_guard;
6429     Slice key = AllocateKey(&key_guard);
6430 
6431     // the number of iterations is the larger of reads_ or writes_
6432     for (int64_t i = 0; i < readwrites_; i++) {
6433       DB* db = SelectDB(thread);
6434       if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
6435         // one batch completed, reinitialize for next batch
6436         get_weight = FLAGS_readwritepercent;
6437         delete_weight = FLAGS_deletepercent;
6438         put_weight = 100 - get_weight - delete_weight;
6439       }
6440       GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
6441           FLAGS_numdistinct, &key);
6442       if (get_weight > 0) {
6443         // do all the gets first
6444         Status s = GetMany(db, options, key, &value);
6445         if (!s.ok() && !s.IsNotFound()) {
6446           fprintf(stderr, "getmany error: %s\n", s.ToString().c_str());
6447           // we continue after error rather than exiting so that we can
6448           // find more errors if any
6449         } else if (!s.IsNotFound()) {
6450           found++;
6451         }
6452         get_weight--;
6453         gets_done++;
6454         thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
6455       } else if (put_weight > 0) {
6456         // then do all the corresponding number of puts
6457         // for all the gets we have done earlier
6458         Status s = PutMany(db, write_options_, key, gen.Generate());
6459         if (!s.ok()) {
6460           fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
6461           exit(1);
6462         }
6463         put_weight--;
6464         puts_done++;
6465         thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
6466       } else if (delete_weight > 0) {
6467         Status s = DeleteMany(db, write_options_, key);
6468         if (!s.ok()) {
6469           fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
6470           exit(1);
6471         }
6472         delete_weight--;
6473         deletes_done++;
6474         thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
6475       }
6476     }
6477     char msg[128];
6478     snprintf(msg, sizeof(msg),
6479              "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" \
6480              PRIu64 " found:%" PRIu64 ")",
6481              gets_done, puts_done, deletes_done, readwrites_, found);
6482     thread->stats.AddMessage(msg);
6483   }
6484 
6485   // This is different from ReadWhileWriting because it does not use
6486   // an extra thread.
6487   void ReadRandomWriteRandom(ThreadState* thread) {
6488     ReadOptions options(FLAGS_verify_checksum, true);
6489     RandomGenerator gen;
6490     std::string value;
6491     int64_t found = 0;
6492     int get_weight = 0;
6493     int put_weight = 0;
6494     int64_t reads_done = 0;
6495     int64_t writes_done = 0;
6496     Duration duration(FLAGS_duration, readwrites_);
6497 
6498     std::unique_ptr<const char[]> key_guard;
6499     Slice key = AllocateKey(&key_guard);
6500 
6501     std::unique_ptr<char[]> ts_guard;
6502     if (user_timestamp_size_ > 0) {
6503       ts_guard.reset(new char[user_timestamp_size_]);
6504     }
6505 
6506     // the number of iterations is the larger of reads_ or writes_
6507     while (!duration.Done(1)) {
6508       DB* db = SelectDB(thread);
6509       GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
6510       if (get_weight == 0 && put_weight == 0) {
6511         // one batch completed, reinitialize for next batch
6512         get_weight = FLAGS_readwritepercent;
6513         put_weight = 100 - get_weight;
6514       }
6515       if (get_weight > 0) {
6516         // do all the gets first
6517         Slice ts;
6518         if (user_timestamp_size_ > 0) {
6519           ts = mock_app_clock_->GetTimestampForRead(thread->rand,
6520                                                     ts_guard.get());
6521           options.timestamp = &ts;
6522         }
6523         Status s = db->Get(options, key, &value);
6524         if (!s.ok() && !s.IsNotFound()) {
6525           fprintf(stderr, "get error: %s\n", s.ToString().c_str());
6526           // we continue after error rather than exiting so that we can
6527           // find more errors if any
6528         } else if (!s.IsNotFound()) {
6529           found++;
6530         }
6531         get_weight--;
6532         reads_done++;
6533         thread->stats.FinishedOps(nullptr, db, 1, kRead);
6534       } else if (put_weight > 0) {
6535         // then do all the corresponding number of puts
6536         // for all the gets we have done earlier
6537         Slice ts;
6538         if (user_timestamp_size_ > 0) {
6539           ts = mock_app_clock_->Allocate(ts_guard.get());
6540           write_options_.timestamp = &ts;
6541         }
6542         Status s = db->Put(write_options_, key, gen.Generate());
6543         if (!s.ok()) {
6544           fprintf(stderr, "put error: %s\n", s.ToString().c_str());
6545           ErrorExit();
6546         }
6547         put_weight--;
6548         writes_done++;
6549         thread->stats.FinishedOps(nullptr, db, 1, kWrite);
6550       }
6551     }
6552     char msg[100];
6553     snprintf(msg, sizeof(msg), "( reads:%" PRIu64 " writes:%" PRIu64 \
6554              " total:%" PRIu64 " found:%" PRIu64 ")",
6555              reads_done, writes_done, readwrites_, found);
6556     thread->stats.AddMessage(msg);
6557   }
6558 
6559   //
6560   // Read-modify-write for random keys
6561   void UpdateRandom(ThreadState* thread) {
6562     ReadOptions options(FLAGS_verify_checksum, true);
6563     RandomGenerator gen;
6564     std::string value;
6565     int64_t found = 0;
6566     int64_t bytes = 0;
6567     Duration duration(FLAGS_duration, readwrites_);
6568 
6569     std::unique_ptr<const char[]> key_guard;
6570     Slice key = AllocateKey(&key_guard);
6571     std::unique_ptr<char[]> ts_guard;
6572     if (user_timestamp_size_ > 0) {
6573       ts_guard.reset(new char[user_timestamp_size_]);
6574     }
6575     // the number of iterations is the larger of reads_ or writes_
6576     while (!duration.Done(1)) {
6577       DB* db = SelectDB(thread);
6578       GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
6579       Slice ts;
6580       if (user_timestamp_size_ > 0) {
6581         // Read with newest timestamp because we are doing rmw.
6582         ts = mock_app_clock_->Allocate(ts_guard.get());
6583         options.timestamp = &ts;
6584       }
6585 
6586       auto status = db->Get(options, key, &value);
6587       if (status.ok()) {
6588         ++found;
6589         bytes += key.size() + value.size() + user_timestamp_size_;
6590       } else if (!status.IsNotFound()) {
6591         fprintf(stderr, "Get returned an error: %s\n",
6592                 status.ToString().c_str());
6593         abort();
6594       }
6595 
6596       if (thread->shared->write_rate_limiter) {
6597         thread->shared->write_rate_limiter->Request(
6598             key.size() + value.size(), Env::IO_HIGH, nullptr /*stats*/,
6599             RateLimiter::OpType::kWrite);
6600       }
6601 
6602       Slice val = gen.Generate();
6603       if (user_timestamp_size_ > 0) {
6604         ts = mock_app_clock_->Allocate(ts_guard.get());
6605         write_options_.timestamp = &ts;
6606       }
6607       Status s = db->Put(write_options_, key, val);
6608       if (!s.ok()) {
6609         fprintf(stderr, "put error: %s\n", s.ToString().c_str());
6610         exit(1);
6611       }
6612       bytes += key.size() + val.size() + user_timestamp_size_;
6613       thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
6614     }
6615     char msg[100];
6616     snprintf(msg, sizeof(msg),
6617              "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found);
6618     thread->stats.AddBytes(bytes);
6619     thread->stats.AddMessage(msg);
6620   }
6621 
6622   // Read-XOR-write for random keys. XORs the existing value with a randomly
6623   // generated value, and stores the result. Assuming A is the array of bytes
6624   // representing the existing value, we generate an array B of the same size,
6625   // then compute C = A^B as C[i] = A[i]^B[i], and store C.
6626   void XORUpdateRandom(ThreadState* thread) {
6627     ReadOptions options(FLAGS_verify_checksum, true);
6628     RandomGenerator gen;
6629     std::string existing_value;
6630     int64_t found = 0;
6631     Duration duration(FLAGS_duration, readwrites_);
6632 
6633     BytesXOROperator xor_operator;
6634 
6635     std::unique_ptr<const char[]> key_guard;
6636     Slice key = AllocateKey(&key_guard);
6637     std::unique_ptr<char[]> ts_guard;
6638     if (user_timestamp_size_ > 0) {
6639       ts_guard.reset(new char[user_timestamp_size_]);
6640     }
6641     // the number of iterations is the larger of reads_ or writes_
6642     while (!duration.Done(1)) {
6643       DB* db = SelectDB(thread);
6644       GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
6645       Slice ts;
6646       if (user_timestamp_size_ > 0) {
6647         ts = mock_app_clock_->Allocate(ts_guard.get());
6648         options.timestamp = &ts;
6649       }
6650 
6651       auto status = db->Get(options, key, &existing_value);
6652       if (status.ok()) {
6653         ++found;
6654       } else if (!status.IsNotFound()) {
6655         fprintf(stderr, "Get returned an error: %s\n",
6656                 status.ToString().c_str());
6657         exit(1);
6658       }
6659 
6660       Slice value = gen.Generate(static_cast<unsigned int>(existing_value.size()));
6661       std::string new_value;
6662 
6663       if (status.ok()) {
6664         Slice existing_value_slice = Slice(existing_value);
6665         xor_operator.XOR(&existing_value_slice, value, &new_value);
6666       } else {
6667         xor_operator.XOR(nullptr, value, &new_value);
6668       }
6669 
6670       if (user_timestamp_size_ > 0) {
6671         ts = mock_app_clock_->Allocate(ts_guard.get());
6672         write_options_.timestamp = &ts;
6673       }
6674 
6675       Status s = db->Put(write_options_, key, Slice(new_value));
6676       if (!s.ok()) {
6677         fprintf(stderr, "put error: %s\n", s.ToString().c_str());
6678         ErrorExit();
6679       }
6680       thread->stats.FinishedOps(nullptr, db, 1);
6681     }
6682     char msg[100];
6683     snprintf(msg, sizeof(msg),
6684              "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found);
6685     thread->stats.AddMessage(msg);
6686   }
6687 
6688   // Read-modify-write for random keys.
6689   // Each operation causes the value to grow by value_size (simulating an
6690   // append). Generally used for benchmarking against merges of a similar type.
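  //
  // A hypothetical comparison run (benchmark and operator names assumed from
  // the registrations elsewhere in this tool):
  //   db_bench --benchmarks=appendrandom --num=100000
  //   db_bench --benchmarks=mergerandom --merge_operator=stringappend --num=100000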
6691   void AppendRandom(ThreadState* thread) {
6692     ReadOptions options(FLAGS_verify_checksum, true);
6693     RandomGenerator gen;
6694     std::string value;
6695     int64_t found = 0;
6696     int64_t bytes = 0;
6697 
6698     std::unique_ptr<const char[]> key_guard;
6699     Slice key = AllocateKey(&key_guard);
6700     std::unique_ptr<char[]> ts_guard;
6701     if (user_timestamp_size_ > 0) {
6702       ts_guard.reset(new char[user_timestamp_size_]);
6703     }
6704     // The number of iterations is the larger of read_ or write_
6705     Duration duration(FLAGS_duration, readwrites_);
6706     while (!duration.Done(1)) {
6707       DB* db = SelectDB(thread);
6708       GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
6709       Slice ts;
6710       if (user_timestamp_size_ > 0) {
6711         ts = mock_app_clock_->Allocate(ts_guard.get());
6712         options.timestamp = &ts;
6713       }
6714 
6715       auto status = db->Get(options, key, &value);
6716       if (status.ok()) {
6717         ++found;
6718         bytes += key.size() + value.size() + user_timestamp_size_;
6719       } else if (!status.IsNotFound()) {
6720         fprintf(stderr, "Get returned an error: %s\n",
6721                 status.ToString().c_str());
6722         abort();
6723       } else {
6724         // If not existing, then just assume an empty string of data
6725         value.clear();
6726       }
6727 
6728       // Update the value (by appending data)
6729       Slice operand = gen.Generate();
6730       if (value.size() > 0) {
6731         // Use a delimiter to match the semantics for StringAppendOperator
6732         value.append(1, ',');
6733       }
6734       value.append(operand.data(), operand.size());
6735 
6736       if (user_timestamp_size_ > 0) {
6737         ts = mock_app_clock_->Allocate(ts_guard.get());
6738         write_options_.timestamp = &ts;
6739       }
6740 
6741       // Write back to the database
6742       Status s = db->Put(write_options_, key, value);
6743       if (!s.ok()) {
6744         fprintf(stderr, "put error: %s\n", s.ToString().c_str());
6745         ErrorExit();
6746       }
6747       bytes += key.size() + value.size() + user_timestamp_size_;
6748       thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
6749     }
6750 
6751     char msg[100];
6752     snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
6753              readwrites_, found);
6754     thread->stats.AddBytes(bytes);
6755     thread->stats.AddMessage(msg);
6756   }
6757 
6758   // Read-modify-write for random keys (using MergeOperator)
6759   // The merge operator to use should be defined by FLAGS_merge_operator
6760   // Adjust FLAGS_value_size so that the keys are reasonable for this operator
6761   // Assumes that the merge operator is non-null (i.e.: is well-defined)
6762   //
6763   // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8
6764   // to simulate random additions over 64-bit integers using merge.
6765   //
6766   // The number of merges on the same key can be controlled by adjusting
6767   // FLAGS_merge_keys.
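  //
  // A hypothetical invocation of the uint64add example above (benchmark name
  // assumed from the registration elsewhere in this tool):
  //   db_bench --benchmarks=mergerandom --merge_operator=uint64add \
  //       --value_size=8 --merge_keys=1000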
6768   void MergeRandom(ThreadState* thread) {
6769     RandomGenerator gen;
6770     int64_t bytes = 0;
6771     std::unique_ptr<const char[]> key_guard;
6772     Slice key = AllocateKey(&key_guard);
6773     // The number of iterations is the larger of read_ or write_
6774     Duration duration(FLAGS_duration, readwrites_);
6775     while (!duration.Done(1)) {
6776       DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
6777       int64_t key_rand = thread->rand.Next() % merge_keys_;
6778       GenerateKeyFromInt(key_rand, merge_keys_, &key);
6779 
6780       Status s;
6781       Slice val = gen.Generate();
6782       if (FLAGS_num_column_families > 1) {
6783         s = db_with_cfh->db->Merge(write_options_,
6784                                    db_with_cfh->GetCfh(key_rand), key,
6785                                    val);
6786       } else {
6787         s = db_with_cfh->db->Merge(write_options_,
6788                                    db_with_cfh->db->DefaultColumnFamily(), key,
6789                                    val);
6790       }
6791 
6792       if (!s.ok()) {
6793         fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
6794         exit(1);
6795       }
6796       bytes += key.size() + val.size();
6797       thread->stats.FinishedOps(nullptr, db_with_cfh->db, 1, kMerge);
6798     }
6799 
6800     // Print some statistics
6801     char msg[100];
6802     snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_);
6803     thread->stats.AddBytes(bytes);
6804     thread->stats.AddMessage(msg);
6805   }
6806 
6807   // Read and merge random keys. The numbers of reads and merges are controlled
6808   // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct
6809   // keys (and thus also the number of reads and merges on the same key) can be
6810   // adjusted with FLAGS_merge_keys.
6811   //
6812   // As with MergeRandom, the merge operator to use should be defined by
6813   // FLAGS_merge_operator.
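  //
  // A hypothetical invocation with 70% merges and 30% reads (benchmark name
  // assumed from the registration elsewhere in this tool):
  //   db_bench --benchmarks=readrandommergerandom --merge_operator=put \
  //       --mergereadpercent=70 --merge_keys=1000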
6814   void ReadRandomMergeRandom(ThreadState* thread) {
6815     ReadOptions options(FLAGS_verify_checksum, true);
6816     RandomGenerator gen;
6817     std::string value;
6818     int64_t num_hits = 0;
6819     int64_t num_gets = 0;
6820     int64_t num_merges = 0;
6821     size_t max_length = 0;
6822 
6823     std::unique_ptr<const char[]> key_guard;
6824     Slice key = AllocateKey(&key_guard);
6825     // the number of iterations is the larger of read_ or write_
6826     Duration duration(FLAGS_duration, readwrites_);
6827     while (!duration.Done(1)) {
6828       DB* db = SelectDB(thread);
6829       GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
6830 
6831       bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;
6832 
6833       if (do_merge) {
6834         Status s = db->Merge(write_options_, key, gen.Generate());
6835         if (!s.ok()) {
6836           fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
6837           exit(1);
6838         }
6839         num_merges++;
6840         thread->stats.FinishedOps(nullptr, db, 1, kMerge);
6841       } else {
6842         Status s = db->Get(options, key, &value);
6843         if (value.length() > max_length)
6844           max_length = value.length();
6845 
6846         if (!s.ok() && !s.IsNotFound()) {
6847           fprintf(stderr, "get error: %s\n", s.ToString().c_str());
6848           // we continue after error rather than exiting so that we can
6849           // find more errors if any
6850         } else if (!s.IsNotFound()) {
6851           num_hits++;
6852         }
6853         num_gets++;
6854         thread->stats.FinishedOps(nullptr, db, 1, kRead);
6855       }
6856     }
6857 
6858     char msg[100];
6859     snprintf(msg, sizeof(msg),
6860              "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64
6861              " hits:%" PRIu64 " maxlength:%" ROCKSDB_PRIszt ")",
6862              num_gets, num_merges, readwrites_, num_hits, max_length);
6863     thread->stats.AddMessage(msg);
6864   }
6865 
6866   void WriteSeqSeekSeq(ThreadState* thread) {
6867     writes_ = FLAGS_num;
6868     DoWrite(thread, SEQUENTIAL);
6869     // exclude writes from the ops/sec calculation
6870     thread->stats.Start(thread->tid);
6871 
6872     DB* db = SelectDB(thread);
6873     ReadOptions read_opts(FLAGS_verify_checksum, true);
6874     std::unique_ptr<char[]> ts_guard;
6875     Slice ts;
6876     if (user_timestamp_size_ > 0) {
6877       ts_guard.reset(new char[user_timestamp_size_]);
6878       ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
6879       read_opts.timestamp = &ts;
6880     }
6881     std::unique_ptr<Iterator> iter(db->NewIterator(read_opts));
6882 
6883     std::unique_ptr<const char[]> key_guard;
6884     Slice key = AllocateKey(&key_guard);
6885     for (int64_t i = 0; i < FLAGS_num; ++i) {
6886       GenerateKeyFromInt(i, FLAGS_num, &key);
6887       iter->Seek(key);
6888       assert(iter->Valid() && iter->key() == key);
6889       thread->stats.FinishedOps(nullptr, db, 1, kSeek);
6890 
6891       for (int j = 0; j < FLAGS_seek_nexts && i + 1 < FLAGS_num; ++j) {
6892         if (!FLAGS_reverse_iterator) {
6893           iter->Next();
6894         } else {
6895           iter->Prev();
6896         }
6897         GenerateKeyFromInt(++i, FLAGS_num, &key);
6898         assert(iter->Valid() && iter->key() == key);
6899         thread->stats.FinishedOps(nullptr, db, 1, kSeek);
6900       }
6901 
6902       iter->Seek(key);
6903       assert(iter->Valid() && iter->key() == key);
6904       thread->stats.FinishedOps(nullptr, db, 1, kSeek);
6905     }
6906   }
6907 
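  // Recursive binary search over the inclusive index range [start, end];
  // returns true iff key is present in the sorted vector data.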
6908   bool binary_search(std::vector<int>& data, int start, int end, int key) {
6909     if (data.empty()) return false;
6910     if (start > end) return false;
6911     int mid = start + (end - start) / 2;
6912     if (mid > static_cast<int>(data.size()) - 1) return false;
6913     if (data[mid] == key) {
6914       return true;
6915     } else if (data[mid] > key) {
6916       return binary_search(data, start, mid - 1, key);
6917     } else {
6918       return binary_search(data, mid + 1, end, key);
6919     }
6920   }
6921 
6922   // Does a bunch of merge operations for a key (key1) where the merge operand
6923   // is a sorted list. A performance comparison is then made between doing a Get
6924   // for key1 followed by searching for another key (key2) in the large sorted
6925   // list vs. calling GetMergeOperands for key1 and then searching for key2
6926   // in each of the sorted sub-lists. The latter case is expected to be a lot faster.
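  //
  // Illustrative shape of the data written below: each merge operand is a
  // comma-separated sorted run such as "1,2,...,99", so a Get returns one
  // large merged list while GetMergeOperands returns the individual
  // sub-lists.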
6927   void GetMergeOperands(ThreadState* thread) {
6928     DB* db = SelectDB(thread);
6929     const int kTotalValues = 100000;
6930     const int kListSize = 100;
6931     std::string key = "my_key";
6932     std::string value;
6933 
6934     for (int i = 1; i < kTotalValues; i++) {
6935       if (i % kListSize == 0) {
6936         // Remove trailing ','
6937         value.pop_back();
6938         db->Merge(WriteOptions(), key, value);
6939         value.clear();
6940       } else {
6941         value.append(std::to_string(i)).append(",");
6942       }
6943     }
6944 
6945     SortList s;
6946     std::vector<int> data;
6947     // This value can be experimented with; it demonstrates the perf
6948     // difference between doing a Get and searching for lookup_key in the
6949     // resultant large sorted list vs. doing GetMergeOperands and searching
6950     // for lookup_key within the resultant sorted sub-lists.
6951     int lookup_key = 1;
6952 
6953     // Get API call
6954     std::cout << "--- Get API call --- \n";
6955     PinnableSlice p_slice;
6956     uint64_t st = FLAGS_env->NowNanos();
6957     db->Get(ReadOptions(), db->DefaultColumnFamily(), key, &p_slice);
6958     s.MakeVector(data, p_slice);
6959     bool found =
6960         binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
6961     std::cout << "Found key? " << std::to_string(found) << "\n";
6962     uint64_t sp = FLAGS_env->NowNanos();
6963     std::cout << "Get: " << (sp - st) / 1000000000.0 << " seconds\n";
6964     std::string* dat_ = p_slice.GetSelf();
6965     std::cout << "Sample data from Get API call: " << dat_->substr(0, 10)
6966               << "\n";
6967     data.clear();
6968 
6969     // GetMergeOperands API call
6970     std::cout << "--- GetMergeOperands API --- \n";
6971     std::vector<PinnableSlice> a_slice((kTotalValues / kListSize) + 1);
6972     st = FLAGS_env->NowNanos();
6973     int number_of_operands = 0;
6974     GetMergeOperandsOptions get_merge_operands_options;
6975     get_merge_operands_options.expected_max_number_of_operands =
6976         (kTotalValues / 100) + 1;
6977     db->GetMergeOperands(ReadOptions(), db->DefaultColumnFamily(), key,
6978                          a_slice.data(), &get_merge_operands_options,
6979                          &number_of_operands);
6980     for (PinnableSlice& psl : a_slice) {
6981       s.MakeVector(data, psl);
6982       found =
6983           binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
6984       data.clear();
6985       if (found) break;
6986     }
6987     std::cout << "Found key? " << std::to_string(found) << "\n";
6988     sp = FLAGS_env->NowNanos();
6989     std::cout << "Get Merge operands: " << (sp - st) / 1000000000.0
6990               << " seconds \n";
6991     int to_print = 0;
6992     std::cout << "Sample data from GetMergeOperands API call: ";
6993     for (PinnableSlice& psl : a_slice) {
6994       std::cout << "List: " << to_print << " : " << *psl.GetSelf() << "\n";
6995       if (to_print++ > 2) break;
6996     }
6997   }
6998 
6999 #ifndef ROCKSDB_LITE
7000   // This benchmark stress tests Transactions.  For a given --duration (or
7001   // total number of --writes), a Transaction will perform a read-modify-write
7002   // to increment the value of a key in each of N (--transaction_sets) sets of
7003   // keys (where each set has --num keys).  If --threads is set, this will be
7004   // done in parallel.
7005   //
7006   // To test transactions, use --transaction_db=true. Not setting this
7007   // parameter will run the same benchmark without transactions.
7009   //
7010   // RandomTransactionVerify() will then validate the correctness of the results
7011   // by checking if the sum of all keys in each set is the same.
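  //
  // A hypothetical invocation followed by verification (benchmark names
  // assumed from the registrations elsewhere in this tool):
  //   db_bench --benchmarks=randomtransaction,randomtransactionverify \
  //       --transaction_db=true --transaction_sets=2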
7012   void RandomTransaction(ThreadState* thread) {
7013     ReadOptions options(FLAGS_verify_checksum, true);
7014     Duration duration(FLAGS_duration, readwrites_);
7015     ReadOptions read_options(FLAGS_verify_checksum, true);
7016     uint16_t num_prefix_ranges = static_cast<uint16_t>(FLAGS_transaction_sets);
7017     uint64_t transactions_done = 0;
7018 
7019     if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) {
7020       fprintf(stderr, "invalid value for transaction_sets\n");
7021       abort();
7022     }
7023 
7024     TransactionOptions txn_options;
7025     txn_options.lock_timeout = FLAGS_transaction_lock_timeout;
7026     txn_options.set_snapshot = FLAGS_transaction_set_snapshot;
7027 
7028     RandomTransactionInserter inserter(&thread->rand, write_options_,
7029                                        read_options, FLAGS_num,
7030                                        num_prefix_ranges);
7031 
7032     if (FLAGS_num_multi_db > 1) {
7033       fprintf(stderr,
7034               "Cannot run RandomTransaction benchmark with "
7035               "FLAGS_multi_db > 1.");
7036       abort();
7037     }
7038 
7039     while (!duration.Done(1)) {
7040       bool success;
7041 
7042       // RandomTransactionInserter will attempt to insert a key in each of
7043       // the FLAGS_transaction_sets key sets
7044       if (FLAGS_optimistic_transaction_db) {
7045         success = inserter.OptimisticTransactionDBInsert(db_.opt_txn_db);
7046       } else if (FLAGS_transaction_db) {
7047         TransactionDB* txn_db = reinterpret_cast<TransactionDB*>(db_.db);
7048         success = inserter.TransactionDBInsert(txn_db, txn_options);
7049       } else {
7050         success = inserter.DBInsert(db_.db);
7051       }
7052 
7053       if (!success) {
7054         fprintf(stderr, "Unexpected error: %s\n",
7055                 inserter.GetLastStatus().ToString().c_str());
7056         abort();
7057       }
7058 
7059       thread->stats.FinishedOps(nullptr, db_.db, 1, kOthers);
7060       transactions_done++;
7061     }
7062 
7063     char msg[100];
7064     if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) {
7065       snprintf(msg, sizeof(msg),
7066                "( transactions:%" PRIu64 " aborts:%" PRIu64 ")",
7067                transactions_done, inserter.GetFailureCount());
7068     } else {
7069       snprintf(msg, sizeof(msg), "( batches:%" PRIu64 " )", transactions_done);
7070     }
7071     thread->stats.AddMessage(msg);
7072 
7073     if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
7074       thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
7075                                get_perf_context()->ToString());
7076     }
7077     thread->stats.AddBytes(static_cast<int64_t>(inserter.GetBytesInserted()));
7078   }
7079 
7080   // Verifies consistency of data after RandomTransaction() has been run.
7081   // Since each iteration of RandomTransaction() incremented a key in each set
7082   // by the same value, the sum of the keys in each set should be the same.
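  // (Illustrative: if every committed transaction adds v to one key in each
  // set, every set's sum advances by v per transaction, so the sums must
  // match across sets.)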
7083   void RandomTransactionVerify() {
7084     if (!FLAGS_transaction_db && !FLAGS_optimistic_transaction_db) {
7085       // transactions not used, nothing to verify.
7086       return;
7087     }
7088 
7089     Status s = RandomTransactionInserter::Verify(
7090         db_.db, static_cast<uint16_t>(FLAGS_transaction_sets));
7092 
7093     if (s.ok()) {
7094       fprintf(stdout, "RandomTransactionVerify Success.\n");
7095     } else {
7096       fprintf(stdout, "RandomTransactionVerify FAILED!!\n");
7097     }
7098   }
7099 #endif  // ROCKSDB_LITE
7100 
7101   // Writes and deletes random keys without overwriting keys.
7102   //
7103   // This benchmark is intended to partially replicate the behavior of MyRocks
7104   // secondary indices: All data is stored in keys and updates happen by
7105   // deleting the old version of the key and inserting the new version.
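  //
  // A hypothetical invocation (benchmark name assumed from the registration
  // elsewhere in this tool):
  //   db_bench --benchmarks=randomreplacekeys --numdistinct=1000 \
  //       --use_single_deletes=true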
7106   void RandomReplaceKeys(ThreadState* thread) {
7107     std::unique_ptr<const char[]> key_guard;
7108     Slice key = AllocateKey(&key_guard);
7109     std::unique_ptr<char[]> ts_guard;
7110     if (user_timestamp_size_ > 0) {
7111       ts_guard.reset(new char[user_timestamp_size_]);
7112     }
7113     std::vector<uint32_t> counters(FLAGS_numdistinct, 0);
7114     size_t max_counter = 50;
7115     RandomGenerator gen;
7116 
7117     Status s;
7118     DB* db = SelectDB(thread);
7119     for (int64_t i = 0; i < FLAGS_numdistinct; i++) {
7120       GenerateKeyFromInt(i * max_counter, FLAGS_num, &key);
7121       Slice ts;
7122       if (user_timestamp_size_ > 0) {
7123         ts = mock_app_clock_->Allocate(ts_guard.get());
7124         write_options_.timestamp = &ts;
7125       }
7126       s = db->Put(write_options_, key, gen.Generate());
7127       if (!s.ok()) {
7128         fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
7129         exit(1);
7130       }
7131     }
7132 
7133     db->GetSnapshot();
7134 
7135     std::default_random_engine generator;
7136     std::normal_distribution<double> distribution(FLAGS_numdistinct / 2.0,
7137                                                   FLAGS_stddev);
7138     Duration duration(FLAGS_duration, FLAGS_num);
7139     while (!duration.Done(1)) {
7140       int64_t rnd_id = static_cast<int64_t>(distribution(generator));
7141       int64_t key_id = std::max(std::min(FLAGS_numdistinct - 1, rnd_id),
7142                                 static_cast<int64_t>(0));
7143       GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
7144                          &key);
7145       Slice ts;
7146       if (user_timestamp_size_ > 0) {
7147         ts = mock_app_clock_->Allocate(ts_guard.get());
7148         write_options_.timestamp = &ts;
7149       }
7150       s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key)
7151                                    : db->Delete(write_options_, key);
7152       if (s.ok()) {
7153         counters[key_id] = (counters[key_id] + 1) % max_counter;
7154         GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
7155                            &key);
7156         if (user_timestamp_size_ > 0) {
7157           ts = mock_app_clock_->Allocate(ts_guard.get());
7158           write_options_.timestamp = &ts;
7159         }
7160         s = db->Put(write_options_, key, Slice());
7161       }
7162 
7163       if (!s.ok()) {
7164         fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
7165         exit(1);
7166       }
7167 
7168       thread->stats.FinishedOps(nullptr, db, 1, kOthers);
7169     }
7170 
7171     char msg[200];
7172     snprintf(msg, sizeof(msg),
7173              "use single deletes: %d, "
7174              "standard deviation: %lf\n",
7175              FLAGS_use_single_deletes, FLAGS_stddev);
7176     thread->stats.AddMessage(msg);
7177   }
7178 
7179   void TimeSeriesReadOrDelete(ThreadState* thread, bool do_deletion) {
7180     ReadOptions options(FLAGS_verify_checksum, true);
7181     int64_t read = 0;
7182     int64_t found = 0;
7183     int64_t bytes = 0;
7184 
7185     Iterator* iter = nullptr;
7186     // Only works on a single database
7187     assert(db_.db != nullptr);
7188     iter = db_.db->NewIterator(options);
7189 
7190     std::unique_ptr<const char[]> key_guard;
7191     Slice key = AllocateKey(&key_guard);
7192 
7193     char value_buffer[256];
7194     while (true) {
7195       {
7196         MutexLock l(&thread->shared->mu);
7197         if (thread->shared->num_done >= 1) {
7198           // The write thread has finished
7199           break;
7200         }
7201       }
7202       if (!FLAGS_use_tailing_iterator) {
7203         delete iter;
7204         iter = db_.db->NewIterator(options);
7205       }
7206       // Pick an iterator to use
7207 
7208       int64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
7209       GenerateKeyFromInt(key_id, FLAGS_num, &key);
7210       // Reset last 8 bytes to 0
7211       char* start = const_cast<char*>(key.data());
7212       start += key.size() - 8;
7213       memset(start, 0, 8);
7214       ++read;
7215 
7216       bool key_found = false;
7217       // Seek the prefix
7218       for (iter->Seek(key); iter->Valid() && iter->key().starts_with(key);
7219            iter->Next()) {
7220         key_found = true;
7221         // Copy out the iterator's value to make sure we read it.
7222         if (do_deletion) {
7223           bytes += iter->key().size();
7224           if (KeyExpired(timestamp_emulator_.get(), iter->key())) {
7225             thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
7226             db_.db->Delete(write_options_, iter->key());
7227           } else {
7228             break;
7229           }
7230         } else {
7231           bytes += iter->key().size() + iter->value().size();
7232           thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
7233           Slice value = iter->value();
7234           memcpy(value_buffer, value.data(),
7235                  std::min(value.size(), sizeof(value_buffer)));
7236 
7237           assert(iter->status().ok());
7238         }
7239       }
7240       found += key_found;
7241 
7242       if (thread->shared->read_rate_limiter.get() != nullptr) {
7243         thread->shared->read_rate_limiter->Request(
7244             1, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
7245       }
7246     }
7247     delete iter;
7248 
7249     char msg[100];
7250     snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
7251              read);
7252     thread->stats.AddBytes(bytes);
7253     thread->stats.AddMessage(msg);
7254     if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
7255       thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
7256                                get_perf_context()->ToString());
7257     }
7258   }
7259 
7260   void TimeSeriesWrite(ThreadState* thread) {
7261     // Special thread that keeps writing until other threads are done.
7262     RandomGenerator gen;
7263     int64_t bytes = 0;
7264 
7265     // Don't merge stats from this thread with the readers.
7266     thread->stats.SetExcludeFromMerge();
7267 
7268     std::unique_ptr<RateLimiter> write_rate_limiter;
7269     if (FLAGS_benchmark_write_rate_limit > 0) {
7270       write_rate_limiter.reset(
7271           NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
7272     }
7273 
7274     std::unique_ptr<const char[]> key_guard;
7275     Slice key = AllocateKey(&key_guard);
7276 
7277     Duration duration(FLAGS_duration, writes_);
7278     while (!duration.Done(1)) {
7279       DB* db = SelectDB(thread);
7280 
7281       uint64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
7282       // Write key id
7283       GenerateKeyFromInt(key_id, FLAGS_num, &key);
7284       // Write timestamp
7285 
7286       char* start = const_cast<char*>(key.data());
7287       char* pos = start + 8;
7288       int bytes_to_fill =
7289           std::min(key_size_ - static_cast<int>(pos - start), 8);
7290       uint64_t timestamp_value = timestamp_emulator_->Get();
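      // Encode the timestamp most-significant-byte first so that keys with
      // the same id sort by increasing timestamp.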
7291       if (port::kLittleEndian) {
7292         for (int i = 0; i < bytes_to_fill; ++i) {
7293           pos[i] = (timestamp_value >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
7294         }
7295       } else {
7296         memcpy(pos, static_cast<void*>(&timestamp_value), bytes_to_fill);
7297       }
7298 
7299       timestamp_emulator_->Inc();
7300 
7301       Status s;
7302       Slice val = gen.Generate();
7303       s = db->Put(write_options_, key, val);
7304 
7305       if (!s.ok()) {
7306         fprintf(stderr, "put error: %s\n", s.ToString().c_str());
7307         ErrorExit();
7308       }
7309       bytes = key.size() + val.size();
7310       thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
7311       thread->stats.AddBytes(bytes);
7312 
7313       if (FLAGS_benchmark_write_rate_limit > 0) {
7314         write_rate_limiter->Request(
7315             key.size() + val.size(), Env::IO_HIGH,
7316             nullptr /* stats */, RateLimiter::OpType::kWrite);
7317       }
7318     }
7319   }
7320 
7321   void TimeSeries(ThreadState* thread) {
7322     if (thread->tid > 0) {
7323       bool do_deletion = FLAGS_expire_style == "delete" &&
7324                          thread->tid <= FLAGS_num_deletion_threads;
7325       TimeSeriesReadOrDelete(thread, do_deletion);
7326     } else {
7327       TimeSeriesWrite(thread);
7328       thread->stats.Stop();
7329       thread->stats.Report("timeseries write");
7330     }
7331   }
7332 
7333   void Compact(ThreadState* thread) {
7334     DB* db = SelectDB(thread);
7335     CompactRangeOptions cro;
7336     cro.bottommost_level_compaction =
7337         BottommostLevelCompaction::kForceOptimized;
7338     db->CompactRange(cro, nullptr, nullptr);
7339   }
7340 
7341   void CompactAll() {
7342     if (db_.db != nullptr) {
7343       db_.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
7344     }
7345     for (const auto& db_with_cfh : multi_dbs_) {
7346       db_with_cfh.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
7347     }
7348   }
7349 
7350 #ifndef ROCKSDB_LITE
7351   void WaitForCompactionHelper(DBWithColumnFamilies& db) {
7352     // This is an imperfect way of waiting for compaction. The loop and sleep
7353     // are done because a thread that finishes a compaction job should get a
7354     // chance to pick up a new compaction job.
7355 
7356     std::vector<std::string> keys = {DB::Properties::kMemTableFlushPending,
7357                                      DB::Properties::kNumRunningFlushes,
7358                                      DB::Properties::kCompactionPending,
7359                                      DB::Properties::kNumRunningCompactions};
7360 
7361     fprintf(stdout, "waitforcompaction(%s): started\n",
7362             db.db->GetName().c_str());
7363 
7364     while (true) {
7365       bool retry = false;
7366 
7367       for (const auto& k : keys) {
7368         uint64_t v;
7369         if (!db.db->GetIntProperty(k, &v)) {
7370           fprintf(stderr, "waitforcompaction(%s): GetIntProperty(%s) failed\n",
7371                   db.db->GetName().c_str(), k.c_str());
7372           exit(1);
7373         } else if (v > 0) {
7374           fprintf(stdout,
7375                   "waitforcompaction(%s): active(%s). Sleep 10 seconds\n",
7376                   db.db->GetName().c_str(), k.c_str());
7377           sleep(10);
7378           retry = true;
7379           break;
7380         }
7381       }
7382 
7383       if (!retry) {
7384         fprintf(stdout, "waitforcompaction(%s): finished\n",
7385                 db.db->GetName().c_str());
7386         return;
7387       }
7388     }
7389   }
7390 
7391   void WaitForCompaction() {
7392     // Give background threads a chance to wake
7393     sleep(5);
7394 
7395     // I am skeptical that this check is race free. I hope that checking twice
7396     // reduces the chance.
7397     if (db_.db != nullptr) {
7398       WaitForCompactionHelper(db_);
7399       WaitForCompactionHelper(db_);
7400     } else {
7401       for (auto& db_with_cfh : multi_dbs_) {
7402         WaitForCompactionHelper(db_with_cfh);
7403         WaitForCompactionHelper(db_with_cfh);
7404       }
7405     }
7406   }
7407 
7408   bool CompactLevelHelper(DBWithColumnFamilies& db_with_cfh, int from_level) {
7409     std::vector<LiveFileMetaData> files;
7410     db_with_cfh.db->GetLiveFilesMetaData(&files);
7411 
7412     assert(from_level == 0 || from_level == 1);
7413 
7414     int real_from_level = from_level;
7415     if (real_from_level > 0) {
7416       // With dynamic leveled compaction the first level with data beyond L0
7417       // might not be L1.
7418       real_from_level = std::numeric_limits<int>::max();
7419 
7420       for (auto& f : files) {
7421         if (f.level > 0 && f.level < real_from_level) real_from_level = f.level;
7422       }
7423 
7424       if (real_from_level == std::numeric_limits<int>::max()) {
7425         fprintf(stdout, "compact%d found 0 files to compact\n", from_level);
7426         return true;
7427       }
7428     }
7429 
7430     // The goal is to compact from from_level to the level that follows it,
7431     // and with dynamic leveled compaction the next level might not be
7432     // real_from_level+1
7433     int next_level = std::numeric_limits<int>::max();
7434 
7435     std::vector<std::string> files_to_compact;
7436     for (auto& f : files) {
7437       if (f.level == real_from_level)
7438         files_to_compact.push_back(f.name);
7439       else if (f.level > real_from_level && f.level < next_level)
7440         next_level = f.level;
7441     }
7442 
7443     if (files_to_compact.empty()) {
7444       fprintf(stdout, "compact%d found 0 files to compact\n", from_level);
7445       return true;
7446     } else if (next_level == std::numeric_limits<int>::max()) {
7447       // There is no data beyond real_from_level. So we are done.
7448       fprintf(stdout, "compact%d found no data beyond L%d\n", from_level,
7449               real_from_level);
7450       return true;
7451     }
7452 
7453     fprintf(stdout, "compact%d found %d files to compact from L%d to L%d\n",
7454             from_level, static_cast<int>(files_to_compact.size()),
7455             real_from_level, next_level);
7456 
7457     ROCKSDB_NAMESPACE::CompactionOptions options;
7458     // Lets RocksDB use the configured compression for this level
7459     options.compression = ROCKSDB_NAMESPACE::kDisableCompressionOption;
7460 
7461     ROCKSDB_NAMESPACE::ColumnFamilyDescriptor cfDesc;
7462     db_with_cfh.db->DefaultColumnFamily()->GetDescriptor(&cfDesc);
7463     options.output_file_size_limit = cfDesc.options.target_file_size_base;
7464 
7465     Status status =
7466         db_with_cfh.db->CompactFiles(options, files_to_compact, next_level);
7467     if (!status.ok()) {
7468       // This can fail for valid reasons including the operation was aborted
7469       // or a filename is invalid because background compaction removed it.
7470       // Having read the current cases for which an error is raised, I prefer
7471       // not to figure out whether an exception should be thrown here.
7472       fprintf(stderr, "compact%d CompactFiles failed: %s\n", from_level,
7473               status.ToString().c_str());
7474       return false;
7475     }
7476     return true;
7477   }
7478 
7479   void CompactLevel(int from_level) {
7480     if (db_.db != nullptr) {
7481       while (!CompactLevelHelper(db_, from_level)) WaitForCompaction();
7482     }
7483     for (auto& db_with_cfh : multi_dbs_) {
7484       while (!CompactLevelHelper(db_with_cfh, from_level)) WaitForCompaction();
7485     }
7486   }
7487 #endif
7488 
7489   void Flush() {
7490     FlushOptions flush_opt;
7491     flush_opt.wait = true;
7492 
7493     if (db_.db != nullptr) {
7494       Status s = db_.db->Flush(flush_opt, db_.cfh);
7495       if (!s.ok()) {
7496         fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
7497         exit(1);
7498       }
7499     } else {
7500       for (const auto& db_with_cfh : multi_dbs_) {
7501         Status s = db_with_cfh.db->Flush(flush_opt, db_with_cfh.cfh);
7502         if (!s.ok()) {
7503           fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
7504           exit(1);
7505         }
7506       }
7507     }
7508     fprintf(stdout, "flush memtable\n");
7509   }
7510 
7511   void ResetStats() {
7512     if (db_.db != nullptr) {
7513       db_.db->ResetStats();
7514     }
7515     for (const auto& db_with_cfh : multi_dbs_) {
7516       db_with_cfh.db->ResetStats();
7517     }
7518   }
7519 
7520   void PrintStatsHistory() {
7521     if (db_.db != nullptr) {
7522       PrintStatsHistoryImpl(db_.db, false);
7523     }
7524     for (const auto& db_with_cfh : multi_dbs_) {
7525       PrintStatsHistoryImpl(db_with_cfh.db, true);
7526     }
7527   }
7528 
7529   void PrintStatsHistoryImpl(DB* db, bool print_header) {
7530     if (print_header) {
7531       fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
7532     }
7533 
7534     std::unique_ptr<StatsHistoryIterator> shi;
7535     Status s = db->GetStatsHistory(0, port::kMaxUint64, &shi);
7536     if (!s.ok()) {
7537       fprintf(stdout, "%s\n", s.ToString().c_str());
7538       return;
7539     }
7540     assert(shi);
7541     while (shi->Valid()) {
7542       uint64_t stats_time = shi->GetStatsTime();
7543       fprintf(stdout, "------ %s ------\n",
7544               TimeToHumanString(static_cast<int>(stats_time)).c_str());
7545       for (auto& entry : shi->GetStatsMap()) {
7546         fprintf(stdout, " %" PRIu64 "   %s  %" PRIu64 "\n", stats_time,
7547                 entry.first.c_str(), entry.second);
7548       }
7549       shi->Next();
7550     }
7551   }
7552 
7553   void PrintStats(const char* key) {
7554     if (db_.db != nullptr) {
7555       PrintStats(db_.db, key, false);
7556     }
7557     for (const auto& db_with_cfh : multi_dbs_) {
7558       PrintStats(db_with_cfh.db, key, true);
7559     }
7560   }
7561 
7562   void PrintStats(DB* db, const char* key, bool print_header = false) {
7563     if (print_header) {
7564       fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
7565     }
7566     std::string stats;
7567     if (!db->GetProperty(key, &stats)) {
7568       stats = "(failed)";
7569     }
7570     fprintf(stdout, "\n%s\n", stats.c_str());
7571   }
7572 
7573   void PrintStats(const std::vector<std::string>& keys) {
7574     if (db_.db != nullptr) {
7575       PrintStats(db_.db, keys);
7576     }
7577     for (const auto& db_with_cfh : multi_dbs_) {
7578       PrintStats(db_with_cfh.db, keys, true);
7579     }
7580   }
7581 
7582   void PrintStats(DB* db, const std::vector<std::string>& keys,
7583                   bool print_header = false) {
7584     if (print_header) {
7585       fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
7586     }
7587 
7588     for (const auto& key : keys) {
7589       std::string stats;
7590       if (!db->GetProperty(key, &stats)) {
7591         stats = "(failed)";
7592       }
7593       fprintf(stdout, "%s: %s\n", key.c_str(), stats.c_str());
7594     }
7595   }
7596 
7597   void Replay(ThreadState* thread) {
7598     if (db_.db != nullptr) {
7599       Replay(thread, &db_);
7600     }
7601   }
7602 
7603   void Replay(ThreadState* /*thread*/, DBWithColumnFamilies* db_with_cfh) {
7604     Status s;
7605     std::unique_ptr<TraceReader> trace_reader;
7606     s = NewFileTraceReader(FLAGS_env, EnvOptions(), FLAGS_trace_file,
7607                            &trace_reader);
7608     if (!s.ok()) {
7609       fprintf(
7610           stderr,
7611           "Encountered an error creating a TraceReader from the trace file. "
7612           "Error: %s\n",
7613           s.ToString().c_str());
7614       exit(1);
7615     }
7616     Replayer replayer(db_with_cfh->db, db_with_cfh->cfh,
7617                       std::move(trace_reader));
7618     replayer.SetFastForward(
7619         static_cast<uint32_t>(FLAGS_trace_replay_fast_forward));
7620     s = replayer.MultiThreadReplay(
7621         static_cast<uint32_t>(FLAGS_trace_replay_threads));
7622     if (s.ok()) {
7623       fprintf(stdout, "Replay started from trace_file: %s\n",
7624               FLAGS_trace_file.c_str());
7625     } else {
7626       fprintf(stderr, "Starting replay failed. Error: %s\n",
7627               s.ToString().c_str());
7628     }
7629   }
7630 };
7631 
7632 int db_bench_tool(int argc, char** argv) {
7633   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
7634   static bool initialized = false;
7635   if (!initialized) {
7636     SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
7637                     " [OPTIONS]...");
7638     initialized = true;
7639   }
7640   ParseCommandLineFlags(&argc, &argv, true);
7641   FLAGS_compaction_style_e =
7642       (ROCKSDB_NAMESPACE::CompactionStyle)FLAGS_compaction_style;
7643 #ifndef ROCKSDB_LITE
7644   if (FLAGS_statistics && !FLAGS_statistics_string.empty()) {
7645     fprintf(stderr,
7646             "Cannot provide both --statistics and --statistics_string.\n");
7647     exit(1);
7648   }
7649   if (!FLAGS_statistics_string.empty()) {
7650     Status s = ObjectRegistry::NewInstance()->NewSharedObject<Statistics>(
7651         FLAGS_statistics_string, &dbstats);
7652     if (dbstats == nullptr) {
7653       fprintf(stderr,
7654               "No Statistics registered matching string: %s status=%s\n",
7655               FLAGS_statistics_string.c_str(), s.ToString().c_str());
7656       exit(1);
7657     }
7658   }
7659 #endif  // ROCKSDB_LITE
7660   if (FLAGS_statistics) {
7661     dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics();
7662   }
7663   if (dbstats) {
7664     dbstats->set_stats_level(static_cast<StatsLevel>(FLAGS_stats_level));
7665   }
7666   FLAGS_compaction_pri_e =
7667       (ROCKSDB_NAMESPACE::CompactionPri)FLAGS_compaction_pri;
7668 
7669   std::vector<std::string> fanout = ROCKSDB_NAMESPACE::StringSplit(
7670       FLAGS_max_bytes_for_level_multiplier_additional, ',');
7671   for (size_t j = 0; j < fanout.size(); j++) {
7672     FLAGS_max_bytes_for_level_multiplier_additional_v.push_back(
7673 #ifndef CYGWIN
7674         std::stoi(fanout[j]));
7675 #else
7676         stoi(fanout[j]));
7677 #endif
7678   }
7679 
7680   FLAGS_compression_type_e =
7681     StringToCompressionType(FLAGS_compression_type.c_str());
7682 
7683 #ifndef ROCKSDB_LITE
7684   // Stacked BlobDB
7685   FLAGS_blob_db_compression_type_e =
7686     StringToCompressionType(FLAGS_blob_db_compression_type.c_str());
7687 
7688   int env_opts =
7689       !FLAGS_hdfs.empty() + !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty();
7690   if (env_opts > 1) {
7691     fprintf(stderr,
7692             "Error: --hdfs, --env_uri and --fs_uri are mutually exclusive\n");
7693     exit(1);
7694   }
7695 
7696   if (!FLAGS_env_uri.empty()) {
7697     Status s = Env::LoadEnv(FLAGS_env_uri, &FLAGS_env, &env_guard);
7698     if (FLAGS_env == nullptr) {
7699       fprintf(stderr, "No Env registered for URI: %s\n", FLAGS_env_uri.c_str());
7700       exit(1);
7701     }
7702   } else if (!FLAGS_fs_uri.empty()) {
7703     std::shared_ptr<FileSystem> fs;
7704     Status s = FileSystem::Load(FLAGS_fs_uri, &fs);
7705     if (fs == nullptr) {
7706       fprintf(stderr, "Error: %s\n", s.ToString().c_str());
7707       exit(1);
7708     }
7709     FLAGS_env = GetCompositeEnv(fs);
7710   } else if (FLAGS_simulate_hybrid_fs_file != "") {
7711     FLAGS_env = GetCompositeEnv(std::make_shared<SimulatedHybridFileSystem>(
7712         FileSystem::Default(), FLAGS_simulate_hybrid_fs_file));
7713   }
7714 #endif  // ROCKSDB_LITE
7715   if (FLAGS_use_existing_keys && !FLAGS_use_existing_db) {
7716     fprintf(stderr,
7717             "`-use_existing_db` must be true for `-use_existing_keys` to be "
7718             "settable\n");
7719     exit(1);
7720   }
7721 
7722   if (!FLAGS_hdfs.empty()) {
7723     FLAGS_env = new ROCKSDB_NAMESPACE::HdfsEnv(FLAGS_hdfs);
7724   }
7725 
7726   if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE"))
7727     FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NONE;
7728   else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL"))
7729     FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NORMAL;
7730   else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL"))
7731     FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
7732   else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED"))
7733     FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::WILLNEED;
7734   else {
7735     fprintf(stdout, "Unknown compaction fadvice: %s\n",
7736             FLAGS_compaction_fadvice.c_str());
7737   }
7738 
7739   FLAGS_value_size_distribution_type_e =
7740     StringToDistributionType(FLAGS_value_size_distribution_type.c_str());
7741 
7742   FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
7743 
7744   // Note options sanitization may increase thread pool sizes according to
7745   // max_background_flushes/max_background_compactions/max_background_jobs
7746   FLAGS_env->SetBackgroundThreads(FLAGS_num_high_pri_threads,
7747                                   ROCKSDB_NAMESPACE::Env::Priority::HIGH);
7748   FLAGS_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads,
7749                                   ROCKSDB_NAMESPACE::Env::Priority::BOTTOM);
7750   FLAGS_env->SetBackgroundThreads(FLAGS_num_low_pri_threads,
7751                                   ROCKSDB_NAMESPACE::Env::Priority::LOW);
7752 
7753   // Choose a location for the test database if none given with --db=<path>
7754   if (FLAGS_db.empty()) {
7755     std::string default_db_path;
7756     FLAGS_env->GetTestDirectory(&default_db_path);
7757     default_db_path += "/dbbench";
7758     FLAGS_db = default_db_path;
7759   }
7760 
7761   if (FLAGS_stats_interval_seconds > 0) {
7762     // When both are set, FLAGS_stats_interval determines the frequency
7763     // at which the timer is checked for FLAGS_stats_interval_seconds.
7764     FLAGS_stats_interval = 1000;
7765   }
7766 
7767   if (FLAGS_seek_missing_prefix && FLAGS_prefix_size <= 8) {
7768     fprintf(stderr, "prefix_size > 8 required by --seek_missing_prefix\n");
7769     exit(1);
7770   }
7771 
7772   if ((FLAGS_enable_blob_files || FLAGS_enable_blob_garbage_collection) &&
7773       !FLAGS_merge_operator.empty()) {
7774     fprintf(stderr,
7775             "Integrated BlobDB is currently incompatible with Merge.\n");
7776     exit(1);
7777   }
7778 
7779   ROCKSDB_NAMESPACE::Benchmark benchmark;
7780   benchmark.Run();
7781 
7782 #ifndef ROCKSDB_LITE
7783   if (FLAGS_print_malloc_stats) {
7784     std::string stats_string;
7785     ROCKSDB_NAMESPACE::DumpMallocStats(&stats_string);
7786     fprintf(stdout, "Malloc stats:\n%s\n", stats_string.c_str());
7787   }
7788 #endif  // ROCKSDB_LITE
7789 
7790   return 0;
7791 }
7792 }  // namespace ROCKSDB_NAMESPACE
7793 #endif
7794