1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #ifdef GFLAGS
11 #ifdef NUMA
12 #include <numa.h>
13 #endif
14 #ifndef OS_WIN
15 #include <unistd.h>
16 #endif
17 #include <fcntl.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <sys/types.h>
21 #ifdef __APPLE__
22 #include <mach/host_info.h>
23 #include <mach/mach_host.h>
24 #include <sys/sysctl.h>
25 #endif
26 #ifdef __FreeBSD__
27 #include <sys/sysctl.h>
28 #endif
29 #include <atomic>
30 #include <cinttypes>
31 #include <condition_variable>
32 #include <cstddef>
33 #include <memory>
34 #include <mutex>
35 #include <thread>
36 #include <unordered_map>
37
38 #include "db/db_impl/db_impl.h"
39 #include "db/malloc_stats.h"
40 #include "db/version_set.h"
41 #include "hdfs/env_hdfs.h"
42 #include "monitoring/histogram.h"
43 #include "monitoring/statistics.h"
44 #include "options/cf_options.h"
45 #include "port/port.h"
46 #include "port/stack_trace.h"
47 #include "rocksdb/cache.h"
48 #include "rocksdb/db.h"
49 #include "rocksdb/env.h"
50 #include "rocksdb/filter_policy.h"
51 #include "rocksdb/memtablerep.h"
52 #include "rocksdb/options.h"
53 #include "rocksdb/perf_context.h"
54 #include "rocksdb/persistent_cache.h"
55 #include "rocksdb/rate_limiter.h"
56 #include "rocksdb/secondary_cache.h"
57 #include "rocksdb/slice.h"
58 #include "rocksdb/slice_transform.h"
59 #include "rocksdb/stats_history.h"
60 #include "rocksdb/utilities/object_registry.h"
61 #include "rocksdb/utilities/optimistic_transaction_db.h"
62 #include "rocksdb/utilities/options_type.h"
63 #include "rocksdb/utilities/options_util.h"
64 #include "rocksdb/utilities/sim_cache.h"
65 #include "rocksdb/utilities/transaction.h"
66 #include "rocksdb/utilities/transaction_db.h"
67 #include "rocksdb/write_batch.h"
68 #include "test_util/testutil.h"
69 #include "test_util/transaction_test_util.h"
70 #include "tools/simulated_hybrid_file_system.h"
71 #include "util/cast_util.h"
72 #include "util/compression.h"
73 #include "util/crc32c.h"
74 #include "util/gflags_compat.h"
75 #include "util/mutexlock.h"
76 #include "util/random.h"
77 #include "util/stderr_logger.h"
78 #include "util/string_util.h"
79 #include "util/xxhash.h"
80 #include "utilities/blob_db/blob_db.h"
81 #include "utilities/merge_operators.h"
82 #include "utilities/merge_operators/bytesxor.h"
83 #include "utilities/merge_operators/sortlist.h"
84 #include "utilities/persistent_cache/block_cache_tier.h"
85
86 #ifdef MEMKIND
87 #include "memory/memkind_kmem_allocator.h"
88 #endif
89
90 #ifdef OS_WIN
91 #include <io.h> // open/close
92 #endif
93
94 using GFLAGS_NAMESPACE::ParseCommandLineFlags;
95 using GFLAGS_NAMESPACE::RegisterFlagValidator;
96 using GFLAGS_NAMESPACE::SetUsageMessage;
97
98 DEFINE_string(
99 benchmarks,
100 "fillseq,"
101 "fillseqdeterministic,"
102 "fillsync,"
103 "fillrandom,"
104 "filluniquerandomdeterministic,"
105 "overwrite,"
106 "readrandom,"
107 "newiterator,"
108 "newiteratorwhilewriting,"
109 "seekrandom,"
110 "seekrandomwhilewriting,"
111 "seekrandomwhilemerging,"
112 "readseq,"
113 "readreverse,"
114 "compact,"
115 "compactall,"
116 "flush,"
117 #ifndef ROCKSDB_LITE
118 "compact0,"
119 "compact1,"
120 "waitforcompaction,"
121 #endif
122 "multireadrandom,"
123 "mixgraph,"
124 "readseq,"
125 "readtorowcache,"
126 "readtocache,"
127 "readreverse,"
128 "readwhilewriting,"
129 "readwhilemerging,"
130 "readwhilescanning,"
131 "readrandomwriterandom,"
132 "updaterandom,"
133 "xorupdaterandom,"
134 "approximatesizerandom,"
135 "randomwithverify,"
136 "fill100K,"
137 "crc32c,"
138 "xxhash,"
139 "compress,"
140 "uncompress,"
141 "acquireload,"
142 "fillseekseq,"
143 "randomtransaction,"
144 "randomreplacekeys,"
145 "timeseries,"
146 "getmergeoperands",
147
148 "Comma-separated list of operations to run in the specified"
149 " order. Available benchmarks:\n"
150 "\tfillseq -- write N values in sequential key"
151 " order in async mode\n"
152 "\tfillseqdeterministic -- write N values in the specified"
153 " key order and keep the shape of the LSM tree\n"
154 "\tfillrandom -- write N values in random key order in async"
155 " mode\n"
156 "\tfilluniquerandomdeterministic -- write N values in a random"
157 " key order and keep the shape of the LSM tree\n"
158 "\toverwrite -- overwrite N values in random key order in"
159 " async mode\n"
160 "\tfillsync -- write N/1000 values in random key order in "
161 "sync mode\n"
162 "\tfill100K -- write N/1000 100K values in random order in"
163 " async mode\n"
164 "\tdeleteseq -- delete N keys in sequential order\n"
165 "\tdeleterandom -- delete N keys in random order\n"
166 "\treadseq -- read N times sequentially\n"
167 "\treadtocache -- 1 thread reading database sequentially\n"
168 "\treadreverse -- read N times in reverse order\n"
169 "\treadrandom -- read N times in random order\n"
170 "\treadmissing -- read N missing keys in random order\n"
171 "\treadwhilewriting -- 1 writer, N threads doing random "
172 "reads\n"
173 "\treadwhilemerging -- 1 merger, N threads doing random "
174 "reads\n"
175 "\treadwhilescanning -- 1 thread doing full table scan, "
176 "N threads doing random reads\n"
177 "\treadrandomwriterandom -- N threads doing random-read, "
178 "random-write\n"
179 "\tupdaterandom -- N threads doing read-modify-write for random "
180 "keys\n"
181 "\txorupdaterandom -- N threads doing read-XOR-write for "
182 "random keys\n"
183 "\tappendrandom -- N threads doing read-modify-write with "
184 "growing values\n"
185 "\tmergerandom -- same as updaterandom/appendrandom using merge"
186 " operator. "
187 "Must be used with merge_operator\n"
188 "\treadrandommergerandom -- perform N random read-or-merge "
189 "operations. Must be used with merge_operator\n"
190 "\tnewiterator -- repeated iterator creation\n"
191 "\tseekrandom -- N random seeks, call Next seek_nexts times "
192 "per seek\n"
193 "\tseekrandomwhilewriting -- seekrandom and 1 thread doing "
194 "overwrite\n"
195 "\tseekrandomwhilemerging -- seekrandom and 1 thread doing "
196 "merge\n"
197 "\tcrc32c -- repeated crc32c of 4K of data\n"
198 "\txxhash -- repeated xxHash of 4K of data\n"
199 "\tacquireload -- load N*1000 times\n"
200 "\tfillseekseq -- write N values in sequential key, then read "
201 "them by seeking to each key\n"
202 "\trandomtransaction -- execute N random transactions and "
203 "verify correctness\n"
204 "\trandomreplacekeys -- randomly replaces N keys by deleting "
205 "the old version and putting the new version\n\n"
206 "\ttimeseries -- 1 writer generates time series data "
207 "and multiple readers doing random reads on id\n\n"
208 "Meta operations:\n"
209 "\tcompact -- Compact the entire DB; If multiple, randomly choose one\n"
210 "\tcompactall -- Compact the entire DB\n"
211 #ifndef ROCKSDB_LITE
212 "\tcompact0 -- compact L0 into L1\n"
213 "\tcompact1 -- compact L1 into L2\n"
214 "\twaitforcompaction - pause until compaction is (probably) done\n"
215 #endif
216 "\tflush - flush the memtable\n"
217 "\tstats -- Print DB stats\n"
218 "\tresetstats -- Reset DB stats\n"
219 "\tlevelstats -- Print the number of files and bytes per level\n"
220 "\tmemstats -- Print memtable stats\n"
221 "\tsstables -- Print sstable info\n"
222 "\theapprofile -- Dump a heap profile (if supported by this port)\n"
223 "\treplay -- replay the trace file specified with trace_file\n"
224 "\tgetmergeoperands -- Insert lots of merge records which are a list of "
225 "sorted ints for a key and then compare performance of lookup for another "
226 "key "
227 "by doing a Get followed by binary searching in the large sorted list vs "
228 "doing a GetMergeOperands and binary searching in the operands which are"
229 "sorted sub-lists. The MergeOperator used is sortlist.h\n");
230
231 DEFINE_int64(num, 1000000, "Number of key/values to place in database");
232
233 DEFINE_int64(numdistinct, 1000,
234 "Number of distinct keys to use. Used in RandomWithVerify to "
235 "read/write on fewer keys so that gets are more likely to find the"
236 " key and puts are more likely to update the same key");
237
238 DEFINE_int64(merge_keys, -1,
239 "Number of distinct keys to use for MergeRandom and "
240 "ReadRandomMergeRandom. "
241 "If negative, there will be FLAGS_num keys.");
242 DEFINE_int32(num_column_families, 1, "Number of Column Families to use.");
243
244 DEFINE_int32(
245 num_hot_column_families, 0,
246 "Number of Hot Column Families. If more than 0, only write to this "
247 "number of column families. After finishing all the writes to them, "
248 "create new set of column families and insert to them. Only used "
249 "when num_column_families > 1.");
250
251 DEFINE_string(column_family_distribution, "",
252 "Comma-separated list of percentages, where the ith element "
253 "indicates the probability of an op using the ith column family. "
254 "The number of elements must be `num_hot_column_families` if "
255 "specified; otherwise, it must be `num_column_families`. The "
256 "sum of elements must be 100. E.g., if `num_column_families=4`, "
257 "and `num_hot_column_families=0`, a valid list could be "
258 "\"10,20,30,40\".");
259
260 DEFINE_int64(reads, -1, "Number of read operations to do. "
261 "If negative, do FLAGS_num reads.");
262
263 DEFINE_int64(deletes, -1, "Number of delete operations to do. "
264 "If negative, do FLAGS_num deletions.");
265
266 DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");
267
268 DEFINE_int64(seed, 0, "Seed base for random number generators. "
269 "When 0 it is deterministic.");
270
271 DEFINE_int32(threads, 1, "Number of concurrent threads to run.");
272
273 DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run."
274 " When 0 then num & reads determine the test duration");
275
276 DEFINE_string(value_size_distribution_type, "fixed",
277 "Value size distribution type: fixed, uniform, normal");
278
279 DEFINE_int32(value_size, 100, "Size of each value in fixed distribution");
280 static unsigned int value_size = 100;
281
282 DEFINE_int32(value_size_min, 100, "Min size of random value");
283
284 DEFINE_int32(value_size_max, 102400, "Max size of random value");
285
286 DEFINE_int32(seek_nexts, 0,
287 "How many times to call Next() after Seek() in "
288 "fillseekseq, seekrandom, seekrandomwhilewriting and "
289 "seekrandomwhilemerging");
290
291 DEFINE_bool(reverse_iterator, false,
292 "When true use Prev rather than Next for iterators that do "
293 "Seek and then Next");
294
295 DEFINE_int64(max_scan_distance, 0,
296 "Used to define iterate_upper_bound (or iterate_lower_bound "
297 "if FLAGS_reverse_iterator is set to true) when value is nonzero");
298
299 DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");
300
301 DEFINE_int64(batch_size, 1, "Batch size");
302
// Flag validator taking a flag name and an int32 value. It inspects neither
// argument and accepts every value.
static bool ValidateKeySize(const char* flagname, int32_t value) {
  static_cast<void>(flagname);
  static_cast<void>(value);
  return true;
}
306
// Flag validator for uint64 flags whose value must fit in 32 bits.
//
// flagname: name of the flag being validated; used only in the error message.
// value:    the value supplied for the flag.
//
// Returns true when `value` does not exceed the largest uint32_t. Otherwise
// prints a diagnostic to stderr and returns false.
static bool ValidateUint32Range(const char* flagname, uint64_t value) {
  if (value > std::numeric_limits<uint32_t>::max()) {
    // Format with PRIu64 instead of casting to unsigned long: where
    // unsigned long is 32 bits wide (e.g. 64-bit Windows) the cast would
    // truncate exactly the out-of-range values this message reports.
    fprintf(stderr, "Invalid value for --%s: %" PRIu64 ", overflow\n",
            flagname, value);
    return false;
  }
  return true;
}
315
316 DEFINE_int32(key_size, 16, "size of each key");
317
318 DEFINE_int32(user_timestamp_size, 0,
319 "number of bytes in a user-defined timestamp");
320
321 DEFINE_int32(num_multi_db, 0,
322 "Number of DBs used in the benchmark. 0 means single DB.");
323
324 DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink"
325 " to this fraction of their original size after compression");
326
327 DEFINE_double(read_random_exp_range, 0.0,
328 "Read random's key will be generated using distribution of "
329 "num * exp(-r) where r is uniform number from 0 to this value. "
330 "The larger the number is, the more skewed the reads are. "
331 "Only used in readrandom and multireadrandom benchmarks.");
332
333 DEFINE_bool(histogram, false, "Print histogram of operation timings");
334
335 DEFINE_bool(enable_numa, false,
336 "Make operations aware of NUMA architecture and bind memory "
337 "and cpus corresponding to nodes together. In NUMA, memory "
338 "in same node as CPUs are closer when compared to memory in "
339 "other nodes. Reads can be faster when the process is bound to "
340 "CPU and memory of same node. Use \"$numactl --hardware\" command "
341 "to see NUMA memory architecture.");
342
343 DEFINE_int64(db_write_buffer_size,
344 ROCKSDB_NAMESPACE::Options().db_write_buffer_size,
345 "Number of bytes to buffer in all memtables before compacting");
346
347 DEFINE_bool(cost_write_buffer_to_cache, false,
348 "The usage of memtable is costed to the block cache");
349
350 DEFINE_int64(arena_block_size, ROCKSDB_NAMESPACE::Options().arena_block_size,
351 "The size, in bytes, of one block in arena memory allocation.");
352
353 DEFINE_int64(write_buffer_size, ROCKSDB_NAMESPACE::Options().write_buffer_size,
354 "Number of bytes to buffer in memtable before compacting");
355
356 DEFINE_int32(max_write_buffer_number,
357 ROCKSDB_NAMESPACE::Options().max_write_buffer_number,
358 "The number of in-memory memtables. Each memtable is of size"
359 " write_buffer_size bytes.");
360
361 DEFINE_int32(min_write_buffer_number_to_merge,
362 ROCKSDB_NAMESPACE::Options().min_write_buffer_number_to_merge,
363 "The minimum number of write buffers that will be merged together"
364 "before writing to storage. This is cheap because it is an"
365 "in-memory merge. If this feature is not enabled, then all these"
366 "write buffers are flushed to L0 as separate files and this "
367 "increases read amplification because a get request has to check"
368 " in all of these files. Also, an in-memory merge may result in"
369 " writing less data to storage if there are duplicate records "
370 " in each of these individual write buffers.");
371
372 DEFINE_int32(max_write_buffer_number_to_maintain,
373 ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain,
374 "The total maximum number of write buffers to maintain in memory "
375 "including copies of buffers that have already been flushed. "
376 "Unlike max_write_buffer_number, this parameter does not affect "
377 "flushing. This controls the minimum amount of write history "
378 "that will be available in memory for conflict checking when "
379 "Transactions are used. If this value is too low, some "
380 "transactions may fail at commit time due to not being able to "
381 "determine whether there were any write conflicts. Setting this "
382 "value to 0 will cause write buffers to be freed immediately "
383 "after they are flushed. If this value is set to -1, "
384 "'max_write_buffer_number' will be used.");
385
386 DEFINE_int64(max_write_buffer_size_to_maintain,
387 ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain,
388 "The total maximum size of write buffers to maintain in memory "
389 "including copies of buffers that have already been flushed. "
390 "Unlike max_write_buffer_number, this parameter does not affect "
391 "flushing. This controls the minimum amount of write history "
392 "that will be available in memory for conflict checking when "
393 "Transactions are used. If this value is too low, some "
394 "transactions may fail at commit time due to not being able to "
395 "determine whether there were any write conflicts. Setting this "
396 "value to 0 will cause write buffers to be freed immediately "
397 "after they are flushed. If this value is set to -1, "
398 "'max_write_buffer_number' will be used.");
399
400 DEFINE_int32(max_background_jobs,
401 ROCKSDB_NAMESPACE::Options().max_background_jobs,
402 "The maximum number of concurrent background jobs that can occur "
403 "in parallel.");
404
405 DEFINE_int32(num_bottom_pri_threads, 0,
406 "The number of threads in the bottom-priority thread pool (used "
407 "by universal compaction only).");
408
409 DEFINE_int32(num_high_pri_threads, 0,
410 "The maximum number of concurrent background compactions"
411 " that can occur in parallel.");
412
413 DEFINE_int32(num_low_pri_threads, 0,
414 "The maximum number of concurrent background compactions"
415 " that can occur in parallel.");
416
417 DEFINE_int32(max_background_compactions,
418 ROCKSDB_NAMESPACE::Options().max_background_compactions,
419 "The maximum number of concurrent background compactions"
420 " that can occur in parallel.");
421
422 DEFINE_int32(base_background_compactions, -1, "DEPRECATED");
423
424 DEFINE_uint64(subcompactions, 1,
425 "Maximum number of subcompactions to divide L0-L1 compactions "
426 "into.");
427 static const bool FLAGS_subcompactions_dummy
428 __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_subcompactions,
429 &ValidateUint32Range);
430
431 DEFINE_int32(max_background_flushes,
432 ROCKSDB_NAMESPACE::Options().max_background_flushes,
433 "The maximum number of concurrent background flushes"
434 " that can occur in parallel.");
435
436 static ROCKSDB_NAMESPACE::CompactionStyle FLAGS_compaction_style_e;
437 DEFINE_int32(compaction_style,
438 (int32_t)ROCKSDB_NAMESPACE::Options().compaction_style,
439 "style of compaction: level-based, universal and fifo");
440
441 static ROCKSDB_NAMESPACE::CompactionPri FLAGS_compaction_pri_e;
442 DEFINE_int32(compaction_pri,
443 (int32_t)ROCKSDB_NAMESPACE::Options().compaction_pri,
444 "priority of files to compaction: by size or by data age");
445
446 DEFINE_int32(universal_size_ratio, 0,
447 "Percentage flexibility while comparing file size"
448 " (for universal compaction only).");
449
450 DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files in a"
451 " single compaction run (for universal compaction only).");
452
453 DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
454 " in universal style compaction");
455
456 DEFINE_int32(universal_max_size_amplification_percent, 0,
457 "The max size amplification for universal style compaction");
458
459 DEFINE_int32(universal_compression_size_percent, -1,
460 "The percentage of the database to compress for universal "
461 "compaction. -1 means compress everything.");
462
463 DEFINE_bool(universal_allow_trivial_move, false,
464 "Allow trivial move in universal compaction.");
465
466 DEFINE_int64(cache_size, 8 << 20, // 8MB
467 "Number of bytes to use as a cache of uncompressed data");
468
469 DEFINE_int32(cache_numshardbits, 6,
470 "Number of shards for the block cache"
471 " is 2 ** cache_numshardbits. Negative means use default settings."
472 " This is applied only if FLAGS_cache_size is non-negative.");
473
474 DEFINE_double(cache_high_pri_pool_ratio, 0.0,
475 "Ratio of block cache reserve for high pri blocks. "
476 "If > 0.0, we also enable "
477 "cache_index_and_filter_blocks_with_high_priority.");
478
479 DEFINE_bool(use_clock_cache, false,
480 "Replace default LRU block cache with clock cache.");
481
482 DEFINE_int64(simcache_size, -1,
483 "Number of bytes to use as a simcache of "
484 "uncompressed data. Nagative value disables simcache.");
485
486 DEFINE_bool(cache_index_and_filter_blocks, false,
487 "Cache index/filter blocks in block cache.");
488
489 DEFINE_bool(use_cache_memkind_kmem_allocator, false,
490 "Use memkind kmem allocator for block cache.");
491
492 DEFINE_bool(partition_index_and_filters, false,
493 "Partition index and filter blocks.");
494
495 DEFINE_bool(partition_index, false, "Partition index blocks");
496
497 DEFINE_bool(index_with_first_key, false, "Include first key in the index");
498
499 DEFINE_bool(
500 optimize_filters_for_memory,
501 ROCKSDB_NAMESPACE::BlockBasedTableOptions().optimize_filters_for_memory,
502 "Minimize memory footprint of filters");
503
504 DEFINE_int64(
505 index_shortening_mode, 2,
506 "mode to shorten index: 0 for no shortening; 1 for only shortening "
507 "separaters; 2 for shortening shortening and successor");
508
509 DEFINE_int64(metadata_block_size,
510 ROCKSDB_NAMESPACE::BlockBasedTableOptions().metadata_block_size,
511 "Max partition size when partitioning index/filters");
512
513 // The default reduces the overhead of reading time with flash. With HDD, which
514 // offers much less throughput, however, this number better to be set to 1.
515 DEFINE_int32(ops_between_duration_checks, 1000,
516 "Check duration limit every x ops");
517
518 DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false,
519 "Pin index/filter blocks of L0 files in block cache.");
520
521 DEFINE_bool(
522 pin_top_level_index_and_filter, false,
523 "Pin top-level index of partitioned index/filter blocks in block cache.");
524
525 DEFINE_int32(block_size,
526 static_cast<int32_t>(
527 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_size),
528 "Number of bytes in a block.");
529
530 DEFINE_int32(format_version,
531 static_cast<int32_t>(
532 ROCKSDB_NAMESPACE::BlockBasedTableOptions().format_version),
533 "Format version of SST files.");
534
535 DEFINE_int32(block_restart_interval,
536 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_restart_interval,
537 "Number of keys between restart points "
538 "for delta encoding of keys in data block.");
539
540 DEFINE_int32(
541 index_block_restart_interval,
542 ROCKSDB_NAMESPACE::BlockBasedTableOptions().index_block_restart_interval,
543 "Number of keys between restart points "
544 "for delta encoding of keys in index block.");
545
546 DEFINE_int32(read_amp_bytes_per_bit,
547 ROCKSDB_NAMESPACE::BlockBasedTableOptions().read_amp_bytes_per_bit,
548 "Number of bytes per bit to be used in block read-amp bitmap");
549
550 DEFINE_bool(
551 enable_index_compression,
552 ROCKSDB_NAMESPACE::BlockBasedTableOptions().enable_index_compression,
553 "Compress the index block");
554
555 DEFINE_bool(block_align,
556 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align,
557 "Align data blocks on page size");
558
559 DEFINE_bool(use_data_block_hash_index, false,
560 "if use kDataBlockBinaryAndHash "
561 "instead of kDataBlockBinarySearch. "
562 "This is valid if only we use BlockTable");
563
564 DEFINE_double(data_block_hash_table_util_ratio, 0.75,
565 "util ratio for data block hash index table. "
566 "This is only valid if use_data_block_hash_index is "
567 "set to true");
568
569 DEFINE_int64(compressed_cache_size, -1,
570 "Number of bytes to use as a cache of compressed data.");
571
572 DEFINE_int64(row_cache_size, 0,
573 "Number of bytes to use as a cache of individual rows"
574 " (0 = disabled).");
575
576 DEFINE_int32(open_files, ROCKSDB_NAMESPACE::Options().max_open_files,
577 "Maximum number of files to keep open at the same time"
578 " (use default if == 0)");
579
580 DEFINE_int32(file_opening_threads,
581 ROCKSDB_NAMESPACE::Options().max_file_opening_threads,
582 "If open_files is set to -1, this option set the number of "
583 "threads that will be used to open files during DB::Open()");
584
585 DEFINE_bool(new_table_reader_for_compaction_inputs, true,
586 "If true, uses a separate file handle for compaction inputs");
587
588 DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");
589
590 DEFINE_int32(log_readahead_size, 0, "WAL and manifest readahead size");
591
592 DEFINE_int32(random_access_max_buffer_size, 1024 * 1024,
593 "Maximum windows randomaccess buffer size");
594
595 DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024,
596 "Maximum write buffer for Writable File");
597
598 DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means"
599 " use default settings.");
600
601 DEFINE_bool(use_ribbon_filter, false, "Use Ribbon instead of Bloom filter");
602
603 DEFINE_double(memtable_bloom_size_ratio, 0,
604 "Ratio of memtable size used for bloom filter. 0 means no bloom "
605 "filter.");
606 DEFINE_bool(memtable_whole_key_filtering, false,
607 "Try to use whole key bloom filter in memtables.");
608 DEFINE_bool(memtable_use_huge_page, false,
609 "Try to use huge page in memtables.");
610
611 DEFINE_bool(use_existing_db, false, "If true, do not destroy the existing"
612 " database. If you set this flag and also specify a benchmark that"
613 " wants a fresh database, that benchmark will fail.");
614
615 DEFINE_bool(use_existing_keys, false,
616 "If true, uses existing keys in the DB, "
617 "rather than generating new ones. This involves some startup "
618 "latency to load all keys into memory. It is supported for the "
619 "same read/overwrite benchmarks as `-use_existing_db=true`, which "
620 "must also be set for this flag to be enabled. When this flag is "
621 "set, the value for `-num` will be ignored.");
622
623 DEFINE_bool(show_table_properties, false,
624 "If true, then per-level table"
625 " properties will be printed on every stats-interval when"
626 " stats_interval is set and stats_per_interval is on.");
627
628 DEFINE_string(db, "", "Use the db with the following name.");
629
630 // Read cache flags
631
632 DEFINE_string(read_cache_path, "",
633 "If not empty string, a read cache will be used in this path");
634
635 DEFINE_int64(read_cache_size, 4LL * 1024 * 1024 * 1024,
636 "Maximum size of the read cache");
637
638 DEFINE_bool(read_cache_direct_write, true,
639 "Whether to use Direct IO for writing to the read cache");
640
641 DEFINE_bool(read_cache_direct_read, true,
642 "Whether to use Direct IO for reading from read cache");
643
644 DEFINE_bool(use_keep_filter, false, "Whether to use a noop compaction filter");
645
// Flag validator that accepts any shard-bit count below 20.
//
// flagname: name of the flag being validated; used only in the error message.
// value:    the value supplied for the flag.
//
// Returns true when value < 20; otherwise reports the rejected value on
// stderr and returns false.
static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
  const bool in_range = value < 20;
  if (!in_range) {
    fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n", flagname,
            value);
  }
  return in_range;
}
654
655 DEFINE_bool(verify_checksum, true,
656 "Verify checksum for every block read"
657 " from storage");
658
659 DEFINE_bool(statistics, false, "Database statistics");
660 DEFINE_int32(stats_level, ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers,
661 "stats level for statistics");
662 DEFINE_string(statistics_string, "", "Serialized statistics string");
663 static class std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats;
664
665 DEFINE_int64(writes, -1, "Number of write operations to do. If negative, do"
666 " --num reads.");
667
668 DEFINE_bool(finish_after_writes, false, "Write thread terminates after all writes are finished");
669
670 DEFINE_bool(sync, false, "Sync all writes to disk");
671
672 DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
673
674 DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
675
676 DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");
677
678 DEFINE_string(truth_db, "/dev/shm/truth_db/dbbench",
679 "Truth key/values used when using verify");
680
681 DEFINE_int32(num_levels, 7, "The total number of levels");
682
683 DEFINE_int64(target_file_size_base,
684 ROCKSDB_NAMESPACE::Options().target_file_size_base,
685 "Target file size at level-1");
686
687 DEFINE_int32(target_file_size_multiplier,
688 ROCKSDB_NAMESPACE::Options().target_file_size_multiplier,
689 "A multiplier to compute target level-N file size (N >= 2)");
690
691 DEFINE_uint64(max_bytes_for_level_base,
692 ROCKSDB_NAMESPACE::Options().max_bytes_for_level_base,
693 "Max bytes for level-1");
694
695 DEFINE_bool(level_compaction_dynamic_level_bytes, false,
696 "Whether level size base is dynamic");
697
698 DEFINE_double(max_bytes_for_level_multiplier, 10,
699 "A multiplier to compute max bytes for level-N (N >= 2)");
700
701 static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v;
702 DEFINE_string(max_bytes_for_level_multiplier_additional, "",
703 "A vector that specifies additional fanout per level");
704
705 DEFINE_int32(level0_stop_writes_trigger,
706 ROCKSDB_NAMESPACE::Options().level0_stop_writes_trigger,
707 "Number of files in level-0"
708 " that will trigger put stop.");
709
710 DEFINE_int32(level0_slowdown_writes_trigger,
711 ROCKSDB_NAMESPACE::Options().level0_slowdown_writes_trigger,
712 "Number of files in level-0"
713 " that will slow down writes.");
714
715 DEFINE_int32(level0_file_num_compaction_trigger,
716 ROCKSDB_NAMESPACE::Options().level0_file_num_compaction_trigger,
717 "Number of files in level-0"
718 " when compactions start");
719
720 DEFINE_uint64(periodic_compaction_seconds,
721 ROCKSDB_NAMESPACE::Options().periodic_compaction_seconds,
722 "Files older than this will be picked up for compaction and"
723 " rewritten to the same level");
724
// Flag validator for percentage flags: accepts values strictly between 0 and
// 100.
//
// flagname: name of the flag being validated; used only in the error message.
// value:    the value supplied for the flag.
//
// Returns true when 0 < value < 100; otherwise reports the rejected value on
// stderr and returns false.
static bool ValidateInt32Percent(const char* flagname, int32_t value) {
  if (value > 0 && value < 100) {
    return true;
  }
  fprintf(stderr, "Invalid value for --%s: %d, 0< pct <100 \n", flagname,
          value);
  return false;
}
733 DEFINE_int32(readwritepercent, 90, "Ratio of reads to reads/writes (expressed"
734 " as percentage) for the ReadRandomWriteRandom workload. The "
735 "default value 90 means 90% operations out of all reads and writes"
736 " operations are reads. In other words, 9 gets for every 1 put.");
737
738 DEFINE_int32(mergereadpercent, 70, "Ratio of merges to merges&reads (expressed"
739 " as percentage) for the ReadRandomMergeRandom workload. The"
740 " default value 70 means 70% out of all read and merge operations"
741 " are merges. In other words, 7 merges for every 3 gets.");
742
743 DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/"
744 "deletes (used in RandomWithVerify only). RandomWithVerify "
745 "calculates writepercent as (100 - FLAGS_readwritepercent - "
746 "deletepercent), so deletepercent must be smaller than (100 - "
747 "FLAGS_readwritepercent)");
748
749 DEFINE_bool(optimize_filters_for_hits, false,
750 "Optimizes bloom filters for workloads for most lookups return "
751 "a value. For now this doesn't create bloom filters for the max "
752 "level of the LSM to reduce metadata that should fit in RAM. ");
753
754 DEFINE_uint64(delete_obsolete_files_period_micros, 0,
755 "Ignored. Left here for backward compatibility");
756
757 DEFINE_int64(writes_before_delete_range, 0,
758 "Number of writes before DeleteRange is called regularly.");
759
760 DEFINE_int64(writes_per_range_tombstone, 0,
761 "Number of writes between range tombstones");
762
763 DEFINE_int64(range_tombstone_width, 100, "Number of keys in tombstone's range");
764
765 DEFINE_int64(max_num_range_tombstones, 0,
766 "Maximum number of range tombstones "
767 "to insert.");
768
769 DEFINE_bool(expand_range_tombstones, false,
770 "Expand range tombstone into sequential regular tombstones.");
771
#ifndef ROCKSDB_LITE
// Transactions Options
// These flags are only compiled when ROCKSDB_LITE is not defined.
DEFINE_bool(optimistic_transaction_db, false,
            "Open a OptimisticTransactionDB instance. "
            "Required for randomtransaction benchmark.");

DEFINE_bool(transaction_db, false,
            "Open a TransactionDB instance. "
            "Required for randomtransaction benchmark.");

DEFINE_uint64(transaction_sets, 2,
              "Number of keys each transaction will "
              "modify (use in RandomTransaction only).  Max: 9999");

DEFINE_bool(transaction_set_snapshot, false,
            "Setting to true will have each transaction call SetSnapshot()"
            " upon creation.");

DEFINE_int32(transaction_sleep, 0,
             "Max microseconds to sleep in between "
             "reading and writing a value (used in RandomTransaction only). ");

DEFINE_uint64(transaction_lock_timeout, 100,
              "If using a transaction_db, specifies the lock wait timeout in"
              " milliseconds before failing a transaction waiting on a lock");
// Path to an options file; the help text enumerates the option-related flags
// that are still honored when it is set.
DEFINE_string(
    options_file, "",
    "The path to a RocksDB options file.  If specified, then db_bench will "
    "run with the RocksDB options in the default column family of the "
    "specified options file. "
    "Note that with this setting, db_bench will ONLY accept the following "
    "RocksDB options related command-line arguments, all other arguments "
    "that are related to RocksDB options will be ignored:\n"
    "\t--use_existing_db\n"
    "\t--use_existing_keys\n"
    "\t--statistics\n"
    "\t--row_cache_size\n"
    "\t--row_cache_numshardbits\n"
    "\t--enable_io_prio\n"
    "\t--dump_malloc_stats\n"
    "\t--num_multi_db\n");
813
// FIFO Compaction Options
// Size limit (in MB, per the flag name), whether compaction is allowed, and
// an SST file TTL in seconds.
DEFINE_uint64(fifo_compaction_max_table_files_size_mb, 0,
              "The limit of total table file sizes to trigger FIFO compaction");

DEFINE_bool(fifo_compaction_allow_compaction, true,
            "Allow compaction in FIFO compaction.");

DEFINE_uint64(fifo_compaction_ttl, 0, "TTL for the SST Files in seconds.");
822
// Stacked BlobDB Options
// Unless a literal default is given, each flag defaults to the corresponding
// field of a default-constructed blob_db::BlobDBOptions.
DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Open a BlobDB instance.");

DEFINE_bool(
    blob_db_enable_gc,
    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection,
    "[Stacked BlobDB] Enable BlobDB garbage collection.");

DEFINE_double(
    blob_db_gc_cutoff,
    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff,
    "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection.");

DEFINE_bool(blob_db_is_fifo,
            ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().is_fifo,
            "[Stacked BlobDB] Enable FIFO eviction strategy in BlobDB.");

DEFINE_uint64(blob_db_max_db_size,
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().max_db_size,
              "[Stacked BlobDB] Max size limit of the directory where blob "
              "files are stored.");

DEFINE_uint64(blob_db_max_ttl_range, 0,
              "[Stacked BlobDB] TTL range to generate BlobDB data (in "
              "seconds). 0 means no TTL.");

DEFINE_uint64(
    blob_db_ttl_range_secs,
    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().ttl_range_secs,
    "[Stacked BlobDB] TTL bucket size to use when creating blob files.");

DEFINE_uint64(
    blob_db_min_blob_size,
    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size,
    "[Stacked BlobDB] Smallest blob to store in a file. Blobs "
    "smaller than this will be inlined with the key in the LSM tree.");

DEFINE_uint64(blob_db_bytes_per_sync,
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
              "[Stacked BlobDB] Bytes to sync blob file at.");

DEFINE_uint64(blob_db_file_size,
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size,
              "[Stacked BlobDB] Target size of each blob file.");

DEFINE_string(
    blob_db_compression_type, "snappy",
    "[Stacked BlobDB] Algorithm to use to compress blobs in blob files.");
// Enum counterpart of --blob_db_compression_type; starts as Snappy, matching
// the flag's default string. NOTE(review): presumably overwritten from the
// parsed flag elsewhere in the file -- confirm.
static enum ROCKSDB_NAMESPACE::CompressionType
    FLAGS_blob_db_compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression;

#endif  // ROCKSDB_LITE
875
// Integrated BlobDB options
// Except for blob_compression_type, each flag defaults to the corresponding
// field of a default-constructed AdvancedColumnFamilyOptions.
DEFINE_bool(
    enable_blob_files,
    ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().enable_blob_files,
    "[Integrated BlobDB] Enable writing large values to separate blob files.");

DEFINE_uint64(min_blob_size,
              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().min_blob_size,
              "[Integrated BlobDB] The size of the smallest value to be stored "
              "separately in a blob file.");

DEFINE_uint64(blob_file_size,
              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_size,
              "[Integrated BlobDB] The size limit for blob files.");

DEFINE_string(blob_compression_type, "none",
              "[Integrated BlobDB] The compression algorithm to use for large "
              "values stored in blob files.");

DEFINE_bool(enable_blob_garbage_collection,
            ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
                .enable_blob_garbage_collection,
            "[Integrated BlobDB] Enable blob garbage collection.");

DEFINE_double(blob_garbage_collection_age_cutoff,
              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
                  .blob_garbage_collection_age_cutoff,
              "[Integrated BlobDB] The cutoff in terms of blob file age for "
              "garbage collection.");
905
#ifndef ROCKSDB_LITE

// Secondary DB instance Options
// Only compiled when ROCKSDB_LITE is not defined.
DEFINE_bool(use_secondary_db, false,
            "Open a RocksDB secondary instance. A primary instance can be "
            "running in another db_bench process.");

DEFINE_string(secondary_path, "",
              "Path to a directory used by the secondary instance to store "
              "private files, e.g. info log.");

DEFINE_int32(secondary_update_interval, 5,
             "Secondary instance attempts to catch up with the primary every "
             "secondary_update_interval seconds.");

#endif  // ROCKSDB_LITE
922
923 DEFINE_bool(report_bg_io_stats, false,
924 "Measure times spents on I/Os while in compactions. ");
925
926 DEFINE_bool(use_stderr_info_logger, false,
927 "Write info logs to stderr instead of to LOG file. ");
928
929 DEFINE_string(trace_file, "", "Trace workload to a file. ");
930
931 DEFINE_int32(trace_replay_fast_forward, 1,
932 "Fast forward trace replay, must >= 1. ");
933 DEFINE_int32(block_cache_trace_sampling_frequency, 1,
934 "Block cache trace sampling frequency, termed s. It uses spatial "
935 "downsampling and samples accesses to one out of s blocks.");
936 DEFINE_int64(
937 block_cache_trace_max_trace_file_size_in_bytes,
938 uint64_t{64} * 1024 * 1024 * 1024,
939 "The maximum block cache trace file size in bytes. Block cache accesses "
940 "will not be logged if the trace file size exceeds this threshold. Default "
941 "is 64 GB.");
942 DEFINE_string(block_cache_trace_file, "", "Block cache trace file path.");
943 DEFINE_int32(trace_replay_threads, 1,
944 "The number of threads to replay, must >=1.");
945
StringToCompressionType(const char * ctype)946 static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
947 const char* ctype) {
948 assert(ctype);
949
950 if (!strcasecmp(ctype, "none"))
951 return ROCKSDB_NAMESPACE::kNoCompression;
952 else if (!strcasecmp(ctype, "snappy"))
953 return ROCKSDB_NAMESPACE::kSnappyCompression;
954 else if (!strcasecmp(ctype, "zlib"))
955 return ROCKSDB_NAMESPACE::kZlibCompression;
956 else if (!strcasecmp(ctype, "bzip2"))
957 return ROCKSDB_NAMESPACE::kBZip2Compression;
958 else if (!strcasecmp(ctype, "lz4"))
959 return ROCKSDB_NAMESPACE::kLZ4Compression;
960 else if (!strcasecmp(ctype, "lz4hc"))
961 return ROCKSDB_NAMESPACE::kLZ4HCCompression;
962 else if (!strcasecmp(ctype, "xpress"))
963 return ROCKSDB_NAMESPACE::kXpressCompression;
964 else if (!strcasecmp(ctype, "zstd"))
965 return ROCKSDB_NAMESPACE::kZSTD;
966
967 fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
968 return ROCKSDB_NAMESPACE::kSnappyCompression; // default value
969 }
970
ColumnFamilyName(size_t i)971 static std::string ColumnFamilyName(size_t i) {
972 if (i == 0) {
973 return ROCKSDB_NAMESPACE::kDefaultColumnFamilyName;
974 } else {
975 char name[100];
976 snprintf(name, sizeof(name), "column_family_name_%06zu", i);
977 return std::string(name);
978 }
979 }
980
// Compression settings. Several defaults are taken from a default-constructed
// CompressionOptions.
DEFINE_string(compression_type, "snappy",
              "Algorithm to use to compress the database");
// Enum counterpart of --compression_type; starts as Snappy, matching the
// flag's default string. NOTE(review): presumably overwritten from the parsed
// flag elsewhere in the file -- confirm.
static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_compression_type_e =
    ROCKSDB_NAMESPACE::kSnappyCompression;

DEFINE_int64(sample_for_compression, 0, "Sample every N block for compression");

DEFINE_int32(compression_level, ROCKSDB_NAMESPACE::CompressionOptions().level,
             "Compression level. The meaning of this value is library-"
             "dependent. If unset, we try to use the default for the library "
             "specified in `--compression_type`");

DEFINE_int32(compression_max_dict_bytes,
             ROCKSDB_NAMESPACE::CompressionOptions().max_dict_bytes,
             "Maximum size of dictionary used to prime the compression "
             "library.");

DEFINE_int32(compression_zstd_max_train_bytes,
             ROCKSDB_NAMESPACE::CompressionOptions().zstd_max_train_bytes,
             "Maximum size of training data passed to zstd's dictionary "
             "trainer.");

DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts"
             " from this level. Levels with number < min_level_to_compress are"
             " not compressed. Otherwise, apply compression_type to "
             "all levels.");

DEFINE_int32(compression_parallel_threads, 1,
             "Number of threads for parallel compression.");

DEFINE_uint64(compression_max_dict_buffer_bytes,
              ROCKSDB_NAMESPACE::CompressionOptions().max_dict_buffer_bytes,
              "Maximum bytes to buffer to collect samples for dictionary.");
1014
// Flag validator: accepts only values strictly between 0 and 20. A rejected
// value is reported on stderr together with the flag name.
static bool ValidateTableCacheNumshardbits(const char* flagname,
                                           int32_t value) {
  const bool in_range = (value > 0) && (value < 20);
  if (in_range) {
    return true;
  }

  fprintf(stderr, "Invalid value for --%s: %d, must be 0 < val < 20\n",
          flagname, value);
  return false;
}
1024 DEFINE_int32(table_cache_numshardbits, 4, "");
1025
1026 #ifndef ROCKSDB_LITE
1027 DEFINE_string(env_uri, "",
1028 "URI for registry Env lookup. Mutually exclusive"
1029 " with --hdfs and --fs_uri");
1030 DEFINE_string(fs_uri, "",
1031 "URI for registry Filesystem lookup. Mutually exclusive"
1032 " with --hdfs and --env_uri."
1033 " Creates a default environment with the specified filesystem.");
1034 #endif // ROCKSDB_LITE
1035 DEFINE_string(hdfs, "",
1036 "Name of hdfs environment. Mutually exclusive with"
1037 " --env_uri and --fs_uri");
1038 DEFINE_string(simulate_hybrid_fs_file, "",
1039 "File for Store Metadata for Simulate hybrid FS. Empty means "
1040 "disable the feature. Now, if it is set, "
1041 "bottommost_temperature is set to kWarm.");
1042
1043 static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;
1044
1045 static ROCKSDB_NAMESPACE::Env* FLAGS_env = ROCKSDB_NAMESPACE::Env::Default();
1046
1047 DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when "
1048 "this is greater than zero. When 0 the interval grows over time.");
1049
1050 DEFINE_int64(stats_interval_seconds, 0, "Report stats every N seconds. This "
1051 "overrides stats_interval when both are > 0.");
1052
1053 DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when"
1054 " this is greater than 0.");
1055
1056 DEFINE_int64(report_interval_seconds, 0,
1057 "If greater than zero, it will write simple stats in CVS format "
1058 "to --report_file every N seconds");
1059
1060 DEFINE_string(report_file, "report.csv",
1061 "Filename where some simple stats are reported to (if "
1062 "--report_interval_seconds is bigger than 0)");
1063
1064 DEFINE_int32(thread_status_per_interval, 0,
1065 "Takes and report a snapshot of the current status of each thread"
1066 " when this is greater than 0.");
1067
1068 DEFINE_int32(perf_level, ROCKSDB_NAMESPACE::PerfLevel::kDisable,
1069 "Level of perf collection");
1070
1071 #ifndef ROCKSDB_LITE
GetCompositeEnv(std::shared_ptr<ROCKSDB_NAMESPACE::FileSystem> fs)1072 static ROCKSDB_NAMESPACE::Env* GetCompositeEnv(
1073 std::shared_ptr<ROCKSDB_NAMESPACE::FileSystem> fs) {
1074 static std::shared_ptr<ROCKSDB_NAMESPACE::Env> composite_env =
1075 ROCKSDB_NAMESPACE::NewCompositeEnv(fs);
1076 return composite_env.get();
1077 }
1078 #endif
1079
// Flag validator for rate limits: rejects values below zero, allowing a tiny
// tolerance so that values a hair under 0.0 still pass. A rejected value is
// reported on stderr together with the flag name.
static bool ValidateRateLimit(const char* flagname, double value) {
  const double kTolerance = 1e-10;
  const bool below_zero = value < -kTolerance;
  if (!below_zero) {
    return true;
  }

  fprintf(stderr, "Invalid value for --%s: %12.6f, must be >= 0.0\n",
          flagname, value);
  return false;
}
// Deprecated rate-limit flags; ValidateRateLimit is registered for both
// further down in this file.
DEFINE_double(soft_rate_limit, 0.0, "DEPRECATED");

DEFINE_double(hard_rate_limit, 0.0, "DEPRECATED");

// Write slowdown / stop thresholds on pending compaction bytes
// (defaults: 64 * 1024^3 and 128 * 1024^3 bytes).
DEFINE_uint64(soft_pending_compaction_bytes_limit, 64ull * 1024 * 1024 * 1024,
              "Slowdown writes if pending compaction bytes exceed this number");

DEFINE_uint64(hard_pending_compaction_bytes_limit, 128ull * 1024 * 1024 * 1024,
              "Stop writes if pending compaction bytes exceed this number");

// Default is 8388608 bytes (8 * 1024 * 1024).
DEFINE_uint64(delayed_write_rate, 8388608u,
              "Limited bytes allowed to DB when soft_rate_limit or "
              "level0_slowdown_writes_trigger triggers");

// Write-path behaviour.
DEFINE_bool(enable_pipelined_write, true,
            "Allow WAL and memtable writes to be pipelined");

DEFINE_bool(
    unordered_write, false,
    "Enable the unordered write feature, which provides higher throughput but "
    "relaxes the guarantees around atomic reads and immutable snapshots");

DEFINE_bool(allow_concurrent_memtable_write, true,
            "Allow multi-writers to update mem tables in parallel.");

// In-place memtable update; defaults come from a default-constructed Options.
DEFINE_bool(inplace_update_support,
            ROCKSDB_NAMESPACE::Options().inplace_update_support,
            "Support in-place memtable update for smaller or same-size values");

DEFINE_uint64(inplace_update_num_locks,
              ROCKSDB_NAMESPACE::Options().inplace_update_num_locks,
              "Number of RW locks to protect in-place memtable updates");

// Writer-thread adaptive yield tuning.
DEFINE_bool(enable_write_thread_adaptive_yield, true,
            "Use a yielding spin loop for brief writer thread waits.");

DEFINE_uint64(
    write_thread_max_yield_usec, 100,
    "Maximum microseconds for enable_write_thread_adaptive_yield operation.");

DEFINE_uint64(write_thread_slow_yield_usec, 3,
              "The threshold at which a slow yield is considered a signal that "
              "other processes or threads want the core.");

DEFINE_int32(rate_limit_delay_max_milliseconds, 1000,
             "When hard_rate_limit is set then this is the max time a put will"
             " be stalled.");

// options.rate_limiter configuration.
DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value.");

DEFINE_bool(rate_limiter_auto_tuned, false,
            "Enable dynamic adjustment of rate limit according to demand for "
            "background I/O");
1142
1143
// Sine-wave write rate limit: sine_a..sine_d are the A, B, C, D coefficients
// of f(x) = A sin(bx + c) + d, as given in the help texts.
DEFINE_bool(sine_write_rate, false,
            "Use a sine wave write_rate_limit");

DEFINE_uint64(sine_write_rate_interval_milliseconds, 10000,
              "Interval of which the sine wave write_rate_limit is recalculated");

DEFINE_double(sine_a, 1,
              "A in f(x) = A sin(bx + c) + d");

DEFINE_double(sine_b, 1,
              "B in f(x) = A sin(bx + c) + d");

DEFINE_double(sine_c, 0,
              "C in f(x) = A sin(bx + c) + d");

DEFINE_double(sine_d, 1,
              "D in f(x) = A sin(bx + c) + d");

DEFINE_bool(rate_limit_bg_reads, false,
            "Use options.rate_limiter on compaction reads");

// Benchmark-side write throttle, in bytes/second across all writers.
DEFINE_uint64(
    benchmark_write_rate_limit, 0,
    "If non-zero, db_bench will rate-limit the writes going into RocksDB. This "
    "is the global rate in bytes/second.");
1169
1170 // the parameters of mix_graph
1171 DEFINE_double(keyrange_dist_a, 0.0,
1172 "The parameter 'a' of prefix average access distribution "
1173 "f(x)=a*exp(b*x)+c*exp(d*x)");
1174 DEFINE_double(keyrange_dist_b, 0.0,
1175 "The parameter 'b' of prefix average access distribution "
1176 "f(x)=a*exp(b*x)+c*exp(d*x)");
1177 DEFINE_double(keyrange_dist_c, 0.0,
1178 "The parameter 'c' of prefix average access distribution"
1179 "f(x)=a*exp(b*x)+c*exp(d*x)");
1180 DEFINE_double(keyrange_dist_d, 0.0,
1181 "The parameter 'd' of prefix average access distribution"
1182 "f(x)=a*exp(b*x)+c*exp(d*x)");
1183 DEFINE_int64(keyrange_num, 1,
1184 "The number of key ranges that are in the same prefix "
1185 "group, each prefix range will have its key access "
1186 "distribution");
1187 DEFINE_double(key_dist_a, 0.0,
1188 "The parameter 'a' of key access distribution model "
1189 "f(x)=a*x^b");
1190 DEFINE_double(key_dist_b, 0.0,
1191 "The parameter 'b' of key access distribution model "
1192 "f(x)=a*x^b");
1193 DEFINE_double(value_theta, 0.0,
1194 "The parameter 'theta' of Generized Pareto Distribution "
1195 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1196 DEFINE_double(value_k, 0.0,
1197 "The parameter 'k' of Generized Pareto Distribution "
1198 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1199 DEFINE_double(value_sigma, 0.0,
1200 "The parameter 'theta' of Generized Pareto Distribution "
1201 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1202 DEFINE_double(iter_theta, 0.0,
1203 "The parameter 'theta' of Generized Pareto Distribution "
1204 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1205 DEFINE_double(iter_k, 0.0,
1206 "The parameter 'k' of Generized Pareto Distribution "
1207 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1208 DEFINE_double(iter_sigma, 0.0,
1209 "The parameter 'sigma' of Generized Pareto Distribution "
1210 "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
1211 DEFINE_double(mix_get_ratio, 1.0,
1212 "The ratio of Get queries of mix_graph workload");
1213 DEFINE_double(mix_put_ratio, 0.0,
1214 "The ratio of Put queries of mix_graph workload");
1215 DEFINE_double(mix_seek_ratio, 0.0,
1216 "The ratio of Seek queries of mix_graph workload");
1217 DEFINE_int64(mix_max_scan_len, 10000, "The max scan length of Iterator");
1218 DEFINE_int64(mix_ave_kv_size, 512,
1219 "The average key-value size of this workload");
1220 DEFINE_int64(mix_max_value_size, 1024, "The max value size of this workload");
1221 DEFINE_double(
1222 sine_mix_rate_noise, 0.0,
1223 "Add the noise ratio to the sine rate, it is between 0.0 and 1.0");
1224 DEFINE_bool(sine_mix_rate, false,
1225 "Enable the sine QPS control on the mix workload");
1226 DEFINE_uint64(
1227 sine_mix_rate_interval_milliseconds, 10000,
1228 "Interval of which the sine wave read_rate_limit is recalculated");
1229 DEFINE_int64(mix_accesses, -1,
1230 "The total query accesses of mix_graph workload");
1231
1232 DEFINE_uint64(
1233 benchmark_read_rate_limit, 0,
1234 "If non-zero, db_bench will rate-limit the reads from RocksDB. This "
1235 "is the global rate in ops/second.");
1236
// Flags whose default is written as ROCKSDB_NAMESPACE::Options().<field>
// default to that field of a default-constructed Options.
DEFINE_uint64(max_compaction_bytes,
              ROCKSDB_NAMESPACE::Options().max_compaction_bytes,
              "Max bytes allowed in one compaction");

#ifndef ROCKSDB_LITE
DEFINE_bool(readonly, false, "Run read only benchmarks.");

DEFINE_bool(print_malloc_stats, false,
            "Print malloc stats to stdout after benchmarks finish.");
#endif  // ROCKSDB_LITE

DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions");

// WAL retention and size limits.
DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds.");
DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files"
              " in MB.");
DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size");

// File access mode (mmap / O_DIRECT / open-time advice).
DEFINE_bool(mmap_read, ROCKSDB_NAMESPACE::Options().allow_mmap_reads,
            "Allow reads to occur via mmap-ing files");

DEFINE_bool(mmap_write, ROCKSDB_NAMESPACE::Options().allow_mmap_writes,
            "Allow writes to occur via mmap-ing files");

DEFINE_bool(use_direct_reads, ROCKSDB_NAMESPACE::Options().use_direct_reads,
            "Use O_DIRECT for reading data");

DEFINE_bool(use_direct_io_for_flush_and_compaction,
            ROCKSDB_NAMESPACE::Options().use_direct_io_for_flush_and_compaction,
            "Use O_DIRECT for background flush and compaction writes");

DEFINE_bool(advise_random_on_open,
            ROCKSDB_NAMESPACE::Options().advise_random_on_open,
            "Advise random access on table file open");

DEFINE_string(compaction_fadvice, "NORMAL",
              "Access pattern advice when a file is compacted");
// Starts as Options().access_hint_on_compaction_start.
// NOTE(review): presumably overwritten from --compaction_fadvice elsewhere in
// the file -- confirm.
static auto FLAGS_compaction_fadvice_e =
    ROCKSDB_NAMESPACE::Options().access_hint_on_compaction_start;
1276
DEFINE_bool(use_tailing_iterator, false,
            "Use tailing iterator to access a series of keys instead of get");

DEFINE_bool(use_adaptive_mutex, ROCKSDB_NAMESPACE::Options().use_adaptive_mutex,
            "Use adaptive mutex");

// Incremental sync of SST and WAL files; defaults come from a
// default-constructed Options.
DEFINE_uint64(bytes_per_sync, ROCKSDB_NAMESPACE::Options().bytes_per_sync,
              "Allows OS to incrementally sync SST files to disk while they are"
              " being written, in the background. Issue one request for every"
              " bytes_per_sync written. 0 turns it off.");

DEFINE_uint64(wal_bytes_per_sync,
              ROCKSDB_NAMESPACE::Options().wal_bytes_per_sync,
              "Allows OS to incrementally sync WAL files to disk while they are"
              " being written, in the background. Issue one request for every"
              " wal_bytes_per_sync written. 0 turns it off.");

// RandomReplaceKeys-only knobs (per the help texts).
DEFINE_bool(use_single_deletes, true,
            "Use single deletes (used in RandomReplaceKeys only).");

DEFINE_double(stddev, 2000.0,
              "Standard deviation of normal distribution used for picking keys"
              " (used in RandomReplaceKeys only).");

// TimeSeries-only knobs (per the help texts).
DEFINE_int32(key_id_range, 100000,
             "Range of possible value of key id (used in TimeSeries only).");

DEFINE_string(expire_style, "none",
              "Style to remove expired time entries. Can be one of the options "
              "below: none (do not expired data), compaction_filter (use a "
              "compaction filter to remove expired data), delete (seek IDs and "
              "remove expired data) (used in TimeSeries only).");

DEFINE_uint64(
    time_range, 100000,
    "Range of timestamp that store in the database (used in TimeSeries"
    " only).");

DEFINE_int32(num_deletion_threads, 1,
             "Number of threads to do deletion (used in TimeSeries and delete "
             "expire_style only).");

DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge"
             " operations on a key in the memtable");
1321
// Flag validator for --prefix_size: accepts 0 <= value <= 2000000000.
// The upper bound is inclusive so the check agrees with the range printed in
// the error message (the previous `>=` comparison rejected 2000000000 itself).
// A rejected value is reported on stderr together with the flag name.
static bool ValidatePrefixSize(const char* flagname, int32_t value) {
  const int32_t kMaxPrefixSize = 2000000000;
  if (value < 0 || value > kMaxPrefixSize) {
    fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n",
            flagname, value);
    return false;
  }
  return true;
}
1330
// Prefix handling. ValidatePrefixSize is registered for --prefix_size further
// down in this file.
DEFINE_int32(prefix_size, 0, "control the prefix size for HashSkipList and "
             "plain table");
DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated "
             "per prefix, 0 means no special handling of the prefix, "
             "i.e. use the prefix comes with the generated random number.");
DEFINE_bool(total_order_seek, false,
            "Enable total order seek regardless of index format.");
DEFINE_bool(prefix_same_as_start, false,
            "Enforce iterator to return keys with prefix same as seek key.");
DEFINE_bool(
    seek_missing_prefix, false,
    "Iterator seek to keys with non-exist prefixes. Require prefix_size > 8");

DEFINE_int32(memtable_insert_with_hint_prefix_size, 0,
             "If non-zero, enable "
             "memtable insert with hint with the given prefix size.");
// Background thread priorities.
DEFINE_bool(enable_io_prio, false, "Lower the background flush/compaction "
            "threads' IO priority");
DEFINE_bool(enable_cpu_prio, false, "Lower the background flush/compaction "
            "threads' CPU priority");
DEFINE_bool(identity_as_first_hash, false, "the first hash function of cuckoo "
            "table becomes an identity function. This is only valid when key "
            "is 8 bytes");
// Statistics dumping / persistence; defaults come from a default-constructed
// Options unless a literal is given.
DEFINE_bool(dump_malloc_stats, true, "Dump malloc stats in LOG ");
DEFINE_uint64(stats_dump_period_sec,
              ROCKSDB_NAMESPACE::Options().stats_dump_period_sec,
              "Gap between printing stats to log in seconds");
DEFINE_uint64(stats_persist_period_sec,
              ROCKSDB_NAMESPACE::Options().stats_persist_period_sec,
              "Gap between persisting stats in seconds");
DEFINE_bool(persist_stats_to_disk,
            ROCKSDB_NAMESPACE::Options().persist_stats_to_disk,
            "whether to persist stats to disk");
DEFINE_uint64(stats_history_buffer_size,
              ROCKSDB_NAMESPACE::Options().stats_history_buffer_size,
              "Max number of stats snapshots to keep in memory");
// MultiGet benchmark shape.
DEFINE_int64(multiread_stride, 0,
             "Stride length for the keys in a MultiGet batch");
DEFINE_bool(multiread_batched, false, "Use the new MultiGet API");
1370
// Memtable representation kinds recognised by StringToRepFactory().
enum RepFactory {
  kSkipList,
  kPrefixHash,
  kVectorRep,
  kHashLinkedList,
};

// Translates a memtable representation name (matched case-insensitively) into
// a RepFactory. An unrecognised name is reported on stdout and mapped to
// kSkipList.
static enum RepFactory StringToRepFactory(const char* ctype) {
  assert(ctype);

  static const struct {
    const char* name;
    enum RepFactory factory;
  } kKnownFactories[] = {
      {"skip_list", kSkipList},
      {"prefix_hash", kPrefixHash},
      {"vector", kVectorRep},
      {"hash_linkedlist", kHashLinkedList},
  };

  for (const auto& candidate : kKnownFactories) {
    if (strcasecmp(ctype, candidate.name) == 0) {
      return candidate.factory;
    }
  }

  fprintf(stdout, "Cannot parse memreptable %s\n", ctype);
  return kSkipList;
}
1393
1394 static enum RepFactory FLAGS_rep_factory;
1395 DEFINE_string(memtablerep, "skip_list", "");
1396 DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count");
1397 DEFINE_bool(use_plain_table, false, "if use plain table "
1398 "instead of block-based table format");
1399 DEFINE_bool(use_cuckoo_table, false, "if use cuckoo table format");
1400 DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
1401 DEFINE_bool(use_hash_search, false, "if use kHashSearch "
1402 "instead of kBinarySearch. "
1403 "This is valid if only we use BlockTable");
1404 DEFINE_bool(use_block_based_filter, false, "if use kBlockBasedFilter "
1405 "instead of kFullFilter for filter block. "
1406 "This is valid if only we use BlockTable");
1407 DEFINE_string(merge_operator, "", "The merge operator to use with the database."
1408 "If a new merge operator is specified, be sure to use fresh"
1409 " database The possible merge operators are defined in"
1410 " utilities/merge_operators.h");
1411 DEFINE_int32(skip_list_lookahead, 0, "Used with skip_list memtablerep; try "
1412 "linear search first for this many steps from the previous "
1413 "position");
1414 DEFINE_bool(report_file_operations, false, "if report number of file "
1415 "operations");
1416 DEFINE_int32(readahead_size, 0, "Iterator readahead size");
1417
1418 DEFINE_bool(read_with_latest_user_timestamp, true,
1419 "If true, always use the current latest timestamp for read. If "
1420 "false, choose a random timestamp from the past.");
1421
1422 #ifndef ROCKSDB_LITE
1423 DEFINE_string(secondary_cache_uri, "",
1424 "Full URI for creating a custom secondary cache object");
1425 static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
1426 #endif // ROCKSDB_LITE
1427
1428 static const bool FLAGS_soft_rate_limit_dummy __attribute__((__unused__)) =
1429 RegisterFlagValidator(&FLAGS_soft_rate_limit, &ValidateRateLimit);
1430
1431 static const bool FLAGS_hard_rate_limit_dummy __attribute__((__unused__)) =
1432 RegisterFlagValidator(&FLAGS_hard_rate_limit, &ValidateRateLimit);
1433
1434 static const bool FLAGS_prefix_size_dummy __attribute__((__unused__)) =
1435 RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
1436
1437 static const bool FLAGS_key_size_dummy __attribute__((__unused__)) =
1438 RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);
1439
1440 static const bool FLAGS_cache_numshardbits_dummy __attribute__((__unused__)) =
1441 RegisterFlagValidator(&FLAGS_cache_numshardbits,
1442 &ValidateCacheNumshardbits);
1443
1444 static const bool FLAGS_readwritepercent_dummy __attribute__((__unused__)) =
1445 RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent);
1446
1447 DEFINE_int32(disable_seek_compaction, false,
1448 "Not used, left here for backwards compatibility");
1449
1450 static const bool FLAGS_deletepercent_dummy __attribute__((__unused__)) =
1451 RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent);
1452 static const bool FLAGS_table_cache_numshardbits_dummy __attribute__((__unused__)) =
1453 RegisterFlagValidator(&FLAGS_table_cache_numshardbits,
1454 &ValidateTableCacheNumshardbits);
1455
1456 namespace ROCKSDB_NAMESPACE {
1457
1458 namespace {
// File-operation tallies maintained by ReportFileOpEnv and the file wrappers
// it hands out. Every counter is zero-initialized so that a freshly
// constructed instance is well-defined even before any explicit reset
// (default-constructed std::atomic members are otherwise left uninitialized
// before C++20).
struct ReportFileOpCounters {
  std::atomic<int> open_counter_{0};         // files opened
  std::atomic<int> read_counter_{0};         // Read() calls
  std::atomic<int> append_counter_{0};       // Append() calls
  std::atomic<uint64_t> bytes_read_{0};      // bytes returned by reads
  std::atomic<uint64_t> bytes_written_{0};   // bytes passed to appends
};
1466
1467 // A special Env to records and report file operations in db_bench
1468 class ReportFileOpEnv : public EnvWrapper {
1469 public:
ReportFileOpEnv(Env * base)1470 explicit ReportFileOpEnv(Env* base) : EnvWrapper(base) { reset(); }
1471
reset()1472 void reset() {
1473 counters_.open_counter_ = 0;
1474 counters_.read_counter_ = 0;
1475 counters_.append_counter_ = 0;
1476 counters_.bytes_read_ = 0;
1477 counters_.bytes_written_ = 0;
1478 }
1479
NewSequentialFile(const std::string & f,std::unique_ptr<SequentialFile> * r,const EnvOptions & soptions)1480 Status NewSequentialFile(const std::string& f,
1481 std::unique_ptr<SequentialFile>* r,
1482 const EnvOptions& soptions) override {
1483 class CountingFile : public SequentialFile {
1484 private:
1485 std::unique_ptr<SequentialFile> target_;
1486 ReportFileOpCounters* counters_;
1487
1488 public:
1489 CountingFile(std::unique_ptr<SequentialFile>&& target,
1490 ReportFileOpCounters* counters)
1491 : target_(std::move(target)), counters_(counters) {}
1492
1493 Status Read(size_t n, Slice* result, char* scratch) override {
1494 counters_->read_counter_.fetch_add(1, std::memory_order_relaxed);
1495 Status rv = target_->Read(n, result, scratch);
1496 counters_->bytes_read_.fetch_add(result->size(),
1497 std::memory_order_relaxed);
1498 return rv;
1499 }
1500
1501 Status Skip(uint64_t n) override { return target_->Skip(n); }
1502 };
1503
1504 Status s = target()->NewSequentialFile(f, r, soptions);
1505 if (s.ok()) {
1506 counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
1507 r->reset(new CountingFile(std::move(*r), counters()));
1508 }
1509 return s;
1510 }
1511
NewRandomAccessFile(const std::string & f,std::unique_ptr<RandomAccessFile> * r,const EnvOptions & soptions)1512 Status NewRandomAccessFile(const std::string& f,
1513 std::unique_ptr<RandomAccessFile>* r,
1514 const EnvOptions& soptions) override {
1515 class CountingFile : public RandomAccessFile {
1516 private:
1517 std::unique_ptr<RandomAccessFile> target_;
1518 ReportFileOpCounters* counters_;
1519
1520 public:
1521 CountingFile(std::unique_ptr<RandomAccessFile>&& target,
1522 ReportFileOpCounters* counters)
1523 : target_(std::move(target)), counters_(counters) {}
1524 Status Read(uint64_t offset, size_t n, Slice* result,
1525 char* scratch) const override {
1526 counters_->read_counter_.fetch_add(1, std::memory_order_relaxed);
1527 Status rv = target_->Read(offset, n, result, scratch);
1528 counters_->bytes_read_.fetch_add(result->size(),
1529 std::memory_order_relaxed);
1530 return rv;
1531 }
1532 };
1533
1534 Status s = target()->NewRandomAccessFile(f, r, soptions);
1535 if (s.ok()) {
1536 counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
1537 r->reset(new CountingFile(std::move(*r), counters()));
1538 }
1539 return s;
1540 }
1541
NewWritableFile(const std::string & f,std::unique_ptr<WritableFile> * r,const EnvOptions & soptions)1542 Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
1543 const EnvOptions& soptions) override {
1544 class CountingFile : public WritableFile {
1545 private:
1546 std::unique_ptr<WritableFile> target_;
1547 ReportFileOpCounters* counters_;
1548
1549 public:
1550 CountingFile(std::unique_ptr<WritableFile>&& target,
1551 ReportFileOpCounters* counters)
1552 : target_(std::move(target)), counters_(counters) {}
1553
1554 Status Append(const Slice& data) override {
1555 counters_->append_counter_.fetch_add(1, std::memory_order_relaxed);
1556 Status rv = target_->Append(data);
1557 counters_->bytes_written_.fetch_add(data.size(),
1558 std::memory_order_relaxed);
1559 return rv;
1560 }
1561
1562 Status Append(
1563 const Slice& data,
1564 const DataVerificationInfo& /* verification_info */) override {
1565 return Append(data);
1566 }
1567
1568 Status Truncate(uint64_t size) override { return target_->Truncate(size); }
1569 Status Close() override { return target_->Close(); }
1570 Status Flush() override { return target_->Flush(); }
1571 Status Sync() override { return target_->Sync(); }
1572 };
1573
1574 Status s = target()->NewWritableFile(f, r, soptions);
1575 if (s.ok()) {
1576 counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
1577 r->reset(new CountingFile(std::move(*r), counters()));
1578 }
1579 return s;
1580 }
1581
1582 // getter
counters()1583 ReportFileOpCounters* counters() { return &counters_; }
1584
1585 private:
1586 ReportFileOpCounters counters_;
1587 };
1588
1589 } // namespace
1590
// Shapes of value-size distribution understood by the benchmark.
enum DistributionType : unsigned char {
  kFixed = 0,
  kUniform,
  kNormal
};

// Enum form of the value-size distribution setting; starts out as kFixed.
static enum DistributionType FLAGS_value_size_distribution_type_e = kFixed;

// Converts a distribution name ("fixed", "uniform" or "normal", compared
// case-insensitively) to its enum value. Any other name is reported on stdout
// and mapped to kFixed. `ctype` must not be null.
static enum DistributionType StringToDistributionType(const char* ctype) {
  assert(ctype);

  struct NamedType {
    const char* name;
    enum DistributionType type;
  };
  static const NamedType kKnownTypes[] = {
      {"fixed", kFixed},
      {"uniform", kUniform},
      {"normal", kNormal},
  };

  for (const NamedType& candidate : kKnownTypes) {
    if (strcasecmp(ctype, candidate.name) == 0) {
      return candidate.type;
    }
  }

  fprintf(stdout, "Cannot parse distribution type '%s'\n", ctype);
  return kFixed;  // default value
}
1612
// Base class for value-size generators. Subclasses supply the raw sample via
// Get(); Generate() optionally clamps it into [min, max].
class BaseDistribution {
 public:
  BaseDistribution(unsigned int _min, unsigned int _max)
      : min_value_size_(_min), max_value_size_(_max) {}
  virtual ~BaseDistribution() {}

  // Draws one sample. When NeedTruncate() is true the sample is first raised
  // to the minimum and then capped at the maximum.
  unsigned int Generate() {
    const unsigned int sample = Get();
    if (!NeedTruncate()) {
      return sample;
    }
    const unsigned int raised = std::max(min_value_size_, sample);
    return std::min(max_value_size_, raised);
  }

 private:
  // Produces the raw, unclamped sample.
  virtual unsigned int Get() = 0;
  // Whether Generate() has to clamp the sample; true unless overridden.
  virtual bool NeedTruncate() { return true; }

  unsigned int min_value_size_;
  unsigned int max_value_size_;
};
1635
1636 class FixedDistribution : public BaseDistribution
1637 {
1638 public:
FixedDistribution(unsigned int size)1639 FixedDistribution(unsigned int size) :
1640 BaseDistribution(size, size),
1641 size_(size) {}
1642 private:
Get()1643 virtual unsigned int Get() override {
1644 return size_;
1645 }
NeedTruncate()1646 virtual bool NeedTruncate() override {
1647 return false;
1648 }
1649 unsigned int size_;
1650 };
1651
1652 class NormalDistribution
1653 : public BaseDistribution, public std::normal_distribution<double> {
1654 public:
NormalDistribution(unsigned int _min,unsigned int _max)1655 NormalDistribution(unsigned int _min, unsigned int _max)
1656 : BaseDistribution(_min, _max),
1657 // 99.7% values within the range [min, max].
1658 std::normal_distribution<double>(
1659 (double)(_min + _max) / 2.0 /*mean*/,
1660 (double)(_max - _min) / 6.0 /*stddev*/),
1661 gen_(rd_()) {}
1662
1663 private:
Get()1664 virtual unsigned int Get() override {
1665 return static_cast<unsigned int>((*this)(gen_));
1666 }
1667 std::random_device rd_;
1668 std::mt19937 gen_;
1669 };
1670
1671 class UniformDistribution
1672 : public BaseDistribution,
1673 public std::uniform_int_distribution<unsigned int> {
1674 public:
UniformDistribution(unsigned int _min,unsigned int _max)1675 UniformDistribution(unsigned int _min, unsigned int _max)
1676 : BaseDistribution(_min, _max),
1677 std::uniform_int_distribution<unsigned int>(_min, _max),
1678 gen_(rd_()) {}
1679
1680 private:
Get()1681 virtual unsigned int Get() override {
1682 return (*this)(gen_);
1683 }
NeedTruncate()1684 virtual bool NeedTruncate() override {
1685 return false;
1686 }
1687 std::random_device rd_;
1688 std::mt19937 gen_;
1689 };
1690
1691 // Helper for quickly generating random data.
1692 class RandomGenerator {
1693 private:
1694 std::string data_;
1695 unsigned int pos_;
1696 std::unique_ptr<BaseDistribution> dist_;
1697
1698 public:
1699
RandomGenerator()1700 RandomGenerator() {
1701 auto max_value_size = FLAGS_value_size_max;
1702 switch (FLAGS_value_size_distribution_type_e) {
1703 case kUniform:
1704 dist_.reset(new UniformDistribution(FLAGS_value_size_min,
1705 FLAGS_value_size_max));
1706 break;
1707 case kNormal:
1708 dist_.reset(new NormalDistribution(FLAGS_value_size_min,
1709 FLAGS_value_size_max));
1710 break;
1711 case kFixed:
1712 default:
1713 dist_.reset(new FixedDistribution(value_size));
1714 max_value_size = value_size;
1715 }
1716 // We use a limited amount of data over and over again and ensure
1717 // that it is larger than the compression window (32KB), and also
1718 // large enough to serve all typical value sizes we want to write.
1719 Random rnd(301);
1720 std::string piece;
1721 while (data_.size() < (unsigned)std::max(1048576, max_value_size)) {
1722 // Add a short fragment that is as compressible as specified
1723 // by FLAGS_compression_ratio.
1724 test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
1725 data_.append(piece);
1726 }
1727 pos_ = 0;
1728 }
1729
Generate(unsigned int len)1730 Slice Generate(unsigned int len) {
1731 assert(len <= data_.size());
1732 if (pos_ + len > data_.size()) {
1733 pos_ = 0;
1734 }
1735 pos_ += len;
1736 return Slice(data_.data() + pos_ - len, len);
1737 }
1738
Generate()1739 Slice Generate() {
1740 auto len = dist_->Generate();
1741 return Generate(len);
1742 }
1743 };
1744
AppendWithSpace(std::string * str,Slice msg)1745 static void AppendWithSpace(std::string* str, Slice msg) {
1746 if (msg.empty()) return;
1747 if (!str->empty()) {
1748 str->push_back(' ');
1749 }
1750 str->append(msg.data(), msg.size());
1751 }
1752
1753 struct DBWithColumnFamilies {
1754 std::vector<ColumnFamilyHandle*> cfh;
1755 DB* db;
1756 #ifndef ROCKSDB_LITE
1757 OptimisticTransactionDB* opt_txn_db;
1758 #endif // ROCKSDB_LITE
1759 std::atomic<size_t> num_created; // Need to be updated after all the
1760 // new entries in cfh are set.
1761 size_t num_hot; // Number of column families to be queried at each moment.
1762 // After each CreateNewCf(), another num_hot number of new
1763 // Column families will be created and used to be queried.
1764 port::Mutex create_cf_mutex; // Only one thread can execute CreateNewCf()
1765 std::vector<int> cfh_idx_to_prob; // ith index holds probability of operating
1766 // on cfh[i].
1767
DBWithColumnFamiliesROCKSDB_NAMESPACE::DBWithColumnFamilies1768 DBWithColumnFamilies()
1769 : db(nullptr)
1770 #ifndef ROCKSDB_LITE
1771 , opt_txn_db(nullptr)
1772 #endif // ROCKSDB_LITE
1773 {
1774 cfh.clear();
1775 num_created = 0;
1776 num_hot = 0;
1777 }
1778
DBWithColumnFamiliesROCKSDB_NAMESPACE::DBWithColumnFamilies1779 DBWithColumnFamilies(const DBWithColumnFamilies& other)
1780 : cfh(other.cfh),
1781 db(other.db),
1782 #ifndef ROCKSDB_LITE
1783 opt_txn_db(other.opt_txn_db),
1784 #endif // ROCKSDB_LITE
1785 num_created(other.num_created.load()),
1786 num_hot(other.num_hot),
1787 cfh_idx_to_prob(other.cfh_idx_to_prob) {
1788 }
1789
DeleteDBsROCKSDB_NAMESPACE::DBWithColumnFamilies1790 void DeleteDBs() {
1791 std::for_each(cfh.begin(), cfh.end(),
1792 [](ColumnFamilyHandle* cfhi) { delete cfhi; });
1793 cfh.clear();
1794 #ifndef ROCKSDB_LITE
1795 if (opt_txn_db) {
1796 delete opt_txn_db;
1797 opt_txn_db = nullptr;
1798 } else {
1799 delete db;
1800 db = nullptr;
1801 }
1802 #else
1803 delete db;
1804 db = nullptr;
1805 #endif // ROCKSDB_LITE
1806 }
1807
GetCfhROCKSDB_NAMESPACE::DBWithColumnFamilies1808 ColumnFamilyHandle* GetCfh(int64_t rand_num) {
1809 assert(num_hot > 0);
1810 size_t rand_offset = 0;
1811 if (!cfh_idx_to_prob.empty()) {
1812 assert(cfh_idx_to_prob.size() == num_hot);
1813 int sum = 0;
1814 while (sum + cfh_idx_to_prob[rand_offset] < rand_num % 100) {
1815 sum += cfh_idx_to_prob[rand_offset];
1816 ++rand_offset;
1817 }
1818 assert(rand_offset < cfh_idx_to_prob.size());
1819 } else {
1820 rand_offset = rand_num % num_hot;
1821 }
1822 return cfh[num_created.load(std::memory_order_acquire) - num_hot +
1823 rand_offset];
1824 }
1825
1826 // stage: assume CF from 0 to stage * num_hot has be created. Need to create
1827 // stage * num_hot + 1 to stage * (num_hot + 1).
CreateNewCfROCKSDB_NAMESPACE::DBWithColumnFamilies1828 void CreateNewCf(ColumnFamilyOptions options, int64_t stage) {
1829 MutexLock l(&create_cf_mutex);
1830 if ((stage + 1) * num_hot <= num_created) {
1831 // Already created.
1832 return;
1833 }
1834 auto new_num_created = num_created + num_hot;
1835 assert(new_num_created <= cfh.size());
1836 for (size_t i = num_created; i < new_num_created; i++) {
1837 Status s =
1838 db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i]));
1839 if (!s.ok()) {
1840 fprintf(stderr, "create column family error: %s\n",
1841 s.ToString().c_str());
1842 abort();
1843 }
1844 }
1845 num_created.store(new_num_created, std::memory_order_release);
1846 }
1847 };
1848
1849 // a class that reports stats to CSV file
1850 class ReporterAgent {
1851 public:
ReporterAgent(Env * env,const std::string & fname,uint64_t report_interval_secs)1852 ReporterAgent(Env* env, const std::string& fname,
1853 uint64_t report_interval_secs)
1854 : env_(env),
1855 total_ops_done_(0),
1856 last_report_(0),
1857 report_interval_secs_(report_interval_secs),
1858 stop_(false) {
1859 auto s = env_->NewWritableFile(fname, &report_file_, EnvOptions());
1860 if (s.ok()) {
1861 s = report_file_->Append(Header() + "\n");
1862 }
1863 if (s.ok()) {
1864 s = report_file_->Flush();
1865 }
1866 if (!s.ok()) {
1867 fprintf(stderr, "Can't open %s: %s\n", fname.c_str(),
1868 s.ToString().c_str());
1869 abort();
1870 }
1871
1872 reporting_thread_ = port::Thread([&]() { SleepAndReport(); });
1873 }
1874
~ReporterAgent()1875 ~ReporterAgent() {
1876 {
1877 std::unique_lock<std::mutex> lk(mutex_);
1878 stop_ = true;
1879 stop_cv_.notify_all();
1880 }
1881 reporting_thread_.join();
1882 }
1883
1884 // thread safe
ReportFinishedOps(int64_t num_ops)1885 void ReportFinishedOps(int64_t num_ops) {
1886 total_ops_done_.fetch_add(num_ops);
1887 }
1888
1889 private:
Header() const1890 std::string Header() const { return "secs_elapsed,interval_qps"; }
SleepAndReport()1891 void SleepAndReport() {
1892 auto* clock = env_->GetSystemClock().get();
1893 auto time_started = clock->NowMicros();
1894 while (true) {
1895 {
1896 std::unique_lock<std::mutex> lk(mutex_);
1897 if (stop_ ||
1898 stop_cv_.wait_for(lk, std::chrono::seconds(report_interval_secs_),
1899 [&]() { return stop_; })) {
1900 // stopping
1901 break;
1902 }
1903 // else -> timeout, which means time for a report!
1904 }
1905 auto total_ops_done_snapshot = total_ops_done_.load();
1906 // round the seconds elapsed
1907 auto secs_elapsed =
1908 (clock->NowMicros() - time_started + kMicrosInSecond / 2) /
1909 kMicrosInSecond;
1910 std::string report = ToString(secs_elapsed) + "," +
1911 ToString(total_ops_done_snapshot - last_report_) +
1912 "\n";
1913 auto s = report_file_->Append(report);
1914 if (s.ok()) {
1915 s = report_file_->Flush();
1916 }
1917 if (!s.ok()) {
1918 fprintf(stderr,
1919 "Can't write to report file (%s), stopping the reporting\n",
1920 s.ToString().c_str());
1921 break;
1922 }
1923 last_report_ = total_ops_done_snapshot;
1924 }
1925 }
1926
1927 Env* env_;
1928 std::unique_ptr<WritableFile> report_file_;
1929 std::atomic<int64_t> total_ops_done_;
1930 int64_t last_report_;
1931 const uint64_t report_interval_secs_;
1932 ROCKSDB_NAMESPACE::port::Thread reporting_thread_;
1933 std::mutex mutex_;
1934 // will notify on stop
1935 std::condition_variable stop_cv_;
1936 bool stop_;
1937 };
1938
// Kinds of operation for which latency histograms are kept.
enum OperationType : unsigned char {
  kRead = 0,
  kWrite,
  kDelete,
  kSeek,
  kMerge,
  kUpdate,
  kCompress,
  kUncompress,
  kCrc,
  kHash,
  kOthers
};

// Human-readable label for each OperationType, used when printing histograms.
// Every enumerator has exactly one entry. (kUncompress used to be missing
// because its label was registered under a duplicate kCompress key.)
static std::unordered_map<OperationType, std::string, std::hash<unsigned char>>
    OperationTypeString = {
  {kRead, "read"},
  {kWrite, "write"},
  {kDelete, "delete"},
  {kSeek, "seek"},
  {kMerge, "merge"},
  {kUpdate, "update"},
  {kCompress, "compress"},
  {kUncompress, "uncompress"},
  {kCrc, "crc"},
  {kHash, "hash"},
  {kOthers, "op"}
};
1967
1968 class CombinedStats;
1969 class Stats {
1970 private:
1971 SystemClock* clock_;
1972 int id_;
1973 uint64_t start_ = 0;
1974 uint64_t sine_interval_;
1975 uint64_t finish_;
1976 double seconds_;
1977 uint64_t done_;
1978 uint64_t last_report_done_;
1979 uint64_t next_report_;
1980 uint64_t bytes_;
1981 uint64_t last_op_finish_;
1982 uint64_t last_report_finish_;
1983 std::unordered_map<OperationType, std::shared_ptr<HistogramImpl>,
1984 std::hash<unsigned char>> hist_;
1985 std::string message_;
1986 bool exclude_from_merge_;
1987 ReporterAgent* reporter_agent_; // does not own
1988 friend class CombinedStats;
1989
1990 public:
Stats()1991 Stats() : clock_(FLAGS_env->GetSystemClock().get()) { Start(-1); }
1992
SetReporterAgent(ReporterAgent * reporter_agent)1993 void SetReporterAgent(ReporterAgent* reporter_agent) {
1994 reporter_agent_ = reporter_agent;
1995 }
1996
Start(int id)1997 void Start(int id) {
1998 id_ = id;
1999 next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
2000 last_op_finish_ = start_;
2001 hist_.clear();
2002 done_ = 0;
2003 last_report_done_ = 0;
2004 bytes_ = 0;
2005 seconds_ = 0;
2006 start_ = clock_->NowMicros();
2007 sine_interval_ = clock_->NowMicros();
2008 finish_ = start_;
2009 last_report_finish_ = start_;
2010 message_.clear();
2011 // When set, stats from this thread won't be merged with others.
2012 exclude_from_merge_ = false;
2013 }
2014
Merge(const Stats & other)2015 void Merge(const Stats& other) {
2016 if (other.exclude_from_merge_)
2017 return;
2018
2019 for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) {
2020 auto this_it = hist_.find(it->first);
2021 if (this_it != hist_.end()) {
2022 this_it->second->Merge(*(other.hist_.at(it->first)));
2023 } else {
2024 hist_.insert({ it->first, it->second });
2025 }
2026 }
2027
2028 done_ += other.done_;
2029 bytes_ += other.bytes_;
2030 seconds_ += other.seconds_;
2031 if (other.start_ < start_) start_ = other.start_;
2032 if (other.finish_ > finish_) finish_ = other.finish_;
2033
2034 // Just keep the messages from one thread
2035 if (message_.empty()) message_ = other.message_;
2036 }
2037
Stop()2038 void Stop() {
2039 finish_ = clock_->NowMicros();
2040 seconds_ = (finish_ - start_) * 1e-6;
2041 }
2042
AddMessage(Slice msg)2043 void AddMessage(Slice msg) {
2044 AppendWithSpace(&message_, msg);
2045 }
2046
SetId(int id)2047 void SetId(int id) { id_ = id; }
SetExcludeFromMerge()2048 void SetExcludeFromMerge() { exclude_from_merge_ = true; }
2049
PrintThreadStatus()2050 void PrintThreadStatus() {
2051 std::vector<ThreadStatus> thread_list;
2052 FLAGS_env->GetThreadList(&thread_list);
2053
2054 fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n",
2055 "ThreadID", "ThreadType", "cfName", "Operation",
2056 "ElapsedTime", "Stage", "State", "OperationProperties");
2057
2058 int64_t current_time = 0;
2059 clock_->GetCurrentTime(¤t_time).PermitUncheckedError();
2060 for (auto ts : thread_list) {
2061 fprintf(stderr, "%18" PRIu64 " %10s %12s %20s %13s %45s %12s",
2062 ts.thread_id,
2063 ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(),
2064 ts.cf_name.c_str(),
2065 ThreadStatus::GetOperationName(ts.operation_type).c_str(),
2066 ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(),
2067 ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(),
2068 ThreadStatus::GetStateName(ts.state_type).c_str());
2069
2070 auto op_properties = ThreadStatus::InterpretOperationProperties(
2071 ts.operation_type, ts.op_properties);
2072 for (const auto& op_prop : op_properties) {
2073 fprintf(stderr, " %s %" PRIu64" |",
2074 op_prop.first.c_str(), op_prop.second);
2075 }
2076 fprintf(stderr, "\n");
2077 }
2078 }
2079
ResetSineInterval()2080 void ResetSineInterval() { sine_interval_ = clock_->NowMicros(); }
2081
GetSineInterval()2082 uint64_t GetSineInterval() {
2083 return sine_interval_;
2084 }
2085
GetStart()2086 uint64_t GetStart() {
2087 return start_;
2088 }
2089
ResetLastOpTime()2090 void ResetLastOpTime() {
2091 // Set to now to avoid latency from calls to SleepForMicroseconds
2092 last_op_finish_ = clock_->NowMicros();
2093 }
2094
FinishedOps(DBWithColumnFamilies * db_with_cfh,DB * db,int64_t num_ops,enum OperationType op_type=kOthers)2095 void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops,
2096 enum OperationType op_type = kOthers) {
2097 if (reporter_agent_) {
2098 reporter_agent_->ReportFinishedOps(num_ops);
2099 }
2100 if (FLAGS_histogram) {
2101 uint64_t now = clock_->NowMicros();
2102 uint64_t micros = now - last_op_finish_;
2103
2104 if (hist_.find(op_type) == hist_.end())
2105 {
2106 auto hist_temp = std::make_shared<HistogramImpl>();
2107 hist_.insert({op_type, std::move(hist_temp)});
2108 }
2109 hist_[op_type]->Add(micros);
2110
2111 if (micros > 20000 && !FLAGS_stats_interval) {
2112 fprintf(stderr, "long op: %" PRIu64 " micros%30s\r", micros, "");
2113 fflush(stderr);
2114 }
2115 last_op_finish_ = now;
2116 }
2117
2118 done_ += num_ops;
2119 if (done_ >= next_report_) {
2120 if (!FLAGS_stats_interval) {
2121 if (next_report_ < 1000) next_report_ += 100;
2122 else if (next_report_ < 5000) next_report_ += 500;
2123 else if (next_report_ < 10000) next_report_ += 1000;
2124 else if (next_report_ < 50000) next_report_ += 5000;
2125 else if (next_report_ < 100000) next_report_ += 10000;
2126 else if (next_report_ < 500000) next_report_ += 50000;
2127 else next_report_ += 100000;
2128 fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, "");
2129 } else {
2130 uint64_t now = clock_->NowMicros();
2131 int64_t usecs_since_last = now - last_report_finish_;
2132
2133 // Determine whether to print status where interval is either
2134 // each N operations or each N seconds.
2135
2136 if (FLAGS_stats_interval_seconds &&
2137 usecs_since_last < (FLAGS_stats_interval_seconds * 1000000)) {
2138 // Don't check again for this many operations
2139 next_report_ += FLAGS_stats_interval;
2140
2141 } else {
2142 fprintf(stderr,
2143 "%s ... thread %d: (%" PRIu64 ",%" PRIu64
2144 ") ops and "
2145 "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
2146 clock_->TimeToString(now / 1000000).c_str(), id_,
2147 done_ - last_report_done_, done_,
2148 (done_ - last_report_done_) / (usecs_since_last / 1000000.0),
2149 done_ / ((now - start_) / 1000000.0),
2150 (now - last_report_finish_) / 1000000.0,
2151 (now - start_) / 1000000.0);
2152
2153 if (id_ == 0 && FLAGS_stats_per_interval) {
2154 std::string stats;
2155
2156 if (db_with_cfh && db_with_cfh->num_created.load()) {
2157 for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) {
2158 if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats",
2159 &stats))
2160 fprintf(stderr, "%s\n", stats.c_str());
2161 if (FLAGS_show_table_properties) {
2162 for (int level = 0; level < FLAGS_num_levels; ++level) {
2163 if (db->GetProperty(
2164 db_with_cfh->cfh[i],
2165 "rocksdb.aggregated-table-properties-at-level" +
2166 ToString(level),
2167 &stats)) {
2168 if (stats.find("# entries=0") == std::string::npos) {
2169 fprintf(stderr, "Level[%d]: %s\n", level,
2170 stats.c_str());
2171 }
2172 }
2173 }
2174 }
2175 }
2176 } else if (db) {
2177 if (db->GetProperty("rocksdb.stats", &stats)) {
2178 fprintf(stderr, "%s\n", stats.c_str());
2179 }
2180 if (FLAGS_show_table_properties) {
2181 for (int level = 0; level < FLAGS_num_levels; ++level) {
2182 if (db->GetProperty(
2183 "rocksdb.aggregated-table-properties-at-level" +
2184 ToString(level),
2185 &stats)) {
2186 if (stats.find("# entries=0") == std::string::npos) {
2187 fprintf(stderr, "Level[%d]: %s\n", level, stats.c_str());
2188 }
2189 }
2190 }
2191 }
2192 }
2193 }
2194
2195 next_report_ += FLAGS_stats_interval;
2196 last_report_finish_ = now;
2197 last_report_done_ = done_;
2198 }
2199 }
2200 if (id_ == 0 && FLAGS_thread_status_per_interval) {
2201 PrintThreadStatus();
2202 }
2203 fflush(stderr);
2204 }
2205 }
2206
AddBytes(int64_t n)2207 void AddBytes(int64_t n) {
2208 bytes_ += n;
2209 }
2210
Report(const Slice & name)2211 void Report(const Slice& name) {
2212 // Pretend at least one op was done in case we are running a benchmark
2213 // that does not call FinishedOps().
2214 if (done_ < 1) done_ = 1;
2215
2216 std::string extra;
2217 if (bytes_ > 0) {
2218 // Rate is computed on actual elapsed time, not the sum of per-thread
2219 // elapsed times.
2220 double elapsed = (finish_ - start_) * 1e-6;
2221 char rate[100];
2222 snprintf(rate, sizeof(rate), "%6.1f MB/s",
2223 (bytes_ / 1048576.0) / elapsed);
2224 extra = rate;
2225 }
2226 AppendWithSpace(&extra, message_);
2227 double elapsed = (finish_ - start_) * 1e-6;
2228 double throughput = (double)done_/elapsed;
2229
2230 fprintf(stdout, "%-12s : %11.3f micros/op %ld ops/sec;%s%s\n",
2231 name.ToString().c_str(),
2232 seconds_ * 1e6 / done_,
2233 (long)throughput,
2234 (extra.empty() ? "" : " "),
2235 extra.c_str());
2236 if (FLAGS_histogram) {
2237 for (auto it = hist_.begin(); it != hist_.end(); ++it) {
2238 fprintf(stdout, "Microseconds per %s:\n%s\n",
2239 OperationTypeString[it->first].c_str(),
2240 it->second->ToString().c_str());
2241 }
2242 }
2243 if (FLAGS_report_file_operations) {
2244 ReportFileOpEnv* env = static_cast<ReportFileOpEnv*>(FLAGS_env);
2245 ReportFileOpCounters* counters = env->counters();
2246 fprintf(stdout, "Num files opened: %d\n",
2247 counters->open_counter_.load(std::memory_order_relaxed));
2248 fprintf(stdout, "Num Read(): %d\n",
2249 counters->read_counter_.load(std::memory_order_relaxed));
2250 fprintf(stdout, "Num Append(): %d\n",
2251 counters->append_counter_.load(std::memory_order_relaxed));
2252 fprintf(stdout, "Num bytes read: %" PRIu64 "\n",
2253 counters->bytes_read_.load(std::memory_order_relaxed));
2254 fprintf(stdout, "Num bytes written: %" PRIu64 "\n",
2255 counters->bytes_written_.load(std::memory_order_relaxed));
2256 env->reset();
2257 }
2258 fflush(stdout);
2259 }
2260 };
2261
2262 class CombinedStats {
2263 public:
AddStats(const Stats & stat)2264 void AddStats(const Stats& stat) {
2265 uint64_t total_ops = stat.done_;
2266 uint64_t total_bytes_ = stat.bytes_;
2267 double elapsed;
2268
2269 if (total_ops < 1) {
2270 total_ops = 1;
2271 }
2272
2273 elapsed = (stat.finish_ - stat.start_) * 1e-6;
2274 throughput_ops_.emplace_back(total_ops / elapsed);
2275
2276 if (total_bytes_ > 0) {
2277 double mbs = (total_bytes_ / 1048576.0);
2278 throughput_mbs_.emplace_back(mbs / elapsed);
2279 }
2280 }
2281
Report(const std::string & bench_name)2282 void Report(const std::string& bench_name) {
2283 const char* name = bench_name.c_str();
2284 int num_runs = static_cast<int>(throughput_ops_.size());
2285
2286 if (throughput_mbs_.size() == throughput_ops_.size()) {
2287 fprintf(stdout,
2288 "%s [AVG %d runs] : %d ops/sec; %6.1f MB/sec\n"
2289 "%s [MEDIAN %d runs] : %d ops/sec; %6.1f MB/sec\n",
2290 name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
2291 CalcAvg(throughput_mbs_), name, num_runs,
2292 static_cast<int>(CalcMedian(throughput_ops_)),
2293 CalcMedian(throughput_mbs_));
2294 } else {
2295 fprintf(stdout,
2296 "%s [AVG %d runs] : %d ops/sec\n"
2297 "%s [MEDIAN %d runs] : %d ops/sec\n",
2298 name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)), name,
2299 num_runs, static_cast<int>(CalcMedian(throughput_ops_)));
2300 }
2301 }
2302
2303 private:
CalcAvg(std::vector<double> data)2304 double CalcAvg(std::vector<double> data) {
2305 double avg = 0;
2306 for (double x : data) {
2307 avg += x;
2308 }
2309 avg = avg / data.size();
2310 return avg;
2311 }
2312
CalcMedian(std::vector<double> data)2313 double CalcMedian(std::vector<double> data) {
2314 assert(data.size() > 0);
2315 std::sort(data.begin(), data.end());
2316
2317 size_t mid = data.size() / 2;
2318 if (data.size() % 2 == 1) {
2319 // Odd number of entries
2320 return data[mid];
2321 } else {
2322 // Even number of entries
2323 return (data[mid] + data[mid - 1]) / 2;
2324 }
2325 }
2326
2327 std::vector<double> throughput_ops_;
2328 std::vector<double> throughput_mbs_;
2329 };
2330
2331 class TimestampEmulator {
2332 private:
2333 std::atomic<uint64_t> timestamp_;
2334
2335 public:
TimestampEmulator()2336 TimestampEmulator() : timestamp_(0) {}
Get() const2337 uint64_t Get() const { return timestamp_.load(); }
Inc()2338 void Inc() { timestamp_++; }
Allocate(char * scratch)2339 Slice Allocate(char* scratch) {
2340 // TODO: support larger timestamp sizes
2341 assert(FLAGS_user_timestamp_size == 8);
2342 assert(scratch);
2343 uint64_t ts = timestamp_.fetch_add(1);
2344 EncodeFixed64(scratch, ts);
2345 return Slice(scratch, FLAGS_user_timestamp_size);
2346 }
GetTimestampForRead(Random64 & rand,char * scratch)2347 Slice GetTimestampForRead(Random64& rand, char* scratch) {
2348 assert(FLAGS_user_timestamp_size == 8);
2349 assert(scratch);
2350 if (FLAGS_read_with_latest_user_timestamp) {
2351 return Allocate(scratch);
2352 }
2353 // Choose a random timestamp from the past.
2354 uint64_t ts = rand.Next() % Get();
2355 EncodeFixed64(scratch, ts);
2356 return Slice(scratch, FLAGS_user_timestamp_size);
2357 }
2358 };
2359
2360 // State shared by all concurrent executions of the same benchmark.
2361 struct SharedState {
2362 port::Mutex mu;
2363 port::CondVar cv;
2364 int total;
2365 int perf_level;
2366 std::shared_ptr<RateLimiter> write_rate_limiter;
2367 std::shared_ptr<RateLimiter> read_rate_limiter;
2368
2369 // Each thread goes through the following states:
2370 // (1) initializing
2371 // (2) waiting for others to be initialized
2372 // (3) running
2373 // (4) done
2374
2375 long num_initialized;
2376 long num_done;
2377 bool start;
2378
SharedStateROCKSDB_NAMESPACE::SharedState2379 SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { }
2380 };
2381
2382 // Per-thread state for concurrent executions of the same benchmark.
2383 struct ThreadState {
2384 int tid; // 0..n-1 when running in n threads
2385 Random64 rand; // Has different seeds for different threads
2386 Stats stats;
2387 SharedState* shared;
2388
ThreadStateROCKSDB_NAMESPACE::ThreadState2389 explicit ThreadState(int index)
2390 : tid(index), rand((FLAGS_seed ? FLAGS_seed : 1000) + index) {}
2391 };
2392
2393 class Duration {
2394 public:
Duration(uint64_t max_seconds,int64_t max_ops,int64_t ops_per_stage=0)2395 Duration(uint64_t max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) {
2396 max_seconds_ = max_seconds;
2397 max_ops_= max_ops;
2398 ops_per_stage_ = (ops_per_stage > 0) ? ops_per_stage : max_ops;
2399 ops_ = 0;
2400 start_at_ = FLAGS_env->NowMicros();
2401 }
2402
GetStage()2403 int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; }
2404
Done(int64_t increment)2405 bool Done(int64_t increment) {
2406 if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops
2407 ops_ += increment;
2408
2409 if (max_seconds_) {
2410 // Recheck every appx 1000 ops (exact iff increment is factor of 1000)
2411 auto granularity = FLAGS_ops_between_duration_checks;
2412 if ((ops_ / granularity) != ((ops_ - increment) / granularity)) {
2413 uint64_t now = FLAGS_env->NowMicros();
2414 return ((now - start_at_) / 1000000) >= max_seconds_;
2415 } else {
2416 return false;
2417 }
2418 } else {
2419 return ops_ > max_ops_;
2420 }
2421 }
2422
2423 private:
2424 uint64_t max_seconds_;
2425 int64_t max_ops_;
2426 int64_t ops_per_stage_;
2427 int64_t ops_;
2428 uint64_t start_at_;
2429 };
2430
2431 class Benchmark {
2432 private:
2433 std::shared_ptr<Cache> cache_;
2434 std::shared_ptr<Cache> compressed_cache_;
2435 std::shared_ptr<const FilterPolicy> filter_policy_;
2436 const SliceTransform* prefix_extractor_;
2437 DBWithColumnFamilies db_;
2438 std::vector<DBWithColumnFamilies> multi_dbs_;
2439 int64_t num_;
2440 int key_size_;
2441 int user_timestamp_size_;
2442 int prefix_size_;
2443 int64_t keys_per_prefix_;
2444 int64_t entries_per_batch_;
2445 int64_t writes_before_delete_range_;
2446 int64_t writes_per_range_tombstone_;
2447 int64_t range_tombstone_width_;
2448 int64_t max_num_range_tombstones_;
2449 WriteOptions write_options_;
2450 Options open_options_; // keep options around to properly destroy db later
2451 #ifndef ROCKSDB_LITE
2452 TraceOptions trace_options_;
2453 TraceOptions block_cache_trace_options_;
2454 #endif
2455 int64_t reads_;
2456 int64_t deletes_;
2457 double read_random_exp_range_;
2458 int64_t writes_;
2459 int64_t readwrites_;
2460 int64_t merge_keys_;
2461 bool report_file_operations_;
2462 bool use_blob_db_; // Stacked BlobDB
2463 std::vector<std::string> keys_;
2464
2465 class ErrorHandlerListener : public EventListener {
2466 public:
2467 #ifndef ROCKSDB_LITE
ErrorHandlerListener()2468 ErrorHandlerListener()
2469 : mutex_(),
2470 cv_(&mutex_),
2471 no_auto_recovery_(false),
2472 recovery_complete_(false) {}
2473
~ErrorHandlerListener()2474 ~ErrorHandlerListener() override {}
2475
OnErrorRecoveryBegin(BackgroundErrorReason,Status,bool * auto_recovery)2476 void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/,
2477 Status /*bg_error*/,
2478 bool* auto_recovery) override {
2479 if (*auto_recovery && no_auto_recovery_) {
2480 *auto_recovery = false;
2481 }
2482 }
2483
OnErrorRecoveryCompleted(Status)2484 void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override {
2485 InstrumentedMutexLock l(&mutex_);
2486 recovery_complete_ = true;
2487 cv_.SignalAll();
2488 }
2489
WaitForRecovery(uint64_t abs_time_us)2490 bool WaitForRecovery(uint64_t abs_time_us) {
2491 InstrumentedMutexLock l(&mutex_);
2492 if (!recovery_complete_) {
2493 cv_.TimedWait(abs_time_us);
2494 }
2495 if (recovery_complete_) {
2496 recovery_complete_ = false;
2497 return true;
2498 }
2499 return false;
2500 }
2501
EnableAutoRecovery(bool enable=true)2502 void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }
2503
2504 private:
2505 InstrumentedMutex mutex_;
2506 InstrumentedCondVar cv_;
2507 bool no_auto_recovery_;
2508 bool recovery_complete_;
2509 #else // ROCKSDB_LITE
2510 bool WaitForRecovery(uint64_t /*abs_time_us*/) { return true; }
2511 void EnableAutoRecovery(bool /*enable*/) {}
2512 #endif // ROCKSDB_LITE
2513 };
2514
2515 std::shared_ptr<ErrorHandlerListener> listener_;
2516
2517 std::unique_ptr<TimestampEmulator> mock_app_clock_;
2518
SanityCheck()2519 bool SanityCheck() {
2520 if (FLAGS_compression_ratio > 1) {
2521 fprintf(stderr, "compression_ratio should be between 0 and 1\n");
2522 return false;
2523 }
2524 return true;
2525 }
2526
CompressSlice(const CompressionInfo & compression_info,const Slice & input,std::string * compressed)2527 inline bool CompressSlice(const CompressionInfo& compression_info,
2528 const Slice& input, std::string* compressed) {
2529 constexpr uint32_t compress_format_version = 2;
2530
2531 return CompressData(input, compression_info, compress_format_version,
2532 compressed);
2533 }
2534
PrintHeader()2535 void PrintHeader() {
2536 PrintEnvironment();
2537 fprintf(stdout,
2538 "Keys: %d bytes each (+ %d bytes user-defined timestamp)\n",
2539 FLAGS_key_size, FLAGS_user_timestamp_size);
2540 auto avg_value_size = FLAGS_value_size;
2541 if (FLAGS_value_size_distribution_type_e == kFixed) {
2542 fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n",
2543 avg_value_size,
2544 static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
2545 } else {
2546 avg_value_size = (FLAGS_value_size_min + FLAGS_value_size_max) / 2;
2547 fprintf(stdout, "Values: %d avg bytes each (%d bytes after compression)\n",
2548 avg_value_size,
2549 static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
2550 fprintf(stdout, "Values Distribution: %s (min: %d, max: %d)\n",
2551 FLAGS_value_size_distribution_type.c_str(),
2552 FLAGS_value_size_min, FLAGS_value_size_max);
2553 }
2554 fprintf(stdout, "Entries: %" PRIu64 "\n", num_);
2555 fprintf(stdout, "Prefix: %d bytes\n", FLAGS_prefix_size);
2556 fprintf(stdout, "Keys per prefix: %" PRIu64 "\n", keys_per_prefix_);
2557 fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
2558 ((static_cast<int64_t>(FLAGS_key_size + avg_value_size) * num_)
2559 / 1048576.0));
2560 fprintf(stdout, "FileSize: %.1f MB (estimated)\n",
2561 (((FLAGS_key_size + avg_value_size * FLAGS_compression_ratio)
2562 * num_)
2563 / 1048576.0));
2564 fprintf(stdout, "Write rate: %" PRIu64 " bytes/second\n",
2565 FLAGS_benchmark_write_rate_limit);
2566 fprintf(stdout, "Read rate: %" PRIu64 " ops/second\n",
2567 FLAGS_benchmark_read_rate_limit);
2568 if (FLAGS_enable_numa) {
2569 fprintf(stderr, "Running in NUMA enabled mode.\n");
2570 #ifndef NUMA
2571 fprintf(stderr, "NUMA is not defined in the system.\n");
2572 exit(1);
2573 #else
2574 if (numa_available() == -1) {
2575 fprintf(stderr, "NUMA is not supported by the system.\n");
2576 exit(1);
2577 }
2578 #endif
2579 }
2580
2581 auto compression = CompressionTypeToString(FLAGS_compression_type_e);
2582 fprintf(stdout, "Compression: %s\n", compression.c_str());
2583 fprintf(stdout, "Compression sampling rate: %" PRId64 "\n",
2584 FLAGS_sample_for_compression);
2585
2586 switch (FLAGS_rep_factory) {
2587 case kPrefixHash:
2588 fprintf(stdout, "Memtablerep: prefix_hash\n");
2589 break;
2590 case kSkipList:
2591 fprintf(stdout, "Memtablerep: skip_list\n");
2592 break;
2593 case kVectorRep:
2594 fprintf(stdout, "Memtablerep: vector\n");
2595 break;
2596 case kHashLinkedList:
2597 fprintf(stdout, "Memtablerep: hash_linkedlist\n");
2598 break;
2599 }
2600 fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);
2601
2602 PrintWarnings(compression.c_str());
2603 fprintf(stdout, "------------------------------------------------\n");
2604 }
2605
PrintWarnings(const char * compression)2606 void PrintWarnings(const char* compression) {
2607 #if defined(__GNUC__) && !defined(__OPTIMIZE__)
2608 fprintf(stdout,
2609 "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
2610 );
2611 #endif
2612 #ifndef NDEBUG
2613 fprintf(stdout,
2614 "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
2615 #endif
2616 if (FLAGS_compression_type_e != ROCKSDB_NAMESPACE::kNoCompression) {
2617 // The test string should not be too small.
2618 const int len = FLAGS_block_size;
2619 std::string input_str(len, 'y');
2620 std::string compressed;
2621 CompressionOptions opts;
2622 CompressionContext context(FLAGS_compression_type_e);
2623 CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
2624 FLAGS_compression_type_e,
2625 FLAGS_sample_for_compression);
2626 bool result = CompressSlice(info, Slice(input_str), &compressed);
2627
2628 if (!result) {
2629 fprintf(stdout, "WARNING: %s compression is not enabled\n",
2630 compression);
2631 } else if (compressed.size() >= input_str.size()) {
2632 fprintf(stdout, "WARNING: %s compression is not effective\n",
2633 compression);
2634 }
2635 }
2636 }
2637
// Currently the following isn't equivalent to OS_LINUX.
2639 #if defined(__linux)
TrimSpace(Slice s)2640 static Slice TrimSpace(Slice s) {
2641 unsigned int start = 0;
2642 while (start < s.size() && isspace(s[start])) {
2643 start++;
2644 }
2645 unsigned int limit = static_cast<unsigned int>(s.size());
2646 while (limit > start && isspace(s[limit-1])) {
2647 limit--;
2648 }
2649 return Slice(s.data() + start, limit - start);
2650 }
2651 #endif
2652
PrintEnvironment()2653 void PrintEnvironment() {
2654 fprintf(stderr, "RocksDB: version %d.%d\n",
2655 kMajorVersion, kMinorVersion);
2656
2657 #if defined(__linux) || defined(__APPLE__) || defined(__FreeBSD__)
2658 time_t now = time(nullptr);
2659 char buf[52];
2660 // Lint complains about ctime() usage, so replace it with ctime_r(). The
2661 // requirement is to provide a buffer which is at least 26 bytes.
2662 fprintf(stderr, "Date: %s",
2663 ctime_r(&now, buf)); // ctime_r() adds newline
2664
2665 #if defined(__linux)
2666 FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
2667 if (cpuinfo != nullptr) {
2668 char line[1000];
2669 int num_cpus = 0;
2670 std::string cpu_type;
2671 std::string cache_size;
2672 while (fgets(line, sizeof(line), cpuinfo) != nullptr) {
2673 const char* sep = strchr(line, ':');
2674 if (sep == nullptr) {
2675 continue;
2676 }
2677 Slice key = TrimSpace(Slice(line, sep - 1 - line));
2678 Slice val = TrimSpace(Slice(sep + 1));
2679 if (key == "model name") {
2680 ++num_cpus;
2681 cpu_type = val.ToString();
2682 } else if (key == "cache size") {
2683 cache_size = val.ToString();
2684 }
2685 }
2686 fclose(cpuinfo);
2687 fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str());
2688 fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
2689 }
2690 #elif defined(__APPLE__)
2691 struct host_basic_info h;
2692 size_t hlen = HOST_BASIC_INFO_COUNT;
2693 if (host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&h,
2694 (uint32_t*)&hlen) == KERN_SUCCESS) {
2695 std::string cpu_type;
2696 std::string cache_size;
2697 size_t hcache_size;
2698 hlen = sizeof(hcache_size);
2699 if (sysctlbyname("hw.cachelinesize", &hcache_size, &hlen, NULL, 0) == 0) {
2700 cache_size = std::to_string(hcache_size);
2701 }
2702 switch (h.cpu_type) {
2703 case CPU_TYPE_X86_64:
2704 cpu_type = "x86_64";
2705 break;
2706 case CPU_TYPE_ARM64:
2707 cpu_type = "arm64";
2708 break;
2709 default:
2710 break;
2711 }
2712 fprintf(stderr, "CPU: %d * %s\n", h.max_cpus, cpu_type.c_str());
2713 fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
2714 }
2715 #elif defined(__FreeBSD__)
2716 int ncpus;
2717 size_t len = sizeof(ncpus);
2718 int mib[2] = {CTL_HW, HW_NCPU};
2719 if (sysctl(mib, 2, &ncpus, &len, nullptr, 0) == 0) {
2720 char cpu_type[16];
2721 len = sizeof(cpu_type) - 1;
2722 mib[1] = HW_MACHINE;
2723 if (sysctl(mib, 2, cpu_type, &len, nullptr, 0) == 0) cpu_type[len] = 0;
2724
2725 fprintf(stderr, "CPU: %d * %s\n", ncpus, cpu_type);
2726 // no programmatic way to get the cache line size except on PPC
2727 }
2728 #endif
2729 #endif
2730 }
2731
KeyExpired(const TimestampEmulator * timestamp_emulator,const Slice & key)2732 static bool KeyExpired(const TimestampEmulator* timestamp_emulator,
2733 const Slice& key) {
2734 const char* pos = key.data();
2735 pos += 8;
2736 uint64_t timestamp = 0;
2737 if (port::kLittleEndian) {
2738 int bytes_to_fill = 8;
2739 for (int i = 0; i < bytes_to_fill; ++i) {
2740 timestamp |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i]))
2741 << ((bytes_to_fill - i - 1) << 3));
2742 }
2743 } else {
2744 memcpy(×tamp, pos, sizeof(timestamp));
2745 }
2746 return timestamp_emulator->Get() - timestamp > FLAGS_time_range;
2747 }
2748
2749 class ExpiredTimeFilter : public CompactionFilter {
2750 public:
ExpiredTimeFilter(const std::shared_ptr<TimestampEmulator> & timestamp_emulator)2751 explicit ExpiredTimeFilter(
2752 const std::shared_ptr<TimestampEmulator>& timestamp_emulator)
2753 : timestamp_emulator_(timestamp_emulator) {}
Filter(int,const Slice & key,const Slice &,std::string *,bool *) const2754 bool Filter(int /*level*/, const Slice& key,
2755 const Slice& /*existing_value*/, std::string* /*new_value*/,
2756 bool* /*value_changed*/) const override {
2757 return KeyExpired(timestamp_emulator_.get(), key);
2758 }
Name() const2759 const char* Name() const override { return "ExpiredTimeFilter"; }
2760
2761 private:
2762 std::shared_ptr<TimestampEmulator> timestamp_emulator_;
2763 };
2764
2765 class KeepFilter : public CompactionFilter {
2766 public:
Filter(int,const Slice &,const Slice &,std::string *,bool *) const2767 bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
2768 std::string* /*new_value*/,
2769 bool* /*value_changed*/) const override {
2770 return false;
2771 }
2772
Name() const2773 const char* Name() const override { return "KeepFilter"; }
2774 };
2775
NewCache(int64_t capacity)2776 std::shared_ptr<Cache> NewCache(int64_t capacity) {
2777 if (capacity <= 0) {
2778 return nullptr;
2779 }
2780 if (FLAGS_use_clock_cache) {
2781 auto cache = NewClockCache(static_cast<size_t>(capacity),
2782 FLAGS_cache_numshardbits);
2783 if (!cache) {
2784 fprintf(stderr, "Clock cache not supported.");
2785 exit(1);
2786 }
2787 return cache;
2788 } else {
2789 LRUCacheOptions opts(
2790 static_cast<size_t>(capacity), FLAGS_cache_numshardbits,
2791 false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio,
2792 #ifdef MEMKIND
2793 FLAGS_use_cache_memkind_kmem_allocator
2794 ? std::make_shared<MemkindKmemAllocator>()
2795 : nullptr
2796 #else
2797 nullptr
2798 #endif
2799 );
2800 if (FLAGS_use_cache_memkind_kmem_allocator) {
2801 #ifndef MEMKIND
2802 fprintf(stderr, "Memkind library is not linked with the binary.");
2803 exit(1);
2804 #endif
2805 }
2806 #ifndef ROCKSDB_LITE
2807 if (!FLAGS_secondary_cache_uri.empty()) {
2808 Status s =
2809 ObjectRegistry::NewInstance()->NewSharedObject<SecondaryCache>(
2810 FLAGS_secondary_cache_uri, &secondary_cache);
2811 if (secondary_cache == nullptr) {
2812 fprintf(
2813 stderr,
2814 "No secondary cache registered matching string: %s status=%s\n",
2815 FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str());
2816 exit(1);
2817 }
2818 opts.secondary_cache = secondary_cache;
2819 }
2820 #endif // ROCKSDB_LITE
2821 return NewLRUCache(opts);
2822 }
2823 }
2824
2825 public:
Benchmark()2826 Benchmark()
2827 : cache_(NewCache(FLAGS_cache_size)),
2828 compressed_cache_(NewCache(FLAGS_compressed_cache_size)),
2829 filter_policy_(
2830 FLAGS_use_ribbon_filter
2831 ? NewExperimentalRibbonFilterPolicy(FLAGS_bloom_bits)
2832 : FLAGS_bloom_bits >= 0
2833 ? NewBloomFilterPolicy(FLAGS_bloom_bits,
2834 FLAGS_use_block_based_filter)
2835 : nullptr),
2836 prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)),
2837 num_(FLAGS_num),
2838 key_size_(FLAGS_key_size),
2839 user_timestamp_size_(FLAGS_user_timestamp_size),
2840 prefix_size_(FLAGS_prefix_size),
2841 keys_per_prefix_(FLAGS_keys_per_prefix),
2842 entries_per_batch_(1),
2843 reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
2844 read_random_exp_range_(0.0),
2845 writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
2846 readwrites_(
2847 (FLAGS_writes < 0 && FLAGS_reads < 0)
2848 ? FLAGS_num
2849 : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)),
2850 merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
2851 report_file_operations_(FLAGS_report_file_operations),
2852 #ifndef ROCKSDB_LITE
2853 use_blob_db_(FLAGS_use_blob_db) // Stacked BlobDB
2854 #else
2855 use_blob_db_(false) // Stacked BlobDB
2856 #endif // !ROCKSDB_LITE
2857 {
2858 // use simcache instead of cache
2859 if (FLAGS_simcache_size >= 0) {
2860 if (FLAGS_cache_numshardbits >= 1) {
2861 cache_ =
2862 NewSimCache(cache_, FLAGS_simcache_size, FLAGS_cache_numshardbits);
2863 } else {
2864 cache_ = NewSimCache(cache_, FLAGS_simcache_size, 0);
2865 }
2866 }
2867
2868 if (report_file_operations_) {
2869 if (!FLAGS_hdfs.empty()) {
2870 fprintf(stderr,
2871 "--hdfs and --report_file_operations cannot be enabled "
2872 "at the same time");
2873 exit(1);
2874 }
2875 FLAGS_env = new ReportFileOpEnv(FLAGS_env);
2876 }
2877
2878 if (FLAGS_prefix_size > FLAGS_key_size) {
2879 fprintf(stderr, "prefix size is larger than key size");
2880 exit(1);
2881 }
2882
2883 std::vector<std::string> files;
2884 FLAGS_env->GetChildren(FLAGS_db, &files);
2885 for (size_t i = 0; i < files.size(); i++) {
2886 if (Slice(files[i]).starts_with("heap-")) {
2887 FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
2888 }
2889 }
2890 if (!FLAGS_use_existing_db) {
2891 Options options;
2892 options.env = FLAGS_env;
2893 if (!FLAGS_wal_dir.empty()) {
2894 options.wal_dir = FLAGS_wal_dir;
2895 }
2896 #ifndef ROCKSDB_LITE
2897 if (use_blob_db_) {
2898 // Stacked BlobDB
2899 blob_db::DestroyBlobDB(FLAGS_db, options, blob_db::BlobDBOptions());
2900 }
2901 #endif // !ROCKSDB_LITE
2902 DestroyDB(FLAGS_db, options);
2903 if (!FLAGS_wal_dir.empty()) {
2904 FLAGS_env->DeleteDir(FLAGS_wal_dir);
2905 }
2906
2907 if (FLAGS_num_multi_db > 1) {
2908 FLAGS_env->CreateDir(FLAGS_db);
2909 if (!FLAGS_wal_dir.empty()) {
2910 FLAGS_env->CreateDir(FLAGS_wal_dir);
2911 }
2912 }
2913 }
2914
2915 listener_.reset(new ErrorHandlerListener());
2916 if (user_timestamp_size_ > 0) {
2917 mock_app_clock_.reset(new TimestampEmulator());
2918 }
2919 }
2920
// Releases every open database, the prefix extractor and, if a block cache
// exists, detaches its data so shutdown does not pay for freeing it.
~Benchmark() {
  db_.DeleteDBs();
  // Iterate by reference: DeleteDBs() must act on the stored entries, not on
  // temporary copies that would leave the originals holding stale pointers.
  for (auto& db : multi_dbs_) {
    db.DeleteDBs();
  }
  delete prefix_extractor_;
  if (cache_.get() != nullptr) {
    // Clear cache reference first
    open_options_.write_buffer_manager.reset();
    // this will leak, but we're shutting down so nobody cares
    cache_->DisownData();
  }
}
2934
AllocateKey(std::unique_ptr<const char[]> * key_guard)2935 Slice AllocateKey(std::unique_ptr<const char[]>* key_guard) {
2936 char* data = new char[key_size_];
2937 const char* const_data = data;
2938 key_guard->reset(const_data);
2939 return Slice(key_guard->get(), key_size_);
2940 }
2941
2942 // Generate key according to the given specification and random number.
2943 // The resulting key will have the following format:
2944 // - If keys_per_prefix_ is positive, extra trailing bytes are either cut
2945 // off or padded with '0'.
2946 // The prefix value is derived from key value.
2947 // ----------------------------
2948 // | prefix 00000 | key 00000 |
2949 // ----------------------------
2950 //
2951 // - If keys_per_prefix_ is 0, the key is simply a binary representation of
2952 // random number followed by trailing '0's
2953 // ----------------------------
2954 // | key 00000 |
2955 // ----------------------------
GenerateKeyFromInt(uint64_t v,int64_t num_keys,Slice * key)2956 void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
2957 if (!keys_.empty()) {
2958 assert(FLAGS_use_existing_keys);
2959 assert(keys_.size() == static_cast<size_t>(num_keys));
2960 assert(v < static_cast<uint64_t>(num_keys));
2961 *key = keys_[v];
2962 return;
2963 }
2964 char* start = const_cast<char*>(key->data());
2965 char* pos = start;
2966 if (keys_per_prefix_ > 0) {
2967 int64_t num_prefix = num_keys / keys_per_prefix_;
2968 int64_t prefix = v % num_prefix;
2969 int bytes_to_fill = std::min(prefix_size_, 8);
2970 if (port::kLittleEndian) {
2971 for (int i = 0; i < bytes_to_fill; ++i) {
2972 pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
2973 }
2974 } else {
2975 memcpy(pos, static_cast<void*>(&prefix), bytes_to_fill);
2976 }
2977 if (prefix_size_ > 8) {
2978 // fill the rest with 0s
2979 memset(pos + 8, '0', prefix_size_ - 8);
2980 }
2981 pos += prefix_size_;
2982 }
2983
2984 int bytes_to_fill = std::min(key_size_ - static_cast<int>(pos - start), 8);
2985 if (port::kLittleEndian) {
2986 for (int i = 0; i < bytes_to_fill; ++i) {
2987 pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
2988 }
2989 } else {
2990 memcpy(pos, static_cast<void*>(&v), bytes_to_fill);
2991 }
2992 pos += bytes_to_fill;
2993 if (key_size_ > pos - start) {
2994 memset(pos, '0', key_size_ - (pos - start));
2995 }
2996 }
2997
GenerateKeyFromIntForSeek(uint64_t v,int64_t num_keys,Slice * key)2998 void GenerateKeyFromIntForSeek(uint64_t v, int64_t num_keys, Slice* key) {
2999 GenerateKeyFromInt(v, num_keys, key);
3000 if (FLAGS_seek_missing_prefix) {
3001 assert(prefix_size_ > 8);
3002 char* key_ptr = const_cast<char*>(key->data());
3003 // This rely on GenerateKeyFromInt filling paddings with '0's.
3004 // Putting a '1' will create a non-existing prefix.
3005 key_ptr[8] = '1';
3006 }
3007 }
3008
GetPathForMultiple(std::string base_name,size_t id)3009 std::string GetPathForMultiple(std::string base_name, size_t id) {
3010 if (!base_name.empty()) {
3011 #ifndef OS_WIN
3012 if (base_name.back() != '/') {
3013 base_name += '/';
3014 }
3015 #else
3016 if (base_name.back() != '\\') {
3017 base_name += '\\';
3018 }
3019 #endif
3020 }
3021 return base_name + ToString(id);
3022 }
3023
VerifyDBFromDB(std::string & truth_db_name)3024 void VerifyDBFromDB(std::string& truth_db_name) {
3025 DBWithColumnFamilies truth_db;
3026 auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db);
3027 if (!s.ok()) {
3028 fprintf(stderr, "open error: %s\n", s.ToString().c_str());
3029 exit(1);
3030 }
3031 ReadOptions ro;
3032 ro.total_order_seek = true;
3033 std::unique_ptr<Iterator> truth_iter(truth_db.db->NewIterator(ro));
3034 std::unique_ptr<Iterator> db_iter(db_.db->NewIterator(ro));
3035 // Verify that all the key/values in truth_db are retrivable in db with
3036 // ::Get
3037 fprintf(stderr, "Verifying db >= truth_db with ::Get...\n");
3038 for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) {
3039 std::string value;
3040 s = db_.db->Get(ro, truth_iter->key(), &value);
3041 assert(s.ok());
3042 // TODO(myabandeh): provide debugging hints
3043 assert(Slice(value) == truth_iter->value());
3044 }
3045 // Verify that the db iterator does not give any extra key/value
3046 fprintf(stderr, "Verifying db == truth_db...\n");
3047 for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid();
3048 db_iter->Next(), truth_iter->Next()) {
3049 assert(truth_iter->Valid());
3050 assert(truth_iter->value() == db_iter->value());
3051 }
3052 // No more key should be left unchecked in truth_db
3053 assert(!truth_iter->Valid());
3054 fprintf(stderr, "...Verified\n");
3055 }
3056
ErrorExit()3057 void ErrorExit() {
3058 db_.DeleteDBs();
3059 for (size_t i = 0; i < multi_dbs_.size(); i++) {
3060 delete multi_dbs_[i].db;
3061 }
3062 exit(1);
3063 }
3064
Run()3065 void Run() {
3066 if (!SanityCheck()) {
3067 ErrorExit();
3068 }
3069 Open(&open_options_);
3070 PrintHeader();
3071 std::stringstream benchmark_stream(FLAGS_benchmarks);
3072 std::string name;
3073 std::unique_ptr<ExpiredTimeFilter> filter;
3074 while (std::getline(benchmark_stream, name, ',')) {
3075 // Sanitize parameters
3076 num_ = FLAGS_num;
3077 reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
3078 writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes);
3079 deletes_ = (FLAGS_deletes < 0 ? FLAGS_num : FLAGS_deletes);
3080 value_size = FLAGS_value_size;
3081 key_size_ = FLAGS_key_size;
3082 entries_per_batch_ = FLAGS_batch_size;
3083 writes_before_delete_range_ = FLAGS_writes_before_delete_range;
3084 writes_per_range_tombstone_ = FLAGS_writes_per_range_tombstone;
3085 range_tombstone_width_ = FLAGS_range_tombstone_width;
3086 max_num_range_tombstones_ = FLAGS_max_num_range_tombstones;
3087 write_options_ = WriteOptions();
3088 read_random_exp_range_ = FLAGS_read_random_exp_range;
3089 if (FLAGS_sync) {
3090 write_options_.sync = true;
3091 }
3092 write_options_.disableWAL = FLAGS_disable_wal;
3093
3094 void (Benchmark::*method)(ThreadState*) = nullptr;
3095 void (Benchmark::*post_process_method)() = nullptr;
3096
3097 bool fresh_db = false;
3098 int num_threads = FLAGS_threads;
3099
3100 int num_repeat = 1;
3101 int num_warmup = 0;
3102 if (!name.empty() && *name.rbegin() == ']') {
3103 auto it = name.find('[');
3104 if (it == std::string::npos) {
3105 fprintf(stderr, "unknown benchmark arguments '%s'\n", name.c_str());
3106 ErrorExit();
3107 }
3108 std::string args = name.substr(it + 1);
3109 args.resize(args.size() - 1);
3110 name.resize(it);
3111
3112 std::string bench_arg;
3113 std::stringstream args_stream(args);
3114 while (std::getline(args_stream, bench_arg, '-')) {
3115 if (bench_arg.empty()) {
3116 continue;
3117 }
3118 if (bench_arg[0] == 'X') {
3119 // Repeat the benchmark n times
3120 std::string num_str = bench_arg.substr(1);
3121 num_repeat = std::stoi(num_str);
3122 } else if (bench_arg[0] == 'W') {
3123 // Warm up the benchmark for n times
3124 std::string num_str = bench_arg.substr(1);
3125 num_warmup = std::stoi(num_str);
3126 }
3127 }
3128 }
3129
3130 // Both fillseqdeterministic and filluniquerandomdeterministic
3131 // fill the levels except the max level with UNIQUE_RANDOM
3132 // and fill the max level with fillseq and filluniquerandom, respectively
3133 if (name == "fillseqdeterministic" ||
3134 name == "filluniquerandomdeterministic") {
3135 if (!FLAGS_disable_auto_compactions) {
3136 fprintf(stderr,
3137 "Please disable_auto_compactions in FillDeterministic "
3138 "benchmark\n");
3139 ErrorExit();
3140 }
3141 if (num_threads > 1) {
3142 fprintf(stderr,
3143 "filldeterministic multithreaded not supported"
3144 ", use 1 thread\n");
3145 num_threads = 1;
3146 }
3147 fresh_db = true;
3148 if (name == "fillseqdeterministic") {
3149 method = &Benchmark::WriteSeqDeterministic;
3150 } else {
3151 method = &Benchmark::WriteUniqueRandomDeterministic;
3152 }
3153 } else if (name == "fillseq") {
3154 fresh_db = true;
3155 method = &Benchmark::WriteSeq;
3156 } else if (name == "fillbatch") {
3157 fresh_db = true;
3158 entries_per_batch_ = 1000;
3159 method = &Benchmark::WriteSeq;
3160 } else if (name == "fillrandom") {
3161 fresh_db = true;
3162 method = &Benchmark::WriteRandom;
3163 } else if (name == "filluniquerandom") {
3164 fresh_db = true;
3165 if (num_threads > 1) {
3166 fprintf(stderr,
3167 "filluniquerandom multithreaded not supported"
3168 ", use 1 thread");
3169 num_threads = 1;
3170 }
3171 method = &Benchmark::WriteUniqueRandom;
3172 } else if (name == "overwrite") {
3173 method = &Benchmark::WriteRandom;
3174 } else if (name == "fillsync") {
3175 fresh_db = true;
3176 num_ /= 1000;
3177 write_options_.sync = true;
3178 method = &Benchmark::WriteRandom;
3179 } else if (name == "fill100K") {
3180 fresh_db = true;
3181 num_ /= 1000;
3182 value_size = 100 * 1000;
3183 method = &Benchmark::WriteRandom;
3184 } else if (name == "readseq") {
3185 method = &Benchmark::ReadSequential;
3186 } else if (name == "readtorowcache") {
3187 if (!FLAGS_use_existing_keys || !FLAGS_row_cache_size) {
3188 fprintf(stderr,
3189 "Please set use_existing_keys to true and specify a "
3190 "row cache size in readtorowcache benchmark\n");
3191 ErrorExit();
3192 }
3193 method = &Benchmark::ReadToRowCache;
3194 } else if (name == "readtocache") {
3195 method = &Benchmark::ReadSequential;
3196 num_threads = 1;
3197 reads_ = num_;
3198 } else if (name == "readreverse") {
3199 method = &Benchmark::ReadReverse;
3200 } else if (name == "readrandom") {
3201 if (FLAGS_multiread_stride) {
3202 fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3203 entries_per_batch_);
3204 }
3205 method = &Benchmark::ReadRandom;
3206 } else if (name == "readrandomfast") {
3207 method = &Benchmark::ReadRandomFast;
3208 } else if (name == "multireadrandom") {
3209 fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3210 entries_per_batch_);
3211 method = &Benchmark::MultiReadRandom;
3212 } else if (name == "approximatesizerandom") {
3213 fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
3214 entries_per_batch_);
3215 method = &Benchmark::ApproximateSizeRandom;
3216 } else if (name == "mixgraph") {
3217 method = &Benchmark::MixGraph;
3218 } else if (name == "readmissing") {
3219 ++key_size_;
3220 method = &Benchmark::ReadRandom;
3221 } else if (name == "newiterator") {
3222 method = &Benchmark::IteratorCreation;
3223 } else if (name == "newiteratorwhilewriting") {
3224 num_threads++; // Add extra thread for writing
3225 method = &Benchmark::IteratorCreationWhileWriting;
3226 } else if (name == "seekrandom") {
3227 method = &Benchmark::SeekRandom;
3228 } else if (name == "seekrandomwhilewriting") {
3229 num_threads++; // Add extra thread for writing
3230 method = &Benchmark::SeekRandomWhileWriting;
3231 } else if (name == "seekrandomwhilemerging") {
3232 num_threads++; // Add extra thread for merging
3233 method = &Benchmark::SeekRandomWhileMerging;
3234 } else if (name == "readrandomsmall") {
3235 reads_ /= 1000;
3236 method = &Benchmark::ReadRandom;
3237 } else if (name == "deleteseq") {
3238 method = &Benchmark::DeleteSeq;
3239 } else if (name == "deleterandom") {
3240 method = &Benchmark::DeleteRandom;
3241 } else if (name == "readwhilewriting") {
3242 num_threads++; // Add extra thread for writing
3243 method = &Benchmark::ReadWhileWriting;
3244 } else if (name == "readwhilemerging") {
3245 num_threads++; // Add extra thread for writing
3246 method = &Benchmark::ReadWhileMerging;
3247 } else if (name == "readwhilescanning") {
3248 num_threads++; // Add extra thread for scaning
3249 method = &Benchmark::ReadWhileScanning;
3250 } else if (name == "readrandomwriterandom") {
3251 method = &Benchmark::ReadRandomWriteRandom;
3252 } else if (name == "readrandommergerandom") {
3253 if (FLAGS_merge_operator.empty()) {
3254 fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
3255 name.c_str());
3256 ErrorExit();
3257 }
3258 method = &Benchmark::ReadRandomMergeRandom;
3259 } else if (name == "updaterandom") {
3260 method = &Benchmark::UpdateRandom;
3261 } else if (name == "xorupdaterandom") {
3262 method = &Benchmark::XORUpdateRandom;
3263 } else if (name == "appendrandom") {
3264 method = &Benchmark::AppendRandom;
3265 } else if (name == "mergerandom") {
3266 if (FLAGS_merge_operator.empty()) {
3267 fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
3268 name.c_str());
3269 exit(1);
3270 }
3271 method = &Benchmark::MergeRandom;
3272 } else if (name == "randomwithverify") {
3273 method = &Benchmark::RandomWithVerify;
3274 } else if (name == "fillseekseq") {
3275 method = &Benchmark::WriteSeqSeekSeq;
3276 } else if (name == "compact") {
3277 method = &Benchmark::Compact;
3278 } else if (name == "compactall") {
3279 CompactAll();
3280 #ifndef ROCKSDB_LITE
3281 } else if (name == "compact0") {
3282 CompactLevel(0);
3283 } else if (name == "compact1") {
3284 CompactLevel(1);
3285 } else if (name == "waitforcompaction") {
3286 WaitForCompaction();
3287 #endif
3288 } else if (name == "flush") {
3289 Flush();
3290 } else if (name == "crc32c") {
3291 method = &Benchmark::Crc32c;
3292 } else if (name == "xxhash") {
3293 method = &Benchmark::xxHash;
3294 } else if (name == "acquireload") {
3295 method = &Benchmark::AcquireLoad;
3296 } else if (name == "compress") {
3297 method = &Benchmark::Compress;
3298 } else if (name == "uncompress") {
3299 method = &Benchmark::Uncompress;
3300 #ifndef ROCKSDB_LITE
3301 } else if (name == "randomtransaction") {
3302 method = &Benchmark::RandomTransaction;
3303 post_process_method = &Benchmark::RandomTransactionVerify;
3304 #endif // ROCKSDB_LITE
3305 } else if (name == "randomreplacekeys") {
3306 fresh_db = true;
3307 method = &Benchmark::RandomReplaceKeys;
3308 } else if (name == "timeseries") {
3309 timestamp_emulator_.reset(new TimestampEmulator());
3310 if (FLAGS_expire_style == "compaction_filter") {
3311 filter.reset(new ExpiredTimeFilter(timestamp_emulator_));
3312 fprintf(stdout, "Compaction filter is used to remove expired data");
3313 open_options_.compaction_filter = filter.get();
3314 }
3315 fresh_db = true;
3316 method = &Benchmark::TimeSeries;
3317 } else if (name == "stats") {
3318 PrintStats("rocksdb.stats");
3319 } else if (name == "resetstats") {
3320 ResetStats();
3321 } else if (name == "verify") {
3322 VerifyDBFromDB(FLAGS_truth_db);
3323 } else if (name == "levelstats") {
3324 PrintStats("rocksdb.levelstats");
3325 } else if (name == "memstats") {
3326 std::vector<std::string> keys{"rocksdb.num-immutable-mem-table",
3327 "rocksdb.cur-size-active-mem-table",
3328 "rocksdb.cur-size-all-mem-tables",
3329 "rocksdb.size-all-mem-tables",
3330 "rocksdb.num-entries-active-mem-table",
3331 "rocksdb.num-entries-imm-mem-tables"};
3332 PrintStats(keys);
3333 } else if (name == "sstables") {
3334 PrintStats("rocksdb.sstables");
3335 } else if (name == "stats_history") {
3336 PrintStatsHistory();
3337 } else if (name == "replay") {
3338 if (num_threads > 1) {
3339 fprintf(stderr, "Multi-threaded replay is not yet supported\n");
3340 ErrorExit();
3341 }
3342 if (FLAGS_trace_file == "") {
3343 fprintf(stderr, "Please set --trace_file to be replayed from\n");
3344 ErrorExit();
3345 }
3346 method = &Benchmark::Replay;
3347 } else if (name == "getmergeoperands") {
3348 method = &Benchmark::GetMergeOperands;
3349 } else if (!name.empty()) { // No error message for empty name
3350 fprintf(stderr, "unknown benchmark '%s'\n", name.c_str());
3351 ErrorExit();
3352 }
3353
3354 if (fresh_db) {
3355 if (FLAGS_use_existing_db) {
3356 fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
3357 name.c_str());
3358 method = nullptr;
3359 } else {
3360 if (db_.db != nullptr) {
3361 db_.DeleteDBs();
3362 DestroyDB(FLAGS_db, open_options_);
3363 }
3364 Options options = open_options_;
3365 for (size_t i = 0; i < multi_dbs_.size(); i++) {
3366 delete multi_dbs_[i].db;
3367 if (!open_options_.wal_dir.empty()) {
3368 options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i);
3369 }
3370 DestroyDB(GetPathForMultiple(FLAGS_db, i), options);
3371 }
3372 multi_dbs_.clear();
3373 }
3374 Open(&open_options_); // use open_options for the last accessed
3375 }
3376
3377 if (method != nullptr) {
3378 fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
3379
3380 #ifndef ROCKSDB_LITE
3381 // A trace_file option can be provided both for trace and replay
3382 // operations. But db_bench does not support tracing and replaying at
3383 // the same time, for now. So, start tracing only when it is not a
3384 // replay.
3385 if (FLAGS_trace_file != "" && name != "replay") {
3386 std::unique_ptr<TraceWriter> trace_writer;
3387 Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
3388 FLAGS_trace_file, &trace_writer);
3389 if (!s.ok()) {
3390 fprintf(stderr, "Encountered an error starting a trace, %s\n",
3391 s.ToString().c_str());
3392 ErrorExit();
3393 }
3394 s = db_.db->StartTrace(trace_options_, std::move(trace_writer));
3395 if (!s.ok()) {
3396 fprintf(stderr, "Encountered an error starting a trace, %s\n",
3397 s.ToString().c_str());
3398 ErrorExit();
3399 }
3400 fprintf(stdout, "Tracing the workload to: [%s]\n",
3401 FLAGS_trace_file.c_str());
3402 }
3403 // Start block cache tracing.
3404 if (!FLAGS_block_cache_trace_file.empty()) {
3405 // Sanity checks.
3406 if (FLAGS_block_cache_trace_sampling_frequency <= 0) {
3407 fprintf(stderr,
3408 "Block cache trace sampling frequency must be higher than "
3409 "0.\n");
3410 ErrorExit();
3411 }
3412 if (FLAGS_block_cache_trace_max_trace_file_size_in_bytes <= 0) {
3413 fprintf(stderr,
3414 "The maximum file size for block cache tracing must be "
3415 "higher than 0.\n");
3416 ErrorExit();
3417 }
3418 block_cache_trace_options_.max_trace_file_size =
3419 FLAGS_block_cache_trace_max_trace_file_size_in_bytes;
3420 block_cache_trace_options_.sampling_frequency =
3421 FLAGS_block_cache_trace_sampling_frequency;
3422 std::unique_ptr<TraceWriter> block_cache_trace_writer;
3423 Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
3424 FLAGS_block_cache_trace_file,
3425 &block_cache_trace_writer);
3426 if (!s.ok()) {
3427 fprintf(stderr,
3428 "Encountered an error when creating trace writer, %s\n",
3429 s.ToString().c_str());
3430 ErrorExit();
3431 }
3432 s = db_.db->StartBlockCacheTrace(block_cache_trace_options_,
3433 std::move(block_cache_trace_writer));
3434 if (!s.ok()) {
3435 fprintf(
3436 stderr,
3437 "Encountered an error when starting block cache tracing, %s\n",
3438 s.ToString().c_str());
3439 ErrorExit();
3440 }
3441 fprintf(stdout, "Tracing block cache accesses to: [%s]\n",
3442 FLAGS_block_cache_trace_file.c_str());
3443 }
3444 #endif // ROCKSDB_LITE
3445
3446 if (num_warmup > 0) {
3447 printf("Warming up benchmark by running %d times\n", num_warmup);
3448 }
3449
3450 for (int i = 0; i < num_warmup; i++) {
3451 RunBenchmark(num_threads, name, method);
3452 }
3453
3454 if (num_repeat > 1) {
3455 printf("Running benchmark for %d times\n", num_repeat);
3456 }
3457
3458 CombinedStats combined_stats;
3459 for (int i = 0; i < num_repeat; i++) {
3460 Stats stats = RunBenchmark(num_threads, name, method);
3461 combined_stats.AddStats(stats);
3462 }
3463 if (num_repeat > 1) {
3464 combined_stats.Report(name);
3465 }
3466 }
3467 if (post_process_method != nullptr) {
3468 (this->*post_process_method)();
3469 }
3470 }
3471
3472 if (secondary_update_thread_) {
3473 secondary_update_stopped_.store(1, std::memory_order_relaxed);
3474 secondary_update_thread_->join();
3475 secondary_update_thread_.reset();
3476 }
3477
3478 #ifndef ROCKSDB_LITE
3479 if (name != "replay" && FLAGS_trace_file != "") {
3480 Status s = db_.db->EndTrace();
3481 if (!s.ok()) {
3482 fprintf(stderr, "Encountered an error ending the trace, %s\n",
3483 s.ToString().c_str());
3484 }
3485 }
3486 if (!FLAGS_block_cache_trace_file.empty()) {
3487 Status s = db_.db->EndBlockCacheTrace();
3488 if (!s.ok()) {
3489 fprintf(stderr,
3490 "Encountered an error ending the block cache tracing, %s\n",
3491 s.ToString().c_str());
3492 }
3493 }
3494 #endif // ROCKSDB_LITE
3495
3496 if (FLAGS_statistics) {
3497 fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
3498 }
3499 if (FLAGS_simcache_size >= 0) {
3500 fprintf(
3501 stdout, "SIMULATOR CACHE STATISTICS:\n%s\n",
3502 static_cast_with_check<SimCache>(cache_.get())->ToString().c_str());
3503 }
3504
3505 #ifndef ROCKSDB_LITE
3506 if (FLAGS_use_secondary_db) {
3507 fprintf(stdout, "Secondary instance updated %" PRIu64 " times.\n",
3508 secondary_db_updates_);
3509 }
3510 #endif // ROCKSDB_LITE
3511 }
3512
3513 private:
// Shared timestamp source. NOTE(review): its users are outside this part of
// the file -- confirm how and when it is populated.
std::shared_ptr<TimestampEmulator> timestamp_emulator_;
// Optional helper thread. When non-null at the end of the run it is asked to
// stop through secondary_update_stopped_, then joined and reset.
std::unique_ptr<port::Thread> secondary_update_thread_;
// Stop flag for secondary_update_thread_; stored as 1 right before the join.
std::atomic<int> secondary_update_stopped_{0};
#ifndef ROCKSDB_LITE
// Counter printed as "Secondary instance updated N times." when
// --use_secondary_db is set.
uint64_t secondary_db_updates_ = 0;
#endif  // ROCKSDB_LITE
// Per-thread argument bundle. RunBenchmark fills one instance per worker and
// passes its address (as void*) to ThreadBody via Env::StartThread.
struct ThreadArg {
  // Benchmark object on which `method` is invoked.
  Benchmark* bm;
  // State shared by all workers of one run (mutex, condvar, counters).
  SharedState* shared;
  // State private to this worker, including its Stats.
  ThreadState* thread;
  // Benchmark routine executed by the worker.
  void (Benchmark::*method)(ThreadState*);
};
3526
ThreadBody(void * v)3527 static void ThreadBody(void* v) {
3528 ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
3529 SharedState* shared = arg->shared;
3530 ThreadState* thread = arg->thread;
3531 {
3532 MutexLock l(&shared->mu);
3533 shared->num_initialized++;
3534 if (shared->num_initialized >= shared->total) {
3535 shared->cv.SignalAll();
3536 }
3537 while (!shared->start) {
3538 shared->cv.Wait();
3539 }
3540 }
3541
3542 SetPerfLevel(static_cast<PerfLevel> (shared->perf_level));
3543 perf_context.EnablePerLevelPerfContext();
3544 thread->stats.Start(thread->tid);
3545 (arg->bm->*(arg->method))(thread);
3546 thread->stats.Stop();
3547
3548 {
3549 MutexLock l(&shared->mu);
3550 shared->num_done++;
3551 if (shared->num_done >= shared->total) {
3552 shared->cv.SignalAll();
3553 }
3554 }
3555 }
3556
RunBenchmark(int n,Slice name,void (Benchmark::* method)(ThreadState *))3557 Stats RunBenchmark(int n, Slice name,
3558 void (Benchmark::*method)(ThreadState*)) {
3559 SharedState shared;
3560 shared.total = n;
3561 shared.num_initialized = 0;
3562 shared.num_done = 0;
3563 shared.start = false;
3564 if (FLAGS_benchmark_write_rate_limit > 0) {
3565 shared.write_rate_limiter.reset(
3566 NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
3567 }
3568 if (FLAGS_benchmark_read_rate_limit > 0) {
3569 shared.read_rate_limiter.reset(NewGenericRateLimiter(
3570 FLAGS_benchmark_read_rate_limit, 100000 /* refill_period_us */,
3571 10 /* fairness */, RateLimiter::Mode::kReadsOnly));
3572 }
3573
3574 std::unique_ptr<ReporterAgent> reporter_agent;
3575 if (FLAGS_report_interval_seconds > 0) {
3576 reporter_agent.reset(new ReporterAgent(FLAGS_env, FLAGS_report_file,
3577 FLAGS_report_interval_seconds));
3578 }
3579
3580 ThreadArg* arg = new ThreadArg[n];
3581
3582 for (int i = 0; i < n; i++) {
3583 #ifdef NUMA
3584 if (FLAGS_enable_numa) {
3585 // Performs a local allocation of memory to threads in numa node.
3586 int n_nodes = numa_num_task_nodes(); // Number of nodes in NUMA.
3587 numa_exit_on_error = 1;
3588 int numa_node = i % n_nodes;
3589 bitmask* nodes = numa_allocate_nodemask();
3590 numa_bitmask_clearall(nodes);
3591 numa_bitmask_setbit(nodes, numa_node);
3592 // numa_bind() call binds the process to the node and these
3593 // properties are passed on to the thread that is created in
3594 // StartThread method called later in the loop.
3595 numa_bind(nodes);
3596 numa_set_strict(1);
3597 numa_free_nodemask(nodes);
3598 }
3599 #endif
3600 arg[i].bm = this;
3601 arg[i].method = method;
3602 arg[i].shared = &shared;
3603 arg[i].thread = new ThreadState(i);
3604 arg[i].thread->stats.SetReporterAgent(reporter_agent.get());
3605 arg[i].thread->shared = &shared;
3606 FLAGS_env->StartThread(ThreadBody, &arg[i]);
3607 }
3608
3609 shared.mu.Lock();
3610 while (shared.num_initialized < n) {
3611 shared.cv.Wait();
3612 }
3613
3614 shared.start = true;
3615 shared.cv.SignalAll();
3616 while (shared.num_done < n) {
3617 shared.cv.Wait();
3618 }
3619 shared.mu.Unlock();
3620
3621 // Stats for some threads can be excluded.
3622 Stats merge_stats;
3623 for (int i = 0; i < n; i++) {
3624 merge_stats.Merge(arg[i].thread->stats);
3625 }
3626 merge_stats.Report(name);
3627
3628 for (int i = 0; i < n; i++) {
3629 delete arg[i].thread;
3630 }
3631 delete[] arg;
3632
3633 return merge_stats;
3634 }
3635
Crc32c(ThreadState * thread)3636 void Crc32c(ThreadState* thread) {
3637 // Checksum about 500MB of data total
3638 const int size = FLAGS_block_size; // use --block_size option for db_bench
3639 std::string labels = "(" + ToString(FLAGS_block_size) + " per op)";
3640 const char* label = labels.c_str();
3641
3642 std::string data(size, 'x');
3643 int64_t bytes = 0;
3644 uint32_t crc = 0;
3645 while (bytes < 500 * 1048576) {
3646 crc = crc32c::Value(data.data(), size);
3647 thread->stats.FinishedOps(nullptr, nullptr, 1, kCrc);
3648 bytes += size;
3649 }
3650 // Print so result is not dead
3651 fprintf(stderr, "... crc=0x%x\r", static_cast<unsigned int>(crc));
3652
3653 thread->stats.AddBytes(bytes);
3654 thread->stats.AddMessage(label);
3655 }
3656
xxHash(ThreadState * thread)3657 void xxHash(ThreadState* thread) {
3658 // Checksum about 500MB of data total
3659 const int size = 4096;
3660 const char* label = "(4K per op)";
3661 std::string data(size, 'x');
3662 int64_t bytes = 0;
3663 unsigned int xxh32 = 0;
3664 while (bytes < 500 * 1048576) {
3665 xxh32 = XXH32(data.data(), size, 0);
3666 thread->stats.FinishedOps(nullptr, nullptr, 1, kHash);
3667 bytes += size;
3668 }
3669 // Print so result is not dead
3670 fprintf(stderr, "... xxh32=0x%x\r", static_cast<unsigned int>(xxh32));
3671
3672 thread->stats.AddBytes(bytes);
3673 thread->stats.AddMessage(label);
3674 }
3675
AcquireLoad(ThreadState * thread)3676 void AcquireLoad(ThreadState* thread) {
3677 int dummy;
3678 std::atomic<void*> ap(&dummy);
3679 int count = 0;
3680 void *ptr = nullptr;
3681 thread->stats.AddMessage("(each op is 1000 loads)");
3682 while (count < 100000) {
3683 for (int i = 0; i < 1000; i++) {
3684 ptr = ap.load(std::memory_order_acquire);
3685 }
3686 count++;
3687 thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers);
3688 }
3689 if (ptr == nullptr) exit(1); // Disable unused variable warning.
3690 }
3691
Compress(ThreadState * thread)3692 void Compress(ThreadState *thread) {
3693 RandomGenerator gen;
3694 Slice input = gen.Generate(FLAGS_block_size);
3695 int64_t bytes = 0;
3696 int64_t produced = 0;
3697 bool ok = true;
3698 std::string compressed;
3699 CompressionOptions opts;
3700 CompressionContext context(FLAGS_compression_type_e);
3701 CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
3702 FLAGS_compression_type_e,
3703 FLAGS_sample_for_compression);
3704 // Compress 1G
3705 while (ok && bytes < int64_t(1) << 30) {
3706 compressed.clear();
3707 ok = CompressSlice(info, input, &compressed);
3708 produced += compressed.size();
3709 bytes += input.size();
3710 thread->stats.FinishedOps(nullptr, nullptr, 1, kCompress);
3711 }
3712
3713 if (!ok) {
3714 thread->stats.AddMessage("(compression failure)");
3715 } else {
3716 char buf[340];
3717 snprintf(buf, sizeof(buf), "(output: %.1f%%)",
3718 (produced * 100.0) / bytes);
3719 thread->stats.AddMessage(buf);
3720 thread->stats.AddBytes(bytes);
3721 }
3722 }
3723
Uncompress(ThreadState * thread)3724 void Uncompress(ThreadState *thread) {
3725 RandomGenerator gen;
3726 Slice input = gen.Generate(FLAGS_block_size);
3727 std::string compressed;
3728
3729 CompressionContext compression_ctx(FLAGS_compression_type_e);
3730 CompressionOptions compression_opts;
3731 CompressionInfo compression_info(
3732 compression_opts, compression_ctx, CompressionDict::GetEmptyDict(),
3733 FLAGS_compression_type_e, FLAGS_sample_for_compression);
3734 UncompressionContext uncompression_ctx(FLAGS_compression_type_e);
3735 UncompressionInfo uncompression_info(uncompression_ctx,
3736 UncompressionDict::GetEmptyDict(),
3737 FLAGS_compression_type_e);
3738
3739 bool ok = CompressSlice(compression_info, input, &compressed);
3740 int64_t bytes = 0;
3741 size_t uncompressed_size = 0;
3742 while (ok && bytes < 1024 * 1048576) {
3743 constexpr uint32_t compress_format_version = 2;
3744
3745 CacheAllocationPtr uncompressed = UncompressData(
3746 uncompression_info, compressed.data(), compressed.size(),
3747 &uncompressed_size, compress_format_version);
3748
3749 ok = uncompressed.get() != nullptr;
3750 bytes += input.size();
3751 thread->stats.FinishedOps(nullptr, nullptr, 1, kUncompress);
3752 }
3753
3754 if (!ok) {
3755 thread->stats.AddMessage("(compression failure)");
3756 } else {
3757 thread->stats.AddBytes(bytes);
3758 }
3759 }
3760
3761 // Returns true if the options is initialized from the specified
3762 // options file.
InitializeOptionsFromFile(Options * opts)3763 bool InitializeOptionsFromFile(Options* opts) {
3764 #ifndef ROCKSDB_LITE
3765 printf("Initializing RocksDB Options from the specified file\n");
3766 DBOptions db_opts;
3767 std::vector<ColumnFamilyDescriptor> cf_descs;
3768 if (FLAGS_options_file != "") {
3769 auto s = LoadOptionsFromFile(FLAGS_options_file, FLAGS_env, &db_opts,
3770 &cf_descs);
3771 db_opts.env = FLAGS_env;
3772 if (s.ok()) {
3773 *opts = Options(db_opts, cf_descs[0].options);
3774 return true;
3775 }
3776 fprintf(stderr, "Unable to load options file %s --- %s\n",
3777 FLAGS_options_file.c_str(), s.ToString().c_str());
3778 exit(1);
3779 }
3780 #else
3781 (void)opts;
3782 #endif
3783 return false;
3784 }
3785
InitializeOptionsFromFlags(Options * opts)3786 void InitializeOptionsFromFlags(Options* opts) {
3787 printf("Initializing RocksDB Options from command-line flags\n");
3788 Options& options = *opts;
3789
3790 assert(db_.db == nullptr);
3791
3792 options.env = FLAGS_env;
3793 options.max_open_files = FLAGS_open_files;
3794 if (FLAGS_cost_write_buffer_to_cache || FLAGS_db_write_buffer_size != 0) {
3795 options.write_buffer_manager.reset(
3796 new WriteBufferManager(FLAGS_db_write_buffer_size, cache_));
3797 }
3798 options.arena_block_size = FLAGS_arena_block_size;
3799 options.write_buffer_size = FLAGS_write_buffer_size;
3800 options.max_write_buffer_number = FLAGS_max_write_buffer_number;
3801 options.min_write_buffer_number_to_merge =
3802 FLAGS_min_write_buffer_number_to_merge;
3803 options.max_write_buffer_number_to_maintain =
3804 FLAGS_max_write_buffer_number_to_maintain;
3805 options.max_write_buffer_size_to_maintain =
3806 FLAGS_max_write_buffer_size_to_maintain;
3807 options.max_background_jobs = FLAGS_max_background_jobs;
3808 options.max_background_compactions = FLAGS_max_background_compactions;
3809 options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
3810 options.max_background_flushes = FLAGS_max_background_flushes;
3811 options.compaction_style = FLAGS_compaction_style_e;
3812 options.compaction_pri = FLAGS_compaction_pri_e;
3813 options.allow_mmap_reads = FLAGS_mmap_read;
3814 options.allow_mmap_writes = FLAGS_mmap_write;
3815 options.use_direct_reads = FLAGS_use_direct_reads;
3816 options.use_direct_io_for_flush_and_compaction =
3817 FLAGS_use_direct_io_for_flush_and_compaction;
3818 #ifndef ROCKSDB_LITE
3819 options.ttl = FLAGS_fifo_compaction_ttl;
3820 options.compaction_options_fifo = CompactionOptionsFIFO(
3821 FLAGS_fifo_compaction_max_table_files_size_mb * 1024 * 1024,
3822 FLAGS_fifo_compaction_allow_compaction);
3823 #endif // ROCKSDB_LITE
3824 if (FLAGS_prefix_size != 0) {
3825 options.prefix_extractor.reset(
3826 NewFixedPrefixTransform(FLAGS_prefix_size));
3827 }
3828 if (FLAGS_use_uint64_comparator) {
3829 options.comparator = test::Uint64Comparator();
3830 if (FLAGS_key_size != 8) {
3831 fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n");
3832 exit(1);
3833 }
3834 }
3835 if (FLAGS_use_stderr_info_logger) {
3836 options.info_log.reset(new StderrLogger());
3837 }
3838 options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 2048 : 0;
3839 options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio;
3840 options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
3841 if (FLAGS_memtable_insert_with_hint_prefix_size > 0) {
3842 options.memtable_insert_with_hint_prefix_extractor.reset(
3843 NewCappedPrefixTransform(
3844 FLAGS_memtable_insert_with_hint_prefix_size));
3845 }
3846 options.bloom_locality = FLAGS_bloom_locality;
3847 options.max_file_opening_threads = FLAGS_file_opening_threads;
3848 options.new_table_reader_for_compaction_inputs =
3849 FLAGS_new_table_reader_for_compaction_inputs;
3850 options.compaction_readahead_size = FLAGS_compaction_readahead_size;
3851 options.log_readahead_size = FLAGS_log_readahead_size;
3852 options.random_access_max_buffer_size = FLAGS_random_access_max_buffer_size;
3853 options.writable_file_max_buffer_size = FLAGS_writable_file_max_buffer_size;
3854 options.use_fsync = FLAGS_use_fsync;
3855 options.num_levels = FLAGS_num_levels;
3856 options.target_file_size_base = FLAGS_target_file_size_base;
3857 options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
3858 options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
3859 options.level_compaction_dynamic_level_bytes =
3860 FLAGS_level_compaction_dynamic_level_bytes;
3861 options.max_bytes_for_level_multiplier =
3862 FLAGS_max_bytes_for_level_multiplier;
3863 if ((FLAGS_prefix_size == 0) && (FLAGS_rep_factory == kPrefixHash ||
3864 FLAGS_rep_factory == kHashLinkedList)) {
3865 fprintf(stderr, "prefix_size should be non-zero if PrefixHash or "
3866 "HashLinkedList memtablerep is used\n");
3867 exit(1);
3868 }
3869 switch (FLAGS_rep_factory) {
3870 case kSkipList:
3871 options.memtable_factory.reset(new SkipListFactory(
3872 FLAGS_skip_list_lookahead));
3873 break;
3874 #ifndef ROCKSDB_LITE
3875 case kPrefixHash:
3876 options.memtable_factory.reset(
3877 NewHashSkipListRepFactory(FLAGS_hash_bucket_count));
3878 break;
3879 case kHashLinkedList:
3880 options.memtable_factory.reset(NewHashLinkListRepFactory(
3881 FLAGS_hash_bucket_count));
3882 break;
3883 case kVectorRep:
3884 options.memtable_factory.reset(
3885 new VectorRepFactory
3886 );
3887 break;
3888 #else
3889 default:
3890 fprintf(stderr, "Only skip list is supported in lite mode\n");
3891 exit(1);
3892 #endif // ROCKSDB_LITE
3893 }
3894 if (FLAGS_use_plain_table) {
3895 #ifndef ROCKSDB_LITE
3896 if (FLAGS_rep_factory != kPrefixHash &&
3897 FLAGS_rep_factory != kHashLinkedList) {
3898 fprintf(stderr, "Waring: plain table is used with skipList\n");
3899 }
3900
3901 int bloom_bits_per_key = FLAGS_bloom_bits;
3902 if (bloom_bits_per_key < 0) {
3903 bloom_bits_per_key = 0;
3904 }
3905
3906 PlainTableOptions plain_table_options;
3907 plain_table_options.user_key_len = FLAGS_key_size;
3908 plain_table_options.bloom_bits_per_key = bloom_bits_per_key;
3909 plain_table_options.hash_table_ratio = 0.75;
3910 options.table_factory = std::shared_ptr<TableFactory>(
3911 NewPlainTableFactory(plain_table_options));
3912 #else
3913 fprintf(stderr, "Plain table is not supported in lite mode\n");
3914 exit(1);
3915 #endif // ROCKSDB_LITE
3916 } else if (FLAGS_use_cuckoo_table) {
3917 #ifndef ROCKSDB_LITE
3918 if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) {
3919 fprintf(stderr, "Invalid cuckoo_hash_ratio\n");
3920 exit(1);
3921 }
3922
3923 if (!FLAGS_mmap_read) {
3924 fprintf(stderr, "cuckoo table format requires mmap read to operate\n");
3925 exit(1);
3926 }
3927
3928 ROCKSDB_NAMESPACE::CuckooTableOptions table_options;
3929 table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio;
3930 table_options.identity_as_first_hash = FLAGS_identity_as_first_hash;
3931 options.table_factory = std::shared_ptr<TableFactory>(
3932 NewCuckooTableFactory(table_options));
3933 #else
3934 fprintf(stderr, "Cuckoo table is not supported in lite mode\n");
3935 exit(1);
3936 #endif // ROCKSDB_LITE
3937 } else {
3938 BlockBasedTableOptions block_based_options;
3939 if (FLAGS_use_hash_search) {
3940 if (FLAGS_prefix_size == 0) {
3941 fprintf(stderr,
3942 "prefix_size not assigned when enable use_hash_search \n");
3943 exit(1);
3944 }
3945 block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
3946 } else {
3947 block_based_options.index_type = BlockBasedTableOptions::kBinarySearch;
3948 }
3949 if (FLAGS_partition_index_and_filters || FLAGS_partition_index) {
3950 if (FLAGS_index_with_first_key) {
3951 fprintf(stderr,
3952 "--index_with_first_key is not compatible with"
3953 " partition index.");
3954 }
3955 if (FLAGS_use_hash_search) {
3956 fprintf(stderr,
3957 "use_hash_search is incompatible with "
3958 "partition index and is ignored");
3959 }
3960 block_based_options.index_type =
3961 BlockBasedTableOptions::kTwoLevelIndexSearch;
3962 block_based_options.metadata_block_size = FLAGS_metadata_block_size;
3963 if (FLAGS_partition_index_and_filters) {
3964 block_based_options.partition_filters = true;
3965 }
3966 } else if (FLAGS_index_with_first_key) {
3967 block_based_options.index_type =
3968 BlockBasedTableOptions::kBinarySearchWithFirstKey;
3969 }
3970 BlockBasedTableOptions::IndexShorteningMode index_shortening =
3971 block_based_options.index_shortening;
3972 switch (FLAGS_index_shortening_mode) {
3973 case 0:
3974 index_shortening =
3975 BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
3976 break;
3977 case 1:
3978 index_shortening =
3979 BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators;
3980 break;
3981 case 2:
3982 index_shortening = BlockBasedTableOptions::IndexShorteningMode::
3983 kShortenSeparatorsAndSuccessor;
3984 break;
3985 default:
3986 fprintf(stderr, "Unknown key shortening mode\n");
3987 }
3988 block_based_options.optimize_filters_for_memory =
3989 FLAGS_optimize_filters_for_memory;
3990 block_based_options.index_shortening = index_shortening;
3991 if (cache_ == nullptr) {
3992 block_based_options.no_block_cache = true;
3993 }
3994 block_based_options.cache_index_and_filter_blocks =
3995 FLAGS_cache_index_and_filter_blocks;
3996 block_based_options.pin_l0_filter_and_index_blocks_in_cache =
3997 FLAGS_pin_l0_filter_and_index_blocks_in_cache;
3998 block_based_options.pin_top_level_index_and_filter =
3999 FLAGS_pin_top_level_index_and_filter;
4000 if (FLAGS_cache_high_pri_pool_ratio > 1e-6) { // > 0.0 + eps
4001 block_based_options.cache_index_and_filter_blocks_with_high_priority =
4002 true;
4003 }
4004 block_based_options.block_cache = cache_;
4005 block_based_options.block_cache_compressed = compressed_cache_;
4006 block_based_options.block_size = FLAGS_block_size;
4007 block_based_options.block_restart_interval = FLAGS_block_restart_interval;
4008 block_based_options.index_block_restart_interval =
4009 FLAGS_index_block_restart_interval;
4010 block_based_options.filter_policy = filter_policy_;
4011 block_based_options.format_version =
4012 static_cast<uint32_t>(FLAGS_format_version);
4013 block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit;
4014 block_based_options.enable_index_compression =
4015 FLAGS_enable_index_compression;
4016 block_based_options.block_align = FLAGS_block_align;
4017 if (FLAGS_use_data_block_hash_index) {
4018 block_based_options.data_block_index_type =
4019 ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinaryAndHash;
4020 } else {
4021 block_based_options.data_block_index_type =
4022 ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinarySearch;
4023 }
4024 block_based_options.data_block_hash_table_util_ratio =
4025 FLAGS_data_block_hash_table_util_ratio;
4026 if (FLAGS_read_cache_path != "") {
4027 #ifndef ROCKSDB_LITE
4028 Status rc_status;
4029
4030 // Read cache need to be provided with a the Logger, we will put all
4031 // reac cache logs in the read cache path in a file named rc_LOG
4032 rc_status = FLAGS_env->CreateDirIfMissing(FLAGS_read_cache_path);
4033 std::shared_ptr<Logger> read_cache_logger;
4034 if (rc_status.ok()) {
4035 rc_status = FLAGS_env->NewLogger(FLAGS_read_cache_path + "/rc_LOG",
4036 &read_cache_logger);
4037 }
4038
4039 if (rc_status.ok()) {
4040 PersistentCacheConfig rc_cfg(FLAGS_env, FLAGS_read_cache_path,
4041 FLAGS_read_cache_size,
4042 read_cache_logger);
4043
4044 rc_cfg.enable_direct_reads = FLAGS_read_cache_direct_read;
4045 rc_cfg.enable_direct_writes = FLAGS_read_cache_direct_write;
4046 rc_cfg.writer_qdepth = 4;
4047 rc_cfg.writer_dispatch_size = 4 * 1024;
4048
4049 auto pcache = std::make_shared<BlockCacheTier>(rc_cfg);
4050 block_based_options.persistent_cache = pcache;
4051 rc_status = pcache->Open();
4052 }
4053
4054 if (!rc_status.ok()) {
4055 fprintf(stderr, "Error initializing read cache, %s\n",
4056 rc_status.ToString().c_str());
4057 exit(1);
4058 }
4059 #else
4060 fprintf(stderr, "Read cache is not supported in LITE\n");
4061 exit(1);
4062
4063 #endif
4064 }
4065 options.table_factory.reset(
4066 NewBlockBasedTableFactory(block_based_options));
4067 }
4068 if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) {
4069 if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() !=
4070 static_cast<unsigned int>(FLAGS_num_levels)) {
4071 fprintf(stderr, "Insufficient number of fanouts specified %d\n",
4072 static_cast<int>(
4073 FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
4074 exit(1);
4075 }
4076 options.max_bytes_for_level_multiplier_additional =
4077 FLAGS_max_bytes_for_level_multiplier_additional_v;
4078 }
4079 options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
4080 options.level0_file_num_compaction_trigger =
4081 FLAGS_level0_file_num_compaction_trigger;
4082 options.level0_slowdown_writes_trigger =
4083 FLAGS_level0_slowdown_writes_trigger;
4084 options.compression = FLAGS_compression_type_e;
4085 if (FLAGS_simulate_hybrid_fs_file != "") {
4086 options.bottommost_temperature = Temperature::kWarm;
4087 }
4088 options.sample_for_compression = FLAGS_sample_for_compression;
4089 options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
4090 options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
4091 options.max_total_wal_size = FLAGS_max_total_wal_size;
4092
4093 if (FLAGS_min_level_to_compress >= 0) {
4094 assert(FLAGS_min_level_to_compress <= FLAGS_num_levels);
4095 options.compression_per_level.resize(FLAGS_num_levels);
4096 for (int i = 0; i < FLAGS_min_level_to_compress; i++) {
4097 options.compression_per_level[i] = kNoCompression;
4098 }
4099 for (int i = FLAGS_min_level_to_compress;
4100 i < FLAGS_num_levels; i++) {
4101 options.compression_per_level[i] = FLAGS_compression_type_e;
4102 }
4103 }
4104 options.soft_rate_limit = FLAGS_soft_rate_limit;
4105 options.hard_rate_limit = FLAGS_hard_rate_limit;
4106 options.soft_pending_compaction_bytes_limit =
4107 FLAGS_soft_pending_compaction_bytes_limit;
4108 options.hard_pending_compaction_bytes_limit =
4109 FLAGS_hard_pending_compaction_bytes_limit;
4110 options.delayed_write_rate = FLAGS_delayed_write_rate;
4111 options.allow_concurrent_memtable_write =
4112 FLAGS_allow_concurrent_memtable_write;
4113 options.inplace_update_support = FLAGS_inplace_update_support;
4114 options.inplace_update_num_locks = FLAGS_inplace_update_num_locks;
4115 options.enable_write_thread_adaptive_yield =
4116 FLAGS_enable_write_thread_adaptive_yield;
4117 options.enable_pipelined_write = FLAGS_enable_pipelined_write;
4118 options.unordered_write = FLAGS_unordered_write;
4119 options.write_thread_max_yield_usec = FLAGS_write_thread_max_yield_usec;
4120 options.write_thread_slow_yield_usec = FLAGS_write_thread_slow_yield_usec;
4121 options.rate_limit_delay_max_milliseconds =
4122 FLAGS_rate_limit_delay_max_milliseconds;
4123 options.table_cache_numshardbits = FLAGS_table_cache_numshardbits;
4124 options.max_compaction_bytes = FLAGS_max_compaction_bytes;
4125 options.disable_auto_compactions = FLAGS_disable_auto_compactions;
4126 options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits;
4127 options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds;
4128
4129 // fill storage options
4130 options.advise_random_on_open = FLAGS_advise_random_on_open;
4131 options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
4132 options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;
4133 options.bytes_per_sync = FLAGS_bytes_per_sync;
4134 options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync;
4135
4136 // merge operator options
4137 options.merge_operator = MergeOperators::CreateFromStringId(
4138 FLAGS_merge_operator);
4139 if (options.merge_operator == nullptr && !FLAGS_merge_operator.empty()) {
4140 fprintf(stderr, "invalid merge operator: %s\n",
4141 FLAGS_merge_operator.c_str());
4142 exit(1);
4143 }
4144 options.max_successive_merges = FLAGS_max_successive_merges;
4145 options.report_bg_io_stats = FLAGS_report_bg_io_stats;
4146
4147 // set universal style compaction configurations, if applicable
4148 if (FLAGS_universal_size_ratio != 0) {
4149 options.compaction_options_universal.size_ratio =
4150 FLAGS_universal_size_ratio;
4151 }
4152 if (FLAGS_universal_min_merge_width != 0) {
4153 options.compaction_options_universal.min_merge_width =
4154 FLAGS_universal_min_merge_width;
4155 }
4156 if (FLAGS_universal_max_merge_width != 0) {
4157 options.compaction_options_universal.max_merge_width =
4158 FLAGS_universal_max_merge_width;
4159 }
4160 if (FLAGS_universal_max_size_amplification_percent != 0) {
4161 options.compaction_options_universal.max_size_amplification_percent =
4162 FLAGS_universal_max_size_amplification_percent;
4163 }
4164 if (FLAGS_universal_compression_size_percent != -1) {
4165 options.compaction_options_universal.compression_size_percent =
4166 FLAGS_universal_compression_size_percent;
4167 }
4168 options.compaction_options_universal.allow_trivial_move =
4169 FLAGS_universal_allow_trivial_move;
4170 if (FLAGS_thread_status_per_interval > 0) {
4171 options.enable_thread_tracking = true;
4172 }
4173
4174 if (FLAGS_user_timestamp_size > 0) {
4175 if (FLAGS_user_timestamp_size != 8) {
4176 fprintf(stderr, "Only 64 bits timestamps are supported.\n");
4177 exit(1);
4178 }
4179 options.comparator = ROCKSDB_NAMESPACE::test::ComparatorWithU64Ts();
4180 }
4181
4182 // Integrated BlobDB
4183 options.enable_blob_files = FLAGS_enable_blob_files;
4184 options.min_blob_size = FLAGS_min_blob_size;
4185 options.blob_file_size = FLAGS_blob_file_size;
4186 options.blob_compression_type =
4187 StringToCompressionType(FLAGS_blob_compression_type.c_str());
4188 options.enable_blob_garbage_collection =
4189 FLAGS_enable_blob_garbage_collection;
4190 options.blob_garbage_collection_age_cutoff =
4191 FLAGS_blob_garbage_collection_age_cutoff;
4192
4193 #ifndef ROCKSDB_LITE
4194 if (FLAGS_readonly && FLAGS_transaction_db) {
4195 fprintf(stderr, "Cannot use readonly flag with transaction_db\n");
4196 exit(1);
4197 }
4198 if (FLAGS_use_secondary_db &&
4199 (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) {
4200 fprintf(stderr, "Cannot use use_secondary_db flag with transaction_db\n");
4201 exit(1);
4202 }
4203 #endif // ROCKSDB_LITE
4204
4205 }
4206
InitializeOptionsGeneral(Options * opts)4207 void InitializeOptionsGeneral(Options* opts) {
4208 Options& options = *opts;
4209
4210 options.create_missing_column_families = FLAGS_num_column_families > 1;
4211 options.statistics = dbstats;
4212 options.wal_dir = FLAGS_wal_dir;
4213 options.create_if_missing = !FLAGS_use_existing_db;
4214 options.dump_malloc_stats = FLAGS_dump_malloc_stats;
4215 options.stats_dump_period_sec =
4216 static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
4217 options.stats_persist_period_sec =
4218 static_cast<unsigned int>(FLAGS_stats_persist_period_sec);
4219 options.persist_stats_to_disk = FLAGS_persist_stats_to_disk;
4220 options.stats_history_buffer_size =
4221 static_cast<size_t>(FLAGS_stats_history_buffer_size);
4222
4223 options.compression_opts.level = FLAGS_compression_level;
4224 options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes;
4225 options.compression_opts.zstd_max_train_bytes =
4226 FLAGS_compression_zstd_max_train_bytes;
4227 options.compression_opts.parallel_threads =
4228 FLAGS_compression_parallel_threads;
4229 options.compression_opts.max_dict_buffer_bytes =
4230 FLAGS_compression_max_dict_buffer_bytes;
4231 // If this is a block based table, set some related options
4232 auto table_options =
4233 options.table_factory->GetOptions<BlockBasedTableOptions>();
4234 if (table_options != nullptr) {
4235 if (FLAGS_cache_size) {
4236 table_options->block_cache = cache_;
4237 }
4238 if (FLAGS_bloom_bits >= 0) {
4239 table_options->filter_policy.reset(
4240 FLAGS_use_ribbon_filter
4241 ? NewExperimentalRibbonFilterPolicy(FLAGS_bloom_bits)
4242 : NewBloomFilterPolicy(FLAGS_bloom_bits,
4243 FLAGS_use_block_based_filter));
4244 }
4245 }
4246 if (FLAGS_row_cache_size) {
4247 if (FLAGS_cache_numshardbits >= 1) {
4248 options.row_cache =
4249 NewLRUCache(FLAGS_row_cache_size, FLAGS_cache_numshardbits);
4250 } else {
4251 options.row_cache = NewLRUCache(FLAGS_row_cache_size);
4252 }
4253 }
4254 if (FLAGS_enable_io_prio) {
4255 FLAGS_env->LowerThreadPoolIOPriority(Env::LOW);
4256 FLAGS_env->LowerThreadPoolIOPriority(Env::HIGH);
4257 }
4258 if (FLAGS_enable_cpu_prio) {
4259 FLAGS_env->LowerThreadPoolCPUPriority(Env::LOW);
4260 FLAGS_env->LowerThreadPoolCPUPriority(Env::HIGH);
4261 }
4262 options.env = FLAGS_env;
4263 if (FLAGS_sine_write_rate) {
4264 FLAGS_benchmark_write_rate_limit = static_cast<uint64_t>(SineRate(0));
4265 }
4266
4267 if (FLAGS_rate_limiter_bytes_per_sec > 0) {
4268 if (FLAGS_rate_limit_bg_reads &&
4269 !FLAGS_new_table_reader_for_compaction_inputs) {
4270 fprintf(stderr,
4271 "rate limit compaction reads must have "
4272 "new_table_reader_for_compaction_inputs set\n");
4273 exit(1);
4274 }
4275 options.rate_limiter.reset(NewGenericRateLimiter(
4276 FLAGS_rate_limiter_bytes_per_sec, 100 * 1000 /* refill_period_us */,
4277 10 /* fairness */,
4278 FLAGS_rate_limit_bg_reads ? RateLimiter::Mode::kReadsOnly
4279 : RateLimiter::Mode::kWritesOnly,
4280 FLAGS_rate_limiter_auto_tuned));
4281 }
4282
4283 options.listeners.emplace_back(listener_);
4284
4285 if (FLAGS_num_multi_db <= 1) {
4286 OpenDb(options, FLAGS_db, &db_);
4287 } else {
4288 multi_dbs_.clear();
4289 multi_dbs_.resize(FLAGS_num_multi_db);
4290 auto wal_dir = options.wal_dir;
4291 for (int i = 0; i < FLAGS_num_multi_db; i++) {
4292 if (!wal_dir.empty()) {
4293 options.wal_dir = GetPathForMultiple(wal_dir, i);
4294 }
4295 OpenDb(options, GetPathForMultiple(FLAGS_db, i), &multi_dbs_[i]);
4296 }
4297 options.wal_dir = wal_dir;
4298 }
4299
4300 // KeepFilter is a noop filter, this can be used to test compaction filter
4301 if (FLAGS_use_keep_filter) {
4302 options.compaction_filter = new KeepFilter();
4303 fprintf(stdout, "A noop compaction filter is used\n");
4304 }
4305
4306 if (FLAGS_use_existing_keys) {
4307 // Only work on single database
4308 assert(db_.db != nullptr);
4309 ReadOptions read_opts;
4310 read_opts.total_order_seek = true;
4311 Iterator* iter = db_.db->NewIterator(read_opts);
4312 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
4313 keys_.emplace_back(iter->key().ToString());
4314 }
4315 delete iter;
4316 FLAGS_num = keys_.size();
4317 }
4318 }
4319
Open(Options * opts)4320 void Open(Options* opts) {
4321 if (!InitializeOptionsFromFile(opts)) {
4322 InitializeOptionsFromFlags(opts);
4323 }
4324
4325 InitializeOptionsGeneral(opts);
4326 }
4327
// Opens one database at `db_name` and stores the resulting handle(s) in `*db`.
//
// Which kind of database is opened is decided by command-line flags, checked
// in this order: multiple column families (FLAGS_num_column_families > 1),
// read-only, optimistic transaction DB, transaction DB, stacked BlobDB,
// secondary instance, and finally a plain DB::Open().
//
// `options` is received by value, so the adjustments made below (for example
// two_write_queues and info_log) are applied to this function's copy only.
//
// If the open fails, the error is printed to stderr and the process exits
// with status 1; the function therefore only returns on success.
void OpenDb(Options options, const std::string& db_name,
            DBWithColumnFamilies* db) {
  Status s;
  // Open with column families if necessary.
  if (FLAGS_num_column_families > 1) {
    // `num_hot` is the number of column families opened up front. It is
    // FLAGS_num_hot_column_families when that flag lies strictly between 0
    // and FLAGS_num_column_families; otherwise the flag itself is overwritten
    // so that every column family counts as hot.
    size_t num_hot = FLAGS_num_column_families;
    if (FLAGS_num_hot_column_families > 0 &&
        FLAGS_num_hot_column_families < FLAGS_num_column_families) {
      num_hot = FLAGS_num_hot_column_families;
    } else {
      FLAGS_num_hot_column_families = FLAGS_num_column_families;
    }
    // Every hot column family uses the column-family options derived from
    // `options`.
    std::vector<ColumnFamilyDescriptor> column_families;
    for (size_t i = 0; i < num_hot; i++) {
      column_families.push_back(ColumnFamilyDescriptor(
            ColumnFamilyName(i), ColumnFamilyOptions(options)));
    }
    // Optional comma-separated list of integer percentages, one per hot
    // column family. The entries must add up to exactly 100 and there must be
    // exactly `num_hot` of them; otherwise the process exits.
    // NOTE(review): std::stoi throws on a non-numeric entry and nothing here
    // catches it.
    std::vector<int> cfh_idx_to_prob;
    if (!FLAGS_column_family_distribution.empty()) {
      std::stringstream cf_prob_stream(FLAGS_column_family_distribution);
      std::string cf_prob;
      int sum = 0;
      while (std::getline(cf_prob_stream, cf_prob, ',')) {
        cfh_idx_to_prob.push_back(std::stoi(cf_prob));
        sum += cfh_idx_to_prob.back();
      }
      if (sum != 100) {
        fprintf(stderr, "column_family_distribution items must sum to 100\n");
        exit(1);
      }
      if (cfh_idx_to_prob.size() != num_hot) {
        fprintf(stderr,
                "got %" ROCKSDB_PRIszt
                " column_family_distribution items; expected "
                "%" ROCKSDB_PRIszt "\n",
                cfh_idx_to_prob.size(), num_hot);
        exit(1);
      }
    }
    // NOTE(review): this multi-column-family branch does not consult
    // FLAGS_use_blob_db or FLAGS_use_secondary_db; those are only handled in
    // the single-column-family chain further down.
#ifndef ROCKSDB_LITE
    if (FLAGS_readonly) {
      s = DB::OpenForReadOnly(options, db_name, column_families,
          &db->cfh, &db->db);
    } else if (FLAGS_optimistic_transaction_db) {
      s = OptimisticTransactionDB::Open(options, db_name, column_families,
                                        &db->cfh, &db->opt_txn_db);
      if (s.ok()) {
        // Regular operations go through the base DB of the optimistic
        // transaction DB.
        db->db = db->opt_txn_db->GetBaseDB();
      }
    } else if (FLAGS_transaction_db) {
      // NOTE(review): `ptr` is left uninitialized here (the single-CF branch
      // below initializes it to nullptr); it is only read when Open succeeds.
      TransactionDB* ptr;
      TransactionDBOptions txn_db_options;
      if (options.unordered_write) {
        // When unordered_write is on, this code switches to two write queues,
        // the WRITE_PREPARED policy, and skips concurrency control.
        options.two_write_queues = true;
        txn_db_options.skip_concurrency_control = true;
        txn_db_options.write_policy = WRITE_PREPARED;
      }
      s = TransactionDB::Open(options, txn_db_options, db_name,
                              column_families, &db->cfh, &ptr);
      if (s.ok()) {
        db->db = ptr;
      }
    } else {
      s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
    }
#else
    s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
#endif  // ROCKSDB_LITE
    // The handle vector is sized for all column families, but only the first
    // `num_hot` entries were produced by the open call above; num_created
    // records that.
    db->cfh.resize(FLAGS_num_column_families);
    db->num_created = num_hot;
    db->num_hot = num_hot;
    db->cfh_idx_to_prob = std::move(cfh_idx_to_prob);
#ifndef ROCKSDB_LITE
  } else if (FLAGS_readonly) {
    s = DB::OpenForReadOnly(options, db_name, &db->db);
  } else if (FLAGS_optimistic_transaction_db) {
    s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db);
    if (s.ok()) {
      db->db = db->opt_txn_db->GetBaseDB();
    }
  } else if (FLAGS_transaction_db) {
    TransactionDB* ptr = nullptr;
    TransactionDBOptions txn_db_options;
    if (options.unordered_write) {
      options.two_write_queues = true;
      txn_db_options.skip_concurrency_control = true;
      txn_db_options.write_policy = WRITE_PREPARED;
    }
    // The info logger is created explicitly before opening the transaction
    // DB; the open is attempted only if that succeeds.
    s = CreateLoggerFromOptions(db_name, options, &options.info_log);
    if (s.ok()) {
      s = TransactionDB::Open(options, txn_db_options, db_name, &ptr);
    }
    if (s.ok()) {
      db->db = ptr;
    }
  } else if (FLAGS_use_blob_db) {
    // Stacked BlobDB
    // All BlobDB options are copied one-to-one from the corresponding
    // FLAGS_blob_db_* flags.
    blob_db::BlobDBOptions blob_db_options;
    blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
    blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
    blob_db_options.is_fifo = FLAGS_blob_db_is_fifo;
    blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
    blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
    blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
    blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
    blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
    blob_db_options.compression = FLAGS_blob_db_compression_type_e;
    blob_db::BlobDB* ptr = nullptr;
    s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr);
    if (s.ok()) {
      db->db = ptr;
    }
  } else if (FLAGS_use_secondary_db) {
    // Without an explicit secondary path, fall back to
    // "<test directory>/dbbench_secondary" and store it back into the flag.
    // NOTE(review): the result of GetTestDirectory() is not checked.
    if (FLAGS_secondary_path.empty()) {
      std::string default_secondary_path;
      FLAGS_env->GetTestDirectory(&default_secondary_path);
      default_secondary_path += "/dbbench_secondary";
      FLAGS_secondary_path = default_secondary_path;
    }
    s = DB::OpenAsSecondary(options, db_name, FLAGS_secondary_path, &db->db);
    if (s.ok() && FLAGS_secondary_update_interval > 0) {
      // Background thread that calls TryCatchUpWithPrimary() in a loop,
      // sleeping `interval` seconds (interval * 1000000 microseconds) between
      // attempts. The loop ends when secondary_update_stopped_ becomes
      // non-zero or on the first failed catch-up, which is logged to stderr.
      // Each successful catch-up increments secondary_db_updates_.
      secondary_update_thread_.reset(new port::Thread(
          [this](int interval, DBWithColumnFamilies* _db) {
            while (0 == secondary_update_stopped_.load(
                            std::memory_order_relaxed)) {
              Status secondary_update_status =
                  _db->db->TryCatchUpWithPrimary();
              if (!secondary_update_status.ok()) {
                fprintf(stderr, "Failed to catch up with primary: %s\n",
                        secondary_update_status.ToString().c_str());
                break;
              }
              ++secondary_db_updates_;
              FLAGS_env->SleepForMicroseconds(interval * 1000000);
            }
          },
          FLAGS_secondary_update_interval, db));
    }
#endif  // ROCKSDB_LITE
  } else {
    s = DB::Open(options, db_name, &db->db);
  }
  // Any failure from whichever open path was taken is fatal.
  if (!s.ok()) {
    fprintf(stderr, "open error: %s\n", s.ToString().c_str());
    exit(1);
  }
}
4475
// Order in which KeyGenerator hands out key indices to the write loop.
enum WriteMode {
  // Each key index is drawn as a random value modulo the key count, so
  // repeats are possible.
  RANDOM,
  // Key indices are handed out as 0, 1, 2, ...
  SEQUENTIAL,
  // Key indices come from a shuffled list of 0..num-1, so each index is
  // handed out once.
  UNIQUE_RANDOM
};
4479
WriteSeqDeterministic(ThreadState * thread)4480 void WriteSeqDeterministic(ThreadState* thread) {
4481 DoDeterministicCompact(thread, open_options_.compaction_style, SEQUENTIAL);
4482 }
4483
WriteUniqueRandomDeterministic(ThreadState * thread)4484 void WriteUniqueRandomDeterministic(ThreadState* thread) {
4485 DoDeterministicCompact(thread, open_options_.compaction_style,
4486 UNIQUE_RANDOM);
4487 }
4488
WriteSeq(ThreadState * thread)4489 void WriteSeq(ThreadState* thread) {
4490 DoWrite(thread, SEQUENTIAL);
4491 }
4492
WriteRandom(ThreadState * thread)4493 void WriteRandom(ThreadState* thread) {
4494 DoWrite(thread, RANDOM);
4495 }
4496
WriteUniqueRandom(ThreadState * thread)4497 void WriteUniqueRandom(ThreadState* thread) {
4498 DoWrite(thread, UNIQUE_RANDOM);
4499 }
4500
4501 class KeyGenerator {
4502 public:
KeyGenerator(Random64 * rand,WriteMode mode,uint64_t num,uint64_t=64* 1024)4503 KeyGenerator(Random64* rand, WriteMode mode, uint64_t num,
4504 uint64_t /*num_per_set*/ = 64 * 1024)
4505 : rand_(rand), mode_(mode), num_(num), next_(0) {
4506 if (mode_ == UNIQUE_RANDOM) {
4507 // NOTE: if memory consumption of this approach becomes a concern,
4508 // we can either break it into pieces and only random shuffle a section
4509 // each time. Alternatively, use a bit map implementation
4510 // (https://reviews.facebook.net/differential/diff/54627/)
4511 values_.resize(num_);
4512 for (uint64_t i = 0; i < num_; ++i) {
4513 values_[i] = i;
4514 }
4515 RandomShuffle(values_.begin(), values_.end(),
4516 static_cast<uint32_t>(FLAGS_seed));
4517 }
4518 }
4519
Next()4520 uint64_t Next() {
4521 switch (mode_) {
4522 case SEQUENTIAL:
4523 return next_++;
4524 case RANDOM:
4525 return rand_->Next() % num_;
4526 case UNIQUE_RANDOM:
4527 assert(next_ < num_);
4528 return values_[next_++];
4529 }
4530 assert(false);
4531 return std::numeric_limits<uint64_t>::max();
4532 }
4533
4534 private:
4535 Random64* rand_;
4536 WriteMode mode_;
4537 const uint64_t num_;
4538 uint64_t next_;
4539 std::vector<uint64_t> values_;
4540 };
4541
SelectDB(ThreadState * thread)4542 DB* SelectDB(ThreadState* thread) {
4543 return SelectDBWithCfh(thread)->db;
4544 }
4545
SelectDBWithCfh(ThreadState * thread)4546 DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) {
4547 return SelectDBWithCfh(thread->rand.Next());
4548 }
4549
SelectDBWithCfh(uint64_t rand_int)4550 DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) {
4551 if (db_.db != nullptr) {
4552 return &db_;
4553 } else {
4554 return &multi_dbs_[rand_int % multi_dbs_.size()];
4555 }
4556 }
4557
SineRate(double x)4558 double SineRate(double x) {
4559 return FLAGS_sine_a*sin((FLAGS_sine_b*x) + FLAGS_sine_c) + FLAGS_sine_d;
4560 }
4561
// Shared write loop used by WriteSeq/WriteRandom/WriteUniqueRandom and by
// DoDeterministicCompact.
//
// Each iteration picks a database at random, builds a batch of
// entries_per_batch_ puts (optionally interleaved with range tombstones or
// their expanded point deletes), applies optional rate limiting and user
// timestamps, and writes the batch. For stacked BlobDB the entries are
// written one by one instead of through the batch. A write error that the
// listener cannot recover from terminates the process via ErrorExit().
//
// `write_mode` selects how key indices are generated (see KeyGenerator).
void DoWrite(ThreadState* thread, WriteMode write_mode) {
  // Only RANDOM mode passes FLAGS_duration on to Duration; the other modes
  // pass 0.
  const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
  // writes_ overrides num_ as the per-database operation count when non-zero.
  const int64_t num_ops = writes_ == 0 ? num_ : writes_;

  // One key generator per database: a single one when db_ is open, otherwise
  // one for every entry of multi_dbs_.
  size_t num_key_gens = 1;
  if (db_.db == nullptr) {
    num_key_gens = multi_dbs_.size();
  }
  std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
  int64_t max_ops = num_ops * num_key_gens;
  int64_t ops_per_stage = max_ops;
  // With hot column families, the run is split into
  // (num_column_families / num_hot_column_families) stages; ops_per_stage is
  // max_ops divided by that count, rounded up.
  if (FLAGS_num_column_families > 1 && FLAGS_num_hot_column_families > 0) {
    ops_per_stage = (max_ops - 1) / (FLAGS_num_column_families /
                                     FLAGS_num_hot_column_families) +
                    1;
  }

  Duration duration(test_duration, max_ops, ops_per_stage);
  // The key index space is widened by max_num_range_tombstones_ because each
  // range tombstone consumes one extra index from the generator (see the
  // begin_num draw below).
  for (size_t i = 0; i < num_key_gens; i++) {
    key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode,
                                       num_ + max_num_range_tombstones_,
                                       ops_per_stage));
  }

  // Report the op count in the stats message when it differs from FLAGS_num.
  // NOTE(review): num_ is compared against FLAGS_num and stored into int64_t
  // above, yet printed with PRIu64 — confirm its declared type.
  if (num_ != FLAGS_num) {
    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " ops)", num_);
    thread->stats.AddMessage(msg);
  }

  RandomGenerator gen;
  WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
                   user_timestamp_size_);
  Status s;
  int64_t bytes = 0;

  // Key buffers are allocated once and overwritten in place by
  // GenerateKeyFromInt on every use.
  std::unique_ptr<const char[]> key_guard;
  Slice key = AllocateKey(&key_guard);
  std::unique_ptr<const char[]> begin_key_guard;
  Slice begin_key = AllocateKey(&begin_key_guard);
  std::unique_ptr<const char[]> end_key_guard;
  Slice end_key = AllocateKey(&end_key_guard);
  // When range tombstones are expanded into point deletes, one key buffer is
  // needed per key in the range.
  std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
  std::vector<Slice> expanded_keys;
  if (FLAGS_expand_range_tombstones) {
    expanded_key_guards.resize(range_tombstone_width_);
    for (auto& expanded_key_guard : expanded_key_guards) {
      expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
    }
  }

  // Scratch buffer for the per-batch user timestamp, if timestamps are on.
  std::unique_ptr<char[]> ts_guard;
  if (user_timestamp_size_ > 0) {
    ts_guard.reset(new char[user_timestamp_size_]);
  }

  int64_t stage = 0;
  int64_t num_written = 0;
  while (!duration.Done(entries_per_batch_)) {
    // On a stage change, ask every open database to create a new column
    // family for the new stage.
    if (duration.GetStage() != stage) {
      stage = duration.GetStage();
      if (db_.db != nullptr) {
        db_.CreateNewCf(open_options_, stage);
      } else {
        for (auto& db : multi_dbs_) {
          db.CreateNewCf(open_options_, stage);
        }
      }
    }

    // `id` selects both the key generator and (via SelectDBWithCfh) the
    // database, so each database always uses its own generator.
    size_t id = thread->rand.Next() % num_key_gens;
    DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id);
    batch.Clear();
    int64_t batch_bytes = 0;

    for (int64_t j = 0; j < entries_per_batch_; j++) {
      int64_t rand_num = key_gens[id]->Next();
      GenerateKeyFromInt(rand_num, FLAGS_num, &key);
      Slice val = gen.Generate();
      if (use_blob_db_) {
#ifndef ROCKSDB_LITE
        // Stacked BlobDB
        // Entries are written immediately rather than added to `batch`.
        // NOTE(review): `s` is overwritten by every entry, so only the status
        // of the last write in the batch reaches the error check below.
        blob_db::BlobDB* blobdb =
            static_cast<blob_db::BlobDB*>(db_with_cfh->db);
        if (FLAGS_blob_db_max_ttl_range > 0) {
          int ttl = rand() % FLAGS_blob_db_max_ttl_range;
          s = blobdb->PutWithTTL(write_options_, key, val, ttl);
        } else {
          s = blobdb->Put(write_options_, key, val);
        }
#endif  //  ROCKSDB_LITE
      } else if (FLAGS_num_column_families <= 1) {
        batch.Put(key, val);
      } else {
        // We use same rand_num as seed for key and column family so that we
        // can deterministically find the cfh corresponding to a particular
        // key while reading the key.
        batch.Put(db_with_cfh->GetCfh(rand_num), key,
                  val);
      }
      batch_bytes += val.size() + key_size_ + user_timestamp_size_;
      bytes += val.size() + key_size_ + user_timestamp_size_;
      ++num_written;
      // Emit a range tombstone after every writes_per_range_tombstone_
      // writes, starting once writes_before_delete_range_ writes have been
      // made and stopping after max_num_range_tombstones_ tombstones.
      if (writes_per_range_tombstone_ > 0 &&
          num_written > writes_before_delete_range_ &&
          (num_written - writes_before_delete_range_) /
                  writes_per_range_tombstone_ <=
              max_num_range_tombstones_ &&
          (num_written - writes_before_delete_range_) %
                  writes_per_range_tombstone_ ==
              0) {
        // The start of the deleted range is drawn from the same generator as
        // the keys.
        int64_t begin_num = key_gens[id]->Next();
        if (FLAGS_expand_range_tombstones) {
          // Issue one point delete per key in
          // [begin_num, begin_num + range_tombstone_width_).
          // NOTE(review): the column family is chosen from rand_num (the last
          // put's key index), not from the deleted key's index.
          for (int64_t offset = 0; offset < range_tombstone_width_;
               ++offset) {
            GenerateKeyFromInt(begin_num + offset, FLAGS_num,
                               &expanded_keys[offset]);
            if (use_blob_db_) {
#ifndef ROCKSDB_LITE
              // Stacked BlobDB
              s = db_with_cfh->db->Delete(write_options_,
                                          expanded_keys[offset]);
#endif  //  ROCKSDB_LITE
            } else if (FLAGS_num_column_families <= 1) {
              batch.Delete(expanded_keys[offset]);
            } else {
              batch.Delete(db_with_cfh->GetCfh(rand_num),
                           expanded_keys[offset]);
            }
          }
        } else {
          // Single DeleteRange over
          // [begin_num, begin_num + range_tombstone_width_).
          GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
          GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
                             &end_key);
          if (use_blob_db_) {
#ifndef ROCKSDB_LITE
            // Stacked BlobDB
            s = db_with_cfh->db->DeleteRange(
                write_options_, db_with_cfh->db->DefaultColumnFamily(),
                begin_key, end_key);
#endif  //  ROCKSDB_LITE
          } else if (FLAGS_num_column_families <= 1) {
            batch.DeleteRange(begin_key, end_key);
          } else {
            batch.DeleteRange(db_with_cfh->GetCfh(rand_num), begin_key,
                              end_key);
          }
        }
      }
    }
    // Throttle on the bytes accumulated for this batch.
    if (thread->shared->write_rate_limiter.get() != nullptr) {
      thread->shared->write_rate_limiter->Request(
          batch_bytes, Env::IO_HIGH,
          nullptr /* stats */, RateLimiter::OpType::kWrite);
      // Set time at which last op finished to Now() to hide latency and
      // sleep from rate limiter. Also, do the check once per batch, not
      // once per write.
      thread->stats.ResetLastOpTime();
    }
    // One timestamp from the mock clock is assigned to the whole batch.
    if (user_timestamp_size_ > 0) {
      Slice user_ts = mock_app_clock_->Allocate(ts_guard.get());
      s = batch.AssignTimestamp(user_ts);
      if (!s.ok()) {
        fprintf(stderr, "assign timestamp to write batch: %s\n",
                s.ToString().c_str());
        ErrorExit();
      }
    }
    if (!use_blob_db_) {
      // Not stacked BlobDB
      s = db_with_cfh->db->Write(write_options_, &batch);
    }
    // NOTE(review): the ops are recorded as finished before `s` is checked.
    thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db,
                              entries_per_batch_, kWrite);
    if (FLAGS_sine_write_rate) {
      uint64_t now = FLAGS_env->NowMicros();

      // Guard against `now` being earlier than the recorded interval start.
      uint64_t usecs_since_last;
      if (now > thread->stats.GetSineInterval()) {
        usecs_since_last = now - thread->stats.GetSineInterval();
      } else {
        usecs_since_last = 0;
      }

      // Once per FLAGS_sine_write_rate_interval_milliseconds, replace the
      // shared write rate limiter with a new one whose rate is
      // SineRate(seconds since the thread's stats start time).
      if (usecs_since_last >
          (FLAGS_sine_write_rate_interval_milliseconds * uint64_t{1000})) {
        double usecs_since_start =
                static_cast<double>(now - thread->stats.GetStart());
        thread->stats.ResetSineInterval();
        uint64_t write_rate =
                static_cast<uint64_t>(SineRate(usecs_since_start / 1000000.0));
        thread->shared->write_rate_limiter.reset(
                NewGenericRateLimiter(write_rate));
      }
    }
    // On a failed write, wait on the listener for recovery; if
    // WaitForRecovery() returns true the error is treated as resolved.
    // NOTE(review): the argument 600000000 is presumably a timeout in
    // microseconds — confirm against the listener's definition.
    if (!s.ok()) {
      s = listener_->WaitForRecovery(600000000) ? Status::OK() : s;
    }

    if (!s.ok()) {
      fprintf(stderr, "put error: %s\n", s.ToString().c_str());
      ErrorExit();
    }
  }
  thread->stats.AddBytes(bytes);
}
4768
// Fills the database(s) so that the resulting LSM shape is determined by this
// function rather than by background compaction.
//
// Outline (non-LITE builds):
//   1. Save each DB's current options and disable auto compactions.
//   2. Repeatedly call DoWrite() and Flush(), recording the newly created
//      level-0 files of each round as one "sorted run", shrinking writes_
//      after every round.
//   3. Manually compact each sorted run with CompactFiles() (level and
//      universal styles) or call CompactRange() (FIFO style).
//   4. In debug builds, verify key ranges and sequence numbers of the result.
//   5. Print the per-level file layout and restore the saved options.
//
// Returns InvalidArgument for unsupported configurations and NotSupported in
// ROCKSDB_LITE builds; otherwise OK.
//
// NOTE(review): writes_ is modified inside the fill loops and is not restored
// before returning.
// NOTE(review): the early InvalidArgument returns below happen after
// SetOptions() has disabled auto compactions, and skip the restore step at
// the end of the function.
// NOTE(review): the results of SetOptions(), Flush(), CompactFiles() and
// CompactRange() are not checked anywhere in this function.
Status DoDeterministicCompact(ThreadState* thread,
                              CompactionStyle compaction_style,
                              WriteMode write_mode) {
#ifndef ROCKSDB_LITE
  ColumnFamilyMetaData meta;
  // Collect the DB handle(s): the single db_ if it is open, otherwise every
  // entry of multi_dbs_.
  std::vector<DB*> db_list;
  if (db_.db != nullptr) {
    db_list.push_back(db_.db);
  } else {
    for (auto& db : multi_dbs_) {
      db_list.push_back(db.db);
    }
  }
  // Remember the current options so they can be restored at the end, then
  // turn off auto compaction. For non-FIFO styles the level-0 slowdown/stop
  // triggers are also raised to a very large value.
  std::vector<Options> options_list;
  for (auto db : db_list) {
    options_list.push_back(db->GetOptions());
    if (compaction_style != kCompactionStyleFIFO) {
      db->SetOptions({{"disable_auto_compactions", "1"},
                      {"level0_slowdown_writes_trigger", "400000000"},
                      {"level0_stop_writes_trigger", "400000000"}});
    } else {
      db->SetOptions({{"disable_auto_compactions", "1"}});
    }
  }

  assert(!db_list.empty());
  auto num_db = db_list.size();
  size_t num_levels = static_cast<size_t>(open_options_.num_levels);
  size_t output_level = open_options_.num_levels - 1;
  // sorted_runs[db][run] holds the metadata of the level-0 files created by
  // one write+flush round; num_files_at_level0[db] is the level-0 file count
  // seen after the previous round.
  std::vector<std::vector<std::vector<SstFileMetaData>>> sorted_runs(num_db);
  std::vector<size_t> num_files_at_level0(num_db, 0);
  if (compaction_style == kCompactionStyleLevel) {
    // NOTE(review): the check rejects num_levels == 0 while the message says
    // "larger than 1" — confirm which is intended.
    if (num_levels == 0) {
      return Status::InvalidArgument("num_levels should be larger than 1");
    }
    bool should_stop = false;
    while (!should_stop) {
      // The first round uses the caller's write mode; later rounds always use
      // UNIQUE_RANDOM.
      if (sorted_runs[0].empty()) {
        DoWrite(thread, write_mode);
      } else {
        DoWrite(thread, UNIQUE_RANDOM);
      }
      for (size_t i = 0; i < num_db; i++) {
        auto db = db_list[i];
        db->Flush(FlushOptions());
        db->GetColumnFamilyMetaData(&meta);
        // Stop when the round produced no new level-0 file or writes_ has
        // shrunk to zero.
        if (num_files_at_level0[i] == meta.levels[0].files.size() ||
            writes_ == 0) {
          should_stop = true;
          continue;
        }
        // Record this round's files: the leading entries of the level-0 file
        // list, excluding the trailing num_files_at_level0[i] entries that
        // were already counted.
        sorted_runs[i].emplace_back(
            meta.levels[0].files.begin(),
            meta.levels[0].files.end() - num_files_at_level0[i]);
        num_files_at_level0[i] = meta.levels[0].files.size();
        // A run consisting of a single file ends the fill phase.
        if (sorted_runs[i].back().size() == 1) {
          should_stop = true;
          continue;
        }
        // Once there are as many runs as output_level, drop the first third
        // of the last run's files and stop.
        if (sorted_runs[i].size() == output_level) {
          auto& L1 = sorted_runs[i].back();
          L1.erase(L1.begin(), L1.begin() + L1.size() / 3);
          should_stop = true;
          continue;
        }
      }
      // Each following round writes fewer keys, scaled down by the level size
      // multiplier.
      writes_ /= static_cast<int64_t>(open_options_.max_bytes_for_level_multiplier);
    }
    for (size_t i = 0; i < num_db; i++) {
      if (sorted_runs[i].size() < num_levels - 1) {
        fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n", num_levels);
        exit(1);
      }
    }
    // Compact run j into level (output_level - j). Only the last and first
    // file names of the run are passed to CompactFiles().
    for (size_t i = 0; i < num_db; i++) {
      auto db = db_list[i];
      auto compactionOptions = CompactionOptions();
      compactionOptions.compression = FLAGS_compression_type_e;
      auto options = db->GetOptions();
      MutableCFOptions mutable_cf_options(options);
      for (size_t j = 0; j < sorted_runs[i].size(); j++) {
        compactionOptions.output_file_size_limit =
            MaxFileSizeForLevel(mutable_cf_options,
                static_cast<int>(output_level), compaction_style);
        // Prints the number of files in the run to stdout.
        std::cout << sorted_runs[i][j].size() << std::endl;
        db->CompactFiles(compactionOptions, {sorted_runs[i][j].back().name,
                                             sorted_runs[i][j].front().name},
                         static_cast<int>(output_level - j) /*level*/);
      }
    }
  } else if (compaction_style == kCompactionStyleUniversal) {
    auto ratio = open_options_.compaction_options_universal.size_ratio;
    bool should_stop = false;
    while (!should_stop) {
      // Same fill loop as the level style, without the cap on the number of
      // runs.
      if (sorted_runs[0].empty()) {
        DoWrite(thread, write_mode);
      } else {
        DoWrite(thread, UNIQUE_RANDOM);
      }
      for (size_t i = 0; i < num_db; i++) {
        auto db = db_list[i];
        db->Flush(FlushOptions());
        db->GetColumnFamilyMetaData(&meta);
        if (num_files_at_level0[i] == meta.levels[0].files.size() ||
            writes_ == 0) {
          should_stop = true;
          continue;
        }
        sorted_runs[i].emplace_back(
            meta.levels[0].files.begin(),
            meta.levels[0].files.end() - num_files_at_level0[i]);
        num_files_at_level0[i] = meta.levels[0].files.size();
        if (sorted_runs[i].back().size() == 1) {
          should_stop = true;
          continue;
        }
        // NOTE(review): this repeats the assignment made a few lines above.
        num_files_at_level0[i] = meta.levels[0].files.size();
      }
      // Shrink the next round's write count by a factor of
      // 100 / (size_ratio + 200).
      writes_ =  static_cast<int64_t>(writes_* static_cast<double>(100) / (ratio + 200));
    }
    for (size_t i = 0; i < num_db; i++) {
      if (sorted_runs[i].size() < num_levels) {
        fprintf(stderr, "n is too small to fill %"  ROCKSDB_PRIszt  " levels\n", num_levels);
        exit(1);
      }
    }
    // Compact run j into level (output_level - j), clamped to level 0 once j
    // reaches output_level.
    for (size_t i = 0; i < num_db; i++) {
      auto db = db_list[i];
      auto compactionOptions = CompactionOptions();
      compactionOptions.compression = FLAGS_compression_type_e;
      auto options = db->GetOptions();
      MutableCFOptions mutable_cf_options(options);
      for (size_t j = 0; j < sorted_runs[i].size(); j++) {
        compactionOptions.output_file_size_limit =
            MaxFileSizeForLevel(mutable_cf_options,
                static_cast<int>(output_level), compaction_style);
        db->CompactFiles(
            compactionOptions,
            {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name},
            (output_level > j ? static_cast<int>(output_level - j)
                              : 0) /*level*/);
      }
    }
  } else if (compaction_style == kCompactionStyleFIFO) {
    if (num_levels != 1) {
      return Status::InvalidArgument(
        "num_levels should be 1 for FIFO compaction");
    }
    // NOTE(review): this rejects every non-zero FLAGS_num_multi_db, including
    // 1 — confirm that is intended.
    if (FLAGS_num_multi_db != 0) {
      return Status::InvalidArgument("Doesn't support multiDB");
    }
    auto db = db_list[0];
    std::vector<std::string> file_names;
    // Keep writing and flushing until the level-0 size reaches the FIFO
    // max_table_files_size limit. sorted_runs is never appended to in this
    // branch, so every round uses `write_mode`.
    while (true) {
      if (sorted_runs[0].empty()) {
        DoWrite(thread, write_mode);
      } else {
        DoWrite(thread, UNIQUE_RANDOM);
      }
      db->Flush(FlushOptions());
      db->GetColumnFamilyMetaData(&meta);
      auto total_size = meta.levels[0].size;
      if (total_size >=
        db->GetOptions().compaction_options_fifo.max_table_files_size) {
        // file_names is only consumed by the commented-out CompactFiles call
        // below.
        for (auto file_meta : meta.levels[0].files) {
          file_names.emplace_back(file_meta.name);
        }
        break;
      }
    }
    // TODO(shuzhang1989): Investigate why CompactFiles not working
    // auto compactionOptions = CompactionOptions();
    // db->CompactFiles(compactionOptions, file_names, 0);
    auto compactionOptions = CompactRangeOptions();
    db->CompactRange(compactionOptions, nullptr, nullptr);
  } else {
    // NOTE(review): "compaction_stype" in the message below looks like a typo
    // for "compaction_style"; it is a runtime string and is left unchanged
    // here.
    fprintf(stdout,
            "%-12s : skipped (-compaction_stype=kCompactionStyleNone)\n",
            "filldeterministic");
    return Status::InvalidArgument("None compaction is not supported");
  }

// Verify seqno and key range
// Note: the seqno get changed at the max level by implementation
// optimization, so skip the check of the max level.
#ifndef NDEBUG
  for (size_t k = 0; k < num_db; k++) {
    auto db = db_list[k];
    db->GetColumnFamilyMetaData(&meta);
    // verify the number of sorted runs
    if (compaction_style == kCompactionStyleLevel) {
      assert(num_levels - 1 == sorted_runs[k].size());
    } else if (compaction_style == kCompactionStyleUniversal) {
      assert(meta.levels[0].files.size() + num_levels - 1 ==
             sorted_runs[k].size());
    } else if (compaction_style == kCompactionStyleFIFO) {
      // TODO(gzh): FIFO compaction
      // Only the total level-0 size is checked for FIFO; the per-run
      // verification below is skipped by the `break`.
      db->GetColumnFamilyMetaData(&meta);
      auto total_size = meta.levels[0].size;
      assert(total_size <=
          db->GetOptions().compaction_options_fifo.max_table_files_size);
        break;
    }

    // verify smallest/largest seqno and key range of each sorted run
    auto max_level = num_levels - 1;
    int level;
    for (size_t i = 0; i < sorted_runs[k].size(); i++) {
      // Run i is expected at level (max_level - i); a value <= 0 means the
      // run stayed in level 0 (universal style only).
      level = static_cast<int>(max_level - i);
      // Aggregate the sequence-number range and key range over all files of
      // the recorded run.
      SequenceNumber sorted_run_smallest_seqno = kMaxSequenceNumber;
      SequenceNumber sorted_run_largest_seqno = 0;
      std::string sorted_run_smallest_key, sorted_run_largest_key;
      bool first_key = true;
      for (auto fileMeta : sorted_runs[k][i]) {
        sorted_run_smallest_seqno =
            std::min(sorted_run_smallest_seqno, fileMeta.smallest_seqno);
        sorted_run_largest_seqno =
            std::max(sorted_run_largest_seqno, fileMeta.largest_seqno);
        if (first_key ||
            db->DefaultColumnFamily()->GetComparator()->Compare(
                fileMeta.smallestkey, sorted_run_smallest_key) < 0) {
          sorted_run_smallest_key = fileMeta.smallestkey;
        }
        if (first_key ||
            db->DefaultColumnFamily()->GetComparator()->Compare(
                fileMeta.largestkey, sorted_run_largest_key) > 0) {
          sorted_run_largest_key = fileMeta.largestkey;
        }
        first_key = false;
      }
      if (compaction_style == kCompactionStyleLevel ||
          (compaction_style == kCompactionStyleUniversal && level > 0)) {
        // Compare the run against the files now present at `level`.
        SequenceNumber level_smallest_seqno = kMaxSequenceNumber;
        SequenceNumber level_largest_seqno = 0;
        for (auto fileMeta : meta.levels[level].files) {
          level_smallest_seqno =
              std::min(level_smallest_seqno, fileMeta.smallest_seqno);
          level_largest_seqno =
              std::max(level_largest_seqno, fileMeta.largest_seqno);
        }
        assert(sorted_run_smallest_key ==
               meta.levels[level].files.front().smallestkey);
        assert(sorted_run_largest_key ==
               meta.levels[level].files.back().largestkey);
        if (level != static_cast<int>(max_level)) {
          // compaction at max_level would change sequence number
          assert(sorted_run_smallest_seqno == level_smallest_seqno);
          assert(sorted_run_largest_seqno == level_largest_seqno);
        }
      } else if (compaction_style == kCompactionStyleUniversal) {
        // level <= 0 means sorted runs on level 0
        // Run i is compared against the level-0 file at index
        // (number of runs - 1 - i).
        auto level0_file =
            meta.levels[0].files[sorted_runs[k].size() - 1 - i];
        assert(sorted_run_smallest_key == level0_file.smallestkey);
        assert(sorted_run_largest_key == level0_file.largestkey);
        if (level != static_cast<int>(max_level)) {
          assert(sorted_run_smallest_seqno == level0_file.smallest_seqno);
          assert(sorted_run_largest_seqno == level0_file.largest_seqno);
        }
      }
    }
  }
#endif
  // print the size of each sorted_run
  // Level 0 is printed file by file; other non-empty levels are printed as
  // "first file - last file" with the level's total size.
  for (size_t k = 0; k < num_db; k++) {
    auto db = db_list[k];
    fprintf(stdout,
            "---------------------- DB %" ROCKSDB_PRIszt " LSM ---------------------\n", k);
    db->GetColumnFamilyMetaData(&meta);
    for (auto& levelMeta : meta.levels) {
      if (levelMeta.files.empty()) {
        continue;
      }
      if (levelMeta.level == 0) {
        for (auto& fileMeta : levelMeta.files) {
          fprintf(stdout, "Level[%d]: %s(size: %" ROCKSDB_PRIszt " bytes)\n",
                  levelMeta.level, fileMeta.name.c_str(), fileMeta.size);
        }
      } else {
        fprintf(stdout, "Level[%d]: %s - %s(total size: %" PRIi64 " bytes)\n",
                levelMeta.level, levelMeta.files.front().name.c_str(),
                levelMeta.files.back().name.c_str(), levelMeta.size);
      }
    }
  }
  // Restore the three options changed at the top from the saved copies.
  for (size_t i = 0; i < num_db; i++) {
    db_list[i]->SetOptions(
        {{"disable_auto_compactions",
          std::to_string(options_list[i].disable_auto_compactions)},
         {"level0_slowdown_writes_trigger",
          std::to_string(options_list[i].level0_slowdown_writes_trigger)},
         {"level0_stop_writes_trigger",
          std::to_string(options_list[i].level0_stop_writes_trigger)}});
  }
  return Status::OK();
#else
  // LITE builds: nothing is written; the parameters are only referenced to
  // silence unused-parameter warnings.
  (void)thread;
  (void)compaction_style;
  (void)write_mode;
  fprintf(stderr, "Rocksdb Lite doesn't support filldeterministic\n");
  return Status::NotSupported(
      "Rocksdb Lite doesn't support filldeterministic");
#endif  // ROCKSDB_LITE
}
5073
ReadSequential(ThreadState * thread)5074 void ReadSequential(ThreadState* thread) {
5075 if (db_.db != nullptr) {
5076 ReadSequential(thread, db_.db);
5077 } else {
5078 for (const auto& db_with_cfh : multi_dbs_) {
5079 ReadSequential(thread, db_with_cfh.db);
5080 }
5081 }
5082 }
5083
ReadSequential(ThreadState * thread,DB * db)5084 void ReadSequential(ThreadState* thread, DB* db) {
5085 ReadOptions options(FLAGS_verify_checksum, true);
5086 options.tailing = FLAGS_use_tailing_iterator;
5087 std::unique_ptr<char[]> ts_guard;
5088 Slice ts;
5089 if (user_timestamp_size_ > 0) {
5090 ts_guard.reset(new char[user_timestamp_size_]);
5091 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
5092 options.timestamp = &ts;
5093 }
5094
5095 Iterator* iter = db->NewIterator(options);
5096 int64_t i = 0;
5097 int64_t bytes = 0;
5098 for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
5099 bytes += iter->key().size() + iter->value().size();
5100 thread->stats.FinishedOps(nullptr, db, 1, kRead);
5101 ++i;
5102
5103 if (thread->shared->read_rate_limiter.get() != nullptr &&
5104 i % 1024 == 1023) {
5105 thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
5106 nullptr /* stats */,
5107 RateLimiter::OpType::kRead);
5108 }
5109 }
5110
5111 delete iter;
5112 thread->stats.AddBytes(bytes);
5113 if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
5114 thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
5115 get_perf_context()->ToString());
5116 }
5117 }
5118
ReadToRowCache(ThreadState * thread)5119 void ReadToRowCache(ThreadState* thread) {
5120 int64_t read = 0;
5121 int64_t found = 0;
5122 int64_t bytes = 0;
5123 int64_t key_rand = 0;
5124 ReadOptions options(FLAGS_verify_checksum, true);
5125 std::unique_ptr<const char[]> key_guard;
5126 Slice key = AllocateKey(&key_guard);
5127 PinnableSlice pinnable_val;
5128
5129 while (key_rand < FLAGS_num) {
5130 DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
5131 // We use same key_rand as seed for key and column family so that we can
5132 // deterministically find the cfh corresponding to a particular key, as it
5133 // is done in DoWrite method.
5134 GenerateKeyFromInt(key_rand, FLAGS_num, &key);
5135 key_rand++;
5136 read++;
5137 Status s;
5138 if (FLAGS_num_column_families > 1) {
5139 s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_rand), key,
5140 &pinnable_val);
5141 } else {
5142 pinnable_val.Reset();
5143 s = db_with_cfh->db->Get(options,
5144 db_with_cfh->db->DefaultColumnFamily(), key,
5145 &pinnable_val);
5146 }
5147
5148 if (s.ok()) {
5149 found++;
5150 bytes += key.size() + pinnable_val.size();
5151 } else if (!s.IsNotFound()) {
5152 fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
5153 abort();
5154 }
5155
5156 if (thread->shared->read_rate_limiter.get() != nullptr &&
5157 read % 256 == 255) {
5158 thread->shared->read_rate_limiter->Request(
5159 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
5160 }
5161
5162 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
5163 }
5164
5165 char msg[100];
5166 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
5167 read);
5168
5169 thread->stats.AddBytes(bytes);
5170 thread->stats.AddMessage(msg);
5171
5172 if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
5173 thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
5174 get_perf_context()->ToString());
5175 }
5176 }
5177
ReadReverse(ThreadState * thread)5178 void ReadReverse(ThreadState* thread) {
5179 if (db_.db != nullptr) {
5180 ReadReverse(thread, db_.db);
5181 } else {
5182 for (const auto& db_with_cfh : multi_dbs_) {
5183 ReadReverse(thread, db_with_cfh.db);
5184 }
5185 }
5186 }
5187
ReadReverse(ThreadState * thread,DB * db)5188 void ReadReverse(ThreadState* thread, DB* db) {
5189 Iterator* iter = db->NewIterator(ReadOptions(FLAGS_verify_checksum, true));
5190 int64_t i = 0;
5191 int64_t bytes = 0;
5192 for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
5193 bytes += iter->key().size() + iter->value().size();
5194 thread->stats.FinishedOps(nullptr, db, 1, kRead);
5195 ++i;
5196 if (thread->shared->read_rate_limiter.get() != nullptr &&
5197 i % 1024 == 1023) {
5198 thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
5199 nullptr /* stats */,
5200 RateLimiter::OpType::kRead);
5201 }
5202 }
5203 delete iter;
5204 thread->stats.AddBytes(bytes);
5205 }
5206
ReadRandomFast(ThreadState * thread)5207 void ReadRandomFast(ThreadState* thread) {
5208 int64_t read = 0;
5209 int64_t found = 0;
5210 int64_t nonexist = 0;
5211 ReadOptions options(FLAGS_verify_checksum, true);
5212 std::unique_ptr<const char[]> key_guard;
5213 Slice key = AllocateKey(&key_guard);
5214 std::string value;
5215 Slice ts;
5216 std::unique_ptr<char[]> ts_guard;
5217 if (user_timestamp_size_ > 0) {
5218 ts_guard.reset(new char[user_timestamp_size_]);
5219 }
5220 DB* db = SelectDBWithCfh(thread)->db;
5221
5222 int64_t pot = 1;
5223 while (pot < FLAGS_num) {
5224 pot <<= 1;
5225 }
5226
5227 Duration duration(FLAGS_duration, reads_);
5228 do {
5229 for (int i = 0; i < 100; ++i) {
5230 int64_t key_rand = thread->rand.Next() & (pot - 1);
5231 GenerateKeyFromInt(key_rand, FLAGS_num, &key);
5232 ++read;
5233 std::string ts_ret;
5234 std::string* ts_ptr = nullptr;
5235 if (user_timestamp_size_ > 0) {
5236 ts = mock_app_clock_->GetTimestampForRead(thread->rand,
5237 ts_guard.get());
5238 options.timestamp = &ts;
5239 ts_ptr = &ts_ret;
5240 }
5241 auto status = db->Get(options, key, &value, ts_ptr);
5242 if (status.ok()) {
5243 ++found;
5244 } else if (!status.IsNotFound()) {
5245 fprintf(stderr, "Get returned an error: %s\n",
5246 status.ToString().c_str());
5247 abort();
5248 }
5249 if (key_rand >= FLAGS_num) {
5250 ++nonexist;
5251 }
5252 }
5253 if (thread->shared->read_rate_limiter.get() != nullptr) {
5254 thread->shared->read_rate_limiter->Request(
5255 100, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
5256 }
5257
5258 thread->stats.FinishedOps(nullptr, db, 100, kRead);
5259 } while (!duration.Done(100));
5260
5261 char msg[100];
5262 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found, "
5263 "issued %" PRIu64 " non-exist keys)\n",
5264 found, read, nonexist);
5265
5266 thread->stats.AddMessage(msg);
5267
5268 if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
5269 thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
5270 get_perf_context()->ToString());
5271 }
5272 }
5273
GetRandomKey(Random64 * rand)5274 int64_t GetRandomKey(Random64* rand) {
5275 uint64_t rand_int = rand->Next();
5276 int64_t key_rand;
5277 if (read_random_exp_range_ == 0) {
5278 key_rand = rand_int % FLAGS_num;
5279 } else {
5280 const uint64_t kBigInt = static_cast<uint64_t>(1U) << 62;
5281 long double order = -static_cast<long double>(rand_int % kBigInt) /
5282 static_cast<long double>(kBigInt) *
5283 read_random_exp_range_;
5284 long double exp_ran = std::exp(order);
5285 uint64_t rand_num =
5286 static_cast<int64_t>(exp_ran * static_cast<long double>(FLAGS_num));
5287 // Map to a different number to avoid locality.
5288 const uint64_t kBigPrime = 0x5bd1e995;
5289 // Overflow is like %(2^64). Will have little impact of results.
5290 key_rand = static_cast<int64_t>((rand_num * kBigPrime) % FLAGS_num);
5291 }
5292 return key_rand;
5293 }
5294
ReadRandom(ThreadState * thread)5295 void ReadRandom(ThreadState* thread) {
5296 int64_t read = 0;
5297 int64_t found = 0;
5298 int64_t bytes = 0;
5299 int num_keys = 0;
5300 int64_t key_rand = 0;
5301 ReadOptions options(FLAGS_verify_checksum, true);
5302 std::unique_ptr<const char[]> key_guard;
5303 Slice key = AllocateKey(&key_guard);
5304 PinnableSlice pinnable_val;
5305 std::unique_ptr<char[]> ts_guard;
5306 Slice ts;
5307 if (user_timestamp_size_ > 0) {
5308 ts_guard.reset(new char[user_timestamp_size_]);
5309 }
5310
5311 Duration duration(FLAGS_duration, reads_);
5312 while (!duration.Done(1)) {
5313 DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
5314 // We use same key_rand as seed for key and column family so that we can
5315 // deterministically find the cfh corresponding to a particular key, as it
5316 // is done in DoWrite method.
5317 if (entries_per_batch_ > 1 && FLAGS_multiread_stride) {
5318 if (++num_keys == entries_per_batch_) {
5319 num_keys = 0;
5320 key_rand = GetRandomKey(&thread->rand);
5321 if ((key_rand + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
5322 FLAGS_num) {
5323 key_rand = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
5324 }
5325 } else {
5326 key_rand += FLAGS_multiread_stride;
5327 }
5328 } else {
5329 key_rand = GetRandomKey(&thread->rand);
5330 }
5331 GenerateKeyFromInt(key_rand, FLAGS_num, &key);
5332 read++;
5333 std::string ts_ret;
5334 std::string* ts_ptr = nullptr;
5335 if (user_timestamp_size_ > 0) {
5336 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
5337 options.timestamp = &ts;
5338 ts_ptr = &ts_ret;
5339 }
5340 Status s;
5341 pinnable_val.Reset();
5342 if (FLAGS_num_column_families > 1) {
5343 s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_rand), key,
5344 &pinnable_val, ts_ptr);
5345 } else {
5346 s = db_with_cfh->db->Get(options,
5347 db_with_cfh->db->DefaultColumnFamily(), key,
5348 &pinnable_val, ts_ptr);
5349 }
5350 if (s.ok()) {
5351 found++;
5352 bytes += key.size() + pinnable_val.size() + user_timestamp_size_;
5353 } else if (!s.IsNotFound()) {
5354 fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
5355 abort();
5356 }
5357
5358 if (thread->shared->read_rate_limiter.get() != nullptr &&
5359 read % 256 == 255) {
5360 thread->shared->read_rate_limiter->Request(
5361 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
5362 }
5363
5364 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
5365 }
5366
5367 char msg[100];
5368 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
5369 found, read);
5370
5371 thread->stats.AddBytes(bytes);
5372 thread->stats.AddMessage(msg);
5373
5374 if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
5375 thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
5376 get_perf_context()->ToString());
5377 }
5378 }
5379
5380 // Calls MultiGet over a list of keys from a random distribution.
5381 // Returns the total number of keys found.
MultiReadRandom(ThreadState * thread)5382 void MultiReadRandom(ThreadState* thread) {
5383 int64_t read = 0;
5384 int64_t num_multireads = 0;
5385 int64_t found = 0;
5386 ReadOptions options(FLAGS_verify_checksum, true);
5387 std::vector<Slice> keys;
5388 std::vector<std::unique_ptr<const char[]> > key_guards;
5389 std::vector<std::string> values(entries_per_batch_);
5390 PinnableSlice* pin_values = new PinnableSlice[entries_per_batch_];
5391 std::unique_ptr<PinnableSlice[]> pin_values_guard(pin_values);
5392 std::vector<Status> stat_list(entries_per_batch_);
5393 while (static_cast<int64_t>(keys.size()) < entries_per_batch_) {
5394 key_guards.push_back(std::unique_ptr<const char[]>());
5395 keys.push_back(AllocateKey(&key_guards.back()));
5396 }
5397
5398 std::unique_ptr<char[]> ts_guard;
5399 if (user_timestamp_size_ > 0) {
5400 ts_guard.reset(new char[user_timestamp_size_]);
5401 }
5402
5403 Duration duration(FLAGS_duration, reads_);
5404 while (!duration.Done(entries_per_batch_)) {
5405 DB* db = SelectDB(thread);
5406 if (FLAGS_multiread_stride) {
5407 int64_t key = GetRandomKey(&thread->rand);
5408 if ((key + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
5409 static_cast<int64_t>(FLAGS_num)) {
5410 key = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
5411 }
5412 for (int64_t i = 0; i < entries_per_batch_; ++i) {
5413 GenerateKeyFromInt(key, FLAGS_num, &keys[i]);
5414 key += FLAGS_multiread_stride;
5415 }
5416 } else {
5417 for (int64_t i = 0; i < entries_per_batch_; ++i) {
5418 GenerateKeyFromInt(GetRandomKey(&thread->rand), FLAGS_num, &keys[i]);
5419 }
5420 }
5421 Slice ts;
5422 if (user_timestamp_size_ > 0) {
5423 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
5424 options.timestamp = &ts;
5425 }
5426 if (!FLAGS_multiread_batched) {
5427 std::vector<Status> statuses = db->MultiGet(options, keys, &values);
5428 assert(static_cast<int64_t>(statuses.size()) == entries_per_batch_);
5429
5430 read += entries_per_batch_;
5431 num_multireads++;
5432 for (int64_t i = 0; i < entries_per_batch_; ++i) {
5433 if (statuses[i].ok()) {
5434 ++found;
5435 } else if (!statuses[i].IsNotFound()) {
5436 fprintf(stderr, "MultiGet returned an error: %s\n",
5437 statuses[i].ToString().c_str());
5438 abort();
5439 }
5440 }
5441 } else {
5442 db->MultiGet(options, db->DefaultColumnFamily(), keys.size(),
5443 keys.data(), pin_values, stat_list.data());
5444
5445 read += entries_per_batch_;
5446 num_multireads++;
5447 for (int64_t i = 0; i < entries_per_batch_; ++i) {
5448 if (stat_list[i].ok()) {
5449 ++found;
5450 } else if (!stat_list[i].IsNotFound()) {
5451 fprintf(stderr, "MultiGet returned an error: %s\n",
5452 stat_list[i].ToString().c_str());
5453 abort();
5454 }
5455 stat_list[i] = Status::OK();
5456 pin_values[i].Reset();
5457 }
5458 }
5459 if (thread->shared->read_rate_limiter.get() != nullptr &&
5460 num_multireads % 256 == 255) {
5461 thread->shared->read_rate_limiter->Request(
5462 256 * entries_per_batch_, Env::IO_HIGH, nullptr /* stats */,
5463 RateLimiter::OpType::kRead);
5464 }
5465 thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kRead);
5466 }
5467
5468 char msg[100];
5469 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)",
5470 found, read);
5471 thread->stats.AddMessage(msg);
5472 }
5473
5474 // Calls ApproximateSize over random key ranges.
ApproximateSizeRandom(ThreadState * thread)5475 void ApproximateSizeRandom(ThreadState* thread) {
5476 int64_t size_sum = 0;
5477 int64_t num_sizes = 0;
5478 const size_t batch_size = entries_per_batch_;
5479 std::vector<Range> ranges;
5480 std::vector<Slice> lkeys;
5481 std::vector<std::unique_ptr<const char[]>> lkey_guards;
5482 std::vector<Slice> rkeys;
5483 std::vector<std::unique_ptr<const char[]>> rkey_guards;
5484 std::vector<uint64_t> sizes;
5485 while (ranges.size() < batch_size) {
5486 // Ugly without C++17 return from emplace_back
5487 lkey_guards.emplace_back();
5488 rkey_guards.emplace_back();
5489 lkeys.emplace_back(AllocateKey(&lkey_guards.back()));
5490 rkeys.emplace_back(AllocateKey(&rkey_guards.back()));
5491 ranges.emplace_back(lkeys.back(), rkeys.back());
5492 sizes.push_back(0);
5493 }
5494 Duration duration(FLAGS_duration, reads_);
5495 while (!duration.Done(1)) {
5496 DB* db = SelectDB(thread);
5497 for (size_t i = 0; i < batch_size; ++i) {
5498 int64_t lkey = GetRandomKey(&thread->rand);
5499 int64_t rkey = GetRandomKey(&thread->rand);
5500 if (lkey > rkey) {
5501 std::swap(lkey, rkey);
5502 }
5503 GenerateKeyFromInt(lkey, FLAGS_num, &lkeys[i]);
5504 GenerateKeyFromInt(rkey, FLAGS_num, &rkeys[i]);
5505 }
5506 db->GetApproximateSizes(&ranges[0], static_cast<int>(entries_per_batch_),
5507 &sizes[0]);
5508 num_sizes += entries_per_batch_;
5509 for (int64_t size : sizes) {
5510 size_sum += size;
5511 }
5512 thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kOthers);
5513 }
5514
5515 char msg[100];
5516 snprintf(msg, sizeof(msg), "(Avg approx size=%g)",
5517 static_cast<double>(size_sum) / static_cast<double>(num_sizes));
5518 thread->stats.AddMessage(msg);
5519 }
5520
// Inverse CDF of the (generalized) Pareto distribution, rounded up to the
// next integer. `u` is the input probability, `theta` the location, `k` the
// shape and `sigma` the scale. When k is exactly 0 the logarithmic form
// theta - sigma * ln(u) is used instead of the power form.
int64_t ParetoCdfInversion(double u, double theta, double k, double sigma) {
  const double inverted =
      (k == 0.0) ? theta - sigma * std::log(u)
                 : theta + sigma * (std::pow(u, -1 * k) - 1) / k;
  return static_cast<int64_t>(std::ceil(inverted));
}
// Inverse of the power distribution y = a * x^b: returns (u / a)^(1 / b),
// rounded up to the next integer.
int64_t PowerCdfInversion(double u, double a, double b) {
  const double inverted = std::pow((u / a), (1 / b));
  return static_cast<int64_t>(std::ceil(inverted));
}
5537
5538 // Add the noice to the QPS
AddNoise(double origin,double noise_ratio)5539 double AddNoise(double origin, double noise_ratio) {
5540 if (noise_ratio < 0.0 || noise_ratio > 1.0) {
5541 return origin;
5542 }
5543 int band_int = static_cast<int>(FLAGS_sine_a);
5544 double delta = (rand() % band_int - band_int / 2) * noise_ratio;
5545 if (origin + delta < 0) {
5546 return origin;
5547 } else {
5548 return (origin + delta);
5549 }
5550 }
5551
5552 // Decide the ratio of different query types
5553 // 0 Get, 1 Put, 2 Seek, 3 SeekForPrev, 4 Delete, 5 SingleDelete, 6 merge
5554 class QueryDecider {
5555 public:
5556 std::vector<int> type_;
5557 std::vector<double> ratio_;
5558 int range_;
5559
QueryDecider()5560 QueryDecider() {}
~QueryDecider()5561 ~QueryDecider() {}
5562
Initiate(std::vector<double> ratio_input)5563 Status Initiate(std::vector<double> ratio_input) {
5564 int range_max = 1000;
5565 double sum = 0.0;
5566 for (auto& ratio : ratio_input) {
5567 sum += ratio;
5568 }
5569 range_ = 0;
5570 for (auto& ratio : ratio_input) {
5571 range_ += static_cast<int>(ceil(range_max * (ratio / sum)));
5572 type_.push_back(range_);
5573 ratio_.push_back(ratio / sum);
5574 }
5575 return Status::OK();
5576 }
5577
GetType(int64_t rand_num)5578 int GetType(int64_t rand_num) {
5579 if (rand_num < 0) {
5580 rand_num = rand_num * (-1);
5581 }
5582 assert(range_ != 0);
5583 int pos = static_cast<int>(rand_num % range_);
5584 for (int i = 0; i < static_cast<int>(type_.size()); i++) {
5585 if (pos < type_[i]) {
5586 return i;
5587 }
5588 }
5589 return 0;
5590 }
5591 };
5592
// KeyrangeUnit is the struct of a keyrange. It is used in a keyrange vector
// to transfer a random value to one keyrange based on the hotness.
struct KeyrangeUnit {
  // Offset at which this key-range's share begins: the running sum of the
  // keyrange_access values of the units that precede it in the vector.
  int64_t keyrange_start;
  // Width of this key-range's share, i.e. floor(amplify * access
  // probability); 0 when the probability is not positive.
  int64_t keyrange_access;
  // Number of keys in this key-range (set to the per-range key count).
  int64_t keyrange_keys;
};
5600
5601 // From our observations, the prefix hotness (key-range hotness) follows
5602 // the two-term-exponential distribution: f(x) = a*exp(b*x) + c*exp(d*x).
5603 // However, we cannot directly use the inverse function to decide a
5604 // key-range from a random distribution. To achieve it, we create a list of
5605 // KeyrangeUnit, each KeyrangeUnit occupies a range of integers whose size is
5606 // decided based on the hotness of the key-range. When a random value is
5607 // generated based on uniform distribution, we map it to the KeyrangeUnit Vec
5608 // and one KeyrangeUnit is selected. The probability of a KeyrangeUnit being
5609 // selected is the same as the hotness of this KeyrangeUnit. After that, the
5610 // key can be randomly allocated to the key-range of this KeyrangeUnit, or we
5611 // can based on the power distribution (y=ax^b) to generate the offset of
5612 // the key in the selected key-range. In this way, we generate the keyID
5613 // based on the hotness of the prefix and also the key hotness distribution.
5614 class GenerateTwoTermExpKeys {
5615 public:
5616 // Avoid uninitialized warning-as-error in some compilers
5617 int64_t keyrange_rand_max_ = 0;
5618 int64_t keyrange_size_ = 0;
5619 int64_t keyrange_num_ = 0;
5620 std::vector<KeyrangeUnit> keyrange_set_;
5621
5622 // Initiate the KeyrangeUnit vector and calculate the size of each
5623 // KeyrangeUnit.
InitiateExpDistribution(int64_t total_keys,double prefix_a,double prefix_b,double prefix_c,double prefix_d)5624 Status InitiateExpDistribution(int64_t total_keys, double prefix_a,
5625 double prefix_b, double prefix_c,
5626 double prefix_d) {
5627 int64_t amplify = 0;
5628 int64_t keyrange_start = 0;
5629 if (FLAGS_keyrange_num <= 0) {
5630 keyrange_num_ = 1;
5631 } else {
5632 keyrange_num_ = FLAGS_keyrange_num;
5633 }
5634 keyrange_size_ = total_keys / keyrange_num_;
5635
5636 // Calculate the key-range shares size based on the input parameters
5637 for (int64_t pfx = keyrange_num_; pfx >= 1; pfx--) {
5638 // Step 1. Calculate the probability that this key range will be
5639 // accessed in a query. It is based on the two-term expoential
5640 // distribution
5641 double keyrange_p = prefix_a * std::exp(prefix_b * pfx) +
5642 prefix_c * std::exp(prefix_d * pfx);
5643 if (keyrange_p < std::pow(10.0, -16.0)) {
5644 keyrange_p = 0.0;
5645 }
5646 // Step 2. Calculate the amplify
5647 // In order to allocate a query to a key-range based on the random
5648 // number generated for this query, we need to extend the probability
5649 // of each key range from [0,1] to [0, amplify]. Amplify is calculated
5650 // by 1/(smallest key-range probability). In this way, we ensure that
5651 // all key-ranges are assigned with an Integer that >=0
5652 if (amplify == 0 && keyrange_p > 0) {
5653 amplify = static_cast<int64_t>(std::floor(1 / keyrange_p)) + 1;
5654 }
5655
5656 // Step 3. For each key-range, we calculate its position in the
5657 // [0, amplify] range, including the start, the size (keyrange_access)
5658 KeyrangeUnit p_unit;
5659 p_unit.keyrange_start = keyrange_start;
5660 if (0.0 >= keyrange_p) {
5661 p_unit.keyrange_access = 0;
5662 } else {
5663 p_unit.keyrange_access =
5664 static_cast<int64_t>(std::floor(amplify * keyrange_p));
5665 }
5666 p_unit.keyrange_keys = keyrange_size_;
5667 keyrange_set_.push_back(p_unit);
5668 keyrange_start += p_unit.keyrange_access;
5669 }
5670 keyrange_rand_max_ = keyrange_start;
5671
5672 // Step 4. Shuffle the key-ranges randomly
5673 // Since the access probability is calculated from small to large,
5674 // If we do not re-allocate them, hot key-ranges are always at the end
5675 // and cold key-ranges are at the begin of the key space. Therefore, the
5676 // key-ranges are shuffled and the rand seed is only decide by the
5677 // key-range hotness distribution. With the same distribution parameters
5678 // the shuffle results are the same.
5679 Random64 rand_loca(keyrange_rand_max_);
5680 for (int64_t i = 0; i < FLAGS_keyrange_num; i++) {
5681 int64_t pos = rand_loca.Next() % FLAGS_keyrange_num;
5682 assert(i >= 0 && i < static_cast<int64_t>(keyrange_set_.size()) &&
5683 pos >= 0 && pos < static_cast<int64_t>(keyrange_set_.size()));
5684 std::swap(keyrange_set_[i], keyrange_set_[pos]);
5685 }
5686
5687 // Step 5. Recalculate the prefix start postion after shuffling
5688 int64_t offset = 0;
5689 for (auto& p_unit : keyrange_set_) {
5690 p_unit.keyrange_start = offset;
5691 offset += p_unit.keyrange_access;
5692 }
5693
5694 return Status::OK();
5695 }
5696
5697 // Generate the Key ID according to the input ini_rand and key distribution
DistGetKeyID(int64_t ini_rand,double key_dist_a,double key_dist_b)5698 int64_t DistGetKeyID(int64_t ini_rand, double key_dist_a,
5699 double key_dist_b) {
5700 int64_t keyrange_rand = ini_rand % keyrange_rand_max_;
5701
5702 // Calculate and select one key-range that contains the new key
5703 int64_t start = 0, end = static_cast<int64_t>(keyrange_set_.size());
5704 while (start + 1 < end) {
5705 int64_t mid = start + (end - start) / 2;
5706 assert(mid >= 0 && mid < static_cast<int64_t>(keyrange_set_.size()));
5707 if (keyrange_rand < keyrange_set_[mid].keyrange_start) {
5708 end = mid;
5709 } else {
5710 start = mid;
5711 }
5712 }
5713 int64_t keyrange_id = start;
5714
5715 // Select one key in the key-range and compose the keyID
5716 int64_t key_offset = 0, key_seed;
5717 if (key_dist_a == 0.0 || key_dist_b == 0.0) {
5718 key_offset = ini_rand % keyrange_size_;
5719 } else {
5720 double u =
5721 static_cast<double>(ini_rand % keyrange_size_) / keyrange_size_;
5722 key_seed = static_cast<int64_t>(
5723 ceil(std::pow((u / key_dist_a), (1 / key_dist_b))));
5724 Random64 rand_key(key_seed);
5725 key_offset = rand_key.Next() % keyrange_size_;
5726 }
5727 return keyrange_size_ * keyrange_id + key_offset;
5728 }
5729 };
5730
5731 // The social graph workload mixed with Get, Put, Iterator queries.
5732 // The value size and iterator length follow Pareto distribution.
5733 // The overall key access follow power distribution. If user models the
5734 // workload based on different key-ranges (or different prefixes), user
5735 // can use two-term-exponential distribution to fit the workload. User
5736 // needs to decide the ratio between Get, Put, Iterator queries before
5737 // starting the benchmark.
MixGraph(ThreadState * thread)5738 void MixGraph(ThreadState* thread) {
5739 int64_t read = 0; // including single gets and Next of iterators
5740 int64_t gets = 0;
5741 int64_t puts = 0;
5742 int64_t found = 0;
5743 int64_t seek = 0;
5744 int64_t seek_found = 0;
5745 int64_t bytes = 0;
5746 const int64_t default_value_max = 1 * 1024 * 1024;
5747 int64_t value_max = default_value_max;
5748 int64_t scan_len_max = FLAGS_mix_max_scan_len;
5749 double write_rate = 1000000.0;
5750 double read_rate = 1000000.0;
5751 bool use_prefix_modeling = false;
5752 bool use_random_modeling = false;
5753 GenerateTwoTermExpKeys gen_exp;
5754 std::vector<double> ratio{FLAGS_mix_get_ratio, FLAGS_mix_put_ratio,
5755 FLAGS_mix_seek_ratio};
5756 char value_buffer[default_value_max];
5757 QueryDecider query;
5758 RandomGenerator gen;
5759 Status s;
5760 if (value_max > FLAGS_mix_max_value_size) {
5761 value_max = FLAGS_mix_max_value_size;
5762 }
5763
5764 ReadOptions options(FLAGS_verify_checksum, true);
5765 std::unique_ptr<const char[]> key_guard;
5766 Slice key = AllocateKey(&key_guard);
5767 PinnableSlice pinnable_val;
5768 query.Initiate(ratio);
5769
5770 // the limit of qps initiation
5771 if (FLAGS_sine_a != 0 || FLAGS_sine_d != 0) {
5772 thread->shared->read_rate_limiter.reset(NewGenericRateLimiter(
5773 static_cast<int64_t>(read_rate), 100000 /* refill_period_us */, 10 /* fairness */,
5774 RateLimiter::Mode::kReadsOnly));
5775 thread->shared->write_rate_limiter.reset(
5776 NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
5777 }
5778
5779 // Decide if user wants to use prefix based key generation
5780 if (FLAGS_keyrange_dist_a != 0.0 || FLAGS_keyrange_dist_b != 0.0 ||
5781 FLAGS_keyrange_dist_c != 0.0 || FLAGS_keyrange_dist_d != 0.0) {
5782 use_prefix_modeling = true;
5783 gen_exp.InitiateExpDistribution(
5784 FLAGS_num, FLAGS_keyrange_dist_a, FLAGS_keyrange_dist_b,
5785 FLAGS_keyrange_dist_c, FLAGS_keyrange_dist_d);
5786 }
5787 if (FLAGS_key_dist_a == 0 || FLAGS_key_dist_b == 0) {
5788 use_random_modeling = true;
5789 }
5790
5791 Duration duration(FLAGS_duration, reads_);
5792 while (!duration.Done(1)) {
5793 DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
5794 int64_t ini_rand, rand_v, key_rand, key_seed;
5795 ini_rand = GetRandomKey(&thread->rand);
5796 rand_v = ini_rand % FLAGS_num;
5797 double u = static_cast<double>(rand_v) / FLAGS_num;
5798
5799 // Generate the keyID based on the key hotness and prefix hotness
5800 if (use_random_modeling) {
5801 key_rand = ini_rand;
5802 } else if (use_prefix_modeling) {
5803 key_rand =
5804 gen_exp.DistGetKeyID(ini_rand, FLAGS_key_dist_a, FLAGS_key_dist_b);
5805 } else {
5806 key_seed = PowerCdfInversion(u, FLAGS_key_dist_a, FLAGS_key_dist_b);
5807 Random64 rand(key_seed);
5808 key_rand = static_cast<int64_t>(rand.Next()) % FLAGS_num;
5809 }
5810 GenerateKeyFromInt(key_rand, FLAGS_num, &key);
5811 int query_type = query.GetType(rand_v);
5812
5813 // change the qps
5814 uint64_t now = FLAGS_env->NowMicros();
5815 uint64_t usecs_since_last;
5816 if (now > thread->stats.GetSineInterval()) {
5817 usecs_since_last = now - thread->stats.GetSineInterval();
5818 } else {
5819 usecs_since_last = 0;
5820 }
5821
5822 if (usecs_since_last >
5823 (FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000})) {
5824 double usecs_since_start =
5825 static_cast<double>(now - thread->stats.GetStart());
5826 thread->stats.ResetSineInterval();
5827 double mix_rate_with_noise = AddNoise(
5828 SineRate(usecs_since_start / 1000000.0), FLAGS_sine_mix_rate_noise);
5829 read_rate = mix_rate_with_noise * (query.ratio_[0] + query.ratio_[2]);
5830 write_rate =
5831 mix_rate_with_noise * query.ratio_[1] * FLAGS_mix_ave_kv_size;
5832
5833 thread->shared->write_rate_limiter.reset(
5834 NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
5835 thread->shared->read_rate_limiter.reset(NewGenericRateLimiter(
5836 static_cast<int64_t>(read_rate),
5837 FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000}, 10,
5838 RateLimiter::Mode::kReadsOnly));
5839 }
5840 // Start the query
5841 if (query_type == 0) {
5842 // the Get query
5843 gets++;
5844 read++;
5845 if (FLAGS_num_column_families > 1) {
5846 s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_rand), key,
5847 &pinnable_val);
5848 } else {
5849 pinnable_val.Reset();
5850 s = db_with_cfh->db->Get(options,
5851 db_with_cfh->db->DefaultColumnFamily(), key,
5852 &pinnable_val);
5853 }
5854
5855 if (s.ok()) {
5856 found++;
5857 bytes += key.size() + pinnable_val.size();
5858 } else if (!s.IsNotFound()) {
5859 fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
5860 abort();
5861 }
5862
5863 if (thread->shared->read_rate_limiter.get() != nullptr &&
5864 read % 256 == 255) {
5865 thread->shared->read_rate_limiter->Request(
5866 256, Env::IO_HIGH, nullptr /* stats */,
5867 RateLimiter::OpType::kRead);
5868 }
5869 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
5870 } else if (query_type == 1) {
5871 // the Put query
5872 puts++;
5873 int64_t val_size = ParetoCdfInversion(
5874 u, FLAGS_value_theta, FLAGS_value_k, FLAGS_value_sigma);
5875 if (val_size < 0) {
5876 val_size = 10;
5877 } else if (val_size > value_max) {
5878 val_size = val_size % value_max;
5879 }
5880 s = db_with_cfh->db->Put(
5881 write_options_, key,
5882 gen.Generate(static_cast<unsigned int>(val_size)));
5883 if (!s.ok()) {
5884 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
5885 ErrorExit();
5886 }
5887
5888 if (thread->shared->write_rate_limiter) {
5889 thread->shared->write_rate_limiter->Request(
5890 key.size() + val_size, Env::IO_HIGH, nullptr /*stats*/,
5891 RateLimiter::OpType::kWrite);
5892 }
5893 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kWrite);
5894 } else if (query_type == 2) {
5895 // Seek query
5896 if (db_with_cfh->db != nullptr) {
5897 Iterator* single_iter = nullptr;
5898 single_iter = db_with_cfh->db->NewIterator(options);
5899 if (single_iter != nullptr) {
5900 single_iter->Seek(key);
5901 seek++;
5902 read++;
5903 if (single_iter->Valid() && single_iter->key().compare(key) == 0) {
5904 seek_found++;
5905 }
5906 int64_t scan_length =
5907 ParetoCdfInversion(u, FLAGS_iter_theta, FLAGS_iter_k,
5908 FLAGS_iter_sigma) %
5909 scan_len_max;
5910 for (int64_t j = 0; j < scan_length && single_iter->Valid(); j++) {
5911 Slice value = single_iter->value();
5912 memcpy(value_buffer, value.data(),
5913 std::min(value.size(), sizeof(value_buffer)));
5914 bytes += single_iter->key().size() + single_iter->value().size();
5915 single_iter->Next();
5916 assert(single_iter->status().ok());
5917 }
5918 }
5919 delete single_iter;
5920 }
5921 thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kSeek);
5922 }
5923 }
5924 char msg[256];
5925 snprintf(msg, sizeof(msg),
5926 "( Gets:%" PRIu64 " Puts:%" PRIu64 " Seek:%" PRIu64 " of %" PRIu64
5927 " in %" PRIu64 " found)\n",
5928 gets, puts, seek, found, read);
5929
5930 thread->stats.AddBytes(bytes);
5931 thread->stats.AddMessage(msg);
5932
5933 if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
5934 thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
5935 get_perf_context()->ToString());
5936 }
5937 }
5938
IteratorCreation(ThreadState * thread)5939 void IteratorCreation(ThreadState* thread) {
5940 Duration duration(FLAGS_duration, reads_);
5941 ReadOptions options(FLAGS_verify_checksum, true);
5942 std::unique_ptr<char[]> ts_guard;
5943 if (user_timestamp_size_ > 0) {
5944 ts_guard.reset(new char[user_timestamp_size_]);
5945 }
5946 while (!duration.Done(1)) {
5947 DB* db = SelectDB(thread);
5948 Slice ts;
5949 if (user_timestamp_size_ > 0) {
5950 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
5951 options.timestamp = &ts;
5952 }
5953 Iterator* iter = db->NewIterator(options);
5954 delete iter;
5955 thread->stats.FinishedOps(nullptr, db, 1, kOthers);
5956 }
5957 }
5958
IteratorCreationWhileWriting(ThreadState * thread)5959 void IteratorCreationWhileWriting(ThreadState* thread) {
5960 if (thread->tid > 0) {
5961 IteratorCreation(thread);
5962 } else {
5963 BGWriter(thread, kWrite);
5964 }
5965 }
5966
SeekRandom(ThreadState * thread)5967 void SeekRandom(ThreadState* thread) {
5968 int64_t read = 0;
5969 int64_t found = 0;
5970 int64_t bytes = 0;
5971 ReadOptions options(FLAGS_verify_checksum, true);
5972 options.total_order_seek = FLAGS_total_order_seek;
5973 options.prefix_same_as_start = FLAGS_prefix_same_as_start;
5974 options.tailing = FLAGS_use_tailing_iterator;
5975 options.readahead_size = FLAGS_readahead_size;
5976 std::unique_ptr<char[]> ts_guard;
5977 Slice ts;
5978 if (user_timestamp_size_ > 0) {
5979 ts_guard.reset(new char[user_timestamp_size_]);
5980 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
5981 options.timestamp = &ts;
5982 }
5983
5984 Iterator* single_iter = nullptr;
5985 std::vector<Iterator*> multi_iters;
5986 if (db_.db != nullptr) {
5987 single_iter = db_.db->NewIterator(options);
5988 } else {
5989 for (const auto& db_with_cfh : multi_dbs_) {
5990 multi_iters.push_back(db_with_cfh.db->NewIterator(options));
5991 }
5992 }
5993
5994 std::unique_ptr<const char[]> key_guard;
5995 Slice key = AllocateKey(&key_guard);
5996
5997 std::unique_ptr<const char[]> upper_bound_key_guard;
5998 Slice upper_bound = AllocateKey(&upper_bound_key_guard);
5999 std::unique_ptr<const char[]> lower_bound_key_guard;
6000 Slice lower_bound = AllocateKey(&lower_bound_key_guard);
6001
6002 Duration duration(FLAGS_duration, reads_);
6003 char value_buffer[256];
6004 while (!duration.Done(1)) {
6005 int64_t seek_pos = thread->rand.Next() % FLAGS_num;
6006 GenerateKeyFromIntForSeek(static_cast<uint64_t>(seek_pos), FLAGS_num,
6007 &key);
6008 if (FLAGS_max_scan_distance != 0) {
6009 if (FLAGS_reverse_iterator) {
6010 GenerateKeyFromInt(
6011 static_cast<uint64_t>(std::max(
6012 static_cast<int64_t>(0), seek_pos - FLAGS_max_scan_distance)),
6013 FLAGS_num, &lower_bound);
6014 options.iterate_lower_bound = &lower_bound;
6015 } else {
6016 auto min_num =
6017 std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
6018 GenerateKeyFromInt(static_cast<uint64_t>(min_num), FLAGS_num,
6019 &upper_bound);
6020 options.iterate_upper_bound = &upper_bound;
6021 }
6022 }
6023
6024 if (!FLAGS_use_tailing_iterator) {
6025 if (db_.db != nullptr) {
6026 delete single_iter;
6027 single_iter = db_.db->NewIterator(options);
6028 } else {
6029 for (auto iter : multi_iters) {
6030 delete iter;
6031 }
6032 multi_iters.clear();
6033 for (const auto& db_with_cfh : multi_dbs_) {
6034 multi_iters.push_back(db_with_cfh.db->NewIterator(options));
6035 }
6036 }
6037 }
6038 // Pick a Iterator to use
6039 Iterator* iter_to_use = single_iter;
6040 if (single_iter == nullptr) {
6041 iter_to_use = multi_iters[thread->rand.Next() % multi_iters.size()];
6042 }
6043
6044 iter_to_use->Seek(key);
6045 read++;
6046 if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
6047 found++;
6048 }
6049
6050 for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) {
6051 // Copy out iterator's value to make sure we read them.
6052 Slice value = iter_to_use->value();
6053 memcpy(value_buffer, value.data(),
6054 std::min(value.size(), sizeof(value_buffer)));
6055 bytes += iter_to_use->key().size() + iter_to_use->value().size();
6056
6057 if (!FLAGS_reverse_iterator) {
6058 iter_to_use->Next();
6059 } else {
6060 iter_to_use->Prev();
6061 }
6062 assert(iter_to_use->status().ok());
6063 }
6064
6065 if (thread->shared->read_rate_limiter.get() != nullptr &&
6066 read % 256 == 255) {
6067 thread->shared->read_rate_limiter->Request(
6068 256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
6069 }
6070
6071 thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
6072 }
6073 delete single_iter;
6074 for (auto iter : multi_iters) {
6075 delete iter;
6076 }
6077
6078 char msg[100];
6079 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
6080 found, read);
6081 thread->stats.AddBytes(bytes);
6082 thread->stats.AddMessage(msg);
6083 if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
6084 thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
6085 get_perf_context()->ToString());
6086 }
6087 }
6088
SeekRandomWhileWriting(ThreadState * thread)6089 void SeekRandomWhileWriting(ThreadState* thread) {
6090 if (thread->tid > 0) {
6091 SeekRandom(thread);
6092 } else {
6093 BGWriter(thread, kWrite);
6094 }
6095 }
6096
SeekRandomWhileMerging(ThreadState * thread)6097 void SeekRandomWhileMerging(ThreadState* thread) {
6098 if (thread->tid > 0) {
6099 SeekRandom(thread);
6100 } else {
6101 BGWriter(thread, kMerge);
6102 }
6103 }
6104
DoDelete(ThreadState * thread,bool seq)6105 void DoDelete(ThreadState* thread, bool seq) {
6106 WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
6107 user_timestamp_size_);
6108 Duration duration(seq ? 0 : FLAGS_duration, deletes_);
6109 int64_t i = 0;
6110 std::unique_ptr<const char[]> key_guard;
6111 Slice key = AllocateKey(&key_guard);
6112 std::unique_ptr<char[]> ts_guard;
6113 Slice ts;
6114 if (user_timestamp_size_ > 0) {
6115 ts_guard.reset(new char[user_timestamp_size_]);
6116 }
6117
6118 while (!duration.Done(entries_per_batch_)) {
6119 DB* db = SelectDB(thread);
6120 batch.Clear();
6121 for (int64_t j = 0; j < entries_per_batch_; ++j) {
6122 const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
6123 GenerateKeyFromInt(k, FLAGS_num, &key);
6124 batch.Delete(key);
6125 }
6126 Status s;
6127 if (user_timestamp_size_ > 0) {
6128 ts = mock_app_clock_->Allocate(ts_guard.get());
6129 s = batch.AssignTimestamp(ts);
6130 if (!s.ok()) {
6131 fprintf(stderr, "assign timestamp: %s\n", s.ToString().c_str());
6132 ErrorExit();
6133 }
6134 }
6135 s = db->Write(write_options_, &batch);
6136 thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kDelete);
6137 if (!s.ok()) {
6138 fprintf(stderr, "del error: %s\n", s.ToString().c_str());
6139 exit(1);
6140 }
6141 i += entries_per_batch_;
6142 }
6143 }
6144
DeleteSeq(ThreadState * thread)6145 void DeleteSeq(ThreadState* thread) {
6146 DoDelete(thread, true);
6147 }
6148
DeleteRandom(ThreadState * thread)6149 void DeleteRandom(ThreadState* thread) {
6150 DoDelete(thread, false);
6151 }
6152
ReadWhileWriting(ThreadState * thread)6153 void ReadWhileWriting(ThreadState* thread) {
6154 if (thread->tid > 0) {
6155 ReadRandom(thread);
6156 } else {
6157 BGWriter(thread, kWrite);
6158 }
6159 }
6160
ReadWhileMerging(ThreadState * thread)6161 void ReadWhileMerging(ThreadState* thread) {
6162 if (thread->tid > 0) {
6163 ReadRandom(thread);
6164 } else {
6165 BGWriter(thread, kMerge);
6166 }
6167 }
6168
BGWriter(ThreadState * thread,enum OperationType write_merge)6169 void BGWriter(ThreadState* thread, enum OperationType write_merge) {
6170 // Special thread that keeps writing until other threads are done.
6171 RandomGenerator gen;
6172 int64_t bytes = 0;
6173
6174 std::unique_ptr<RateLimiter> write_rate_limiter;
6175 if (FLAGS_benchmark_write_rate_limit > 0) {
6176 write_rate_limiter.reset(
6177 NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
6178 }
6179
6180 // Don't merge stats from this thread with the readers.
6181 thread->stats.SetExcludeFromMerge();
6182
6183 std::unique_ptr<const char[]> key_guard;
6184 Slice key = AllocateKey(&key_guard);
6185 std::unique_ptr<char[]> ts_guard;
6186 if (user_timestamp_size_ > 0) {
6187 ts_guard.reset(new char[user_timestamp_size_]);
6188 }
6189 uint32_t written = 0;
6190 bool hint_printed = false;
6191
6192 while (true) {
6193 DB* db = SelectDB(thread);
6194 {
6195 MutexLock l(&thread->shared->mu);
6196 if (FLAGS_finish_after_writes && written == writes_) {
6197 fprintf(stderr, "Exiting the writer after %u writes...\n", written);
6198 break;
6199 }
6200 if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
6201 // Other threads have finished
6202 if (FLAGS_finish_after_writes) {
6203 // Wait for the writes to be finished
6204 if (!hint_printed) {
6205 fprintf(stderr, "Reads are finished. Have %d more writes to do\n",
6206 static_cast<int>(writes_) - written);
6207 hint_printed = true;
6208 }
6209 } else {
6210 // Finish the write immediately
6211 break;
6212 }
6213 }
6214 }
6215
6216 GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
6217 Status s;
6218
6219 Slice val = gen.Generate();
6220 Slice ts;
6221 if (user_timestamp_size_ > 0) {
6222 ts = mock_app_clock_->Allocate(ts_guard.get());
6223 write_options_.timestamp = &ts;
6224 }
6225 if (write_merge == kWrite) {
6226 s = db->Put(write_options_, key, val);
6227 } else {
6228 s = db->Merge(write_options_, key, val);
6229 }
6230 // Restore write_options_
6231 if (user_timestamp_size_ > 0) {
6232 write_options_.timestamp = nullptr;
6233 }
6234 written++;
6235
6236 if (!s.ok()) {
6237 fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
6238 exit(1);
6239 }
6240 bytes += key.size() + val.size() + user_timestamp_size_;
6241 thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
6242
6243 if (FLAGS_benchmark_write_rate_limit > 0) {
6244 write_rate_limiter->Request(
6245 key.size() + val.size(), Env::IO_HIGH,
6246 nullptr /* stats */, RateLimiter::OpType::kWrite);
6247 }
6248 }
6249 thread->stats.AddBytes(bytes);
6250 }
6251
ReadWhileScanning(ThreadState * thread)6252 void ReadWhileScanning(ThreadState* thread) {
6253 if (thread->tid > 0) {
6254 ReadRandom(thread);
6255 } else {
6256 BGScan(thread);
6257 }
6258 }
6259
BGScan(ThreadState * thread)6260 void BGScan(ThreadState* thread) {
6261 if (FLAGS_num_multi_db > 0) {
6262 fprintf(stderr, "Not supporting multiple DBs.\n");
6263 abort();
6264 }
6265 assert(db_.db != nullptr);
6266 ReadOptions read_options;
6267 std::unique_ptr<char[]> ts_guard;
6268 Slice ts;
6269 if (user_timestamp_size_ > 0) {
6270 ts_guard.reset(new char[user_timestamp_size_]);
6271 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
6272 read_options.timestamp = &ts;
6273 }
6274 Iterator* iter = db_.db->NewIterator(read_options);
6275
6276 fprintf(stderr, "num reads to do %" PRIu64 "\n", reads_);
6277 Duration duration(FLAGS_duration, reads_);
6278 uint64_t num_seek_to_first = 0;
6279 uint64_t num_next = 0;
6280 while (!duration.Done(1)) {
6281 if (!iter->Valid()) {
6282 iter->SeekToFirst();
6283 num_seek_to_first++;
6284 } else if (!iter->status().ok()) {
6285 fprintf(stderr, "Iterator error: %s\n",
6286 iter->status().ToString().c_str());
6287 abort();
6288 } else {
6289 iter->Next();
6290 num_next++;
6291 }
6292
6293 thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
6294 }
6295 delete iter;
6296 }
6297
6298 // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
6299 // in DB atomically i.e in a single batch. Also refer GetMany.
PutMany(DB * db,const WriteOptions & writeoptions,const Slice & key,const Slice & value)6300 Status PutMany(DB* db, const WriteOptions& writeoptions, const Slice& key,
6301 const Slice& value) {
6302 std::string suffixes[3] = {"2", "1", "0"};
6303 std::string keys[3];
6304
6305 WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
6306 user_timestamp_size_);
6307 Status s;
6308 for (int i = 0; i < 3; i++) {
6309 keys[i] = key.ToString() + suffixes[i];
6310 batch.Put(keys[i], value);
6311 }
6312
6313 std::unique_ptr<char[]> ts_guard;
6314 if (user_timestamp_size_ > 0) {
6315 ts_guard.reset(new char[user_timestamp_size_]);
6316 Slice ts = mock_app_clock_->Allocate(ts_guard.get());
6317 s = batch.AssignTimestamp(ts);
6318 if (!s.ok()) {
6319 fprintf(stderr, "assign timestamp to batch: %s\n",
6320 s.ToString().c_str());
6321 ErrorExit();
6322 }
6323 }
6324
6325 s = db->Write(writeoptions, &batch);
6326 return s;
6327 }
6328
6329
6330 // Given a key K, this deletes (K+"0", V), (K+"1", V), (K+"2", V)
6331 // in DB atomically i.e in a single batch. Also refer GetMany.
DeleteMany(DB * db,const WriteOptions & writeoptions,const Slice & key)6332 Status DeleteMany(DB* db, const WriteOptions& writeoptions,
6333 const Slice& key) {
6334 std::string suffixes[3] = {"1", "2", "0"};
6335 std::string keys[3];
6336
6337 WriteBatch batch(0, 0, user_timestamp_size_);
6338 Status s;
6339 for (int i = 0; i < 3; i++) {
6340 keys[i] = key.ToString() + suffixes[i];
6341 batch.Delete(keys[i]);
6342 }
6343
6344 std::unique_ptr<char[]> ts_guard;
6345 if (user_timestamp_size_ > 0) {
6346 ts_guard.reset(new char[user_timestamp_size_]);
6347 Slice ts = mock_app_clock_->Allocate(ts_guard.get());
6348 s = batch.AssignTimestamp(ts);
6349 if (!s.ok()) {
6350 fprintf(stderr, "assign timestamp to batch: %s\n",
6351 s.ToString().c_str());
6352 ErrorExit();
6353 }
6354 }
6355
6356 s = db->Write(writeoptions, &batch);
6357 return s;
6358 }
6359
6360 // Given a key K and value V, this gets values for K+"0", K+"1" and K+"2"
6361 // in the same snapshot, and verifies that all the values are identical.
6362 // ASSUMES that PutMany was used to put (K, V) into the DB.
GetMany(DB * db,const ReadOptions & readoptions,const Slice & key,std::string * value)6363 Status GetMany(DB* db, const ReadOptions& readoptions, const Slice& key,
6364 std::string* value) {
6365 std::string suffixes[3] = {"0", "1", "2"};
6366 std::string keys[3];
6367 Slice key_slices[3];
6368 std::string values[3];
6369 ReadOptions readoptionscopy = readoptions;
6370
6371 std::unique_ptr<char[]> ts_guard;
6372 Slice ts;
6373 if (user_timestamp_size_ > 0) {
6374 ts_guard.reset(new char[user_timestamp_size_]);
6375 ts = mock_app_clock_->Allocate(ts_guard.get());
6376 readoptionscopy.timestamp = &ts;
6377 }
6378
6379 readoptionscopy.snapshot = db->GetSnapshot();
6380 Status s;
6381 for (int i = 0; i < 3; i++) {
6382 keys[i] = key.ToString() + suffixes[i];
6383 key_slices[i] = keys[i];
6384 s = db->Get(readoptionscopy, key_slices[i], value);
6385 if (!s.ok() && !s.IsNotFound()) {
6386 fprintf(stderr, "get error: %s\n", s.ToString().c_str());
6387 values[i] = "";
6388 // we continue after error rather than exiting so that we can
6389 // find more errors if any
6390 } else if (s.IsNotFound()) {
6391 values[i] = "";
6392 } else {
6393 values[i] = *value;
6394 }
6395 }
6396 db->ReleaseSnapshot(readoptionscopy.snapshot);
6397
6398 if ((values[0] != values[1]) || (values[1] != values[2])) {
6399 fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n",
6400 key.ToString().c_str(), values[0].c_str(), values[1].c_str(),
6401 values[2].c_str());
6402 // we continue after error rather than exiting so that we can
6403 // find more errors if any
6404 }
6405
6406 return s;
6407 }
6408
6409 // Differs from readrandomwriterandom in the following ways:
6410 // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs.
6411 // (b) Does deletes as well (per FLAGS_deletepercent)
6412 // (c) In order to achieve high % of 'found' during lookups, and to do
6413 // multiple writes (including puts and deletes) it uses upto
6414 // FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys.
6415 // (d) Does not have a MultiGet option.
RandomWithVerify(ThreadState * thread)6416 void RandomWithVerify(ThreadState* thread) {
6417 ReadOptions options(FLAGS_verify_checksum, true);
6418 RandomGenerator gen;
6419 std::string value;
6420 int64_t found = 0;
6421 int get_weight = 0;
6422 int put_weight = 0;
6423 int delete_weight = 0;
6424 int64_t gets_done = 0;
6425 int64_t puts_done = 0;
6426 int64_t deletes_done = 0;
6427
6428 std::unique_ptr<const char[]> key_guard;
6429 Slice key = AllocateKey(&key_guard);
6430
6431 // the number of iterations is the larger of read_ or write_
6432 for (int64_t i = 0; i < readwrites_; i++) {
6433 DB* db = SelectDB(thread);
6434 if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
6435 // one batch completed, reinitialize for next batch
6436 get_weight = FLAGS_readwritepercent;
6437 delete_weight = FLAGS_deletepercent;
6438 put_weight = 100 - get_weight - delete_weight;
6439 }
6440 GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
6441 FLAGS_numdistinct, &key);
6442 if (get_weight > 0) {
6443 // do all the gets first
6444 Status s = GetMany(db, options, key, &value);
6445 if (!s.ok() && !s.IsNotFound()) {
6446 fprintf(stderr, "getmany error: %s\n", s.ToString().c_str());
6447 // we continue after error rather than exiting so that we can
6448 // find more errors if any
6449 } else if (!s.IsNotFound()) {
6450 found++;
6451 }
6452 get_weight--;
6453 gets_done++;
6454 thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
6455 } else if (put_weight > 0) {
6456 // then do all the corresponding number of puts
6457 // for all the gets we have done earlier
6458 Status s = PutMany(db, write_options_, key, gen.Generate());
6459 if (!s.ok()) {
6460 fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
6461 exit(1);
6462 }
6463 put_weight--;
6464 puts_done++;
6465 thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
6466 } else if (delete_weight > 0) {
6467 Status s = DeleteMany(db, write_options_, key);
6468 if (!s.ok()) {
6469 fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
6470 exit(1);
6471 }
6472 delete_weight--;
6473 deletes_done++;
6474 thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
6475 }
6476 }
6477 char msg[128];
6478 snprintf(msg, sizeof(msg),
6479 "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" \
6480 PRIu64 " found:%" PRIu64 ")",
6481 gets_done, puts_done, deletes_done, readwrites_, found);
6482 thread->stats.AddMessage(msg);
6483 }
6484
6485 // This is different from ReadWhileWriting because it does not use
6486 // an extra thread.
ReadRandomWriteRandom(ThreadState * thread)6487 void ReadRandomWriteRandom(ThreadState* thread) {
6488 ReadOptions options(FLAGS_verify_checksum, true);
6489 RandomGenerator gen;
6490 std::string value;
6491 int64_t found = 0;
6492 int get_weight = 0;
6493 int put_weight = 0;
6494 int64_t reads_done = 0;
6495 int64_t writes_done = 0;
6496 Duration duration(FLAGS_duration, readwrites_);
6497
6498 std::unique_ptr<const char[]> key_guard;
6499 Slice key = AllocateKey(&key_guard);
6500
6501 std::unique_ptr<char[]> ts_guard;
6502 if (user_timestamp_size_ > 0) {
6503 ts_guard.reset(new char[user_timestamp_size_]);
6504 }
6505
6506 // the number of iterations is the larger of read_ or write_
6507 while (!duration.Done(1)) {
6508 DB* db = SelectDB(thread);
6509 GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
6510 if (get_weight == 0 && put_weight == 0) {
6511 // one batch completed, reinitialize for next batch
6512 get_weight = FLAGS_readwritepercent;
6513 put_weight = 100 - get_weight;
6514 }
6515 if (get_weight > 0) {
6516 // do all the gets first
6517 Slice ts;
6518 if (user_timestamp_size_ > 0) {
6519 ts = mock_app_clock_->GetTimestampForRead(thread->rand,
6520 ts_guard.get());
6521 options.timestamp = &ts;
6522 }
6523 Status s = db->Get(options, key, &value);
6524 if (!s.ok() && !s.IsNotFound()) {
6525 fprintf(stderr, "get error: %s\n", s.ToString().c_str());
6526 // we continue after error rather than exiting so that we can
6527 // find more errors if any
6528 } else if (!s.IsNotFound()) {
6529 found++;
6530 }
6531 get_weight--;
6532 reads_done++;
6533 thread->stats.FinishedOps(nullptr, db, 1, kRead);
6534 } else if (put_weight > 0) {
6535 // then do all the corresponding number of puts
6536 // for all the gets we have done earlier
6537 Slice ts;
6538 if (user_timestamp_size_ > 0) {
6539 ts = mock_app_clock_->Allocate(ts_guard.get());
6540 write_options_.timestamp = &ts;
6541 }
6542 Status s = db->Put(write_options_, key, gen.Generate());
6543 if (!s.ok()) {
6544 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
6545 ErrorExit();
6546 }
6547 put_weight--;
6548 writes_done++;
6549 thread->stats.FinishedOps(nullptr, db, 1, kWrite);
6550 }
6551 }
6552 char msg[100];
6553 snprintf(msg, sizeof(msg), "( reads:%" PRIu64 " writes:%" PRIu64 \
6554 " total:%" PRIu64 " found:%" PRIu64 ")",
6555 reads_done, writes_done, readwrites_, found);
6556 thread->stats.AddMessage(msg);
6557 }
6558
6559 //
6560 // Read-modify-write for random keys
UpdateRandom(ThreadState * thread)6561 void UpdateRandom(ThreadState* thread) {
6562 ReadOptions options(FLAGS_verify_checksum, true);
6563 RandomGenerator gen;
6564 std::string value;
6565 int64_t found = 0;
6566 int64_t bytes = 0;
6567 Duration duration(FLAGS_duration, readwrites_);
6568
6569 std::unique_ptr<const char[]> key_guard;
6570 Slice key = AllocateKey(&key_guard);
6571 std::unique_ptr<char[]> ts_guard;
6572 if (user_timestamp_size_ > 0) {
6573 ts_guard.reset(new char[user_timestamp_size_]);
6574 }
6575 // the number of iterations is the larger of read_ or write_
6576 while (!duration.Done(1)) {
6577 DB* db = SelectDB(thread);
6578 GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
6579 Slice ts;
6580 if (user_timestamp_size_ > 0) {
6581 // Read with newest timestamp because we are doing rmw.
6582 ts = mock_app_clock_->Allocate(ts_guard.get());
6583 options.timestamp = &ts;
6584 }
6585
6586 auto status = db->Get(options, key, &value);
6587 if (status.ok()) {
6588 ++found;
6589 bytes += key.size() + value.size() + user_timestamp_size_;
6590 } else if (!status.IsNotFound()) {
6591 fprintf(stderr, "Get returned an error: %s\n",
6592 status.ToString().c_str());
6593 abort();
6594 }
6595
6596 if (thread->shared->write_rate_limiter) {
6597 thread->shared->write_rate_limiter->Request(
6598 key.size() + value.size(), Env::IO_HIGH, nullptr /*stats*/,
6599 RateLimiter::OpType::kWrite);
6600 }
6601
6602 Slice val = gen.Generate();
6603 if (user_timestamp_size_ > 0) {
6604 ts = mock_app_clock_->Allocate(ts_guard.get());
6605 write_options_.timestamp = &ts;
6606 }
6607 Status s = db->Put(write_options_, key, val);
6608 if (!s.ok()) {
6609 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
6610 exit(1);
6611 }
6612 bytes += key.size() + val.size() + user_timestamp_size_;
6613 thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
6614 }
6615 char msg[100];
6616 snprintf(msg, sizeof(msg),
6617 "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found);
6618 thread->stats.AddBytes(bytes);
6619 thread->stats.AddMessage(msg);
6620 }
6621
6622 // Read-XOR-write for random keys. Xors the existing value with a randomly
6623 // generated value, and stores the result. Assuming A in the array of bytes
6624 // representing the existing value, we generate an array B of the same size,
6625 // then compute C = A^B as C[i]=A[i]^B[i], and store C
XORUpdateRandom(ThreadState * thread)6626 void XORUpdateRandom(ThreadState* thread) {
6627 ReadOptions options(FLAGS_verify_checksum, true);
6628 RandomGenerator gen;
6629 std::string existing_value;
6630 int64_t found = 0;
6631 Duration duration(FLAGS_duration, readwrites_);
6632
6633 BytesXOROperator xor_operator;
6634
6635 std::unique_ptr<const char[]> key_guard;
6636 Slice key = AllocateKey(&key_guard);
6637 std::unique_ptr<char[]> ts_guard;
6638 if (user_timestamp_size_ > 0) {
6639 ts_guard.reset(new char[user_timestamp_size_]);
6640 }
6641 // the number of iterations is the larger of read_ or write_
6642 while (!duration.Done(1)) {
6643 DB* db = SelectDB(thread);
6644 GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
6645 Slice ts;
6646 if (user_timestamp_size_ > 0) {
6647 ts = mock_app_clock_->Allocate(ts_guard.get());
6648 options.timestamp = &ts;
6649 }
6650
6651 auto status = db->Get(options, key, &existing_value);
6652 if (status.ok()) {
6653 ++found;
6654 } else if (!status.IsNotFound()) {
6655 fprintf(stderr, "Get returned an error: %s\n",
6656 status.ToString().c_str());
6657 exit(1);
6658 }
6659
6660 Slice value = gen.Generate(static_cast<unsigned int>(existing_value.size()));
6661 std::string new_value;
6662
6663 if (status.ok()) {
6664 Slice existing_value_slice = Slice(existing_value);
6665 xor_operator.XOR(&existing_value_slice, value, &new_value);
6666 } else {
6667 xor_operator.XOR(nullptr, value, &new_value);
6668 }
6669
6670 if (user_timestamp_size_ > 0) {
6671 ts = mock_app_clock_->Allocate(ts_guard.get());
6672 write_options_.timestamp = &ts;
6673 }
6674
6675 Status s = db->Put(write_options_, key, Slice(new_value));
6676 if (!s.ok()) {
6677 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
6678 ErrorExit();
6679 }
6680 thread->stats.FinishedOps(nullptr, db, 1);
6681 }
6682 char msg[100];
6683 snprintf(msg, sizeof(msg),
6684 "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found);
6685 thread->stats.AddMessage(msg);
6686 }
6687
6688 // Read-modify-write for random keys.
6689 // Each operation causes the key grow by value_size (simulating an append).
6690 // Generally used for benchmarking against merges of similar type
AppendRandom(ThreadState * thread)6691 void AppendRandom(ThreadState* thread) {
6692 ReadOptions options(FLAGS_verify_checksum, true);
6693 RandomGenerator gen;
6694 std::string value;
6695 int64_t found = 0;
6696 int64_t bytes = 0;
6697
6698 std::unique_ptr<const char[]> key_guard;
6699 Slice key = AllocateKey(&key_guard);
6700 std::unique_ptr<char[]> ts_guard;
6701 if (user_timestamp_size_ > 0) {
6702 ts_guard.reset(new char[user_timestamp_size_]);
6703 }
6704 // The number of iterations is the larger of read_ or write_
6705 Duration duration(FLAGS_duration, readwrites_);
6706 while (!duration.Done(1)) {
6707 DB* db = SelectDB(thread);
6708 GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
6709 Slice ts;
6710 if (user_timestamp_size_ > 0) {
6711 ts = mock_app_clock_->Allocate(ts_guard.get());
6712 options.timestamp = &ts;
6713 }
6714
6715 auto status = db->Get(options, key, &value);
6716 if (status.ok()) {
6717 ++found;
6718 bytes += key.size() + value.size() + user_timestamp_size_;
6719 } else if (!status.IsNotFound()) {
6720 fprintf(stderr, "Get returned an error: %s\n",
6721 status.ToString().c_str());
6722 abort();
6723 } else {
6724 // If not existing, then just assume an empty string of data
6725 value.clear();
6726 }
6727
6728 // Update the value (by appending data)
6729 Slice operand = gen.Generate();
6730 if (value.size() > 0) {
6731 // Use a delimiter to match the semantics for StringAppendOperator
6732 value.append(1,',');
6733 }
6734 value.append(operand.data(), operand.size());
6735
6736 if (user_timestamp_size_ > 0) {
6737 ts = mock_app_clock_->Allocate(ts_guard.get());
6738 write_options_.timestamp = &ts;
6739 }
6740
6741 // Write back to the database
6742 Status s = db->Put(write_options_, key, value);
6743 if (!s.ok()) {
6744 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
6745 ErrorExit();
6746 }
6747 bytes += key.size() + value.size() + user_timestamp_size_;
6748 thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
6749 }
6750
6751 char msg[100];
6752 snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
6753 readwrites_, found);
6754 thread->stats.AddBytes(bytes);
6755 thread->stats.AddMessage(msg);
6756 }
6757
6758 // Read-modify-write for random keys (using MergeOperator)
6759 // The merge operator to use should be defined by FLAGS_merge_operator
6760 // Adjust FLAGS_value_size so that the keys are reasonable for this operator
6761 // Assumes that the merge operator is non-null (i.e.: is well-defined)
6762 //
6763 // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8
6764 // to simulate random additions over 64-bit integers using merge.
6765 //
6766 // The number of merges on the same key can be controlled by adjusting
6767 // FLAGS_merge_keys.
MergeRandom(ThreadState * thread)6768 void MergeRandom(ThreadState* thread) {
6769 RandomGenerator gen;
6770 int64_t bytes = 0;
6771 std::unique_ptr<const char[]> key_guard;
6772 Slice key = AllocateKey(&key_guard);
6773 // The number of iterations is the larger of read_ or write_
6774 Duration duration(FLAGS_duration, readwrites_);
6775 while (!duration.Done(1)) {
6776 DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
6777 int64_t key_rand = thread->rand.Next() % merge_keys_;
6778 GenerateKeyFromInt(key_rand, merge_keys_, &key);
6779
6780 Status s;
6781 Slice val = gen.Generate();
6782 if (FLAGS_num_column_families > 1) {
6783 s = db_with_cfh->db->Merge(write_options_,
6784 db_with_cfh->GetCfh(key_rand), key,
6785 val);
6786 } else {
6787 s = db_with_cfh->db->Merge(write_options_,
6788 db_with_cfh->db->DefaultColumnFamily(), key,
6789 val);
6790 }
6791
6792 if (!s.ok()) {
6793 fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
6794 exit(1);
6795 }
6796 bytes += key.size() + val.size();
6797 thread->stats.FinishedOps(nullptr, db_with_cfh->db, 1, kMerge);
6798 }
6799
6800 // Print some statistics
6801 char msg[100];
6802 snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_);
6803 thread->stats.AddBytes(bytes);
6804 thread->stats.AddMessage(msg);
6805 }
6806
6807 // Read and merge random keys. The amount of reads and merges are controlled
6808 // by adjusting FLAGS_num and FLAGS_mergereadpercent. The number of distinct
6809 // keys (and thus also the number of reads and merges on the same key) can be
6810 // adjusted with FLAGS_merge_keys.
6811 //
6812 // As with MergeRandom, the merge operator to use should be defined by
6813 // FLAGS_merge_operator.
ReadRandomMergeRandom(ThreadState * thread)6814 void ReadRandomMergeRandom(ThreadState* thread) {
6815 ReadOptions options(FLAGS_verify_checksum, true);
6816 RandomGenerator gen;
6817 std::string value;
6818 int64_t num_hits = 0;
6819 int64_t num_gets = 0;
6820 int64_t num_merges = 0;
6821 size_t max_length = 0;
6822
6823 std::unique_ptr<const char[]> key_guard;
6824 Slice key = AllocateKey(&key_guard);
6825 // the number of iterations is the larger of read_ or write_
6826 Duration duration(FLAGS_duration, readwrites_);
6827 while (!duration.Done(1)) {
6828 DB* db = SelectDB(thread);
6829 GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);
6830
6831 bool do_merge = int(thread->rand.Next() % 100) < FLAGS_mergereadpercent;
6832
6833 if (do_merge) {
6834 Status s = db->Merge(write_options_, key, gen.Generate());
6835 if (!s.ok()) {
6836 fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
6837 exit(1);
6838 }
6839 num_merges++;
6840 thread->stats.FinishedOps(nullptr, db, 1, kMerge);
6841 } else {
6842 Status s = db->Get(options, key, &value);
6843 if (value.length() > max_length)
6844 max_length = value.length();
6845
6846 if (!s.ok() && !s.IsNotFound()) {
6847 fprintf(stderr, "get error: %s\n", s.ToString().c_str());
6848 // we continue after error rather than exiting so that we can
6849 // find more errors if any
6850 } else if (!s.IsNotFound()) {
6851 num_hits++;
6852 }
6853 num_gets++;
6854 thread->stats.FinishedOps(nullptr, db, 1, kRead);
6855 }
6856 }
6857
6858 char msg[100];
6859 snprintf(msg, sizeof(msg),
6860 "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64
6861 " hits:%" PRIu64 " maxlength:%" ROCKSDB_PRIszt ")",
6862 num_gets, num_merges, readwrites_, num_hits, max_length);
6863 thread->stats.AddMessage(msg);
6864 }
6865
WriteSeqSeekSeq(ThreadState * thread)6866 void WriteSeqSeekSeq(ThreadState* thread) {
6867 writes_ = FLAGS_num;
6868 DoWrite(thread, SEQUENTIAL);
6869 // exclude writes from the ops/sec calculation
6870 thread->stats.Start(thread->tid);
6871
6872 DB* db = SelectDB(thread);
6873 ReadOptions read_opts(FLAGS_verify_checksum, true);
6874 std::unique_ptr<char[]> ts_guard;
6875 Slice ts;
6876 if (user_timestamp_size_ > 0) {
6877 ts_guard.reset(new char[user_timestamp_size_]);
6878 ts = mock_app_clock_->GetTimestampForRead(thread->rand, ts_guard.get());
6879 read_opts.timestamp = &ts;
6880 }
6881 std::unique_ptr<Iterator> iter(db->NewIterator(read_opts));
6882
6883 std::unique_ptr<const char[]> key_guard;
6884 Slice key = AllocateKey(&key_guard);
6885 for (int64_t i = 0; i < FLAGS_num; ++i) {
6886 GenerateKeyFromInt(i, FLAGS_num, &key);
6887 iter->Seek(key);
6888 assert(iter->Valid() && iter->key() == key);
6889 thread->stats.FinishedOps(nullptr, db, 1, kSeek);
6890
6891 for (int j = 0; j < FLAGS_seek_nexts && i + 1 < FLAGS_num; ++j) {
6892 if (!FLAGS_reverse_iterator) {
6893 iter->Next();
6894 } else {
6895 iter->Prev();
6896 }
6897 GenerateKeyFromInt(++i, FLAGS_num, &key);
6898 assert(iter->Valid() && iter->key() == key);
6899 thread->stats.FinishedOps(nullptr, db, 1, kSeek);
6900 }
6901
6902 iter->Seek(key);
6903 assert(iter->Valid() && iter->key() == key);
6904 thread->stats.FinishedOps(nullptr, db, 1, kSeek);
6905 }
6906 }
6907
// Looks for `key` in the ascending-sorted `data`, restricted to the inclusive
// index range [start, end]. Returns true when found. Returns false for an
// empty vector, an empty range (start > end), or as soon as a probe index
// falls beyond the last element.
bool binary_search(std::vector<int>& data, int start, int end, int key) {
  if (data.empty()) {
    return false;
  }
  const int last_index = static_cast<int>(data.size()) - 1;
  int lo = start;
  int hi = end;
  while (lo <= hi) {
    const int mid = lo + (hi - lo) / 2;
    if (mid > last_index) {
      return false;
    }
    const int probe = data[mid];
    if (probe == key) {
      return true;
    }
    if (probe > key) {
      hi = mid - 1;
    } else {
      lo = mid + 1;
    }
  }
  return false;
}
6921
6922 // Does a bunch of merge operations for a key(key1) where the merge operand
6923 // is a sorted list. Next performance comparison is done between doing a Get
6924 // for key1 followed by searching for another key(key2) in the large sorted
6925 // list vs calling GetMergeOperands for key1 and then searching for the key2
6926 // in all the sorted sub-lists. Later case is expected to be a lot faster.
GetMergeOperands(ThreadState * thread)6927 void GetMergeOperands(ThreadState* thread) {
6928 DB* db = SelectDB(thread);
6929 const int kTotalValues = 100000;
6930 const int kListSize = 100;
6931 std::string key = "my_key";
6932 std::string value;
6933
6934 for (int i = 1; i < kTotalValues; i++) {
6935 if (i % kListSize == 0) {
6936 // Remove trailing ','
6937 value.pop_back();
6938 db->Merge(WriteOptions(), key, value);
6939 value.clear();
6940 } else {
6941 value.append(std::to_string(i)).append(",");
6942 }
6943 }
6944
6945 SortList s;
6946 std::vector<int> data;
6947 // This value can be experimented with and it will demonstrate the
6948 // perf difference between doing a Get and searching for lookup_key in the
6949 // resultant large sorted list vs doing GetMergeOperands and searching
6950 // for lookup_key within this resultant sorted sub-lists.
6951 int lookup_key = 1;
6952
6953 // Get API call
6954 std::cout << "--- Get API call --- \n";
6955 PinnableSlice p_slice;
6956 uint64_t st = FLAGS_env->NowNanos();
6957 db->Get(ReadOptions(), db->DefaultColumnFamily(), key, &p_slice);
6958 s.MakeVector(data, p_slice);
6959 bool found =
6960 binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
6961 std::cout << "Found key? " << std::to_string(found) << "\n";
6962 uint64_t sp = FLAGS_env->NowNanos();
6963 std::cout << "Get: " << (sp - st) / 1000000000.0 << " seconds\n";
6964 std::string* dat_ = p_slice.GetSelf();
6965 std::cout << "Sample data from Get API call: " << dat_->substr(0, 10)
6966 << "\n";
6967 data.clear();
6968
6969 // GetMergeOperands API call
6970 std::cout << "--- GetMergeOperands API --- \n";
6971 std::vector<PinnableSlice> a_slice((kTotalValues / kListSize) + 1);
6972 st = FLAGS_env->NowNanos();
6973 int number_of_operands = 0;
6974 GetMergeOperandsOptions get_merge_operands_options;
6975 get_merge_operands_options.expected_max_number_of_operands =
6976 (kTotalValues / 100) + 1;
6977 db->GetMergeOperands(ReadOptions(), db->DefaultColumnFamily(), key,
6978 a_slice.data(), &get_merge_operands_options,
6979 &number_of_operands);
6980 for (PinnableSlice& psl : a_slice) {
6981 s.MakeVector(data, psl);
6982 found =
6983 binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
6984 data.clear();
6985 if (found) break;
6986 }
6987 std::cout << "Found key? " << std::to_string(found) << "\n";
6988 sp = FLAGS_env->NowNanos();
6989 std::cout << "Get Merge operands: " << (sp - st) / 1000000000.0
6990 << " seconds \n";
6991 int to_print = 0;
6992 std::cout << "Sample data from GetMergeOperands API call: ";
6993 for (PinnableSlice& psl : a_slice) {
6994 std::cout << "List: " << to_print << " : " << *psl.GetSelf() << "\n";
6995 if (to_print++ > 2) break;
6996 }
6997 }
6998
6999 #ifndef ROCKSDB_LITE
7000 // This benchmark stress tests Transactions. For a given --duration (or
7001 // total number of --writes, a Transaction will perform a read-modify-write
7002 // to increment the value of a key in each of N(--transaction-sets) sets of
7003 // keys (where each set has --num keys). If --threads is set, this will be
7004 // done in parallel.
7005 //
7006 // To test transactions, use --transaction_db=true. Not setting this
7007 // parameter
7008 // will run the same benchmark without transactions.
7009 //
7010 // RandomTransactionVerify() will then validate the correctness of the results
7011 // by checking if the sum of all keys in each set is the same.
RandomTransaction(ThreadState * thread)7012 void RandomTransaction(ThreadState* thread) {
7013 ReadOptions options(FLAGS_verify_checksum, true);
7014 Duration duration(FLAGS_duration, readwrites_);
7015 ReadOptions read_options(FLAGS_verify_checksum, true);
7016 uint16_t num_prefix_ranges = static_cast<uint16_t>(FLAGS_transaction_sets);
7017 uint64_t transactions_done = 0;
7018
7019 if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) {
7020 fprintf(stderr, "invalid value for transaction_sets\n");
7021 abort();
7022 }
7023
7024 TransactionOptions txn_options;
7025 txn_options.lock_timeout = FLAGS_transaction_lock_timeout;
7026 txn_options.set_snapshot = FLAGS_transaction_set_snapshot;
7027
7028 RandomTransactionInserter inserter(&thread->rand, write_options_,
7029 read_options, FLAGS_num,
7030 num_prefix_ranges);
7031
7032 if (FLAGS_num_multi_db > 1) {
7033 fprintf(stderr,
7034 "Cannot run RandomTransaction benchmark with "
7035 "FLAGS_multi_db > 1.");
7036 abort();
7037 }
7038
7039 while (!duration.Done(1)) {
7040 bool success;
7041
7042 // RandomTransactionInserter will attempt to insert a key for each
7043 // # of FLAGS_transaction_sets
7044 if (FLAGS_optimistic_transaction_db) {
7045 success = inserter.OptimisticTransactionDBInsert(db_.opt_txn_db);
7046 } else if (FLAGS_transaction_db) {
7047 TransactionDB* txn_db = reinterpret_cast<TransactionDB*>(db_.db);
7048 success = inserter.TransactionDBInsert(txn_db, txn_options);
7049 } else {
7050 success = inserter.DBInsert(db_.db);
7051 }
7052
7053 if (!success) {
7054 fprintf(stderr, "Unexpected error: %s\n",
7055 inserter.GetLastStatus().ToString().c_str());
7056 abort();
7057 }
7058
7059 thread->stats.FinishedOps(nullptr, db_.db, 1, kOthers);
7060 transactions_done++;
7061 }
7062
7063 char msg[100];
7064 if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) {
7065 snprintf(msg, sizeof(msg),
7066 "( transactions:%" PRIu64 " aborts:%" PRIu64 ")",
7067 transactions_done, inserter.GetFailureCount());
7068 } else {
7069 snprintf(msg, sizeof(msg), "( batches:%" PRIu64 " )", transactions_done);
7070 }
7071 thread->stats.AddMessage(msg);
7072
7073 if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
7074 thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
7075 get_perf_context()->ToString());
7076 }
7077 thread->stats.AddBytes(static_cast<int64_t>(inserter.GetBytesInserted()));
7078 }
7079
7080 // Verifies consistency of data after RandomTransaction() has been run.
7081 // Since each iteration of RandomTransaction() incremented a key in each set
7082 // by the same value, the sum of the keys in each set should be the same.
RandomTransactionVerify()7083 void RandomTransactionVerify() {
7084 if (!FLAGS_transaction_db && !FLAGS_optimistic_transaction_db) {
7085 // transactions not used, nothing to verify.
7086 return;
7087 }
7088
7089 Status s =
7090 RandomTransactionInserter::Verify(db_.db,
7091 static_cast<uint16_t>(FLAGS_transaction_sets));
7092
7093 if (s.ok()) {
7094 fprintf(stdout, "RandomTransactionVerify Success.\n");
7095 } else {
7096 fprintf(stdout, "RandomTransactionVerify FAILED!!\n");
7097 }
7098 }
7099 #endif // ROCKSDB_LITE
7100
7101 // Writes and deletes random keys without overwriting keys.
7102 //
7103 // This benchmark is intended to partially replicate the behavior of MyRocks
7104 // secondary indices: All data is stored in keys and updates happen by
7105 // deleting the old version of the key and inserting the new version.
RandomReplaceKeys(ThreadState * thread)7106 void RandomReplaceKeys(ThreadState* thread) {
7107 std::unique_ptr<const char[]> key_guard;
7108 Slice key = AllocateKey(&key_guard);
7109 std::unique_ptr<char[]> ts_guard;
7110 if (user_timestamp_size_ > 0) {
7111 ts_guard.reset(new char[user_timestamp_size_]);
7112 }
7113 std::vector<uint32_t> counters(FLAGS_numdistinct, 0);
7114 size_t max_counter = 50;
7115 RandomGenerator gen;
7116
7117 Status s;
7118 DB* db = SelectDB(thread);
7119 for (int64_t i = 0; i < FLAGS_numdistinct; i++) {
7120 GenerateKeyFromInt(i * max_counter, FLAGS_num, &key);
7121 Slice ts;
7122 if (user_timestamp_size_ > 0) {
7123 ts = mock_app_clock_->Allocate(ts_guard.get());
7124 write_options_.timestamp = &ts;
7125 }
7126 s = db->Put(write_options_, key, gen.Generate());
7127 if (!s.ok()) {
7128 fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
7129 exit(1);
7130 }
7131 }
7132
7133 db->GetSnapshot();
7134
7135 std::default_random_engine generator;
7136 std::normal_distribution<double> distribution(FLAGS_numdistinct / 2.0,
7137 FLAGS_stddev);
7138 Duration duration(FLAGS_duration, FLAGS_num);
7139 while (!duration.Done(1)) {
7140 int64_t rnd_id = static_cast<int64_t>(distribution(generator));
7141 int64_t key_id = std::max(std::min(FLAGS_numdistinct - 1, rnd_id),
7142 static_cast<int64_t>(0));
7143 GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
7144 &key);
7145 Slice ts;
7146 if (user_timestamp_size_ > 0) {
7147 ts = mock_app_clock_->Allocate(ts_guard.get());
7148 write_options_.timestamp = &ts;
7149 }
7150 s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key)
7151 : db->Delete(write_options_, key);
7152 if (s.ok()) {
7153 counters[key_id] = (counters[key_id] + 1) % max_counter;
7154 GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
7155 &key);
7156 if (user_timestamp_size_ > 0) {
7157 ts = mock_app_clock_->Allocate(ts_guard.get());
7158 write_options_.timestamp = &ts;
7159 }
7160 s = db->Put(write_options_, key, Slice());
7161 }
7162
7163 if (!s.ok()) {
7164 fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
7165 exit(1);
7166 }
7167
7168 thread->stats.FinishedOps(nullptr, db, 1, kOthers);
7169 }
7170
7171 char msg[200];
7172 snprintf(msg, sizeof(msg),
7173 "use single deletes: %d, "
7174 "standard deviation: %lf\n",
7175 FLAGS_use_single_deletes, FLAGS_stddev);
7176 thread->stats.AddMessage(msg);
7177 }
7178
TimeSeriesReadOrDelete(ThreadState * thread,bool do_deletion)7179 void TimeSeriesReadOrDelete(ThreadState* thread, bool do_deletion) {
7180 ReadOptions options(FLAGS_verify_checksum, true);
7181 int64_t read = 0;
7182 int64_t found = 0;
7183 int64_t bytes = 0;
7184
7185 Iterator* iter = nullptr;
7186 // Only work on single database
7187 assert(db_.db != nullptr);
7188 iter = db_.db->NewIterator(options);
7189
7190 std::unique_ptr<const char[]> key_guard;
7191 Slice key = AllocateKey(&key_guard);
7192
7193 char value_buffer[256];
7194 while (true) {
7195 {
7196 MutexLock l(&thread->shared->mu);
7197 if (thread->shared->num_done >= 1) {
7198 // Write thread have finished
7199 break;
7200 }
7201 }
7202 if (!FLAGS_use_tailing_iterator) {
7203 delete iter;
7204 iter = db_.db->NewIterator(options);
7205 }
7206 // Pick a Iterator to use
7207
7208 int64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
7209 GenerateKeyFromInt(key_id, FLAGS_num, &key);
7210 // Reset last 8 bytes to 0
7211 char* start = const_cast<char*>(key.data());
7212 start += key.size() - 8;
7213 memset(start, 0, 8);
7214 ++read;
7215
7216 bool key_found = false;
7217 // Seek the prefix
7218 for (iter->Seek(key); iter->Valid() && iter->key().starts_with(key);
7219 iter->Next()) {
7220 key_found = true;
7221 // Copy out iterator's value to make sure we read them.
7222 if (do_deletion) {
7223 bytes += iter->key().size();
7224 if (KeyExpired(timestamp_emulator_.get(), iter->key())) {
7225 thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
7226 db_.db->Delete(write_options_, iter->key());
7227 } else {
7228 break;
7229 }
7230 } else {
7231 bytes += iter->key().size() + iter->value().size();
7232 thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
7233 Slice value = iter->value();
7234 memcpy(value_buffer, value.data(),
7235 std::min(value.size(), sizeof(value_buffer)));
7236
7237 assert(iter->status().ok());
7238 }
7239 }
7240 found += key_found;
7241
7242 if (thread->shared->read_rate_limiter.get() != nullptr) {
7243 thread->shared->read_rate_limiter->Request(
7244 1, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
7245 }
7246 }
7247 delete iter;
7248
7249 char msg[100];
7250 snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
7251 read);
7252 thread->stats.AddBytes(bytes);
7253 thread->stats.AddMessage(msg);
7254 if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
7255 thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
7256 get_perf_context()->ToString());
7257 }
7258 }
7259
TimeSeriesWrite(ThreadState * thread)7260 void TimeSeriesWrite(ThreadState* thread) {
7261 // Special thread that keeps writing until other threads are done.
7262 RandomGenerator gen;
7263 int64_t bytes = 0;
7264
7265 // Don't merge stats from this thread with the readers.
7266 thread->stats.SetExcludeFromMerge();
7267
7268 std::unique_ptr<RateLimiter> write_rate_limiter;
7269 if (FLAGS_benchmark_write_rate_limit > 0) {
7270 write_rate_limiter.reset(
7271 NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
7272 }
7273
7274 std::unique_ptr<const char[]> key_guard;
7275 Slice key = AllocateKey(&key_guard);
7276
7277 Duration duration(FLAGS_duration, writes_);
7278 while (!duration.Done(1)) {
7279 DB* db = SelectDB(thread);
7280
7281 uint64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
7282 // Write key id
7283 GenerateKeyFromInt(key_id, FLAGS_num, &key);
7284 // Write timestamp
7285
7286 char* start = const_cast<char*>(key.data());
7287 char* pos = start + 8;
7288 int bytes_to_fill =
7289 std::min(key_size_ - static_cast<int>(pos - start), 8);
7290 uint64_t timestamp_value = timestamp_emulator_->Get();
7291 if (port::kLittleEndian) {
7292 for (int i = 0; i < bytes_to_fill; ++i) {
7293 pos[i] = (timestamp_value >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
7294 }
7295 } else {
7296 memcpy(pos, static_cast<void*>(×tamp_value), bytes_to_fill);
7297 }
7298
7299 timestamp_emulator_->Inc();
7300
7301 Status s;
7302 Slice val = gen.Generate();
7303 s = db->Put(write_options_, key, val);
7304
7305 if (!s.ok()) {
7306 fprintf(stderr, "put error: %s\n", s.ToString().c_str());
7307 ErrorExit();
7308 }
7309 bytes = key.size() + val.size();
7310 thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
7311 thread->stats.AddBytes(bytes);
7312
7313 if (FLAGS_benchmark_write_rate_limit > 0) {
7314 write_rate_limiter->Request(
7315 key.size() + val.size(), Env::IO_HIGH,
7316 nullptr /* stats */, RateLimiter::OpType::kWrite);
7317 }
7318 }
7319 }
7320
TimeSeries(ThreadState * thread)7321 void TimeSeries(ThreadState* thread) {
7322 if (thread->tid > 0) {
7323 bool do_deletion = FLAGS_expire_style == "delete" &&
7324 thread->tid <= FLAGS_num_deletion_threads;
7325 TimeSeriesReadOrDelete(thread, do_deletion);
7326 } else {
7327 TimeSeriesWrite(thread);
7328 thread->stats.Stop();
7329 thread->stats.Report("timeseries write");
7330 }
7331 }
7332
Compact(ThreadState * thread)7333 void Compact(ThreadState* thread) {
7334 DB* db = SelectDB(thread);
7335 CompactRangeOptions cro;
7336 cro.bottommost_level_compaction =
7337 BottommostLevelCompaction::kForceOptimized;
7338 db->CompactRange(cro, nullptr, nullptr);
7339 }
7340
CompactAll()7341 void CompactAll() {
7342 if (db_.db != nullptr) {
7343 db_.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
7344 }
7345 for (const auto& db_with_cfh : multi_dbs_) {
7346 db_with_cfh.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
7347 }
7348 }
7349
7350 #ifndef ROCKSDB_LITE
WaitForCompactionHelper(DBWithColumnFamilies & db)7351 void WaitForCompactionHelper(DBWithColumnFamilies& db) {
7352 // This is an imperfect way of waiting for compaction. The loop and sleep
7353 // is done because a thread that finishes a compaction job should get a
7354 // chance to pickup a new compaction job.
7355
7356 std::vector<std::string> keys = {DB::Properties::kMemTableFlushPending,
7357 DB::Properties::kNumRunningFlushes,
7358 DB::Properties::kCompactionPending,
7359 DB::Properties::kNumRunningCompactions};
7360
7361 fprintf(stdout, "waitforcompaction(%s): started\n",
7362 db.db->GetName().c_str());
7363
7364 while (true) {
7365 bool retry = false;
7366
7367 for (const auto& k : keys) {
7368 uint64_t v;
7369 if (!db.db->GetIntProperty(k, &v)) {
7370 fprintf(stderr, "waitforcompaction(%s): GetIntProperty(%s) failed\n",
7371 db.db->GetName().c_str(), k.c_str());
7372 exit(1);
7373 } else if (v > 0) {
7374 fprintf(stdout,
7375 "waitforcompaction(%s): active(%s). Sleep 10 seconds\n",
7376 db.db->GetName().c_str(), k.c_str());
7377 sleep(10);
7378 retry = true;
7379 break;
7380 }
7381 }
7382
7383 if (!retry) {
7384 fprintf(stdout, "waitforcompaction(%s): finished\n",
7385 db.db->GetName().c_str());
7386 return;
7387 }
7388 }
7389 }
7390
WaitForCompaction()7391 void WaitForCompaction() {
7392 // Give background threads a chance to wake
7393 sleep(5);
7394
7395 // I am skeptical that this check race free. I hope that checking twice
7396 // reduces the chance.
7397 if (db_.db != nullptr) {
7398 WaitForCompactionHelper(db_);
7399 WaitForCompactionHelper(db_);
7400 } else {
7401 for (auto& db_with_cfh : multi_dbs_) {
7402 WaitForCompactionHelper(db_with_cfh);
7403 WaitForCompactionHelper(db_with_cfh);
7404 }
7405 }
7406 }
7407
CompactLevelHelper(DBWithColumnFamilies & db_with_cfh,int from_level)7408 bool CompactLevelHelper(DBWithColumnFamilies& db_with_cfh, int from_level) {
7409 std::vector<LiveFileMetaData> files;
7410 db_with_cfh.db->GetLiveFilesMetaData(&files);
7411
7412 assert(from_level == 0 || from_level == 1);
7413
7414 int real_from_level = from_level;
7415 if (real_from_level > 0) {
7416 // With dynamic leveled compaction the first level with data beyond L0
7417 // might not be L1.
7418 real_from_level = std::numeric_limits<int>::max();
7419
7420 for (auto& f : files) {
7421 if (f.level > 0 && f.level < real_from_level) real_from_level = f.level;
7422 }
7423
7424 if (real_from_level == std::numeric_limits<int>::max()) {
7425 fprintf(stdout, "compact%d found 0 files to compact\n", from_level);
7426 return true;
7427 }
7428 }
7429
7430 // The goal is to compact from from_level to the level that follows it,
7431 // and with dynamic leveled compaction the next level might not be
7432 // real_from_level+1
7433 int next_level = std::numeric_limits<int>::max();
7434
7435 std::vector<std::string> files_to_compact;
7436 for (auto& f : files) {
7437 if (f.level == real_from_level)
7438 files_to_compact.push_back(f.name);
7439 else if (f.level > real_from_level && f.level < next_level)
7440 next_level = f.level;
7441 }
7442
7443 if (files_to_compact.empty()) {
7444 fprintf(stdout, "compact%d found 0 files to compact\n", from_level);
7445 return true;
7446 } else if (next_level == std::numeric_limits<int>::max()) {
7447 // There is no data beyond real_from_level. So we are done.
7448 fprintf(stdout, "compact%d found no data beyond L%d\n", from_level,
7449 real_from_level);
7450 return true;
7451 }
7452
7453 fprintf(stdout, "compact%d found %d files to compact from L%d to L%d\n",
7454 from_level, static_cast<int>(files_to_compact.size()),
7455 real_from_level, next_level);
7456
7457 ROCKSDB_NAMESPACE::CompactionOptions options;
7458 // Lets RocksDB use the configured compression for this level
7459 options.compression = ROCKSDB_NAMESPACE::kDisableCompressionOption;
7460
7461 ROCKSDB_NAMESPACE::ColumnFamilyDescriptor cfDesc;
7462 db_with_cfh.db->DefaultColumnFamily()->GetDescriptor(&cfDesc);
7463 options.output_file_size_limit = cfDesc.options.target_file_size_base;
7464
7465 Status status =
7466 db_with_cfh.db->CompactFiles(options, files_to_compact, next_level);
7467 if (!status.ok()) {
7468 // This can fail for valid reasons including the operation was aborted
7469 // or a filename is invalid because background compaction removed it.
7470 // Having read the current cases for which an error is raised I prefer
7471 // not to figure out whether an exception should be thrown here.
7472 fprintf(stderr, "compact%d CompactFiles failed: %s\n", from_level,
7473 status.ToString().c_str());
7474 return false;
7475 }
7476 return true;
7477 }
7478
CompactLevel(int from_level)7479 void CompactLevel(int from_level) {
7480 if (db_.db != nullptr) {
7481 while (!CompactLevelHelper(db_, from_level)) WaitForCompaction();
7482 }
7483 for (auto& db_with_cfh : multi_dbs_) {
7484 while (!CompactLevelHelper(db_with_cfh, from_level)) WaitForCompaction();
7485 }
7486 }
7487 #endif
7488
Flush()7489 void Flush() {
7490 FlushOptions flush_opt;
7491 flush_opt.wait = true;
7492
7493 if (db_.db != nullptr) {
7494 Status s = db_.db->Flush(flush_opt, db_.cfh);
7495 if (!s.ok()) {
7496 fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
7497 exit(1);
7498 }
7499 } else {
7500 for (const auto& db_with_cfh : multi_dbs_) {
7501 Status s = db_with_cfh.db->Flush(flush_opt, db_with_cfh.cfh);
7502 if (!s.ok()) {
7503 fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
7504 exit(1);
7505 }
7506 }
7507 }
7508 fprintf(stdout, "flush memtable\n");
7509 }
7510
ResetStats()7511 void ResetStats() {
7512 if (db_.db != nullptr) {
7513 db_.db->ResetStats();
7514 }
7515 for (const auto& db_with_cfh : multi_dbs_) {
7516 db_with_cfh.db->ResetStats();
7517 }
7518 }
7519
PrintStatsHistory()7520 void PrintStatsHistory() {
7521 if (db_.db != nullptr) {
7522 PrintStatsHistoryImpl(db_.db, false);
7523 }
7524 for (const auto& db_with_cfh : multi_dbs_) {
7525 PrintStatsHistoryImpl(db_with_cfh.db, true);
7526 }
7527 }
7528
PrintStatsHistoryImpl(DB * db,bool print_header)7529 void PrintStatsHistoryImpl(DB* db, bool print_header) {
7530 if (print_header) {
7531 fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
7532 }
7533
7534 std::unique_ptr<StatsHistoryIterator> shi;
7535 Status s = db->GetStatsHistory(0, port::kMaxUint64, &shi);
7536 if (!s.ok()) {
7537 fprintf(stdout, "%s\n", s.ToString().c_str());
7538 return;
7539 }
7540 assert(shi);
7541 while (shi->Valid()) {
7542 uint64_t stats_time = shi->GetStatsTime();
7543 fprintf(stdout, "------ %s ------\n",
7544 TimeToHumanString(static_cast<int>(stats_time)).c_str());
7545 for (auto& entry : shi->GetStatsMap()) {
7546 fprintf(stdout, " %" PRIu64 " %s %" PRIu64 "\n", stats_time,
7547 entry.first.c_str(), entry.second);
7548 }
7549 shi->Next();
7550 }
7551 }
7552
PrintStats(const char * key)7553 void PrintStats(const char* key) {
7554 if (db_.db != nullptr) {
7555 PrintStats(db_.db, key, false);
7556 }
7557 for (const auto& db_with_cfh : multi_dbs_) {
7558 PrintStats(db_with_cfh.db, key, true);
7559 }
7560 }
7561
PrintStats(DB * db,const char * key,bool print_header=false)7562 void PrintStats(DB* db, const char* key, bool print_header = false) {
7563 if (print_header) {
7564 fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
7565 }
7566 std::string stats;
7567 if (!db->GetProperty(key, &stats)) {
7568 stats = "(failed)";
7569 }
7570 fprintf(stdout, "\n%s\n", stats.c_str());
7571 }
7572
PrintStats(const std::vector<std::string> & keys)7573 void PrintStats(const std::vector<std::string>& keys) {
7574 if (db_.db != nullptr) {
7575 PrintStats(db_.db, keys);
7576 }
7577 for (const auto& db_with_cfh : multi_dbs_) {
7578 PrintStats(db_with_cfh.db, keys, true);
7579 }
7580 }
7581
PrintStats(DB * db,const std::vector<std::string> & keys,bool print_header=false)7582 void PrintStats(DB* db, const std::vector<std::string>& keys,
7583 bool print_header = false) {
7584 if (print_header) {
7585 fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
7586 }
7587
7588 for (const auto& key : keys) {
7589 std::string stats;
7590 if (!db->GetProperty(key, &stats)) {
7591 stats = "(failed)";
7592 }
7593 fprintf(stdout, "%s: %s\n", key.c_str(), stats.c_str());
7594 }
7595 }
7596
Replay(ThreadState * thread)7597 void Replay(ThreadState* thread) {
7598 if (db_.db != nullptr) {
7599 Replay(thread, &db_);
7600 }
7601 }
7602
Replay(ThreadState *,DBWithColumnFamilies * db_with_cfh)7603 void Replay(ThreadState* /*thread*/, DBWithColumnFamilies* db_with_cfh) {
7604 Status s;
7605 std::unique_ptr<TraceReader> trace_reader;
7606 s = NewFileTraceReader(FLAGS_env, EnvOptions(), FLAGS_trace_file,
7607 &trace_reader);
7608 if (!s.ok()) {
7609 fprintf(
7610 stderr,
7611 "Encountered an error creating a TraceReader from the trace file. "
7612 "Error: %s\n",
7613 s.ToString().c_str());
7614 exit(1);
7615 }
7616 Replayer replayer(db_with_cfh->db, db_with_cfh->cfh,
7617 std::move(trace_reader));
7618 replayer.SetFastForward(
7619 static_cast<uint32_t>(FLAGS_trace_replay_fast_forward));
7620 s = replayer.MultiThreadReplay(
7621 static_cast<uint32_t>(FLAGS_trace_replay_threads));
7622 if (s.ok()) {
7623 fprintf(stdout, "Replay started from trace_file: %s\n",
7624 FLAGS_trace_file.c_str());
7625 } else {
7626 fprintf(stderr, "Starting replay failed. Error: %s\n",
7627 s.ToString().c_str());
7628 }
7629 }
7630 };
7631
db_bench_tool(int argc,char ** argv)7632 int db_bench_tool(int argc, char** argv) {
7633 ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
7634 static bool initialized = false;
7635 if (!initialized) {
7636 SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
7637 " [OPTIONS]...");
7638 initialized = true;
7639 }
7640 ParseCommandLineFlags(&argc, &argv, true);
7641 FLAGS_compaction_style_e =
7642 (ROCKSDB_NAMESPACE::CompactionStyle)FLAGS_compaction_style;
7643 #ifndef ROCKSDB_LITE
7644 if (FLAGS_statistics && !FLAGS_statistics_string.empty()) {
7645 fprintf(stderr,
7646 "Cannot provide both --statistics and --statistics_string.\n");
7647 exit(1);
7648 }
7649 if (!FLAGS_statistics_string.empty()) {
7650 Status s = ObjectRegistry::NewInstance()->NewSharedObject<Statistics>(
7651 FLAGS_statistics_string, &dbstats);
7652 if (dbstats == nullptr) {
7653 fprintf(stderr,
7654 "No Statistics registered matching string: %s status=%s\n",
7655 FLAGS_statistics_string.c_str(), s.ToString().c_str());
7656 exit(1);
7657 }
7658 }
7659 #endif // ROCKSDB_LITE
7660 if (FLAGS_statistics) {
7661 dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics();
7662 }
7663 if (dbstats) {
7664 dbstats->set_stats_level(static_cast<StatsLevel>(FLAGS_stats_level));
7665 }
7666 FLAGS_compaction_pri_e =
7667 (ROCKSDB_NAMESPACE::CompactionPri)FLAGS_compaction_pri;
7668
7669 std::vector<std::string> fanout = ROCKSDB_NAMESPACE::StringSplit(
7670 FLAGS_max_bytes_for_level_multiplier_additional, ',');
7671 for (size_t j = 0; j < fanout.size(); j++) {
7672 FLAGS_max_bytes_for_level_multiplier_additional_v.push_back(
7673 #ifndef CYGWIN
7674 std::stoi(fanout[j]));
7675 #else
7676 stoi(fanout[j]));
7677 #endif
7678 }
7679
7680 FLAGS_compression_type_e =
7681 StringToCompressionType(FLAGS_compression_type.c_str());
7682
7683 #ifndef ROCKSDB_LITE
7684 // Stacked BlobDB
7685 FLAGS_blob_db_compression_type_e =
7686 StringToCompressionType(FLAGS_blob_db_compression_type.c_str());
7687
7688 int env_opts =
7689 !FLAGS_hdfs.empty() + !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty();
7690 if (env_opts > 1) {
7691 fprintf(stderr,
7692 "Error: --hdfs, --env_uri and --fs_uri are mutually exclusive\n");
7693 exit(1);
7694 }
7695
7696 if (!FLAGS_env_uri.empty()) {
7697 Status s = Env::LoadEnv(FLAGS_env_uri, &FLAGS_env, &env_guard);
7698 if (FLAGS_env == nullptr) {
7699 fprintf(stderr, "No Env registered for URI: %s\n", FLAGS_env_uri.c_str());
7700 exit(1);
7701 }
7702 } else if (!FLAGS_fs_uri.empty()) {
7703 std::shared_ptr<FileSystem> fs;
7704 Status s = FileSystem::Load(FLAGS_fs_uri, &fs);
7705 if (fs == nullptr) {
7706 fprintf(stderr, "Error: %s\n", s.ToString().c_str());
7707 exit(1);
7708 }
7709 FLAGS_env = GetCompositeEnv(fs);
7710 } else if (FLAGS_simulate_hybrid_fs_file != "") {
7711 FLAGS_env = GetCompositeEnv(std::make_shared<SimulatedHybridFileSystem>(
7712 FileSystem::Default(), FLAGS_simulate_hybrid_fs_file));
7713 }
7714 #endif // ROCKSDB_LITE
7715 if (FLAGS_use_existing_keys && !FLAGS_use_existing_db) {
7716 fprintf(stderr,
7717 "`-use_existing_db` must be true for `-use_existing_keys` to be "
7718 "settable\n");
7719 exit(1);
7720 }
7721
7722 if (!FLAGS_hdfs.empty()) {
7723 FLAGS_env = new ROCKSDB_NAMESPACE::HdfsEnv(FLAGS_hdfs);
7724 }
7725
7726 if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE"))
7727 FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NONE;
7728 else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL"))
7729 FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NORMAL;
7730 else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL"))
7731 FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
7732 else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED"))
7733 FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::WILLNEED;
7734 else {
7735 fprintf(stdout, "Unknown compaction fadvice:%s\n",
7736 FLAGS_compaction_fadvice.c_str());
7737 }
7738
7739 FLAGS_value_size_distribution_type_e =
7740 StringToDistributionType(FLAGS_value_size_distribution_type.c_str());
7741
7742 FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
7743
7744 // Note options sanitization may increase thread pool sizes according to
7745 // max_background_flushes/max_background_compactions/max_background_jobs
7746 FLAGS_env->SetBackgroundThreads(FLAGS_num_high_pri_threads,
7747 ROCKSDB_NAMESPACE::Env::Priority::HIGH);
7748 FLAGS_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads,
7749 ROCKSDB_NAMESPACE::Env::Priority::BOTTOM);
7750 FLAGS_env->SetBackgroundThreads(FLAGS_num_low_pri_threads,
7751 ROCKSDB_NAMESPACE::Env::Priority::LOW);
7752
7753 // Choose a location for the test database if none given with --db=<path>
7754 if (FLAGS_db.empty()) {
7755 std::string default_db_path;
7756 FLAGS_env->GetTestDirectory(&default_db_path);
7757 default_db_path += "/dbbench";
7758 FLAGS_db = default_db_path;
7759 }
7760
7761 if (FLAGS_stats_interval_seconds > 0) {
7762 // When both are set then FLAGS_stats_interval determines the frequency
7763 // at which the timer is checked for FLAGS_stats_interval_seconds
7764 FLAGS_stats_interval = 1000;
7765 }
7766
7767 if (FLAGS_seek_missing_prefix && FLAGS_prefix_size <= 8) {
7768 fprintf(stderr, "prefix_size > 8 required by --seek_missing_prefix\n");
7769 exit(1);
7770 }
7771
7772 if ((FLAGS_enable_blob_files || FLAGS_enable_blob_garbage_collection) &&
7773 !FLAGS_merge_operator.empty()) {
7774 fprintf(stderr,
7775 "Integrated BlobDB is currently incompatible with Merge.\n");
7776 exit(1);
7777 }
7778
7779 ROCKSDB_NAMESPACE::Benchmark benchmark;
7780 benchmark.Run();
7781
7782 #ifndef ROCKSDB_LITE
7783 if (FLAGS_print_malloc_stats) {
7784 std::string stats_string;
7785 ROCKSDB_NAMESPACE::DumpMallocStats(&stats_string);
7786 fprintf(stdout, "Malloc stats:\n%s\n", stats_string.c_str());
7787 }
7788 #endif // ROCKSDB_LITE
7789
7790 return 0;
7791 }
7792 } // namespace ROCKSDB_NAMESPACE
7793 #endif
7794