1 //  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 
6 #include "rocksdb/system_clock.h"
7 #include "test_util/testharness.h"
8 #include "util/bloom_impl.h"
9 #include "util/coding.h"
10 #include "util/hash.h"
11 #include "util/ribbon_config.h"
12 #include "util/ribbon_impl.h"
13 #include "util/stop_watch.h"
14 #include "util/string_util.h"
15 
16 #ifndef GFLAGS
17 uint32_t FLAGS_thoroughness = 5;
18 uint32_t FLAGS_max_add = 0;
19 uint32_t FLAGS_min_check = 4000;
20 uint32_t FLAGS_max_check = 100000;
21 bool FLAGS_verbose = false;
22 
23 bool FLAGS_find_occ = false;
24 bool FLAGS_find_slot_occ = false;
25 double FLAGS_find_next_factor = 1.618;
26 uint32_t FLAGS_find_iters = 10000;
27 uint32_t FLAGS_find_min_slots = 128;
28 uint32_t FLAGS_find_max_slots = 1000000;
29 
30 bool FLAGS_optimize_homog = false;
31 uint32_t FLAGS_optimize_homog_slots = 30000000;
32 uint32_t FLAGS_optimize_homog_check = 200000;
33 double FLAGS_optimize_homog_granularity = 0.002;
34 #else
35 #include "util/gflags_compat.h"
36 using GFLAGS_NAMESPACE::ParseCommandLineFlags;
37 // Using 500 is a good test when you have time to be thorough.
38 // Default is for general RocksDB regression test runs.
39 DEFINE_uint32(thoroughness, 5, "iterations per configuration");
40 DEFINE_uint32(max_add, 0,
41               "Add up to this number of entries to a single filter in "
42               "CompactnessAndBacktrackAndFpRate; 0 == reasonable default");
43 DEFINE_uint32(min_check, 4000,
44               "Minimum number of novel entries for testing FP rate");
45 DEFINE_uint32(max_check, 10000,
46               "Maximum number of novel entries for testing FP rate");
47 DEFINE_bool(verbose, false, "Print extra details");
48 
49 // Options for FindOccupancy, which is more of a tool than a test.
50 DEFINE_bool(find_occ, false, "whether to run the FindOccupancy tool");
51 DEFINE_bool(find_slot_occ, false,
52             "whether to show individual slot occupancies with "
53             "FindOccupancy tool");
54 DEFINE_double(find_next_factor, 1.618,
55               "factor to next num_slots for FindOccupancy");
56 DEFINE_uint32(find_iters, 10000, "number of samples for FindOccupancy");
57 DEFINE_uint32(find_min_slots, 128, "number of slots for FindOccupancy");
58 DEFINE_uint32(find_max_slots, 1000000, "number of slots for FindOccupancy");
59 
60 // Options for OptimizeHomogAtScale, which is more of a tool than a test.
61 DEFINE_bool(optimize_homog, false,
62             "whether to run the OptimizeHomogAtScale tool");
63 DEFINE_uint32(optimize_homog_slots, 30000000,
64               "number of slots for OptimizeHomogAtScale");
65 DEFINE_uint32(optimize_homog_check, 200000,
66               "number of queries for checking FP rate in OptimizeHomogAtScale");
67 DEFINE_double(
68     optimize_homog_granularity, 0.002,
69     "overhead change between FP rate checking in OptimizeHomogAtScale");
70 
71 #endif  // GFLAGS
72 
73 template <typename TypesAndSettings>
74 class RibbonTypeParamTest : public ::testing::Test {};
75 
76 class RibbonTest : public ::testing::Test {};
77 
78 namespace {
79 
80 // Different ways of generating keys for testing
81 
82 // Generate semi-sequential keys
83 struct StandardKeyGen {
StandardKeyGen__anon85a6d2830111::StandardKeyGen84   StandardKeyGen(const std::string& prefix, uint64_t id)
85       : id_(id), str_(prefix) {
86     ROCKSDB_NAMESPACE::PutFixed64(&str_, /*placeholder*/ 0);
87   }
88 
89   // Prefix (only one required)
operator ++__anon85a6d2830111::StandardKeyGen90   StandardKeyGen& operator++() {
91     ++id_;
92     return *this;
93   }
94 
operator +=__anon85a6d2830111::StandardKeyGen95   StandardKeyGen& operator+=(uint64_t i) {
96     id_ += i;
97     return *this;
98   }
99 
operator *__anon85a6d2830111::StandardKeyGen100   const std::string& operator*() {
101     // Use multiplication to mix things up a little in the key
102     ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8],
103                                      id_ * uint64_t{0x1500000001});
104     return str_;
105   }
106 
operator ==__anon85a6d2830111::StandardKeyGen107   bool operator==(const StandardKeyGen& other) {
108     // Same prefix is assumed
109     return id_ == other.id_;
110   }
operator !=__anon85a6d2830111::StandardKeyGen111   bool operator!=(const StandardKeyGen& other) {
112     // Same prefix is assumed
113     return id_ != other.id_;
114   }
115 
116   uint64_t id_;
117   std::string str_;
118 };
119 
120 // Generate small sequential keys, that can misbehave with sequential seeds
121 // as in https://github.com/Cyan4973/xxHash/issues/469.
122 // These keys are only heuristically unique, but that's OK with 64 bits,
123 // for testing purposes.
124 struct SmallKeyGen {
SmallKeyGen__anon85a6d2830111::SmallKeyGen125   SmallKeyGen(const std::string& prefix, uint64_t id) : id_(id) {
126     // Hash the prefix for a heuristically unique offset
127     id_ += ROCKSDB_NAMESPACE::GetSliceHash64(prefix);
128     ROCKSDB_NAMESPACE::PutFixed64(&str_, id_);
129   }
130 
131   // Prefix (only one required)
operator ++__anon85a6d2830111::SmallKeyGen132   SmallKeyGen& operator++() {
133     ++id_;
134     return *this;
135   }
136 
operator +=__anon85a6d2830111::SmallKeyGen137   SmallKeyGen& operator+=(uint64_t i) {
138     id_ += i;
139     return *this;
140   }
141 
operator *__anon85a6d2830111::SmallKeyGen142   const std::string& operator*() {
143     ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8], id_);
144     return str_;
145   }
146 
operator ==__anon85a6d2830111::SmallKeyGen147   bool operator==(const SmallKeyGen& other) { return id_ == other.id_; }
operator !=__anon85a6d2830111::SmallKeyGen148   bool operator!=(const SmallKeyGen& other) { return id_ != other.id_; }
149 
150   uint64_t id_;
151   std::string str_;
152 };
153 
154 template <typename KeyGen>
155 struct Hash32KeyGenWrapper : public KeyGen {
Hash32KeyGenWrapper__anon85a6d2830111::Hash32KeyGenWrapper156   Hash32KeyGenWrapper(const std::string& prefix, uint64_t id)
157       : KeyGen(prefix, id) {}
operator *__anon85a6d2830111::Hash32KeyGenWrapper158   uint32_t operator*() {
159     auto& key = *static_cast<KeyGen&>(*this);
160     // unseeded
161     return ROCKSDB_NAMESPACE::GetSliceHash(key);
162   }
163 };
164 
165 template <typename KeyGen>
166 struct Hash64KeyGenWrapper : public KeyGen {
Hash64KeyGenWrapper__anon85a6d2830111::Hash64KeyGenWrapper167   Hash64KeyGenWrapper(const std::string& prefix, uint64_t id)
168       : KeyGen(prefix, id) {}
operator *__anon85a6d2830111::Hash64KeyGenWrapper169   uint64_t operator*() {
170     auto& key = *static_cast<KeyGen&>(*this);
171     // unseeded
172     return ROCKSDB_NAMESPACE::GetSliceHash64(key);
173   }
174 };
175 
176 using ROCKSDB_NAMESPACE::ribbon::ConstructionFailureChance;
177 
178 const std::vector<ConstructionFailureChance> kFailureOnly50Pct = {
179     ROCKSDB_NAMESPACE::ribbon::kOneIn2};
180 
181 const std::vector<ConstructionFailureChance> kFailureOnlyRare = {
182     ROCKSDB_NAMESPACE::ribbon::kOneIn1000};
183 
184 const std::vector<ConstructionFailureChance> kFailureAll = {
185     ROCKSDB_NAMESPACE::ribbon::kOneIn2, ROCKSDB_NAMESPACE::ribbon::kOneIn20,
186     ROCKSDB_NAMESPACE::ribbon::kOneIn1000};
187 
188 }  // namespace
189 
190 using ROCKSDB_NAMESPACE::ribbon::ExpectedCollisionFpRate;
191 using ROCKSDB_NAMESPACE::ribbon::StandardHasher;
192 using ROCKSDB_NAMESPACE::ribbon::StandardRehasherAdapter;
193 
194 struct DefaultTypesAndSettings {
195   using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128;
196   using ResultRow = uint8_t;
197   using Index = uint32_t;
198   using Hash = uint64_t;
199   using Seed = uint32_t;
200   using Key = ROCKSDB_NAMESPACE::Slice;
201   static constexpr bool kIsFilter = true;
202   static constexpr bool kHomogeneous = false;
203   static constexpr bool kFirstCoeffAlwaysOne = true;
204   static constexpr bool kUseSmash = false;
205   static constexpr bool kAllowZeroStarts = false;
HashFnDefaultTypesAndSettings206   static Hash HashFn(const Key& key, uint64_t raw_seed) {
207     // This version 0.7.2 preview of XXH3 (a.k.a. XXPH3) function does
208     // not pass SmallKeyGen tests below without some seed premixing from
209     // StandardHasher. See https://github.com/Cyan4973/xxHash/issues/469
210     return ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), raw_seed);
211   }
212   // For testing
213   using KeyGen = StandardKeyGen;
FailureChanceToTestDefaultTypesAndSettings214   static const std::vector<ConstructionFailureChance>& FailureChanceToTest() {
215     return kFailureAll;
216   }
217 };
218 
219 using TypesAndSettings_Coeff128 = DefaultTypesAndSettings;
220 struct TypesAndSettings_Coeff128Smash : public DefaultTypesAndSettings {
221   static constexpr bool kUseSmash = true;
222 };
223 struct TypesAndSettings_Coeff64 : public DefaultTypesAndSettings {
224   using CoeffRow = uint64_t;
225 };
226 struct TypesAndSettings_Coeff64Smash : public TypesAndSettings_Coeff64 {
227   static constexpr bool kUseSmash = true;
228 };
229 struct TypesAndSettings_Coeff64Smash0 : public TypesAndSettings_Coeff64Smash {
230   static constexpr bool kFirstCoeffAlwaysOne = false;
231 };
232 
233 // Homogeneous Ribbon configurations
234 struct TypesAndSettings_Coeff128_Homog : public DefaultTypesAndSettings {
235   static constexpr bool kHomogeneous = true;
236   // Since our best construction success setting still has 1/1000 failure
237   // rate, the best FP rate we test is 1/256
238   using ResultRow = uint8_t;
239   // Homogeneous only makes sense with sufficient slots for equivalent of
240   // almost sure construction success
FailureChanceToTestTypesAndSettings_Coeff128_Homog241   static const std::vector<ConstructionFailureChance>& FailureChanceToTest() {
242     return kFailureOnlyRare;
243   }
244 };
245 struct TypesAndSettings_Coeff128Smash_Homog
246     : public TypesAndSettings_Coeff128_Homog {
247   // Smash (extra time to save space) + Homog (extra space to save time)
248   // doesn't make much sense in practice, but we minimally test it
249   static constexpr bool kUseSmash = true;
250 };
251 struct TypesAndSettings_Coeff64_Homog : public TypesAndSettings_Coeff128_Homog {
252   using CoeffRow = uint64_t;
253 };
254 struct TypesAndSettings_Coeff64Smash_Homog
255     : public TypesAndSettings_Coeff64_Homog {
256   // Smash (extra time to save space) + Homog (extra space to save time)
257   // doesn't make much sense in practice, but we minimally test it
258   static constexpr bool kUseSmash = true;
259 };
260 
261 // Less exhaustive mix of coverage, but still covering the most stressful case
262 // (only 50% construction success)
263 struct AbridgedTypesAndSettings : public DefaultTypesAndSettings {
FailureChanceToTestAbridgedTypesAndSettings264   static const std::vector<ConstructionFailureChance>& FailureChanceToTest() {
265     return kFailureOnly50Pct;
266   }
267 };
268 struct TypesAndSettings_Result16 : public AbridgedTypesAndSettings {
269   using ResultRow = uint16_t;
270 };
271 struct TypesAndSettings_Result32 : public AbridgedTypesAndSettings {
272   using ResultRow = uint32_t;
273 };
274 struct TypesAndSettings_IndexSizeT : public AbridgedTypesAndSettings {
275   using Index = size_t;
276 };
277 struct TypesAndSettings_Hash32 : public AbridgedTypesAndSettings {
278   using Hash = uint32_t;
HashFnTypesAndSettings_Hash32279   static Hash HashFn(const Key& key, Hash raw_seed) {
280     // This MurmurHash1 function does not pass tests below without the
281     // seed premixing from StandardHasher. In fact, it needs more than
282     // just a multiplication mixer on the ordinal seed.
283     return ROCKSDB_NAMESPACE::Hash(key.data(), key.size(), raw_seed);
284   }
285 };
286 struct TypesAndSettings_Hash32_Result16 : public AbridgedTypesAndSettings {
287   using ResultRow = uint16_t;
288 };
289 struct TypesAndSettings_KeyString : public AbridgedTypesAndSettings {
290   using Key = std::string;
291 };
292 struct TypesAndSettings_Seed8 : public AbridgedTypesAndSettings {
293   // This is not a generally recommended configuration. With the configured
294   // hash function, it would fail with SmallKeyGen due to insufficient
295   // independence among the seeds.
296   using Seed = uint8_t;
297 };
298 struct TypesAndSettings_NoAlwaysOne : public AbridgedTypesAndSettings {
299   static constexpr bool kFirstCoeffAlwaysOne = false;
300 };
301 struct TypesAndSettings_AllowZeroStarts : public AbridgedTypesAndSettings {
302   static constexpr bool kAllowZeroStarts = true;
303 };
304 struct TypesAndSettings_Seed64 : public AbridgedTypesAndSettings {
305   using Seed = uint64_t;
306 };
307 struct TypesAndSettings_Rehasher
308     : public StandardRehasherAdapter<AbridgedTypesAndSettings> {
309   using KeyGen = Hash64KeyGenWrapper<StandardKeyGen>;
310 };
311 struct TypesAndSettings_Rehasher_Result16 : public TypesAndSettings_Rehasher {
312   using ResultRow = uint16_t;
313 };
314 struct TypesAndSettings_Rehasher_Result32 : public TypesAndSettings_Rehasher {
315   using ResultRow = uint32_t;
316 };
317 struct TypesAndSettings_Rehasher_Seed64
318     : public StandardRehasherAdapter<TypesAndSettings_Seed64> {
319   using KeyGen = Hash64KeyGenWrapper<StandardKeyGen>;
320   // Note: 64-bit seed with Rehasher gives slightly better average reseeds
321 };
322 struct TypesAndSettings_Rehasher32
323     : public StandardRehasherAdapter<TypesAndSettings_Hash32> {
324   using KeyGen = Hash32KeyGenWrapper<StandardKeyGen>;
325 };
326 struct TypesAndSettings_Rehasher32_Coeff64
327     : public TypesAndSettings_Rehasher32 {
328   using CoeffRow = uint64_t;
329 };
330 struct TypesAndSettings_SmallKeyGen : public AbridgedTypesAndSettings {
331   // SmallKeyGen stresses the independence of different hash seeds
332   using KeyGen = SmallKeyGen;
333 };
334 struct TypesAndSettings_Hash32_SmallKeyGen : public TypesAndSettings_Hash32 {
335   // SmallKeyGen stresses the independence of different hash seeds
336   using KeyGen = SmallKeyGen;
337 };
338 struct TypesAndSettings_Coeff32 : public DefaultTypesAndSettings {
339   using CoeffRow = uint32_t;
340 };
341 struct TypesAndSettings_Coeff32Smash : public TypesAndSettings_Coeff32 {
342   static constexpr bool kUseSmash = true;
343 };
344 struct TypesAndSettings_Coeff16 : public DefaultTypesAndSettings {
345   using CoeffRow = uint16_t;
346 };
347 struct TypesAndSettings_Coeff16Smash : public TypesAndSettings_Coeff16 {
348   static constexpr bool kUseSmash = true;
349 };
350 
351 using TestTypesAndSettings = ::testing::Types<
352     TypesAndSettings_Coeff128, TypesAndSettings_Coeff128Smash,
353     TypesAndSettings_Coeff64, TypesAndSettings_Coeff64Smash,
354     TypesAndSettings_Coeff64Smash0, TypesAndSettings_Coeff128_Homog,
355     TypesAndSettings_Coeff128Smash_Homog, TypesAndSettings_Coeff64_Homog,
356     TypesAndSettings_Coeff64Smash_Homog, TypesAndSettings_Result16,
357     TypesAndSettings_Result32, TypesAndSettings_IndexSizeT,
358     TypesAndSettings_Hash32, TypesAndSettings_Hash32_Result16,
359     TypesAndSettings_KeyString, TypesAndSettings_Seed8,
360     TypesAndSettings_NoAlwaysOne, TypesAndSettings_AllowZeroStarts,
361     TypesAndSettings_Seed64, TypesAndSettings_Rehasher,
362     TypesAndSettings_Rehasher_Result16, TypesAndSettings_Rehasher_Result32,
363     TypesAndSettings_Rehasher_Seed64, TypesAndSettings_Rehasher32,
364     TypesAndSettings_Rehasher32_Coeff64, TypesAndSettings_SmallKeyGen,
365     TypesAndSettings_Hash32_SmallKeyGen, TypesAndSettings_Coeff32,
366     TypesAndSettings_Coeff32Smash, TypesAndSettings_Coeff16,
367     TypesAndSettings_Coeff16Smash>;
368 TYPED_TEST_CASE(RibbonTypeParamTest, TestTypesAndSettings);
369 
370 namespace {
371 
372 // For testing Poisson-distributed (or similar) statistics, get value for
373 // `stddevs_allowed` standard deviations above expected mean
374 // `expected_count`.
375 // (Poisson approximates Binomial only if probability of a trial being
376 // in the count is low.)
PoissonUpperBound(double expected_count,double stddevs_allowed)377 uint64_t PoissonUpperBound(double expected_count, double stddevs_allowed) {
378   return static_cast<uint64_t>(
379       expected_count + stddevs_allowed * std::sqrt(expected_count) + 1.0);
380 }
381 
PoissonLowerBound(double expected_count,double stddevs_allowed)382 uint64_t PoissonLowerBound(double expected_count, double stddevs_allowed) {
383   return static_cast<uint64_t>(std::max(
384       0.0, expected_count - stddevs_allowed * std::sqrt(expected_count)));
385 }
386 
FrequentPoissonUpperBound(double expected_count)387 uint64_t FrequentPoissonUpperBound(double expected_count) {
388   // Allow up to 5.0 standard deviations for frequently checked statistics
389   return PoissonUpperBound(expected_count, 5.0);
390 }
391 
FrequentPoissonLowerBound(double expected_count)392 uint64_t FrequentPoissonLowerBound(double expected_count) {
393   return PoissonLowerBound(expected_count, 5.0);
394 }
395 
InfrequentPoissonUpperBound(double expected_count)396 uint64_t InfrequentPoissonUpperBound(double expected_count) {
397   // Allow up to 3 standard deviations for infrequently checked statistics
398   return PoissonUpperBound(expected_count, 3.0);
399 }
400 
InfrequentPoissonLowerBound(double expected_count)401 uint64_t InfrequentPoissonLowerBound(double expected_count) {
402   return PoissonLowerBound(expected_count, 3.0);
403 }
404 
405 }  // namespace
406 
TYPED_TEST(RibbonTypeParamTest,CompactnessAndBacktrackAndFpRate)407 TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
408   IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
409   IMPORT_RIBBON_IMPL_TYPES(TypeParam);
410   using KeyGen = typename TypeParam::KeyGen;
411   using ConfigHelper =
412       ROCKSDB_NAMESPACE::ribbon::BandingConfigHelper<TypeParam>;
413 
414   if (sizeof(CoeffRow) < 8) {
415     ROCKSDB_GTEST_BYPASS("Not fully supported");
416     return;
417   }
418 
419   const auto log2_thoroughness =
420       static_cast<uint32_t>(ROCKSDB_NAMESPACE::FloorLog2(FLAGS_thoroughness));
421 
422   // We are going to choose num_to_add using an exponential distribution,
423   // so that we have good representation of small-to-medium filters.
424   // Here we just pick some reasonable, practical upper bound based on
425   // kCoeffBits or option.
426   const double log_max_add = std::log(
427       FLAGS_max_add > 0 ? FLAGS_max_add
428                         : static_cast<uint32_t>(kCoeffBits * kCoeffBits) *
429                               std::max(FLAGS_thoroughness, uint32_t{32}));
430 
431   // This needs to be enough below the minimum number of slots to get a
432   // reasonable number of samples with the minimum number of slots.
433   const double log_min_add = std::log(0.66 * SimpleSoln::RoundUpNumSlots(1));
434 
435   ASSERT_GT(log_max_add, log_min_add);
436 
437   const double diff_log_add = log_max_add - log_min_add;
438 
439   for (ConstructionFailureChance cs : TypeParam::FailureChanceToTest()) {
440     double expected_reseeds;
441     switch (cs) {
442       default:
443         assert(false);
444         FALLTHROUGH_INTENDED;
445       case ROCKSDB_NAMESPACE::ribbon::kOneIn2:
446         fprintf(stderr, "== Failure: 50 percent\n");
447         expected_reseeds = 1.0;
448         break;
449       case ROCKSDB_NAMESPACE::ribbon::kOneIn20:
450         fprintf(stderr, "== Failure: 95 percent\n");
451         expected_reseeds = 0.053;
452         break;
453       case ROCKSDB_NAMESPACE::ribbon::kOneIn1000:
454         fprintf(stderr, "== Failure: 1/1000\n");
455         expected_reseeds = 0.001;
456         break;
457     }
458 
459     uint64_t total_reseeds = 0;
460     uint64_t total_singles = 0;
461     uint64_t total_single_failures = 0;
462     uint64_t total_batch = 0;
463     uint64_t total_batch_successes = 0;
464     uint64_t total_fp_count = 0;
465     uint64_t total_added = 0;
466     uint64_t total_expand_trials = 0;
467     uint64_t total_expand_failures = 0;
468     double total_expand_overhead = 0.0;
469 
470     uint64_t soln_query_nanos = 0;
471     uint64_t soln_query_count = 0;
472     uint64_t bloom_query_nanos = 0;
473     uint64_t isoln_query_nanos = 0;
474     uint64_t isoln_query_count = 0;
475 
476     // Take different samples if you change thoroughness
477     ROCKSDB_NAMESPACE::Random32 rnd(FLAGS_thoroughness);
478 
479     for (uint32_t i = 0; i < FLAGS_thoroughness; ++i) {
480       // We are going to choose num_to_add using an exponential distribution
481       // as noted above, but instead of randomly choosing them, we generate
482       // samples linearly using the golden ratio, which ensures a nice spread
483       // even for a small number of samples, and starting with the minimum
484       // number of slots to ensure it is tested.
485       double log_add =
486           std::fmod(0.6180339887498948482 * diff_log_add * i, diff_log_add) +
487           log_min_add;
488       uint32_t num_to_add = static_cast<uint32_t>(std::exp(log_add));
489 
490       // Most of the time, test the Interleaved solution storage, but when
491       // we do we have to make num_slots a multiple of kCoeffBits. So
492       // sometimes we want to test without that limitation.
493       bool test_interleaved = (i % 7) != 6;
494 
495       // Compute num_slots, and re-adjust num_to_add to get as close as possible
496       // to next num_slots, to stress that num_slots in terms of construction
497       // success. Ensure at least one iteration:
498       Index num_slots = Index{0} - 1;
499       --num_to_add;
500       for (;;) {
501         Index next_num_slots = SimpleSoln::RoundUpNumSlots(
502             ConfigHelper::GetNumSlots(num_to_add + 1, cs));
503         if (test_interleaved) {
504           next_num_slots = InterleavedSoln::RoundUpNumSlots(next_num_slots);
505           // assert idempotent
506           EXPECT_EQ(next_num_slots,
507                     InterleavedSoln::RoundUpNumSlots(next_num_slots));
508         }
509         // assert idempotent with InterleavedSoln::RoundUpNumSlots
510         EXPECT_EQ(next_num_slots, SimpleSoln::RoundUpNumSlots(next_num_slots));
511 
512         if (next_num_slots > num_slots) {
513           break;
514         }
515         num_slots = next_num_slots;
516         ++num_to_add;
517       }
518       assert(num_slots < Index{0} - 1);
519 
520       total_added += num_to_add;
521 
522       std::string prefix;
523       ROCKSDB_NAMESPACE::PutFixed32(&prefix, rnd.Next());
524 
525       // Batch that must be added
526       std::string added_str = prefix + "added";
527       KeyGen keys_begin(added_str, 0);
528       KeyGen keys_end(added_str, num_to_add);
529 
530       // A couple more that will probably be added
531       KeyGen one_more(prefix + "more", 1);
532       KeyGen two_more(prefix + "more", 2);
533 
534       // Batch that may or may not be added
535       uint32_t batch_size =
536           static_cast<uint32_t>(2.0 * std::sqrt(num_slots - num_to_add));
537       if (batch_size < 10U) {
538         batch_size = 0;
539       }
540       std::string batch_str = prefix + "batch";
541       KeyGen batch_begin(batch_str, 0);
542       KeyGen batch_end(batch_str, batch_size);
543 
544       // Batch never (successfully) added, but used for querying FP rate
545       std::string not_str = prefix + "not";
546       KeyGen other_keys_begin(not_str, 0);
547       KeyGen other_keys_end(not_str, FLAGS_max_check);
548 
549       double overhead_ratio = 1.0 * num_slots / num_to_add;
550       if (FLAGS_verbose) {
551         fprintf(stderr, "Adding(%s) %u / %u   Overhead: %g   Batch size: %u\n",
552                 test_interleaved ? "i" : "s", (unsigned)num_to_add,
553                 (unsigned)num_slots, overhead_ratio, (unsigned)batch_size);
554       }
555 
556       // Vary bytes for InterleavedSoln to use number of solution columns
557       // from 0 to max allowed by ResultRow type (and used by SimpleSoln).
558       // Specifically include 0 and max, and otherwise skew toward max.
559       uint32_t max_ibytes =
560           static_cast<uint32_t>(sizeof(ResultRow) * num_slots);
561       size_t ibytes;
562       if (i == 0) {
563         ibytes = 0;
564       } else if (i == 1) {
565         ibytes = max_ibytes;
566       } else {
567         // Skewed
568         ibytes =
569             std::max(rnd.Uniformish(max_ibytes), rnd.Uniformish(max_ibytes));
570       }
571       std::unique_ptr<char[]> idata(new char[ibytes]);
572       InterleavedSoln isoln(idata.get(), ibytes);
573 
574       SimpleSoln soln;
575       Hasher hasher;
576       bool first_single;
577       bool second_single;
578       bool batch_success;
579       {
580         Banding banding;
581         // Traditional solve for a fixed set.
582         ASSERT_TRUE(
583             banding.ResetAndFindSeedToSolve(num_slots, keys_begin, keys_end));
584 
585         Index occupied_count = banding.GetOccupiedCount();
586         Index more_added = 0;
587 
588         if (TypeParam::kHomogeneous || overhead_ratio < 1.01 ||
589             batch_size == 0) {
590           // Homogeneous not compatible with backtracking because add
591           // doesn't fail. Small overhead ratio too packed to expect more
592           first_single = false;
593           second_single = false;
594           batch_success = false;
595         } else {
596           // Now to test backtracking, starting with guaranteed fail. By using
597           // the keys that will be used to test FP rate, we are then doing an
598           // extra check that after backtracking there are no remnants (e.g. in
599           // result side of banding) of these entries.
600           KeyGen other_keys_too_big_end = other_keys_begin;
601           other_keys_too_big_end += num_to_add;
602           banding.EnsureBacktrackSize(std::max(num_to_add, batch_size));
603           EXPECT_FALSE(banding.AddRangeOrRollBack(other_keys_begin,
604                                                   other_keys_too_big_end));
605           EXPECT_EQ(occupied_count, banding.GetOccupiedCount());
606 
607           // Check that we still have a good chance of adding a couple more
608           // individually
609           first_single = banding.Add(*one_more);
610           second_single = banding.Add(*two_more);
611           more_added += (first_single ? 1 : 0) + (second_single ? 1 : 0);
612           total_singles += 2U;
613           total_single_failures += 2U - more_added;
614 
615           // Or as a batch
616           batch_success = banding.AddRangeOrRollBack(batch_begin, batch_end);
617           ++total_batch;
618           if (batch_success) {
619             more_added += batch_size;
620             ++total_batch_successes;
621           }
622           EXPECT_LE(banding.GetOccupiedCount(), occupied_count + more_added);
623         }
624 
625         // Also verify that redundant adds are OK (no effect)
626         ASSERT_TRUE(
627             banding.AddRange(keys_begin, KeyGen(added_str, num_to_add / 8)));
628         EXPECT_LE(banding.GetOccupiedCount(), occupied_count + more_added);
629 
630         // Now back-substitution
631         soln.BackSubstFrom(banding);
632         if (test_interleaved) {
633           isoln.BackSubstFrom(banding);
634         }
635 
636         Seed reseeds = banding.GetOrdinalSeed();
637         total_reseeds += reseeds;
638 
639         EXPECT_LE(reseeds, 8 + log2_thoroughness);
640         if (reseeds > log2_thoroughness + 1) {
641           fprintf(
642               stderr, "%s high reseeds at %u, %u/%u: %u\n",
643               reseeds > log2_thoroughness + 8 ? "ERROR Extremely" : "Somewhat",
644               static_cast<unsigned>(i), static_cast<unsigned>(num_to_add),
645               static_cast<unsigned>(num_slots), static_cast<unsigned>(reseeds));
646         }
647 
648         if (reseeds > 0) {
649           // "Expand" test: given a failed construction, how likely is it to
650           // pass with same seed and more slots. At each step, we increase
651           // enough to ensure there is at least one shift within each coeff
652           // block.
653           ++total_expand_trials;
654           Index expand_count = 0;
655           Index ex_slots = num_slots;
656           banding.SetOrdinalSeed(0);
657           for (;; ++expand_count) {
658             ASSERT_LE(expand_count, log2_thoroughness);
659             ex_slots += ex_slots / kCoeffBits;
660             if (test_interleaved) {
661               ex_slots = InterleavedSoln::RoundUpNumSlots(ex_slots);
662             }
663             banding.Reset(ex_slots);
664             bool success = banding.AddRange(keys_begin, keys_end);
665             if (success) {
666               break;
667             }
668           }
669           total_expand_failures += expand_count;
670           total_expand_overhead += 1.0 * (ex_slots - num_slots) / num_slots;
671         }
672 
673         hasher.SetOrdinalSeed(reseeds);
674       }
675       // soln and hasher now independent of Banding object
676 
677       // Verify keys added
678       KeyGen cur = keys_begin;
679       while (cur != keys_end) {
680         ASSERT_TRUE(soln.FilterQuery(*cur, hasher));
681         ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher));
682         ++cur;
683       }
684       // We (maybe) snuck these in!
685       if (first_single) {
686         ASSERT_TRUE(soln.FilterQuery(*one_more, hasher));
687         ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*one_more, hasher));
688       }
689       if (second_single) {
690         ASSERT_TRUE(soln.FilterQuery(*two_more, hasher));
691         ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*two_more, hasher));
692       }
693       if (batch_success) {
694         cur = batch_begin;
695         while (cur != batch_end) {
696           ASSERT_TRUE(soln.FilterQuery(*cur, hasher));
697           ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher));
698           ++cur;
699         }
700       }
701 
702       // Check FP rate (depends only on number of result bits == solution
703       // columns)
704       Index fp_count = 0;
705       cur = other_keys_begin;
706       {
707         ROCKSDB_NAMESPACE::StopWatchNano timer(
708             ROCKSDB_NAMESPACE::SystemClock::Default().get(), true);
709         while (cur != other_keys_end) {
710           bool fp = soln.FilterQuery(*cur, hasher);
711           fp_count += fp ? 1 : 0;
712           ++cur;
713         }
714         soln_query_nanos += timer.ElapsedNanos();
715         soln_query_count += FLAGS_max_check;
716       }
717       {
718         double expected_fp_count = soln.ExpectedFpRate() * FLAGS_max_check;
719         // For expected FP rate, also include false positives due to collisions
720         // in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
721         double correction =
722             FLAGS_max_check * ExpectedCollisionFpRate(hasher, num_to_add);
723 
724         // NOTE: rare violations expected with kHomogeneous
725         EXPECT_LE(fp_count,
726                   FrequentPoissonUpperBound(expected_fp_count + correction));
727         EXPECT_GE(fp_count,
728                   FrequentPoissonLowerBound(expected_fp_count + correction));
729       }
730       total_fp_count += fp_count;
731 
732       // And also check FP rate for isoln
733       if (test_interleaved) {
734         Index ifp_count = 0;
735         cur = other_keys_begin;
736         ROCKSDB_NAMESPACE::StopWatchNano timer(
737             ROCKSDB_NAMESPACE::SystemClock::Default().get(), true);
738         while (cur != other_keys_end) {
739           ifp_count += isoln.FilterQuery(*cur, hasher) ? 1 : 0;
740           ++cur;
741         }
742         isoln_query_nanos += timer.ElapsedNanos();
743         isoln_query_count += FLAGS_max_check;
744         {
745           double expected_fp_count = isoln.ExpectedFpRate() * FLAGS_max_check;
746           // For expected FP rate, also include false positives due to
747           // collisions in Hash value. (Negligible for 64-bit, can matter for
748           // 32-bit.)
749           double correction =
750               FLAGS_max_check * ExpectedCollisionFpRate(hasher, num_to_add);
751 
752           // NOTE: rare violations expected with kHomogeneous
753           EXPECT_LE(ifp_count,
754                     FrequentPoissonUpperBound(expected_fp_count + correction));
755 
756           // FIXME: why sometimes can we slightly "beat the odds"?
757           // (0.95 factor should not be needed)
758           EXPECT_GE(ifp_count, FrequentPoissonLowerBound(
759                                    0.95 * expected_fp_count + correction));
760         }
761         // Since the bits used in isoln are a subset of the bits used in soln,
762         // it cannot have fewer FPs
763         EXPECT_GE(ifp_count, fp_count);
764       }
765 
766       // And compare to Bloom time, for fun
767       if (ibytes >= /* minimum Bloom impl bytes*/ 64) {
768         Index bfp_count = 0;
769         cur = other_keys_begin;
770         ROCKSDB_NAMESPACE::StopWatchNano timer(
771             ROCKSDB_NAMESPACE::SystemClock::Default().get(), true);
772         while (cur != other_keys_end) {
773           uint64_t h = hasher.GetHash(*cur);
774           uint32_t h1 = ROCKSDB_NAMESPACE::Lower32of64(h);
775           uint32_t h2 = sizeof(Hash) >= 8 ? ROCKSDB_NAMESPACE::Upper32of64(h)
776                                           : h1 * 0x9e3779b9;
777           bfp_count +=
778               ROCKSDB_NAMESPACE::FastLocalBloomImpl::HashMayMatch(
779                   h1, h2, static_cast<uint32_t>(ibytes), 6, idata.get())
780                   ? 1
781                   : 0;
782           ++cur;
783         }
784         bloom_query_nanos += timer.ElapsedNanos();
785         // ensure bfp_count is used
786         ASSERT_LT(bfp_count, FLAGS_max_check);
787       }
788     }
789 
790     // "outside" == key not in original set so either negative or false positive
791     fprintf(stderr,
792             "Simple      outside query, hot, incl hashing, ns/key: %g\n",
793             1.0 * soln_query_nanos / soln_query_count);
794     fprintf(stderr,
795             "Interleaved outside query, hot, incl hashing, ns/key: %g\n",
796             1.0 * isoln_query_nanos / isoln_query_count);
797     fprintf(stderr,
798             "Bloom       outside query, hot, incl hashing, ns/key: %g\n",
799             1.0 * bloom_query_nanos / soln_query_count);
800 
801     if (TypeParam::kHomogeneous) {
802       EXPECT_EQ(total_reseeds, 0U);
803     } else {
804       double average_reseeds = 1.0 * total_reseeds / FLAGS_thoroughness;
805       fprintf(stderr, "Average re-seeds: %g\n", average_reseeds);
806       // Values above were chosen to target around 50% chance of encoding
807       // success rate (average of 1.0 re-seeds) or slightly better. But 1.15 is
808       // also close enough.
809       EXPECT_LE(total_reseeds,
810                 InfrequentPoissonUpperBound(1.15 * expected_reseeds *
811                                             FLAGS_thoroughness));
812       // Would use 0.85 here instead of 0.75, but
813       // TypesAndSettings_Hash32_SmallKeyGen can "beat the odds" because of
814       // sequential keys with a small, cheap hash function. We accept that
815       // there are surely inputs that are somewhat bad for this setup, but
816       // these somewhat good inputs are probably more likely.
817       EXPECT_GE(total_reseeds,
818                 InfrequentPoissonLowerBound(0.75 * expected_reseeds *
819                                             FLAGS_thoroughness));
820     }
821 
822     if (total_expand_trials > 0) {
823       double average_expand_failures =
824           1.0 * total_expand_failures / total_expand_trials;
825       fprintf(stderr, "Average expand failures, and overhead: %g, %g\n",
826               average_expand_failures,
827               total_expand_overhead / total_expand_trials);
828       // Seems to be a generous allowance
829       EXPECT_LE(total_expand_failures,
830                 InfrequentPoissonUpperBound(1.0 * total_expand_trials));
831     } else {
832       fprintf(stderr, "Average expand failures: N/A\n");
833     }
834 
835     if (total_singles > 0) {
836       double single_failure_rate = 1.0 * total_single_failures / total_singles;
837       fprintf(stderr, "Add'l single, failure rate: %g\n", single_failure_rate);
838       // A rough bound (one sided) based on nothing in particular
839       double expected_single_failures =
840           1.0 * total_singles /
841           (sizeof(CoeffRow) == 16 ? 128 : TypeParam::kUseSmash ? 64 : 32);
842       EXPECT_LE(total_single_failures,
843                 InfrequentPoissonUpperBound(expected_single_failures));
844     }
845 
846     if (total_batch > 0) {
847       // Counting successes here for Poisson to approximate the Binomial
848       // distribution.
849       // A rough bound (one sided) based on nothing in particular.
850       double expected_batch_successes = 1.0 * total_batch / 2;
851       uint64_t lower_bound =
852           InfrequentPoissonLowerBound(expected_batch_successes);
853       fprintf(stderr, "Add'l batch, success rate: %g (>= %g)\n",
854               1.0 * total_batch_successes / total_batch,
855               1.0 * lower_bound / total_batch);
856       EXPECT_GE(total_batch_successes, lower_bound);
857     }
858 
859     {
860       uint64_t total_checked = uint64_t{FLAGS_max_check} * FLAGS_thoroughness;
861       double expected_total_fp_count =
862           total_checked * std::pow(0.5, 8U * sizeof(ResultRow));
863       // For expected FP rate, also include false positives due to collisions
864       // in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
865       double average_added = 1.0 * total_added / FLAGS_thoroughness;
866       expected_total_fp_count +=
867           total_checked * ExpectedCollisionFpRate(Hasher(), average_added);
868 
869       uint64_t upper_bound =
870           InfrequentPoissonUpperBound(expected_total_fp_count);
871       uint64_t lower_bound =
872           InfrequentPoissonLowerBound(expected_total_fp_count);
873       fprintf(stderr, "Average FP rate: %g (~= %g, <= %g, >= %g)\n",
874               1.0 * total_fp_count / total_checked,
875               expected_total_fp_count / total_checked,
876               1.0 * upper_bound / total_checked,
877               1.0 * lower_bound / total_checked);
878       EXPECT_LE(total_fp_count, upper_bound);
879       EXPECT_GE(total_fp_count, lower_bound);
880     }
881   }
882 }
883 
TYPED_TEST(RibbonTypeParamTest,Extremes)884 TYPED_TEST(RibbonTypeParamTest, Extremes) {
885   IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
886   IMPORT_RIBBON_IMPL_TYPES(TypeParam);
887   using KeyGen = typename TypeParam::KeyGen;
888 
889   size_t bytes = 128 * 1024;
890   std::unique_ptr<char[]> buf(new char[bytes]);
891   InterleavedSoln isoln(buf.get(), bytes);
892   SimpleSoln soln;
893   Hasher hasher;
894   Banding banding;
895 
896   // ########################################
897   // Add zero keys to minimal number of slots
898   KeyGen begin_and_end("foo", 123);
899   ASSERT_TRUE(banding.ResetAndFindSeedToSolve(
900       /*slots*/ kCoeffBits, begin_and_end, begin_and_end, /*first seed*/ 0,
901       /* seed mask*/ 0));
902 
903   soln.BackSubstFrom(banding);
904   isoln.BackSubstFrom(banding);
905 
906   // Because there's plenty of memory, we expect the interleaved solution to
907   // use maximum supported columns (same as simple solution)
908   ASSERT_EQ(isoln.GetUpperNumColumns(), 8U * sizeof(ResultRow));
909   ASSERT_EQ(isoln.GetUpperStartBlock(), 0U);
910 
911   // Somewhat oddly, we expect same FP rate as if we had essentially filled
912   // up the slots.
913   KeyGen other_keys_begin("not", 0);
914   KeyGen other_keys_end("not", FLAGS_max_check);
915 
916   Index fp_count = 0;
917   KeyGen cur = other_keys_begin;
918   while (cur != other_keys_end) {
919     bool isoln_query_result = isoln.FilterQuery(*cur, hasher);
920     bool soln_query_result = soln.FilterQuery(*cur, hasher);
921     // Solutions are equivalent
922     ASSERT_EQ(isoln_query_result, soln_query_result);
923     if (!TypeParam::kHomogeneous) {
924       // And in fact we only expect an FP when ResultRow is 0
925       // (except Homogeneous)
926       ASSERT_EQ(soln_query_result, hasher.GetResultRowFromHash(
927                                        hasher.GetHash(*cur)) == ResultRow{0});
928     }
929     fp_count += soln_query_result ? 1 : 0;
930     ++cur;
931   }
932   {
933     ASSERT_EQ(isoln.ExpectedFpRate(), soln.ExpectedFpRate());
934     double expected_fp_count = isoln.ExpectedFpRate() * FLAGS_max_check;
935     EXPECT_LE(fp_count, InfrequentPoissonUpperBound(expected_fp_count));
936     if (TypeParam::kHomogeneous) {
937       // Pseudorandom garbage in Homogeneous filter can "beat the odds" if
938       // nothing added
939     } else {
940       EXPECT_GE(fp_count, InfrequentPoissonLowerBound(expected_fp_count));
941     }
942   }
943 
944   // ######################################################
945   // Use zero bytes for interleaved solution (key(s) added)
946 
947   // Add one key
948   KeyGen key_begin("added", 0);
949   KeyGen key_end("added", 1);
950   ASSERT_TRUE(banding.ResetAndFindSeedToSolve(
951       /*slots*/ kCoeffBits, key_begin, key_end, /*first seed*/ 0,
952       /* seed mask*/ 0));
953 
954   InterleavedSoln isoln2(nullptr, /*bytes*/ 0);
955 
956   isoln2.BackSubstFrom(banding);
957 
958   ASSERT_EQ(isoln2.GetUpperNumColumns(), 0U);
959   ASSERT_EQ(isoln2.GetUpperStartBlock(), 0U);
960 
961   // All queries return true
962   ASSERT_TRUE(isoln2.FilterQuery(*other_keys_begin, hasher));
963   ASSERT_EQ(isoln2.ExpectedFpRate(), 1.0);
964 }
965 
TEST(RibbonTest,AllowZeroStarts)966 TEST(RibbonTest, AllowZeroStarts) {
967   IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings_AllowZeroStarts);
968   IMPORT_RIBBON_IMPL_TYPES(TypesAndSettings_AllowZeroStarts);
969   using KeyGen = StandardKeyGen;
970 
971   InterleavedSoln isoln(nullptr, /*bytes*/ 0);
972   SimpleSoln soln;
973   Hasher hasher;
974   Banding banding;
975 
976   KeyGen begin("foo", 0);
977   KeyGen end("foo", 1);
978   // Can't add 1 entry
979   ASSERT_FALSE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin, end));
980 
981   KeyGen begin_and_end("foo", 123);
982   // Can add 0 entries
983   ASSERT_TRUE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin_and_end,
984                                               begin_and_end));
985 
986   Seed reseeds = banding.GetOrdinalSeed();
987   ASSERT_EQ(reseeds, 0U);
988   hasher.SetOrdinalSeed(reseeds);
989 
990   // Can construct 0-slot solutions
991   isoln.BackSubstFrom(banding);
992   soln.BackSubstFrom(banding);
993 
994   // Should always return false
995   ASSERT_FALSE(isoln.FilterQuery(*begin, hasher));
996   ASSERT_FALSE(soln.FilterQuery(*begin, hasher));
997 
998   // And report that in FP rate
999   ASSERT_EQ(isoln.ExpectedFpRate(), 0.0);
1000   ASSERT_EQ(soln.ExpectedFpRate(), 0.0);
1001 }
1002 
TEST(RibbonTest,RawAndOrdinalSeeds)1003 TEST(RibbonTest, RawAndOrdinalSeeds) {
1004   StandardHasher<TypesAndSettings_Seed64> hasher64;
1005   StandardHasher<DefaultTypesAndSettings> hasher64_32;
1006   StandardHasher<TypesAndSettings_Hash32> hasher32;
1007   StandardHasher<TypesAndSettings_Seed8> hasher8;
1008 
1009   for (uint32_t limit : {0xffU, 0xffffU}) {
1010     std::vector<bool> seen(limit + 1);
1011     for (uint32_t i = 0; i < limit; ++i) {
1012       hasher64.SetOrdinalSeed(i);
1013       auto raw64 = hasher64.GetRawSeed();
1014       hasher32.SetOrdinalSeed(i);
1015       auto raw32 = hasher32.GetRawSeed();
1016       hasher8.SetOrdinalSeed(static_cast<uint8_t>(i));
1017       auto raw8 = hasher8.GetRawSeed();
1018       {
1019         hasher64_32.SetOrdinalSeed(i);
1020         auto raw64_32 = hasher64_32.GetRawSeed();
1021         ASSERT_EQ(raw64_32, raw32);  // Same size seed
1022       }
1023       if (i == 0) {
1024         // Documented that ordinal seed 0 == raw seed 0
1025         ASSERT_EQ(raw64, 0U);
1026         ASSERT_EQ(raw32, 0U);
1027         ASSERT_EQ(raw8, 0U);
1028       } else {
1029         // Extremely likely that upper bits are set
1030         ASSERT_GT(raw64, raw32);
1031         ASSERT_GT(raw32, raw8);
1032       }
1033       // Hashers agree on lower bits
1034       ASSERT_EQ(static_cast<uint32_t>(raw64), raw32);
1035       ASSERT_EQ(static_cast<uint8_t>(raw32), raw8);
1036 
1037       // The translation is one-to-one for this size prefix
1038       uint32_t v = static_cast<uint32_t>(raw32 & limit);
1039       ASSERT_EQ(raw64 & limit, v);
1040       ASSERT_FALSE(seen[v]);
1041       seen[v] = true;
1042     }
1043   }
1044 }
1045 
1046 namespace {
1047 
1048 struct PhsfInputGen {
PhsfInputGen__anon85a6d2830311::PhsfInputGen1049   PhsfInputGen(const std::string& prefix, uint64_t id) : id_(id) {
1050     val_.first = prefix;
1051     ROCKSDB_NAMESPACE::PutFixed64(&val_.first, /*placeholder*/ 0);
1052   }
1053 
1054   // Prefix (only one required)
operator ++__anon85a6d2830311::PhsfInputGen1055   PhsfInputGen& operator++() {
1056     ++id_;
1057     return *this;
1058   }
1059 
operator *__anon85a6d2830311::PhsfInputGen1060   const std::pair<std::string, uint8_t>& operator*() {
1061     // Use multiplication to mix things up a little in the key
1062     ROCKSDB_NAMESPACE::EncodeFixed64(&val_.first[val_.first.size() - 8],
1063                                      id_ * uint64_t{0x1500000001});
1064     // Occasionally repeat values etc.
1065     val_.second = static_cast<uint8_t>(id_ * 7 / 8);
1066     return val_;
1067   }
1068 
operator ->__anon85a6d2830311::PhsfInputGen1069   const std::pair<std::string, uint8_t>* operator->() { return &**this; }
1070 
operator ==__anon85a6d2830311::PhsfInputGen1071   bool operator==(const PhsfInputGen& other) {
1072     // Same prefix is assumed
1073     return id_ == other.id_;
1074   }
operator !=__anon85a6d2830311::PhsfInputGen1075   bool operator!=(const PhsfInputGen& other) {
1076     // Same prefix is assumed
1077     return id_ != other.id_;
1078   }
1079 
1080   uint64_t id_;
1081   std::pair<std::string, uint8_t> val_;
1082 };
1083 
1084 struct PhsfTypesAndSettings : public DefaultTypesAndSettings {
1085   static constexpr bool kIsFilter = false;
1086 };
1087 }  // namespace
1088 
TEST(RibbonTest,PhsfBasic)1089 TEST(RibbonTest, PhsfBasic) {
1090   IMPORT_RIBBON_TYPES_AND_SETTINGS(PhsfTypesAndSettings);
1091   IMPORT_RIBBON_IMPL_TYPES(PhsfTypesAndSettings);
1092 
1093   Index num_slots = 12800;
1094   Index num_to_add = static_cast<Index>(num_slots / 1.02);
1095 
1096   PhsfInputGen begin("in", 0);
1097   PhsfInputGen end("in", num_to_add);
1098 
1099   std::unique_ptr<char[]> idata(new char[/*bytes*/ num_slots]);
1100   InterleavedSoln isoln(idata.get(), /*bytes*/ num_slots);
1101   SimpleSoln soln;
1102   Hasher hasher;
1103 
1104   {
1105     Banding banding;
1106     ASSERT_TRUE(banding.ResetAndFindSeedToSolve(num_slots, begin, end));
1107 
1108     soln.BackSubstFrom(banding);
1109     isoln.BackSubstFrom(banding);
1110 
1111     hasher.SetOrdinalSeed(banding.GetOrdinalSeed());
1112   }
1113 
1114   for (PhsfInputGen cur = begin; cur != end; ++cur) {
1115     ASSERT_EQ(cur->second, soln.PhsfQuery(cur->first, hasher));
1116     ASSERT_EQ(cur->second, isoln.PhsfQuery(cur->first, hasher));
1117   }
1118 }
1119 
1120 // Not a real test, but a tool used to build APIs in ribbon_config.h
TYPED_TEST(RibbonTypeParamTest,FindOccupancy)1121 TYPED_TEST(RibbonTypeParamTest, FindOccupancy) {
1122   IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
1123   IMPORT_RIBBON_IMPL_TYPES(TypeParam);
1124   using KeyGen = typename TypeParam::KeyGen;
1125 
1126   if (!FLAGS_find_occ) {
1127     ROCKSDB_GTEST_BYPASS("Tool disabled during unit test runs");
1128     return;
1129   }
1130 
1131   KeyGen cur(ROCKSDB_NAMESPACE::ToString(
1132                  testing::UnitTest::GetInstance()->random_seed()),
1133              0);
1134 
1135   Banding banding;
1136   Index num_slots = InterleavedSoln::RoundUpNumSlots(FLAGS_find_min_slots);
1137   Index max_slots = InterleavedSoln::RoundUpNumSlots(FLAGS_find_max_slots);
1138   while (num_slots <= max_slots) {
1139     std::map<int32_t, uint32_t> rem_histogram;
1140     std::map<Index, uint32_t> slot_histogram;
1141     if (FLAGS_find_slot_occ) {
1142       for (Index i = 0; i < kCoeffBits; ++i) {
1143         slot_histogram[i] = 0;
1144         slot_histogram[num_slots - 1 - i] = 0;
1145         slot_histogram[num_slots / 2 - kCoeffBits / 2 + i] = 0;
1146       }
1147     }
1148     uint64_t total_added = 0;
1149     for (uint32_t i = 0; i < FLAGS_find_iters; ++i) {
1150       banding.Reset(num_slots);
1151       uint32_t j = 0;
1152       KeyGen end = cur;
1153       end += num_slots + num_slots / 10;
1154       for (; cur != end; ++cur) {
1155         if (banding.Add(*cur)) {
1156           ++j;
1157         } else {
1158           break;
1159         }
1160       }
1161       total_added += j;
1162       for (auto& slot : slot_histogram) {
1163         slot.second += banding.IsOccupied(slot.first);
1164       }
1165 
1166       int32_t bucket =
1167           static_cast<int32_t>(num_slots) - static_cast<int32_t>(j);
1168       rem_histogram[bucket]++;
1169       if (FLAGS_verbose) {
1170         fprintf(stderr, "num_slots: %u i: %u / %u avg_overhead: %g\r",
1171                 static_cast<unsigned>(num_slots), static_cast<unsigned>(i),
1172                 static_cast<unsigned>(FLAGS_find_iters),
1173                 1.0 * (i + 1) * num_slots / total_added);
1174       }
1175     }
1176     if (FLAGS_verbose) {
1177       fprintf(stderr, "\n");
1178     }
1179 
1180     uint32_t cumulative = 0;
1181 
1182     double p50_rem = 0;
1183     double p95_rem = 0;
1184     double p99_9_rem = 0;
1185 
1186     for (auto& h : rem_histogram) {
1187       double before = 1.0 * cumulative / FLAGS_find_iters;
1188       double not_after = 1.0 * (cumulative + h.second) / FLAGS_find_iters;
1189       if (FLAGS_verbose) {
1190         fprintf(stderr, "overhead: %g before: %g not_after: %g\n",
1191                 1.0 * num_slots / (num_slots - h.first), before, not_after);
1192       }
1193       cumulative += h.second;
1194       if (before < 0.5 && 0.5 <= not_after) {
1195         // fake it with linear interpolation
1196         double portion = (0.5 - before) / (not_after - before);
1197         p50_rem = h.first + portion;
1198       } else if (before < 0.95 && 0.95 <= not_after) {
1199         // fake it with linear interpolation
1200         double portion = (0.95 - before) / (not_after - before);
1201         p95_rem = h.first + portion;
1202       } else if (before < 0.999 && 0.999 <= not_after) {
1203         // fake it with linear interpolation
1204         double portion = (0.999 - before) / (not_after - before);
1205         p99_9_rem = h.first + portion;
1206       }
1207     }
1208     for (auto& slot : slot_histogram) {
1209       fprintf(stderr, "slot[%u] occupied: %g\n", (unsigned)slot.first,
1210               1.0 * slot.second / FLAGS_find_iters);
1211     }
1212 
1213     double mean_rem =
1214         (1.0 * FLAGS_find_iters * num_slots - total_added) / FLAGS_find_iters;
1215     fprintf(
1216         stderr,
1217         "num_slots: %u iters: %u mean_ovr: %g p50_ovr: %g p95_ovr: %g "
1218         "p99.9_ovr: %g mean_rem: %g p50_rem: %g p95_rem: %g p99.9_rem: %g\n",
1219         static_cast<unsigned>(num_slots),
1220         static_cast<unsigned>(FLAGS_find_iters),
1221         1.0 * num_slots / (num_slots - mean_rem),
1222         1.0 * num_slots / (num_slots - p50_rem),
1223         1.0 * num_slots / (num_slots - p95_rem),
1224         1.0 * num_slots / (num_slots - p99_9_rem), mean_rem, p50_rem, p95_rem,
1225         p99_9_rem);
1226 
1227     num_slots = std::max(
1228         num_slots + 1, static_cast<Index>(num_slots * FLAGS_find_next_factor));
1229     num_slots = InterleavedSoln::RoundUpNumSlots(num_slots);
1230   }
1231 }
1232 
1233 // Not a real test, but a tool to understand Homogeneous Ribbon
1234 // behavior (TODO: configuration APIs & tests)
TYPED_TEST(RibbonTypeParamTest,OptimizeHomogAtScale)1235 TYPED_TEST(RibbonTypeParamTest, OptimizeHomogAtScale) {
1236   IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
1237   IMPORT_RIBBON_IMPL_TYPES(TypeParam);
1238   using KeyGen = typename TypeParam::KeyGen;
1239 
1240   if (!FLAGS_optimize_homog) {
1241     ROCKSDB_GTEST_BYPASS("Tool disabled during unit test runs");
1242     return;
1243   }
1244 
1245   if (!TypeParam::kHomogeneous) {
1246     ROCKSDB_GTEST_BYPASS("Only for Homogeneous Ribbon");
1247     return;
1248   }
1249 
1250   KeyGen cur(ROCKSDB_NAMESPACE::ToString(
1251                  testing::UnitTest::GetInstance()->random_seed()),
1252              0);
1253 
1254   Banding banding;
1255   Index num_slots = SimpleSoln::RoundUpNumSlots(FLAGS_optimize_homog_slots);
1256   banding.Reset(num_slots);
1257 
1258   // This and "band_ovr" is the "allocated overhead", or slots over added.
1259   // It does not take into account FP rates.
1260   double target_overhead = 1.20;
1261   uint32_t num_added = 0;
1262 
1263   do {
1264     do {
1265       (void)banding.Add(*cur);
1266       ++cur;
1267       ++num_added;
1268     } while (1.0 * num_slots / num_added > target_overhead);
1269 
1270     SimpleSoln soln;
1271     soln.BackSubstFrom(banding);
1272 
1273     std::array<uint32_t, 8U * sizeof(ResultRow)> fp_counts_by_cols;
1274     fp_counts_by_cols.fill(0U);
1275     for (uint32_t i = 0; i < FLAGS_optimize_homog_check; ++i) {
1276       ResultRow r = soln.PhsfQuery(*cur, banding);
1277       ++cur;
1278       for (size_t j = 0; j < fp_counts_by_cols.size(); ++j) {
1279         if ((r & 1) == 1) {
1280           break;
1281         }
1282         fp_counts_by_cols[j]++;
1283         r /= 2;
1284       }
1285     }
1286     fprintf(stderr, "band_ovr: %g ", 1.0 * num_slots / num_added);
1287     for (unsigned j = 0; j < fp_counts_by_cols.size(); ++j) {
1288       double inv_fp_rate =
1289           1.0 * FLAGS_optimize_homog_check / fp_counts_by_cols[j];
1290       double equiv_cols = std::log(inv_fp_rate) * 1.4426950409;
1291       // Overhead vs. information-theoretic minimum based on observed
1292       // FP rate (subject to sampling error, especially for low FP rates)
1293       double actual_overhead =
1294           1.0 * (j + 1) * num_slots / (equiv_cols * num_added);
1295       fprintf(stderr, "ovr_%u: %g ", j + 1, actual_overhead);
1296     }
1297     fprintf(stderr, "\n");
1298     target_overhead -= FLAGS_optimize_homog_granularity;
1299   } while (target_overhead > 1.0);
1300 }
1301 
main(int argc,char ** argv)1302 int main(int argc, char** argv) {
1303   ::testing::InitGoogleTest(&argc, argv);
1304 #ifdef GFLAGS
1305   ParseCommandLineFlags(&argc, &argv, true);
1306 #endif  // GFLAGS
1307   return RUN_ALL_TESTS();
1308 }
1309