1 //  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 
6 #pragma once
7 
8 #include <array>
9 #include <memory>
10 
11 #include "rocksdb/rocksdb_namespace.h"
12 #include "util/math128.h"
13 
14 namespace ROCKSDB_NAMESPACE {
15 
16 namespace ribbon {
17 
18 // RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly)
19 //
20 // ribbon_alg.h: generic versions of core algorithms.
21 //
22 // Ribbon is a Perfect Hash Static Function construction useful as a compact
23 // static Bloom filter alternative. It combines (a) a boolean (GF(2)) linear
24 // system construction that approximates a Band Matrix with hashing,
25 // (b) an incremental, on-the-fly Gaussian Elimination algorithm that is
26 // remarkably efficient and adaptable at constructing an upper-triangular
27 // band matrix from a set of band-approximating inputs from (a), and
28 // (c) a storage layout that is fast and adaptable as a filter.
29 //
30 // Footnotes: (a) "Efficient Gauss Elimination for Near-Quadratic Matrices
31 // with One Short Random Block per Row, with Applications" by Stefan
32 // Walzer and Martin Dietzfelbinger ("DW paper")
33 // (b) developed by Peter C. Dillinger, though not the first on-the-fly
34 // GE algorithm. See "On the fly Gaussian Elimination for LT codes" by
35 // Bioglio, Grangetto, Gaeta, and Sereno.
36 // (c) see "interleaved" solution storage below.
37 //
38 // See ribbon_impl.h for high-level behavioral summary. This file focuses
39 // on the core design details.
40 //
41 // ######################################################################
42 // ################# PHSF -> static filter reduction ####################
43 //
44 // A Perfect Hash Static Function is a data structure representing a
45 // map from anything hashable (a "key") to values of some fixed size.
46 // Crucially, it is allowed to return garbage values for anything not in
47 // the original set of map keys, and it is a "static" structure: entries
48 // cannot be added or deleted after construction. PHSFs representing n
49 // mappings to b-bit values (assume uniformly distributed) require at least
50 // n * b bits to represent, or at least b bits per entry. We typically
51 // describe the compactness of a PHSF by typical bits per entry as some
52 // function of b. For example, the MWHC construction (k=3 "peeling")
// requires about 1.222*b and a variant called Xor+ requires about
54 // 1.08*b + 0.5 bits per entry.
55 //
56 // With more hashing, a PHSF can over-approximate a set as a Bloom filter
57 // does, with no FN queries and predictable false positive (FP) query
58 // rate. Instead of the user providing a value to map each input key to,
59 // a hash function provides the value. Keys in the original set will
60 // return a positive membership query because the underlying PHSF returns
61 // the same value as hashing the key. When a key is not in the original set,
62 // the PHSF returns a "garbage" value, which is only equal to the key's
63 // hash with (false positive) probability 1 in 2^b.
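//
// For illustration only (not this file's API): a filter query on top of
// a b-bit PHSF is conceptually just
//
//   bool MayContain(const Key& key) {
//     return phsf.Query(key) == HashToValue(key);  // compare b bits
//   }
//
// where HashToValue is a hypothetical b-bit hash; with b = 8, a key
// outside the original set matches the garbage value with probability
// 2^-8, i.e. an FP rate of about 0.4%.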
64 //
65 // For a matching false positive rate, standard Bloom filters require
66 // 1.44*b bits per entry. Cache-local Bloom filters (like bloom_impl.h)
67 // require a bit more, around 1.5*b bits per entry. Thus, a Bloom
68 // alternative could save up to or nearly 1/3rd of memory and storage
69 // that RocksDB uses for SST (static) Bloom filters. (Memtable Bloom filter
70 // is dynamic.)
71 //
72 // Recommended reading:
73 // "Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters"
74 // by Graf and Lemire
75 // First three sections of "Fast Scalable Construction of (Minimal
76 // Perfect Hash) Functions" by Genuzio, Ottaviano, and Vigna
77 //
78 // ######################################################################
79 // ################## PHSF vs. hash table vs. Bloom #####################
80 //
81 // You can think of traditional hash tables and related filter variants
82 // such as Cuckoo filters as utilizing an "OR" construction: a hash
83 // function associates a key with some slots and the data is returned if
84 // the data is found in any one of those slots. The collision resolution
85 // is visible in the final data structure and requires extra information.
86 // For example, Cuckoo filter uses roughly 1.05b + 2 bits per entry, and
87 // Golomb-Rice code (aka "GCS") as little as b + 1.5. When the data
88 // structure associates each input key with data in one slot, the
89 // structure implicitly constructs a (near-)minimal (near-)perfect hash
90 // (MPH) of the keys, which requires at least 1.44 bits per key to
91 // represent. This is why approaches with visible collision resolution
92 // have a fixed + 1.5 or more in storage overhead per entry, often in
93 // addition to an overhead multiplier on b.
94 //
95 // By contrast Bloom filters utilize an "AND" construction: a query only
96 // returns true if all bit positions associated with a key are set to 1.
97 // There is no collision resolution, so Bloom filters do not suffer a
98 // fixed bits per entry overhead like the above structures.
99 //
100 // PHSFs typically use a bitwise XOR construction: the data you want is
101 // not in a single slot, but in a linear combination of several slots.
102 // For static data, this gives the best of "AND" and "OR" constructions:
// avoids the +1.44 or more fixed overhead by not approximating an MPH and
104 // can do much better than Bloom's 1.44 factor on b with collision
105 // resolution, which here is done ahead of time and invisible at query
106 // time.
107 //
108 // ######################################################################
109 // ######################## PHSF construction ###########################
110 //
111 // For a typical PHSF, construction is solving a linear system of
112 // equations, typically in GF(2), which is to say that values are boolean
113 // and XOR serves both as addition and subtraction. We can use matrices to
114 // represent the problem:
115 //
116 //    C    *    S    =    R
117 // (n x m)   (m x b)   (n x b)
118 // where C = coefficients, S = solution, R = results
119 // and solving for S given C and R.
120 //
121 // Note that C and R each have n rows, one for each input entry for the
122 // PHSF. A row in C is given by a hash function on the PHSF input key,
123 // and the corresponding row in R is the b-bit value to associate with
124 // that input key. (In a filter, rows of R are given by another hash
125 // function on the input key.)
126 //
127 // On solving, the matrix S (solution) is the final PHSF data, as it
128 // maps any row from the original C to its corresponding desired result
129 // in R. We just have to hash our query inputs and compute a linear
130 // combination of rows in S.
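//
// As a dense, illustrative sketch (not the representation used below):
// if hashing a query key yields coefficient row c, the query result is
// the GF(2) sum (bitwise XOR) of the rows of S selected by the 1 bits
// of c:
//
//   // Hypothetical dense query; S has m rows of b bits each.
//   ResultRow Query(const std::vector<bool>& c,
//                   const std::vector<ResultRow>& S) {
//     ResultRow r = 0;
//     for (size_t i = 0; i < S.size(); ++i) {
//       if (c[i]) r ^= S[i];  // XOR is addition in GF(2)
//     }
//     return r;
//   }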
131 //
// In theory, we could choose m = n and let a hash function associate
// each input key with random rows in C. A solution exists with high
// probability, and uses essentially minimum space, b bits per entry
// (because we set m = n), but this has terrible scaling, something
136 // like O(n^2) space and O(n^3) time during construction (Gaussian
137 // elimination) and O(n) query time. But computational efficiency is
138 // key, and the core of this is avoiding scanning all of S to answer
139 // each query.
140 //
141 // The traditional approach (MWHC, aka Xor filter) starts with setting
142 // only some small fixed number of columns (typically k=3) to 1 for each
143 // row of C, with remaining entries implicitly 0. This is implemented as
// three hash functions over [0,m), and S can be implemented as a
// vector of b-bit values. Now, a query only involves looking up k rows
// (values) in S and computing their bitwise XOR. Additionally, this
// construction can use a linear time algorithm called "peeling" for
// finding a solution in many cases where one exists, but peeling
149 // generally requires a larger space overhead factor in the solution
150 // (m/n) than is required with Gaussian elimination.
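//
// For comparison, a hedged sketch of the MWHC / Xor-style query (the
// hash helpers and array S here are hypothetical, not part of this
// file):
//
//   ResultRow XorQuery(const Key& key) {
//     return S[Hash1(key) % m] ^ S[Hash2(key) % m] ^ S[Hash3(key) % m];
//   }
//
// i.e. exactly k = 3 random look-ups into S, independent of b.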
151 //
152 // Recommended reading:
153 // "Peeling Close to the Orientability Threshold – Spatial Coupling in
154 // Hashing-Based Data Structures" by Stefan Walzer
155 //
156 // ######################################################################
157 // ##################### Ribbon PHSF construction #######################
158 //
159 // Ribbon constructs coefficient rows essentially the same as in the
160 // Walzer/Dietzfelbinger paper cited above: for some chosen fixed width
161 // r (kCoeffBits in code), each key is hashed to a starting column in
162 // [0, m - r] (GetStart() in code) and an r-bit sequence of boolean
163 // coefficients (GetCoeffRow() in code). If you sort the rows by start,
164 // the C matrix would look something like this:
165 //
166 // [####00000000000000000000]
167 // [####00000000000000000000]
168 // [000####00000000000000000]
169 // [0000####0000000000000000]
170 // [0000000####0000000000000]
171 // [000000000####00000000000]
172 // [000000000####00000000000]
173 // [0000000000000####0000000]
174 // [0000000000000000####0000]
175 // [00000000000000000####000]
176 // [00000000000000000000####]
177 //
178 // where each # could be a 0 or 1, chosen uniformly by a hash function.
179 // (Except we typically set the start column value to 1.) This scheme
180 // uses hashing to approximate a band matrix, and it has a solution iff
181 // it reduces to an upper-triangular boolean r-band matrix, like this:
182 //
183 // [1###00000000000000000000]
184 // [01##00000000000000000000]
185 // [000000000000000000000000]
186 // [0001###00000000000000000]
187 // [000000000000000000000000]
188 // [000001##0000000000000000]
189 // [000000000000000000000000]
190 // [00000001###0000000000000]
191 // [000000001###000000000000]
192 // [0000000001##000000000000]
193 // ...
194 // [00000000000000000000001#]
195 // [000000000000000000000001]
196 //
197 // where we have expanded to an m x m matrix by filling with rows of
198 // all zeros as needed. As in Gaussian elimination, this form is ready for
199 // generating a solution through back-substitution.
200 //
201 // The awesome thing about the Ribbon construction (from the DW paper) is
202 // how row reductions keep each row representable as a start column and
203 // r coefficients, because row reductions are only needed when two rows
204 // have the same number of leading zero columns. Thus, the combination
205 // of those rows, the bitwise XOR of the r-bit coefficient rows, cancels
206 // out the leading 1s, so starts (at least) one column later and only
207 // needs (at most) r - 1 coefficients.
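//
// In code terms, this is the reduction step at the heart of BandingAdd
// below (variable names simplified): a pending row is just a start
// column plus an r-bit coefficient word whose low bit corresponds to
// that column.
//
//   cr ^= existing_cr;  // leading 1s cancel, so the low bit of cr is now 0
//   rr ^= existing_rr;  // keep the right-hand side of the equation in sync
//   int tz = CountTrailingZeroBits(cr);
//   start += tz;        // combined row starts at least one column later...
//   cr >>= tz;          // ...and needs at most r - 1 coefficients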
208 //
209 // ######################################################################
210 // ###################### Ribbon PHSF scalability #######################
211 //
212 // Although more practical detail is in ribbon_impl.h, it's worth
213 // understanding some of the overall benefits and limitations of the
214 // Ribbon PHSFs.
215 //
216 // High-end scalability is a primary issue for Ribbon PHSFs, because in
217 // a single Ribbon linear system with fixed r and fixed m/n ratio, the
218 // solution probability approaches zero as n approaches infinity.
219 // For a given n, solution probability improves with larger r and larger
220 // m/n.
221 //
222 // By contrast, peeling-based PHSFs have somewhat worse storage ratio
223 // or solution probability for small n (less than ~1000). This is
224 // especially true with spatial-coupling, where benefits are only
225 // notable for n on the order of 100k or 1m or more.
226 //
227 // To make best use of current hardware, r=128 seems to be closest to
228 // a "generally good" choice for Ribbon, at least in RocksDB where SST
229 // Bloom filters typically hold around 10-100k keys, and almost always
230 // less than 10m keys. r=128 ribbon has a high chance of encoding success
231 // (with first hash seed) when storage overhead is around 5% (m/n ~ 1.05)
232 // for roughly 10k - 10m keys in a single linear system. r=64 only scales
233 // up to about 10k keys with the same storage overhead. Construction and
234 // access times for r=128 are similar to r=64. r=128 tracks nearly
235 // twice as much data during construction, but in most cases we expect
236 // the scalability benefits of r=128 vs. r=64 to make it preferred.
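//
// As rough, illustrative arithmetic using the figures above: at a target
// FP rate of 2^-7 (~0.8%), an r=128 Ribbon filter with b = 7 result
// columns and m/n ~= 1.05 spends about 7 * 1.05 ~= 7.4 bits/key, while a
// cache-local Bloom filter near the same FP rate spends about
// 7 * 1.5 ~= 10.5 bits/key, roughly the ~30% savings noted earlier.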
237 //
238 // A natural approach to scaling Ribbon beyond ~10m keys is splitting
239 // (or "sharding") the inputs into multiple linear systems with their
240 // own hash seeds. This can also help to control peak memory consumption.
241 // TODO: much more to come
242 //
243 // ######################################################################
244 // #################### Ribbon on-the-fly banding #######################
245 //
246 // "Banding" is what we call the process of reducing the inputs to an
247 // upper-triangular r-band matrix ready for finishing a solution with
248 // back-substitution. Although the DW paper presents an algorithm for
249 // this ("SGauss"), the awesome properties of their construction enable
250 // an even simpler, faster, and more backtrackable algorithm. In simplest
251 // terms, the SGauss algorithm requires sorting the inputs by start
252 // columns, but it's possible to make Gaussian elimination resemble hash
253 // table insertion!
254 //
255 // The enhanced algorithm is based on these observations:
256 // - When processing a coefficient row with first 1 in column j,
257 //   - If it's the first at column j to be processed, it can be part of
//     the banding at row j. (And that decision is never overwritten,
//     with no loss of generality!)
260 //   - Else, it can be combined with existing row j and re-processed,
261 //     which will look for a later "empty" row or reach "no solution".
262 //
263 // We call our banding algorithm "incremental" and "on-the-fly" because
// (like hash table insertion) we are "finished" after each input is
// processed, with respect to all inputs processed so far. Although the
266 // band matrix is an intermediate step to the solution structure, we have
267 // eliminated intermediate steps and unnecessary data tracking for
268 // banding.
269 //
270 // Building on "incremental" and "on-the-fly", the banding algorithm is
271 // easily backtrackable because no (non-empty) rows are overwritten in
272 // the banding. Thus, if we want to "try" adding an additional set of
273 // inputs to the banding, we only have to record which rows were written
274 // in order to efficiently backtrack to our state before considering
275 // the additional set. (TODO: how this can mitigate scalability and
276 // reach sub-1% overheads)
277 //
278 // Like in a linear-probed hash table, as the occupancy approaches and
279 // surpasses 90-95%, collision resolution dominates the construction
280 // time. (Ribbon doesn't usually pay at query time; see solution
281 // storage below.) This means that we can speed up construction time
282 // by using a higher m/n ratio, up to negative returns around 1.2.
283 // At m/n ~= 1.2, which still saves memory substantially vs. Bloom
284 // filter's 1.5, construction speed (including back-substitution) is not
285 // far from sorting speed, but still a few times slower than cache-local
286 // Bloom construction speed.
287 //
288 // Back-substitution from an upper-triangular boolean band matrix is
// especially fast and easy. All the memory accesses are sequential or at
// least local, none random. If the number of result bits (b) is a
291 // compile-time constant, the back-substitution state can even be tracked
292 // in CPU registers. Regardless of the solution representation, we prefer
293 // column-major representation for tracking back-substitution state, as
294 // r (the band width) will typically be much larger than b (result bits
295 // or columns), so better to handle r-bit values b times (per solution
296 // row) than b-bit values r times.
297 //
298 // ######################################################################
299 // ##################### Ribbon solution storage ########################
300 //
301 // Row-major layout is typical for boolean (bit) matrices, including for
302 // MWHC (Xor) filters where a query combines k b-bit values, and k is
303 // typically smaller than b. Even for k=4 and b=2, at least k=4 random
304 // look-ups are required regardless of layout.
305 //
306 // Ribbon PHSFs are quite different, however, because
307 // (a) all of the solution rows relevant to a query are within a single
308 // range of r rows, and
309 // (b) the number of solution rows involved (r/2 on average, or r if
310 // avoiding conditional accesses) is typically much greater than
311 // b, the number of solution columns.
312 //
313 // Row-major for Ribbon PHSFs therefore tends to incur undue CPU overhead
314 // by processing (up to) r entries of b bits each, where b is typically
315 // less than 10 for filter applications.
316 //
317 // Column-major layout has poor locality because of accessing up to b
318 // memory locations in different pages (and obviously cache lines). Note
319 // that negative filter queries do not typically need to access all
320 // solution columns, as they can return when a mismatch is found in any
321 // result/solution column. This optimization doesn't always pay off on
322 // recent hardware, where the penalty for unpredictable conditional
323 // branching can exceed the penalty for unnecessary work, but the
324 // optimization is essentially unavailable with row-major layout.
325 //
326 // The best compromise seems to be interleaving column-major on the small
327 // scale with row-major on the large scale. For example, let a solution
328 // "block" be r rows column-major encoded as b r-bit values in sequence.
329 // Each query accesses (up to) 2 adjacent blocks, which will typically
330 // span 1-3 cache lines in adjacent memory. We get very close to the same
331 // locality as row-major, but with much faster reconstruction of each
332 // result column, at least for filter applications where b is relatively
333 // small and negative queries can return early.
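//
// A worked example of the interleaved layout (assuming r = 128, b = 6
// solution columns throughout, i.e. GetUpperStartBlock() == 0): a query
// whose start slot is 1000 falls in block 1000 / 128 = 7 at bit offset
// 1000 % 128 = 104, so it reads the 6 segments of block 7 (segments
// 42..47) and, because the offset is non-zero, the 6 segments of block 8
// (segments 48..53): two adjacent runs of CoeffRow values, exactly the
// range prefetched by InterleavedPrepareQuery below.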
334 //
335 // ######################################################################
336 // ###################### Fractional result bits ########################
337 //
338 // Bloom filters have great flexibility that alternatives mostly do not
339 // have. One of those flexibilities is in utilizing any ratio of data
340 // structure bits per key. With a typical memory allocator like jemalloc,
341 // this flexibility can save roughly 10% of the filters' footprint in
342 // DRAM by rounding up and down filter sizes to minimize memory internal
343 // fragmentation (see optimize_filters_for_memory RocksDB option).
344 //
345 // At first glance, PHSFs only offer a whole number of bits per "slot"
346 // (m rather than number of keys n), but coefficient locality in the
347 // Ribbon construction makes fractional bits/key quite possible and
348 // attractive for filter applications. This works by a prefix of the
349 // structure using b-1 solution columns and the rest using b solution
350 // columns. See InterleavedSolutionStorage below for more detail.
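//
// Illustrative numbers only: if the first quarter of the blocks use
// b-1 = 6 solution columns and the remaining three quarters use b = 7,
// the structure averages 6.75 bits per slot (about 6.75 * 1.05 ~= 7.1
// bits/key at m/n ~= 1.05); any average between 6 and 7 is reachable by
// moving that boundary (GetUpperStartBlock below).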
351 //
352 // Because false positive rates are non-linear in bits/key, this approach
353 // is not quite optimal in terms of information theory. In common cases,
354 // we see additional space overhead up to about 1.5% vs. theoretical
355 // optimal to achieve the same FP rate. We consider this a quite acceptable
356 // overhead for very efficiently utilizing space that might otherwise be
357 // wasted.
358 //
359 // This property of Ribbon even makes it "elastic." A Ribbon filter and
360 // its small metadata for answering queries can be adapted into another
361 // Ribbon filter filling any smaller multiple of r bits (plus small
362 // metadata), with a correspondingly higher FP rate. None of the data
363 // thrown away during construction needs to be recalled for this reduction.
364 // Similarly a single Ribbon construction can be separated (by solution
365 // column) into two or more structures (or "layers" or "levels") with
366 // independent filtering ability (no FP correlation, just as solution or
367 // result columns in a single structure) despite being constructed as part
368 // of a single linear system. (TODO: implement)
369 // See also "ElasticBF: Fine-grained and Elastic Bloom Filter Towards
370 // Efficient Read for LSM-tree-based KV Stores."
371 //
372 
373 // ######################################################################
374 // ################### CODE: Ribbon core algorithms #####################
375 // ######################################################################
376 //
377 // These algorithms are templatized for genericity but near-maximum
378 // performance in a given application. The template parameters
379 // adhere to informal class/struct type concepts outlined below. (This
380 // code is written for C++11 so does not use formal C++ concepts.)
381 
382 // Rough architecture for these algorithms:
383 //
384 // +-----------+     +---+     +-----------------+
385 // | AddInputs | --> | H | --> | BandingStorage  |
386 // +-----------+     | a |     +-----------------+
387 //                   | s |             |
388 //                   | h |      Back substitution
389 //                   | e |             V
390 // +-----------+     | r |     +-----------------+
391 // | Query Key | --> |   | >+< | SolutionStorage |
392 // +-----------+     +---+  |  +-----------------+
393 //                          V
394 //                     Query result
395 
396 // Common to other concepts
397 // concept RibbonTypes {
398 //   // An unsigned integer type for an r-bit subsequence of coefficients.
399 //   // r (or kCoeffBits) is taken to be sizeof(CoeffRow) * 8, as it would
400 //   // generally only hurt scalability to leave bits of CoeffRow unused.
401 //   typename CoeffRow;
402 //   // An unsigned integer type big enough to hold a result row (b bits,
403 //   // or number of solution/result columns).
404 //   // In many applications, especially filters, the number of result
405 //   // columns is decided at run time, so ResultRow simply needs to be
406 //   // big enough for the largest number of columns allowed.
407 //   typename ResultRow;
408 //   // An unsigned integer type sufficient for representing the number of
409 //   // rows in the solution structure, and at least the arithmetic
410 //   // promotion size (usually 32 bits). uint32_t recommended because a
411 //   // single Ribbon construction doesn't really scale to billions of
412 //   // entries.
413 //   typename Index;
414 // };
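//
// For illustration, one plausible concrete assignment of these types
// (the real choices live in ribbon_impl.h; this is only a sketch):
//
//   struct ExampleTypes {
//     using CoeffRow = Unsigned128;  // e.g. from util/math128.h; r = 128
//     using ResultRow = uint8_t;     // up to 8 result columns
//     using Index = uint32_t;
//   };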
415 
416 // ######################################################################
417 // ######################## Hashers and Banding #########################
418 
419 // Hasher concepts abstract out hashing details.
420 
421 // concept PhsfQueryHasher extends RibbonTypes {
422 //   // Type for a lookup key, which is hashable.
423 //   typename Key;
424 //
425 //   // Type for hashed summary of a Key. uint64_t is recommended.
426 //   typename Hash;
427 //
428 //   // Compute a hash value summarizing a Key
429 //   Hash GetHash(const Key &) const;
430 //
431 //   // Given a hash value and a number of columns that can start an
432 //   // r-sequence of coefficients (== m - r + 1), return the start
433 //   // column to associate with that hash value. (Starts can be chosen
434 //   // uniformly or "smash" extra entries into the beginning and end for
435 //   // better utilization at those extremes of the structure. Details in
//   // ribbon_impl.h)
437 //   Index GetStart(Hash, Index num_starts) const;
438 //
439 //   // Given a hash value, return the r-bit sequence of coefficients to
440 //   // associate with it. It's generally OK if
441 //   //   sizeof(CoeffRow) > sizeof(Hash)
442 //   // as long as the hash itself is not too prone to collisions for the
443 //   // applications and the CoeffRow is generated uniformly from
444 //   // available hash data, but relatively independent of the start.
445 //   //
446 //   // Must be non-zero, because that's required for a solution to exist
447 //   // when mapping to non-zero result row. (Note: BandingAdd could be
448 //   // modified to allow 0 coeff row if that only occurs with 0 result
449 //   // row, which really only makes sense for filter implementation,
450 //   // where both values are hash-derived. Or BandingAdd could reject 0
451 //   // coeff row, forcing next seed, but that has potential problems with
452 //   // generality/scalability.)
453 //   CoeffRow GetCoeffRow(Hash) const;
454 // };
455 
456 // concept FilterQueryHasher extends PhsfQueryHasher {
457 //   // For building or querying a filter, this returns the expected
458 //   // result row associated with a hashed input. For general PHSF,
459 //   // this must return 0.
460 //   //
461 //   // Although not strictly required, there's a slightly better chance of
462 //   // solver success if result row is masked down here to only the bits
463 //   // actually needed.
464 //   ResultRow GetResultRowFromHash(Hash) const;
465 // }
466 
467 // concept BandingHasher extends FilterQueryHasher {
468 //   // For a filter, this will generally be the same as Key.
469 //   // For a general PHSF, it must either
470 //   // (a) include a key and a result it maps to (e.g. in a std::pair), or
471 //   // (b) GetResultRowFromInput looks up the result somewhere rather than
472 //   // extracting it.
473 //   typename AddInput;
474 //
475 //   // Instead of requiring a way to extract a Key from an
476 //   // AddInput, we require getting the hash of the Key part
477 //   // of an AddInput, which is trivial if AddInput == Key.
478 //   Hash GetHash(const AddInput &) const;
479 //
480 //   // For building a non-filter PHSF, this extracts or looks up the result
481 //   // row to associate with an input. For filter PHSF, this must return 0.
482 //   ResultRow GetResultRowFromInput(const AddInput &) const;
483 //
484 //   // Whether the solver can assume the lowest bit of GetCoeffRow is
485 //   // always 1. When true, it should improve solver efficiency slightly.
486 //   static bool kFirstCoeffAlwaysOne;
487 // }
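//
// A minimal filter-oriented hasher sketch satisfying the three concepts
// above (hypothetical; ribbon_impl.h defines the hashers actually used,
// and a real hasher should derive start, coefficients, and result row
// more independently than this). Keys are taken to be pre-hashed
// uint64_t values and r = 64 for brevity:
//
//   struct ExampleFilterHasher {
//     using CoeffRow = uint64_t;
//     using ResultRow = uint8_t;
//     using Index = uint32_t;
//     using Key = uint64_t;
//     using Hash = uint64_t;
//     using AddInput = Key;  // filter: expected result row comes from hash
//     static constexpr bool kFirstCoeffAlwaysOne = true;
//
//     Hash GetHash(const Key& k) const { return k * 0x9E3779B97F4A7C15ULL; }
//     Index GetStart(Hash h, Index num_starts) const {
//       // "fastrange": map the high 32 bits uniformly onto [0, num_starts)
//       return static_cast<Index>(((h >> 32) * uint64_t{num_starts}) >> 32);
//     }
//     CoeffRow GetCoeffRow(Hash h) const {
//       return (h * 0xC2B2AE3D27D4EB4FULL) | 1;  // remix; force low bit to 1
//     }
//     ResultRow GetResultRowFromHash(Hash h) const {
//       return static_cast<ResultRow>(h >> 56);  // b = 8 result bits
//     }
//     ResultRow GetResultRowFromInput(const AddInput&) const { return 0; }
//   };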
488 
// Abstract storage for the result of "banding" the inputs (Gaussian
490 // elimination to an upper-triangular boolean band matrix). Because the
491 // banding is an incremental / on-the-fly algorithm, this also represents
492 // all the intermediate state between input entries.
493 //
494 // concept BandingStorage extends RibbonTypes {
495 //   // Tells the banding algorithm to prefetch memory associated with
496 //   // the next input before processing the current input. Generally
497 //   // recommended iff the BandingStorage doesn't easily fit in CPU
498 //   // cache.
499 //   bool UsePrefetch() const;
500 //
501 //   // Prefetches (e.g. __builtin_prefetch) memory associated with a
502 //   // slot index i.
503 //   void Prefetch(Index i) const;
504 //
505 //   // Load or store CoeffRow and ResultRow for slot index i.
506 //   // (Gaussian row operations involve both sides of the equation.)
507 //   // Bool `for_back_subst` indicates that customizing values for
508 //   // unconstrained solution rows (cr == 0) is allowed.
509 //   void LoadRow(Index i, CoeffRow *cr, ResultRow *rr, bool for_back_subst)
510 //        const;
511 //   void StoreRow(Index i, CoeffRow cr, ResultRow rr);
512 //
513 //   // Returns the number of columns that can start an r-sequence of
514 //   // coefficients, which is the number of slots minus r (kCoeffBits)
515 //   // plus one. (m - r + 1)
516 //   Index GetNumStarts() const;
517 // };
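//
// A minimal in-memory BandingStorage sketch (hypothetical; needs
// <vector>; ribbon_impl.h provides the real, more compact
// implementations). Here CoeffRow = uint64_t, so r = 64:
//
//   struct ExampleBandingStorage {
//     using CoeffRow = uint64_t;
//     using ResultRow = uint8_t;
//     using Index = uint32_t;
//
//     explicit ExampleBandingStorage(Index num_slots)
//         : coeffs(num_slots, 0), results(num_slots, 0) {}
//     bool UsePrefetch() const { return false; }  // assume it fits in cache
//     void Prefetch(Index) const {}
//     void LoadRow(Index i, CoeffRow* cr, ResultRow* rr,
//                  bool /*for_back_subst*/) const {
//       *cr = coeffs[i];
//       *rr = results[i];
//     }
//     void StoreRow(Index i, CoeffRow cr, ResultRow rr) {
//       coeffs[i] = cr;
//       results[i] = rr;
//     }
//     Index GetNumStarts() const {
//       return static_cast<Index>(coeffs.size()) - 64 + 1;  // m - r + 1
//     }
//
//     std::vector<CoeffRow> coeffs;
//     std::vector<ResultRow> results;
//   };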
518 
519 // Optional storage for backtracking data in banding a set of input
520 // entries. It exposes an array structure which will generally be
521 // used as a stack. It must be able to accommodate as many entries
522 // as are passed in as inputs to `BandingAddRange`.
523 //
524 // concept BacktrackStorage extends RibbonTypes {
525 //   // If false, backtracking support will be disabled in the algorithm.
526 //   // This should preferably be an inline compile-time constant function.
527 //   bool UseBacktrack() const;
528 //
529 //   // Records `to_save` as the `i`th backtrack entry
530 //   void BacktrackPut(Index i, Index to_save);
531 //
532 //   // Recalls the `i`th backtrack entry
533 //   Index BacktrackGet(Index i) const;
534 // }
535 
536 // Adds a single entry to BandingStorage (and optionally, BacktrackStorage),
537 // returning true if successful or false if solution is impossible with
538 // current hasher (and presumably its seed) and number of "slots" (solution
539 // or banding rows). (A solution is impossible when there is a linear
540 // dependence among the inputs that doesn't "cancel out".)
541 //
542 // Pre- and post-condition: the BandingStorage represents a band matrix
543 // ready for back substitution (row echelon form except for zero rows),
544 // augmented with result values such that back substitution would give a
545 // solution satisfying all the cr@start -> rr entries added.
546 template <bool kFirstCoeffAlwaysOne, typename BandingStorage,
547           typename BacktrackStorage>
bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
                typename BandingStorage::ResultRow rr,
                typename BandingStorage::CoeffRow cr, BacktrackStorage *bts,
                typename BandingStorage::Index *backtrack_pos) {
552   using CoeffRow = typename BandingStorage::CoeffRow;
553   using ResultRow = typename BandingStorage::ResultRow;
554   using Index = typename BandingStorage::Index;
555 
556   Index i = start;
557 
558   if (!kFirstCoeffAlwaysOne) {
559     // Requires/asserts that cr != 0
560     int tz = CountTrailingZeroBits(cr);
561     i += static_cast<Index>(tz);
562     cr >>= tz;
563   }
564 
565   for (;;) {
566     assert((cr & 1) == 1);
567     CoeffRow cr_at_i;
568     ResultRow rr_at_i;
569     bs->LoadRow(i, &cr_at_i, &rr_at_i, /* for_back_subst */ false);
570     if (cr_at_i == 0) {
571       bs->StoreRow(i, cr, rr);
572       bts->BacktrackPut(*backtrack_pos, i);
573       ++*backtrack_pos;
574       return true;
575     }
576     assert((cr_at_i & 1) == 1);
577     // Gaussian row reduction
578     cr ^= cr_at_i;
579     rr ^= rr_at_i;
580     if (cr == 0) {
581       // Inconsistency or (less likely) redundancy
582       break;
583     }
584     // Find relative offset of next non-zero coefficient.
585     int tz = CountTrailingZeroBits(cr);
586     i += static_cast<Index>(tz);
587     cr >>= tz;
588   }
589 
590   // Failed, unless result row == 0 because e.g. a duplicate input or a
591   // stock hash collision, with same result row. (For filter, stock hash
592   // collision implies same result row.) Or we could have a full equation
593   // equal to sum of other equations, which is very possible with
594   // small range of values for result row.
595   return rr == 0;
596 }
597 
598 // Adds a range of entries to BandingStorage returning true if successful
599 // or false if solution is impossible with current hasher (and presumably
600 // its seed) and number of "slots" (solution or banding rows). (A solution
601 // is impossible when there is a linear dependence among the inputs that
602 // doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
603 //
604 // If UseBacktrack in the BacktrackStorage, this function call rolls back
605 // to prior state on failure. If !UseBacktrack, some subset of the entries
606 // will have been added to the BandingStorage, so best considered to be in
607 // an indeterminate state.
608 //
609 template <typename BandingStorage, typename BacktrackStorage,
610           typename BandingHasher, typename InputIterator>
bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts,
                     const BandingHasher &bh, InputIterator begin,
                     InputIterator end) {
614   using CoeffRow = typename BandingStorage::CoeffRow;
615   using Index = typename BandingStorage::Index;
616   using ResultRow = typename BandingStorage::ResultRow;
617   using Hash = typename BandingHasher::Hash;
618 
619   static_assert(IsUnsignedUpTo128<CoeffRow>::value, "must be unsigned");
620   static_assert(IsUnsignedUpTo128<Index>::value, "must be unsigned");
621   static_assert(IsUnsignedUpTo128<ResultRow>::value, "must be unsigned");
622 
623   constexpr bool kFCA1 = BandingHasher::kFirstCoeffAlwaysOne;
624 
625   if (begin == end) {
626     // trivial
627     return true;
628   }
629 
630   const Index num_starts = bs->GetNumStarts();
631 
632   InputIterator cur = begin;
633   Index backtrack_pos = 0;
634   if (!bs->UsePrefetch()) {
635     // Simple version, no prefetch
636     for (;;) {
637       Hash h = bh.GetHash(*cur);
638       Index start = bh.GetStart(h, num_starts);
639       ResultRow rr =
640           bh.GetResultRowFromInput(*cur) | bh.GetResultRowFromHash(h);
641       CoeffRow cr = bh.GetCoeffRow(h);
642 
643       if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
644         break;
645       }
646       if ((++cur) == end) {
647         return true;
648       }
649     }
650   } else {
651     // Pipelined w/prefetch
652     // Prime the pipeline
653     Hash h = bh.GetHash(*cur);
654     Index start = bh.GetStart(h, num_starts);
655     ResultRow rr = bh.GetResultRowFromInput(*cur);
656     bs->Prefetch(start);
657 
658     // Pipeline
659     for (;;) {
660       rr |= bh.GetResultRowFromHash(h);
661       CoeffRow cr = bh.GetCoeffRow(h);
662       if ((++cur) == end) {
663         if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
664           break;
665         }
666         return true;
667       }
668       Hash next_h = bh.GetHash(*cur);
669       Index next_start = bh.GetStart(next_h, num_starts);
670       ResultRow next_rr = bh.GetResultRowFromInput(*cur);
671       bs->Prefetch(next_start);
672       if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
673         break;
674       }
675       h = next_h;
676       start = next_start;
677       rr = next_rr;
678     }
679   }
680   // failed; backtrack (if implemented)
681   if (bts->UseBacktrack()) {
682     while (backtrack_pos > 0) {
683       --backtrack_pos;
684       Index i = bts->BacktrackGet(backtrack_pos);
685       // Clearing the ResultRow is not strictly required, but is required
686       // for good FP rate on inputs that might have been backtracked out.
687       // (We don't want anything we've backtracked on to leak into final
688       // result, as that might not be "harmless".)
689       bs->StoreRow(i, 0, 0);
690     }
691   }
692   return false;
693 }
694 
695 // Adds a range of entries to BandingStorage returning true if successful
696 // or false if solution is impossible with current hasher (and presumably
697 // its seed) and number of "slots" (solution or banding rows). (A solution
698 // is impossible when there is a linear dependence among the inputs that
699 // doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
700 //
701 // On failure, some subset of the entries will have been added to the
702 // BandingStorage, so best considered to be in an indeterminate state.
703 //
704 template <typename BandingStorage, typename BandingHasher,
705           typename InputIterator>
bool BandingAddRange(BandingStorage *bs, const BandingHasher &bh,
                     InputIterator begin, InputIterator end) {
708   using Index = typename BandingStorage::Index;
709   struct NoopBacktrackStorage {
710     bool UseBacktrack() { return false; }
711     void BacktrackPut(Index, Index) {}
712     Index BacktrackGet(Index) {
713       assert(false);
714       return 0;
715     }
716   } nbts;
717   return BandingAddRange(bs, &nbts, bh, begin, end);
718 }
719 
720 // ######################################################################
721 // ######################### Solution Storage ###########################
722 
723 // Back-substitution and query algorithms unfortunately depend on some
724 // details of data layout in the final data structure ("solution"). Thus,
725 // there is no common SolutionStorage covering all the reasonable
726 // possibilities.
727 
728 // ###################### SimpleSolutionStorage #########################
729 
730 // SimpleSolutionStorage is for a row-major storage, typically with no
731 // unused bits in each ResultRow. This is mostly for demonstration
732 // purposes as the simplest solution storage scheme. It is relatively slow
733 // for filter queries.
734 
735 // concept SimpleSolutionStorage extends RibbonTypes {
736 //   // This is called at the beginning of back-substitution for the
737 //   // solution storage to do any remaining configuration before data
738 //   // is stored to it. If configuration is previously finalized, this
739 //   // could be a simple assertion or even no-op. Ribbon algorithms
740 //   // only call this from back-substitution, and only once per call,
741 //   // before other functions here.
742 //   void PrepareForNumStarts(Index num_starts) const;
743 //   // Must return num_starts passed to PrepareForNumStarts, or the most
744 //   // recent call to PrepareForNumStarts if this storage object can be
745 //   // reused. Note that num_starts == num_slots - kCoeffBits + 1 because
746 //   // there must be a run of kCoeffBits slots starting from each start.
747 //   Index GetNumStarts() const;
748 //   // Load the solution row (type ResultRow) for a slot
749 //   ResultRow Load(Index slot_num) const;
750 //   // Store the solution row (type ResultRow) for a slot
751 //   void Store(Index slot_num, ResultRow data);
752 // };
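//
// A minimal vector-backed SimpleSolutionStorage sketch (hypothetical;
// needs <vector>; r = 64 to match the sketches above):
//
//   struct ExampleSimpleSolution {
//     using CoeffRow = uint64_t;
//     using ResultRow = uint8_t;
//     using Index = uint32_t;
//
//     void PrepareForNumStarts(Index num_starts) {
//       num_starts_ = num_starts;
//       rows_.assign(num_starts + 64 - 1, 0);  // one row per slot
//     }
//     Index GetNumStarts() const { return num_starts_; }
//     ResultRow Load(Index slot_num) const { return rows_[slot_num]; }
//     void Store(Index slot_num, ResultRow data) { rows_[slot_num] = data; }
//
//     Index num_starts_ = 0;
//     std::vector<ResultRow> rows_;
//   };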
753 
754 // Back-substitution for generating a solution from BandingStorage to
755 // SimpleSolutionStorage.
756 template <typename SimpleSolutionStorage, typename BandingStorage>
void SimpleBackSubst(SimpleSolutionStorage *sss, const BandingStorage &bs) {
758   using CoeffRow = typename BandingStorage::CoeffRow;
759   using Index = typename BandingStorage::Index;
760   using ResultRow = typename BandingStorage::ResultRow;
761 
762   static_assert(sizeof(Index) == sizeof(typename SimpleSolutionStorage::Index),
763                 "must be same");
764   static_assert(
765       sizeof(CoeffRow) == sizeof(typename SimpleSolutionStorage::CoeffRow),
766       "must be same");
767   static_assert(
768       sizeof(ResultRow) == sizeof(typename SimpleSolutionStorage::ResultRow),
769       "must be same");
770 
771   constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
772   constexpr auto kResultBits = static_cast<Index>(sizeof(ResultRow) * 8U);
773 
774   // A column-major buffer of the solution matrix, containing enough
775   // recently-computed solution data to compute the next solution row
776   // (based also on banding data).
777   std::array<CoeffRow, kResultBits> state;
778   state.fill(0);
779 
780   const Index num_starts = bs.GetNumStarts();
781   sss->PrepareForNumStarts(num_starts);
782   const Index num_slots = num_starts + kCoeffBits - 1;
783 
784   for (Index i = num_slots; i > 0;) {
785     --i;
786     CoeffRow cr;
787     ResultRow rr;
788     bs.LoadRow(i, &cr, &rr, /* for_back_subst */ true);
789     // solution row
790     ResultRow sr = 0;
791     for (Index j = 0; j < kResultBits; ++j) {
792       // Compute next solution bit at row i, column j (see derivation below)
793       CoeffRow tmp = state[j] << 1;
794       bool bit = (BitParity(tmp & cr) ^ ((rr >> j) & 1)) != 0;
795       tmp |= bit ? CoeffRow{1} : CoeffRow{0};
796 
797       // Now tmp is solution at column j from row i for next kCoeffBits
798       // more rows. Thus, for valid solution, the dot product of the
799       // solution column with the coefficient row has to equal the result
800       // at that column,
801       //   BitParity(tmp & cr) == ((rr >> j) & 1)
802 
803       // Update state.
804       state[j] = tmp;
805       // add to solution row
806       sr |= (bit ? ResultRow{1} : ResultRow{0}) << j;
807     }
808     sss->Store(i, sr);
809   }
810 }
811 
812 // Common functionality for querying a key (already hashed) in
813 // SimpleSolutionStorage.
814 template <typename SimpleSolutionStorage>
typename SimpleSolutionStorage::ResultRow SimpleQueryHelper(
    typename SimpleSolutionStorage::Index start_slot,
    typename SimpleSolutionStorage::CoeffRow cr,
    const SimpleSolutionStorage &sss) {
819   using CoeffRow = typename SimpleSolutionStorage::CoeffRow;
820   using ResultRow = typename SimpleSolutionStorage::ResultRow;
821 
822   constexpr unsigned kCoeffBits = static_cast<unsigned>(sizeof(CoeffRow) * 8U);
823 
824   ResultRow result = 0;
825   for (unsigned i = 0; i < kCoeffBits; ++i) {
826     // Bit masking whole value is generally faster here than 'if'
827     result ^= sss.Load(start_slot + i) &
828               (ResultRow{0} - (static_cast<ResultRow>(cr >> i) & ResultRow{1}));
829   }
830   return result;
831 }
832 
833 // General PHSF query a key from SimpleSolutionStorage.
834 template <typename SimpleSolutionStorage, typename PhsfQueryHasher>
typename SimpleSolutionStorage::ResultRow SimplePhsfQuery(
    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
    const SimpleSolutionStorage &sss) {
838   const typename PhsfQueryHasher::Hash hash = hasher.GetHash(key);
839 
840   static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
841                     sizeof(typename PhsfQueryHasher::Index),
842                 "must be same");
843   static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
844                     sizeof(typename PhsfQueryHasher::CoeffRow),
845                 "must be same");
846 
847   return SimpleQueryHelper(hasher.GetStart(hash, sss.GetNumStarts()),
848                            hasher.GetCoeffRow(hash), sss);
849 }
850 
851 // Filter query a key from SimpleSolutionStorage.
852 template <typename SimpleSolutionStorage, typename FilterQueryHasher>
bool SimpleFilterQuery(const typename FilterQueryHasher::Key &key,
                       const FilterQueryHasher &hasher,
                       const SimpleSolutionStorage &sss) {
856   const typename FilterQueryHasher::Hash hash = hasher.GetHash(key);
857   const typename SimpleSolutionStorage::ResultRow expected =
858       hasher.GetResultRowFromHash(hash);
859 
860   static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
861                     sizeof(typename FilterQueryHasher::Index),
862                 "must be same");
863   static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
864                     sizeof(typename FilterQueryHasher::CoeffRow),
865                 "must be same");
866   static_assert(sizeof(typename SimpleSolutionStorage::ResultRow) ==
867                     sizeof(typename FilterQueryHasher::ResultRow),
868                 "must be same");
869 
870   return expected ==
871          SimpleQueryHelper(hasher.GetStart(hash, sss.GetNumStarts()),
872                            hasher.GetCoeffRow(hash), sss);
873 }
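
// Putting the pieces together: a hedged end-to-end sketch using the
// hypothetical ExampleFilterHasher, ExampleBandingStorage, and
// ExampleSimpleSolution types sketched in the comments above (the real
// wiring lives in ribbon_impl.h):
//
//   std::vector<uint64_t> keys = /* pre-hashed inputs, roughly 1000 */;
//   ExampleBandingStorage bs(/* num_slots */ 1024 + 64 - 1);
//   ExampleFilterHasher bh;
//   if (!BandingAddRange(&bs, bh, keys.begin(), keys.end())) {
//     // Banding failed; retry with a different hash seed and/or more slots.
//   }
//   ExampleSimpleSolution sss;
//   SimpleBackSubst(&sss, bs);
//   // Queries: no false negatives for added keys; false positives with
//   // probability ~2^-8 here (b = 8).
//   bool may_match = SimpleFilterQuery(keys[0], bh, sss);
//   assert(may_match);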
874 
875 // #################### InterleavedSolutionStorage ######################
876 
877 // InterleavedSolutionStorage is row-major at a high level, for good
// locality, and column-major at a low level, for CPU efficiency,
// especially for filter queries or a relatively small number of result
// bits (== solution columns). The storage is a sequence of "blocks" where a
881 // block has one CoeffRow-sized segment for each solution column. Each
882 // query spans at most two blocks; the starting solution row is typically
883 // in the row-logical middle of a block and spans to the middle of the
884 // next block. (See diagram below.)
885 //
886 // InterleavedSolutionStorage supports choosing b (number of result or
887 // solution columns) at run time, and even supports mixing b and b-1 solution
888 // columns in a single linear system solution, for filters that can
889 // effectively utilize any size space (multiple of CoeffRow) for minimizing
890 // FP rate for any number of added keys. To simplify query implementation
891 // (with lower-index columns first), the b-bit portion comes after the b-1
892 // portion of the structure.
893 //
894 // Diagram (=== marks logical block boundary; b=4; ### is data used by a
895 // query crossing the b-1 to b boundary, each Segment has type CoeffRow):
896 //  ...
897 // +======================+
898 // | S e g m e n t  col=0 |
899 // +----------------------+
900 // | S e g m e n t  col=1 |
901 // +----------------------+
902 // | S e g m e n t  col=2 |
903 // +======================+
904 // | S e g m e n #########|
905 // +----------------------+
906 // | S e g m e n #########|
907 // +----------------------+
908 // | S e g m e n #########|
909 // +======================+ Result/solution columns: above = 3, below = 4
910 // |#############t  col=0 |
911 // +----------------------+
912 // |#############t  col=1 |
913 // +----------------------+
914 // |#############t  col=2 |
915 // +----------------------+
916 // | S e g m e n t  col=3 |
917 // +======================+
918 // | S e g m e n t  col=0 |
919 // +----------------------+
920 // | S e g m e n t  col=1 |
921 // +----------------------+
922 // | S e g m e n t  col=2 |
923 // +----------------------+
924 // | S e g m e n t  col=3 |
925 // +======================+
926 //  ...
927 //
928 // InterleavedSolutionStorage will be adapted by the algorithms from
929 // simple array-like segment storage. That array-like storage is templatized
930 // in part so that an implementation may choose to handle byte ordering
931 // at access time.
932 //
933 // concept InterleavedSolutionStorage extends RibbonTypes {
934 //   // This is called at the beginning of back-substitution for the
935 //   // solution storage to do any remaining configuration before data
936 //   // is stored to it. If configuration is previously finalized, this
937 //   // could be a simple assertion or even no-op. Ribbon algorithms
938 //   // only call this from back-substitution, and only once per call,
939 //   // before other functions here.
940 //   void PrepareForNumStarts(Index num_starts) const;
941 //   // Must return num_starts passed to PrepareForNumStarts, or the most
942 //   // recent call to PrepareForNumStarts if this storage object can be
943 //   // reused. Note that num_starts == num_slots - kCoeffBits + 1 because
944 //   // there must be a run of kCoeffBits slots starting from each start.
945 //   Index GetNumStarts() const;
946 //   // The larger number of solution columns used (called "b" above).
947 //   Index GetUpperNumColumns() const;
948 //   // If returns > 0, then block numbers below that use
949 //   // GetUpperNumColumns() - 1 columns per solution row, and the rest
950 //   // use GetUpperNumColumns(). A block represents kCoeffBits "slots",
951 //   // where all but the last kCoeffBits - 1 slots are also starts. And
952 //   // a block contains a segment for each solution column.
953 //   // An implementation may only support uniform columns per solution
954 //   // row and return constant 0 here.
955 //   Index GetUpperStartBlock() const;
956 //
957 //   // ### "Array of segments" portion of API ###
958 //   // The number of values of type CoeffRow used in this solution
959 //   // representation. (This value can be inferred from the previous
960 //   // three functions, but is expected at least for sanity / assertion
961 //   // checking.)
962 //   Index GetNumSegments() const;
963 //   // Load an entry from the logical array of segments
964 //   CoeffRow LoadSegment(Index segment_num) const;
965 //   // Store an entry to the logical array of segments
//   void StoreSegment(Index segment_num, CoeffRow data);
//   // Prefetch memory (if applicable) for a range of segments
//   // [begin_segment_num, end_segment_num), as used by
//   // InterleavedPrepareQuery below. May be a no-op.
//   void PrefetchSegmentRange(Index begin_segment_num,
//                             Index end_segment_num) const;
// };
968 
969 // A helper for InterleavedBackSubst.
970 template <typename BandingStorage>
inline void BackSubstBlock(typename BandingStorage::CoeffRow *state,
                           typename BandingStorage::Index num_columns,
                           const BandingStorage &bs,
                           typename BandingStorage::Index start_slot) {
975   using CoeffRow = typename BandingStorage::CoeffRow;
976   using Index = typename BandingStorage::Index;
977   using ResultRow = typename BandingStorage::ResultRow;
978 
979   constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
980 
981   for (Index i = start_slot + kCoeffBits; i > start_slot;) {
982     --i;
983     CoeffRow cr;
984     ResultRow rr;
985     bs.LoadRow(i, &cr, &rr, /* for_back_subst */ true);
986     for (Index j = 0; j < num_columns; ++j) {
987       // Compute next solution bit at row i, column j (see derivation below)
988       CoeffRow tmp = state[j] << 1;
989       int bit = BitParity(tmp & cr) ^ ((rr >> j) & 1);
990       tmp |= static_cast<CoeffRow>(bit);
991 
992       // Now tmp is solution at column j from row i for next kCoeffBits
993       // more rows. Thus, for valid solution, the dot product of the
994       // solution column with the coefficient row has to equal the result
995       // at that column,
996       //   BitParity(tmp & cr) == ((rr >> j) & 1)
997 
998       // Update state.
999       state[j] = tmp;
1000     }
1001   }
1002 }
1003 
1004 // Back-substitution for generating a solution from BandingStorage to
1005 // InterleavedSolutionStorage.
1006 template <typename InterleavedSolutionStorage, typename BandingStorage>
void InterleavedBackSubst(InterleavedSolutionStorage *iss,
                          const BandingStorage &bs) {
1009   using CoeffRow = typename BandingStorage::CoeffRow;
1010   using Index = typename BandingStorage::Index;
1011 
1012   static_assert(
1013       sizeof(Index) == sizeof(typename InterleavedSolutionStorage::Index),
1014       "must be same");
1015   static_assert(
1016       sizeof(CoeffRow) == sizeof(typename InterleavedSolutionStorage::CoeffRow),
1017       "must be same");
1018 
1019   constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
1020 
1021   const Index num_starts = bs.GetNumStarts();
1022   // Although it might be nice to have a filter that returns "always false"
1023   // when no key is added, we aren't specifically supporting that here
1024   // because it would require another condition branch in the query.
1025   assert(num_starts > 0);
1026   iss->PrepareForNumStarts(num_starts);
1027 
1028   const Index num_slots = num_starts + kCoeffBits - 1;
1029   assert(num_slots % kCoeffBits == 0);
1030   const Index num_blocks = num_slots / kCoeffBits;
1031   const Index num_segments = iss->GetNumSegments();
1032 
1033   // For now upper, then lower
1034   Index num_columns = iss->GetUpperNumColumns();
1035   const Index upper_start_block = iss->GetUpperStartBlock();
1036 
1037   if (num_columns == 0) {
1038     // Nothing to do, presumably because there's not enough space for even
1039     // a single segment.
1040     assert(num_segments == 0);
1041     // When num_columns == 0, a Ribbon filter query will always return true,
1042     // or a PHSF query always 0.
1043     return;
1044   }
1045 
1046   // We should be utilizing all available segments
1047   assert(num_segments == (upper_start_block * (num_columns - 1)) +
1048                              ((num_blocks - upper_start_block) * num_columns));
1049 
1050   // TODO: consider fixed-column specializations with stack-allocated state
1051 
1052   // A column-major buffer of the solution matrix, containing enough
1053   // recently-computed solution data to compute the next solution row
1054   // (based also on banding data).
1055   std::unique_ptr<CoeffRow[]> state{new CoeffRow[num_columns]()};
1056 
1057   Index block = num_blocks;
1058   Index segment_num = num_segments;
1059   while (block > upper_start_block) {
1060     --block;
1061     BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
1062     segment_num -= num_columns;
1063     for (Index i = 0; i < num_columns; ++i) {
1064       iss->StoreSegment(segment_num + i, state[i]);
1065     }
1066   }
1067   // Now (if applicable), region using lower number of columns
1068   // (This should be optimized away if GetUpperStartBlock() returns
1069   // constant 0.)
1070   --num_columns;
1071   while (block > 0) {
1072     --block;
1073     BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
1074     segment_num -= num_columns;
1075     for (Index i = 0; i < num_columns; ++i) {
1076       iss->StoreSegment(segment_num + i, state[i]);
1077     }
1078   }
1079   // Verify everything processed
1080   assert(block == 0);
1081   assert(segment_num == 0);
1082 }
1083 
1084 // Prefetch memory for a key in InterleavedSolutionStorage.
1085 template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
inline void InterleavedPrepareQuery(
    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
    const InterleavedSolutionStorage &iss,
    typename PhsfQueryHasher::Hash *saved_hash,
    typename InterleavedSolutionStorage::Index *saved_segment_num,
    typename InterleavedSolutionStorage::Index *saved_num_columns,
    typename InterleavedSolutionStorage::Index *saved_start_bit) {
1093   using Hash = typename PhsfQueryHasher::Hash;
1094   using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
1095   using Index = typename InterleavedSolutionStorage::Index;
1096 
1097   static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index),
1098                 "must be same");
1099 
1100   const Hash hash = hasher.GetHash(key);
1101   const Index start_slot = hasher.GetStart(hash, iss.GetNumStarts());
1102 
1103   constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
1104 
1105   const Index upper_start_block = iss.GetUpperStartBlock();
1106   Index num_columns = iss.GetUpperNumColumns();
1107   Index start_block_num = start_slot / kCoeffBits;
1108   Index segment_num = start_block_num * num_columns -
1109                       std::min(start_block_num, upper_start_block);
1110   // Change to lower num columns if applicable.
1111   // (This should not compile to a conditional branch.)
1112   num_columns -= (start_block_num < upper_start_block) ? 1 : 0;
1113 
1114   Index start_bit = start_slot % kCoeffBits;
1115 
1116   Index segment_count = num_columns + (start_bit == 0 ? 0 : num_columns);
1117 
1118   iss.PrefetchSegmentRange(segment_num, segment_num + segment_count);
1119 
1120   *saved_hash = hash;
1121   *saved_segment_num = segment_num;
1122   *saved_num_columns = num_columns;
1123   *saved_start_bit = start_bit;
1124 }
1125 
1126 // General PHSF query from InterleavedSolutionStorage, using data for
1127 // the query key from InterleavedPrepareQuery
1128 template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
inline typename InterleavedSolutionStorage::ResultRow InterleavedPhsfQuery(
    typename PhsfQueryHasher::Hash hash,
    typename InterleavedSolutionStorage::Index segment_num,
    typename InterleavedSolutionStorage::Index num_columns,
    typename InterleavedSolutionStorage::Index start_bit,
    const PhsfQueryHasher &hasher, const InterleavedSolutionStorage &iss) {
1135   using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
1136   using Index = typename InterleavedSolutionStorage::Index;
1137   using ResultRow = typename InterleavedSolutionStorage::ResultRow;
1138 
1139   static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index),
1140                 "must be same");
1141   static_assert(sizeof(CoeffRow) == sizeof(typename PhsfQueryHasher::CoeffRow),
1142                 "must be same");
1143 
1144   constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
1145 
1146   const CoeffRow cr = hasher.GetCoeffRow(hash);
1147 
1148   ResultRow sr = 0;
1149   const CoeffRow cr_left = cr << static_cast<unsigned>(start_bit);
1150   for (Index i = 0; i < num_columns; ++i) {
1151     sr ^= BitParity(iss.LoadSegment(segment_num + i) & cr_left) << i;
1152   }
1153 
1154   if (start_bit > 0) {
1155     segment_num += num_columns;
1156     const CoeffRow cr_right =
1157         cr >> static_cast<unsigned>(kCoeffBits - start_bit);
1158     for (Index i = 0; i < num_columns; ++i) {
1159       sr ^= BitParity(iss.LoadSegment(segment_num + i) & cr_right) << i;
1160     }
1161   }
1162 
1163   return sr;
1164 }
1165 
// Filter query a key from InterleavedSolutionStorage, using data for
// the query key from InterleavedPrepareQuery.
1167 template <typename InterleavedSolutionStorage, typename FilterQueryHasher>
inline bool InterleavedFilterQuery(
    typename FilterQueryHasher::Hash hash,
    typename InterleavedSolutionStorage::Index segment_num,
    typename InterleavedSolutionStorage::Index num_columns,
    typename InterleavedSolutionStorage::Index start_bit,
    const FilterQueryHasher &hasher, const InterleavedSolutionStorage &iss) {
1174   using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
1175   using Index = typename InterleavedSolutionStorage::Index;
1176   using ResultRow = typename InterleavedSolutionStorage::ResultRow;
1177 
1178   static_assert(sizeof(Index) == sizeof(typename FilterQueryHasher::Index),
1179                 "must be same");
1180   static_assert(
1181       sizeof(CoeffRow) == sizeof(typename FilterQueryHasher::CoeffRow),
1182       "must be same");
1183   static_assert(
1184       sizeof(ResultRow) == sizeof(typename FilterQueryHasher::ResultRow),
1185       "must be same");
1186 
1187   constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
1188 
1189   const CoeffRow cr = hasher.GetCoeffRow(hash);
1190   const ResultRow expected = hasher.GetResultRowFromHash(hash);
1191 
1192   // TODO: consider optimizations such as
1193   // * get rid of start_bit == 0 condition with careful fetching & shifting
1194   if (start_bit == 0) {
1195     for (Index i = 0; i < num_columns; ++i) {
1196       if (BitParity(iss.LoadSegment(segment_num + i) & cr) !=
1197           (static_cast<int>(expected >> i) & 1)) {
1198         return false;
1199       }
1200     }
1201   } else {
1202     const CoeffRow cr_left = cr << static_cast<unsigned>(start_bit);
1203     const CoeffRow cr_right =
1204         cr >> static_cast<unsigned>(kCoeffBits - start_bit);
1205 
1206     for (Index i = 0; i < num_columns; ++i) {
1207       CoeffRow soln_data =
1208           (iss.LoadSegment(segment_num + i) & cr_left) ^
1209           (iss.LoadSegment(segment_num + num_columns + i) & cr_right);
1210       if (BitParity(soln_data) != (static_cast<int>(expected >> i) & 1)) {
1211         return false;
1212       }
1213     }
1214   }
1215   // otherwise, all match
1216   return true;
1217 }
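
// Typical use of the prepared interleaved query path (sketch; `hasher`
// and `iss` are assumed to satisfy FilterQueryHasher and
// InterleavedSolutionStorage with Hash = uint64_t and Index = uint32_t):
//
//   uint64_t hash;
//   uint32_t segment_num, num_columns, start_bit;
//   InterleavedPrepareQuery(key, hasher, iss, &hash, &segment_num,
//                           &num_columns, &start_bit);
//   // ... prepare other queries here to overlap their memory latency ...
//   bool may_match = InterleavedFilterQuery(hash, segment_num, num_columns,
//                                           start_bit, hasher, iss);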
1218 
1219 // TODO: refactor Interleaved*Query so that queries can be "prepared" by
1220 // prefetching memory, to hide memory latency for multiple queries in a
1221 // single thread.
1222 
1223 }  // namespace ribbon
1224 
1225 }  // namespace ROCKSDB_NAMESPACE
1226