1 #ifndef BMRS__H__INCLUDED__
2 #define BMRS__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2019 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17
18 For more information please visit: http://bitmagic.io
19 */
20
21 /*! \file bmrs.h
22 \brief Rank-Select index structures
23 */
24
25 namespace bm
26 {
27
28 /**
29 @brief Rank-Select acceleration index
30
31 Index uses two-level acceleration structure:
32 bcount - running total popcount for all (possible) blocks
33 (missing blocks give duplicate counts as POPCNT(N-1) + 0).
34 subcount - sub-count inside blocks
35
36 @ingroup bvector
37 @internal
38 */
39 template<typename BVAlloc>
40 class rs_index
41 {
42 public:
43 typedef BVAlloc bv_allocator_type;
44
45 #ifdef BM64ADDR
46 typedef bm::id64_t size_type;
47 typedef bm::id64_t block_idx_type;
48 typedef bm::id64_t sblock_count_type;
49 #else
50 typedef bm::id_t size_type;
51 typedef bm::id_t block_idx_type;
52 typedef bm::id_t sblock_count_type;
53 #endif
54
55 typedef bm::pair<bm::gap_word_t, bm::gap_word_t> sb_pair_type;
56
57 public:
rs_index()58 rs_index() : total_blocks_(0), max_sblock_(0) {}
59 rs_index(const rs_index& rsi);
60
61 /// init arrays to zeros
62 void init() BMNOEXCEPT;
63
64 /// copy rs index
65 void copy_from(const rs_index& rsi);
66
67 // -----------------------------------------------------------------
68
69 /// set empty (no bits set super-block)
70 void set_null_super_block(unsigned i);
71
72 /// set FULL (all bits set super-block)
73 void set_full_super_block(unsigned i);
74
75 /// set super block count
76 void set_super_block(unsigned i, size_type bcount);
77
78 /// return bit-count in super-block
79 unsigned get_super_block_count(unsigned i) const;
80
81 /// return running bit-count in super-block
82 size_type get_super_block_rcount(unsigned i) const;
83
84 /// return number of super-blocks
85 size_type super_block_size() const;
86
87 /// Find super-block with specified rank
88 unsigned find_super_block(size_type rank) const;
89
90
91 /// set size of true super-blocks (not NULL and not FFFFF)
92 void resize_effective_super_blocks(size_type sb_eff);
93
94 /// Add block list belonging to one super block
95 void register_super_block(unsigned i,
96 const unsigned* bcount,
97 const unsigned* sub_count);
98
99 /// find block position and sub-range for the specified rank
100 bool find(size_type* rank,
101 block_idx_type* nb, bm::gap_word_t* sub_range) const;
102
103 /// return sub-clock counts
104 unsigned sub_count(block_idx_type nb) const;
105
106
107 // -----------------------------------------------------------------
108
109 /// reserve the capacity for block count
110 void resize(block_idx_type new_size);
111
112 /// set total blocks
set_total(size_type t)113 void set_total(size_type t) { total_blocks_ = t; }
114
115 /// get total blocks
get_total()116 size_type get_total() const { return total_blocks_; }
117
118 /// return bit-count for specified block
119 unsigned count(block_idx_type nb) const;
120
121 /// return total bit-count for the index
122 size_type count() const;
123
124 /// return running bit-count for specified block
125 size_type rcount(block_idx_type nb) const;
126
127 /// determine the sub-range within a bit-block
128 static
129 unsigned find_sub_range(unsigned block_bit_pos);
130
131 /// determine block sub-range for rank search
132 bm::gap_word_t select_sub_range(block_idx_type nb, size_type& rank) const;
133
134 /// find block position for the specified rank
135 block_idx_type find(size_type rank) const;
136
137 private:
138 typedef bm::heap_vector<sblock_count_type, bv_allocator_type, false>
139 sblock_count_vector_type;
140 typedef bm::heap_vector<unsigned, bv_allocator_type, false>
141 sblock_row_vector_type;
142 typedef bm::dynamic_heap_matrix<unsigned, bv_allocator_type> blocks_matrix_type;
143
144 private:
145 unsigned sblock_rows_;
146 sblock_count_vector_type sblock_count_; ///< super-block accumulated counts
147 sblock_row_vector_type sblock_row_idx_; ///< super-block row numbers
148 blocks_matrix_type block_matr_; ///< blocks within super-blocks (matrix)
149 blocks_matrix_type sub_block_matr_; ///< sub-block counts
150 size_type total_blocks_; ///< total bit-blocks in the index
151 unsigned max_sblock_; ///< max. superblock index
152 };
153
154 //---------------------------------------------------------------------
155 //
156 //---------------------------------------------------------------------
157
158 template<typename BVAlloc>
rs_index(const rs_index<BVAlloc> & rsi)159 rs_index<BVAlloc>::rs_index(const rs_index<BVAlloc>& rsi)
160 {
161 copy_from(rsi);
162 }
163
164 //---------------------------------------------------------------------
165
166
167 template<typename BVAlloc>
init()168 void rs_index<BVAlloc>::init() BMNOEXCEPT
169 {
170 sblock_count_.resize(0);
171 sblock_row_idx_.resize(0);
172 block_matr_.resize(0, 0);
173 sub_block_matr_.resize(0, 0);
174
175 total_blocks_ = sblock_rows_ = max_sblock_ = 0;
176 }
177
178 //---------------------------------------------------------------------
179
180 template<typename BVAlloc>
resize(block_idx_type new_size)181 void rs_index<BVAlloc>::resize(block_idx_type new_size)
182 {
183 sblock_rows_ = 0;
184
185 block_idx_type sblock_count = (new_size >> bm::set_array_shift) + 3;
186 BM_ASSERT(sblock_count == (new_size / 256) + 3);
187 sblock_count_.resize(sblock_count);
188 sblock_row_idx_.resize(sblock_count);
189 }
190
191 //---------------------------------------------------------------------
192
193 template<typename BVAlloc>
copy_from(const rs_index & rsi)194 void rs_index<BVAlloc>::copy_from(const rs_index& rsi)
195 {
196 sblock_rows_ = rsi.sblock_rows_;
197 sblock_count_ = rsi.sblock_count_;
198 sblock_row_idx_ = rsi.sblock_row_idx_;
199 block_matr_ = rsi.block_matr_;
200 sub_block_matr_ = rsi.sub_block_matr_;
201
202 total_blocks_ = rsi.total_blocks_;
203 max_sblock_ = rsi.max_sblock_;
204 }
205
206 //---------------------------------------------------------------------
207
208 template<typename BVAlloc>
count(block_idx_type nb)209 unsigned rs_index<BVAlloc>::count(block_idx_type nb) const
210 {
211 if (nb >= total_blocks_)
212 return 0;
213 unsigned i = unsigned(nb >> bm::set_array_shift);
214 size_type sb_count = get_super_block_count(i);
215
216 if (!sb_count)
217 return 0;
218 if (sb_count == (bm::set_sub_array_size * bm::gap_max_bits)) // FULL BLOCK
219 return bm::gap_max_bits;
220
221 unsigned j = unsigned(nb & bm::set_array_mask);
222
223 unsigned row_idx = sblock_row_idx_[i+1];
224 const unsigned* row = block_matr_.row(row_idx);
225 unsigned bc;
226
227 bc = (!j) ? row[j] : row[j] - row[j-1];
228 return bc;
229 }
230
231 //---------------------------------------------------------------------
232
233 template<typename BVAlloc>
234 typename rs_index<BVAlloc>::size_type
count()235 rs_index<BVAlloc>::count() const
236 {
237 if (!total_blocks_)
238 return 0;
239 return sblock_count_[max_sblock_ + 1];
240 }
241
242 //---------------------------------------------------------------------
243
244 template<typename BVAlloc>
245 typename rs_index<BVAlloc>::size_type
rcount(block_idx_type nb)246 rs_index<BVAlloc>::rcount(block_idx_type nb) const
247 {
248 unsigned i = unsigned(nb >> bm::set_array_shift);
249 size_type sb_rcount = i ? get_super_block_rcount(i-1) : i;
250
251 unsigned sb_count = get_super_block_count(i);
252 if (!sb_count)
253 return sb_rcount;
254
255 unsigned j = unsigned(nb & bm::set_array_mask);
256 if (sb_count == (bm::set_sub_array_size * bm::gap_max_bits)) // FULL BLOCK
257 {
258 sb_count = (j+1) * bm::gap_max_bits;
259 sb_rcount += sb_count;
260 return sb_rcount;
261 }
262
263 unsigned row_idx = sblock_row_idx_[i+1];
264 const unsigned* row = block_matr_.row(row_idx);
265 unsigned bc = row[j];
266 sb_rcount += bc;
267 return sb_rcount;
268
269 }
270
271 //---------------------------------------------------------------------
272
273 template<typename BVAlloc>
find_sub_range(unsigned block_bit_pos)274 unsigned rs_index<BVAlloc>::find_sub_range(unsigned block_bit_pos)
275 {
276 return (block_bit_pos <= rs3_border0) ? 0 :
277 (block_bit_pos > rs3_border1) ? 2 : 1;
278 }
279
280 //---------------------------------------------------------------------
281
282 template<typename BVAlloc>
select_sub_range(block_idx_type nb,size_type & rank)283 bm::gap_word_t rs_index<BVAlloc>::select_sub_range(block_idx_type nb,
284 size_type& rank) const
285 {
286 unsigned sub_cnt = sub_count(nb);
287 unsigned first = sub_cnt & 0xFFFF;
288 unsigned second = sub_cnt >> 16;
289
290 if (rank > first)
291 {
292 rank -= first;
293 if (rank > second)
294 {
295 rank -= second;
296 return rs3_border1 + 1;
297 }
298 else
299 return rs3_border0 + 1;
300 }
301 return 0;
302 }
303
304 //---------------------------------------------------------------------
305
306 template<typename BVAlloc>
307 typename rs_index<BVAlloc>::block_idx_type
find(size_type rank)308 rs_index<BVAlloc>::find(size_type rank) const
309 {
310 BM_ASSERT(rank);
311
312 unsigned i = find_super_block(rank);
313 BM_ASSERT(i < super_block_size());
314
315 size_type prev_rc = sblock_count_[i];
316 size_type curr_rc = sblock_count_[i+1];
317 size_type bc = curr_rc - prev_rc;
318
319 BM_ASSERT(bc);
320
321 unsigned j;
322 rank -= prev_rc;
323 if (bc == (bm::set_sub_array_size * bm::gap_max_bits)) // FULL BLOCK
324 {
325 for (j = 0; j < bm::set_sub_array_size; ++j)
326 {
327 if (rank <= bm::gap_max_bits)
328 break;
329 rank -= bm::gap_max_bits;
330 } // for j
331 }
332 else
333 {
334 unsigned row_idx = sblock_row_idx_[i+1];
335 const unsigned* row = block_matr_.row(row_idx);
336 BM_ASSERT(rank <= (bm::set_sub_array_size * bm::gap_max_bits));
337 j = bm::lower_bound_u32(row, unsigned(rank), 0, bm::set_sub_array_size-1);
338 }
339 BM_ASSERT(j < bm::set_sub_array_size);
340
341 block_idx_type nb = (i * bm::set_sub_array_size) + j;
342 return nb;
343 }
344
345 //---------------------------------------------------------------------
346 template<typename BVAlloc>
sub_count(block_idx_type nb)347 unsigned rs_index<BVAlloc>::sub_count(block_idx_type nb) const
348 {
349 if (nb >= total_blocks_)
350 return 0;
351
352 unsigned i = unsigned(nb >> bm::set_array_shift);
353 size_type sb_count = get_super_block_count(i);
354
355 if (!sb_count)
356 return 0;
357 if (sb_count == (bm::set_sub_array_size * bm::gap_max_bits)) // FULL BLOCK
358 {
359 unsigned first = rs3_border0 + 1;
360 unsigned second = rs3_border1 - first + 1;
361 return (second << 16) | first;
362 }
363
364 unsigned j = unsigned(nb & bm::set_array_mask);
365
366 unsigned row_idx = sblock_row_idx_[i+1];
367 const unsigned* sub_row = sub_block_matr_.row(row_idx);
368 return sub_row[j];
369 }
370
371
372 //---------------------------------------------------------------------
373
374 template<typename BVAlloc>
find(size_type * rank,block_idx_type * nb,bm::gap_word_t * sub_range)375 bool rs_index<BVAlloc>::find(size_type* rank,
376 block_idx_type* nb,
377 bm::gap_word_t* sub_range) const
378 {
379 BM_ASSERT(rank);
380 BM_ASSERT(nb);
381 BM_ASSERT(sub_range);
382
383 unsigned i = find_super_block(*rank);
384 if (i > max_sblock_)
385 return false;
386
387 size_type prev_rc = sblock_count_[i];
388 size_type curr_rc = sblock_count_[i+1];
389 size_type bc = curr_rc - prev_rc;
390
391 BM_ASSERT(bc);
392
393 unsigned j;
394 *rank -= prev_rc;
395 if (bc == (bm::set_sub_array_size * bm::gap_max_bits)) // FULL BLOCK
396 {
397 // TODO: optimize
398 size_type r = *rank;
399 for (j = 0; j < bm::set_sub_array_size; ++j)
400 {
401 if (r <= bm::gap_max_bits)
402 break;
403 r -= bm::gap_max_bits;
404 } // for j
405 *rank = r;
406
407 unsigned first = rs3_border0 + 1;
408 unsigned second = rs3_border1 - first + 1;
409 if (*rank > first)
410 {
411 *rank -= first;
412 if (*rank > second)
413 {
414 *rank -= second;
415 *sub_range = bm::rs3_border1 + 1;
416 }
417 else
418 {
419 *sub_range = bm::rs3_border0 + 1;
420 }
421 }
422 else
423 {
424 *sub_range = 0;
425 }
426 }
427 else
428 {
429 unsigned row_idx = sblock_row_idx_[i+1];
430 const unsigned* row = block_matr_.row(row_idx);
431
432 BM_ASSERT(*rank <= (bm::set_sub_array_size * bm::gap_max_bits));
433 j = bm::lower_bound_u32(row, unsigned(*rank), 0, bm::set_sub_array_size-1);
434 BM_ASSERT(j < bm::set_sub_array_size);
435 if (j)
436 *rank -= row[j-1];
437
438 const unsigned* sub_row = sub_block_matr_.row(row_idx);
439 unsigned first = sub_row[j] & 0xFFFF;
440 unsigned second = sub_row[j] >> 16;
441 if (*rank > first)
442 {
443 *rank -= first;
444 if (*rank > second)
445 {
446 *rank -= second;
447 *sub_range = bm::rs3_border1 + 1;
448 }
449 else
450 {
451 *sub_range = bm::rs3_border0 + 1;
452 }
453 }
454 else
455 {
456 *sub_range = 0;
457 }
458 }
459 BM_ASSERT(j < bm::set_sub_array_size);
460 *nb = (i * bm::set_sub_array_size) + j;
461
462 return true;
463
464 }
465
466 //---------------------------------------------------------------------
467
468 template<typename BVAlloc>
set_null_super_block(unsigned i)469 void rs_index<BVAlloc>::set_null_super_block(unsigned i)
470 {
471 if (i > max_sblock_)
472 max_sblock_ = i;
473 sblock_count_[i+1] = sblock_count_[i];
474 sblock_row_idx_[i+1] = 0U;
475 }
476
477 //---------------------------------------------------------------------
478
479 template<typename BVAlloc>
set_full_super_block(unsigned i)480 void rs_index<BVAlloc>::set_full_super_block(unsigned i)
481 {
482 set_super_block(i, bm::set_sub_array_size * bm::gap_max_bits);
483 }
484
485 //---------------------------------------------------------------------
486
487 template<typename BVAlloc>
set_super_block(unsigned i,size_type bcount)488 void rs_index<BVAlloc>::set_super_block(unsigned i, size_type bcount)
489 {
490 if (i > max_sblock_)
491 max_sblock_ = i;
492
493 sblock_count_[i+1] = sblock_count_[i] + bcount;
494 if (bcount == (bm::set_sub_array_size * bm::gap_max_bits)) // FULL BLOCK
495 {
496 sblock_row_idx_[i+1] = 0;
497 }
498 else
499 {
500 sblock_row_idx_[i+1] = sblock_rows_;
501 ++sblock_rows_;
502 }
503 }
504
505 //---------------------------------------------------------------------
506
507 template<typename BVAlloc>
get_super_block_count(unsigned i)508 unsigned rs_index<BVAlloc>::get_super_block_count(unsigned i) const
509 {
510 if (i > max_sblock_)
511 return 0;
512 size_type prev = sblock_count_[i];
513 size_type curr = sblock_count_[i + 1];
514 size_type cnt = curr - prev;
515 BM_ASSERT(cnt <= (bm::set_sub_array_size * bm::gap_max_bits));
516 return unsigned(cnt);
517 }
518
519 //---------------------------------------------------------------------
520
521 template<typename BVAlloc>
522 typename rs_index<BVAlloc>::size_type
get_super_block_rcount(unsigned i)523 rs_index<BVAlloc>::get_super_block_rcount(unsigned i) const
524 {
525 if (i > max_sblock_)
526 i = max_sblock_;
527 return sblock_count_[i+1];
528 }
529
530 //---------------------------------------------------------------------
531
532 template<typename BVAlloc>
find_super_block(size_type rank)533 unsigned rs_index<BVAlloc>::find_super_block(size_type rank) const
534 {
535 const sblock_count_type* bcount_arr = sblock_count_.begin();
536 unsigned i;
537
538 #ifdef BM64ADDR
539 i = bm::lower_bound_u64(bcount_arr, rank, 1, max_sblock_+1);
540 #else
541 i = bm::lower_bound_u32(bcount_arr, rank, 1, max_sblock_+1);
542 #endif
543 return i-1;
544 }
545
546 //---------------------------------------------------------------------
547
548 template<typename BVAlloc>
549 typename rs_index<BVAlloc>::size_type
super_block_size()550 rs_index<BVAlloc>::super_block_size() const
551 {
552
553 size_type total_sblocks = (size_type)sblock_count_.size();
554 if (total_sblocks)
555 return max_sblock_ + 1;
556 return 0;
557 }
558
559 //---------------------------------------------------------------------
560
561 template<typename BVAlloc>
resize_effective_super_blocks(size_type sb_eff)562 void rs_index<BVAlloc>::resize_effective_super_blocks(size_type sb_eff)
563 {
564 block_matr_.resize(sb_eff+1, bm::set_sub_array_size);
565 sub_block_matr_.resize(sb_eff+1, bm::set_sub_array_size);
566 }
567
568 //---------------------------------------------------------------------
569
570 template<typename BVAlloc>
register_super_block(unsigned i,const unsigned * bcount,const unsigned * sub_count_in)571 void rs_index<BVAlloc>::register_super_block(unsigned i,
572 const unsigned* bcount,
573 const unsigned* sub_count_in)
574 {
575 BM_ASSERT(bcount);
576 BM_ASSERT(sub_count_in);
577
578 if (i > max_sblock_)
579 max_sblock_ = i;
580
581
582 sblock_row_idx_[i+1] = sblock_rows_;
583 unsigned* row = block_matr_.row(sblock_rows_);
584 unsigned* sub_row = sub_block_matr_.row(sblock_rows_);
585 ++sblock_rows_;
586
587 unsigned bc = 0;
588 for (unsigned j = 0; j < bm::set_sub_array_size; ++j)
589 {
590 bc += bcount[j];
591 row[j] = bc;
592 sub_row[j] = sub_count_in[j];
593 BM_ASSERT(bcount[j] >= (sub_count_in[j] >> 16) + (sub_count_in[j] & bm::set_block_mask));
594 } // for j
595 sblock_count_[i+1] = sblock_count_[i] + bc;
596 }
597
598 //---------------------------------------------------------------------
599
600 }
601 #endif
602