1 #ifndef BMRS__H__INCLUDED__
2 #define BMRS__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2019 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10     http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit:  http://bitmagic.io
19 */
20 
21 /*! \file bmrs.h
22     \brief Rank-Select index structures
23 */
24 
25 namespace bm
26 {
27 
28 /**
29     @brief Rank-Select acceleration index
30 
31     Index uses two-level acceleration structure:
32     bcount - running total popcount for all (possible) blocks
33     (missing blocks give duplicate counts as POPCNT(N-1) + 0).
34     subcount - sub-count inside blocks
35 
36     @ingroup bvector
37     @internal
38 */
39 template<typename BVAlloc>
40 class rs_index
41 {
42 public:
43     typedef BVAlloc        bv_allocator_type;
44 
45 #ifdef BM64ADDR
46     typedef bm::id64_t                                   size_type;
47     typedef bm::id64_t                                   block_idx_type;
48     typedef bm::id64_t                                   sblock_count_type;
49 #else
50     typedef bm::id_t                                     size_type;
51     typedef bm::id_t                                     block_idx_type;
52     typedef bm::id_t                                     sblock_count_type;
53 #endif
54 
55     typedef bm::pair<bm::gap_word_t, bm::gap_word_t> sb_pair_type;
56 
57 public:
rs_index()58     rs_index() : total_blocks_(0), max_sblock_(0) {}
59     rs_index(const rs_index& rsi);
60 
61     /// init arrays to zeros
62     void init() BMNOEXCEPT;
63 
64     /// copy rs index
65     void copy_from(const rs_index& rsi);
66 
67     // -----------------------------------------------------------------
68 
69     /// set empty (no bits set super-block)
70     void set_null_super_block(unsigned i);
71 
72     /// set FULL (all bits set super-block)
73     void set_full_super_block(unsigned i);
74 
75     /// set super block count
76     void set_super_block(unsigned i, size_type bcount);
77 
78     /// return bit-count in super-block
79     unsigned get_super_block_count(unsigned i) const;
80 
81     /// return running bit-count in super-block
82     size_type get_super_block_rcount(unsigned i) const;
83 
84     /// return number of super-blocks
85     size_type super_block_size() const;
86 
87     /// Find super-block with specified rank
88     unsigned find_super_block(size_type rank) const;
89 
90 
91     /// set size of true super-blocks (not NULL and not FFFFF)
92     void resize_effective_super_blocks(size_type sb_eff);
93 
94     /// Add block list belonging to one super block
95     void register_super_block(unsigned i,
96                               const unsigned* bcount,
97                               const unsigned* sub_count);
98 
99     /// find block position and sub-range for the specified rank
100     bool find(size_type* rank,
101               block_idx_type* nb, bm::gap_word_t* sub_range) const;
102 
103     /// return sub-clock counts
104     unsigned sub_count(block_idx_type nb) const;
105 
106 
107     // -----------------------------------------------------------------
108 
109     /// reserve the capacity for block count
110     void resize(block_idx_type new_size);
111 
112     /// set total blocks
set_total(size_type t)113     void set_total(size_type t) { total_blocks_ = t; }
114 
115     /// get total blocks
get_total()116     size_type get_total() const { return total_blocks_; }
117 
118     /// return bit-count for specified block
119     unsigned count(block_idx_type nb) const;
120 
121     /// return total bit-count for the index
122     size_type count() const;
123 
124     /// return running bit-count for specified block
125     size_type rcount(block_idx_type nb) const;
126 
127     /// determine the sub-range within a bit-block
128     static
129     unsigned find_sub_range(unsigned block_bit_pos);
130 
131     /// determine block sub-range for rank search
132     bm::gap_word_t select_sub_range(block_idx_type nb, size_type& rank) const;
133 
134     /// find block position for the specified rank
135     block_idx_type find(size_type rank) const;
136 
137 private:
138     typedef bm::heap_vector<sblock_count_type, bv_allocator_type, false>
139                                                     sblock_count_vector_type;
140     typedef bm::heap_vector<unsigned, bv_allocator_type, false>
141                                                     sblock_row_vector_type;
142     typedef bm::dynamic_heap_matrix<unsigned, bv_allocator_type>  blocks_matrix_type;
143 
144 private:
145     unsigned                  sblock_rows_;
146     sblock_count_vector_type  sblock_count_;   ///< super-block accumulated counts
147     sblock_row_vector_type    sblock_row_idx_; ///< super-block row numbers
148     blocks_matrix_type        block_matr_;     ///< blocks within super-blocks (matrix)
149     blocks_matrix_type        sub_block_matr_; ///< sub-block counts
150     size_type                 total_blocks_;   ///< total bit-blocks in the index
151     unsigned                  max_sblock_;     ///< max. superblock index
152 };
153 
154 //---------------------------------------------------------------------
155 //
156 //---------------------------------------------------------------------
157 
158 template<typename BVAlloc>
rs_index(const rs_index<BVAlloc> & rsi)159 rs_index<BVAlloc>::rs_index(const rs_index<BVAlloc>& rsi)
160 {
161     copy_from(rsi);
162 }
163 
164 //---------------------------------------------------------------------
165 
166 
167 template<typename BVAlloc>
init()168 void rs_index<BVAlloc>::init() BMNOEXCEPT
169 {
170     sblock_count_.resize(0);
171     sblock_row_idx_.resize(0);
172     block_matr_.resize(0, 0);
173     sub_block_matr_.resize(0, 0);
174 
175     total_blocks_ = sblock_rows_ = max_sblock_ = 0;
176 }
177 
178 //---------------------------------------------------------------------
179 
180 template<typename BVAlloc>
resize(block_idx_type new_size)181 void rs_index<BVAlloc>::resize(block_idx_type new_size)
182 {
183     sblock_rows_ = 0;
184 
185     block_idx_type sblock_count = (new_size >> bm::set_array_shift) + 3;
186     BM_ASSERT(sblock_count == (new_size / 256) + 3);
187     sblock_count_.resize(sblock_count);
188     sblock_row_idx_.resize(sblock_count);
189 }
190 
191 //---------------------------------------------------------------------
192 
193 template<typename BVAlloc>
copy_from(const rs_index & rsi)194 void rs_index<BVAlloc>::copy_from(const rs_index& rsi)
195 {
196     sblock_rows_ = rsi.sblock_rows_;
197     sblock_count_ = rsi.sblock_count_;
198     sblock_row_idx_ = rsi.sblock_row_idx_;
199     block_matr_ = rsi.block_matr_;
200     sub_block_matr_ = rsi.sub_block_matr_;
201 
202     total_blocks_ = rsi.total_blocks_;
203     max_sblock_ = rsi.max_sblock_;
204 }
205 
206 //---------------------------------------------------------------------
207 
208 template<typename BVAlloc>
count(block_idx_type nb)209 unsigned rs_index<BVAlloc>::count(block_idx_type nb) const
210 {
211     if (nb >= total_blocks_)
212         return 0;
213     unsigned i = unsigned(nb >> bm::set_array_shift);
214     size_type sb_count = get_super_block_count(i);
215 
216     if (!sb_count)
217         return 0;
218     if (sb_count == (bm::set_sub_array_size * bm::gap_max_bits)) // FULL BLOCK
219         return bm::gap_max_bits;
220 
221     unsigned j = unsigned(nb &  bm::set_array_mask);
222 
223     unsigned row_idx = sblock_row_idx_[i+1];
224     const unsigned* row = block_matr_.row(row_idx);
225     unsigned bc;
226 
227     bc = (!j) ? row[j] : row[j] - row[j-1];
228     return bc;
229 }
230 
231 //---------------------------------------------------------------------
232 
233 template<typename BVAlloc>
234 typename rs_index<BVAlloc>::size_type
count()235 rs_index<BVAlloc>::count() const
236 {
237     if (!total_blocks_)
238         return 0;
239     return sblock_count_[max_sblock_ + 1];
240 }
241 
242 //---------------------------------------------------------------------
243 
244 template<typename BVAlloc>
245 typename rs_index<BVAlloc>::size_type
rcount(block_idx_type nb)246 rs_index<BVAlloc>::rcount(block_idx_type nb) const
247 {
248     unsigned i = unsigned(nb >> bm::set_array_shift);
249     size_type sb_rcount = i ? get_super_block_rcount(i-1) : i;
250 
251     unsigned sb_count = get_super_block_count(i);
252     if (!sb_count)
253         return sb_rcount;
254 
255     unsigned j = unsigned(nb &  bm::set_array_mask);
256     if (sb_count == (bm::set_sub_array_size * bm::gap_max_bits)) // FULL BLOCK
257     {
258         sb_count = (j+1) * bm::gap_max_bits;
259         sb_rcount += sb_count;
260         return sb_rcount;
261     }
262 
263     unsigned row_idx = sblock_row_idx_[i+1];
264     const unsigned* row = block_matr_.row(row_idx);
265     unsigned bc = row[j];
266     sb_rcount += bc;
267     return sb_rcount;
268 
269 }
270 
271 //---------------------------------------------------------------------
272 
273 template<typename BVAlloc>
find_sub_range(unsigned block_bit_pos)274 unsigned rs_index<BVAlloc>::find_sub_range(unsigned block_bit_pos)
275 {
276     return (block_bit_pos <= rs3_border0) ? 0 :
277             (block_bit_pos > rs3_border1) ? 2 : 1;
278 }
279 
280 //---------------------------------------------------------------------
281 
282 template<typename BVAlloc>
select_sub_range(block_idx_type nb,size_type & rank)283 bm::gap_word_t rs_index<BVAlloc>::select_sub_range(block_idx_type nb,
284                                                    size_type& rank) const
285 {
286     unsigned sub_cnt = sub_count(nb);
287     unsigned first = sub_cnt & 0xFFFF;
288     unsigned second = sub_cnt >> 16;
289 
290     if (rank > first)
291     {
292         rank -= first;
293         if (rank > second)
294         {
295             rank -= second;
296             return rs3_border1 + 1;
297         }
298         else
299             return rs3_border0 + 1;
300     }
301     return 0;
302 }
303 
304 //---------------------------------------------------------------------
305 
306 template<typename BVAlloc>
307 typename rs_index<BVAlloc>::block_idx_type
find(size_type rank)308 rs_index<BVAlloc>::find(size_type rank) const
309 {
310     BM_ASSERT(rank);
311 
312     unsigned i = find_super_block(rank);
313     BM_ASSERT(i < super_block_size());
314 
315     size_type prev_rc = sblock_count_[i];
316     size_type curr_rc = sblock_count_[i+1];
317     size_type bc = curr_rc - prev_rc;
318 
319     BM_ASSERT(bc);
320 
321     unsigned j;
322     rank -= prev_rc;
323     if (bc == (bm::set_sub_array_size * bm::gap_max_bits)) // FULL BLOCK
324     {
325         for (j = 0; j < bm::set_sub_array_size; ++j)
326         {
327             if (rank <= bm::gap_max_bits)
328                 break;
329             rank -= bm::gap_max_bits;
330         } // for j
331     }
332     else
333     {
334         unsigned row_idx = sblock_row_idx_[i+1];
335         const unsigned* row = block_matr_.row(row_idx);
336         BM_ASSERT(rank <= (bm::set_sub_array_size * bm::gap_max_bits));
337         j = bm::lower_bound_u32(row, unsigned(rank), 0, bm::set_sub_array_size-1);
338     }
339     BM_ASSERT(j < bm::set_sub_array_size);
340 
341     block_idx_type nb = (i * bm::set_sub_array_size) + j;
342     return nb;
343 }
344 
345 //---------------------------------------------------------------------
346 template<typename BVAlloc>
sub_count(block_idx_type nb)347 unsigned rs_index<BVAlloc>::sub_count(block_idx_type nb) const
348 {
349     if (nb >= total_blocks_)
350         return 0;
351 
352     unsigned i = unsigned(nb >> bm::set_array_shift);
353     size_type sb_count = get_super_block_count(i);
354 
355     if (!sb_count)
356         return 0;
357     if (sb_count == (bm::set_sub_array_size * bm::gap_max_bits)) // FULL BLOCK
358     {
359         unsigned first = rs3_border0 + 1;
360         unsigned second = rs3_border1 - first + 1;
361         return (second << 16) | first;
362     }
363 
364     unsigned j = unsigned(nb &  bm::set_array_mask);
365 
366     unsigned row_idx = sblock_row_idx_[i+1];
367     const unsigned* sub_row = sub_block_matr_.row(row_idx);
368     return sub_row[j];
369 }
370 
371 
372 //---------------------------------------------------------------------
373 
374 template<typename BVAlloc>
find(size_type * rank,block_idx_type * nb,bm::gap_word_t * sub_range)375 bool rs_index<BVAlloc>::find(size_type* rank,
376                              block_idx_type* nb,
377                              bm::gap_word_t* sub_range) const
378 {
379     BM_ASSERT(rank);
380     BM_ASSERT(nb);
381     BM_ASSERT(sub_range);
382 
383     unsigned i = find_super_block(*rank);
384     if (i > max_sblock_)
385         return false;
386 
387     size_type prev_rc = sblock_count_[i];
388     size_type curr_rc = sblock_count_[i+1];
389     size_type bc = curr_rc - prev_rc;
390 
391     BM_ASSERT(bc);
392 
393     unsigned j;
394     *rank -= prev_rc;
395     if (bc == (bm::set_sub_array_size * bm::gap_max_bits)) // FULL BLOCK
396     {
397         // TODO: optimize
398         size_type r = *rank;
399         for (j = 0; j < bm::set_sub_array_size; ++j)
400         {
401             if (r <= bm::gap_max_bits)
402                 break;
403             r -= bm::gap_max_bits;
404         } // for j
405         *rank = r;
406 
407         unsigned first = rs3_border0 + 1;
408         unsigned second = rs3_border1 - first + 1;
409         if (*rank > first)
410         {
411             *rank -= first;
412             if (*rank > second)
413             {
414                 *rank -= second;
415                 *sub_range = bm::rs3_border1 + 1;
416             }
417             else
418             {
419                 *sub_range = bm::rs3_border0 + 1;
420             }
421         }
422         else
423         {
424             *sub_range = 0;
425         }
426     }
427     else
428     {
429         unsigned row_idx = sblock_row_idx_[i+1];
430         const unsigned* row = block_matr_.row(row_idx);
431 
432         BM_ASSERT(*rank <= (bm::set_sub_array_size * bm::gap_max_bits));
433         j = bm::lower_bound_u32(row, unsigned(*rank), 0, bm::set_sub_array_size-1);
434         BM_ASSERT(j < bm::set_sub_array_size);
435         if (j)
436             *rank -= row[j-1];
437 
438         const unsigned* sub_row = sub_block_matr_.row(row_idx);
439         unsigned first = sub_row[j] & 0xFFFF;
440         unsigned second = sub_row[j] >> 16;
441         if (*rank > first)
442         {
443             *rank -= first;
444             if (*rank > second)
445             {
446                 *rank -= second;
447                 *sub_range = bm::rs3_border1 + 1;
448             }
449             else
450             {
451                 *sub_range = bm::rs3_border0 + 1;
452             }
453         }
454         else
455         {
456             *sub_range = 0;
457         }
458     }
459     BM_ASSERT(j < bm::set_sub_array_size);
460     *nb = (i * bm::set_sub_array_size) + j;
461 
462     return true;
463 
464 }
465 
466 //---------------------------------------------------------------------
467 
468 template<typename BVAlloc>
set_null_super_block(unsigned i)469 void rs_index<BVAlloc>::set_null_super_block(unsigned i)
470 {
471     if (i > max_sblock_)
472         max_sblock_ = i;
473     sblock_count_[i+1] = sblock_count_[i];
474     sblock_row_idx_[i+1] = 0U;
475 }
476 
477 //---------------------------------------------------------------------
478 
479 template<typename BVAlloc>
set_full_super_block(unsigned i)480 void rs_index<BVAlloc>::set_full_super_block(unsigned i)
481 {
482     set_super_block(i, bm::set_sub_array_size * bm::gap_max_bits);
483 }
484 
485 //---------------------------------------------------------------------
486 
487 template<typename BVAlloc>
set_super_block(unsigned i,size_type bcount)488 void rs_index<BVAlloc>::set_super_block(unsigned i, size_type bcount)
489 {
490     if (i > max_sblock_)
491         max_sblock_ = i;
492 
493     sblock_count_[i+1] = sblock_count_[i] + bcount;
494     if (bcount == (bm::set_sub_array_size * bm::gap_max_bits)) // FULL BLOCK
495     {
496         sblock_row_idx_[i+1] = 0;
497     }
498     else
499     {
500         sblock_row_idx_[i+1] = sblock_rows_;
501         ++sblock_rows_;
502     }
503 }
504 
505 //---------------------------------------------------------------------
506 
507 template<typename BVAlloc>
get_super_block_count(unsigned i)508 unsigned rs_index<BVAlloc>::get_super_block_count(unsigned i) const
509 {
510     if (i > max_sblock_)
511         return 0;
512     size_type prev = sblock_count_[i];
513     size_type curr = sblock_count_[i + 1];
514     size_type cnt = curr - prev;
515     BM_ASSERT(cnt <= (bm::set_sub_array_size * bm::gap_max_bits));
516     return unsigned(cnt);
517 }
518 
519 //---------------------------------------------------------------------
520 
521 template<typename BVAlloc>
522 typename rs_index<BVAlloc>::size_type
get_super_block_rcount(unsigned i)523 rs_index<BVAlloc>::get_super_block_rcount(unsigned i) const
524 {
525     if (i > max_sblock_)
526         i = max_sblock_;
527     return sblock_count_[i+1];
528 }
529 
530 //---------------------------------------------------------------------
531 
532 template<typename BVAlloc>
find_super_block(size_type rank)533 unsigned rs_index<BVAlloc>::find_super_block(size_type rank) const
534 {
535     const sblock_count_type* bcount_arr = sblock_count_.begin();
536     unsigned i;
537 
538     #ifdef BM64ADDR
539         i = bm::lower_bound_u64(bcount_arr, rank, 1, max_sblock_+1);
540     #else
541         i = bm::lower_bound_u32(bcount_arr, rank, 1, max_sblock_+1);
542     #endif
543     return i-1;
544 }
545 
546 //---------------------------------------------------------------------
547 
548 template<typename BVAlloc>
549 typename rs_index<BVAlloc>::size_type
super_block_size()550 rs_index<BVAlloc>::super_block_size() const
551 {
552 
553     size_type total_sblocks = (size_type)sblock_count_.size();
554     if (total_sblocks)
555         return max_sblock_ + 1;
556     return 0;
557 }
558 
559 //---------------------------------------------------------------------
560 
561 template<typename BVAlloc>
resize_effective_super_blocks(size_type sb_eff)562 void rs_index<BVAlloc>::resize_effective_super_blocks(size_type sb_eff)
563 {
564     block_matr_.resize(sb_eff+1,     bm::set_sub_array_size);
565     sub_block_matr_.resize(sb_eff+1, bm::set_sub_array_size);
566 }
567 
568 //---------------------------------------------------------------------
569 
570 template<typename BVAlloc>
register_super_block(unsigned i,const unsigned * bcount,const unsigned * sub_count_in)571 void rs_index<BVAlloc>::register_super_block(unsigned i,
572                                              const unsigned* bcount,
573                                              const unsigned* sub_count_in)
574 {
575     BM_ASSERT(bcount);
576     BM_ASSERT(sub_count_in);
577 
578     if (i > max_sblock_)
579         max_sblock_ = i;
580 
581 
582     sblock_row_idx_[i+1] = sblock_rows_;
583     unsigned* row = block_matr_.row(sblock_rows_);
584     unsigned* sub_row = sub_block_matr_.row(sblock_rows_);
585     ++sblock_rows_;
586 
587     unsigned bc = 0;
588     for (unsigned j = 0; j < bm::set_sub_array_size; ++j)
589     {
590         bc += bcount[j];
591         row[j] = bc;
592         sub_row[j] = sub_count_in[j];
593         BM_ASSERT(bcount[j] >= (sub_count_in[j] >> 16) + (sub_count_in[j] & bm::set_block_mask));
594     } // for j
595     sblock_count_[i+1] = sblock_count_[i] + bc;
596 }
597 
598 //---------------------------------------------------------------------
599 
600 }
601 #endif
602