1 #ifndef BMRANDOM__H__INCLUDED__
2 #define BMRANDOM__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2019 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10     http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit:  http://bitmagic.io
19 */
20 
21 /*! \file bmrandom.h
22     \brief Generation of random subset
23 */
24 
25 #ifndef BM__H__INCLUDED__
26 // BitMagic utility headers do not include main "bm.h" declaration
27 // #include "bm.h" or "bm64.h" explicitly
28 # error missing include (bm.h or bm64.h)
29 #endif
30 
31 
32 #include "bmfunc.h"
33 #include "bmdef.h"
34 
35 #include <stdlib.h>
36 #include <random>
37 #include <algorithm>
38 
39 namespace bm
40 {
41 
42 
43 /*!
44     Class implements algorithm for random subset generation.
45 
46     Implemented method tries to be fair, but doesn't guarantee
47     true randomeness or absense of bias.
48 
49     Performace note:
50     Class holds temporary buffers and variables, so it is recommended to
51     re-use instances over multiple calls.
52 
53     \ingroup setalgo
54 */
55 template<class BV>
56 class random_subset
57 {
58 public:
59     typedef BV                      bvector_type;
60     typedef typename BV::size_type  size_type;
61 public:
62     random_subset();
63     ~random_subset();
64 
65     /// Get random subset of input vector
66     ///
67     /// @param bv_out - destination vector
68     /// @param bv_in  - input vector
69     /// @param sample_count  - number of bits to pick
70     ///
71     void sample(BV& bv_out, const BV& bv_in, size_type sample_count);
72 
73 
74 private:
75     typedef
76         typename BV::blocks_manager_type  blocks_manager_type;
77 
78 private:
79     /// simple picking algorithm for small number of items
80     /// in [first,last] range
81     ///
82     void simple_pick(BV& bv_out,
83                       const BV&  bv_in,
84                       size_type sample_count,
85                       size_type first,
86                       size_type last);
87 
88     void get_subset(BV& bv_out,
89                     const BV&  bv_in,
90                     size_type  in_count,
91                     size_type  sample_count);
92 
93     void get_block_subset(bm::word_t*       blk_out,
94                           const bm::word_t* blk_src,
95                           unsigned          count);
96     static
97     unsigned process_word(bm::word_t*       blk_out,
98                           const bm::word_t* blk_src,
99                           unsigned          nword,
100                           unsigned          take_count) BMNOEXCEPT;
101 
102     static
103     void get_random_array(bm::word_t*       blk_out,
104                           bm::gap_word_t*   bit_list,
105                           unsigned          bit_list_size,
106                           unsigned          count);
107     static
108     unsigned compute_take_count(unsigned bc,
109                 size_type in_count, size_type sample_count) BMNOEXCEPT;
110 
111 
112 private:
113     random_subset(const random_subset&);
114     random_subset& operator=(const random_subset&);
115 private:
116     typename bvector_type::rs_index_type   rsi_; ///< RS index (block counts)
117     bvector_type                           bv_nb_; ///< blocks vector
118     bm::gap_word_t                         bit_list_[bm::gap_max_bits];
119     bm::word_t*                            sub_block_;
120 };
121 
122 
123 ///////////////////////////////////////////////////////////////////
124 
125 
126 
127 template<class BV>
random_subset()128 random_subset<BV>::random_subset()
129 {
130     sub_block_ = new bm::word_t[bm::set_block_size];
131 }
132 
133 template<class BV>
~random_subset()134 random_subset<BV>::~random_subset()
135 {
136     delete [] sub_block_;
137 }
138 
139 template<class BV>
sample(BV & bv_out,const BV & bv_in,size_type sample_count)140 void random_subset<BV>::sample(BV&       bv_out,
141                                const BV& bv_in,
142                                size_type sample_count)
143 {
144     if (sample_count == 0)
145     {
146         bv_out.clear(true);
147         return;
148     }
149 
150     rsi_.init();
151     bv_in.build_rs_index(&rsi_, &bv_nb_);
152 
153     size_type in_count = rsi_.count();
154     if (in_count <= sample_count)
155     {
156         bv_out = bv_in;
157         return;
158     }
159 
160     float pick_ratio = float(sample_count) / float(in_count);
161     if (pick_ratio < 0.054f)
162     {
163         size_type first, last;
164         bool b = bv_in.find_range(first, last);
165         if (!b)
166             return;
167 
168         simple_pick(bv_out, bv_in, sample_count, first, last);
169         return;
170     }
171 
172     if (sample_count > in_count/2)
173     {
174         // build the complement vector and subtract it
175         BV tmp_bv;
176         size_type delta_count = in_count - sample_count;
177 
178         get_subset(tmp_bv, bv_in, in_count, delta_count);
179         bv_out = bv_in;
180         bv_out -= tmp_bv;
181         return;
182     }
183     get_subset(bv_out, bv_in, in_count, sample_count);
184 }
185 
186 template<class BV>
simple_pick(BV & bv_out,const BV & bv_in,size_type sample_count,size_type first,size_type last)187 void random_subset<BV>::simple_pick(BV&        bv_out,
188                                     const BV&  bv_in,
189                                     size_type sample_count,
190                                     size_type first,
191                                     size_type last)
192 {
193     bv_out.clear(true);
194 
195     std::random_device rd;
196     #ifdef BM64ADDR
197         std::mt19937_64 mt_rand(rd());
198     #else
199         std::mt19937 mt_rand(rd());
200     #endif
201     std::uniform_int_distribution<size_type> dist(first, last);
202 
203     while (sample_count)
204     {
205         size_type fidx;
206         size_type ridx = dist(mt_rand); // generate random position
207 
208         BM_ASSERT(ridx >= first && ridx <= last);
209 
210         bool b = bv_in.find(ridx, fidx); // find next valid bit after random
211         BM_ASSERT(b);
212         if (b)
213         {
214             // set true if was false
215             bool is_set = bv_out.set_bit_conditional(fidx, true, false);
216             sample_count -= is_set;
217             while (!is_set) // find next valid (and not set) bit
218             {
219                 ++fidx;
220                 // searching always left to right may create a bias...
221                 b = bv_in.find(fidx, fidx);
222                 if (!b)
223                     break;
224                 if (fidx > last)
225                     break;
226                 is_set = bv_out.set_bit_conditional(fidx, true, false);
227                 sample_count -= is_set;
228             } // while
229         }
230     } // while
231 }
232 
233 
234 template<class BV>
get_subset(BV & bv_out,const BV & bv_in,size_type in_count,size_type sample_count)235 void random_subset<BV>::get_subset(BV&        bv_out,
236                                    const BV&  bv_in,
237                                    size_type  in_count,
238                                    size_type  sample_count)
239 {
240     bv_out.clear(true);
241     bv_out.resize(bv_in.size());
242 
243     const blocks_manager_type& bman_in = bv_in.get_blocks_manager();
244     blocks_manager_type& bman_out = bv_out.get_blocks_manager();
245 
246     bm::word_t* tmp_block = bman_out.check_allocate_tempblock();
247 
248     size_type first_nb, last_nb;
249     bool b = bv_nb_.find_range(first_nb, last_nb);
250     BM_ASSERT(b);
251     if (!b)
252         return;
253 
254     std::random_device rd;
255     #ifdef BM64ADDR
256         std::mt19937_64 mt_rand(rd());
257     #else
258         std::mt19937 mt_rand(rd());
259     #endif
260     std::uniform_int_distribution<size_type> dist_nb(first_nb, last_nb);
261 
262     size_type curr_sample_count = sample_count;
263     for (unsigned take_count = 0; curr_sample_count; curr_sample_count -= take_count)
264     {
265         // pick block at random
266         //
267         size_type nb;
268         size_type ridx = dist_nb(mt_rand); // generate random block idx
269         BM_ASSERT(ridx >= first_nb && ridx <= last_nb);
270 
271         b = bv_nb_.find(ridx, nb); // find next valid nb
272         if (!b)
273         {
274             b = bv_nb_.find(first_nb, nb);
275             if (!b)
276             {
277                 b = bv_nb_.find(first_nb, nb);
278                 BM_ASSERT(!bv_nb_.any());
279                 BM_ASSERT(b);
280                 return; // cannot find block
281             }
282         }
283         bv_nb_.clear_bit_no_check(nb); // remove from blocks list
284 
285         // calculate proportinal sample count
286         //
287         unsigned bc = rsi_.count(nb);
288         BM_ASSERT(bc && (bc <= 65536));
289         take_count = compute_take_count(bc, in_count, sample_count);
290         if (take_count > curr_sample_count)
291             take_count = unsigned(curr_sample_count);
292         BM_ASSERT(take_count);
293         if (!take_count)
294             continue;
295         {
296             unsigned i0, j0;
297             bm::get_block_coord(nb, i0, j0);
298             const bm::word_t* blk_src = bman_in.get_block(i0, j0);
299             BM_ASSERT(blk_src);
300 
301             // allocate target block
302             bm::word_t* blk_out = bman_out.get_block_ptr(i0, j0);
303             BM_ASSERT(!blk_out);
304             if (blk_out)
305             {
306                 blk_out = bman_out.deoptimize_block(nb);
307             }
308             else
309             {
310                 blk_out = bman_out.get_allocator().alloc_bit_block();
311                 bman_out.set_block(nb, blk_out);
312             }
313             if (take_count == bc) // whole block take (strange)
314             {
315                 // copy the whole src block
316                 if (BM_IS_GAP(blk_src))
317                     bm::gap_convert_to_bitset(blk_out, BMGAP_PTR(blk_src));
318                 else
319                     bm::bit_block_copy(blk_out, blk_src);
320                 continue;
321             }
322             bm::bit_block_set(blk_out, 0);
323             if (bc < 4096) // use array shuffle
324             {
325                 unsigned arr_len;
326                 // convert source block to bit-block
327                 if (BM_IS_GAP(blk_src))
328                 {
329                     arr_len = bm::gap_convert_to_arr(bit_list_,
330                                                  BMGAP_PTR(blk_src),
331                                                  bm::gap_max_bits);
332                 }
333                 else // bit-block
334                 {
335                     arr_len = bm::bit_block_convert_to_arr(bit_list_, blk_src, 0);
336                 }
337                 BM_ASSERT(arr_len);
338                 get_random_array(blk_out, bit_list_, arr_len, take_count);
339             }
340             else // dense block
341             {
342                 // convert source block to bit-block
343                 if (BM_IS_GAP(blk_src))
344                 {
345                     bm::gap_convert_to_bitset(tmp_block, BMGAP_PTR(blk_src));
346                     blk_src = tmp_block;
347                 }
348                 // pick random bits source block to target
349                 get_block_subset(blk_out, blk_src, take_count);
350             }
351         }
352     } // for
353 }
354 
355 template<class BV>
compute_take_count(unsigned bc,size_type in_count,size_type sample_count)356 unsigned random_subset<BV>::compute_take_count(
357                                     unsigned bc,
358                                     size_type in_count,
359                                     size_type sample_count) BMNOEXCEPT
360 {
361     float block_percent = float(bc) / float(in_count);
362     float bits_to_take = float(sample_count) * block_percent;
363     bits_to_take += 0.99f;
364     unsigned to_take = unsigned(bits_to_take);
365     if (to_take > bc)
366         to_take = bc;
367     return to_take;
368 }
369 
370 
371 template<class BV>
get_block_subset(bm::word_t * blk_out,const bm::word_t * blk_src,unsigned take_count)372 void random_subset<BV>::get_block_subset(bm::word_t*       blk_out,
373                                           const bm::word_t* blk_src,
374                                           unsigned          take_count)
375 {
376     for (unsigned rounds = 0; take_count && rounds < 10; ++rounds)
377     {
378         // pick random scan start and scan direction
379         //
380         unsigned i = unsigned(rand()) % bm::set_block_size;
381         unsigned new_count;
382 
383         for (; i < bm::set_block_size && take_count; ++i)
384         {
385             if (blk_src[i] && (blk_out[i] != blk_src[i]))
386             {
387                 new_count = process_word(blk_out, blk_src, i, take_count);
388                 take_count -= new_count;
389             }
390         } // for i
391 
392     } // for
393     // if masked scan did not produce enough results..
394     //
395     if (take_count)
396     {
397         // Find all vacant bits: do logical (src SUB out)
398         for (unsigned i = 0; i < bm::set_block_size; ++i)
399         {
400             sub_block_[i] = blk_src[i] & ~blk_out[i];
401         }
402         // now transform vacant bits to array, then pick random elements
403         //
404         unsigned arr_len =
405             bm::bit_block_convert_to_arr(bit_list_, sub_block_, 0);
406                            //                   bm::gap_max_bits,
407                            //                   bm::gap_max_bits,
408                            //                   0);
409         BM_ASSERT(arr_len);
410         get_random_array(blk_out, bit_list_, arr_len, take_count);
411     }
412 }
413 
414 template<class BV>
process_word(bm::word_t * blk_out,const bm::word_t * blk_src,unsigned nword,unsigned take_count)415 unsigned random_subset<BV>::process_word(bm::word_t*       blk_out,
416                                          const bm::word_t* blk_src,
417                                          unsigned          nword,
418                                          unsigned          take_count) BMNOEXCEPT
419 {
420     unsigned new_bits, mask;
421     do
422     {
423         mask = unsigned(rand());
424         mask ^= mask << 16;
425     } while (mask == 0);
426 
427     std::random_device rd;
428     #ifdef BM64ADDR
429         std::mt19937_64 mt_rand(rd());
430     #else
431         std::mt19937 mt_rand(rd());
432     #endif
433     unsigned src_rand = blk_src[nword] & mask;
434     new_bits = src_rand & ~blk_out[nword];
435     if (new_bits)
436     {
437         unsigned new_count = bm::word_bitcount(new_bits);
438 
439         // check if we accidentally picked more bits than needed
440         if (new_count > take_count)
441         {
442             BM_ASSERT(take_count);
443 
444             unsigned char blist[64];
445             unsigned arr_size = bm::bitscan(new_bits, blist);
446             BM_ASSERT(arr_size == new_count);
447             std::shuffle(blist, blist + arr_size, mt_rand);
448             unsigned value = 0;
449             for (unsigned j = 0; j < take_count; ++j)
450             {
451                 value |= (1u << blist[j]);
452             }
453             new_bits = value;
454             new_count = take_count;
455 
456             BM_ASSERT(bm::word_bitcount(new_bits) == take_count);
457             BM_ASSERT((new_bits & ~blk_src[nword]) == 0);
458         }
459 
460         blk_out[nword] |= new_bits;
461         return new_count;
462     }
463     return 0;
464 }
465 
466 
467 template<class BV>
get_random_array(bm::word_t * blk_out,bm::gap_word_t * bit_list,unsigned bit_list_size,unsigned count)468 void random_subset<BV>::get_random_array(bm::word_t*       blk_out,
469                                          bm::gap_word_t*   bit_list,
470                                          unsigned          bit_list_size,
471                                          unsigned          count)
472 {
473     std::random_device rd;
474     #ifdef BM64ADDR
475         std::mt19937_64 mt_rand(rd());
476     #else
477         std::mt19937 mt_rand(rd());
478     #endif
479     std::shuffle(bit_list, bit_list + bit_list_size, mt_rand);
480     for (unsigned i = 0; i < count; ++i)
481     {
482         bm::set_bit(blk_out, bit_list[i]);
483     }
484 }
485 
486 } // namespace
487 
488 
489 #endif
490