1 #ifndef BMRANDOM__H__INCLUDED__
2 #define BMRANDOM__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2019 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17
18 For more information please visit: http://bitmagic.io
19 */
20
21 /*! \file bmrandom.h
22 \brief Generation of random subset
23 */
24
25 #ifndef BM__H__INCLUDED__
26 // BitMagic utility headers do not include main "bm.h" declaration
27 // #include "bm.h" or "bm64.h" explicitly
28 # error missing include (bm.h or bm64.h)
29 #endif
30
31
32 #include "bmfunc.h"
33 #include "bmdef.h"
34
35 #include <stdlib.h>
36 #include <random>
37 #include <algorithm>
38
39 namespace bm
40 {
41
42
43 /*!
44 Class implements algorithm for random subset generation.
45
46 Implemented method tries to be fair, but doesn't guarantee
47 true randomeness or absense of bias.
48
49 Performace note:
50 Class holds temporary buffers and variables, so it is recommended to
51 re-use instances over multiple calls.
52
53 \ingroup setalgo
54 */
55 template<class BV>
56 class random_subset
57 {
58 public:
59 typedef BV bvector_type;
60 typedef typename BV::size_type size_type;
61 public:
62 random_subset();
63 ~random_subset();
64
65 /// Get random subset of input vector
66 ///
67 /// @param bv_out - destination vector
68 /// @param bv_in - input vector
69 /// @param sample_count - number of bits to pick
70 ///
71 void sample(BV& bv_out, const BV& bv_in, size_type sample_count);
72
73
74 private:
75 typedef
76 typename BV::blocks_manager_type blocks_manager_type;
77
78 private:
79 /// simple picking algorithm for small number of items
80 /// in [first,last] range
81 ///
82 void simple_pick(BV& bv_out,
83 const BV& bv_in,
84 size_type sample_count,
85 size_type first,
86 size_type last);
87
88 void get_subset(BV& bv_out,
89 const BV& bv_in,
90 size_type in_count,
91 size_type sample_count);
92
93 void get_block_subset(bm::word_t* blk_out,
94 const bm::word_t* blk_src,
95 unsigned count);
96 static
97 unsigned process_word(bm::word_t* blk_out,
98 const bm::word_t* blk_src,
99 unsigned nword,
100 unsigned take_count) BMNOEXCEPT;
101
102 static
103 void get_random_array(bm::word_t* blk_out,
104 bm::gap_word_t* bit_list,
105 unsigned bit_list_size,
106 unsigned count);
107 static
108 unsigned compute_take_count(unsigned bc,
109 size_type in_count, size_type sample_count) BMNOEXCEPT;
110
111
112 private:
113 random_subset(const random_subset&);
114 random_subset& operator=(const random_subset&);
115 private:
116 typename bvector_type::rs_index_type rsi_; ///< RS index (block counts)
117 bvector_type bv_nb_; ///< blocks vector
118 bm::gap_word_t bit_list_[bm::gap_max_bits];
119 bm::word_t* sub_block_;
120 };
121
122
123 ///////////////////////////////////////////////////////////////////
124
125
126
127 template<class BV>
random_subset()128 random_subset<BV>::random_subset()
129 {
130 sub_block_ = new bm::word_t[bm::set_block_size];
131 }
132
133 template<class BV>
~random_subset()134 random_subset<BV>::~random_subset()
135 {
136 delete [] sub_block_;
137 }
138
139 template<class BV>
sample(BV & bv_out,const BV & bv_in,size_type sample_count)140 void random_subset<BV>::sample(BV& bv_out,
141 const BV& bv_in,
142 size_type sample_count)
143 {
144 if (sample_count == 0)
145 {
146 bv_out.clear(true);
147 return;
148 }
149
150 rsi_.init();
151 bv_in.build_rs_index(&rsi_, &bv_nb_);
152
153 size_type in_count = rsi_.count();
154 if (in_count <= sample_count)
155 {
156 bv_out = bv_in;
157 return;
158 }
159
160 float pick_ratio = float(sample_count) / float(in_count);
161 if (pick_ratio < 0.054f)
162 {
163 size_type first, last;
164 bool b = bv_in.find_range(first, last);
165 if (!b)
166 return;
167
168 simple_pick(bv_out, bv_in, sample_count, first, last);
169 return;
170 }
171
172 if (sample_count > in_count/2)
173 {
174 // build the complement vector and subtract it
175 BV tmp_bv;
176 size_type delta_count = in_count - sample_count;
177
178 get_subset(tmp_bv, bv_in, in_count, delta_count);
179 bv_out = bv_in;
180 bv_out -= tmp_bv;
181 return;
182 }
183 get_subset(bv_out, bv_in, in_count, sample_count);
184 }
185
186 template<class BV>
simple_pick(BV & bv_out,const BV & bv_in,size_type sample_count,size_type first,size_type last)187 void random_subset<BV>::simple_pick(BV& bv_out,
188 const BV& bv_in,
189 size_type sample_count,
190 size_type first,
191 size_type last)
192 {
193 bv_out.clear(true);
194
195 std::random_device rd;
196 #ifdef BM64ADDR
197 std::mt19937_64 mt_rand(rd());
198 #else
199 std::mt19937 mt_rand(rd());
200 #endif
201 std::uniform_int_distribution<size_type> dist(first, last);
202
203 while (sample_count)
204 {
205 size_type fidx;
206 size_type ridx = dist(mt_rand); // generate random position
207
208 BM_ASSERT(ridx >= first && ridx <= last);
209
210 bool b = bv_in.find(ridx, fidx); // find next valid bit after random
211 BM_ASSERT(b);
212 if (b)
213 {
214 // set true if was false
215 bool is_set = bv_out.set_bit_conditional(fidx, true, false);
216 sample_count -= is_set;
217 while (!is_set) // find next valid (and not set) bit
218 {
219 ++fidx;
220 // searching always left to right may create a bias...
221 b = bv_in.find(fidx, fidx);
222 if (!b)
223 break;
224 if (fidx > last)
225 break;
226 is_set = bv_out.set_bit_conditional(fidx, true, false);
227 sample_count -= is_set;
228 } // while
229 }
230 } // while
231 }
232
233
234 template<class BV>
get_subset(BV & bv_out,const BV & bv_in,size_type in_count,size_type sample_count)235 void random_subset<BV>::get_subset(BV& bv_out,
236 const BV& bv_in,
237 size_type in_count,
238 size_type sample_count)
239 {
240 bv_out.clear(true);
241 bv_out.resize(bv_in.size());
242
243 const blocks_manager_type& bman_in = bv_in.get_blocks_manager();
244 blocks_manager_type& bman_out = bv_out.get_blocks_manager();
245
246 bm::word_t* tmp_block = bman_out.check_allocate_tempblock();
247
248 size_type first_nb, last_nb;
249 bool b = bv_nb_.find_range(first_nb, last_nb);
250 BM_ASSERT(b);
251 if (!b)
252 return;
253
254 std::random_device rd;
255 #ifdef BM64ADDR
256 std::mt19937_64 mt_rand(rd());
257 #else
258 std::mt19937 mt_rand(rd());
259 #endif
260 std::uniform_int_distribution<size_type> dist_nb(first_nb, last_nb);
261
262 size_type curr_sample_count = sample_count;
263 for (unsigned take_count = 0; curr_sample_count; curr_sample_count -= take_count)
264 {
265 // pick block at random
266 //
267 size_type nb;
268 size_type ridx = dist_nb(mt_rand); // generate random block idx
269 BM_ASSERT(ridx >= first_nb && ridx <= last_nb);
270
271 b = bv_nb_.find(ridx, nb); // find next valid nb
272 if (!b)
273 {
274 b = bv_nb_.find(first_nb, nb);
275 if (!b)
276 {
277 b = bv_nb_.find(first_nb, nb);
278 BM_ASSERT(!bv_nb_.any());
279 BM_ASSERT(b);
280 return; // cannot find block
281 }
282 }
283 bv_nb_.clear_bit_no_check(nb); // remove from blocks list
284
285 // calculate proportinal sample count
286 //
287 unsigned bc = rsi_.count(nb);
288 BM_ASSERT(bc && (bc <= 65536));
289 take_count = compute_take_count(bc, in_count, sample_count);
290 if (take_count > curr_sample_count)
291 take_count = unsigned(curr_sample_count);
292 BM_ASSERT(take_count);
293 if (!take_count)
294 continue;
295 {
296 unsigned i0, j0;
297 bm::get_block_coord(nb, i0, j0);
298 const bm::word_t* blk_src = bman_in.get_block(i0, j0);
299 BM_ASSERT(blk_src);
300
301 // allocate target block
302 bm::word_t* blk_out = bman_out.get_block_ptr(i0, j0);
303 BM_ASSERT(!blk_out);
304 if (blk_out)
305 {
306 blk_out = bman_out.deoptimize_block(nb);
307 }
308 else
309 {
310 blk_out = bman_out.get_allocator().alloc_bit_block();
311 bman_out.set_block(nb, blk_out);
312 }
313 if (take_count == bc) // whole block take (strange)
314 {
315 // copy the whole src block
316 if (BM_IS_GAP(blk_src))
317 bm::gap_convert_to_bitset(blk_out, BMGAP_PTR(blk_src));
318 else
319 bm::bit_block_copy(blk_out, blk_src);
320 continue;
321 }
322 bm::bit_block_set(blk_out, 0);
323 if (bc < 4096) // use array shuffle
324 {
325 unsigned arr_len;
326 // convert source block to bit-block
327 if (BM_IS_GAP(blk_src))
328 {
329 arr_len = bm::gap_convert_to_arr(bit_list_,
330 BMGAP_PTR(blk_src),
331 bm::gap_max_bits);
332 }
333 else // bit-block
334 {
335 arr_len = bm::bit_block_convert_to_arr(bit_list_, blk_src, 0);
336 }
337 BM_ASSERT(arr_len);
338 get_random_array(blk_out, bit_list_, arr_len, take_count);
339 }
340 else // dense block
341 {
342 // convert source block to bit-block
343 if (BM_IS_GAP(blk_src))
344 {
345 bm::gap_convert_to_bitset(tmp_block, BMGAP_PTR(blk_src));
346 blk_src = tmp_block;
347 }
348 // pick random bits source block to target
349 get_block_subset(blk_out, blk_src, take_count);
350 }
351 }
352 } // for
353 }
354
355 template<class BV>
compute_take_count(unsigned bc,size_type in_count,size_type sample_count)356 unsigned random_subset<BV>::compute_take_count(
357 unsigned bc,
358 size_type in_count,
359 size_type sample_count) BMNOEXCEPT
360 {
361 float block_percent = float(bc) / float(in_count);
362 float bits_to_take = float(sample_count) * block_percent;
363 bits_to_take += 0.99f;
364 unsigned to_take = unsigned(bits_to_take);
365 if (to_take > bc)
366 to_take = bc;
367 return to_take;
368 }
369
370
371 template<class BV>
get_block_subset(bm::word_t * blk_out,const bm::word_t * blk_src,unsigned take_count)372 void random_subset<BV>::get_block_subset(bm::word_t* blk_out,
373 const bm::word_t* blk_src,
374 unsigned take_count)
375 {
376 for (unsigned rounds = 0; take_count && rounds < 10; ++rounds)
377 {
378 // pick random scan start and scan direction
379 //
380 unsigned i = unsigned(rand()) % bm::set_block_size;
381 unsigned new_count;
382
383 for (; i < bm::set_block_size && take_count; ++i)
384 {
385 if (blk_src[i] && (blk_out[i] != blk_src[i]))
386 {
387 new_count = process_word(blk_out, blk_src, i, take_count);
388 take_count -= new_count;
389 }
390 } // for i
391
392 } // for
393 // if masked scan did not produce enough results..
394 //
395 if (take_count)
396 {
397 // Find all vacant bits: do logical (src SUB out)
398 for (unsigned i = 0; i < bm::set_block_size; ++i)
399 {
400 sub_block_[i] = blk_src[i] & ~blk_out[i];
401 }
402 // now transform vacant bits to array, then pick random elements
403 //
404 unsigned arr_len =
405 bm::bit_block_convert_to_arr(bit_list_, sub_block_, 0);
406 // bm::gap_max_bits,
407 // bm::gap_max_bits,
408 // 0);
409 BM_ASSERT(arr_len);
410 get_random_array(blk_out, bit_list_, arr_len, take_count);
411 }
412 }
413
414 template<class BV>
process_word(bm::word_t * blk_out,const bm::word_t * blk_src,unsigned nword,unsigned take_count)415 unsigned random_subset<BV>::process_word(bm::word_t* blk_out,
416 const bm::word_t* blk_src,
417 unsigned nword,
418 unsigned take_count) BMNOEXCEPT
419 {
420 unsigned new_bits, mask;
421 do
422 {
423 mask = unsigned(rand());
424 mask ^= mask << 16;
425 } while (mask == 0);
426
427 std::random_device rd;
428 #ifdef BM64ADDR
429 std::mt19937_64 mt_rand(rd());
430 #else
431 std::mt19937 mt_rand(rd());
432 #endif
433 unsigned src_rand = blk_src[nword] & mask;
434 new_bits = src_rand & ~blk_out[nword];
435 if (new_bits)
436 {
437 unsigned new_count = bm::word_bitcount(new_bits);
438
439 // check if we accidentally picked more bits than needed
440 if (new_count > take_count)
441 {
442 BM_ASSERT(take_count);
443
444 unsigned char blist[64];
445 unsigned arr_size = bm::bitscan(new_bits, blist);
446 BM_ASSERT(arr_size == new_count);
447 std::shuffle(blist, blist + arr_size, mt_rand);
448 unsigned value = 0;
449 for (unsigned j = 0; j < take_count; ++j)
450 {
451 value |= (1u << blist[j]);
452 }
453 new_bits = value;
454 new_count = take_count;
455
456 BM_ASSERT(bm::word_bitcount(new_bits) == take_count);
457 BM_ASSERT((new_bits & ~blk_src[nword]) == 0);
458 }
459
460 blk_out[nword] |= new_bits;
461 return new_count;
462 }
463 return 0;
464 }
465
466
467 template<class BV>
get_random_array(bm::word_t * blk_out,bm::gap_word_t * bit_list,unsigned bit_list_size,unsigned count)468 void random_subset<BV>::get_random_array(bm::word_t* blk_out,
469 bm::gap_word_t* bit_list,
470 unsigned bit_list_size,
471 unsigned count)
472 {
473 std::random_device rd;
474 #ifdef BM64ADDR
475 std::mt19937_64 mt_rand(rd());
476 #else
477 std::mt19937 mt_rand(rd());
478 #endif
479 std::shuffle(bit_list, bit_list + bit_list_size, mt_rand);
480 for (unsigned i = 0; i < count; ++i)
481 {
482 bm::set_bit(blk_out, bit_list[i]);
483 }
484 }
485
486 } // namespace
487
488
489 #endif
490