1 /***********************************************************************
2 * Software License Agreement (BSD License)
3 *
4 * Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5 * Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
6 *
7 * THE BSD LICENSE
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *************************************************************************/
30
31 /***********************************************************************
32 * Author: Vincent Rabaud
33 *************************************************************************/
34
35 #ifndef FLANN_LSH_TABLE_H_
36 #define FLANN_LSH_TABLE_H_
37
38 #include <algorithm>
39 #include <iostream>
40 #include <iomanip>
41 #include <limits.h>
42 #include <random>
43 // TODO as soon as we use C++0x, use the code in USE_UNORDERED_MAP
44 #if USE_UNORDERED_MAP
45 #include <unordered_map>
46 #else
47 #include <map>
48 #endif
49 #include <math.h>
50 #include <stddef.h>
51
52 #include "flann/util/dynamic_bitset.h"
53 #include "flann/util/matrix.h"
54
55 namespace flann
56 {
57
58 namespace lsh
59 {
60
61 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
62
63 /** What is stored in an LSH bucket
64 */
65 typedef uint32_t FeatureIndex;
66 /** The id from which we can get a bucket back in an LSH table
67 */
68 typedef unsigned int BucketKey;
69
70 /** A bucket in an LSH table
71 */
72 typedef std::vector<FeatureIndex> Bucket;
73
74 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
75
76 /** POD for stats about an LSH table
77 */
78 struct LshStats
79 {
80 std::vector<unsigned int> bucket_sizes_;
81 size_t n_buckets_;
82 size_t bucket_size_mean_;
83 size_t bucket_size_median_;
84 size_t bucket_size_min_;
85 size_t bucket_size_max_;
86 size_t bucket_size_std_dev;
87 /** Each contained vector contains three value: beginning/end for interval, number of elements in the bin
88 */
89 std::vector<std::vector<unsigned int> > size_histogram_;
90 };
91
92 /** Overload the << operator for LshStats
93 * @param out the streams
94 * @param stats the stats to display
95 * @return the streams
96 */
97 inline std::ostream& operator <<(std::ostream& out, const LshStats& stats)
98 {
99 size_t w = 20;
100 out << "Lsh Table Stats:\n" << std::setw(w) << std::setiosflags(std::ios::right) << "N buckets : "
101 << stats.n_buckets_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "mean size : "
102 << std::setiosflags(std::ios::left) << stats.bucket_size_mean_ << "\n" << std::setw(w)
103 << std::setiosflags(std::ios::right) << "median size : " << stats.bucket_size_median_ << "\n" << std::setw(w)
104 << std::setiosflags(std::ios::right) << "min size : " << std::setiosflags(std::ios::left)
105 << stats.bucket_size_min_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "max size : "
106 << std::setiosflags(std::ios::left) << stats.bucket_size_max_;
107
108 // Display the histogram
109 out << std::endl << std::setw(w) << std::setiosflags(std::ios::right) << "histogram : "
110 << std::setiosflags(std::ios::left);
111 for (std::vector<std::vector<unsigned int> >::const_iterator iterator = stats.size_histogram_.begin(), end =
112 stats.size_histogram_.end(); iterator != end; ++iterator) out << (*iterator)[0] << "-" << (*iterator)[1] << ": " << (*iterator)[2] << ", ";
113
114 return out;
115 }
116
117
118 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
119
120 /** Lsh hash table. As its key is a sub-feature, and as usually
121 * the size of it is pretty small, we keep it as a continuous memory array.
122 * The value is an index in the corpus of features (we keep it as an unsigned
123 * int for pure memory reasons, it could be a size_t)
124 */
125 template<typename ElementType>
126 class LshTable
127 {
128 public:
129 /** A container of all the feature indices. Optimized for space
130 */
131 #if USE_UNORDERED_MAP
132 typedef std::unordered_map<BucketKey, Bucket> BucketsSpace;
133 #else
134 typedef std::map<BucketKey, Bucket> BucketsSpace;
135 #endif
136
137 /** A container of all the feature indices. Optimized for speed
138 */
139 typedef std::vector<Bucket> BucketsSpeed;
140
141 /** Default constructor
142 */
LshTable()143 LshTable()
144 {
145 }
146
147 /** Default constructor
148 * Create the mask and allocate the memory
149 * @param feature_size is the size of the feature (considered as a ElementType[])
150 * @param key_size is the number of bits that are turned on in the feature
151 */
LshTable(unsigned int,unsigned int)152 LshTable(unsigned int /*feature_size*/, unsigned int /*key_size*/)
153 {
154 std::cerr << "LSH is not implemented for that type" << std::endl;
155 throw;
156 }
157
158 /** Add a feature to the table
159 * @param value the value to store for that feature
160 * @param feature the feature itself
161 */
add(unsigned int value,const ElementType * feature)162 void add(unsigned int value, const ElementType* feature)
163 {
164 // Add the value to the corresponding bucket
165 BucketKey key = getKey(feature);
166
167 switch (speed_level_) {
168 case kArray:
169 // That means we get the buckets from an array
170 buckets_speed_[key].push_back(value);
171 break;
172 case kBitsetHash:
173 // That means we can check the bitset for the presence of a key
174 key_bitset_.set(key);
175 buckets_space_[key].push_back(value);
176 break;
177 case kHash:
178 {
179 // That means we have to check for the hash table for the presence of a key
180 buckets_space_[key].push_back(value);
181 break;
182 }
183 }
184 }
185
186 /** Add a set of features to the table
187 * @param dataset the values to store
188 */
add(const std::vector<std::pair<size_t,ElementType * >> & features)189 void add(const std::vector< std::pair<size_t, ElementType*> >& features)
190 {
191 #if USE_UNORDERED_MAP
192 buckets_space_.rehash((buckets_space_.size() + features.size()) * 1.2);
193 #endif
194 // Add the features to the table
195 for (size_t i = 0; i < features.size(); ++i) {
196 add(features[i].first, features[i].second);
197 }
198 // Now that the table is full, optimize it for speed/space
199 optimize();
200 }
201
202 /** Get a bucket given the key
203 * @param key
204 * @return
205 */
getBucketFromKey(BucketKey key)206 inline const Bucket* getBucketFromKey(BucketKey key) const
207 {
208 // Generate other buckets
209 switch (speed_level_) {
210 case kArray:
211 // That means we get the buckets from an array
212 return &buckets_speed_[key];
213 break;
214 case kBitsetHash:
215 // That means we can check the bitset for the presence of a key
216 if (key_bitset_.test(key)) return &buckets_space_.find(key)->second;
217 else return 0;
218 break;
219 case kHash:
220 {
221 // That means we have to check for the hash table for the presence of a key
222 BucketsSpace::const_iterator bucket_it, bucket_end = buckets_space_.end();
223 bucket_it = buckets_space_.find(key);
224 // Stop here if that bucket does not exist
225 if (bucket_it == bucket_end) return 0;
226 else return &bucket_it->second;
227 break;
228 }
229 }
230 return 0;
231 }
232
233 /** Compute the sub-signature of a feature
234 */
getKey(const ElementType *)235 size_t getKey(const ElementType* /*feature*/) const
236 {
237 std::cerr << "LSH is not implemented for that type" << std::endl;
238 return -1;
239 }
240
241 /** Get statistics about the table
242 * @return
243 */
244 LshStats getStats() const;
245
246 private:
247 /** defines the speed fo the implementation
248 * kArray uses a vector for storing data
249 * kBitsetHash uses a hash map but checks for the validity of a key with a bitset
250 * kHash uses a hash map only
251 */
252 enum SpeedLevel
253 {
254 kArray, kBitsetHash, kHash
255 };
256
257 /** Initialize some variables
258 */
initialize(size_t key_size)259 void initialize(size_t key_size)
260 {
261 speed_level_ = kHash;
262 key_size_ = key_size;
263 }
264
265 /** Optimize the table for speed/space
266 */
optimize()267 void optimize()
268 {
269 // If we are already using the fast storage, no need to do anything
270 if (speed_level_ == kArray) return;
271
272 // Use an array if it will be more than half full
273 if (buckets_space_.size() > ((size_t(1) << key_size_) / 2)) {
274 speed_level_ = kArray;
275 // Fill the array version of it
276 buckets_speed_.resize(size_t(1) << key_size_);
277 for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) buckets_speed_[key_bucket->first] = key_bucket->second;
278
279 // Empty the hash table
280 buckets_space_.clear();
281 return;
282 }
283
284 // If the bitset is going to use less than 10% of the RAM of the hash map (at least 1 size_t for the key and two
285 // for the vector) or less than 512MB (key_size_ <= 30)
286 if (((std::max(buckets_space_.size(), buckets_speed_.size()) * CHAR_BIT * 3 * sizeof(BucketKey)) / 10
287 >= size_t(size_t(1) << key_size_)) || (key_size_ <= 32)) {
288 speed_level_ = kBitsetHash;
289 key_bitset_.resize(size_t(1) << key_size_);
290 key_bitset_.reset();
291 // Try with the BucketsSpace
292 for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) key_bitset_.set(key_bucket->first);
293 }
294 else {
295 speed_level_ = kHash;
296 key_bitset_.clear();
297 }
298 }
299
300 template<typename Archive>
serialize(Archive & ar)301 void serialize(Archive& ar)
302 {
303 int val;
304 if (Archive::is_saving::value) {
305 val = (int)speed_level_;
306 }
307 ar & val;
308 if (Archive::is_loading::value) {
309 speed_level_ = (SpeedLevel) val;
310 }
311
312 ar & key_size_;
313 ar & mask_;
314
315 if (speed_level_==kArray) {
316 ar & buckets_speed_;
317 }
318 if (speed_level_==kBitsetHash || speed_level_==kHash) {
319 ar & buckets_space_;
320 }
321 if (speed_level_==kBitsetHash) {
322 ar & key_bitset_;
323 }
324 }
325 friend struct serialization::access;
326
327 /** The vector of all the buckets if they are held for speed
328 */
329 BucketsSpeed buckets_speed_;
330
331 /** The hash table of all the buckets in case we cannot use the speed version
332 */
333 BucketsSpace buckets_space_;
334
335 /** What is used to store the data */
336 SpeedLevel speed_level_;
337
338 /** If the subkey is small enough, it will keep track of which subkeys are set through that bitset
339 * That is just a speedup so that we don't look in the hash table (which can be mush slower that checking a bitset)
340 */
341 DynamicBitset key_bitset_;
342
343 /** The size of the sub-signature in bits
344 */
345 unsigned int key_size_;
346
347 // Members only used for the unsigned char specialization
348 /** The mask to apply to a feature to get the hash key
349 * Only used in the unsigned char case
350 */
351 std::vector<size_t> mask_;
352 };
353
354 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
355 // Specialization for unsigned char
356
357 template<>
LshTable(unsigned int feature_size,unsigned int subsignature_size)358 inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size)
359 {
360 initialize(subsignature_size);
361 // Allocate the mask
362 mask_ = std::vector<size_t>((size_t)ceil((float)(feature_size * sizeof(char)) / (float)sizeof(size_t)), 0);
363
364 // A bit brutal but fast to code
365 std::vector<size_t> indices(feature_size * CHAR_BIT);
366 for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) indices[i] = i;
367 std::random_device rd;
368 std::mt19937 g(rd());
369 std::shuffle(indices.begin(), indices.end(),g);
370
371 // Generate a random set of order of subsignature_size_ bits
372 for (unsigned int i = 0; i < key_size_; ++i) {
373 size_t index = indices[i];
374
375 // Set that bit in the mask
376 size_t divisor = CHAR_BIT * sizeof(size_t);
377 size_t idx = index / divisor; //pick the right size_t index
378 mask_[idx] |= size_t(1) << (index % divisor); //use modulo to find the bit offset
379 }
380
381 // Set to 1 if you want to display the mask for debug
382 #if 0
383 {
384 size_t bcount = 0;
385 BOOST_FOREACH(size_t mask_block, mask_){
386 out << std::setw(sizeof(size_t) * CHAR_BIT / 4) << std::setfill('0') << std::hex << mask_block
387 << std::endl;
388 bcount += __builtin_popcountll(mask_block);
389 }
390 out << "bit count : " << std::dec << bcount << std::endl;
391 out << "mask size : " << mask_.size() << std::endl;
392 return out;
393 }
394 #endif
395 }
396
397 /** Return the Subsignature of a feature
398 * @param feature the feature to analyze
399 */
400 template<>
getKey(const unsigned char * feature)401 inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) const
402 {
403 // no need to check if T is dividable by sizeof(size_t) like in the Hamming
404 // distance computation as we have a mask
405 const size_t* feature_block_ptr = reinterpret_cast<const size_t*> (feature);
406
407 // Figure out the subsignature of the feature
408 // Given the feature ABCDEF, and the mask 001011, the output will be
409 // 000CEF
410 size_t subsignature = 0;
411 size_t bit_index = 1;
412
413 for (std::vector<size_t>::const_iterator pmask_block = mask_.begin(); pmask_block != mask_.end(); ++pmask_block) {
414 // get the mask and signature blocks
415 size_t feature_block = *feature_block_ptr;
416 size_t mask_block = *pmask_block;
417 while (mask_block) {
418 // Get the lowest set bit in the mask block
419 size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block);
420 // Add it to the current subsignature if necessary
421 subsignature += (feature_block & lowest_bit) ? bit_index : 0;
422 // Reset the bit in the mask block
423 mask_block ^= lowest_bit;
424 // increment the bit index for the subsignature
425 bit_index <<= 1;
426 }
427 // Check the next feature block
428 ++feature_block_ptr;
429 }
430 return subsignature;
431 }
432
433 template<>
getStats()434 inline LshStats LshTable<unsigned char>::getStats() const
435 {
436 LshStats stats;
437 stats.bucket_size_mean_ = 0;
438 if ((buckets_speed_.empty()) && (buckets_space_.empty())) {
439 stats.n_buckets_ = 0;
440 stats.bucket_size_median_ = 0;
441 stats.bucket_size_min_ = 0;
442 stats.bucket_size_max_ = 0;
443 return stats;
444 }
445
446 if (!buckets_speed_.empty()) {
447 for (BucketsSpeed::const_iterator pbucket = buckets_speed_.begin(); pbucket != buckets_speed_.end(); ++pbucket) {
448 stats.bucket_sizes_.push_back(pbucket->size());
449 stats.bucket_size_mean_ += pbucket->size();
450 }
451 stats.bucket_size_mean_ /= buckets_speed_.size();
452 stats.n_buckets_ = buckets_speed_.size();
453 }
454 else {
455 for (BucketsSpace::const_iterator x = buckets_space_.begin(); x != buckets_space_.end(); ++x) {
456 stats.bucket_sizes_.push_back(x->second.size());
457 stats.bucket_size_mean_ += x->second.size();
458 }
459 stats.bucket_size_mean_ /= buckets_space_.size();
460 stats.n_buckets_ = buckets_space_.size();
461 }
462
463 std::sort(stats.bucket_sizes_.begin(), stats.bucket_sizes_.end());
464
465 // BOOST_FOREACH(int size, stats.bucket_sizes_)
466 // std::cout << size << " ";
467 // std::cout << std::endl;
468 stats.bucket_size_median_ = stats.bucket_sizes_[stats.bucket_sizes_.size() / 2];
469 stats.bucket_size_min_ = stats.bucket_sizes_.front();
470 stats.bucket_size_max_ = stats.bucket_sizes_.back();
471
472 // TODO compute mean and std
473 /*float mean, stddev;
474 stats.bucket_size_mean_ = mean;
475 stats.bucket_size_std_dev = stddev;*/
476
477 // Include a histogram of the buckets
478 unsigned int bin_start = 0;
479 unsigned int bin_end = 20;
480 bool is_new_bin = true;
481 for (std::vector<unsigned int>::iterator iterator = stats.bucket_sizes_.begin(), end = stats.bucket_sizes_.end(); iterator
482 != end; )
483 if (*iterator < bin_end) {
484 if (is_new_bin) {
485 stats.size_histogram_.push_back(std::vector<unsigned int>(3, 0));
486 stats.size_histogram_.back()[0] = bin_start;
487 stats.size_histogram_.back()[1] = bin_end - 1;
488 is_new_bin = false;
489 }
490 ++stats.size_histogram_.back()[2];
491 ++iterator;
492 }
493 else {
494 bin_start += 20;
495 bin_end += 20;
496 is_new_bin = true;
497 }
498
499 return stats;
500 }
501
502 // End the two namespaces
503 }
504 }
505
506 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
507
508 #endif /* FLANN_LSH_TABLE_H_ */
509