1 /***********************************************************************
2  * Software License Agreement (BSD License)
3  *
4  * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5  * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
6  *
7  * THE BSD LICENSE
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *************************************************************************/
30 
31 /***********************************************************************
32  * Author: Vincent Rabaud
33  *************************************************************************/
34 
35 #ifndef FLANN_LSH_TABLE_H_
36 #define FLANN_LSH_TABLE_H_
37 
38 #include <algorithm>
39 #include <iostream>
40 #include <iomanip>
41 #include <limits.h>
42 #include <random>
43 // TODO as soon as we use C++0x, use the code in USE_UNORDERED_MAP
44 #if USE_UNORDERED_MAP
45 #include <unordered_map>
46 #else
47 #include <map>
48 #endif
49 #include <math.h>
50 #include <stddef.h>
51 
52 #include "flann/util/dynamic_bitset.h"
53 #include "flann/util/matrix.h"
54 
55 namespace flann
56 {
57 
58 namespace lsh
59 {
60 
61 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
62 
63 /** What is stored in an LSH bucket
64  */
65 typedef uint32_t FeatureIndex;
66 /** The id from which we can get a bucket back in an LSH table
67  */
68 typedef unsigned int BucketKey;
69 
70 /** A bucket in an LSH table
71  */
72 typedef std::vector<FeatureIndex> Bucket;
73 
74 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
75 
76 /** POD for stats about an LSH table
77  */
78 struct LshStats
79 {
80     std::vector<unsigned int> bucket_sizes_;
81     size_t n_buckets_;
82     size_t bucket_size_mean_;
83     size_t bucket_size_median_;
84     size_t bucket_size_min_;
85     size_t bucket_size_max_;
86     size_t bucket_size_std_dev;
87     /** Each contained vector contains three value: beginning/end for interval, number of elements in the bin
88      */
89     std::vector<std::vector<unsigned int> > size_histogram_;
90 };
91 
92 /** Overload the << operator for LshStats
93  * @param out the streams
94  * @param stats the stats to display
95  * @return the streams
96  */
97 inline std::ostream& operator <<(std::ostream& out, const LshStats& stats)
98 {
99     size_t w = 20;
100     out << "Lsh Table Stats:\n" << std::setw(w) << std::setiosflags(std::ios::right) << "N buckets : "
101     << stats.n_buckets_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "mean size : "
102     << std::setiosflags(std::ios::left) << stats.bucket_size_mean_ << "\n" << std::setw(w)
103     << std::setiosflags(std::ios::right) << "median size : " << stats.bucket_size_median_ << "\n" << std::setw(w)
104     << std::setiosflags(std::ios::right) << "min size : " << std::setiosflags(std::ios::left)
105     << stats.bucket_size_min_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "max size : "
106     << std::setiosflags(std::ios::left) << stats.bucket_size_max_;
107 
108     // Display the histogram
109     out << std::endl << std::setw(w) << std::setiosflags(std::ios::right) << "histogram : "
110     << std::setiosflags(std::ios::left);
111     for (std::vector<std::vector<unsigned int> >::const_iterator iterator = stats.size_histogram_.begin(), end =
112              stats.size_histogram_.end(); iterator != end; ++iterator) out << (*iterator)[0] << "-" << (*iterator)[1] << ": " << (*iterator)[2] << ",  ";
113 
114     return out;
115 }
116 
117 
118 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
119 
120 /** Lsh hash table. As its key is a sub-feature, and as usually
121  * the size of it is pretty small, we keep it as a continuous memory array.
122  * The value is an index in the corpus of features (we keep it as an unsigned
123  * int for pure memory reasons, it could be a size_t)
124  */
125 template<typename ElementType>
126 class LshTable
127 {
128 public:
129     /** A container of all the feature indices. Optimized for space
130      */
131 #if USE_UNORDERED_MAP
132     typedef std::unordered_map<BucketKey, Bucket> BucketsSpace;
133 #else
134     typedef std::map<BucketKey, Bucket> BucketsSpace;
135 #endif
136 
137     /** A container of all the feature indices. Optimized for speed
138      */
139     typedef std::vector<Bucket> BucketsSpeed;
140 
141     /** Default constructor
142      */
LshTable()143     LshTable()
144     {
145     }
146 
147     /** Default constructor
148      * Create the mask and allocate the memory
149      * @param feature_size is the size of the feature (considered as a ElementType[])
150      * @param key_size is the number of bits that are turned on in the feature
151      */
LshTable(unsigned int,unsigned int)152     LshTable(unsigned int /*feature_size*/, unsigned int /*key_size*/)
153     {
154         std::cerr << "LSH is not implemented for that type" << std::endl;
155         throw;
156     }
157 
158     /** Add a feature to the table
159      * @param value the value to store for that feature
160      * @param feature the feature itself
161      */
add(unsigned int value,const ElementType * feature)162     void add(unsigned int value, const ElementType* feature)
163     {
164         // Add the value to the corresponding bucket
165         BucketKey key = getKey(feature);
166 
167         switch (speed_level_) {
168         case kArray:
169             // That means we get the buckets from an array
170             buckets_speed_[key].push_back(value);
171             break;
172         case kBitsetHash:
173             // That means we can check the bitset for the presence of a key
174             key_bitset_.set(key);
175             buckets_space_[key].push_back(value);
176             break;
177         case kHash:
178         {
179             // That means we have to check for the hash table for the presence of a key
180             buckets_space_[key].push_back(value);
181             break;
182         }
183         }
184     }
185 
186     /** Add a set of features to the table
187      * @param dataset the values to store
188      */
add(const std::vector<std::pair<size_t,ElementType * >> & features)189     void add(const std::vector< std::pair<size_t, ElementType*> >& features)
190     {
191 #if USE_UNORDERED_MAP
192         buckets_space_.rehash((buckets_space_.size() + features.size()) * 1.2);
193 #endif
194         // Add the features to the table
195         for (size_t i = 0; i < features.size(); ++i) {
196         	add(features[i].first, features[i].second);
197         }
198         // Now that the table is full, optimize it for speed/space
199         optimize();
200     }
201 
202     /** Get a bucket given the key
203      * @param key
204      * @return
205      */
getBucketFromKey(BucketKey key)206     inline const Bucket* getBucketFromKey(BucketKey key) const
207     {
208         // Generate other buckets
209         switch (speed_level_) {
210         case kArray:
211             // That means we get the buckets from an array
212             return &buckets_speed_[key];
213             break;
214         case kBitsetHash:
215             // That means we can check the bitset for the presence of a key
216             if (key_bitset_.test(key)) return &buckets_space_.find(key)->second;
217             else return 0;
218             break;
219         case kHash:
220         {
221             // That means we have to check for the hash table for the presence of a key
222             BucketsSpace::const_iterator bucket_it, bucket_end = buckets_space_.end();
223             bucket_it = buckets_space_.find(key);
224             // Stop here if that bucket does not exist
225             if (bucket_it == bucket_end) return 0;
226             else return &bucket_it->second;
227             break;
228         }
229         }
230         return 0;
231     }
232 
233     /** Compute the sub-signature of a feature
234      */
getKey(const ElementType *)235     size_t getKey(const ElementType* /*feature*/) const
236     {
237         std::cerr << "LSH is not implemented for that type" << std::endl;
238         return -1;
239     }
240 
241     /** Get statistics about the table
242      * @return
243      */
244     LshStats getStats() const;
245 
246 private:
247     /** defines the speed fo the implementation
248      * kArray uses a vector for storing data
249      * kBitsetHash uses a hash map but checks for the validity of a key with a bitset
250      * kHash uses a hash map only
251      */
252     enum SpeedLevel
253     {
254         kArray, kBitsetHash, kHash
255     };
256 
257     /** Initialize some variables
258      */
initialize(size_t key_size)259     void initialize(size_t key_size)
260     {
261         speed_level_ = kHash;
262         key_size_ = key_size;
263     }
264 
265     /** Optimize the table for speed/space
266      */
optimize()267     void optimize()
268     {
269         // If we are already using the fast storage, no need to do anything
270         if (speed_level_ == kArray) return;
271 
272         // Use an array if it will be more than half full
273         if (buckets_space_.size() > ((size_t(1) << key_size_) / 2)) {
274             speed_level_ = kArray;
275             // Fill the array version of it
276             buckets_speed_.resize(size_t(1) << key_size_);
277             for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) buckets_speed_[key_bucket->first] = key_bucket->second;
278 
279             // Empty the hash table
280             buckets_space_.clear();
281             return;
282         }
283 
284         // If the bitset is going to use less than 10% of the RAM of the hash map (at least 1 size_t for the key and two
285         // for the vector) or less than 512MB (key_size_ <= 30)
286         if (((std::max(buckets_space_.size(), buckets_speed_.size()) * CHAR_BIT * 3 * sizeof(BucketKey)) / 10
287              >= size_t(size_t(1) << key_size_)) || (key_size_ <= 32)) {
288             speed_level_ = kBitsetHash;
289             key_bitset_.resize(size_t(1) << key_size_);
290             key_bitset_.reset();
291             // Try with the BucketsSpace
292             for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) key_bitset_.set(key_bucket->first);
293         }
294         else {
295             speed_level_ = kHash;
296             key_bitset_.clear();
297         }
298     }
299 
300     template<typename Archive>
serialize(Archive & ar)301     void serialize(Archive& ar)
302     {
303     	int val;
304     	if (Archive::is_saving::value) {
305     		val = (int)speed_level_;
306     	}
307     	ar & val;
308     	if (Archive::is_loading::value) {
309     		speed_level_ = (SpeedLevel) val;
310     	}
311 
312     	ar & key_size_;
313     	ar & mask_;
314 
315     	if (speed_level_==kArray) {
316     		ar & buckets_speed_;
317     	}
318     	if (speed_level_==kBitsetHash || speed_level_==kHash) {
319     		ar & buckets_space_;
320     	}
321 		if (speed_level_==kBitsetHash) {
322 			ar & key_bitset_;
323 		}
324     }
325     friend struct serialization::access;
326 
327     /** The vector of all the buckets if they are held for speed
328      */
329     BucketsSpeed buckets_speed_;
330 
331     /** The hash table of all the buckets in case we cannot use the speed version
332      */
333     BucketsSpace buckets_space_;
334 
335     /** What is used to store the data */
336     SpeedLevel speed_level_;
337 
338     /** If the subkey is small enough, it will keep track of which subkeys are set through that bitset
339      * That is just a speedup so that we don't look in the hash table (which can be mush slower that checking a bitset)
340      */
341     DynamicBitset key_bitset_;
342 
343     /** The size of the sub-signature in bits
344      */
345     unsigned int key_size_;
346 
347     // Members only used for the unsigned char specialization
348     /** The mask to apply to a feature to get the hash key
349      * Only used in the unsigned char case
350      */
351     std::vector<size_t> mask_;
352 };
353 
354 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
355 // Specialization for unsigned char
356 
357 template<>
LshTable(unsigned int feature_size,unsigned int subsignature_size)358 inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size)
359 {
360     initialize(subsignature_size);
361     // Allocate the mask
362     mask_ = std::vector<size_t>((size_t)ceil((float)(feature_size * sizeof(char)) / (float)sizeof(size_t)), 0);
363 
364     // A bit brutal but fast to code
365     std::vector<size_t> indices(feature_size * CHAR_BIT);
366     for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) indices[i] = i;
367     std::random_device rd;
368     std::mt19937 g(rd());
369     std::shuffle(indices.begin(), indices.end(),g);
370 
371     // Generate a random set of order of subsignature_size_ bits
372     for (unsigned int i = 0; i < key_size_; ++i) {
373         size_t index = indices[i];
374 
375         // Set that bit in the mask
376         size_t divisor = CHAR_BIT * sizeof(size_t);
377         size_t idx = index / divisor; //pick the right size_t index
378         mask_[idx] |= size_t(1) << (index % divisor); //use modulo to find the bit offset
379     }
380 
381     // Set to 1 if you want to display the mask for debug
382 #if 0
383     {
384         size_t bcount = 0;
385         BOOST_FOREACH(size_t mask_block, mask_){
386             out << std::setw(sizeof(size_t) * CHAR_BIT / 4) << std::setfill('0') << std::hex << mask_block
387                 << std::endl;
388             bcount += __builtin_popcountll(mask_block);
389         }
390         out << "bit count : " << std::dec << bcount << std::endl;
391         out << "mask size : " << mask_.size() << std::endl;
392         return out;
393     }
394 #endif
395 }
396 
397 /** Return the Subsignature of a feature
398  * @param feature the feature to analyze
399  */
400 template<>
getKey(const unsigned char * feature)401 inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) const
402 {
403     // no need to check if T is dividable by sizeof(size_t) like in the Hamming
404     // distance computation as we have a mask
405     const size_t* feature_block_ptr = reinterpret_cast<const size_t*> (feature);
406 
407     // Figure out the subsignature of the feature
408     // Given the feature ABCDEF, and the mask 001011, the output will be
409     // 000CEF
410     size_t subsignature = 0;
411     size_t bit_index = 1;
412 
413     for (std::vector<size_t>::const_iterator pmask_block = mask_.begin(); pmask_block != mask_.end(); ++pmask_block) {
414         // get the mask and signature blocks
415         size_t feature_block = *feature_block_ptr;
416         size_t mask_block = *pmask_block;
417         while (mask_block) {
418             // Get the lowest set bit in the mask block
419             size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block);
420             // Add it to the current subsignature if necessary
421             subsignature += (feature_block & lowest_bit) ? bit_index : 0;
422             // Reset the bit in the mask block
423             mask_block ^= lowest_bit;
424             // increment the bit index for the subsignature
425             bit_index <<= 1;
426         }
427         // Check the next feature block
428         ++feature_block_ptr;
429     }
430     return subsignature;
431 }
432 
433 template<>
getStats()434 inline LshStats LshTable<unsigned char>::getStats() const
435 {
436     LshStats stats;
437     stats.bucket_size_mean_ = 0;
438     if ((buckets_speed_.empty()) && (buckets_space_.empty())) {
439         stats.n_buckets_ = 0;
440         stats.bucket_size_median_ = 0;
441         stats.bucket_size_min_ = 0;
442         stats.bucket_size_max_ = 0;
443         return stats;
444     }
445 
446     if (!buckets_speed_.empty()) {
447         for (BucketsSpeed::const_iterator pbucket = buckets_speed_.begin(); pbucket != buckets_speed_.end(); ++pbucket) {
448             stats.bucket_sizes_.push_back(pbucket->size());
449             stats.bucket_size_mean_ += pbucket->size();
450         }
451         stats.bucket_size_mean_ /= buckets_speed_.size();
452         stats.n_buckets_ = buckets_speed_.size();
453     }
454     else {
455         for (BucketsSpace::const_iterator x = buckets_space_.begin(); x != buckets_space_.end(); ++x) {
456             stats.bucket_sizes_.push_back(x->second.size());
457             stats.bucket_size_mean_ += x->second.size();
458         }
459         stats.bucket_size_mean_ /= buckets_space_.size();
460         stats.n_buckets_ = buckets_space_.size();
461     }
462 
463     std::sort(stats.bucket_sizes_.begin(), stats.bucket_sizes_.end());
464 
465     //  BOOST_FOREACH(int size, stats.bucket_sizes_)
466     //          std::cout << size << " ";
467     //  std::cout << std::endl;
468     stats.bucket_size_median_ = stats.bucket_sizes_[stats.bucket_sizes_.size() / 2];
469     stats.bucket_size_min_ = stats.bucket_sizes_.front();
470     stats.bucket_size_max_ = stats.bucket_sizes_.back();
471 
472     // TODO compute mean and std
473     /*float mean, stddev;
474        stats.bucket_size_mean_ = mean;
475        stats.bucket_size_std_dev = stddev;*/
476 
477     // Include a histogram of the buckets
478     unsigned int bin_start = 0;
479     unsigned int bin_end = 20;
480     bool is_new_bin = true;
481     for (std::vector<unsigned int>::iterator iterator = stats.bucket_sizes_.begin(), end = stats.bucket_sizes_.end(); iterator
482          != end; )
483         if (*iterator < bin_end) {
484             if (is_new_bin) {
485                 stats.size_histogram_.push_back(std::vector<unsigned int>(3, 0));
486                 stats.size_histogram_.back()[0] = bin_start;
487                 stats.size_histogram_.back()[1] = bin_end - 1;
488                 is_new_bin = false;
489             }
490             ++stats.size_histogram_.back()[2];
491             ++iterator;
492         }
493         else {
494             bin_start += 20;
495             bin_end += 20;
496             is_new_bin = true;
497         }
498 
499     return stats;
500 }
501 
502 // End the two namespaces
503 }
504 }
505 
506 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
507 
508 #endif /* FLANN_LSH_TABLE_H_ */
509