/*********************************************************************** * Software License Agreement (BSD License) * * Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved. * Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved. * * THE BSD LICENSE * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************************/ /*********************************************************************** * Author: Vincent Rabaud *************************************************************************/ #ifndef FLANN_LSH_INDEX_H_ #define FLANN_LSH_INDEX_H_ #include #include #include #include #include #include "FLANN/general.h" #include "FLANN/algorithms/nn_index.h" #include "FLANN/util/matrix.h" #include "FLANN/util/result_set.h" #include "FLANN/util/heap.h" #include "FLANN/util/lsh_table.h" #include "FLANN/util/allocator.h" #include "FLANN/util/random.h" #include "FLANN/util/saving.h" namespace flann { struct LshIndexParams : public IndexParams { LshIndexParams(unsigned int table_number = 12, unsigned int key_size = 20, unsigned int multi_probe_level = 2) { (* this)["algorithm"] = FLANN_INDEX_LSH; // The number of hash tables to use (*this)["table_number"] = table_number; // The length of the key in the hash tables (*this)["key_size"] = key_size; // Number of levels to use in multi-probe (0 for standard LSH) (*this)["multi_probe_level"] = multi_probe_level; } }; /** * Locality-sensitive hashing index * * Contains the tables and other information for indexing a set of points * for nearest-neighbor matching. */ template class LshIndex : public NNIndex { public: typedef typename Distance::ElementType ElementType; typedef typename Distance::ResultType DistanceType; typedef NNIndex BaseClass; /** Constructor * @param params parameters passed to the LSH algorithm * @param d the distance used */ LshIndex(const IndexParams& params = LshIndexParams(), Distance d = Distance()) : BaseClass(params, d) { table_number_ = get_param(index_params_,"table_number",12); key_size_ = get_param(index_params_,"key_size",20); multi_probe_level_ = get_param(index_params_,"multi_probe_level",2); fill_xor_mask(0, key_size_, multi_probe_level_, xor_masks_); } /** Constructor * @param input_data dataset with the input features * @param params parameters passed to the LSH algorithm * @param d the distance used */ LshIndex(const Matrix& input_data, const IndexParams& params = LshIndexParams(), Distance d = Distance()) : BaseClass(params, d) { table_number_ = get_param(index_params_,"table_number",12); key_size_ = get_param(index_params_,"key_size",20); multi_probe_level_ = get_param(index_params_,"multi_probe_level",2); fill_xor_mask(0, key_size_, multi_probe_level_, xor_masks_); setDataset(input_data); } LshIndex(const LshIndex& other) : BaseClass(other), tables_(other.tables_), table_number_(other.table_number_), key_size_(other.key_size_), multi_probe_level_(other.multi_probe_level_), xor_masks_(other.xor_masks_) { } LshIndex& operator=(LshIndex other) { this->swap(other); return *this; } virtual ~LshIndex() { freeIndex(); } BaseClass* clone() const { return new LshIndex(*this); } using BaseClass::buildIndex; void addPoints(const Matrix& points, float rebuild_threshold = 2) { assert(points.cols==veclen_); size_t old_size = size_; extendDataset(points); if (rebuild_threshold>1 && size_at_build_*rebuild_threshold& table = tables_[i]; for (size_t i=old_size;i void serialize(Archive& ar) { ar.setObject(this); ar & *static_cast*>(this); ar & table_number_; ar & key_size_; ar & multi_probe_level_; ar & xor_masks_; ar & tables_; if (Archive::is_loading::value) { index_params_["algorithm"] = getType(); index_params_["table_number"] = table_number_; index_params_["key_size"] = key_size_; index_params_["multi_probe_level"] = multi_probe_level_; } } void saveIndex(FILE* stream) { serialization::SaveArchive sa(stream); sa & *this; } void loadIndex(FILE* stream) { serialization::LoadArchive la(stream); la & *this; } /** * Computes the index memory usage * Returns: memory used by the index */ int usedMemory() const { return size_ * sizeof(int); } /** * \brief Perform k-nearest neighbor search * \param[in] queries The query points for which to find the nearest neighbors * \param[out] indices The indices of the nearest neighbors found * \param[out] dists Distances to the nearest neighbors found * \param[in] knn Number of nearest neighbors to return * \param[in] params Search parameters */ int knnSearch(const Matrix& queries, Matrix& indices, Matrix& dists, size_t knn, const SearchParams& params) const { assert(queries.cols == veclen_); assert(indices.rows >= queries.rows); assert(dists.rows >= queries.rows); assert(indices.cols >= knn); assert(dists.cols >= knn); int count = 0; if (params.use_heap==FLANN_True) { #pragma omp parallel num_threads(params.cores) { KNNUniqueResultSet resultSet(knn); #pragma omp for schedule(dynamic) reduction(+:count) for (int i = 0; i < (int)queries.rows; i++) { resultSet.clear(); findNeighbors(resultSet, queries[i], params); size_t n = std::min(resultSet.size(), knn); resultSet.copy(indices[i], dists[i], n, params.sorted); indices_to_ids(indices[i], indices[i], n); count += n; } } } else { #pragma omp parallel num_threads(params.cores) { KNNResultSet resultSet(knn); #pragma omp for schedule(dynamic) reduction(+:count) for (int i = 0; i < (int)queries.rows; i++) { resultSet.clear(); findNeighbors(resultSet, queries[i], params); size_t n = std::min(resultSet.size(), knn); resultSet.copy(indices[i], dists[i], n, params.sorted); indices_to_ids(indices[i], indices[i], n); count += n; } } } return count; } /** * \brief Perform k-nearest neighbor search * \param[in] queries The query points for which to find the nearest neighbors * \param[out] indices The indices of the nearest neighbors found * \param[out] dists Distances to the nearest neighbors found * \param[in] knn Number of nearest neighbors to return * \param[in] params Search parameters */ int knnSearch(const Matrix& queries, std::vector< std::vector >& indices, std::vector >& dists, size_t knn, const SearchParams& params) const { assert(queries.cols == veclen_); if (indices.size() < queries.rows ) indices.resize(queries.rows); if (dists.size() < queries.rows ) dists.resize(queries.rows); int count = 0; if (params.use_heap==FLANN_True) { #pragma omp parallel num_threads(params.cores) { KNNUniqueResultSet resultSet(knn); #pragma omp for schedule(dynamic) reduction(+:count) for (int i = 0; i < (int)queries.rows; i++) { resultSet.clear(); findNeighbors(resultSet, queries[i], params); size_t n = std::min(resultSet.size(), knn); indices[i].resize(n); dists[i].resize(n); if (n > 0) { resultSet.copy(&indices[i][0], &dists[i][0], n, params.sorted); indices_to_ids(&indices[i][0], &indices[i][0], n); } count += n; } } } else { #pragma omp parallel num_threads(params.cores) { KNNResultSet resultSet(knn); #pragma omp for schedule(dynamic) reduction(+:count) for (int i = 0; i < (int)queries.rows; i++) { resultSet.clear(); findNeighbors(resultSet, queries[i], params); size_t n = std::min(resultSet.size(), knn); indices[i].resize(n); dists[i].resize(n); if (n > 0) { resultSet.copy(&indices[i][0], &dists[i][0], n, params.sorted); indices_to_ids(&indices[i][0], &indices[i][0], n); } count += n; } } } return count; } /** * Find set of nearest neighbors to vec. Their indices are stored inside * the result object. * * Params: * result = the result object in which the indices of the nearest-neighbors are stored * vec = the vector for which to search the nearest neighbors * maxCheck = the maximum number of restarts (in a best-bin-first manner) */ void findNeighbors(ResultSet& result, const ElementType* vec, const SearchParams& /*searchParams*/) const { getNeighbors(vec, result); } protected: /** * Builds the index */ void buildIndexImpl() { tables_.resize(table_number_); std::vector > features; features.reserve(points_.size()); for (size_t i=0;i& table = tables_[i]; table = lsh::LshTable(veclen_, key_size_); // Add the features to the table table.add(features); } } void freeIndex() { /* nothing to do here */ } private: /** Defines the comparator on score and index */ typedef std::pair ScoreIndexPair; struct SortScoreIndexPairOnSecond { bool operator()(const ScoreIndexPair& left, const ScoreIndexPair& right) const { return left.second < right.second; } }; /** Fills the different xor masks to use when getting the neighbors in multi-probe LSH * @param key the key we build neighbors from * @param lowest_index the lowest index of the bit set * @param level the multi-probe level we are at * @param xor_masks all the xor mask */ void fill_xor_mask(lsh::BucketKey key, int lowest_index, unsigned int level, std::vector& xor_masks) { xor_masks.push_back(key); if (level == 0) return; for (int index = lowest_index - 1; index >= 0; --index) { // Create a new key lsh::BucketKey new_key = key | (lsh::BucketKey(1) << index); fill_xor_mask(new_key, index, level - 1, xor_masks); } } /** Performs the approximate nearest-neighbor search. * @param vec the feature to analyze * @param do_radius flag indicating if we check the radius too * @param radius the radius if it is a radius search * @param do_k flag indicating if we limit the number of nn * @param k_nn the number of nearest neighbors * @param checked_average used for debugging */ void getNeighbors(const ElementType* vec, bool do_radius, float radius, bool do_k, unsigned int k_nn, float& checked_average) { static std::vector score_index_heap; if (do_k) { unsigned int worst_score = std::numeric_limits::max(); typename std::vector >::const_iterator table = tables_.begin(); typename std::vector >::const_iterator table_end = tables_.end(); for (; table != table_end; ++table) { size_t key = table->getKey(vec); std::vector::const_iterator xor_mask = xor_masks_.begin(); std::vector::const_iterator xor_mask_end = xor_masks_.end(); for (; xor_mask != xor_mask_end; ++xor_mask) { size_t sub_key = key ^ (*xor_mask); const lsh::Bucket* bucket = table->getBucketFromKey(sub_key); if (bucket == 0) continue; // Go over each descriptor index std::vector::const_iterator training_index = bucket->begin(); std::vector::const_iterator last_training_index = bucket->end(); DistanceType hamming_distance; // Process the rest of the candidates for (; training_index < last_training_index; ++training_index) { if (removed_ && removed_points_.test(*training_index)) continue; hamming_distance = distance_(vec, points_[*training_index].point, veclen_); if (hamming_distance < worst_score) { // Insert the new element score_index_heap.push_back(ScoreIndexPair(hamming_distance, training_index)); std::push_heap(score_index_heap.begin(), score_index_heap.end()); if (score_index_heap.size() > (unsigned int)k_nn) { // Remove the highest distance value as we have too many elements std::pop_heap(score_index_heap.begin(), score_index_heap.end()); score_index_heap.pop_back(); // Keep track of the worst score worst_score = score_index_heap.front().first; } } } } } } else { typename std::vector >::const_iterator table = tables_.begin(); typename std::vector >::const_iterator table_end = tables_.end(); for (; table != table_end; ++table) { size_t key = table->getKey(vec); std::vector::const_iterator xor_mask = xor_masks_.begin(); std::vector::const_iterator xor_mask_end = xor_masks_.end(); for (; xor_mask != xor_mask_end; ++xor_mask) { size_t sub_key = key ^ (*xor_mask); const lsh::Bucket* bucket = table->getBucketFromKey(sub_key); if (bucket == 0) continue; // Go over each descriptor index std::vector::const_iterator training_index = bucket->begin(); std::vector::const_iterator last_training_index = bucket->end(); DistanceType hamming_distance; // Process the rest of the candidates for (; training_index < last_training_index; ++training_index) { if (removed_ && removed_points_.test(*training_index)) continue; // Compute the Hamming distance hamming_distance = distance_(vec, points_[*training_index].point, veclen_); if (hamming_distance < radius) score_index_heap.push_back(ScoreIndexPair(hamming_distance, training_index)); } } } } } /** Performs the approximate nearest-neighbor search. * This is a slower version than the above as it uses the ResultSet * @param vec the feature to analyze */ void getNeighbors(const ElementType* vec, ResultSet& result) const { typename std::vector >::const_iterator table = tables_.begin(); typename std::vector >::const_iterator table_end = tables_.end(); for (; table != table_end; ++table) { size_t key = table->getKey(vec); std::vector::const_iterator xor_mask = xor_masks_.begin(); std::vector::const_iterator xor_mask_end = xor_masks_.end(); for (; xor_mask != xor_mask_end; ++xor_mask) { size_t sub_key = key ^ (*xor_mask); const lsh::Bucket* bucket = table->getBucketFromKey(sub_key); if (bucket == 0) continue; // Go over each descriptor index std::vector::const_iterator training_index = bucket->begin(); std::vector::const_iterator last_training_index = bucket->end(); DistanceType hamming_distance; // Process the rest of the candidates for (; training_index < last_training_index; ++training_index) { if (removed_ && removed_points_.test(*training_index)) continue; // Compute the Hamming distance hamming_distance = distance_(vec, points_[*training_index], veclen_); result.addPoint(hamming_distance, *training_index); } } } } void swap(LshIndex& other) { BaseClass::swap(other); std::swap(tables_, other.tables_); std::swap(size_at_build_, other.size_at_build_); std::swap(table_number_, other.table_number_); std::swap(key_size_, other.key_size_); std::swap(multi_probe_level_, other.multi_probe_level_); std::swap(xor_masks_, other.xor_masks_); } /** The different hash tables */ std::vector > tables_; /** table number */ unsigned int table_number_; /** key size */ unsigned int key_size_; /** How far should we look for neighbors in multi-probe LSH */ unsigned int multi_probe_level_; /** The XOR masks to apply to a key to get the neighboring buckets */ std::vector xor_masks_; USING_BASECLASS_SYMBOLS }; } #endif //FLANN_LSH_INDEX_H_