1 /*********************************************************************** 2 * Software License Agreement (BSD License) 3 * 4 * Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved. 5 * Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved. 6 * 7 * THE BSD LICENSE 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 *************************************************************************/ 30 31 #ifndef OPENCV_FLANN_KMEANS_INDEX_H_ 32 #define OPENCV_FLANN_KMEANS_INDEX_H_ 33 34 //! 
@cond IGNORED 35 36 #include <algorithm> 37 #include <map> 38 #include <limits> 39 #include <cmath> 40 41 #include "general.h" 42 #include "nn_index.h" 43 #include "dist.h" 44 #include "matrix.h" 45 #include "result_set.h" 46 #include "heap.h" 47 #include "allocator.h" 48 #include "random.h" 49 #include "saving.h" 50 #include "logger.h" 51 52 #define BITS_PER_CHAR 8 53 #define BITS_PER_BASE 2 // for DNA/RNA sequences 54 #define BASE_PER_CHAR (BITS_PER_CHAR/BITS_PER_BASE) 55 #define HISTOS_PER_BASE (1<<BITS_PER_BASE) 56 57 58 namespace cvflann 59 { 60 61 struct KMeansIndexParams : public IndexParams 62 { 63 KMeansIndexParams(int branching = 32, int iterations = 11, 64 flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, 65 float cb_index = 0.2, int trees = 1 ) 66 { 67 (*this)["algorithm"] = FLANN_INDEX_KMEANS; 68 // branching factor 69 (*this)["branching"] = branching; 70 // max iterations to perform in one kmeans clustering (kmeans tree) 71 (*this)["iterations"] = iterations; 72 // algorithm used for picking the initial cluster centers for kmeans tree 73 (*this)["centers_init"] = centers_init; 74 // cluster boundary index. Used when searching the kmeans tree 75 (*this)["cb_index"] = cb_index; 76 // number of kmeans trees to search in 77 (*this)["trees"] = trees; 78 } 79 }; 80 81 82 /** 83 * Hierarchical kmeans index 84 * 85 * Contains a tree constructed through a hierarchical kmeans clustering 86 * and other information for indexing a set of points for nearest-neighbour matching. 
87 */ 88 template <typename Distance> 89 class KMeansIndex : public NNIndex<Distance> 90 { 91 public: 92 typedef typename Distance::ElementType ElementType; 93 typedef typename Distance::ResultType DistanceType; 94 typedef typename Distance::CentersType CentersType; 95 96 typedef typename Distance::is_kdtree_distance is_kdtree_distance; 97 typedef typename Distance::is_vector_space_distance is_vector_space_distance; 98 99 100 101 typedef void (KMeansIndex::* centersAlgFunction)(int, int*, int, int*, int&); 102 103 /** 104 * The function used for choosing the cluster centers. 105 */ 106 centersAlgFunction chooseCenters; 107 108 109 110 /** 111 * Chooses the initial centers in the k-means clustering in a random manner. 112 * 113 * Params: 114 * k = number of centers 115 * vecs = the dataset of points 116 * indices = indices in the dataset 117 * indices_length = length of indices vector 118 * 119 */ chooseCentersRandom(int k,int * indices,int indices_length,int * centers,int & centers_length)120 void chooseCentersRandom(int k, int* indices, int indices_length, int* centers, int& centers_length) 121 { 122 UniqueRandom r(indices_length); 123 124 int index; 125 for (index=0; index<k; ++index) { 126 bool duplicate = true; 127 int rnd; 128 while (duplicate) { 129 duplicate = false; 130 rnd = r.next(); 131 if (rnd<0) { 132 centers_length = index; 133 return; 134 } 135 136 centers[index] = indices[rnd]; 137 138 for (int j=0; j<index; ++j) { 139 DistanceType sq = distance_(dataset_[centers[index]], dataset_[centers[j]], dataset_.cols); 140 if (sq<1e-16) { 141 duplicate = true; 142 } 143 } 144 } 145 } 146 147 centers_length = index; 148 } 149 150 151 /** 152 * Chooses the initial centers in the k-means using Gonzales' algorithm 153 * so that the centers are spaced apart from each other. 
154 * 155 * Params: 156 * k = number of centers 157 * vecs = the dataset of points 158 * indices = indices in the dataset 159 * Returns: 160 */ chooseCentersGonzales(int k,int * indices,int indices_length,int * centers,int & centers_length)161 void chooseCentersGonzales(int k, int* indices, int indices_length, int* centers, int& centers_length) 162 { 163 int n = indices_length; 164 165 int rnd = rand_int(n); 166 CV_DbgAssert(rnd >=0 && rnd < n); 167 168 centers[0] = indices[rnd]; 169 170 int index; 171 for (index=1; index<k; ++index) { 172 173 int best_index = -1; 174 DistanceType best_val = 0; 175 for (int j=0; j<n; ++j) { 176 DistanceType dist = distance_(dataset_[centers[0]],dataset_[indices[j]],dataset_.cols); 177 for (int i=1; i<index; ++i) { 178 DistanceType tmp_dist = distance_(dataset_[centers[i]],dataset_[indices[j]],dataset_.cols); 179 if (tmp_dist<dist) { 180 dist = tmp_dist; 181 } 182 } 183 if (dist>best_val) { 184 best_val = dist; 185 best_index = j; 186 } 187 } 188 if (best_index!=-1) { 189 centers[index] = indices[best_index]; 190 } 191 else { 192 break; 193 } 194 } 195 centers_length = index; 196 } 197 198 199 /** 200 * Chooses the initial centers in the k-means using the algorithm 201 * proposed in the KMeans++ paper: 202 * Arthur, David; Vassilvitskii, Sergei - k-means++: The Advantages of Careful Seeding 203 * 204 * Implementation of this function was converted from the one provided in Arthur's code. 
205 * 206 * Params: 207 * k = number of centers 208 * vecs = the dataset of points 209 * indices = indices in the dataset 210 * Returns: 211 */ chooseCentersKMeanspp(int k,int * indices,int indices_length,int * centers,int & centers_length)212 void chooseCentersKMeanspp(int k, int* indices, int indices_length, int* centers, int& centers_length) 213 { 214 int n = indices_length; 215 216 double currentPot = 0; 217 DistanceType* closestDistSq = new DistanceType[n]; 218 219 // Choose one random center and set the closestDistSq values 220 int index = rand_int(n); 221 CV_DbgAssert(index >=0 && index < n); 222 centers[0] = indices[index]; 223 224 for (int i = 0; i < n; i++) { 225 closestDistSq[i] = distance_(dataset_[indices[i]], dataset_[indices[index]], dataset_.cols); 226 closestDistSq[i] = ensureSquareDistance<Distance>( closestDistSq[i] ); 227 currentPot += closestDistSq[i]; 228 } 229 230 231 const int numLocalTries = 1; 232 233 // Choose each center 234 int centerCount; 235 for (centerCount = 1; centerCount < k; centerCount++) { 236 237 // Repeat several trials 238 double bestNewPot = -1; 239 int bestNewIndex = -1; 240 for (int localTrial = 0; localTrial < numLocalTries; localTrial++) { 241 242 // Choose our center - have to be slightly careful to return a valid answer even accounting 243 // for possible rounding errors 244 double randVal = rand_double(currentPot); 245 for (index = 0; index < n-1; index++) { 246 if (randVal <= closestDistSq[index]) break; 247 else randVal -= closestDistSq[index]; 248 } 249 250 // Compute the new potential 251 double newPot = 0; 252 for (int i = 0; i < n; i++) { 253 DistanceType dist = distance_(dataset_[indices[i]], dataset_[indices[index]], dataset_.cols); 254 newPot += std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] ); 255 } 256 257 // Store the best result 258 if ((bestNewPot < 0)||(newPot < bestNewPot)) { 259 bestNewPot = newPot; 260 bestNewIndex = index; 261 } 262 } 263 264 // Add the appropriate center 265 
centers[centerCount] = indices[bestNewIndex]; 266 currentPot = bestNewPot; 267 for (int i = 0; i < n; i++) { 268 DistanceType dist = distance_(dataset_[indices[i]], dataset_[indices[bestNewIndex]], dataset_.cols); 269 closestDistSq[i] = std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] ); 270 } 271 } 272 273 centers_length = centerCount; 274 275 delete[] closestDistSq; 276 } 277 278 279 280 public: 281 getType()282 flann_algorithm_t getType() const CV_OVERRIDE 283 { 284 return FLANN_INDEX_KMEANS; 285 } 286 287 template<class CentersContainerType> 288 class KMeansDistanceComputer : public cv::ParallelLoopBody 289 { 290 public: KMeansDistanceComputer(Distance _distance,const Matrix<ElementType> & _dataset,const int _branching,const int * _indices,const CentersContainerType & _dcenters,const size_t _veclen,std::vector<int> & _new_centroids,std::vector<DistanceType> & _sq_dists)291 KMeansDistanceComputer(Distance _distance, const Matrix<ElementType>& _dataset, 292 const int _branching, const int* _indices, const CentersContainerType& _dcenters, 293 const size_t _veclen, std::vector<int> &_new_centroids, 294 std::vector<DistanceType> &_sq_dists) 295 : distance(_distance) 296 , dataset(_dataset) 297 , branching(_branching) 298 , indices(_indices) 299 , dcenters(_dcenters) 300 , veclen(_veclen) 301 , new_centroids(_new_centroids) 302 , sq_dists(_sq_dists) 303 { 304 } 305 operator()306 void operator()(const cv::Range& range) const CV_OVERRIDE 307 { 308 const int begin = range.start; 309 const int end = range.end; 310 311 for( int i = begin; i<end; ++i) 312 { 313 DistanceType sq_dist(distance(dataset[indices[i]], dcenters[0], veclen)); 314 int new_centroid(0); 315 for (int j=1; j<branching; ++j) { 316 DistanceType new_sq_dist = distance(dataset[indices[i]], dcenters[j], veclen); 317 if (sq_dist>new_sq_dist) { 318 new_centroid = j; 319 sq_dist = new_sq_dist; 320 } 321 } 322 sq_dists[i] = sq_dist; 323 new_centroids[i] = new_centroid; 324 } 325 } 326 327 
private: 328 Distance distance; 329 const Matrix<ElementType>& dataset; 330 const int branching; 331 const int* indices; 332 const CentersContainerType& dcenters; 333 const size_t veclen; 334 std::vector<int> &new_centroids; 335 std::vector<DistanceType> &sq_dists; 336 KMeansDistanceComputer& operator=( const KMeansDistanceComputer & ) { return *this; } 337 }; 338 339 /** 340 * Index constructor 341 * 342 * Params: 343 * inputData = dataset with the input features 344 * params = parameters passed to the hierarchical k-means algorithm 345 */ 346 KMeansIndex(const Matrix<ElementType>& inputData, const IndexParams& params = KMeansIndexParams(), 347 Distance d = Distance()) dataset_(inputData)348 : dataset_(inputData), index_params_(params), root_(NULL), indices_(NULL), distance_(d) 349 { 350 memoryCounter_ = 0; 351 352 size_ = dataset_.rows; 353 veclen_ = dataset_.cols; 354 355 branching_ = get_param(params,"branching",32); 356 trees_ = get_param(params,"trees",1); 357 iterations_ = get_param(params,"iterations",11); 358 if (iterations_<0) { 359 iterations_ = (std::numeric_limits<int>::max)(); 360 } 361 centers_init_ = get_param(params,"centers_init",FLANN_CENTERS_RANDOM); 362 363 if (centers_init_==FLANN_CENTERS_RANDOM) { 364 chooseCenters = &KMeansIndex::chooseCentersRandom; 365 } 366 else if (centers_init_==FLANN_CENTERS_GONZALES) { 367 chooseCenters = &KMeansIndex::chooseCentersGonzales; 368 } 369 else if (centers_init_==FLANN_CENTERS_KMEANSPP) { 370 chooseCenters = &KMeansIndex::chooseCentersKMeanspp; 371 } 372 else { 373 FLANN_THROW(cv::Error::StsBadArg, "Unknown algorithm for choosing initial centers."); 374 } 375 cb_index_ = 0.4f; 376 377 root_ = new KMeansNodePtr[trees_]; 378 indices_ = new int*[trees_]; 379 380 for (int i=0; i<trees_; ++i) { 381 root_[i] = NULL; 382 indices_[i] = NULL; 383 } 384 } 385 386 387 KMeansIndex(const KMeansIndex&); 388 KMeansIndex& operator=(const KMeansIndex&); 389 390 391 /** 392 * Index destructor. 
393 * 394 * Release the memory used by the index. 395 */ ~KMeansIndex()396 virtual ~KMeansIndex() 397 { 398 if (root_ != NULL) { 399 free_centers(); 400 delete[] root_; 401 } 402 if (indices_!=NULL) { 403 free_indices(); 404 delete[] indices_; 405 } 406 } 407 408 /** 409 * Returns size of index. 410 */ size()411 size_t size() const CV_OVERRIDE 412 { 413 return size_; 414 } 415 416 /** 417 * Returns the length of an index feature. 418 */ veclen()419 size_t veclen() const CV_OVERRIDE 420 { 421 return veclen_; 422 } 423 424 set_cb_index(float index)425 void set_cb_index( float index) 426 { 427 cb_index_ = index; 428 } 429 430 /** 431 * Computes the inde memory usage 432 * Returns: memory used by the index 433 */ usedMemory()434 int usedMemory() const CV_OVERRIDE 435 { 436 return pool_.usedMemory+pool_.wastedMemory+memoryCounter_; 437 } 438 439 /** 440 * Builds the index 441 */ buildIndex()442 void buildIndex() CV_OVERRIDE 443 { 444 if (branching_<2) { 445 FLANN_THROW(cv::Error::StsError, "Branching factor must be at least 2"); 446 } 447 448 free_indices(); 449 450 for (int i=0; i<trees_; ++i) { 451 indices_[i] = new int[size_]; 452 for (size_t j=0; j<size_; ++j) { 453 indices_[i][j] = int(j); 454 } 455 root_[i] = pool_.allocate<KMeansNode>(); 456 std::memset(root_[i], 0, sizeof(KMeansNode)); 457 458 Distance* dummy = NULL; 459 computeNodeStatistics(root_[i], indices_[i], (unsigned int)size_, dummy); 460 461 computeClustering(root_[i], indices_[i], (int)size_, branching_,0); 462 } 463 } 464 465 saveIndex(FILE * stream)466 void saveIndex(FILE* stream) CV_OVERRIDE 467 { 468 save_value(stream, branching_); 469 save_value(stream, iterations_); 470 save_value(stream, memoryCounter_); 471 save_value(stream, cb_index_); 472 save_value(stream, trees_); 473 for (int i=0; i<trees_; ++i) { 474 save_value(stream, *indices_[i], (int)size_); 475 save_tree(stream, root_[i], i); 476 } 477 } 478 479 loadIndex(FILE * stream)480 void loadIndex(FILE* stream) CV_OVERRIDE 481 { 482 if 
(indices_!=NULL) { 483 free_indices(); 484 delete[] indices_; 485 } 486 if (root_!=NULL) { 487 free_centers(); 488 } 489 490 load_value(stream, branching_); 491 load_value(stream, iterations_); 492 load_value(stream, memoryCounter_); 493 load_value(stream, cb_index_); 494 load_value(stream, trees_); 495 496 indices_ = new int*[trees_]; 497 for (int i=0; i<trees_; ++i) { 498 indices_[i] = new int[size_]; 499 load_value(stream, *indices_[i], size_); 500 load_tree(stream, root_[i], i); 501 } 502 503 index_params_["algorithm"] = getType(); 504 index_params_["branching"] = branching_; 505 index_params_["trees"] = trees_; 506 index_params_["iterations"] = iterations_; 507 index_params_["centers_init"] = centers_init_; 508 index_params_["cb_index"] = cb_index_; 509 } 510 511 512 /** 513 * Find set of nearest neighbors to vec. Their indices are stored inside 514 * the result object. 515 * 516 * Params: 517 * result = the result object in which the indices of the nearest-neighbors are stored 518 * vec = the vector for which to search the nearest neighbors 519 * searchParams = parameters that influence the search algorithm (checks, cb_index) 520 */ findNeighbors(ResultSet<DistanceType> & result,const ElementType * vec,const SearchParams & searchParams)521 void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE 522 { 523 524 const int maxChecks = get_param(searchParams,"checks",32); 525 526 if (maxChecks==FLANN_CHECKS_UNLIMITED) { 527 findExactNN(root_[0], result, vec); 528 } 529 else { 530 // Priority queue storing intermediate branches in the best-bin-first search 531 Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_); 532 533 int checks = 0; 534 for (int i=0; i<trees_; ++i) { 535 findNN(root_[i], result, vec, checks, maxChecks, heap); 536 if ((checks >= maxChecks) && result.full()) 537 break; 538 } 539 540 BranchSt branch; 541 while (heap->popMin(branch) && (checks<maxChecks || !result.full())) { 542 
KMeansNodePtr node = branch.node; 543 findNN(node, result, vec, checks, maxChecks, heap); 544 } 545 delete heap; 546 547 CV_Assert(result.full()); 548 } 549 } 550 551 /** 552 * Clustering function that takes a cut in the hierarchical k-means 553 * tree and return the clusters centers of that clustering. 554 * Params: 555 * numClusters = number of clusters to have in the clustering computed 556 * Returns: number of cluster centers 557 */ getClusterCenters(Matrix<CentersType> & centers)558 int getClusterCenters(Matrix<CentersType>& centers) 559 { 560 int numClusters = centers.rows; 561 if (numClusters<1) { 562 FLANN_THROW(cv::Error::StsBadArg, "Number of clusters must be at least 1"); 563 } 564 565 DistanceType variance; 566 KMeansNodePtr* clusters = new KMeansNodePtr[numClusters]; 567 568 int clusterCount = getMinVarianceClusters(root_[0], clusters, numClusters, variance); 569 570 Logger::info("Clusters requested: %d, returning %d\n",numClusters, clusterCount); 571 572 for (int i=0; i<clusterCount; ++i) { 573 CentersType* center = clusters[i]->pivot; 574 for (size_t j=0; j<veclen_; ++j) { 575 centers[i][j] = center[j]; 576 } 577 } 578 delete[] clusters; 579 580 return clusterCount; 581 } 582 getParameters()583 IndexParams getParameters() const CV_OVERRIDE 584 { 585 return index_params_; 586 } 587 588 589 private: 590 /** 591 * Structure representing a node in the hierarchical k-means tree. 592 */ 593 struct KMeansNode 594 { 595 /** 596 * The cluster center. 597 */ 598 CentersType* pivot; 599 /** 600 * The cluster radius. 601 */ 602 DistanceType radius; 603 /** 604 * The cluster mean radius. 605 */ 606 DistanceType mean_radius; 607 /** 608 * The cluster variance. 
609 */ 610 DistanceType variance; 611 /** 612 * The cluster size (number of points in the cluster) 613 */ 614 int size; 615 /** 616 * Child nodes (only for non-terminal nodes) 617 */ 618 KMeansNode** childs; 619 /** 620 * Node points (only for terminal nodes) 621 */ 622 int* indices; 623 /** 624 * Level 625 */ 626 int level; 627 }; 628 typedef KMeansNode* KMeansNodePtr; 629 630 /** 631 * Alias definition for a nicer syntax. 632 */ 633 typedef BranchStruct<KMeansNodePtr, DistanceType> BranchSt; 634 635 636 637 save_tree(FILE * stream,KMeansNodePtr node,int num)638 void save_tree(FILE* stream, KMeansNodePtr node, int num) 639 { 640 save_value(stream, *node); 641 save_value(stream, *(node->pivot), (int)veclen_); 642 if (node->childs==NULL) { 643 int indices_offset = (int)(node->indices - indices_[num]); 644 save_value(stream, indices_offset); 645 } 646 else { 647 for(int i=0; i<branching_; ++i) { 648 save_tree(stream, node->childs[i], num); 649 } 650 } 651 } 652 653 load_tree(FILE * stream,KMeansNodePtr & node,int num)654 void load_tree(FILE* stream, KMeansNodePtr& node, int num) 655 { 656 node = pool_.allocate<KMeansNode>(); 657 load_value(stream, *node); 658 node->pivot = new CentersType[veclen_]; 659 load_value(stream, *(node->pivot), (int)veclen_); 660 if (node->childs==NULL) { 661 int indices_offset; 662 load_value(stream, indices_offset); 663 node->indices = indices_[num] + indices_offset; 664 } 665 else { 666 node->childs = pool_.allocate<KMeansNodePtr>(branching_); 667 for(int i=0; i<branching_; ++i) { 668 load_tree(stream, node->childs[i], num); 669 } 670 } 671 } 672 673 674 /** 675 * Helper function 676 */ free_centers(KMeansNodePtr node)677 void free_centers(KMeansNodePtr node) 678 { 679 delete[] node->pivot; 680 if (node->childs!=NULL) { 681 for (int k=0; k<branching_; ++k) { 682 free_centers(node->childs[k]); 683 } 684 } 685 } 686 free_centers()687 void free_centers() 688 { 689 if (root_ != NULL) { 690 for(int i=0; i<trees_; ++i) { 691 if (root_[i] != 
NULL) { 692 free_centers(root_[i]); 693 } 694 } 695 } 696 } 697 698 /** 699 * Release the inner elements of indices[] 700 */ free_indices()701 void free_indices() 702 { 703 if (indices_!=NULL) { 704 for(int i=0; i<trees_; ++i) { 705 if (indices_[i]!=NULL) { 706 delete[] indices_[i]; 707 indices_[i] = NULL; 708 } 709 } 710 } 711 } 712 713 /** 714 * Computes the statistics of a node (mean, radius, variance). 715 * 716 * Params: 717 * node = the node to use 718 * indices = array of indices of the points belonging to the node 719 * indices_length = number of indices in the array 720 */ computeNodeStatistics(KMeansNodePtr node,int * indices,unsigned int indices_length)721 void computeNodeStatistics(KMeansNodePtr node, int* indices, unsigned int indices_length) 722 { 723 DistanceType variance = 0; 724 CentersType* mean = new CentersType[veclen_]; 725 memoryCounter_ += int(veclen_*sizeof(CentersType)); 726 727 memset(mean,0,veclen_*sizeof(CentersType)); 728 729 for (unsigned int i=0; i<indices_length; ++i) { 730 ElementType* vec = dataset_[indices[i]]; 731 for (size_t j=0; j<veclen_; ++j) { 732 mean[j] += vec[j]; 733 } 734 variance += distance_(vec, ZeroIterator<ElementType>(), veclen_); 735 } 736 float length = static_cast<float>(indices_length); 737 for (size_t j=0; j<veclen_; ++j) { 738 mean[j] = cvflann::round<CentersType>( mean[j] / static_cast<double>(indices_length) ); 739 } 740 variance /= static_cast<DistanceType>( length ); 741 variance -= distance_(mean, ZeroIterator<ElementType>(), veclen_); 742 743 DistanceType radius = 0; 744 for (unsigned int i=0; i<indices_length; ++i) { 745 DistanceType tmp = distance_(mean, dataset_[indices[i]], veclen_); 746 if (tmp>radius) { 747 radius = tmp; 748 } 749 } 750 751 node->variance = variance; 752 node->radius = radius; 753 node->pivot = mean; 754 } 755 756 computeBitfieldNodeStatistics(KMeansNodePtr node,int * indices,unsigned int indices_length)757 void computeBitfieldNodeStatistics(KMeansNodePtr node, int* indices, 758 
unsigned int indices_length) 759 { 760 const unsigned int accumulator_veclen = static_cast<unsigned int>( 761 veclen_*sizeof(CentersType)*BITS_PER_CHAR); 762 763 unsigned long long variance = 0ull; 764 CentersType* mean = new CentersType[veclen_]; 765 memoryCounter_ += int(veclen_*sizeof(CentersType)); 766 unsigned int* mean_accumulator = new unsigned int[accumulator_veclen]; 767 768 memset(mean_accumulator, 0, sizeof(unsigned int)*accumulator_veclen); 769 770 for (unsigned int i=0; i<indices_length; ++i) { 771 variance += static_cast<unsigned long long>( ensureSquareDistance<Distance>( 772 distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_))); 773 unsigned char* vec = (unsigned char*)dataset_[indices[i]]; 774 for (size_t k=0, l=0; k<accumulator_veclen; k+=BITS_PER_CHAR, ++l) { 775 mean_accumulator[k] += (vec[l]) & 0x01; 776 mean_accumulator[k+1] += (vec[l]>>1) & 0x01; 777 mean_accumulator[k+2] += (vec[l]>>2) & 0x01; 778 mean_accumulator[k+3] += (vec[l]>>3) & 0x01; 779 mean_accumulator[k+4] += (vec[l]>>4) & 0x01; 780 mean_accumulator[k+5] += (vec[l]>>5) & 0x01; 781 mean_accumulator[k+6] += (vec[l]>>6) & 0x01; 782 mean_accumulator[k+7] += (vec[l]>>7) & 0x01; 783 } 784 } 785 double cnt = static_cast<double>(indices_length); 786 unsigned char* char_mean = (unsigned char*)mean; 787 for (size_t k=0, l=0; k<accumulator_veclen; k+=BITS_PER_CHAR, ++l) { 788 char_mean[l] = static_cast<unsigned char>( 789 (((int)(0.5 + (double)(mean_accumulator[k]) / cnt))) 790 | (((int)(0.5 + (double)(mean_accumulator[k+1]) / cnt))<<1) 791 | (((int)(0.5 + (double)(mean_accumulator[k+2]) / cnt))<<2) 792 | (((int)(0.5 + (double)(mean_accumulator[k+3]) / cnt))<<3) 793 | (((int)(0.5 + (double)(mean_accumulator[k+4]) / cnt))<<4) 794 | (((int)(0.5 + (double)(mean_accumulator[k+5]) / cnt))<<5) 795 | (((int)(0.5 + (double)(mean_accumulator[k+6]) / cnt))<<6) 796 | (((int)(0.5 + (double)(mean_accumulator[k+7]) / cnt))<<7)); 797 } 798 variance = static_cast<unsigned long long>( 799 
0.5 + static_cast<double>(variance) / static_cast<double>(indices_length)); 800 variance -= static_cast<unsigned long long>( 801 ensureSquareDistance<Distance>( 802 distance_(mean, ZeroIterator<ElementType>(), veclen_))); 803 804 DistanceType radius = 0; 805 for (unsigned int i=0; i<indices_length; ++i) { 806 DistanceType tmp = distance_(mean, dataset_[indices[i]], veclen_); 807 if (tmp>radius) { 808 radius = tmp; 809 } 810 } 811 812 node->variance = static_cast<DistanceType>(variance); 813 node->radius = radius; 814 node->pivot = mean; 815 816 delete[] mean_accumulator; 817 } 818 819 computeDnaNodeStatistics(KMeansNodePtr node,int * indices,unsigned int indices_length)820 void computeDnaNodeStatistics(KMeansNodePtr node, int* indices, 821 unsigned int indices_length) 822 { 823 const unsigned int histos_veclen = static_cast<unsigned int>( 824 veclen_*sizeof(CentersType)*(HISTOS_PER_BASE*BASE_PER_CHAR)); 825 826 unsigned long long variance = 0ull; 827 unsigned int* histograms = new unsigned int[histos_veclen]; 828 memset(histograms, 0, sizeof(unsigned int)*histos_veclen); 829 830 for (unsigned int i=0; i<indices_length; ++i) { 831 variance += static_cast<unsigned long long>( ensureSquareDistance<Distance>( 832 distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_))); 833 834 unsigned char* vec = (unsigned char*)dataset_[indices[i]]; 835 for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) { 836 histograms[k + ((vec[l]) & 0x03)]++; 837 histograms[k + 4 + ((vec[l]>>2) & 0x03)]++; 838 histograms[k + 8 + ((vec[l]>>4) & 0x03)]++; 839 histograms[k +12 + ((vec[l]>>6) & 0x03)]++; 840 } 841 } 842 843 CentersType* mean = new CentersType[veclen_]; 844 memoryCounter_ += int(veclen_*sizeof(CentersType)); 845 unsigned char* char_mean = (unsigned char*)mean; 846 unsigned int* h = histograms; 847 for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) { 848 char_mean[l] = (h[k] > h[k+1] ? h[k+2] > h[k+3] ? 
h[k] > h[k+2] ? 0x00 : 0x10 849 : h[k] > h[k+3] ? 0x00 : 0x11 850 : h[k+2] > h[k+3] ? h[k+1] > h[k+2] ? 0x01 : 0x10 851 : h[k+1] > h[k+3] ? 0x01 : 0x11) 852 | (h[k+4]>h[k+5] ? h[k+6] > h[k+7] ? h[k+4] > h[k+6] ? 0x00 : 0x1000 853 : h[k+4] > h[k+7] ? 0x00 : 0x1100 854 : h[k+6] > h[k+7] ? h[k+5] > h[k+6] ? 0x0100 : 0x1000 855 : h[k+5] > h[k+7] ? 0x0100 : 0x1100) 856 | (h[k+8]>h[k+9] ? h[k+10]>h[k+11] ? h[k+8] >h[k+10] ? 0x00 : 0x100000 857 : h[k+8] >h[k+11] ? 0x00 : 0x110000 858 : h[k+10]>h[k+11] ? h[k+9] >h[k+10] ? 0x010000 : 0x100000 859 : h[k+9] >h[k+11] ? 0x010000 : 0x110000) 860 | (h[k+12]>h[k+13] ? h[k+14]>h[k+15] ? h[k+12] >h[k+14] ? 0x00 : 0x10000000 861 : h[k+12] >h[k+15] ? 0x00 : 0x11000000 862 : h[k+14]>h[k+15] ? h[k+13] >h[k+14] ? 0x01000000 : 0x10000000 863 : h[k+13] >h[k+15] ? 0x01000000 : 0x11000000); 864 } 865 variance = static_cast<unsigned long long>( 866 0.5 + static_cast<double>(variance) / static_cast<double>(indices_length)); 867 variance -= static_cast<unsigned long long>( 868 ensureSquareDistance<Distance>( 869 distance_(mean, ZeroIterator<ElementType>(), veclen_))); 870 871 DistanceType radius = 0; 872 for (unsigned int i=0; i<indices_length; ++i) { 873 DistanceType tmp = distance_(mean, dataset_[indices[i]], veclen_); 874 if (tmp>radius) { 875 radius = tmp; 876 } 877 } 878 879 node->variance = static_cast<DistanceType>(variance); 880 node->radius = radius; 881 node->pivot = mean; 882 883 delete[] histograms; 884 } 885 886 887 template<typename DistType> computeNodeStatistics(KMeansNodePtr node,int * indices,unsigned int indices_length,const DistType * identifier)888 void computeNodeStatistics(KMeansNodePtr node, int* indices, 889 unsigned int indices_length, 890 const DistType* identifier) 891 { 892 (void)identifier; 893 computeNodeStatistics(node, indices, indices_length); 894 } 895 computeNodeStatistics(KMeansNodePtr node,int * indices,unsigned int indices_length,const cvflann::HammingLUT * identifier)896 void 
computeNodeStatistics(KMeansNodePtr node, int* indices, 897 unsigned int indices_length, 898 const cvflann::HammingLUT* identifier) 899 { 900 (void)identifier; 901 computeBitfieldNodeStatistics(node, indices, indices_length); 902 } 903 computeNodeStatistics(KMeansNodePtr node,int * indices,unsigned int indices_length,const cvflann::Hamming<unsigned char> * identifier)904 void computeNodeStatistics(KMeansNodePtr node, int* indices, 905 unsigned int indices_length, 906 const cvflann::Hamming<unsigned char>* identifier) 907 { 908 (void)identifier; 909 computeBitfieldNodeStatistics(node, indices, indices_length); 910 } 911 computeNodeStatistics(KMeansNodePtr node,int * indices,unsigned int indices_length,const cvflann::Hamming2<unsigned char> * identifier)912 void computeNodeStatistics(KMeansNodePtr node, int* indices, 913 unsigned int indices_length, 914 const cvflann::Hamming2<unsigned char>* identifier) 915 { 916 (void)identifier; 917 computeBitfieldNodeStatistics(node, indices, indices_length); 918 } 919 computeNodeStatistics(KMeansNodePtr node,int * indices,unsigned int indices_length,const cvflann::DNAmmingLUT * identifier)920 void computeNodeStatistics(KMeansNodePtr node, int* indices, 921 unsigned int indices_length, 922 const cvflann::DNAmmingLUT* identifier) 923 { 924 (void)identifier; 925 computeDnaNodeStatistics(node, indices, indices_length); 926 } 927 computeNodeStatistics(KMeansNodePtr node,int * indices,unsigned int indices_length,const cvflann::DNAmming2<unsigned char> * identifier)928 void computeNodeStatistics(KMeansNodePtr node, int* indices, 929 unsigned int indices_length, 930 const cvflann::DNAmming2<unsigned char>* identifier) 931 { 932 (void)identifier; 933 computeDnaNodeStatistics(node, indices, indices_length); 934 } 935 936 refineClustering(int * indices,int indices_length,int branching,CentersType ** centers,std::vector<DistanceType> & radiuses,int * belongs_to,int * count)937 void refineClustering(int* indices, int indices_length, int 
branching, CentersType** centers,
                          std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
    {
        // Lloyd iterations for numeric vectors. Centers are accumulated in
        // double precision regardless of CentersType to limit rounding drift.
        cv::AutoBuffer<double> dcenters_buf(branching*veclen_);
        Matrix<double> dcenters(dcenters_buf.data(), branching, veclen_);

        bool converged = false;
        int iteration = 0;
        while (!converged && iteration<iterations_) {
            converged = true;
            iteration++;

            // compute the new cluster centers
            for (int i=0; i<branching; ++i) {
                memset(dcenters[i],0,sizeof(double)*veclen_);
                radiuses[i] = 0;
            }
            for (int i=0; i<indices_length; ++i) {
                ElementType* vec = dataset_[indices[i]];
                double* center = dcenters[belongs_to[i]];
                for (size_t k=0; k<veclen_; ++k) {
                    center[k] += vec[k];
                }
            }
            for (int i=0; i<branching; ++i) {
                int cnt = count[i];
                // NOTE(review): if a center attracted no points on the initial
                // assignment, cnt can be 0 here on the first iteration,
                // producing NaN/inf center coordinates -- confirm upstream intent.
                for (size_t k=0; k<veclen_; ++k) {
                    dcenters[i][k] /= cnt;
                }
            }

            std::vector<int> new_centroids(indices_length);
            std::vector<DistanceType> sq_dists(indices_length);

            // reassign points to clusters (parallel over the points)
            KMeansDistanceComputer<Matrix<double> > invoker(
                        distance_, dataset_, branching, indices, dcenters, veclen_, new_centroids, sq_dists);
            parallel_for_(cv::Range(0, (int)indices_length), invoker);

            // fold the parallel results back into radiuses/count/belongs_to
            for (int i=0; i < (int)indices_length; ++i) {
                DistanceType sq_dist(sq_dists[i]);
                int new_centroid(new_centroids[i]);
                if (sq_dist > radiuses[new_centroid]) {
                    radiuses[new_centroid] = sq_dist;
                }
                if (new_centroid != belongs_to[i]) {
                    count[belongs_to[i]]--;
                    count[new_centroid]++;
                    belongs_to[i] = new_centroid;
                    converged = false;
                }
            }

            for (int i=0; i<branching; ++i) {
                // if one cluster converges to an empty cluster,
                // move an element into that cluster
                if (count[i]==0) {
                    int j = (i+1)%branching;
                    while (count[j]<=1) {
                        j = (j+1)%branching;
                    }

                    for (int k=0; k<indices_length; ++k) {
                        if (belongs_to[k]==j) {
                            // for cluster j, we move the furthest element from the center to the empty cluster i
                            // NOTE(review): relies on exact FP equality between the
                            // recomputed distance and the stored radius to identify
                            // the furthest element.
                            if ( distance_(dataset_[indices[k]], dcenters[j], veclen_) == radiuses[j] ) {
                                belongs_to[k] = i;
                                count[j]--;
                                count[i]++;
                                break;
                            }
                        }
                    }
                    converged = false;
                }
            }
        }

        // Materialize the final centers in the output type; memoryCounter_
        // tracks the bytes owned by the index for these allocations.
        for (int i=0; i<branching; ++i) {
            centers[i] = new CentersType[veclen_];
            memoryCounter_ += (int)(veclen_*sizeof(CentersType));
            for (size_t k=0; k<veclen_; ++k) {
                centers[i][k] = (CentersType)dcenters[i][k];
            }
        }
    }

    /**
     * Lloyd iterations specialized for bit-packed binary descriptors.
     * Each bit position gets its own unsigned counter; the new center bit is
     * the rounded majority vote ((popcount/cnt) rounded with +0.5) of the
     * cluster members at that position.
     */
    void refineBitfieldClustering(int* indices, int indices_length, int branching, CentersType** centers,
                                  std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
    {
        for (int i=0; i<branching; ++i) {
            centers[i] = new CentersType[veclen_];
            memoryCounter_ += (int)(veclen_*sizeof(CentersType));
        }

        // One accumulator per bit: veclen_ bytes * BITS_PER_CHAR positions.
        const unsigned int accumulator_veclen = static_cast<unsigned int>(
                                                veclen_*sizeof(ElementType)*BITS_PER_CHAR);
        cv::AutoBuffer<unsigned int> dcenters_buf(branching*accumulator_veclen);
        Matrix<unsigned int> dcenters(dcenters_buf.data(), branching, accumulator_veclen);

        bool converged = false;
        int iteration = 0;
        while (!converged && iteration<iterations_) {
            converged = true;
            iteration++;

            // compute the new cluster centers
            for (int i=0; i<branching; ++i) {
                memset(dcenters[i],0,sizeof(unsigned int)*accumulator_veclen);
                radiuses[i] = 0;
            }
            for (int i=0; i<indices_length; ++i) {
                unsigned char* vec = (unsigned char*)dataset_[indices[i]];
                unsigned int* dcenter = dcenters[belongs_to[i]];
                // k walks 8 accumulators per input byte l; each accumulator
                // counts how many members have that bit set.
                for (size_t k=0, l=0; k<accumulator_veclen; k+=BITS_PER_CHAR, ++l) {
                    dcenter[k]   += (vec[l])    & 0x01;
                    dcenter[k+1] += (vec[l]>>1) & 0x01;
                    dcenter[k+2] += (vec[l]>>2) & 0x01;
                    dcenter[k+3] += (vec[l]>>3) & 0x01;
                    dcenter[k+4] += (vec[l]>>4) & 0x01;
                    dcenter[k+5] += (vec[l]>>5) & 0x01;
                    dcenter[k+6] += (vec[l]>>6) & 0x01;
                    dcenter[k+7] += (vec[l]>>7) & 0x01;
                }
            }
            // Majority vote per bit, re-packed into the center bytes.
            for (int i=0; i<branching; ++i) {
                double cnt = static_cast<double>(count[i]);
                unsigned int* dcenter = dcenters[i];
                unsigned char* charCenter = (unsigned char*)centers[i];
                for (size_t k=0, l=0; k<accumulator_veclen; k+=BITS_PER_CHAR, ++l) {
                    charCenter[l] = static_cast<unsigned char>(
                                      (((int)(0.5 + (double)(dcenter[k]) / cnt)))
                                    | (((int)(0.5 + (double)(dcenter[k+1]) / cnt))<<1)
                                    | (((int)(0.5 + (double)(dcenter[k+2]) / cnt))<<2)
                                    | (((int)(0.5 + (double)(dcenter[k+3]) / cnt))<<3)
                                    | (((int)(0.5 + (double)(dcenter[k+4]) / cnt))<<4)
                                    | (((int)(0.5 + (double)(dcenter[k+5]) / cnt))<<5)
                                    | (((int)(0.5 + (double)(dcenter[k+6]) / cnt))<<6)
                                    | (((int)(0.5 + (double)(dcenter[k+7]) / cnt))<<7));
                }
            }

            std::vector<int> new_centroids(indices_length);
            std::vector<DistanceType> dists(indices_length);

            // reassign points to clusters
            KMeansDistanceComputer<ElementType**> invoker(
                        distance_, dataset_, branching, indices, centers, veclen_, new_centroids, dists);
            parallel_for_(cv::Range(0, (int)indices_length), invoker);

            for (int i=0; i < indices_length; ++i) {
                DistanceType dist(dists[i]);
                int new_centroid(new_centroids[i]);
                if (dist > radiuses[new_centroid]) {
                    radiuses[new_centroid] = dist;
                }
                if (new_centroid != belongs_to[i]) {
                    count[belongs_to[i]]--;
                    count[new_centroid]++;
                    belongs_to[i] = new_centroid;
                    converged = false;
                }
            }

            for (int i=0; i<branching; ++i) {
                // if one cluster converges to an empty cluster,
                // move an element into that cluster
                if (count[i]==0) {
                    int j = (i+1)%branching;
                    while (count[j]<=1) {
                        j = (j+1)%branching;
                    }

                    for (int k=0; k<indices_length; ++k) {
                        if (belongs_to[k]==j) {
                            // for cluster j, we move the furthest element from the center to the empty cluster i
                            if ( distance_(dataset_[indices[k]], centers[j], veclen_) == radiuses[j] ) {
                                belongs_to[k] = i;
                                count[j]--;
                                count[i]++;
                                break;
                            }
                        }
                    }
                    converged = false;
                }
            }
        }
    }

    /**
     * Lloyd iterations specialized for DNA/RNA sequences packed 4 bases
     * (2 bits each) per byte. For every base position a 4-bin histogram is
     * accumulated and the new center base is the histogram's argmax.
     */
    void refineDnaClustering(int* indices, int indices_length, int branching, CentersType** centers,
                             std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
    {
        for (int i=0; i<branching; ++i) {
            centers[i] = new CentersType[veclen_];
            memoryCounter_ += (int)(veclen_*sizeof(CentersType));
        }

        // HISTOS_PER_BASE (4) bins for each of the BASE_PER_CHAR (4) bases per byte.
        const unsigned int histos_veclen = static_cast<unsigned int>(
                                           veclen_*sizeof(CentersType)*(HISTOS_PER_BASE*BASE_PER_CHAR));
        cv::AutoBuffer<unsigned int> histos_buf(branching*histos_veclen);
        Matrix<unsigned int> histos(histos_buf.data(), branching, histos_veclen);

        bool converged = false;
        int iteration = 0;
        while (!converged && iteration<iterations_) {
            converged = true;
            iteration++;

            // compute the new cluster centers
            for (int i=0; i<branching; ++i) {
                memset(histos[i],0,sizeof(unsigned int)*histos_veclen);
                radiuses[i] = 0;
            }
            for (int i=0; i<indices_length; ++i) {
                unsigned char* vec = (unsigned char*)dataset_[indices[i]];
                unsigned int* h = histos[belongs_to[i]];
                // 16 bins per input byte: 4 bases x 4 possible symbols.
                for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) {
                    h[k +      ((vec[l])    & 0x03)]++;
                    h[k + 4 +  ((vec[l]>>2) & 0x03)]++;
                    h[k + 8 +  ((vec[l]>>4) & 0x03)]++;
                    h[k +12 +  ((vec[l]>>6) & 0x03)]++;
                }
            }
            // Argmax of each 4-bin histogram, re-packed 4 bases per byte.
            for (int i=0; i<branching; ++i) {
                unsigned int* h = histos[i];
                unsigned char* charCenter = (unsigned char*)centers[i];
                for (size_t k=0, l=0; k<histos_veclen; k+=HISTOS_PER_BASE*BASE_PER_CHAR, ++l) {
                    charCenter[l]= (h[k] > h[k+1] ? h[k+2] > h[k+3] ? h[k] > h[k+2] ? 0x00 : 0x10
                                                                    : h[k] > h[k+3] ? 0x00 : 0x11
                                                  : h[k+2] > h[k+3] ? h[k+1] > h[k+2] ? 0x01 : 0x10
                                                                    : h[k+1] > h[k+3] ? 0x01 : 0x11)
                                 | (h[k+4]>h[k+5] ? h[k+6] > h[k+7] ? h[k+4] > h[k+6] ? 0x00 : 0x1000
                                                                    : h[k+4] > h[k+7] ? 0x00 : 0x1100
                                                  : h[k+6] > h[k+7] ? h[k+5] > h[k+6] ? 0x0100 : 0x1000
                                                                    : h[k+5] > h[k+7] ? 0x0100 : 0x1100)
                                 | (h[k+8]>h[k+9] ? h[k+10]>h[k+11] ? h[k+8] >h[k+10] ? 0x00 : 0x100000
                                                                    : h[k+8] >h[k+11] ? 0x00 : 0x110000
                                                  : h[k+10]>h[k+11] ? h[k+9] >h[k+10] ? 0x010000 : 0x100000
                                                                    : h[k+9] >h[k+11] ? 0x010000 : 0x110000)
                                 | (h[k+12]>h[k+13] ? h[k+14]>h[k+15] ? h[k+12] >h[k+14] ? 0x00 : 0x10000000
                                                                      : h[k+12] >h[k+15] ? 0x00 : 0x11000000
                                                    : h[k+14]>h[k+15] ? h[k+13] >h[k+14] ? 0x01000000 : 0x10000000
                                                                      : h[k+13] >h[k+15] ? 0x01000000 : 0x11000000);
                }
            }

            std::vector<int> new_centroids(indices_length);
            std::vector<DistanceType> dists(indices_length);

            // reassign points to clusters
            KMeansDistanceComputer<ElementType**> invoker(
                        distance_, dataset_, branching, indices, centers, veclen_, new_centroids, dists);
            parallel_for_(cv::Range(0, (int)indices_length), invoker);

            for (int i=0; i < indices_length; ++i) {
                DistanceType dist(dists[i]);
                int new_centroid(new_centroids[i]);
                if (dist > radiuses[new_centroid]) {
                    radiuses[new_centroid] = dist;
                }
                if (new_centroid != belongs_to[i]) {
                    count[belongs_to[i]]--;
                    count[new_centroid]++;
                    belongs_to[i] = new_centroid;
                    converged = false;
                }
            }

            for (int i=0; i<branching; ++i) {
                // if one cluster converges to an empty cluster,
                // move an element into that cluster
                if (count[i]==0) {
                    int j = (i+1)%branching;
                    while (count[j]<=1) {
                        j = (j+1)%branching;
                    }

                    for (int k=0; k<indices_length; ++k) {
                        if (belongs_to[k]==j) {
                            // for cluster j, we move the furthest element from the center to the empty cluster i
                            if ( distance_(dataset_[indices[k]], centers[j], veclen_) == radiuses[j] ) {
                                belongs_to[k] = i;
                                count[j]--;
                                count[i]++;
                                break;
                            }
                        }
                    }
                    converged = false;
                }
            }
        }
    }

    /**
     * Recurses into each refined cluster: partitions `indices` in place so the
     * members of every cluster become contiguous, fills in the child node
     * statistics (radius, pivot, variance, mean radius) and calls
     * computeClustering on each child range.
     */
    void computeSubClustering(KMeansNodePtr node, int* indices, int indices_length,
                              int branching, int level, CentersType** centers,
                              std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
    {
        // compute kmeans clustering for each of the resulting clusters
        node->childs = pool_.allocate<KMeansNodePtr>(branching);
        int start = 0;
        int end = start;
        for (int c=0; c<branching; ++c) {
            int s = count[c];

            DistanceType variance = 0;
            DistanceType mean_radius =0;
            for (int i=0; i<indices_length; ++i) {
                if (belongs_to[i]==c) {
                    DistanceType d = distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_);
                    variance += d;
                    mean_radius += static_cast<DistanceType>( sqrt(d) );
                    // compact cluster c's members into indices[start..end)
                    std::swap(indices[i],indices[end]);
                    std::swap(belongs_to[i],belongs_to[end]);
                    end++;
                }
            }
            // E[||x||^2] - ||E[x]||^2 style variance around the center
            variance /= s;
            mean_radius /= s;
            variance -= distance_(centers[c], ZeroIterator<ElementType>(), veclen_);

            node->childs[c] = pool_.allocate<KMeansNode>();
            std::memset(node->childs[c], 0, sizeof(KMeansNode));
            node->childs[c]->radius = radiuses[c];
            node->childs[c]->pivot = centers[c];
            node->childs[c]->variance = variance;
            node->childs[c]->mean_radius = mean_radius;
            computeClustering(node->childs[c],indices+start, end-start, branching, level+1);
            start=end;
        }
    }

    /**
     * Same recursion as computeSubClustering, but accumulates the (integral)
     * bitfield/DNA distances in an unsigned 64-bit accumulator and rounds the
     * averages, since DistanceType overflows/truncates for popcount-style
     * distances.
     */
    void computeAnyBitfieldSubClustering(KMeansNodePtr node, int* indices, int indices_length,
                                         int branching, int level, CentersType** centers,
                                         std::vector<DistanceType>& radiuses, int* belongs_to, int* count)
    {
        // compute kmeans clustering for each of the resulting clusters
        node->childs = pool_.allocate<KMeansNodePtr>(branching);
        int start = 0;
        int end = start;
        for (int c=0; c<branching; ++c) {
            int s = count[c];

            unsigned long long variance = 0ull;
            DistanceType mean_radius =0;
            for (int i=0; i<indices_length; ++i) {
                if (belongs_to[i]==c) {
                    DistanceType d = distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_);
                    variance += static_cast<unsigned long long>( ensureSquareDistance<Distance>(d) );
                    mean_radius += ensureSimpleDistance<Distance>(d);
                    std::swap(indices[i],indices[end]);
                    std::swap(belongs_to[i],belongs_to[end]);
                    end++;
                }
            }
            mean_radius = static_cast<DistanceType>(
                        0.5f + static_cast<float>(mean_radius) / static_cast<float>(s));
            variance = static_cast<unsigned long long>(
                        0.5 + static_cast<double>(variance) / static_cast<double>(s));
            variance -= static_cast<unsigned long long>(
                        ensureSquareDistance<Distance>(
                            distance_(centers[c], ZeroIterator<ElementType>(), veclen_)));

            node->childs[c] = pool_.allocate<KMeansNode>();
            std::memset(node->childs[c], 0, sizeof(KMeansNode));
            node->childs[c]->radius = radiuses[c];
            node->childs[c]->pivot = centers[c];
            node->childs[c]->variance = static_cast<DistanceType>(variance);
            node->childs[c]->mean_radius = mean_radius;
            computeClustering(node->childs[c],indices+start, end-start, branching, level+1);
            start=end;
        }
    }


    /**
     * Generic dispatch: for any distance type without a specialized overload,
     * refine with the numeric (double accumulator) kmeans and recurse.
     * `identifier` is an unused tag parameter that selects this overload.
     */
    template<typename DistType>
    void refineAndSplitClustering(
            KMeansNodePtr node, int* indices, int indices_length, int branching,
            int level, CentersType** centers, std::vector<DistanceType>& radiuses,
            int* belongs_to, int* count, const DistType* identifier)
    {
        (void)identifier;
        refineClustering(indices, indices_length, branching, centers, radiuses, belongs_to, count);

        computeSubClustering(node, indices, indices_length, branching,
                             level, centers, radiuses, belongs_to, count);
    }


    /**
     * The methods responsible with doing the recursive hierarchical clustering on
     * binary vectors.
     * As some might have heard that KMeans on binary data doesn't make sense,
     * it's worth a little explanation why it actually fairly works. As
     * with the Hierarchical Clustering algorithm, we seed several centers for the
     * current node by picking some of its points. Then in a first pass each point
     * of the node is then related to its closest center. Now let's have a look at
     * the 5 central dimensions of the 9 following points:
     *
     * xxxxxx11100xxxxx (1)
     * xxxxxx11010xxxxx (2)
     * xxxxxx11001xxxxx (3)
     * xxxxxx10110xxxxx (4)
     * xxxxxx10101xxxxx (5)
     * xxxxxx10011xxxxx (6)
     * xxxxxx01110xxxxx (7)
     * xxxxxx01101xxxxx (8)
     * xxxxxx01011xxxxx (9)
     * sum   _____
     * of 1: 66555
     *
     * Even if the barycenter notion doesn't apply, we can set a center
     * xxxxxx11111xxxxx that will better fit the five dimensions we are focusing
     * on for these points.
     *
     * Note that convergence isn't ensured anymore.
In practice, using Gonzales
     * as seeding algorithm should be fine for getting convergence ("iterations"
     * value can be set to -1). But with KMeans++ seeding you should definitely
     * set a maximum number of iterations (but make it higher than the "iterations"
     * default value of 11).
     *
     * Params:
     *     node = the node to cluster
     *     indices = indices of the points belonging to the current node
     *     indices_length = number of points in the current node
     *     branching = the branching factor to use in the clustering
     *     level = 0 for the root node, it increases with the subdivision levels
     *     centers = clusters centers to compute
     *     radiuses = radiuses of clusters
     *     belongs_to = LookUp Table returning, for a given indice id, the center id it belongs to
     *     count = array storing the number of indices for a given center id
     *     identifier = dummy pointer on an instance of Distance (use to branch correctly among templates)
     */
    void refineAndSplitClustering(
            KMeansNodePtr node, int* indices, int indices_length, int branching,
            int level, CentersType** centers, std::vector<DistanceType>& radiuses,
            int* belongs_to, int* count, const cvflann::HammingLUT* identifier)
    {
        (void)identifier;
        refineBitfieldClustering(
                indices, indices_length, branching, centers, radiuses, belongs_to, count);

        computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
                                        level, centers, radiuses, belongs_to, count);
    }

    // Overload selected for Hamming<unsigned char>: bitfield refinement + recursion.
    void refineAndSplitClustering(
            KMeansNodePtr node, int* indices, int indices_length, int branching,
            int level, CentersType** centers, std::vector<DistanceType>& radiuses,
            int* belongs_to, int* count, const cvflann::Hamming<unsigned char>* identifier)
    {
        (void)identifier;
        refineBitfieldClustering(
                indices, indices_length, branching, centers, radiuses, belongs_to, count);

        computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
                                        level, centers, radiuses, belongs_to, count);
    }

    // Overload selected for Hamming2<unsigned char>: bitfield refinement + recursion.
    void refineAndSplitClustering(
            KMeansNodePtr node, int* indices, int indices_length, int branching,
            int level, CentersType** centers, std::vector<DistanceType>& radiuses,
            int* belongs_to, int* count, const cvflann::Hamming2<unsigned char>* identifier)
    {
        (void)identifier;
        refineBitfieldClustering(
                indices, indices_length, branching, centers, radiuses, belongs_to, count);

        computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
                                        level, centers, radiuses, belongs_to, count);
    }

    // Overload selected for DNAmmingLUT: DNA histogram refinement + recursion.
    void refineAndSplitClustering(
            KMeansNodePtr node, int* indices, int indices_length, int branching,
            int level, CentersType** centers, std::vector<DistanceType>& radiuses,
            int* belongs_to, int* count, const cvflann::DNAmmingLUT* identifier)
    {
        (void)identifier;
        refineDnaClustering(
                indices, indices_length, branching, centers, radiuses, belongs_to, count);

        computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
                                        level, centers, radiuses, belongs_to, count);
    }

    // Overload selected for DNAmming2<unsigned char>: DNA histogram refinement + recursion.
    void refineAndSplitClustering(
            KMeansNodePtr node, int* indices, int indices_length, int branching,
            int level, CentersType** centers, std::vector<DistanceType>& radiuses,
            int* belongs_to, int* count, const cvflann::DNAmming2<unsigned char>* identifier)
    {
        (void)identifier;
        refineDnaClustering(
                indices, indices_length, branching, centers, radiuses, belongs_to, count);

        computeAnyBitfieldSubClustering(node, indices, indices_length, branching,
                                        level, centers, radiuses, belongs_to, count);
    }


    /**
     * The method responsible with actually doing the recursive hierarchical
     * clustering
     *
     * Params:
     *     node = the node to cluster
     *     indices = indices of the points belonging to the current node
     *     branching = the branching factor to use in the clustering
     *
     * TODO: for 1-sized clusters don't store a cluster center (it's the same as the single cluster point)
     */
    void computeClustering(KMeansNodePtr node, int* indices, int indices_length, int branching, int level)
    {
        node->size = indices_length;
        node->level = level;

        // Too few points to split: make a (sorted) leaf.
        if (indices_length < branching) {
            node->indices = indices;
            std::sort(node->indices,node->indices+indices_length);
            node->childs = NULL;
            return;
        }

        // Seed the initial centers using the configured strategy
        // (chooseCenters is a pointer-to-member set elsewhere in the class).
        cv::AutoBuffer<int> centers_idx_buf(branching);
        int* centers_idx = centers_idx_buf.data();
        int centers_length;
        (this->*chooseCenters)(branching, indices, indices_length, centers_idx, centers_length);

        // Seeding could not produce enough distinct centers: make a leaf.
        if (centers_length<branching) {
            node->indices = indices;
            std::sort(node->indices,node->indices+indices_length);
            node->childs = NULL;
            return;
        }


        std::vector<DistanceType> radiuses(branching);
        cv::AutoBuffer<int> count_buf(branching);
        int* count = count_buf.data();
        for (int i=0; i<branching; ++i) {
            radiuses[i] = 0;
            count[i] = 0;
        }

        // assign points to clusters
        cv::AutoBuffer<int> belongs_to_buf(indices_length);
        int* belongs_to = belongs_to_buf.data();
        for (int i=0; i<indices_length; ++i) {
            DistanceType sq_dist = distance_(dataset_[indices[i]], dataset_[centers_idx[0]], veclen_);
            belongs_to[i] = 0;
            for (int j=1; j<branching; ++j) {
                DistanceType new_sq_dist = distance_(dataset_[indices[i]], dataset_[centers_idx[j]], veclen_);
                if (sq_dist>new_sq_dist) {
                    belongs_to[i] = j;
                    sq_dist = new_sq_dist;
                }
            }
            if (sq_dist>radiuses[belongs_to[i]]) {
                radiuses[belongs_to[i]] = sq_dist;
            }
            count[belongs_to[i]]++;
        }

        CentersType** centers = new CentersType*[branching];

        // The dummy Distance* selects the right refineAndSplitClustering
        // overload at compile time (bitfield / DNA / generic numeric).
        Distance* dummy = NULL;
        refineAndSplitClustering(node, indices, indices_length, branching, level,
                                 centers, radiuses, belongs_to, count, dummy);

        // Only the array of pointers is freed here; the center vectors
        // themselves are now owned by the child nodes (their pivots).
        delete[] centers;
    }


    /**
     * Performs one descent in the hierarchical k-means tree. The branches not
     * visited are stored in a priority queue.
1521 * 1522 * Params: 1523 * node = node to explore 1524 * result = container for the k-nearest neighbors found 1525 * vec = query points 1526 * checks = how many points in the dataset have been checked so far 1527 * maxChecks = maximum dataset points to checks 1528 */ 1529 1530 findNN(KMeansNodePtr node,ResultSet<DistanceType> & result,const ElementType * vec,int & checks,int maxChecks,Heap<BranchSt> * heap)1531 void findNN(KMeansNodePtr node, ResultSet<DistanceType>& result, const ElementType* vec, int& checks, int maxChecks, 1532 Heap<BranchSt>* heap) 1533 { 1534 // Ignore those clusters that are too far away 1535 { 1536 DistanceType bsq = distance_(vec, node->pivot, veclen_); 1537 DistanceType rsq = node->radius; 1538 DistanceType wsq = result.worstDist(); 1539 1540 if (isSquareDistance<Distance>()) 1541 { 1542 DistanceType val = bsq-rsq-wsq; 1543 if ((val>0) && (val*val > 4*rsq*wsq)) 1544 return; 1545 } 1546 else 1547 { 1548 if (bsq-rsq > wsq) 1549 return; 1550 } 1551 } 1552 1553 if (node->childs==NULL) { 1554 if ((checks>=maxChecks) && result.full()) { 1555 return; 1556 } 1557 checks += node->size; 1558 for (int i=0; i<node->size; ++i) { 1559 int index = node->indices[i]; 1560 DistanceType dist = distance_(dataset_[index], vec, veclen_); 1561 result.addPoint(dist, index); 1562 } 1563 } 1564 else { 1565 DistanceType* domain_distances = new DistanceType[branching_]; 1566 int closest_center = exploreNodeBranches(node, vec, domain_distances, heap); 1567 delete[] domain_distances; 1568 findNN(node->childs[closest_center],result,vec, checks, maxChecks, heap); 1569 } 1570 } 1571 1572 /** 1573 * Helper function that computes the nearest childs of a node to a given query point. 1574 * Params: 1575 * node = the node 1576 * q = the query point 1577 * distances = array with the distances to each child node. 
1578 * Returns: 1579 */ exploreNodeBranches(KMeansNodePtr node,const ElementType * q,DistanceType * domain_distances,Heap<BranchSt> * heap)1580 int exploreNodeBranches(KMeansNodePtr node, const ElementType* q, DistanceType* domain_distances, Heap<BranchSt>* heap) 1581 { 1582 1583 int best_index = 0; 1584 domain_distances[best_index] = distance_(q, node->childs[best_index]->pivot, veclen_); 1585 for (int i=1; i<branching_; ++i) { 1586 domain_distances[i] = distance_(q, node->childs[i]->pivot, veclen_); 1587 if (domain_distances[i]<domain_distances[best_index]) { 1588 best_index = i; 1589 } 1590 } 1591 1592 // float* best_center = node->childs[best_index]->pivot; 1593 for (int i=0; i<branching_; ++i) { 1594 if (i != best_index) { 1595 domain_distances[i] -= cvflann::round<DistanceType>( 1596 cb_index_*node->childs[i]->variance ); 1597 1598 // float dist_to_border = getDistanceToBorder(node.childs[i].pivot,best_center,q); 1599 // if (domain_distances[i]<dist_to_border) { 1600 // domain_distances[i] = dist_to_border; 1601 // } 1602 heap->insert(BranchSt(node->childs[i],domain_distances[i])); 1603 } 1604 } 1605 1606 return best_index; 1607 } 1608 1609 1610 /** 1611 * Function the performs exact nearest neighbor search by traversing the entire tree. 
1612 */ findExactNN(KMeansNodePtr node,ResultSet<DistanceType> & result,const ElementType * vec)1613 void findExactNN(KMeansNodePtr node, ResultSet<DistanceType>& result, const ElementType* vec) 1614 { 1615 // Ignore those clusters that are too far away 1616 { 1617 DistanceType bsq = distance_(vec, node->pivot, veclen_); 1618 DistanceType rsq = node->radius; 1619 DistanceType wsq = result.worstDist(); 1620 1621 if (isSquareDistance<Distance>()) 1622 { 1623 DistanceType val = bsq-rsq-wsq; 1624 if ((val>0) && (val*val > 4*rsq*wsq)) 1625 return; 1626 } 1627 else 1628 { 1629 if (bsq-rsq > wsq) 1630 return; 1631 } 1632 } 1633 1634 1635 if (node->childs==NULL) { 1636 for (int i=0; i<node->size; ++i) { 1637 int index = node->indices[i]; 1638 DistanceType dist = distance_(dataset_[index], vec, veclen_); 1639 result.addPoint(dist, index); 1640 } 1641 } 1642 else { 1643 int* sort_indices = new int[branching_]; 1644 1645 getCenterOrdering(node, vec, sort_indices); 1646 1647 for (int i=0; i<branching_; ++i) { 1648 findExactNN(node->childs[sort_indices[i]],result,vec); 1649 } 1650 1651 delete[] sort_indices; 1652 } 1653 } 1654 1655 1656 /** 1657 * Helper function. 1658 * 1659 * I computes the order in which to traverse the child nodes of a particular node. 
1660 */ getCenterOrdering(KMeansNodePtr node,const ElementType * q,int * sort_indices)1661 void getCenterOrdering(KMeansNodePtr node, const ElementType* q, int* sort_indices) 1662 { 1663 DistanceType* domain_distances = new DistanceType[branching_]; 1664 for (int i=0; i<branching_; ++i) { 1665 DistanceType dist = distance_(q, node->childs[i]->pivot, veclen_); 1666 1667 int j=0; 1668 while (domain_distances[j]<dist && j<i) 1669 j++; 1670 for (int k=i; k>j; --k) { 1671 domain_distances[k] = domain_distances[k-1]; 1672 sort_indices[k] = sort_indices[k-1]; 1673 } 1674 domain_distances[j] = dist; 1675 sort_indices[j] = i; 1676 } 1677 delete[] domain_distances; 1678 } 1679 1680 /** 1681 * Method that computes the squared distance from the query point q 1682 * from inside region with center c to the border between this 1683 * region and the region with center p 1684 */ getDistanceToBorder(DistanceType * p,DistanceType * c,DistanceType * q)1685 DistanceType getDistanceToBorder(DistanceType* p, DistanceType* c, DistanceType* q) 1686 { 1687 DistanceType sum = 0; 1688 DistanceType sum2 = 0; 1689 1690 for (int i=0; i<veclen_; ++i) { 1691 DistanceType t = c[i]-p[i]; 1692 sum += t*(q[i]-(c[i]+p[i])/2); 1693 sum2 += t*t; 1694 } 1695 1696 return sum*sum/sum2; 1697 } 1698 1699 1700 /** 1701 * Helper function the descends in the hierarchical k-means tree by splitting those clusters that minimize 1702 * the overall variance of the clustering. 
* Params:
     *     root = root node
     *     clusters = array with clusters centers (return value)
     *     varianceValue = variance of the clustering (return value)
     * Returns: number of clusters stored in `clusters`
     */
    int getMinVarianceClusters(KMeansNodePtr root, KMeansNodePtr* clusters, int clusters_length, DistanceType& varianceValue)
    {
        int clusterCount = 1;
        clusters[0] = root;

        // Size-weighted total variance of the current set of clusters;
        // updated incrementally as nodes are split.
        DistanceType meanVariance = root->variance*root->size;

        while (clusterCount<clusters_length) {
            DistanceType minVariance = (std::numeric_limits<DistanceType>::max)();
            int splitIndex = -1;

            // Find the interior node whose replacement by its children lowers
            // the weighted variance the most.
            for (int i=0; i<clusterCount; ++i) {
                if (clusters[i]->childs != NULL) {

                    DistanceType variance = meanVariance - clusters[i]->variance*clusters[i]->size;

                    for (int j=0; j<branching_; ++j) {
                        variance += clusters[i]->childs[j]->variance*clusters[i]->childs[j]->size;
                    }
                    if (variance<minVariance) {
                        minVariance = variance;
                        splitIndex = i;
                    }
                }
            }

            // Stop when no splittable node remains, or splitting would exceed
            // the requested number of clusters.
            if (splitIndex==-1) break;
            if ( (branching_+clusterCount-1) > clusters_length) break;

            meanVariance = minVariance;

            // split node: the first child replaces the parent in place, the
            // remaining children are appended.
            KMeansNodePtr toSplit = clusters[splitIndex];
            clusters[splitIndex] = toSplit->childs[0];
            for (int i=1; i<branching_; ++i) {
                clusters[clusterCount++] = toSplit->childs[i];
            }
        }

        varianceValue = meanVariance/root->size;
        return clusterCount;
    }

private:
    /** The branching factor used in the hierarchical k-means clustering */
    int branching_;

    /** Number of kmeans trees (default is one) */
    int trees_;

    /** Maximum number of iterations to use when performing k-means clustering */
    int iterations_;

    /** Algorithm for choosing the cluster centers */
    flann_centers_init_t centers_init_;

    /**
     *
Cluster border index. This is used in the tree search phase when determining
     * the closest cluster to explore next. A zero value takes into account only
     * the cluster centres, a value greater than zero also takes into account the size
     * of the cluster.
     */
    float cb_index_;

    /**
     * The dataset used by this index
     */
    const Matrix<ElementType> dataset_;

    /** Index parameters */
    IndexParams index_params_;

    /**
     * Number of features in the dataset.
     */
    size_t size_;

    /**
     * Length of each feature.
     */
    size_t veclen_;

    /**
     * The root node in the tree.
     * (Pointer type suggests an array, presumably one root per kmeans tree
     * given trees_ above -- confirm against the construction code.)
     */
    KMeansNodePtr* root_;

    /**
     * Array of indices to vectors in the dataset.
     */
    int** indices_;

    /**
     * The distance
     */
    Distance distance_;

    /**
     * Pooled memory allocator.
     */
    PooledAllocator pool_;

    /**
     * Memory occupied by the index.
     */
    int memoryCounter_;
};

}

//! @endcond

#endif //OPENCV_FLANN_KMEANS_INDEX_H_