1 // This is brl/bbas/bsta/bsta_k_medoid.h
2 #ifndef bsta_k_medoid_h_
3 #define bsta_k_medoid_h_
4 //:
5 // \file
6 // \brief Form k clusters using distance to representative objects (medoids)
7 // \author Joseph Mundy
8 // \date June 11, 2005
9 //
10 // A clustering algorithm based on the distance within the cluster to
11 // a representative element and the total distance between representatives.
12 // The input is a n x n matrix of pairwise distances
13 // The output is a set of k representatives (medoids) that minimize the
14 // sum of the average distance from each medoid to other elements
15 // closest to the medoid and the average distance between medoids.
16 //
17 // For k = 1 the medoid would be the element that minimizes the average
18 // distance to all other elements. The elements are indexed from 0 to n-1.
19 //
20 // Fairly computationally expensive:  The space requirement is n x n
21 // The time is k x (n - k) x (n - k) x number of swaps to minimize
22 // total distance.  There might be on the order of k swaps (or worse).
23 
24 #include <vector>
25 #include <iostream>
26 #include <cassert>
27 #ifdef _MSC_VER
28 #  include <vcl_msvc_warnings.h>
29 #endif
30 #include <vbl/vbl_array_2d.h>
31 
32 class bsta_k_medoid
33 {
34  public:
35   bsta_k_medoid(unsigned n_elements, bool verbose = false);
36   ~bsta_k_medoid()= default;
37 
38   //: insert a distance into the array, the entry j, i is automatically added
insert_distance(const unsigned i,const unsigned j,double d)39   inline void insert_distance(const unsigned i, const unsigned j, double d)
40   {assert((i<n_elements_)&&(j<n_elements_));
41   distance_array_[i][j] = d; distance_array_[j][i] = d;}
42 
43   //: The distance between two elements
distance(const unsigned i,const unsigned j)44   inline double distance(const unsigned i, const unsigned j) const
45     {assert((i<n_elements_)&&(j<n_elements_)); return distance_array_[i][j];}
46 
47   //: cluster the elements using k medoids
48   void do_clustering(const unsigned k);
49 
50   //:get number of medoids
k()51   inline unsigned k() const
52     {return medoids_.size();}
53 
54   //: get a medoid
medoid(const unsigned i)55   unsigned medoid(const unsigned i) const
56     {assert(i<medoids_.size()); return medoids_[i];}
57 
58   //: is an element a medoid?
59   bool is_medoid(const unsigned i) const;
60 
61   //: number of elements in cluster k
size(const unsigned k)62   inline unsigned size(const unsigned k) const
63     {assert(k<this->k());return clusters_[k].size();}
64 
65   //: the elements in cluster k
elements(const unsigned k)66   inline std::vector<unsigned> elements(const unsigned k)
67     {assert(k<this->k());return clusters_[k];}
68 
69   //: is an element in cluster k ?
70   bool in_cluster(const unsigned i, const unsigned k) const;
71 
72   //:the distance between an element and its medoid
73   double medoid_distance(const unsigned i) const;
74 
75   //: the total distance between elements and the medoid in cluster k
76   double total_distance(const unsigned k) const;
77 
78   //: print distance array (for debugging)
79   inline void print_distance_array(std::ostream & str = std::cout)
80     {str << '\n' << distance_array_ << '\n';}
81 
82  protected:
83 
84   //: avg distance change for element i resulting from swapping medoids, j->k.
85   double dc(const unsigned i, const unsigned j, const unsigned k);
86 
87   //: avg inter-medoid distance change resulting from swapping medoids, j->k.
88   double dcm(const unsigned j, const unsigned k);
89 
90   //: replace medoid k with medoid k in the set of medoids
91   bool replace_medoid(const unsigned j, const unsigned k);
92 
93   //: determine if a swap of j with k leads to a reduction in distance
94   bool test_medoid_swap(unsigned& mj, unsigned& mk);
95 
96   //: clear the cluster sets
97   void clear_clusters();
98 
99   //: assign non-medoids to their nearest medoid, forming clusters
100   void form_clusters();
101 
102  private:
103   //: print useful debug messages
104   bool verbose_;
105 
106   //: the size of the distance array
107   unsigned n_elements_;
108 
109   //: the k medoids
110   std::vector<unsigned> medoids_;
111 
112   //: The set of elements closest to a given medoid
113   std::vector<std::vector<unsigned> > clusters_;
114 
115   //: The array of pair-wise distances between elements
116   vbl_array_2d<double> distance_array_;
117 };
118 
119 #endif // bsta_k_medoid_h_
120