1 // Copyright (c) 2014  GeometryFactory Sarl (France).
2 // All rights reserved.
3 //
4 // This file is part of CGAL (www.cgal.org).
5 //
6 // $URL: https://github.com/CGAL/cgal/blob/v5.3/Surface_mesh_segmentation/include/CGAL/internal/Surface_mesh_segmentation/K_means_clustering.h $
7 // $Id: K_means_clustering.h ab14acf 2021-03-23T13:14:41+01:00 Simon Giraudot
8 // SPDX-License-Identifier: GPL-3.0-or-later OR LicenseRef-Commercial
9 //
10 // Author(s)     : Ilker O. Yaz
11 
12 
13 #ifndef CGAL_SURFACE_MESH_SEGMENTATION_K_MEANS_CLUSTERING_H
14 #define CGAL_SURFACE_MESH_SEGMENTATION_K_MEANS_CLUSTERING_H
15 
16 #include <CGAL/license/Surface_mesh_segmentation.h>
17 
18 
19 #include <vector>
20 #include <set>
21 #include <cmath>
22 #include <limits>
23 #include <algorithm>
24 
25 #include <CGAL/assertions.h>
26 #include <CGAL/Random.h>
27 
28 #define CGAL_DEFAULT_MAXIMUM_ITERATION 10u
29 #define CGAL_DEFAULT_NUMBER_OF_RUN 15u
30 #define CGAL_DEFAULT_SEED 1340818006
31 
32 namespace CGAL
33 {
34 /// @cond CGAL_DOCUMENT_INTERNAL
35 namespace internal
36 {
37 
38 /**
39  * Class providing initialization functionality: Forgy initialization, k++ initialization
40  *
41  * TODO: rand() might be changed with a generator for larger 'range'
42  */
43 class Selector
44 {
45 public:
46   /**
47    * Selects random samples from `points` and put them into `centers`. Does not select one sample more than once as center.
48    * T2 should be constructable by T1
49    *
50    * Implementation note: it is a variant of Floyd generator, and has uniform distribution
51    * where k = number of centers = complexity is O(k log k), and mem overhead is O(k)
52    *
53    * I also left previous implementation below, it might be useful where number of centers close to number of points
54    */
55   template<class T1, class T2>
forgy_initialization(std::size_t number_of_centers,const std::vector<T1> & points,std::vector<T2> & centers,CGAL::Random & random)56   void forgy_initialization(std::size_t number_of_centers,
57                             const std::vector<T1>& points, std::vector<T2>& centers, CGAL::Random& random) {
58     centers.reserve(number_of_centers);
59     std::set<std::size_t> selected;
60 
61     for(std::size_t i = 0; i < number_of_centers; ++i) {
62       std::size_t random_range = points.size() - number_of_centers +
63                                  i; // activate one more element in each iteration for as selectable
64       std::size_t random_index = random(random_range + 1); // [0, random_range];
65 
66       std::pair<std::set<std::size_t>::iterator, bool> random_index_unique =
67         selected.insert(random_index);
68       if(!random_index_unique.second) {
69         random_index_unique = selected.insert(
70                                 random_range); // random_range can not be inside random_index_unique,
71         CGAL_assertion(
72           random_index_unique.second);          // since we just increment it by one
73       }
74 
75       centers.push_back(points[ *(random_index_unique.first) ]);
76     }
77   }
78 
79   // To future reference, I also left prev implementation which is a variant of Fisher–Yates shuffle, however to keep `points` intact I use another vector to
80   // store and swap indices.
81   // where n = number of points; complexity = O(n), memory overhead = O(n)
82   /*
83   template<class T1, class T2>
84   void forgy_initialization(std::size_t number_of_centers, const std::vector<T1>& points, std::vector<T2>& centers)
85   {
86       std::vector<std::size_t> indices(points.size()); // it is required to not to swap points
87       for(std::size_t i = 0; i < points.size(); ++i) { indices[i] = i; }
88 
89       centers.reserve(number_of_centers);
90       for(std::size_t i = 0; i < number_of_centers; ++i)
91       {
92           std::size_t random_index = rand() % (points.size() - i); // select a random index between 0 and not selected sample count
93           centers.push_back(points[ indices[random_index] ]);
94 
95           std::swap(indices[random_index], indices[points.size() -i -1]); // not to select again, swap at the current-end
96       }
97   }
98   */
99 
100   /**
101    * Selects samples from `points` using K-means++ algorithm and put them into `centers`.
102    * Does not select one sample more than once as center.
103    * Probability of a point to become a center is proportional to its squared distance to the closest center.
104    *
105    * T2 should be constructable by T1, and both T1 and T2 should have conversion operator to double.
106    */
107   template<class T1, class T2>
plus_plus_initialization(std::size_t number_of_centers,const std::vector<T1> & points,std::vector<T2> & centers,CGAL::Random & random)108   void plus_plus_initialization(std::size_t number_of_centers,
109                                 const std::vector<T1>& points, std::vector<T2>& centers, CGAL::Random& random) {
110     centers.reserve(number_of_centers);
111 
112     std::vector<double> distance_square(points.size(),
113                                         (std::numeric_limits<double>::max)());
114     std::vector<double> distance_square_cumulative(points.size());
115 
116     // distance_square stores squared distance to the closest center for each point.
117     // say, "distance_square" ->            [ 0.1, 0.2, 0.3, 0.0, 0.2 ... ]
118     // then distance_square_cumulative ->   [ 0.1, 0.3, 0.6, 0.6, 0.8 ... ]
119     std::size_t initial_index = random(points.size()); // [0, points size)
120     centers.push_back(points[initial_index]);
121 
122     for(std::size_t i = 1; i < number_of_centers; ++i) {
123       double cumulative_distance_square = 0.0;
124       // distance_square holds closest distance that points have, so just test new coming center (i.e. centers.back())
125       for(std::size_t j = 0; j < points.size(); ++j) {
126         double new_distance = CGAL::square(centers.back() - points[j]);
127         if(new_distance < distance_square[j]) {
128           distance_square[j] = new_distance;
129         }
130         cumulative_distance_square += distance_square[j];
131         distance_square_cumulative[j] = cumulative_distance_square;
132       }
133 
134       // check whether there is a way to select a new center
135       double total_probability = distance_square_cumulative.back();
136       CGAL_assertion( total_probability != 0.0 &&
137                       "Squared distances of all elements to their closest centers are 0.0, there is no way to select a new center.");
138 
139       double random_ds = random.get_double(0.0,
140                                            total_probability); // [0.0, total_probability)
141 
142       // this can not select end(), since random_ds < total_probability (i.e. distance_square_cumulative.back())
143       // this can not select an already selected item since either (by considering that upper bounds returns greater)
144       //  - aready selected item is at 0, and its value is 0.0
145       //  - or its value is equal to value of previous element
146       std::size_t selection_index = std::upper_bound(
147                                       distance_square_cumulative.begin(), distance_square_cumulative.end(), random_ds)
148                                     - distance_square_cumulative.begin();
149 
150       centers.push_back(points[selection_index]);
151     }
152   }
153 };
154 
155 class K_means_center;
156 
157 /**
158   * @brief Represents points in k-means algorithm.
159   * @see K_means_center, K_means_clustering
160   */
161 class K_means_point
162 {
163 public:
164   double data;      /**< Location of the point */
165   std::size_t    center_id; /**< Closest center to the point */
166   K_means_point(double data,
167                 std::size_t center_id = (std::numeric_limits<std::size_t>::max)())
data(data)168     : data(data), center_id(center_id) {
169   }
170 
171   operator double() const {
172     return data;
173   }
174 
175   bool calculate_new_center(std::vector<K_means_center>& centers);
176 };
177 
178 /**
179   * @brief Represents centers in k-means algorithm.
180   * @see K_means_point, K_means_clustering
181   */
182 class K_means_center
183 {
184 public:
185   double mean; /**< Mean of the center */
186 private:
187   double new_mean;
188   std::size_t    new_number_of_points;
189   bool   empty;
190 
191 public:
192 
K_means_center(double mean)193   K_means_center(double mean): mean(mean), new_mean(0.0), new_number_of_points(0),
194     empty(true) {
195   }
196 
K_means_center(const K_means_point & point)197   K_means_center(const K_means_point& point) : mean(point.data), new_mean(0.0),
198     new_number_of_points(0), empty(true) {
199   }
200 
201   operator double() const {
202     return mean;
203   }
204 
205   /**
206     * Called by a point for adding itself to mean of the center (i.e. registering).
207     * @param data location of the point
208     */
add_point(double data)209   void add_point(double data) {
210     ++new_number_of_points;
211     new_mean += data;
212   }
213   /**
214     * Called after every point is registered to its closest center
215     * @see add_point()
216     */
calculate_mean()217   void calculate_mean() {
218     if(new_number_of_points ==
219         0) { // just return and this will leave `mean` as it is,
220       empty = true;                 // `new_mean` is 0.0 since `new_number_of_points` is 0
221       return;
222     }
223 
224     empty = false;
225     mean = new_mean / new_number_of_points;
226     new_number_of_points = 0;
227     new_mean = 0.0;
228   }
229 
is_empty()230   bool is_empty() {
231     return empty;
232   }
233 
234   /** A comparator for sorting centers in ascending order. */
235   bool operator < (const K_means_center& center) const {
236     return mean < center.mean;
237   }
238 };
239 
240 /**
241 * Finds closest center and adds itself to the closest center's mean.
242 * @param centers available centers
243 * @return true if #center_id is changed (i.e. new center != previous center)
244 */
calculate_new_center(std::vector<K_means_center> & centers)245 inline bool K_means_point::calculate_new_center(std::vector<K_means_center>&
246     centers)
247 {
248   std::size_t new_center_id = 0;
249   double min_distance = std::abs(centers[0].mean - data);
250   for(std::size_t i = 1; i < centers.size(); ++i) {
251     double new_distance = std::abs(centers[i].mean - data);
252     if(new_distance < min_distance) {
253       new_center_id = i;
254       min_distance = new_distance;
255     }
256   }
257   bool is_center_changed = (new_center_id != center_id);
258   center_id = new_center_id;
259 
260   centers[center_id].add_point(data);
261   return is_center_changed;
262 }
263 
264 
265 /**
266  * @brief K-means clustering algorithm.
267  * @see K_means_point, K_means_center
268  */
269 class K_means_clustering
270 {
271 public:
272   /** Types of algorithms for random center selection. */
273   enum Initialization_types {
274     RANDOM_INITIALIZATION, /**< place initial centers randomly */
275     PLUS_INITIALIZATION    /**< place initial centers using k-means++ algorithm */
276   };
277 
278 private:
279   std::vector<K_means_center> centers;
280   std::vector<K_means_point>  points;
281   std::size_t  maximum_iteration;
282 
283   Initialization_types init_type;
284 
285   CGAL::Random random;
286 public:
287   /**
288    * @pre @a number_of_centers should be positive
289    * @pre size of @a data should be no smaller than number_of_centers
290    *
291    * Constructs structures and runs the algorithm.
292    * K-means algorithm is repeated number_of_run times, and the result which has minimum within cluster error is kept.
293    * @param number_of_centers
294    * @param data
295    * @param init_type initialization type for random center selection
296    * @param number_of_run number of times to repeat k-means algorithm
297    * @param maximum_iteration maximum allowed iteration in a single k-means algorithm call
298    *
299    *
300    */
301   K_means_clustering(std::size_t number_of_centers,
302                      const std::vector<double>& data,
303                      Initialization_types init_type = PLUS_INITIALIZATION,
304                      std::size_t number_of_run = CGAL_DEFAULT_NUMBER_OF_RUN,
305                      std::size_t maximum_iteration = CGAL_DEFAULT_MAXIMUM_ITERATION)
306     :
307     points(data.begin(), data.end()),
308     maximum_iteration(maximum_iteration),
309     init_type(init_type),
310     random(CGAL_DEFAULT_SEED) {
311     CGAL_precondition(data.size() >= number_of_centers
312                       && "Number of centers can not be more than number of data.");
313 
314     calculate_clustering_with_multiple_run(number_of_centers, number_of_run);
315     sort(centers.begin(), centers.end());
316   }
317 
318   /**
319    * Fills data_center by the id of the closest center for each point.
320    * @param[out] data_centers
321    */
fill_with_center_ids(std::vector<std::size_t> & data_centers)322   void fill_with_center_ids(std::vector<std::size_t>& data_centers) {
323     data_centers.reserve(points.size());
324     for(std::vector<K_means_point>::iterator point_it = points.begin();
325         point_it != points.end(); ++point_it) {
326       point_it->calculate_new_center(centers); // find closest center
327       data_centers.push_back(point_it->center_id);
328     }
329   }
330 
331   template<class T>
fill_with_centers(std::vector<T> & center_means)332   void fill_with_centers(std::vector<T>& center_means) {
333     for(std::vector<K_means_center>::iterator center_it = centers.begin();
334         center_it != centers.end(); ++center_it) {
335       center_means.push_back(center_it->mean);
336     }
337   }
338 
339 private:
340   /**
341    * Initializes centers by choosing random points from data. Does not select one points more than once as center.
342    * @param number_of_centers
343    */
initiate_centers_randomly(std::size_t number_of_centers)344   void initiate_centers_randomly(std::size_t number_of_centers) {
345     centers.clear();
346     Selector().forgy_initialization(number_of_centers, points, centers, random);
347   }
348 
349   /**
350    * Initializes centers by using K-means++ algorithm.
351    * Probability of a point to become a center is proportional to its squared distance to the closest center.
352    * @param number_of_centers
353    */
initiate_centers_plus_plus(std::size_t number_of_centers)354   void initiate_centers_plus_plus(std::size_t number_of_centers) {
355     centers.clear();
356     Selector().plus_plus_initialization(number_of_centers, points, centers, random);
357   }
358 
359   /**
360    * One iteration of k-means algorithm.
361    * @return true if any closest center to a point is changed
362    */
iterate()363   bool iterate() {
364     bool any_center_changed = false;
365     // For each point, calculate its new center
366     for(std::vector<K_means_point>::iterator point_it = points.begin();
367         point_it != points.end(); ++point_it) {
368       bool center_changed = point_it->calculate_new_center(centers);
369       any_center_changed |= center_changed;
370     }
371     // For each center, calculate its new mean
372     for(std::vector<K_means_center>::iterator center_it = centers.begin();
373         center_it != centers.end(); ++center_it) {
374       center_it->calculate_mean();
375     }
376     return any_center_changed;
377   }
378 
379   /**
380    * Main entry point for k-means algorithm.
381    * Iterates until convergence occurs (i.e. no point changes its center) or maximum iteration limit is reached.
382    */
calculate_clustering()383   void calculate_clustering() {
384     std::size_t iteration_count = 0;
385     bool any_center_changed = true;
386     while(any_center_changed && iteration_count++ < maximum_iteration) {
387       any_center_changed = iterate();
388     }
389   }
390 
391   /**
392    * Calls calculate_clustering() @a number_of_run times,
393    * and keeps the result which has minimum within cluster error.
394    * @param number_of_centers
395    * @param number_of_run
396    * @see calculate_clustering(), within_cluster_sum_of_squares()
397    */
calculate_clustering_with_multiple_run(std::size_t number_of_centers,std::size_t number_of_run)398   void calculate_clustering_with_multiple_run(std::size_t number_of_centers,
399       std::size_t number_of_run) {
400     std::vector<K_means_center> min_centers;
401     double error = (std::numeric_limits<double>::max)();
402     while(number_of_run-- > 0) {
403       init_type == RANDOM_INITIALIZATION ? initiate_centers_randomly(
404         number_of_centers)
405       : initiate_centers_plus_plus(number_of_centers);
406       calculate_clustering();
407       double new_error = within_cluster_sum_of_squares();
408       if(error > new_error) {
409         error = new_error;
410         min_centers = centers;
411       }
412     }
413     // Note that current center-ids in points are not valid.
414     // But they are recalculated when asked (in fill_with_center_ids())
415     centers = min_centers;
416   }
417 
418   /**
419    * Sum of squared distances between each point and the closest center to it.
420    */
within_cluster_sum_of_squares()421   double within_cluster_sum_of_squares() const {
422     double sum = 0.0;
423     for(std::vector<K_means_point>::const_iterator point_it = points.begin();
424         point_it != points.end(); ++point_it) {
425       sum += CGAL::square(centers[point_it->center_id].mean - point_it->data);
426     }
427     return sum;
428   }
429 };
430 }//namespace internal
431 /// @endcond
432 }//namespace CGAL
433 #undef CGAL_DEFAULT_SEED
434 #undef CGAL_DEFAULT_MAXIMUM_ITERATION
435 #undef CGAL_DEFAULT_NUMBER_OF_RUN
436 
437 #endif //CGAL_SURFACE_MESH_SEGMENTATION_K_MEANS_CLUSTERING_H
438