1 // Copyright (c) 2014 GeometryFactory Sarl (France).
2 // All rights reserved.
3 //
4 // This file is part of CGAL (www.cgal.org).
5 //
6 // $URL: https://github.com/CGAL/cgal/blob/v5.3/Surface_mesh_segmentation/include/CGAL/internal/Surface_mesh_segmentation/K_means_clustering.h $
7 // $Id: K_means_clustering.h ab14acf 2021-03-23T13:14:41+01:00 Simon Giraudot
8 // SPDX-License-Identifier: GPL-3.0-or-later OR LicenseRef-Commercial
9 //
10 // Author(s) : Ilker O. Yaz
11
12
13 #ifndef CGAL_SURFACE_MESH_SEGMENTATION_K_MEANS_CLUSTERING_H
14 #define CGAL_SURFACE_MESH_SEGMENTATION_K_MEANS_CLUSTERING_H
15
16 #include <CGAL/license/Surface_mesh_segmentation.h>
17
18
19 #include <vector>
20 #include <set>
21 #include <cmath>
22 #include <limits>
23 #include <algorithm>
24
25 #include <CGAL/assertions.h>
26 #include <CGAL/Random.h>
27
28 #define CGAL_DEFAULT_MAXIMUM_ITERATION 10u
29 #define CGAL_DEFAULT_NUMBER_OF_RUN 15u
30 #define CGAL_DEFAULT_SEED 1340818006
31
32 namespace CGAL
33 {
34 /// @cond CGAL_DOCUMENT_INTERNAL
35 namespace internal
36 {
37
38 /**
39 * Class providing initialization functionality: Forgy initialization, k++ initialization
40 *
41 * TODO: rand() might be changed with a generator for larger 'range'
42 */
43 class Selector
44 {
45 public:
46 /**
47 * Selects random samples from `points` and put them into `centers`. Does not select one sample more than once as center.
48 * T2 should be constructable by T1
49 *
50 * Implementation note: it is a variant of Floyd generator, and has uniform distribution
51 * where k = number of centers = complexity is O(k log k), and mem overhead is O(k)
52 *
53 * I also left previous implementation below, it might be useful where number of centers close to number of points
54 */
55 template<class T1, class T2>
forgy_initialization(std::size_t number_of_centers,const std::vector<T1> & points,std::vector<T2> & centers,CGAL::Random & random)56 void forgy_initialization(std::size_t number_of_centers,
57 const std::vector<T1>& points, std::vector<T2>& centers, CGAL::Random& random) {
58 centers.reserve(number_of_centers);
59 std::set<std::size_t> selected;
60
61 for(std::size_t i = 0; i < number_of_centers; ++i) {
62 std::size_t random_range = points.size() - number_of_centers +
63 i; // activate one more element in each iteration for as selectable
64 std::size_t random_index = random(random_range + 1); // [0, random_range];
65
66 std::pair<std::set<std::size_t>::iterator, bool> random_index_unique =
67 selected.insert(random_index);
68 if(!random_index_unique.second) {
69 random_index_unique = selected.insert(
70 random_range); // random_range can not be inside random_index_unique,
71 CGAL_assertion(
72 random_index_unique.second); // since we just increment it by one
73 }
74
75 centers.push_back(points[ *(random_index_unique.first) ]);
76 }
77 }
78
79 // To future reference, I also left prev implementation which is a variant of Fisher–Yates shuffle, however to keep `points` intact I use another vector to
80 // store and swap indices.
81 // where n = number of points; complexity = O(n), memory overhead = O(n)
82 /*
83 template<class T1, class T2>
84 void forgy_initialization(std::size_t number_of_centers, const std::vector<T1>& points, std::vector<T2>& centers)
85 {
86 std::vector<std::size_t> indices(points.size()); // it is required to not to swap points
87 for(std::size_t i = 0; i < points.size(); ++i) { indices[i] = i; }
88
89 centers.reserve(number_of_centers);
90 for(std::size_t i = 0; i < number_of_centers; ++i)
91 {
92 std::size_t random_index = rand() % (points.size() - i); // select a random index between 0 and not selected sample count
93 centers.push_back(points[ indices[random_index] ]);
94
95 std::swap(indices[random_index], indices[points.size() -i -1]); // not to select again, swap at the current-end
96 }
97 }
98 */
99
100 /**
101 * Selects samples from `points` using K-means++ algorithm and put them into `centers`.
102 * Does not select one sample more than once as center.
103 * Probability of a point to become a center is proportional to its squared distance to the closest center.
104 *
105 * T2 should be constructable by T1, and both T1 and T2 should have conversion operator to double.
106 */
107 template<class T1, class T2>
plus_plus_initialization(std::size_t number_of_centers,const std::vector<T1> & points,std::vector<T2> & centers,CGAL::Random & random)108 void plus_plus_initialization(std::size_t number_of_centers,
109 const std::vector<T1>& points, std::vector<T2>& centers, CGAL::Random& random) {
110 centers.reserve(number_of_centers);
111
112 std::vector<double> distance_square(points.size(),
113 (std::numeric_limits<double>::max)());
114 std::vector<double> distance_square_cumulative(points.size());
115
116 // distance_square stores squared distance to the closest center for each point.
117 // say, "distance_square" -> [ 0.1, 0.2, 0.3, 0.0, 0.2 ... ]
118 // then distance_square_cumulative -> [ 0.1, 0.3, 0.6, 0.6, 0.8 ... ]
119 std::size_t initial_index = random(points.size()); // [0, points size)
120 centers.push_back(points[initial_index]);
121
122 for(std::size_t i = 1; i < number_of_centers; ++i) {
123 double cumulative_distance_square = 0.0;
124 // distance_square holds closest distance that points have, so just test new coming center (i.e. centers.back())
125 for(std::size_t j = 0; j < points.size(); ++j) {
126 double new_distance = CGAL::square(centers.back() - points[j]);
127 if(new_distance < distance_square[j]) {
128 distance_square[j] = new_distance;
129 }
130 cumulative_distance_square += distance_square[j];
131 distance_square_cumulative[j] = cumulative_distance_square;
132 }
133
134 // check whether there is a way to select a new center
135 double total_probability = distance_square_cumulative.back();
136 CGAL_assertion( total_probability != 0.0 &&
137 "Squared distances of all elements to their closest centers are 0.0, there is no way to select a new center.");
138
139 double random_ds = random.get_double(0.0,
140 total_probability); // [0.0, total_probability)
141
142 // this can not select end(), since random_ds < total_probability (i.e. distance_square_cumulative.back())
143 // this can not select an already selected item since either (by considering that upper bounds returns greater)
144 // - aready selected item is at 0, and its value is 0.0
145 // - or its value is equal to value of previous element
146 std::size_t selection_index = std::upper_bound(
147 distance_square_cumulative.begin(), distance_square_cumulative.end(), random_ds)
148 - distance_square_cumulative.begin();
149
150 centers.push_back(points[selection_index]);
151 }
152 }
153 };
154
155 class K_means_center;
156
157 /**
158 * @brief Represents points in k-means algorithm.
159 * @see K_means_center, K_means_clustering
160 */
161 class K_means_point
162 {
163 public:
164 double data; /**< Location of the point */
165 std::size_t center_id; /**< Closest center to the point */
166 K_means_point(double data,
167 std::size_t center_id = (std::numeric_limits<std::size_t>::max)())
data(data)168 : data(data), center_id(center_id) {
169 }
170
171 operator double() const {
172 return data;
173 }
174
175 bool calculate_new_center(std::vector<K_means_center>& centers);
176 };
177
178 /**
179 * @brief Represents centers in k-means algorithm.
180 * @see K_means_point, K_means_clustering
181 */
182 class K_means_center
183 {
184 public:
185 double mean; /**< Mean of the center */
186 private:
187 double new_mean;
188 std::size_t new_number_of_points;
189 bool empty;
190
191 public:
192
K_means_center(double mean)193 K_means_center(double mean): mean(mean), new_mean(0.0), new_number_of_points(0),
194 empty(true) {
195 }
196
K_means_center(const K_means_point & point)197 K_means_center(const K_means_point& point) : mean(point.data), new_mean(0.0),
198 new_number_of_points(0), empty(true) {
199 }
200
201 operator double() const {
202 return mean;
203 }
204
205 /**
206 * Called by a point for adding itself to mean of the center (i.e. registering).
207 * @param data location of the point
208 */
add_point(double data)209 void add_point(double data) {
210 ++new_number_of_points;
211 new_mean += data;
212 }
213 /**
214 * Called after every point is registered to its closest center
215 * @see add_point()
216 */
calculate_mean()217 void calculate_mean() {
218 if(new_number_of_points ==
219 0) { // just return and this will leave `mean` as it is,
220 empty = true; // `new_mean` is 0.0 since `new_number_of_points` is 0
221 return;
222 }
223
224 empty = false;
225 mean = new_mean / new_number_of_points;
226 new_number_of_points = 0;
227 new_mean = 0.0;
228 }
229
is_empty()230 bool is_empty() {
231 return empty;
232 }
233
234 /** A comparator for sorting centers in ascending order. */
235 bool operator < (const K_means_center& center) const {
236 return mean < center.mean;
237 }
238 };
239
240 /**
241 * Finds closest center and adds itself to the closest center's mean.
242 * @param centers available centers
243 * @return true if #center_id is changed (i.e. new center != previous center)
244 */
calculate_new_center(std::vector<K_means_center> & centers)245 inline bool K_means_point::calculate_new_center(std::vector<K_means_center>&
246 centers)
247 {
248 std::size_t new_center_id = 0;
249 double min_distance = std::abs(centers[0].mean - data);
250 for(std::size_t i = 1; i < centers.size(); ++i) {
251 double new_distance = std::abs(centers[i].mean - data);
252 if(new_distance < min_distance) {
253 new_center_id = i;
254 min_distance = new_distance;
255 }
256 }
257 bool is_center_changed = (new_center_id != center_id);
258 center_id = new_center_id;
259
260 centers[center_id].add_point(data);
261 return is_center_changed;
262 }
263
264
265 /**
266 * @brief K-means clustering algorithm.
267 * @see K_means_point, K_means_center
268 */
269 class K_means_clustering
270 {
271 public:
272 /** Types of algorithms for random center selection. */
273 enum Initialization_types {
274 RANDOM_INITIALIZATION, /**< place initial centers randomly */
275 PLUS_INITIALIZATION /**< place initial centers using k-means++ algorithm */
276 };
277
278 private:
279 std::vector<K_means_center> centers;
280 std::vector<K_means_point> points;
281 std::size_t maximum_iteration;
282
283 Initialization_types init_type;
284
285 CGAL::Random random;
286 public:
287 /**
288 * @pre @a number_of_centers should be positive
289 * @pre size of @a data should be no smaller than number_of_centers
290 *
291 * Constructs structures and runs the algorithm.
292 * K-means algorithm is repeated number_of_run times, and the result which has minimum within cluster error is kept.
293 * @param number_of_centers
294 * @param data
295 * @param init_type initialization type for random center selection
296 * @param number_of_run number of times to repeat k-means algorithm
297 * @param maximum_iteration maximum allowed iteration in a single k-means algorithm call
298 *
299 *
300 */
301 K_means_clustering(std::size_t number_of_centers,
302 const std::vector<double>& data,
303 Initialization_types init_type = PLUS_INITIALIZATION,
304 std::size_t number_of_run = CGAL_DEFAULT_NUMBER_OF_RUN,
305 std::size_t maximum_iteration = CGAL_DEFAULT_MAXIMUM_ITERATION)
306 :
307 points(data.begin(), data.end()),
308 maximum_iteration(maximum_iteration),
309 init_type(init_type),
310 random(CGAL_DEFAULT_SEED) {
311 CGAL_precondition(data.size() >= number_of_centers
312 && "Number of centers can not be more than number of data.");
313
314 calculate_clustering_with_multiple_run(number_of_centers, number_of_run);
315 sort(centers.begin(), centers.end());
316 }
317
318 /**
319 * Fills data_center by the id of the closest center for each point.
320 * @param[out] data_centers
321 */
fill_with_center_ids(std::vector<std::size_t> & data_centers)322 void fill_with_center_ids(std::vector<std::size_t>& data_centers) {
323 data_centers.reserve(points.size());
324 for(std::vector<K_means_point>::iterator point_it = points.begin();
325 point_it != points.end(); ++point_it) {
326 point_it->calculate_new_center(centers); // find closest center
327 data_centers.push_back(point_it->center_id);
328 }
329 }
330
331 template<class T>
fill_with_centers(std::vector<T> & center_means)332 void fill_with_centers(std::vector<T>& center_means) {
333 for(std::vector<K_means_center>::iterator center_it = centers.begin();
334 center_it != centers.end(); ++center_it) {
335 center_means.push_back(center_it->mean);
336 }
337 }
338
339 private:
340 /**
341 * Initializes centers by choosing random points from data. Does not select one points more than once as center.
342 * @param number_of_centers
343 */
initiate_centers_randomly(std::size_t number_of_centers)344 void initiate_centers_randomly(std::size_t number_of_centers) {
345 centers.clear();
346 Selector().forgy_initialization(number_of_centers, points, centers, random);
347 }
348
349 /**
350 * Initializes centers by using K-means++ algorithm.
351 * Probability of a point to become a center is proportional to its squared distance to the closest center.
352 * @param number_of_centers
353 */
initiate_centers_plus_plus(std::size_t number_of_centers)354 void initiate_centers_plus_plus(std::size_t number_of_centers) {
355 centers.clear();
356 Selector().plus_plus_initialization(number_of_centers, points, centers, random);
357 }
358
359 /**
360 * One iteration of k-means algorithm.
361 * @return true if any closest center to a point is changed
362 */
iterate()363 bool iterate() {
364 bool any_center_changed = false;
365 // For each point, calculate its new center
366 for(std::vector<K_means_point>::iterator point_it = points.begin();
367 point_it != points.end(); ++point_it) {
368 bool center_changed = point_it->calculate_new_center(centers);
369 any_center_changed |= center_changed;
370 }
371 // For each center, calculate its new mean
372 for(std::vector<K_means_center>::iterator center_it = centers.begin();
373 center_it != centers.end(); ++center_it) {
374 center_it->calculate_mean();
375 }
376 return any_center_changed;
377 }
378
379 /**
380 * Main entry point for k-means algorithm.
381 * Iterates until convergence occurs (i.e. no point changes its center) or maximum iteration limit is reached.
382 */
calculate_clustering()383 void calculate_clustering() {
384 std::size_t iteration_count = 0;
385 bool any_center_changed = true;
386 while(any_center_changed && iteration_count++ < maximum_iteration) {
387 any_center_changed = iterate();
388 }
389 }
390
391 /**
392 * Calls calculate_clustering() @a number_of_run times,
393 * and keeps the result which has minimum within cluster error.
394 * @param number_of_centers
395 * @param number_of_run
396 * @see calculate_clustering(), within_cluster_sum_of_squares()
397 */
calculate_clustering_with_multiple_run(std::size_t number_of_centers,std::size_t number_of_run)398 void calculate_clustering_with_multiple_run(std::size_t number_of_centers,
399 std::size_t number_of_run) {
400 std::vector<K_means_center> min_centers;
401 double error = (std::numeric_limits<double>::max)();
402 while(number_of_run-- > 0) {
403 init_type == RANDOM_INITIALIZATION ? initiate_centers_randomly(
404 number_of_centers)
405 : initiate_centers_plus_plus(number_of_centers);
406 calculate_clustering();
407 double new_error = within_cluster_sum_of_squares();
408 if(error > new_error) {
409 error = new_error;
410 min_centers = centers;
411 }
412 }
413 // Note that current center-ids in points are not valid.
414 // But they are recalculated when asked (in fill_with_center_ids())
415 centers = min_centers;
416 }
417
418 /**
419 * Sum of squared distances between each point and the closest center to it.
420 */
within_cluster_sum_of_squares()421 double within_cluster_sum_of_squares() const {
422 double sum = 0.0;
423 for(std::vector<K_means_point>::const_iterator point_it = points.begin();
424 point_it != points.end(); ++point_it) {
425 sum += CGAL::square(centers[point_it->center_id].mean - point_it->data);
426 }
427 return sum;
428 }
429 };
430 }//namespace internal
431 /// @endcond
432 }//namespace CGAL
433 #undef CGAL_DEFAULT_SEED
434 #undef CGAL_DEFAULT_MAXIMUM_ITERATION
435 #undef CGAL_DEFAULT_NUMBER_OF_RUN
436
437 #endif //CGAL_SURFACE_MESH_SEGMENTATION_K_MEANS_CLUSTERING_H
438