1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2018, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
20 //
21 // * Redistribution's of source code must retain the above copyright notice,
22 // this list of conditions and the following disclaimer.
23 //
24 // * Redistribution's in binary form must reproduce the above copyright notice,
25 // this list of conditions and the following disclaimer in the documentation
26 // and/or other materials provided with the distribution.
27 //
28 // * The name of the copyright holders may not be used to endorse or promote products
29 // derived from this software without specific prior written permission.
30 //
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
41 //
42 //M*/
43
44 #include "precomp.hpp"
45 #include <opencv2/core/utils/configuration.private.hpp>
46 #include <opencv2/core/hal/hal.hpp>
47
48 ////////////////////////////////////////// kmeans ////////////////////////////////////////////
49
50 namespace cv
51 {
52
53 static int CV_KMEANS_PARALLEL_GRANULARITY = (int)utils::getConfigurationParameterSizeT("OPENCV_KMEANS_PARALLEL_GRANULARITY", 1000);
54
generateRandomCenter(int dims,const Vec2f * box,float * center,RNG & rng)55 static void generateRandomCenter(int dims, const Vec2f* box, float* center, RNG& rng)
56 {
57 float margin = 1.f/dims;
58 for (int j = 0; j < dims; j++)
59 center[j] = ((float)rng*(1.f+margin*2.f)-margin)*(box[j][1] - box[j][0]) + box[j][0];
60 }
61
62 class KMeansPPDistanceComputer : public ParallelLoopBody
63 {
64 public:
KMeansPPDistanceComputer(float * tdist2_,const Mat & data_,const float * dist_,int ci_)65 KMeansPPDistanceComputer(float *tdist2_, const Mat& data_, const float *dist_, int ci_) :
66 tdist2(tdist2_), data(data_), dist(dist_), ci(ci_)
67 { }
68
operator ()(const cv::Range & range) const69 void operator()( const cv::Range& range ) const CV_OVERRIDE
70 {
71 CV_TRACE_FUNCTION();
72 const int begin = range.start;
73 const int end = range.end;
74 const int dims = data.cols;
75
76 for (int i = begin; i<end; i++)
77 {
78 tdist2[i] = std::min(hal::normL2Sqr_(data.ptr<float>(i), data.ptr<float>(ci), dims), dist[i]);
79 }
80 }
81
82 private:
83 KMeansPPDistanceComputer& operator=(const KMeansPPDistanceComputer&); // = delete
84
85 float *tdist2;
86 const Mat& data;
87 const float *dist;
88 const int ci;
89 };
90
91 /*
92 k-means center initialization using the following algorithm:
93 Arthur & Vassilvitskii (2007) k-means++: The Advantages of Careful Seeding
94 */
generateCentersPP(const Mat & data,Mat & _out_centers,int K,RNG & rng,int trials)95 static void generateCentersPP(const Mat& data, Mat& _out_centers,
96 int K, RNG& rng, int trials)
97 {
98 CV_TRACE_FUNCTION();
99 const int dims = data.cols, N = data.rows;
100 cv::AutoBuffer<int, 64> _centers(K);
101 int* centers = &_centers[0];
102 cv::AutoBuffer<float, 0> _dist(N*3);
103 float* dist = &_dist[0], *tdist = dist + N, *tdist2 = tdist + N;
104 double sum0 = 0;
105
106 centers[0] = (unsigned)rng % N;
107
108 for (int i = 0; i < N; i++)
109 {
110 dist[i] = hal::normL2Sqr_(data.ptr<float>(i), data.ptr<float>(centers[0]), dims);
111 sum0 += dist[i];
112 }
113
114 for (int k = 1; k < K; k++)
115 {
116 double bestSum = DBL_MAX;
117 int bestCenter = -1;
118
119 for (int j = 0; j < trials; j++)
120 {
121 double p = (double)rng*sum0;
122 int ci = 0;
123 for (; ci < N - 1; ci++)
124 {
125 p -= dist[ci];
126 if (p <= 0)
127 break;
128 }
129
130 parallel_for_(Range(0, N),
131 KMeansPPDistanceComputer(tdist2, data, dist, ci),
132 (double)divUp((size_t)(dims * N), CV_KMEANS_PARALLEL_GRANULARITY));
133 double s = 0;
134 for (int i = 0; i < N; i++)
135 {
136 s += tdist2[i];
137 }
138
139 if (s < bestSum)
140 {
141 bestSum = s;
142 bestCenter = ci;
143 std::swap(tdist, tdist2);
144 }
145 }
146 if (bestCenter < 0)
147 CV_Error(Error::StsNoConv, "kmeans: can't update cluster center (check input for huge or NaN values)");
148 centers[k] = bestCenter;
149 sum0 = bestSum;
150 std::swap(dist, tdist);
151 }
152
153 for (int k = 0; k < K; k++)
154 {
155 const float* src = data.ptr<float>(centers[k]);
156 float* dst = _out_centers.ptr<float>(k);
157 for (int j = 0; j < dims; j++)
158 dst[j] = src[j];
159 }
160 }
161
162 template<bool onlyDistance>
163 class KMeansDistanceComputer : public ParallelLoopBody
164 {
165 public:
KMeansDistanceComputer(double * distances_,int * labels_,const Mat & data_,const Mat & centers_)166 KMeansDistanceComputer( double *distances_,
167 int *labels_,
168 const Mat& data_,
169 const Mat& centers_)
170 : distances(distances_),
171 labels(labels_),
172 data(data_),
173 centers(centers_)
174 {
175 }
176
operator ()(const Range & range) const177 void operator()(const Range& range) const CV_OVERRIDE
178 {
179 CV_TRACE_FUNCTION();
180 const int begin = range.start;
181 const int end = range.end;
182 const int K = centers.rows;
183 const int dims = centers.cols;
184
185 for (int i = begin; i < end; ++i)
186 {
187 const float *sample = data.ptr<float>(i);
188 if (onlyDistance)
189 {
190 const float* center = centers.ptr<float>(labels[i]);
191 distances[i] = hal::normL2Sqr_(sample, center, dims);
192 continue;
193 }
194 else
195 {
196 int k_best = 0;
197 double min_dist = DBL_MAX;
198
199 for (int k = 0; k < K; k++)
200 {
201 const float* center = centers.ptr<float>(k);
202 const double dist = hal::normL2Sqr_(sample, center, dims);
203
204 if (min_dist > dist)
205 {
206 min_dist = dist;
207 k_best = k;
208 }
209 }
210
211 distances[i] = min_dist;
212 labels[i] = k_best;
213 }
214 }
215 }
216
217 private:
218 KMeansDistanceComputer& operator=(const KMeansDistanceComputer&); // = delete
219
220 double *distances;
221 int *labels;
222 const Mat& data;
223 const Mat& centers;
224 };
225
226 }
227
kmeans(InputArray _data,int K,InputOutputArray _bestLabels,TermCriteria criteria,int attempts,int flags,OutputArray _centers)228 double cv::kmeans( InputArray _data, int K,
229 InputOutputArray _bestLabels,
230 TermCriteria criteria, int attempts,
231 int flags, OutputArray _centers )
232 {
233 CV_INSTRUMENT_REGION();
234 const int SPP_TRIALS = 3;
235 Mat data0 = _data.getMat();
236 const bool isrow = data0.rows == 1;
237 const int N = isrow ? data0.cols : data0.rows;
238 const int dims = (isrow ? 1 : data0.cols)*data0.channels();
239 const int type = data0.depth();
240
241 attempts = std::max(attempts, 1);
242 CV_Assert( data0.dims <= 2 && type == CV_32F && K > 0 );
243 CV_CheckGE(N, K, "Number of clusters should be more than number of elements");
244
245 Mat data(N, dims, CV_32F, data0.ptr(), isrow ? dims * sizeof(float) : static_cast<size_t>(data0.step));
246
247 _bestLabels.create(N, 1, CV_32S, -1, true);
248
249 Mat _labels, best_labels = _bestLabels.getMat();
250 if (flags & CV_KMEANS_USE_INITIAL_LABELS)
251 {
252 CV_Assert( (best_labels.cols == 1 || best_labels.rows == 1) &&
253 best_labels.cols*best_labels.rows == N &&
254 best_labels.type() == CV_32S &&
255 best_labels.isContinuous());
256 best_labels.reshape(1, N).copyTo(_labels);
257 for (int i = 0; i < N; i++)
258 {
259 CV_Assert((unsigned)_labels.at<int>(i) < (unsigned)K);
260 }
261 }
262 else
263 {
264 if (!((best_labels.cols == 1 || best_labels.rows == 1) &&
265 best_labels.cols*best_labels.rows == N &&
266 best_labels.type() == CV_32S &&
267 best_labels.isContinuous()))
268 {
269 _bestLabels.create(N, 1, CV_32S);
270 best_labels = _bestLabels.getMat();
271 }
272 _labels.create(best_labels.size(), best_labels.type());
273 }
274 int* labels = _labels.ptr<int>();
275
276 Mat centers(K, dims, type), old_centers(K, dims, type), temp(1, dims, type);
277 cv::AutoBuffer<int, 64> counters(K);
278 cv::AutoBuffer<double, 64> dists(N);
279 RNG& rng = theRNG();
280
281 if (criteria.type & TermCriteria::EPS)
282 criteria.epsilon = std::max(criteria.epsilon, 0.);
283 else
284 criteria.epsilon = FLT_EPSILON;
285 criteria.epsilon *= criteria.epsilon;
286
287 if (criteria.type & TermCriteria::COUNT)
288 criteria.maxCount = std::min(std::max(criteria.maxCount, 2), 100);
289 else
290 criteria.maxCount = 100;
291
292 if (K == 1)
293 {
294 attempts = 1;
295 criteria.maxCount = 2;
296 }
297
298 cv::AutoBuffer<Vec2f, 64> box(dims);
299 if (!(flags & KMEANS_PP_CENTERS))
300 {
301 {
302 const float* sample = data.ptr<float>(0);
303 for (int j = 0; j < dims; j++)
304 box[j] = Vec2f(sample[j], sample[j]);
305 }
306 for (int i = 1; i < N; i++)
307 {
308 const float* sample = data.ptr<float>(i);
309 for (int j = 0; j < dims; j++)
310 {
311 float v = sample[j];
312 box[j][0] = std::min(box[j][0], v);
313 box[j][1] = std::max(box[j][1], v);
314 }
315 }
316 }
317
318 double best_compactness = DBL_MAX;
319 for (int a = 0; a < attempts; a++)
320 {
321 double compactness = 0;
322
323 for (int iter = 0; ;)
324 {
325 double max_center_shift = iter == 0 ? DBL_MAX : 0.0;
326
327 swap(centers, old_centers);
328
329 if (iter == 0 && (a > 0 || !(flags & KMEANS_USE_INITIAL_LABELS)))
330 {
331 if (flags & KMEANS_PP_CENTERS)
332 generateCentersPP(data, centers, K, rng, SPP_TRIALS);
333 else
334 {
335 for (int k = 0; k < K; k++)
336 generateRandomCenter(dims, box.data(), centers.ptr<float>(k), rng);
337 }
338 }
339 else
340 {
341 // compute centers
342 centers = Scalar(0);
343 for (int k = 0; k < K; k++)
344 counters[k] = 0;
345
346 for (int i = 0; i < N; i++)
347 {
348 const float* sample = data.ptr<float>(i);
349 int k = labels[i];
350 float* center = centers.ptr<float>(k);
351 for (int j = 0; j < dims; j++)
352 center[j] += sample[j];
353 counters[k]++;
354 }
355
356 for (int k = 0; k < K; k++)
357 {
358 if (counters[k] != 0)
359 continue;
360
361 // if some cluster appeared to be empty then:
362 // 1. find the biggest cluster
363 // 2. find the farthest from the center point in the biggest cluster
364 // 3. exclude the farthest point from the biggest cluster and form a new 1-point cluster.
365 int max_k = 0;
366 for (int k1 = 1; k1 < K; k1++)
367 {
368 if (counters[max_k] < counters[k1])
369 max_k = k1;
370 }
371
372 double max_dist = 0;
373 int farthest_i = -1;
374 float* base_center = centers.ptr<float>(max_k);
375 float* _base_center = temp.ptr<float>(); // normalized
376 float scale = 1.f/counters[max_k];
377 for (int j = 0; j < dims; j++)
378 _base_center[j] = base_center[j]*scale;
379
380 for (int i = 0; i < N; i++)
381 {
382 if (labels[i] != max_k)
383 continue;
384 const float* sample = data.ptr<float>(i);
385 double dist = hal::normL2Sqr_(sample, _base_center, dims);
386
387 if (max_dist <= dist)
388 {
389 max_dist = dist;
390 farthest_i = i;
391 }
392 }
393
394 counters[max_k]--;
395 counters[k]++;
396 labels[farthest_i] = k;
397
398 const float* sample = data.ptr<float>(farthest_i);
399 float* cur_center = centers.ptr<float>(k);
400 for (int j = 0; j < dims; j++)
401 {
402 base_center[j] -= sample[j];
403 cur_center[j] += sample[j];
404 }
405 }
406
407 for (int k = 0; k < K; k++)
408 {
409 float* center = centers.ptr<float>(k);
410 CV_Assert( counters[k] != 0 );
411
412 float scale = 1.f/counters[k];
413 for (int j = 0; j < dims; j++)
414 center[j] *= scale;
415
416 if (iter > 0)
417 {
418 double dist = 0;
419 const float* old_center = old_centers.ptr<float>(k);
420 for (int j = 0; j < dims; j++)
421 {
422 double t = center[j] - old_center[j];
423 dist += t*t;
424 }
425 max_center_shift = std::max(max_center_shift, dist);
426 }
427 }
428 }
429
430 bool isLastIter = (++iter == MAX(criteria.maxCount, 2) || max_center_shift <= criteria.epsilon);
431
432 if (isLastIter)
433 {
434 // don't re-assign labels to avoid creation of empty clusters
435 parallel_for_(Range(0, N), KMeansDistanceComputer<true>(dists.data(), labels, data, centers), (double)divUp((size_t)(dims * N), CV_KMEANS_PARALLEL_GRANULARITY));
436 compactness = sum(Mat(Size(N, 1), CV_64F, &dists[0]))[0];
437 break;
438 }
439 else
440 {
441 // assign labels
442 parallel_for_(Range(0, N), KMeansDistanceComputer<false>(dists.data(), labels, data, centers), (double)divUp((size_t)(dims * N * K), CV_KMEANS_PARALLEL_GRANULARITY));
443 }
444 }
445
446 if (compactness < best_compactness)
447 {
448 best_compactness = compactness;
449 if (_centers.needed())
450 {
451 if (_centers.fixedType() && _centers.channels() == dims)
452 centers.reshape(dims).copyTo(_centers);
453 else
454 centers.copyTo(_centers);
455 }
456 _labels.copyTo(best_labels);
457 }
458 }
459
460 return best_compactness;
461 }
462