1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                          License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2018, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
20 //
21 //   * Redistribution's of source code must retain the above copyright notice,
22 //     this list of conditions and the following disclaimer.
23 //
24 //   * Redistribution's in binary form must reproduce the above copyright notice,
25 //     this list of conditions and the following disclaimer in the documentation
26 //     and/or other materials provided with the distribution.
27 //
28 //   * The name of the copyright holders may not be used to endorse or promote products
29 //     derived from this software without specific prior written permission.
30 //
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
41 //
42 //M*/
43 
44 #include "precomp.hpp"
45 #include <opencv2/core/utils/configuration.private.hpp>
46 #include <opencv2/core/hal/hal.hpp>
47 
48 ////////////////////////////////////////// kmeans ////////////////////////////////////////////
49 
50 namespace cv
51 {
52 
53 static int CV_KMEANS_PARALLEL_GRANULARITY = (int)utils::getConfigurationParameterSizeT("OPENCV_KMEANS_PARALLEL_GRANULARITY", 1000);
54 
generateRandomCenter(int dims,const Vec2f * box,float * center,RNG & rng)55 static void generateRandomCenter(int dims, const Vec2f* box, float* center, RNG& rng)
56 {
57     float margin = 1.f/dims;
58     for (int j = 0; j < dims; j++)
59         center[j] = ((float)rng*(1.f+margin*2.f)-margin)*(box[j][1] - box[j][0]) + box[j][0];
60 }
61 
62 class KMeansPPDistanceComputer : public ParallelLoopBody
63 {
64 public:
KMeansPPDistanceComputer(float * tdist2_,const Mat & data_,const float * dist_,int ci_)65     KMeansPPDistanceComputer(float *tdist2_, const Mat& data_, const float *dist_, int ci_) :
66         tdist2(tdist2_), data(data_), dist(dist_), ci(ci_)
67     { }
68 
operator ()(const cv::Range & range) const69     void operator()( const cv::Range& range ) const CV_OVERRIDE
70     {
71         CV_TRACE_FUNCTION();
72         const int begin = range.start;
73         const int end = range.end;
74         const int dims = data.cols;
75 
76         for (int i = begin; i<end; i++)
77         {
78             tdist2[i] = std::min(hal::normL2Sqr_(data.ptr<float>(i), data.ptr<float>(ci), dims), dist[i]);
79         }
80     }
81 
82 private:
83     KMeansPPDistanceComputer& operator=(const KMeansPPDistanceComputer&); // = delete
84 
85     float *tdist2;
86     const Mat& data;
87     const float *dist;
88     const int ci;
89 };
90 
91 /*
92 k-means center initialization using the following algorithm:
93 Arthur & Vassilvitskii (2007) k-means++: The Advantages of Careful Seeding
94 */
generateCentersPP(const Mat & data,Mat & _out_centers,int K,RNG & rng,int trials)95 static void generateCentersPP(const Mat& data, Mat& _out_centers,
96                               int K, RNG& rng, int trials)
97 {
98     CV_TRACE_FUNCTION();
99     const int dims = data.cols, N = data.rows;
100     cv::AutoBuffer<int, 64> _centers(K);
101     int* centers = &_centers[0];
102     cv::AutoBuffer<float, 0> _dist(N*3);
103     float* dist = &_dist[0], *tdist = dist + N, *tdist2 = tdist + N;
104     double sum0 = 0;
105 
106     centers[0] = (unsigned)rng % N;
107 
108     for (int i = 0; i < N; i++)
109     {
110         dist[i] = hal::normL2Sqr_(data.ptr<float>(i), data.ptr<float>(centers[0]), dims);
111         sum0 += dist[i];
112     }
113 
114     for (int k = 1; k < K; k++)
115     {
116         double bestSum = DBL_MAX;
117         int bestCenter = -1;
118 
119         for (int j = 0; j < trials; j++)
120         {
121             double p = (double)rng*sum0;
122             int ci = 0;
123             for (; ci < N - 1; ci++)
124             {
125                 p -= dist[ci];
126                 if (p <= 0)
127                     break;
128             }
129 
130             parallel_for_(Range(0, N),
131                           KMeansPPDistanceComputer(tdist2, data, dist, ci),
132                           (double)divUp((size_t)(dims * N), CV_KMEANS_PARALLEL_GRANULARITY));
133             double s = 0;
134             for (int i = 0; i < N; i++)
135             {
136                 s += tdist2[i];
137             }
138 
139             if (s < bestSum)
140             {
141                 bestSum = s;
142                 bestCenter = ci;
143                 std::swap(tdist, tdist2);
144             }
145         }
146         if (bestCenter < 0)
147             CV_Error(Error::StsNoConv, "kmeans: can't update cluster center (check input for huge or NaN values)");
148         centers[k] = bestCenter;
149         sum0 = bestSum;
150         std::swap(dist, tdist);
151     }
152 
153     for (int k = 0; k < K; k++)
154     {
155         const float* src = data.ptr<float>(centers[k]);
156         float* dst = _out_centers.ptr<float>(k);
157         for (int j = 0; j < dims; j++)
158             dst[j] = src[j];
159     }
160 }
161 
162 template<bool onlyDistance>
163 class KMeansDistanceComputer : public ParallelLoopBody
164 {
165 public:
KMeansDistanceComputer(double * distances_,int * labels_,const Mat & data_,const Mat & centers_)166     KMeansDistanceComputer( double *distances_,
167                             int *labels_,
168                             const Mat& data_,
169                             const Mat& centers_)
170         : distances(distances_),
171           labels(labels_),
172           data(data_),
173           centers(centers_)
174     {
175     }
176 
operator ()(const Range & range) const177     void operator()(const Range& range) const CV_OVERRIDE
178     {
179         CV_TRACE_FUNCTION();
180         const int begin = range.start;
181         const int end = range.end;
182         const int K = centers.rows;
183         const int dims = centers.cols;
184 
185         for (int i = begin; i < end; ++i)
186         {
187             const float *sample = data.ptr<float>(i);
188             if (onlyDistance)
189             {
190                 const float* center = centers.ptr<float>(labels[i]);
191                 distances[i] = hal::normL2Sqr_(sample, center, dims);
192                 continue;
193             }
194             else
195             {
196                 int k_best = 0;
197                 double min_dist = DBL_MAX;
198 
199                 for (int k = 0; k < K; k++)
200                 {
201                     const float* center = centers.ptr<float>(k);
202                     const double dist = hal::normL2Sqr_(sample, center, dims);
203 
204                     if (min_dist > dist)
205                     {
206                         min_dist = dist;
207                         k_best = k;
208                     }
209                 }
210 
211                 distances[i] = min_dist;
212                 labels[i] = k_best;
213             }
214         }
215     }
216 
217 private:
218     KMeansDistanceComputer& operator=(const KMeansDistanceComputer&); // = delete
219 
220     double *distances;
221     int *labels;
222     const Mat& data;
223     const Mat& centers;
224 };
225 
226 }
227 
kmeans(InputArray _data,int K,InputOutputArray _bestLabels,TermCriteria criteria,int attempts,int flags,OutputArray _centers)228 double cv::kmeans( InputArray _data, int K,
229                    InputOutputArray _bestLabels,
230                    TermCriteria criteria, int attempts,
231                    int flags, OutputArray _centers )
232 {
233     CV_INSTRUMENT_REGION();
234     const int SPP_TRIALS = 3;
235     Mat data0 = _data.getMat();
236     const bool isrow = data0.rows == 1;
237     const int N = isrow ? data0.cols : data0.rows;
238     const int dims = (isrow ? 1 : data0.cols)*data0.channels();
239     const int type = data0.depth();
240 
241     attempts = std::max(attempts, 1);
242     CV_Assert( data0.dims <= 2 && type == CV_32F && K > 0 );
243     CV_CheckGE(N, K, "Number of clusters should be more than number of elements");
244 
245     Mat data(N, dims, CV_32F, data0.ptr(), isrow ? dims * sizeof(float) : static_cast<size_t>(data0.step));
246 
247     _bestLabels.create(N, 1, CV_32S, -1, true);
248 
249     Mat _labels, best_labels = _bestLabels.getMat();
250     if (flags & CV_KMEANS_USE_INITIAL_LABELS)
251     {
252         CV_Assert( (best_labels.cols == 1 || best_labels.rows == 1) &&
253                   best_labels.cols*best_labels.rows == N &&
254                   best_labels.type() == CV_32S &&
255                   best_labels.isContinuous());
256         best_labels.reshape(1, N).copyTo(_labels);
257         for (int i = 0; i < N; i++)
258         {
259             CV_Assert((unsigned)_labels.at<int>(i) < (unsigned)K);
260         }
261     }
262     else
263     {
264         if (!((best_labels.cols == 1 || best_labels.rows == 1) &&
265              best_labels.cols*best_labels.rows == N &&
266              best_labels.type() == CV_32S &&
267              best_labels.isContinuous()))
268         {
269             _bestLabels.create(N, 1, CV_32S);
270             best_labels = _bestLabels.getMat();
271         }
272         _labels.create(best_labels.size(), best_labels.type());
273     }
274     int* labels = _labels.ptr<int>();
275 
276     Mat centers(K, dims, type), old_centers(K, dims, type), temp(1, dims, type);
277     cv::AutoBuffer<int, 64> counters(K);
278     cv::AutoBuffer<double, 64> dists(N);
279     RNG& rng = theRNG();
280 
281     if (criteria.type & TermCriteria::EPS)
282         criteria.epsilon = std::max(criteria.epsilon, 0.);
283     else
284         criteria.epsilon = FLT_EPSILON;
285     criteria.epsilon *= criteria.epsilon;
286 
287     if (criteria.type & TermCriteria::COUNT)
288         criteria.maxCount = std::min(std::max(criteria.maxCount, 2), 100);
289     else
290         criteria.maxCount = 100;
291 
292     if (K == 1)
293     {
294         attempts = 1;
295         criteria.maxCount = 2;
296     }
297 
298     cv::AutoBuffer<Vec2f, 64> box(dims);
299     if (!(flags & KMEANS_PP_CENTERS))
300     {
301         {
302             const float* sample = data.ptr<float>(0);
303             for (int j = 0; j < dims; j++)
304                 box[j] = Vec2f(sample[j], sample[j]);
305         }
306         for (int i = 1; i < N; i++)
307         {
308             const float* sample = data.ptr<float>(i);
309             for (int j = 0; j < dims; j++)
310             {
311                 float v = sample[j];
312                 box[j][0] = std::min(box[j][0], v);
313                 box[j][1] = std::max(box[j][1], v);
314             }
315         }
316     }
317 
318     double best_compactness = DBL_MAX;
319     for (int a = 0; a < attempts; a++)
320     {
321         double compactness = 0;
322 
323         for (int iter = 0; ;)
324         {
325             double max_center_shift = iter == 0 ? DBL_MAX : 0.0;
326 
327             swap(centers, old_centers);
328 
329             if (iter == 0 && (a > 0 || !(flags & KMEANS_USE_INITIAL_LABELS)))
330             {
331                 if (flags & KMEANS_PP_CENTERS)
332                     generateCentersPP(data, centers, K, rng, SPP_TRIALS);
333                 else
334                 {
335                     for (int k = 0; k < K; k++)
336                         generateRandomCenter(dims, box.data(), centers.ptr<float>(k), rng);
337                 }
338             }
339             else
340             {
341                 // compute centers
342                 centers = Scalar(0);
343                 for (int k = 0; k < K; k++)
344                     counters[k] = 0;
345 
346                 for (int i = 0; i < N; i++)
347                 {
348                     const float* sample = data.ptr<float>(i);
349                     int k = labels[i];
350                     float* center = centers.ptr<float>(k);
351                     for (int j = 0; j < dims; j++)
352                         center[j] += sample[j];
353                     counters[k]++;
354                 }
355 
356                 for (int k = 0; k < K; k++)
357                 {
358                     if (counters[k] != 0)
359                         continue;
360 
361                     // if some cluster appeared to be empty then:
362                     //   1. find the biggest cluster
363                     //   2. find the farthest from the center point in the biggest cluster
364                     //   3. exclude the farthest point from the biggest cluster and form a new 1-point cluster.
365                     int max_k = 0;
366                     for (int k1 = 1; k1 < K; k1++)
367                     {
368                         if (counters[max_k] < counters[k1])
369                             max_k = k1;
370                     }
371 
372                     double max_dist = 0;
373                     int farthest_i = -1;
374                     float* base_center = centers.ptr<float>(max_k);
375                     float* _base_center = temp.ptr<float>(); // normalized
376                     float scale = 1.f/counters[max_k];
377                     for (int j = 0; j < dims; j++)
378                         _base_center[j] = base_center[j]*scale;
379 
380                     for (int i = 0; i < N; i++)
381                     {
382                         if (labels[i] != max_k)
383                             continue;
384                         const float* sample = data.ptr<float>(i);
385                         double dist = hal::normL2Sqr_(sample, _base_center, dims);
386 
387                         if (max_dist <= dist)
388                         {
389                             max_dist = dist;
390                             farthest_i = i;
391                         }
392                     }
393 
394                     counters[max_k]--;
395                     counters[k]++;
396                     labels[farthest_i] = k;
397 
398                     const float* sample = data.ptr<float>(farthest_i);
399                     float* cur_center = centers.ptr<float>(k);
400                     for (int j = 0; j < dims; j++)
401                     {
402                         base_center[j] -= sample[j];
403                         cur_center[j] += sample[j];
404                     }
405                 }
406 
407                 for (int k = 0; k < K; k++)
408                 {
409                     float* center = centers.ptr<float>(k);
410                     CV_Assert( counters[k] != 0 );
411 
412                     float scale = 1.f/counters[k];
413                     for (int j = 0; j < dims; j++)
414                         center[j] *= scale;
415 
416                     if (iter > 0)
417                     {
418                         double dist = 0;
419                         const float* old_center = old_centers.ptr<float>(k);
420                         for (int j = 0; j < dims; j++)
421                         {
422                             double t = center[j] - old_center[j];
423                             dist += t*t;
424                         }
425                         max_center_shift = std::max(max_center_shift, dist);
426                     }
427                 }
428             }
429 
430             bool isLastIter = (++iter == MAX(criteria.maxCount, 2) || max_center_shift <= criteria.epsilon);
431 
432             if (isLastIter)
433             {
434                 // don't re-assign labels to avoid creation of empty clusters
435                 parallel_for_(Range(0, N), KMeansDistanceComputer<true>(dists.data(), labels, data, centers), (double)divUp((size_t)(dims * N), CV_KMEANS_PARALLEL_GRANULARITY));
436                 compactness = sum(Mat(Size(N, 1), CV_64F, &dists[0]))[0];
437                 break;
438             }
439             else
440             {
441                 // assign labels
442                 parallel_for_(Range(0, N), KMeansDistanceComputer<false>(dists.data(), labels, data, centers), (double)divUp((size_t)(dims * N * K), CV_KMEANS_PARALLEL_GRANULARITY));
443             }
444         }
445 
446         if (compactness < best_compactness)
447         {
448             best_compactness = compactness;
449             if (_centers.needed())
450             {
451                 if (_centers.fixedType() && _centers.channels() == dims)
452                     centers.reshape(dims).copyTo(_centers);
453                 else
454                     centers.copyTo(_centers);
455             }
456             _labels.copyTo(best_labels);
457         }
458     }
459 
460     return best_compactness;
461 }
462