1 // This is brl/bbas/bsta/algo/bsta_sample_set.h
2 #ifndef bsta_sample_set_h_
3 #define bsta_sample_set_h_
4 //:
5 // \file
6 // \brief Classes to collect samples
7 //
8 // \author Ozge C. Ozcanli
9 // \date March 04, 2009
10 //
11 // \verbatim
12 // Modifications
13 // (none yet)
14 // \endverbatim
15 //
16 #include <iostream>
17 #include <fstream>
18 #include <utility>
19 #include <bsta/bsta_parzen_sphere.h>
20 #include <bsta/bsta_mixture.h>
21 #include <bsta/bsta_attributes.h>
22 #include <bsta/bsta_gaussian_sphere.h>
23 #include <bsta/bsta_gaussian_full.h>
24 #include <vnl/vnl_vector_fixed.h>
25 #include <vnl/vnl_matrix_fixed.h>
26 #ifdef _MSC_VER
27 # include <vcl_msvc_warnings.h>
28 #endif
29
30 #define MIN_VAR_ 0.0001
31
32 //: A class to hold samples, the window width parameter, weights for each sample, assignments of each sample to modes/cluster centers/classes
33 // This class is used by mean-shift and EM algorithms
34 template <class T, unsigned n>
35 class bsta_sample_set : public bsta_parzen_sphere<T,n>
36 {
37 public:
38
39 typedef typename bsta_parzen_sphere<T,n>::vector_type vector_;
40
41 // Constructor
42 bsta_sample_set(T bandwidth = T(1)) : bsta_parzen_sphere<T,n>() { this->set_bandwidth(bandwidth); }
43
44 //: Compute the mean in a window around the given pt, the window size is the bandwidth
45 // If there are no points within bandwidth of the input pt, \return false
46 bool mean(vector_ const& pt, vector_& out);
47
48 //: Insert a weighted sample into the distribution
49 void insert_sample(vector_ const& sample, T weight = T(1.0));
50
weight(unsigned i)51 T weight(unsigned i) const { return weights_[i]; }
weights()52 std::vector<T>& weights() { return weights_; }
53
54 //: one may need to normalize the weights after the insertion is over
55 void normalize_weights();
56
57 //: must call this method before using the assignment vector
58 void initialize_assignments(); // initializes each sample's assignment to -1 (null assignment)
assignments()59 std::vector<int>& assignments() { return assignments_; }
assignment(unsigned i)60 int assignment(unsigned i) const { return assignments_[i]; }
set_assignment(unsigned i,int mode)61 void set_assignment(unsigned i, int mode) { assignments_[i] = mode; }
62
63 //: compute the mean of a particular assignment/mode/cluster
64 bool mode_mean(int mode, vector_& out) const;
65
samples_begin()66 typename std::vector<typename bsta_parzen_sphere<T,n>::vector_type >::const_iterator samples_begin() const { return bsta_parzen<T,n>::samples_.begin(); }
samples_end()67 typename std::vector<typename bsta_parzen_sphere<T,n>::vector_type >::const_iterator samples_end() const { return bsta_parzen<T,n>::samples_.end(); }
68
weights_begin()69 typename std::vector<T >::const_iterator weights_begin() const { return weights_.begin(); }
weights_end()70 typename std::vector<T >::const_iterator weights_end() const { return weights_.end(); }
71
assignments_begin()72 std::vector<int >::const_iterator assignments_begin() const { return assignments_.begin(); }
assignments_end()73 std::vector<int >::const_iterator assignments_end() const { return assignments_.end(); }
74
check_initializations()75 bool check_initializations() const { return bsta_parzen<T,n>::samples_.size() == weights_.size() &&
76 bsta_parzen<T,n>::samples_.size() == assignments_.size(); }
77
78 //: return number of assignments to this mode
79 int mode_size(int mode) const;
80
81 //: return total weight of assignments to this mode
82 T mode_weight(int mode) const;
83
84 //: return number of modes in the current assignment vector
85 unsigned mode_cnt() const;
86
87 //: return total weight of all assignments
88 T total_weight() const;
89
90 private:
91 //: hold a vector of weights for each data sample
92 // Needs to be set separately with each insert into the data set,
93 // otherwise it's set to 1.0 by default at the first call to mean()
94 std::vector<T> weights_;
95
96 std::vector<int> assignments_; // a negative value indicates "null assignment"
97 };
98
99 //: compute the variance of a particular assignment in a bsta_sample_set
100 template <class T>
bsta_sample_set_variance(const bsta_sample_set<T,1> & set,int mode,T min_var,T & out)101 bool bsta_sample_set_variance(const bsta_sample_set<T,1>& set, int mode, T min_var, T& out)
102 {
103 typedef typename std::vector<T >::const_iterator sit_t;
104 typedef typename std::vector<T >::const_iterator wit_t;
105 typedef typename std::vector<int >::const_iterator ait_t;
106
107 if (!set.check_initializations()) {
108 std::cout << "Error in - bsta_sample_set<T,n>::mean() : assignments not initialized!\n";
109 return false;
110 }
111
112 T mv;
113 set.mode_mean(mode, mv);
114
115 T sum(T(0));
116 sit_t sit = set.samples_begin();
117 wit_t wit = set.weights_begin();
118 ait_t ait = set.assignments_begin();
119 T nsamp = 0;
120 for (; sit != set.samples_end(); ++sit, ++wit, ++ait) {
121 if (*ait != mode)
122 continue;
123
124 T s = (*sit-mv)*(*sit-mv);
125 sum += (*wit)*s;
126 nsamp += (*wit);
127 }
128 if (nsamp > 0) {
129 out = sum / nsamp;
130 if (out < min_var)
131 out = min_var;
132 return true;
133 }
134
135 return false;
136 }
137
138 //: compute the variance of a particular assignment in a bsta_sample_set
139 template <class T, unsigned n>
bsta_sample_set_variance(const bsta_sample_set<T,n> & set,int mode,vnl_matrix_fixed<T,n,n> & out)140 bool bsta_sample_set_variance(const bsta_sample_set<T,n>& set, int mode, vnl_matrix_fixed<T,n,n>& out)
141 {
142 typedef typename std::vector<vnl_vector_fixed<T,n> >::const_iterator sit_t;
143 typedef typename std::vector<T >::const_iterator wit_t;
144 typedef typename std::vector<int >::const_iterator ait_t;
145
146 if (!set.check_initializations()) {
147 std::cout << "Error in - bsta_sample_set<T,n>::mean() : assignments not initialized!\n";
148 return false;
149 }
150
151 vnl_vector_fixed<T,n> mv;
152 set.mode_mean(mode, mv);
153
154 vnl_matrix_fixed<T,n,n> sum(T(0));
155 sit_t sit = set.samples_begin();
156 wit_t wit = set.weights_begin();
157 ait_t ait = set.assignments_begin();
158 T nsamp = 0;
159 for (; sit != set.samples_end(); ++sit, ++wit, ++ait) {
160 if (*ait != mode)
161 continue;
162
163 vnl_vector_fixed<T,n> diff = (*sit)-mv;
164 sum += (*wit)*outer_product(diff,diff);
165 nsamp += (*wit);
166 }
167 if (nsamp > 0) {
168 out = sum / nsamp;
169
170 return true;
171 }
172
173 return false;
174 }
175
176 //: compute the marginalized 1D sample set distribution from nD set
177 template <class T, unsigned n>
bsta_sample_set_marginalize(const bsta_sample_set<T,n> & set,unsigned component,bsta_sample_set<T,1> & out_set)178 bool bsta_sample_set_marginalize(const bsta_sample_set<T,n>& set, unsigned component, bsta_sample_set<T,1>& out_set)
179 {
180 typedef typename std::vector<vnl_vector_fixed<T,n> >::const_iterator sit_t;
181 typedef typename std::vector<T >::const_iterator wit_t;
182
183 if (n <= component) // if the vector is not as large to have component return false
184 return false;
185
186 sit_t sit = set.samples_begin();
187 wit_t wit = set.weights_begin();
188
189 for (; sit != set.samples_end(); ++sit, ++wit) {
190 out_set.insert_sample((*sit)[component], (*wit));
191 }
192
193 return true;
194 }
195
196 template <class T>
bsta_sample_set_fit_distribution(const bsta_sample_set<T,1> & set,bsta_mixture<bsta_num_obs<bsta_gaussian_sphere<T,1>>> & out)197 bool bsta_sample_set_fit_distribution(const bsta_sample_set<T,1>& set, bsta_mixture<bsta_num_obs<bsta_gaussian_sphere<T,1> > >& out)
198 {
199 if (!set.check_initializations()) {
200 std::cout << "Error in - bsta_sample_set<T,n>::mean() : assignments not initialized!\n";
201 return false;
202 }
203
204 // compute mean and variance for each mode
205 unsigned mode_cnt = set.mode_cnt();
206
207 while (out.num_components() != 0) {
208 out.remove_last();
209 }
210
211 T total_weight = set.total_weight();
212 for (unsigned mi = 0; mi < mode_cnt; mi++) {
213 T meanv;
214 set.mode_mean(mi, meanv);
215 T var;
216 if (!bsta_sample_set_variance(set, mi, T(MIN_VAR_), var))
217 return false;
218 T w = set.mode_weight(mi);
219 bsta_gaussian_sphere<T,1> gauss_d(meanv,var);
220 bsta_num_obs<bsta_gaussian_sphere<T,1> > gauss(gauss_d, w);
221 if (!out.insert(gauss, w/total_weight))
222 return false;
223 }
224
225 return true;
226 }
227
228 template <class T, unsigned n>
bsta_sample_set_fit_distribution(const bsta_sample_set<T,n> & set,bsta_mixture<bsta_num_obs<bsta_gaussian_full<T,n>>> & out)229 bool bsta_sample_set_fit_distribution(const bsta_sample_set<T,n>& set, bsta_mixture<bsta_num_obs<bsta_gaussian_full<T,n> > >& out)
230 {
231 if (!set.check_initializations()) {
232 std::cout << "Error in - bsta_sample_set<T,n>::mean() : assignments not initialized!\n";
233 return false;
234 }
235
236 // compute mean and variance for each mode
237 unsigned mode_cnt = set.mode_cnt();
238
239 while (out.num_components() != 0) {
240 out.remove_last();
241 }
242
243 T total_weight = set.total_weight();
244 for (unsigned mi = 0; mi < mode_cnt; mi++) {
245 vnl_vector_fixed<T,n> meanv;
246 set.mode_mean(mi, meanv);
247 vnl_matrix_fixed<T,n,n> covar;
248 if (!bsta_sample_set_variance(set, mi, covar))
249 return false;
250 T w = set.mode_weight(mi);
251 bsta_gaussian_full<T,n> gauss_d(meanv,covar);
252 bsta_num_obs<bsta_gaussian_full<T,n> > gauss(gauss_d, w);
253 if (!out.insert(gauss, w/total_weight))
254 return false;
255 }
256
257 return true;
258 }
259
260 //:
261 // Total weight is used to normalize the weight of the distribution
262 // (bsta_num_obs class contains total weight of samples assigned to this distribution)
263 template <class T>
bsta_sample_set_log_likelihood(const bsta_sample_set<T,1> & set,bsta_num_obs<bsta_gaussian_sphere<T,1>> & dist,T total_weight)264 T bsta_sample_set_log_likelihood(const bsta_sample_set<T,1>& set, bsta_num_obs<bsta_gaussian_sphere<T,1> >& dist, T total_weight)
265 {
266 if (!set.size()) {
267 std::cout << "Error in - bsta_sample_set<T,n>::bsta_sample_set_log_likelihood() : assignments not initialized!\n";
268 return T(0);
269 }
270
271 T w = dist.num_observations;
272 T p_dist = w/total_weight;
273 T sum = T(0);
274 for (unsigned i = 0; i < set.size(); i++) {
275 if (std::sqrt(dist.sqr_mahalanobis_dist(set.sample(i))) < 3) {// we don't want zero to be logged
276 T p = T(std::log(dist.prob_density(set.sample(i))));
277 T pw = T(std::log(p_dist));
278 sum += p + pw; // log is natural logarithm
279 }
280 }
281
282 return sum;
283 }
284
285 //:
286 // Total weight is used to normalize the weight of the distribution
287 // (bsta_num_obs class contains total weight of samples assigned to this distribution)
288 template <class T, unsigned n>
bsta_sample_set_log_likelihood(const bsta_sample_set<T,n> & set,bsta_num_obs<bsta_gaussian_full<T,n>> & dist,T total_weight)289 T bsta_sample_set_log_likelihood(const bsta_sample_set<T,n>& set, bsta_num_obs<bsta_gaussian_full<T,n> >& dist, T total_weight)
290 {
291 if (!set.size()) {
292 std::cout << "Error in - bsta_sample_set<T,n>::bsta_sample_set_log_likelihood() : assignments not initialized!\n";
293 return T(0);
294 }
295
296 T w = dist.num_observations;
297 T p_dist = w/total_weight;
298 T sum = T(0);
299 for (unsigned i = 0; i < set.size(); i++) {
300 if (std::sqrt(dist.sqr_mahalanobis_dist(set.sample(i))) < 3) {// we don't want zero to be logged
301 T p = T(std::log(dist.prob_density(set.sample(i))));
302 T pw = T(std::log(p_dist));
303 sum += p + pw; // log is natural logarithm
304 }
305 }
306
307 return sum;
308 }
309
310 //:
311 // Total weight is used to normalize the weight of the distribution
312 // (bsta_num_obs class contains total weight of samples assigned to this distribution)
313 template <class T>
bsta_sample_set_log_likelihood(const bsta_sample_set<T,2> & set,bsta_num_obs<bsta_gaussian_sphere<T,1>> & dist0,T w0,bsta_num_obs<bsta_gaussian_sphere<T,1>> & dist1,T w1,T & w_sum)314 T bsta_sample_set_log_likelihood(const bsta_sample_set<T,2>& set, bsta_num_obs<bsta_gaussian_sphere<T,1> >& dist0, T w0, bsta_num_obs<bsta_gaussian_sphere<T,1> >& dist1, T w1, T& w_sum)
315 {
316 if (!set.size()) {
317 std::cout << "Error in - bsta_sample_set<T,n>::bsta_sample_set_log_likelihood() : set is empty!\n";
318 return T(0);
319 }
320
321 T total_weight = T(0);
322 w_sum = T(0);
323 T sum = T(0);
324 unsigned cnt = 0;
325 for (unsigned i = 0; i < set.size(); i++) {
326
327 T d0 = dist0.sqr_mahalanobis_dist(set.sample(i)[0]);
328 T d1 = dist1.sqr_mahalanobis_dist(set.sample(i)[1]);
329 T d0_sqrt = std::sqrt(d0);
330 T d1_sqrt = std::sqrt(d1);
331
332 if (d0_sqrt < 3) {
333 if (d1_sqrt < 3) { // if this sample belongs to both of these modes
334 w_sum += set.weight(i);
335 cnt++;
336 } else {
337 d1 = 9; // make max distance to be 9
338 }
339 } else {
340 d0 = 9;
341 }
342
343 T p = dist0.dist_prob_density(d0);
344 p *= w0;
345 T p0 = T(std::log(p));
346 p = dist1.dist_prob_density(d1);
347 p *= w1;
348 T p1 = T(std::log(p));
349
350 sum += p0 + p1; // log is natural logarithm
351
352 total_weight += set.weight(i);
353 }
354
355 // w_sum is the total weight of all the samples assigned to these two modes w_sum/total_weight becomes the probability of this joint mode
356 T prior = w_sum/total_weight;
357 T log_prior = T(std::log(prior));
358 T tot_log_prior = set.size()*log_prior;
359 sum += tot_log_prior;
360
361 return sum;
362 }
363
364
365 //: a specialized matlab file printer for 2D data
366 template<class T>
bsta_sample_set_print_to_m(const bsta_sample_set<T,2> & set,std::ofstream & of)367 bool bsta_sample_set_print_to_m(const bsta_sample_set<T,2>& set, std::ofstream& of)
368 {
369 // print samples in different colors according to the assignment
370 unsigned mode_cnt = set.mode_cnt();
371
372 of << "cmap = colormap(lines(" << mode_cnt << "));\n";
373
374 for (unsigned m = 0; m < mode_cnt; m++) {
375 std::vector<std::pair<T,T> > points;
376 for (unsigned i = 0; i < set.size(); i++) {
377 if (set.assignment(i) == m)
378 points.push_back(std::pair<T,T>(T(set.sample(i)[0]), T(set.sample(i)[1])));
379 }
380 if (points.size() > 0) {
381 of << "x = [" << points[0].first;
382 for (unsigned i = 1; i < points.size(); i++) {
383 of << ", " << points[i].first;
384 }
385 of << "];\n";
386 of << "y = [" << points[0].second;
387 for (unsigned i = 1; i < points.size(); i++) {
388 of << ", " << points[i].second;
389 }
390 of << "];\n";
391 of << "h = plot(x,y,'or');\nset(h, 'Color', cmap(" << m+1 << ",:));\n";
392 of << "hold on\n";
393 }
394
395 vnl_vector_fixed<T,2> mode;
396 set.mode_mean(m, mode);
397 of << "xx = [" << mode[0] << "];\n";
398 of << "yy = [" << mode[1] << "];\n";
399 of << "h = plot(xx,yy,'+r');\nset(h, 'Color', cmap(" << m+1 << ",:));\n";
400 of << "hold on\n";
401 }
402
403 return true;
404 }
405
406 //: a specialized matlab file printer to visualize printed distribution
407 template<class T>
bsta_sample_set_dist_print_to_m(const bsta_sample_set<T,2> & set,std::ofstream & of)408 bool bsta_sample_set_dist_print_to_m(const bsta_sample_set<T,2>& set, std::ofstream& of)
409 {
410 // print samples in different colors according to the assignment
411 unsigned mode_cnt = set.mode_cnt();
412
413 of << "cmap = colormap(lines(" << mode_cnt << "));\n";
414
415 // find range of data to construct surface properly
416 T min = T(10000), max = T(0);
417 for (unsigned i = 0; i < set.size(); i++) {
418 if (set.sample(i)[0] < min)
419 min = set.sample(i)[0];
420 if (set.sample(i)[0] > max)
421 max = set.sample(i)[0];
422 if (set.sample(i)[1] < min)
423 min = set.sample(i)[1];
424 if (set.sample(i)[1] > max)
425 max = set.sample(i)[1];
426 }
427
428 of << "[x,y] = meshgrid(" << min << ":.2:" << max << ", " << min << ":.2:" << max << ");\n";
429
430 T total_weight = set.total_weight();
431 for (unsigned m = 0; m < mode_cnt; m++) {
432 vnl_vector_fixed<T,2> mode;
433 set.mode_mean(m, mode);
434 of << "mu = [" << mode[0] << ' ' << mode[1] << "];\n";
435 vnl_matrix_fixed<T,2,2> covar;
436 if (!bsta_sample_set_variance(set, m, covar))
437 return false;
438 of << "sigma = [";
439 for (unsigned r = 0; r < 2; r++) {
440 for (unsigned c = 0; c < 2; c++)
441 of << covar[r][c] << ' ';
442 if (r == 0)
443 of << ';';
444 }
445 of << "];\n";
446 of << "X = [x(:) y(:)];\n";
447 of << "p = " << set.mode_weight(m)/total_weight << "*mvnpdf(X, mu, sigma);\n";
448 of << "c = " << (m+1) << "*ones(size(x));\n";
449 of << "surf(x,y,reshape(p,size(x,1),size(x,2)),c);\n";
450 of << "hold on\n";
451 }
452
453 return true;
454 }
455
456 //: a specialized matlab file printer to visualize printed distribution
457 template<class T>
bsta_sample_set_dist_print_to_m(const bsta_sample_set<T,1> & set,std::ofstream & of)458 bool bsta_sample_set_dist_print_to_m(const bsta_sample_set<T,1>& set, std::ofstream& of)
459 {
460 // print samples in different colors according to the assignment
461 unsigned mode_cnt = set.mode_cnt();
462
463 of << "cmap = colormap(lines(" << mode_cnt << "));\n";
464
465 // find range of data to construct surface properly
466 T min = T(10000), max = T(0);
467 for (unsigned i = 0; i < set.size(); i++) {
468 if (set.sample(i) < min)
469 min = set.sample(i);
470 if (set.sample(i) > max)
471 max = set.sample(i);
472 }
473
474 of << "X = [" << min << ":.2:" << max << "]';\n";
475
476 T total_weight = set.total_weight();
477 for (unsigned m = 0; m < mode_cnt; m++) {
478 T mode;
479 set.mode_mean(m, mode);
480 of << "mu = [" << mode << "];\n";
481 T var;
482 if (!bsta_sample_set_variance(set, m, MIN_VAR_, var))
483 return false;
484 of << "sigma = [" << var << "];\n";
485 of << "p = " << set.mode_weight(m)/total_weight << "*mvnpdf(X, mu, sigma);\n";
486 of << "hh = plot(X,p,'+r');\nset(hh, 'Color', cmap(" << m+1 << ",:));\n";
487 of << "hold on\n";
488 }
489
490 return true;
491 }
492
493 #endif // bsta_sample_set_h_
494