1 // This is brl/bbas/bsta/algo/bsta_sample_set.h
2 #ifndef bsta_sample_set_h_
3 #define bsta_sample_set_h_
4 //:
5 // \file
6 // \brief Classes to collect samples
7 //
8 // \author Ozge C. Ozcanli
9 // \date March 04, 2009
10 //
11 // \verbatim
12 //  Modifications
13 //   (none yet)
14 // \endverbatim
15 //
16 #include <iostream>
17 #include <fstream>
18 #include <utility>
19 #include <bsta/bsta_parzen_sphere.h>
20 #include <bsta/bsta_mixture.h>
21 #include <bsta/bsta_attributes.h>
22 #include <bsta/bsta_gaussian_sphere.h>
23 #include <bsta/bsta_gaussian_full.h>
24 #include <vnl/vnl_vector_fixed.h>
25 #include <vnl/vnl_matrix_fixed.h>
26 #ifdef _MSC_VER
27 #  include <vcl_msvc_warnings.h>
28 #endif
29 
30 #define MIN_VAR_  0.0001
31 
32 //: A class to hold samples, the window width parameter, weights for each sample, assignments of each sample to modes/cluster centers/classes
33 //  This class is used by mean-shift and EM algorithms
34 template <class T, unsigned n>
35 class bsta_sample_set : public bsta_parzen_sphere<T,n>
36 {
37  public:
38 
39   typedef typename bsta_parzen_sphere<T,n>::vector_type vector_;
40 
41   // Constructor
42   bsta_sample_set(T bandwidth = T(1)) : bsta_parzen_sphere<T,n>() { this->set_bandwidth(bandwidth); }
43 
44   //: Compute the mean in a window around the given pt, the window size is the bandwidth
45   //  If there are no points within bandwidth of the input pt, \return false
46   bool mean(vector_ const& pt, vector_& out);
47 
48   //: Insert a weighted sample into the distribution
49   void insert_sample(vector_ const& sample, T weight = T(1.0));
50 
weight(unsigned i)51   T weight(unsigned i) const { return weights_[i]; }
weights()52   std::vector<T>& weights() { return weights_; }
53 
54   //: one may need to normalize the weights after the insertion is over
55   void normalize_weights();
56 
57   //: must call this method before using the assignment vector
58   void initialize_assignments();  // initializes each sample's assignment to -1 (null assignment)
assignments()59   std::vector<int>& assignments() { return assignments_; }
assignment(unsigned i)60   int assignment(unsigned i) const { return assignments_[i]; }
set_assignment(unsigned i,int mode)61   void set_assignment(unsigned i, int mode) { assignments_[i] = mode; }
62 
63   //: compute the mean of a particular assignment/mode/cluster
64   bool mode_mean(int mode, vector_& out) const;
65 
samples_begin()66   typename std::vector<typename bsta_parzen_sphere<T,n>::vector_type >::const_iterator samples_begin() const { return bsta_parzen<T,n>::samples_.begin(); }
samples_end()67   typename std::vector<typename bsta_parzen_sphere<T,n>::vector_type >::const_iterator samples_end() const { return bsta_parzen<T,n>::samples_.end(); }
68 
weights_begin()69   typename std::vector<T >::const_iterator weights_begin() const { return weights_.begin(); }
weights_end()70   typename std::vector<T >::const_iterator weights_end() const { return weights_.end(); }
71 
assignments_begin()72   std::vector<int >::const_iterator assignments_begin() const { return assignments_.begin(); }
assignments_end()73   std::vector<int >::const_iterator assignments_end() const { return assignments_.end(); }
74 
check_initializations()75   bool check_initializations() const { return bsta_parzen<T,n>::samples_.size() == weights_.size() &&
76                                         bsta_parzen<T,n>::samples_.size() == assignments_.size(); }
77 
78   //: return number of assignments to this mode
79   int mode_size(int mode) const;
80 
81   //: return total weight of assignments to this mode
82   T mode_weight(int mode) const;
83 
84   //: return number of modes in the current assignment vector
85   unsigned mode_cnt() const;
86 
87   //: return total weight of all assignments
88   T total_weight() const;
89 
90  private:
91   //: hold a vector of weights for each data sample
92   //  Needs to be set separately with each insert into the data set,
93   //  otherwise it's set to 1.0 by default at the first call to mean()
94   std::vector<T> weights_;
95 
96   std::vector<int> assignments_;  // a negative value indicates "null assignment"
97 };
98 
99 //: compute the variance of a particular assignment in a bsta_sample_set
100 template <class T>
bsta_sample_set_variance(const bsta_sample_set<T,1> & set,int mode,T min_var,T & out)101 bool bsta_sample_set_variance(const bsta_sample_set<T,1>& set, int mode, T min_var, T& out)
102 {
103   typedef typename std::vector<T >::const_iterator sit_t;
104   typedef typename std::vector<T >::const_iterator wit_t;
105   typedef typename std::vector<int >::const_iterator ait_t;
106 
107   if (!set.check_initializations()) {
108     std::cout << "Error in - bsta_sample_set<T,n>::mean() : assignments not initialized!\n";
109     return false;
110   }
111 
112   T mv;
113   set.mode_mean(mode, mv);
114 
115   T sum(T(0));
116   sit_t sit = set.samples_begin();
117   wit_t wit = set.weights_begin();
118   ait_t ait = set.assignments_begin();
119   T nsamp = 0;
120   for (; sit != set.samples_end(); ++sit, ++wit, ++ait) {
121     if (*ait != mode)
122       continue;
123 
124     T s = (*sit-mv)*(*sit-mv);
125     sum += (*wit)*s;
126     nsamp += (*wit);
127   }
128   if (nsamp > 0) {
129     out = sum / nsamp;
130     if (out < min_var)
131       out = min_var;
132     return true;
133   }
134 
135   return false;
136 }
137 
138 //: compute the variance of a particular assignment in a bsta_sample_set
139 template <class T, unsigned n>
bsta_sample_set_variance(const bsta_sample_set<T,n> & set,int mode,vnl_matrix_fixed<T,n,n> & out)140 bool bsta_sample_set_variance(const bsta_sample_set<T,n>& set, int mode, vnl_matrix_fixed<T,n,n>& out)
141 {
142   typedef typename std::vector<vnl_vector_fixed<T,n> >::const_iterator sit_t;
143   typedef typename std::vector<T >::const_iterator wit_t;
144   typedef typename std::vector<int >::const_iterator ait_t;
145 
146   if (!set.check_initializations()) {
147     std::cout << "Error in - bsta_sample_set<T,n>::mean() : assignments not initialized!\n";
148     return false;
149   }
150 
151   vnl_vector_fixed<T,n> mv;
152   set.mode_mean(mode, mv);
153 
154   vnl_matrix_fixed<T,n,n> sum(T(0));
155   sit_t sit = set.samples_begin();
156   wit_t wit = set.weights_begin();
157   ait_t ait = set.assignments_begin();
158   T nsamp = 0;
159   for (; sit != set.samples_end(); ++sit, ++wit, ++ait) {
160     if (*ait != mode)
161       continue;
162 
163     vnl_vector_fixed<T,n> diff = (*sit)-mv;
164     sum += (*wit)*outer_product(diff,diff);
165     nsamp += (*wit);
166   }
167   if (nsamp > 0) {
168     out = sum / nsamp;
169 
170     return true;
171   }
172 
173   return false;
174 }
175 
176 //: compute the marginalized 1D sample set distribution from nD set
177 template <class T, unsigned n>
bsta_sample_set_marginalize(const bsta_sample_set<T,n> & set,unsigned component,bsta_sample_set<T,1> & out_set)178 bool bsta_sample_set_marginalize(const bsta_sample_set<T,n>& set, unsigned component, bsta_sample_set<T,1>& out_set)
179 {
180   typedef typename std::vector<vnl_vector_fixed<T,n> >::const_iterator sit_t;
181   typedef typename std::vector<T >::const_iterator wit_t;
182 
183   if (n <= component)  // if the vector is not as large to have component return false
184     return false;
185 
186   sit_t sit = set.samples_begin();
187   wit_t wit = set.weights_begin();
188 
189   for (; sit != set.samples_end(); ++sit, ++wit) {
190     out_set.insert_sample((*sit)[component], (*wit));
191   }
192 
193   return true;
194 }
195 
196 template <class T>
bsta_sample_set_fit_distribution(const bsta_sample_set<T,1> & set,bsta_mixture<bsta_num_obs<bsta_gaussian_sphere<T,1>>> & out)197 bool bsta_sample_set_fit_distribution(const bsta_sample_set<T,1>& set, bsta_mixture<bsta_num_obs<bsta_gaussian_sphere<T,1> > >& out)
198 {
199   if (!set.check_initializations()) {
200     std::cout << "Error in - bsta_sample_set<T,n>::mean() : assignments not initialized!\n";
201     return false;
202   }
203 
204   // compute mean and variance for each mode
205   unsigned mode_cnt = set.mode_cnt();
206 
207   while (out.num_components() != 0) {
208     out.remove_last();
209   }
210 
211   T total_weight = set.total_weight();
212   for (unsigned mi = 0; mi < mode_cnt; mi++) {
213     T meanv;
214     set.mode_mean(mi, meanv);
215     T var;
216     if (!bsta_sample_set_variance(set, mi, T(MIN_VAR_), var))
217       return false;
218     T w = set.mode_weight(mi);
219     bsta_gaussian_sphere<T,1> gauss_d(meanv,var);
220     bsta_num_obs<bsta_gaussian_sphere<T,1> > gauss(gauss_d, w);
221     if (!out.insert(gauss, w/total_weight))
222       return false;
223   }
224 
225   return true;
226 }
227 
228 template <class T, unsigned n>
bsta_sample_set_fit_distribution(const bsta_sample_set<T,n> & set,bsta_mixture<bsta_num_obs<bsta_gaussian_full<T,n>>> & out)229 bool bsta_sample_set_fit_distribution(const bsta_sample_set<T,n>& set, bsta_mixture<bsta_num_obs<bsta_gaussian_full<T,n> > >& out)
230 {
231   if (!set.check_initializations()) {
232     std::cout << "Error in - bsta_sample_set<T,n>::mean() : assignments not initialized!\n";
233     return false;
234   }
235 
236   // compute mean and variance for each mode
237   unsigned mode_cnt = set.mode_cnt();
238 
239   while (out.num_components() != 0) {
240     out.remove_last();
241   }
242 
243   T total_weight = set.total_weight();
244   for (unsigned mi = 0; mi < mode_cnt; mi++) {
245     vnl_vector_fixed<T,n> meanv;
246     set.mode_mean(mi, meanv);
247     vnl_matrix_fixed<T,n,n> covar;
248     if (!bsta_sample_set_variance(set, mi, covar))
249       return false;
250     T w = set.mode_weight(mi);
251     bsta_gaussian_full<T,n> gauss_d(meanv,covar);
252     bsta_num_obs<bsta_gaussian_full<T,n> > gauss(gauss_d, w);
253     if (!out.insert(gauss, w/total_weight))
254       return false;
255   }
256 
257   return true;
258 }
259 
260 //:
261 //  Total weight is used to normalize the weight of the distribution
262 //  (bsta_num_obs class contains total weight of samples assigned to this distribution)
263 template <class T>
bsta_sample_set_log_likelihood(const bsta_sample_set<T,1> & set,bsta_num_obs<bsta_gaussian_sphere<T,1>> & dist,T total_weight)264 T bsta_sample_set_log_likelihood(const bsta_sample_set<T,1>& set, bsta_num_obs<bsta_gaussian_sphere<T,1> >& dist, T total_weight)
265 {
266   if (!set.size()) {
267     std::cout << "Error in - bsta_sample_set<T,n>::bsta_sample_set_log_likelihood() : assignments not initialized!\n";
268     return T(0);
269   }
270 
271   T w = dist.num_observations;
272   T p_dist = w/total_weight;
273   T sum = T(0);
274   for (unsigned i = 0; i < set.size(); i++) {
275     if (std::sqrt(dist.sqr_mahalanobis_dist(set.sample(i))) < 3) {// we don't want zero to be logged
276       T p = T(std::log(dist.prob_density(set.sample(i))));
277       T pw = T(std::log(p_dist));
278       sum += p + pw;  // log is natural logarithm
279     }
280   }
281 
282   return sum;
283 }
284 
285 //:
286 //  Total weight is used to normalize the weight of the distribution
287 //  (bsta_num_obs class contains total weight of samples assigned to this distribution)
288 template <class T, unsigned n>
bsta_sample_set_log_likelihood(const bsta_sample_set<T,n> & set,bsta_num_obs<bsta_gaussian_full<T,n>> & dist,T total_weight)289 T bsta_sample_set_log_likelihood(const bsta_sample_set<T,n>& set, bsta_num_obs<bsta_gaussian_full<T,n> >& dist, T total_weight)
290 {
291   if (!set.size()) {
292     std::cout << "Error in - bsta_sample_set<T,n>::bsta_sample_set_log_likelihood() : assignments not initialized!\n";
293     return T(0);
294   }
295 
296   T w = dist.num_observations;
297   T p_dist = w/total_weight;
298   T sum = T(0);
299   for (unsigned i = 0; i < set.size(); i++) {
300     if (std::sqrt(dist.sqr_mahalanobis_dist(set.sample(i))) < 3) {// we don't want zero to be logged
301       T p = T(std::log(dist.prob_density(set.sample(i))));
302       T pw = T(std::log(p_dist));
303       sum += p + pw;  // log is natural logarithm
304     }
305   }
306 
307   return sum;
308 }
309 
310 //:
311 //  Total weight is used to normalize the weight of the distribution
312 //  (bsta_num_obs class contains total weight of samples assigned to this distribution)
313 template <class T>
bsta_sample_set_log_likelihood(const bsta_sample_set<T,2> & set,bsta_num_obs<bsta_gaussian_sphere<T,1>> & dist0,T w0,bsta_num_obs<bsta_gaussian_sphere<T,1>> & dist1,T w1,T & w_sum)314 T bsta_sample_set_log_likelihood(const bsta_sample_set<T,2>& set, bsta_num_obs<bsta_gaussian_sphere<T,1> >& dist0, T w0, bsta_num_obs<bsta_gaussian_sphere<T,1> >& dist1, T w1, T& w_sum)
315 {
316   if (!set.size()) {
317     std::cout << "Error in - bsta_sample_set<T,n>::bsta_sample_set_log_likelihood() : set is empty!\n";
318     return T(0);
319   }
320 
321   T total_weight = T(0);
322   w_sum = T(0);
323   T sum = T(0);
324   unsigned cnt = 0;
325   for (unsigned i = 0; i < set.size(); i++) {
326 
327     T d0 = dist0.sqr_mahalanobis_dist(set.sample(i)[0]);
328     T d1 = dist1.sqr_mahalanobis_dist(set.sample(i)[1]);
329     T d0_sqrt = std::sqrt(d0);
330     T d1_sqrt = std::sqrt(d1);
331 
332     if (d0_sqrt < 3) {
333       if (d1_sqrt < 3) {  // if this sample belongs to both of these modes
334         w_sum += set.weight(i);
335         cnt++;
336       } else {
337         d1 = 9; // make max distance to be 9
338       }
339     } else {
340       d0 = 9;
341     }
342 
343     T p = dist0.dist_prob_density(d0);
344     p *= w0;
345     T p0 = T(std::log(p));
346     p = dist1.dist_prob_density(d1);
347     p *= w1;
348     T p1 = T(std::log(p));
349 
350     sum += p0 + p1;  // log is natural logarithm
351 
352     total_weight += set.weight(i);
353   }
354 
355   // w_sum is the total weight of all the samples assigned to these two modes w_sum/total_weight becomes the probability of this joint mode
356   T prior = w_sum/total_weight;
357   T log_prior = T(std::log(prior));
358   T tot_log_prior = set.size()*log_prior;
359   sum += tot_log_prior;
360 
361   return sum;
362 }
363 
364 
365 //: a specialized matlab file printer for 2D data
366 template<class T>
bsta_sample_set_print_to_m(const bsta_sample_set<T,2> & set,std::ofstream & of)367 bool bsta_sample_set_print_to_m(const bsta_sample_set<T,2>& set, std::ofstream& of)
368 {
369   // print samples in different colors according to the assignment
370   unsigned mode_cnt = set.mode_cnt();
371 
372   of << "cmap = colormap(lines(" << mode_cnt << "));\n";
373 
374   for (unsigned m = 0; m < mode_cnt; m++) {
375     std::vector<std::pair<T,T> > points;
376     for (unsigned i = 0; i < set.size(); i++) {
377       if (set.assignment(i) == m)
378         points.push_back(std::pair<T,T>(T(set.sample(i)[0]), T(set.sample(i)[1])));
379     }
380     if (points.size() > 0) {
381       of << "x = [" << points[0].first;
382       for (unsigned i = 1; i < points.size(); i++) {
383         of << ", " << points[i].first;
384       }
385       of << "];\n";
386       of << "y = [" << points[0].second;
387       for (unsigned i = 1; i < points.size(); i++) {
388         of << ", " << points[i].second;
389       }
390       of << "];\n";
391       of << "h = plot(x,y,'or');\nset(h, 'Color', cmap(" << m+1 << ",:));\n";
392       of << "hold on\n";
393     }
394 
395     vnl_vector_fixed<T,2> mode;
396     set.mode_mean(m, mode);
397     of << "xx = [" << mode[0] << "];\n";
398     of << "yy = [" << mode[1] << "];\n";
399     of << "h = plot(xx,yy,'+r');\nset(h, 'Color', cmap(" << m+1 << ",:));\n";
400     of << "hold on\n";
401   }
402 
403   return true;
404 }
405 
406 //: a specialized matlab file printer to visualize printed distribution
407 template<class T>
bsta_sample_set_dist_print_to_m(const bsta_sample_set<T,2> & set,std::ofstream & of)408 bool bsta_sample_set_dist_print_to_m(const bsta_sample_set<T,2>& set, std::ofstream& of)
409 {
410   // print samples in different colors according to the assignment
411   unsigned mode_cnt = set.mode_cnt();
412 
413   of << "cmap = colormap(lines(" << mode_cnt << "));\n";
414 
415   // find range of data to construct surface properly
416   T min = T(10000), max = T(0);
417   for (unsigned i = 0; i < set.size(); i++) {
418     if (set.sample(i)[0] < min)
419       min = set.sample(i)[0];
420     if (set.sample(i)[0] > max)
421       max = set.sample(i)[0];
422     if (set.sample(i)[1] < min)
423       min = set.sample(i)[1];
424     if (set.sample(i)[1] > max)
425       max = set.sample(i)[1];
426   }
427 
428   of << "[x,y] = meshgrid(" << min << ":.2:" << max << ", " << min << ":.2:" << max << ");\n";
429 
430   T total_weight = set.total_weight();
431   for (unsigned m = 0; m < mode_cnt; m++) {
432     vnl_vector_fixed<T,2> mode;
433     set.mode_mean(m, mode);
434     of << "mu = [" << mode[0] << ' ' << mode[1] << "];\n";
435     vnl_matrix_fixed<T,2,2> covar;
436     if (!bsta_sample_set_variance(set, m, covar))
437       return false;
438     of << "sigma = [";
439     for (unsigned r = 0; r < 2; r++) {
440       for (unsigned c = 0; c < 2; c++)
441         of << covar[r][c] << ' ';
442       if (r == 0)
443         of << ';';
444     }
445     of << "];\n";
446     of << "X = [x(:) y(:)];\n";
447     of << "p = " << set.mode_weight(m)/total_weight << "*mvnpdf(X, mu, sigma);\n";
448     of << "c = " << (m+1) << "*ones(size(x));\n";
449     of << "surf(x,y,reshape(p,size(x,1),size(x,2)),c);\n";
450     of << "hold on\n";
451   }
452 
453   return true;
454 }
455 
456 //: a specialized matlab file printer to visualize printed distribution
457 template<class T>
bsta_sample_set_dist_print_to_m(const bsta_sample_set<T,1> & set,std::ofstream & of)458 bool bsta_sample_set_dist_print_to_m(const bsta_sample_set<T,1>& set, std::ofstream& of)
459 {
460   // print samples in different colors according to the assignment
461   unsigned mode_cnt = set.mode_cnt();
462 
463   of << "cmap = colormap(lines(" << mode_cnt << "));\n";
464 
465   // find range of data to construct surface properly
466   T min = T(10000), max = T(0);
467   for (unsigned i = 0; i < set.size(); i++) {
468     if (set.sample(i) < min)
469       min = set.sample(i);
470     if (set.sample(i) > max)
471       max = set.sample(i);
472   }
473 
474   of << "X = [" << min << ":.2:" << max << "]';\n";
475 
476   T total_weight = set.total_weight();
477   for (unsigned m = 0; m < mode_cnt; m++) {
478     T mode;
479     set.mode_mean(m, mode);
480     of << "mu = [" << mode << "];\n";
481     T var;
482     if (!bsta_sample_set_variance(set, m, MIN_VAR_, var))
483       return false;
484     of << "sigma = [" << var << "];\n";
485     of << "p = " << set.mode_weight(m)/total_weight << "*mvnpdf(X, mu, sigma);\n";
486     of << "hh = plot(X,p,'+r');\nset(hh, 'Color', cmap(" << m+1 << ",:));\n";
487     of << "hold on\n";
488   }
489 
490   return true;
491 }
492 
493 #endif // bsta_sample_set_h_
494