1 /*  _______________________________________________________________________
2 
3     Surfpack: A Software Library of Multidimensional Surface Fitting Methods
4     Copyright (c) 2006, Sandia National Laboratories.
5     This software is distributed under the GNU Lesser General Public License.
6     For more information, see the README file in the top Surfpack directory.
7     _______________________________________________________________________ */
8 
9 #include "surfpack.h"
10 #include "SurfData.h"
11 #include "SurfpackModel.h"
12 #include "ModelFitness.h"
13 #include "ModelFactory.h"
14 
15 using std::cout;
16 using std::endl;
17 using std::max_element;
18 using std::min_element;
19 using std::random_shuffle;
20 using std::set;
21 using std::string;
22 using std::vector;
23 using surfpack::shared_rng;
24 
25 
26 // --------------------------
27 // implementation of Residual
28 // --------------------------
29 
Residual(DifferenceType dt_in)30 Residual::Residual(DifferenceType dt_in) : dt(dt_in)
31 { /* empty ctor */ }
32 
33 
operator ()(double observed,double predicted) const34 double Residual::operator()(double observed, double predicted) const
35 {
36     switch(dt) {
37       case DT_ABSOLUTE: return fabs(observed - predicted);
38       case DT_SCALED: return fabs(observed - predicted)/fabs(observed);
39       case DT_SQUARED: return (observed-predicted)*(observed-predicted);
40       default: assert(false);
41     }
42     assert(dt == DT_ABSOLUTE || dt == DT_SCALED || dt == DT_SQUARED);
43     return 0.0;
44 }
45 
46 
47 // ----------------------------
48 // implementation of VecSummary
49 // ----------------------------
50 
VecSummary(MetricType mt_in)51 VecSummary::VecSummary(MetricType mt_in)
52   : mt(mt_in)
53 { /* empty ctor */ }
54 
55 
operator ()(const VecDbl & resids) const56 double VecSummary::operator()(const VecDbl& resids) const
57 {
58   switch (mt) {
59     case MT_SUM: return std::accumulate(resids.begin(),resids.end(),0.0);
60     case MT_MEAN: return surfpack::mean(resids);
61     case MT_ROOT_MEAN: return sqrt(surfpack::mean(resids));
62     case MT_MAXIMUM:
63       VecDbl::const_iterator itr = max_element(resids.begin(),resids.end());
64       return *itr;
65     //default: throw string("Unknown vec summary");
66   }
67   return 0.0;
68 }
69 
70 
71 // ------------------------------
72 // implementation of ModelFitness
73 // ------------------------------
74 
operator ()(const VecDbl & obs,const VecDbl & pred) const75 double ModelFitness::operator()(const VecDbl& obs, const VecDbl& pred) const
76 {
77   throw string("Not implemented for abstract ModelFitness class");
78 }
79 
Create(const std::string & metric,unsigned n)80 ModelFitness* ModelFitness::Create(const std::string& metric, unsigned n)
81 {
82   if (metric == "sum_squared") {
83     return new StandardFitness(Residual(DT_SQUARED),VecSummary(MT_SUM));
84   } else if (metric == "mean_squared") {
85     return new StandardFitness(Residual(DT_SQUARED),VecSummary(MT_MEAN));
86   } else if (metric == "root_mean_squared") {
87     return new StandardFitness(Residual(DT_SQUARED),VecSummary(MT_ROOT_MEAN));
88   } else if (metric == "max_squared") {
89     return new StandardFitness(Residual(DT_SQUARED),VecSummary(MT_MAXIMUM));
90   } else if (metric == "sum_scaled") {
91     return new StandardFitness(Residual(DT_SCALED),VecSummary(MT_SUM));
92   } else if (metric == "mean_scaled") {
93     return new StandardFitness(Residual(DT_SCALED),VecSummary(MT_MEAN));
94   } else if (metric == "max_scaled") {
95     return new StandardFitness(Residual(DT_SCALED),VecSummary(MT_MAXIMUM));
96   } else if (metric == "sum_abs") {
97     return new StandardFitness(Residual(DT_ABSOLUTE),VecSummary(MT_SUM));
98   } else if (metric == "mean_abs") {
99     return new StandardFitness(Residual(DT_ABSOLUTE),VecSummary(MT_MEAN));
100   } else if (metric == "max_abs") {
101     return new StandardFitness(Residual(DT_ABSOLUTE),VecSummary(MT_MAXIMUM));
102   } else if (metric == "press") {
103     return new PRESSFitness();
104   } else if (metric == "cv") {
105     return new CrossValidationFitness(n);
106   } else if (metric == "rsquared") {
107     return new R2Fitness();
108   }
109   string msg = "Metric '" + metric + "' not supported";
110   throw msg;
111   return new StandardFitness(Residual(DT_SQUARED),VecSummary(MT_SUM));
112 }
113 
114 
getResiduals(const Residual & resid,const VecDbl & obs,const VecDbl & pred)115 VecDbl ModelFitness::getResiduals(const Residual& resid,
116 				  const VecDbl& obs, const VecDbl& pred)
117 {
118   assert(obs.size() == pred.size());
119   VecDbl result(obs.size());
120   for (unsigned i = 0; i < result.size(); i++) {
121     result[i] = resid(obs[i],pred[i]);
122   }
123   return result;
124 }
125 
126 
127 // ---------------------------------
128 // implementation of StandardFitness
129 // ---------------------------------
130 
StandardFitness()131 StandardFitness::StandardFitness()
132 : resid(Residual(DT_SQUARED)), vecsumry(MT_MEAN)
133 { /* empty ctor */ }
134 
135 
StandardFitness(const Residual & resid_in,const VecSummary & vecsumry_in)136 StandardFitness::StandardFitness(const Residual& resid_in,
137 				 const VecSummary& vecsumry_in)
138 : resid(resid_in), vecsumry(vecsumry_in)
139 { /* empty ctor */ }
140 
141 
operator ()(const VecDbl & obs,const VecDbl & pred) const142 double StandardFitness::operator()(const VecDbl& obs, const VecDbl& pred) const
143 {
144   VecDbl residuals = getResiduals(resid,obs,pred);
145   return vecsumry(residuals);
146 }
147 
148 
operator ()(const SurfpackModel & sm,const SurfData & sd) const149 double StandardFitness::operator()(const SurfpackModel& sm,
150 				   const SurfData& sd) const
151 {
152   VecDbl predicted = sm(sd);
153   VecDbl observed = sd.getResponses();
154   VecDbl residuals = getResiduals(resid,observed,predicted);
155   return vecsumry(residuals);
156 }
157 
158 
159 // ----------------------------------------
160 // implementation of CrossValidationFitness
161 // ----------------------------------------
162 
163 // BMA TODO: consider moving to root_mean_squared for default metric
CrossValidationFitness()164 CrossValidationFitness::CrossValidationFitness()
165   : ModelFitness(), num_partitions(10), default_metric("mean_squared")
166 { /* empty ctor */ }
167 
168 
169 // BMA TODO: consider moving to root_mean_squared for default metric
CrossValidationFitness(unsigned n_in)170 CrossValidationFitness::CrossValidationFitness(unsigned n_in)
171   : ModelFitness(), num_partitions(n_in), default_metric("mean_squared")
172 { /* empty ctor */ }
173 
174 
175 /** Historically CV used a single mean-squared fitness metric */
176 double CrossValidationFitness::
operator ()(const SurfpackModel & sm,const SurfData & sd) const177 operator()(const SurfpackModel& sm, const SurfData& sd) const
178 {
179   // get predictions (estimates) from the cross-validation core
180   VecDbl estimates;
181   leaveout_estimates(estimates, sm, sd);
182   //cout << "CV vals: " << surfpack::fromVec<double>(estimates) << endl;
183 
184   // get the active response from SurfData
185   VecDbl responses = sd.getResponses();
186 
187   return calc_one_metric(responses, estimates, default_metric);
188 }
189 
190 
191 /** Perform a single set of CV leave-out builds, but compute multiple metrics */
192 void CrossValidationFitness::
eval_metrics(VecDbl & metric_values,const SurfpackModel & sm,const SurfData & sd,const VecStr & metric_names) const193 eval_metrics(VecDbl& metric_values, const SurfpackModel& sm, const SurfData& sd,
194 	     const VecStr& metric_names) const
195 {
196   // get predictions (estimates) from the cross-validation core
197   VecDbl estimates;
198   leaveout_estimates(estimates, sm, sd);
199 
200   // get the active response from SurfData
201   VecDbl responses = sd.getResponses();
202 
203   // now compute all the metrics requested
204   metric_values.clear();
205   metric_values.reserve(metric_names.size());
206   for (VecStr::const_iterator mi = metric_names.begin();
207        mi != metric_names.end(); ++ mi)
208     metric_values.push_back(calc_one_metric(responses, estimates, *mi));
209 }
210 
211 
212 void CrossValidationFitness::
leaveout_estimates(VecDbl & estimates,const SurfpackModel & sm,const SurfData & sd) const213 leaveout_estimates(VecDbl& estimates, const SurfpackModel& sm,
214 		   const SurfData& sd) const
215 {
216   // The data is divided into n partitions for cross validation.
217   unsigned n_final = num_partitions;
218 
219   // When n = 0, leave out about 10% of the data (n=10 partitions),
220   // but with a upper bound of the data size (results in PRESS).
221   const unsigned default_partitions = 10;
222   if (num_partitions == 0)
223     n_final = default_partitions;
224 
225   // enforce a lower bound of 2 and upper bound of data size
226   const unsigned min_partitions = 2;
227   if (n_final > sd.size())
228     n_final = sd.size();
229   else if (n_final < min_partitions)
230     n_final = min_partitions;
231 
232   /*if you want cross validation leaving m out then n=sd.size()/m
233 
234     low  = partition*my_data.size()/n
235     high = (partition+1)*my_data.size()/n-1
236 
237     partition loops from 0 to n-1
238 
239     say partition=0   => low =0*(n+1)/n=0
240                          high=1*(n+1)/n-1=1+1/n-1 = 0
241     say partition=n-1 => low =(n-1)*(n+1)/n = (n^2-1)/n=n
242                          high =n*(n+1)/n-1= n+1-1 = n
243 
244     if n=my_data.size()-2 then
245     say partition=0   => low =0*(n+2)/n  = 0
246                          high=1*(n+2)/n-1= 0
247 
248     if n=my_data.size()/2 then
249     say partition=0   => low =0*2*n/n    = 0
250                          high=1*2*n/n-1  = 1
251     say partition=n-1 => low =(n-1)*2*n/n= 2*n-2
252                          high= n*2*n/n-1 = 2*n-1
253 
254     if n=my_data.size()/3 then
255     say partition=0   => low =0
256                       => high=1*3*n/n-1=2
257     say partition=n-1 => low =(n-1)*3*n/n= 3*n-3
258                       => high=n*3*n/n-1  = 3*n-1
259   */
260 
261   //cout << "CV Fitness: " << n << endl;
262   SurfData my_data = sd; // Get non const copy
263   ParamMap args = sm.parameters();
264 
265   // silence model output for cross-validation
266   args["verbosity"] = surfpack::toString<short>(surfpack::SILENT_OUTPUT);
267 
268   VecUns indices(my_data.size());
269   for (unsigned i = 0; i < indices.size(); i++) indices[i] = i;
270   random_shuffle(indices.begin(),indices.end(),shared_rng());
271   estimates.resize(my_data.size());
272   for (unsigned partition = 0; partition < n_final; partition++) {
273     //cout << "part: " << partition << endl;
274     SetUns excludedPoints;
275     unsigned low = surfpack::block_low(partition, n_final, my_data.size());
276     unsigned high = surfpack::block_high(partition, n_final, my_data.size());
277     //cout << "low/high: " << low << " " << high << endl;
278     for (unsigned k = low; k <= high; k++) excludedPoints.insert(indices[k]);
279     my_data.setExcludedPoints(excludedPoints);
280     //cout << " excludes: " << excludedPoints.size() << endl;
281     SurfpackModelFactory* factory = ModelFactory::createModelFactory(args);
282     SurfpackModel* model = factory->Build(my_data);
283     my_data.setExcludedPoints(SetUns());
284     for (unsigned k = low; k <= high; k++) {
285       estimates[indices[k]] = (*model)(my_data(indices[k]));
286       //cout << "for k = " << k << ": " << estimates[indices[k]] << endl;
287     }
288     delete model;
289     delete factory;
290   }
291 }
292 
293 
294 double CrossValidationFitness::
calc_one_metric(const VecDbl & observed,const VecDbl & predicted,const string & metric_name) const295 calc_one_metric(const VecDbl& observed, const VecDbl& predicted,
296 		const string& metric_name) const
297 {
298   assert(observed.size() == predicted.size());
299 
300   ModelFitness* mf = ModelFitness::Create(metric_name);
301   double fitness = (*mf)(predicted, observed);
302   delete mf;
303 
304   return fitness;
305 }
306 
307 
308 // ------------------------------
309 // implementation of PRESSFitness
310 // ------------------------------
311 
PRESSFitness()312 PRESSFitness::PRESSFitness()
313 { /* empty ctor */ }
314 
315 
operator ()(const SurfpackModel & sm,const SurfData & sd) const316 double PRESSFitness::operator()(const SurfpackModel& sm,
317 				const SurfData& sd) const
318 {
319   // create a leave one out cross-validation fitness operator here
320   // since the data size isn't available at construct time (number of
321   // partitions equals number of points) TODO: for efficiency, might
322   // want to just reimplement instead of calling CV fitness; for now,
323   // assume model construction dominates bookeeping arithmetic
324   // overhead
325   ModelFitness* cvmf = ModelFitness::Create("cv", sd.size());
326   double fitness = (*cvmf)(sm, sd);
327   delete cvmf;
328   return fitness;
329 }
330 
331 
332 // ---------------------------------
333 // implementation of R2Fitness
334 // ---------------------------------
335 
R2Fitness()336 R2Fitness::R2Fitness()
337 { /* empty ctor */ }
338 
339 
operator ()(const SurfpackModel & sm,const SurfData & sd) const340 double R2Fitness::operator()(const SurfpackModel& sm, const SurfData& sd) const
341 {
342 
343   VecDbl predicted = sm(sd);
344   VecDbl observed = sd.getResponses();
345 
346   return this->operator()(observed, predicted);
347 }
348 
349 
operator ()(const VecDbl & obs,const VecDbl & pred) const350 double R2Fitness::operator()(const VecDbl& obs, const VecDbl& pred) const
351 {
352   double obs_mean = surfpack::mean(obs);
353   VecDbl vec_mean = VecDbl(obs.size(),obs_mean);
354   StandardFitness sum_squares =
355     StandardFitness(Residual(DT_SQUARED),VecSummary(MT_SUM));
356 
357   return sum_squares(pred,vec_mean)/sum_squares(obs,vec_mean);
358 }
359