1 /* _______________________________________________________________________
2
3 Surfpack: A Software Library of Multidimensional Surface Fitting Methods
4 Copyright (c) 2006, Sandia National Laboratories.
5 This software is distributed under the GNU Lesser General Public License.
6 For more information, see the README file in the top Surfpack directory.
7 _______________________________________________________________________ */
8
9 #include "surfpack.h"
10 #include "SurfData.h"
11 #include "SurfpackModel.h"
12 #include "ModelFitness.h"
13 #include "ModelFactory.h"
14
15 using std::cout;
16 using std::endl;
17 using std::max_element;
18 using std::min_element;
19 using std::random_shuffle;
20 using std::set;
21 using std::string;
22 using std::vector;
23 using surfpack::shared_rng;
24
25
26 // --------------------------
27 // implementation of Residual
28 // --------------------------
29
Residual(DifferenceType dt_in)30 Residual::Residual(DifferenceType dt_in) : dt(dt_in)
31 { /* empty ctor */ }
32
33
operator ()(double observed,double predicted) const34 double Residual::operator()(double observed, double predicted) const
35 {
36 switch(dt) {
37 case DT_ABSOLUTE: return fabs(observed - predicted);
38 case DT_SCALED: return fabs(observed - predicted)/fabs(observed);
39 case DT_SQUARED: return (observed-predicted)*(observed-predicted);
40 default: assert(false);
41 }
42 assert(dt == DT_ABSOLUTE || dt == DT_SCALED || dt == DT_SQUARED);
43 return 0.0;
44 }
45
46
47 // ----------------------------
48 // implementation of VecSummary
49 // ----------------------------
50
VecSummary(MetricType mt_in)51 VecSummary::VecSummary(MetricType mt_in)
52 : mt(mt_in)
53 { /* empty ctor */ }
54
55
operator ()(const VecDbl & resids) const56 double VecSummary::operator()(const VecDbl& resids) const
57 {
58 switch (mt) {
59 case MT_SUM: return std::accumulate(resids.begin(),resids.end(),0.0);
60 case MT_MEAN: return surfpack::mean(resids);
61 case MT_ROOT_MEAN: return sqrt(surfpack::mean(resids));
62 case MT_MAXIMUM:
63 VecDbl::const_iterator itr = max_element(resids.begin(),resids.end());
64 return *itr;
65 //default: throw string("Unknown vec summary");
66 }
67 return 0.0;
68 }
69
70
71 // ------------------------------
72 // implementation of ModelFitness
73 // ------------------------------
74
operator ()(const VecDbl & obs,const VecDbl & pred) const75 double ModelFitness::operator()(const VecDbl& obs, const VecDbl& pred) const
76 {
77 throw string("Not implemented for abstract ModelFitness class");
78 }
79
Create(const std::string & metric,unsigned n)80 ModelFitness* ModelFitness::Create(const std::string& metric, unsigned n)
81 {
82 if (metric == "sum_squared") {
83 return new StandardFitness(Residual(DT_SQUARED),VecSummary(MT_SUM));
84 } else if (metric == "mean_squared") {
85 return new StandardFitness(Residual(DT_SQUARED),VecSummary(MT_MEAN));
86 } else if (metric == "root_mean_squared") {
87 return new StandardFitness(Residual(DT_SQUARED),VecSummary(MT_ROOT_MEAN));
88 } else if (metric == "max_squared") {
89 return new StandardFitness(Residual(DT_SQUARED),VecSummary(MT_MAXIMUM));
90 } else if (metric == "sum_scaled") {
91 return new StandardFitness(Residual(DT_SCALED),VecSummary(MT_SUM));
92 } else if (metric == "mean_scaled") {
93 return new StandardFitness(Residual(DT_SCALED),VecSummary(MT_MEAN));
94 } else if (metric == "max_scaled") {
95 return new StandardFitness(Residual(DT_SCALED),VecSummary(MT_MAXIMUM));
96 } else if (metric == "sum_abs") {
97 return new StandardFitness(Residual(DT_ABSOLUTE),VecSummary(MT_SUM));
98 } else if (metric == "mean_abs") {
99 return new StandardFitness(Residual(DT_ABSOLUTE),VecSummary(MT_MEAN));
100 } else if (metric == "max_abs") {
101 return new StandardFitness(Residual(DT_ABSOLUTE),VecSummary(MT_MAXIMUM));
102 } else if (metric == "press") {
103 return new PRESSFitness();
104 } else if (metric == "cv") {
105 return new CrossValidationFitness(n);
106 } else if (metric == "rsquared") {
107 return new R2Fitness();
108 }
109 string msg = "Metric '" + metric + "' not supported";
110 throw msg;
111 return new StandardFitness(Residual(DT_SQUARED),VecSummary(MT_SUM));
112 }
113
114
getResiduals(const Residual & resid,const VecDbl & obs,const VecDbl & pred)115 VecDbl ModelFitness::getResiduals(const Residual& resid,
116 const VecDbl& obs, const VecDbl& pred)
117 {
118 assert(obs.size() == pred.size());
119 VecDbl result(obs.size());
120 for (unsigned i = 0; i < result.size(); i++) {
121 result[i] = resid(obs[i],pred[i]);
122 }
123 return result;
124 }
125
126
127 // ---------------------------------
128 // implementation of StandardFitness
129 // ---------------------------------
130
StandardFitness()131 StandardFitness::StandardFitness()
132 : resid(Residual(DT_SQUARED)), vecsumry(MT_MEAN)
133 { /* empty ctor */ }
134
135
StandardFitness(const Residual & resid_in,const VecSummary & vecsumry_in)136 StandardFitness::StandardFitness(const Residual& resid_in,
137 const VecSummary& vecsumry_in)
138 : resid(resid_in), vecsumry(vecsumry_in)
139 { /* empty ctor */ }
140
141
operator ()(const VecDbl & obs,const VecDbl & pred) const142 double StandardFitness::operator()(const VecDbl& obs, const VecDbl& pred) const
143 {
144 VecDbl residuals = getResiduals(resid,obs,pred);
145 return vecsumry(residuals);
146 }
147
148
operator ()(const SurfpackModel & sm,const SurfData & sd) const149 double StandardFitness::operator()(const SurfpackModel& sm,
150 const SurfData& sd) const
151 {
152 VecDbl predicted = sm(sd);
153 VecDbl observed = sd.getResponses();
154 VecDbl residuals = getResiduals(resid,observed,predicted);
155 return vecsumry(residuals);
156 }
157
158
159 // ----------------------------------------
160 // implementation of CrossValidationFitness
161 // ----------------------------------------
162
163 // BMA TODO: consider moving to root_mean_squared for default metric
CrossValidationFitness()164 CrossValidationFitness::CrossValidationFitness()
165 : ModelFitness(), num_partitions(10), default_metric("mean_squared")
166 { /* empty ctor */ }
167
168
169 // BMA TODO: consider moving to root_mean_squared for default metric
CrossValidationFitness(unsigned n_in)170 CrossValidationFitness::CrossValidationFitness(unsigned n_in)
171 : ModelFitness(), num_partitions(n_in), default_metric("mean_squared")
172 { /* empty ctor */ }
173
174
175 /** Historically CV used a single mean-squared fitness metric */
176 double CrossValidationFitness::
operator ()(const SurfpackModel & sm,const SurfData & sd) const177 operator()(const SurfpackModel& sm, const SurfData& sd) const
178 {
179 // get predictions (estimates) from the cross-validation core
180 VecDbl estimates;
181 leaveout_estimates(estimates, sm, sd);
182 //cout << "CV vals: " << surfpack::fromVec<double>(estimates) << endl;
183
184 // get the active response from SurfData
185 VecDbl responses = sd.getResponses();
186
187 return calc_one_metric(responses, estimates, default_metric);
188 }
189
190
191 /** Perform a single set of CV leave-out builds, but compute multiple metrics */
192 void CrossValidationFitness::
eval_metrics(VecDbl & metric_values,const SurfpackModel & sm,const SurfData & sd,const VecStr & metric_names) const193 eval_metrics(VecDbl& metric_values, const SurfpackModel& sm, const SurfData& sd,
194 const VecStr& metric_names) const
195 {
196 // get predictions (estimates) from the cross-validation core
197 VecDbl estimates;
198 leaveout_estimates(estimates, sm, sd);
199
200 // get the active response from SurfData
201 VecDbl responses = sd.getResponses();
202
203 // now compute all the metrics requested
204 metric_values.clear();
205 metric_values.reserve(metric_names.size());
206 for (VecStr::const_iterator mi = metric_names.begin();
207 mi != metric_names.end(); ++ mi)
208 metric_values.push_back(calc_one_metric(responses, estimates, *mi));
209 }
210
211
212 void CrossValidationFitness::
leaveout_estimates(VecDbl & estimates,const SurfpackModel & sm,const SurfData & sd) const213 leaveout_estimates(VecDbl& estimates, const SurfpackModel& sm,
214 const SurfData& sd) const
215 {
216 // The data is divided into n partitions for cross validation.
217 unsigned n_final = num_partitions;
218
219 // When n = 0, leave out about 10% of the data (n=10 partitions),
220 // but with a upper bound of the data size (results in PRESS).
221 const unsigned default_partitions = 10;
222 if (num_partitions == 0)
223 n_final = default_partitions;
224
225 // enforce a lower bound of 2 and upper bound of data size
226 const unsigned min_partitions = 2;
227 if (n_final > sd.size())
228 n_final = sd.size();
229 else if (n_final < min_partitions)
230 n_final = min_partitions;
231
232 /*if you want cross validation leaving m out then n=sd.size()/m
233
234 low = partition*my_data.size()/n
235 high = (partition+1)*my_data.size()/n-1
236
237 partition loops from 0 to n-1
238
239 say partition=0 => low =0*(n+1)/n=0
240 high=1*(n+1)/n-1=1+1/n-1 = 0
241 say partition=n-1 => low =(n-1)*(n+1)/n = (n^2-1)/n=n
242 high =n*(n+1)/n-1= n+1-1 = n
243
244 if n=my_data.size()-2 then
245 say partition=0 => low =0*(n+2)/n = 0
246 high=1*(n+2)/n-1= 0
247
248 if n=my_data.size()/2 then
249 say partition=0 => low =0*2*n/n = 0
250 high=1*2*n/n-1 = 1
251 say partition=n-1 => low =(n-1)*2*n/n= 2*n-2
252 high= n*2*n/n-1 = 2*n-1
253
254 if n=my_data.size()/3 then
255 say partition=0 => low =0
256 => high=1*3*n/n-1=2
257 say partition=n-1 => low =(n-1)*3*n/n= 3*n-3
258 => high=n*3*n/n-1 = 3*n-1
259 */
260
261 //cout << "CV Fitness: " << n << endl;
262 SurfData my_data = sd; // Get non const copy
263 ParamMap args = sm.parameters();
264
265 // silence model output for cross-validation
266 args["verbosity"] = surfpack::toString<short>(surfpack::SILENT_OUTPUT);
267
268 VecUns indices(my_data.size());
269 for (unsigned i = 0; i < indices.size(); i++) indices[i] = i;
270 random_shuffle(indices.begin(),indices.end(),shared_rng());
271 estimates.resize(my_data.size());
272 for (unsigned partition = 0; partition < n_final; partition++) {
273 //cout << "part: " << partition << endl;
274 SetUns excludedPoints;
275 unsigned low = surfpack::block_low(partition, n_final, my_data.size());
276 unsigned high = surfpack::block_high(partition, n_final, my_data.size());
277 //cout << "low/high: " << low << " " << high << endl;
278 for (unsigned k = low; k <= high; k++) excludedPoints.insert(indices[k]);
279 my_data.setExcludedPoints(excludedPoints);
280 //cout << " excludes: " << excludedPoints.size() << endl;
281 SurfpackModelFactory* factory = ModelFactory::createModelFactory(args);
282 SurfpackModel* model = factory->Build(my_data);
283 my_data.setExcludedPoints(SetUns());
284 for (unsigned k = low; k <= high; k++) {
285 estimates[indices[k]] = (*model)(my_data(indices[k]));
286 //cout << "for k = " << k << ": " << estimates[indices[k]] << endl;
287 }
288 delete model;
289 delete factory;
290 }
291 }
292
293
294 double CrossValidationFitness::
calc_one_metric(const VecDbl & observed,const VecDbl & predicted,const string & metric_name) const295 calc_one_metric(const VecDbl& observed, const VecDbl& predicted,
296 const string& metric_name) const
297 {
298 assert(observed.size() == predicted.size());
299
300 ModelFitness* mf = ModelFitness::Create(metric_name);
301 double fitness = (*mf)(predicted, observed);
302 delete mf;
303
304 return fitness;
305 }
306
307
308 // ------------------------------
309 // implementation of PRESSFitness
310 // ------------------------------
311
PRESSFitness()312 PRESSFitness::PRESSFitness()
313 { /* empty ctor */ }
314
315
operator ()(const SurfpackModel & sm,const SurfData & sd) const316 double PRESSFitness::operator()(const SurfpackModel& sm,
317 const SurfData& sd) const
318 {
319 // create a leave one out cross-validation fitness operator here
320 // since the data size isn't available at construct time (number of
321 // partitions equals number of points) TODO: for efficiency, might
322 // want to just reimplement instead of calling CV fitness; for now,
323 // assume model construction dominates bookeeping arithmetic
324 // overhead
325 ModelFitness* cvmf = ModelFitness::Create("cv", sd.size());
326 double fitness = (*cvmf)(sm, sd);
327 delete cvmf;
328 return fitness;
329 }
330
331
332 // ---------------------------------
333 // implementation of R2Fitness
334 // ---------------------------------
335
R2Fitness()336 R2Fitness::R2Fitness()
337 { /* empty ctor */ }
338
339
operator ()(const SurfpackModel & sm,const SurfData & sd) const340 double R2Fitness::operator()(const SurfpackModel& sm, const SurfData& sd) const
341 {
342
343 VecDbl predicted = sm(sd);
344 VecDbl observed = sd.getResponses();
345
346 return this->operator()(observed, predicted);
347 }
348
349
operator ()(const VecDbl & obs,const VecDbl & pred) const350 double R2Fitness::operator()(const VecDbl& obs, const VecDbl& pred) const
351 {
352 double obs_mean = surfpack::mean(obs);
353 VecDbl vec_mean = VecDbl(obs.size(),obs_mean);
354 StandardFitness sum_squares =
355 StandardFitness(Residual(DT_SQUARED),VecSummary(MT_SUM));
356
357 return sum_squares(pred,vec_mean)/sum_squares(obs,vec_mean);
358 }
359