1 /* _______________________________________________________________________
2
3 Surfpack: A Software Library of Multidimensional Surface Fitting Methods
4 Copyright (c) 2006, Sandia National Laboratories.
5 This software is distributed under the GNU Lesser General Public License.
6 For more information, see the README file in the top Surfpack directory.
7 _______________________________________________________________________ */
8
9 #ifndef __KRIGING_MODEL_HPP__
10 #define __KRIGING_MODEL_HPP__
11
12 //#include "surfpack_system_headers.h"
13 #include "NKM_SurfPack.hpp"
14 #include "NKM_SurfData.hpp"
15 #include "NKM_SurfPackModel.hpp"
16 #include "NKM_Optimize.hpp"
17 //#include "NKM_LinearRegressionModel.hpp"
18 #include <map>
19 #include <string>
20
21 namespace nkm {
22
23 typedef std::map< std::string, std::string> ParamMap;
24
25 // enumerated type stored in nkm::KrigingModel::corrFunc see below for more details
26 enum {DEFAULT_CORR_FUNC, GAUSSIAN_CORR_FUNC, EXP_CORR_FUNC, POW_EXP_CORR_FUNC, MATERN_CORR_FUNC};
27
28 // BMA TODO: Use more descriptive names for variables?
29
30 /**
31 KrigingModel: a class for creating and evaluating Gaussian process
32 emulators with constant, linear, or quadratic trend function.
33 Options for:
34
35 * choice of optimizer
36 * coordinate rotation
37 * evaluation with gradients
38 * nugget to control ill-conditioning.
39 * optimal subset selection to control ill-conditioning
40 * correlation Function (powered exponential or matern families)
41 */
42 class KrigingModel: public SurfPackModel
43 {
44
45 public:
46
47 // MtxDbl& makeGuessFeasible(MtxDbl& nat_log_corr_len, OptimizationProblem *opt);
48
49 //returns a string indicating the correlation function
50 std::string get_corr_func() const;
51
52 std::string model_summary_string() const;
53
54 // Return a string containing the model in "algebraic" format
55 std::string asString() const;
56
57 // BMA TODO: can we redesign so these need not be public?
58 void set_conmin_parameters(OptimizationProblem& opt) const;
59
60 void set_direct_parameters(OptimizationProblem& opt) const;
61
62 // Creating KrigingModels
63
64 /// Default constructor
KrigingModel()65 KrigingModel() : ifChooseNug(false), ifAssumeRcondZero(false), ifPrescribedNug(false), nug(0.0), XR(sdBuild.xr)
66 { /* empty constructor */ };
67
68 /// Standard KrigingModel constructor
69 KrigingModel(const SurfData& sd, const ParamMap& params);
70
71 /// After construction a Kriging model must be created with this
72 /// function (TODO: add builtFlag for safety)
73 void create();
74
75
76 // Evaluating Kriging Models
77
78 /// evaluate (y) the Kriging Model at a single point (xr is a Real row vector)
79 double evaluate(const MtxDbl& xr);
80
81 /// evaluate (y) the Kriging Model at a collection of points xr, one per row
82 MtxDbl& evaluate(MtxDbl& y, const MtxDbl& xr);
83
84 /// evaluate the KrigingModel's adjusted variance at a single point
85 double eval_variance(const MtxDbl& xr);
86
87 /// evaluate the KrigingModel's adjusted variance at a collection of points xr, one per row
88 MtxDbl& eval_variance(MtxDbl& adj_var, const MtxDbl& xr);
89
90 //double get_unadjusted_variance(){return (estVarianceMLE*scaler.unScaleFactorVarY());};
91
92 /// evaluate the partial first derivatives with respect to xr of the models adjusted mean
93 MtxDbl& evaluate_d1y(MtxDbl& d1y, const MtxDbl& xr);
94
95 /// evaluate the partial second derivatives with respect to xr of the models adjusted mean... this gives you the lower triangular, including diagonal, part of the Hessian(s), with each evaluation point being a row in both xr (input) and d2y(output)
96 MtxDbl& evaluate_d2y(MtxDbl& d2y, const MtxDbl& xr);
97
98 // Helpers for solving correlation optimization problems
99
100 /// the objective function, i.e. the negative log(likelihood);
101 /// minimizing this produces a "KrigingModel" good)
objective(const MtxDbl & nat_log_corr_len)102 inline double objective(const MtxDbl& nat_log_corr_len) {
103 MtxDbl corr_len(numTheta,1);
104 for(int i=0; i<numTheta; ++i)
105 corr_len(i,0)=std::exp(nat_log_corr_len(i,0));
106 correlations.newSize(numTheta,1);
107 get_theta_from_corr_len(correlations,corr_len);
108 masterObjectiveAndConstraints(correlations, 1, 0);
109 //printf("[objective]");
110 return obj;
111 };
112
113 /// objective plus condition number constraints
114 //void objectiveAndConstraints(double& obj_out, MtxDbl& con_out,
objectiveAndConstraints(double & obj_out,MtxDbl & con_out,const MtxDbl & nat_log_corr_len)115 inline void objectiveAndConstraints(double& obj_out, MtxDbl& con_out,
116 const MtxDbl& nat_log_corr_len) {
117 //printf("entered objectiveAndConstraints\n"); fflush(stdout);
118 MtxDbl corr_len(numTheta,1);
119 for(int i=0; i<numTheta; ++i)
120 corr_len(i,0)=std::exp(nat_log_corr_len(i,0));
121 correlations.newSize(numTheta,1);
122 get_theta_from_corr_len(correlations,corr_len);
123 con_out.newSize(numConFunc,1);
124 //MtxDbl theta(1,numTheta);
125 for(int i=0; i<numTheta; ++i)
126 correlations(i,0)=0.5*std::exp(-2.0*nat_log_corr_len(i,0));
127 //printf("about to enter masterObjectiveAndConstraints\n"); fflush(stdout);
128 masterObjectiveAndConstraints(correlations, 1, 1);
129 //printf("left masterObjectiveAndConstraints\n"); fflush(stdout);
130 obj_out=obj;
131 for(int i=0; i<numConFunc; i++){
132 //printf("i=%d ",i); fflush(stdout);
133 con_out(i,0)=con(i,0);
134 }
135 //con_out.copy(con);
136 //printf("[objectiveAndConstraints]");
137 //printf("leaving objectiveAndConstraints\n"); fflush(stdout);
138 return;
139 };
140
141 /// return the Number of Trend functions, the trend is represented by an
142 /// arbitrary order multidimensional polynomial, individual trend functions
143 /// are the separate additive terms in that multidimensional polynomial
getNTrend() const144 inline int getNTrend() const
145 { return (Poly.getNCols()); }
146
147 // return the likelihood of this model
getLikelihood()148 inline double getLikelihood()
149 { return likelihood; }
150
min_coefficients(int nvars,int poly_order)151 static int min_coefficients(int nvars, int poly_order)
152 {
153 return num_multi_dim_poly_coef(nvars,poly_order)+nvars;
154 };
155
156 void getRandGuess(MtxDbl& guess) const;
157
158 private:
159
160 #ifdef SURFPACK_HAVE_BOOST_SERIALIZATION
161 // allow serializers access to private data
162 friend class boost::serialization::access;
163 /// serializer for derived class SurfPoint data
164 template<class Archive>
165 void serialize(Archive & archive, const unsigned int version);
166 #endif
167
168 // helper functions
169 void preAllocateMaxMemory();
170 void reorderCopyRtoRChol();
171 void nuggetSelectingCholR();
172 void equationSelectingCholR();
173 void trendSelectingPivotedCholesky();
174
175 /// this function calculates the objective function (negative log
176 /// likelihood) and/or the constraint functions and/or their analytical
177 /// gradients and/or the hessian of the objective function using a
178 /// precompute and store (store across sequential calls to this function)
179 /// strategy to reduce the computational cost. To ensure that precomputed
180 /// and stored values are not changed externally this function return no
181 /// output, instead member variables must be copied out by wrapper functions.
182 /// The objective and contraint derivative modes are bit flags, i.e.
183 /// each is the sum of 2^(all orders of derivative you want). KRD 2010.05.13
184 void masterObjectiveAndConstraints(const MtxDbl& theta, int obj_der_mode,
185 int con_der_mode);
186
187 //void set_conmin_parameters(OptimizationProblem& opt) const;
188
189 /// evaluate the trend function g(xr), using class member Poly
eval_trend_fn(MtxDbl & g,const MtxDbl & xr)190 inline MtxDbl& eval_trend_fn(MtxDbl& g, const MtxDbl& xr) {
191 return (evaluate_poly_basis(g, flyPoly, Poly, xr));
192 }
193
eval_der_trend_fn(MtxDbl & dg,const MtxInt & der,const MtxDbl & xr)194 inline MtxDbl& eval_der_trend_fn(MtxDbl& dg, const MtxInt& der,
195 const MtxDbl& xr) {
196 return (evaluate_poly_der_basis(dg, flyPoly, derivBetaHat, Poly, der, xr));
197 return dg;
198 }
199
200
201 //the following matern_1pt5_... and matern_2pt5_... functions don't really need to be member functions as they don't access any data members
202
203 /// multiply exponential corr func by this to get matern 1.5 corr func
matern_1pt5_coef(double theta_abs_dx) const204 inline double matern_1pt5_coef(double theta_abs_dx) const {
205 return 1.0+theta_abs_dx;
206 };
207 /// multiply matern 1.5 corr func by this to get d1 of matern 1.5 corr func
matern_1pt5_d1_mult_r(double theta,double dx) const208 inline double matern_1pt5_d1_mult_r(double theta, double dx) const {
209 return -theta*theta*dx/matern_1pt5_coef(theta*std::fabs(dx));
210 };
211 /** multiply matern 1.5 corr func by this to get d2 of matern 1.5 corr func
212 1D MATERN_CORR_FUNC 1.5 r(x1,x2) is twice+ differential except
213 where x1==x2 this is correct for x1!=x2 */
matern_1pt5_d2_mult_r(double theta,double dx) const214 inline double matern_1pt5_d2_mult_r(double theta, double dx) const{
215 return theta*theta*(1.0-2.0/matern_1pt5_coef(theta*std::fabs(dx)));
216 };
217
218
219 /// multiply exponential corr func by this to get matern 2.5 corr func
matern_2pt5_coef(double theta_abs_dx) const220 inline double matern_2pt5_coef(double theta_abs_dx) const{
221 return 1.0+theta_abs_dx+theta_abs_dx*theta_abs_dx/3.0;
222 };
223 /// multiply matern 2.5 corr func by this to get d1 of matern 2.5 corr func
matern_2pt5_d1_mult_r(double theta,double dx) const224 inline double matern_2pt5_d1_mult_r(double theta, double dx) const{
225 double theta_abs_dx=theta*std::fabs(dx);
226 return (-theta*theta*dx*(1.0+theta_abs_dx)/
227 (3.0*matern_2pt5_coef(theta_abs_dx)));
228 };
229 /// multiply matern 2.5 corr func by this to get d2 of matern 2.5 corr func
matern_2pt5_d2_mult_r(double theta,double dx) const230 inline double matern_2pt5_d2_mult_r(double theta, double dx) const{
231 double theta_abs_dx=theta*std::fabs(dx);
232 return -theta*theta*(1.0+theta_abs_dx-theta_abs_dx*theta_abs_dx)/
233 (3.0*matern_2pt5_coef(theta_abs_dx));
234 };
235
236
237 // BMA TODO: these docs need updating
238
239 /** converts from correlation lengths to theta
240 for powered exponential (including exponential and Gaussian)
241 theta=1/(powExpCorrLenPow*corr_len^powExpCorrLenPow)
242 for matern (excluding Gaussian)
243 theta= sqrt(2*maternCorrFuncNu)/corr_len */
244 MtxDbl& get_theta_from_corr_len(MtxDbl& theta, const MtxDbl& corr_len) const;
245 /** converts from theta to correlation lengths
246 for powered exponential (including exponential and Gaussian)
247 theta=1/(powExpCorrLenPow*corr_len^powExpCorrLenPow)
248 for matern (excluding Gaussian)
249 theta= sqrt(2*maternCorrFuncNu)/corr_len */
250 MtxDbl& get_corr_len_from_theta(MtxDbl& corr_len, const MtxDbl& theta) const;
251
252 MtxDbl& eval_kriging_correlation_matrix(MtxDbl& r, const MtxDbl& xr) const;
253 MtxDbl& eval_gek_correlation_matrix(MtxDbl& r, const MtxDbl& xr) const;
254 /** r(i,j)=corr_func(xr(i,:),XR(j,:);theta(:)) choices for correlation
255 function are gaussian, exponential, powered exponential with 1<power<2,
256 and matern with nu=1.5 or 2.5 (gaussian and exponential are pulled out
257 for efficient implementation and because they belong to both families).
258 (note that only matern 1.5, matern 2.5, and gaussian are avaliable for
259 Gradient Enhanced Kriging) The convention is that capital matrices are
260 for the data the model is built from, lower case matrices are for
261 arbitrary points to evaluate the model at. This function calls either
262 the eval_kriging_* or eval_gek_* version of the same function depending
263 on wheter Kriging or Gradient Enhanced Kriging (GEK) is being used) */
correlation_matrix(MtxDbl & r,const MtxDbl & xr) const264 inline MtxDbl& correlation_matrix(MtxDbl& r, const MtxDbl& xr) const {
265 if(buildDerOrder==0)
266 return eval_kriging_correlation_matrix(r,xr);
267 else if(buildDerOrder==1)
268 return eval_gek_correlation_matrix(r,xr);
269 else{
270 std::cerr << "unsupported derivative order in\n inline MtxDbl& correlation_matrix(MtxDbl& r, const MtxDble& xr) const\n";
271 assert(false);
272 }
273 };
274
275 MtxDbl& eval_kriging_dcorrelation_matrix_dxI(MtxDbl& dr, const MtxDbl& r, const MtxDbl& xr, int Ider) const;
276 MtxDbl& eval_gek_dcorrelation_matrix_dxI(MtxDbl& dr, const MtxDbl& r, const MtxDbl& xr, int Ider) const;
277 /** if r(i,j) is the evaluation of the corr_func(XR(:,i),xr(:,j)) then this
278 function returns the matrix dr which is the derivative of matrix r with
279 respect to dimension Ider of xr i.e.
280 dr(i,j)=d(r(i,j))/d(xr(Ider,j))
281 combining repeated calls can be used to get arbitrary (mixed) higher
282 order derivatives BUT this doesn't work when the two or more
283 derivatives are with respect to the same input variable; any unmixed
284 higher order derivatives (including when it is a component of an even
285 higher order mixed order derivative) requires special treatment. This
286 function calls eitherthe eval_kriging_* or eval_gek_* version of the
287 same function depending on wheter Kriging or Gradient Enhanced Kriging
288 (GEK) is being used) */
dcorrelation_matrix_dxI(MtxDbl & dr,const MtxDbl & r,const MtxDbl & xr,int Ider) const289 inline MtxDbl& dcorrelation_matrix_dxI(MtxDbl& dr, const MtxDbl& r,
290 const MtxDbl& xr, int Ider) const
291 {
292 if(buildDerOrder==0)
293 return eval_kriging_dcorrelation_matrix_dxI(dr, r, xr, Ider);
294 else if(buildDerOrder==1)
295 return eval_gek_dcorrelation_matrix_dxI(dr, r, xr, Ider);
296 else{
297 std::cerr << "unsupported derivative order in\n inline MtxDbl& dcorrelation_matrix_dxI(MtxDbl& dr, const MtxDbl& r, const MtxDbl& xr, int Ider) const\n";
298 assert(false);
299 }
300 };
301
302 MtxDbl& eval_kriging_d2correlation_matrix_dxIdxJ(MtxDbl& d2r, const MtxDbl& drI, const MtxDbl& r, const MtxDbl& xr, int Ider, int Jder) const;
303 MtxDbl& eval_gek_d2correlation_matrix_dxIdxJ(MtxDbl& d2r, const MtxDbl& drI, const MtxDbl& r, const MtxDbl& xr, int Ider, int Jder) const;
304 /** d2r(i,j)= d^2r(i,j)/dxr(Ider,j)dxr(Jder,j) where j is the point
305 index for xr, i is the point index for XR, and Ider and Jder are
306 the dimensions with respect to which the derivatives are being
307 taken, drI is the first derivative with respect to Ider (which
308 must be precomputed and passed in), this function calls either
309 the eval_kriging_* or eval_gek_* version of the same function
310 depending on whether Kriging OR Gradient Enhanced Kriging (GEK)
311 is being used. */
d2correlation_matrix_dxIdxJ(MtxDbl & d2r,const MtxDbl & drI,const MtxDbl & r,const MtxDbl & xr,int Ider,int Jder) const312 inline MtxDbl& d2correlation_matrix_dxIdxJ(MtxDbl& d2r, const MtxDbl& drI, const MtxDbl& r, const MtxDbl& xr, int Ider, int Jder) const
313 {
314 if(buildDerOrder==0)
315 return eval_kriging_d2correlation_matrix_dxIdxJ(d2r,drI,r,xr,Ider,Jder);
316 else if(buildDerOrder==1)
317 return eval_gek_d2correlation_matrix_dxIdxJ(d2r,drI,r,xr,Ider,Jder);
318 else{
319 std::cerr << "unsupported derivative order in\ninline MtxDbl& d2correlation_matrix_dxIdxJ(MtxDbl& d2r, const MtxDbl& drI, const MtxDbl& r, const MtxDbl& xr, int Ider, int Jder) const\n";
320 assert(false);
321 }
322 };
323
324 /** R(i,j)=corr_func(XR(i,:),XR(j,:);theta(:)) where choices for for
325 correlation function are gaussian, exponential, powered exponential
326 with 1<power<2, and matern with nu=1.5 or 2.5 (gaussian and exponential
327 are pulled out for efficient implementation and because they belong
328 to both families). All of the preceeding correlation functions are
329 available for Kriging, only Gaussian, Matern 1.5 and Matern 2.5 are
330 available for GEK. All correlation functions are implemented as
331 Kriging R=something.*exp(Z*theta) (with reshapes) the something
332 depends on the correlation function (for gaussian, exponential, and
333 powered exponential that something is 1, and the multiplication by
334 1 is not actually done for the sake of efficiency) for the matern
335 function that something is matern_1pt5_coef or matern_2pt5_coef) of
336 course the definition of Z and theta differs for different correlation
337 functions. Note that Z only stores what is needed to compute the
338 strictly lower (BELOW the diagonal) part of the Kriging R to save
339 memory and computation. The Kriging R is symmetric and has ones on
340 the diagonal. The GEK R matrix is blocked into (1+numVarsr) by
341 (1+numVarsr) submatrices. Each submatric has numPoints by numPoints
342 elements. The upper-left-most submatrix is the Kriging R matrix
343 the others submatrices are derivatives of the Kriging R with respect
344 to the various dimensions of the first (rows) and second (columns)
345 inputs of the correlation function */
346 void correlation_matrix(const MtxDbl& corr_vec);
347
348 /** this function applies the nugget to the R matrix (a member variable)
349 and stores the result in R (another member variable), i.e. it adds
350 nug to the diagonal of R. The convention is that capital matrices
351 are for the data the model is built from, lower case matrices are for
352 arbitrary points to evaluate the model at. Once the nugget is added
353 R is not strictly a correlation matrix */
354 void apply_nugget_build();
355
356 /** the Z matrix, Z=Z(XR), its definitition depends on the correlation
357 function
358 for the gaussian correlation function
359 Z(ij,k)=-(XR(i,k)-XR(j,k))^2,
360 for the exponetial and matern correlation functions
361 Z(ij,k)=-|XR(i,k)-XR(j,k)|
362 for the powered exponential correlation function
363 Z(ii,k)=-|XR(i,k)-XR(j,k)|^powExpCorrFuncPow
364 where 1<powExpCorrFuncPow<2
365 the Z matrix facilitates the efficient evaluation of the correlation
366 matrix R... R=something.*exp(Z*theta)
367 note that Z only holds what is needed to compute the strictly lower
368 (below the diagonal) portion of R to save memory and computation.
369 R is symmetric and has ones on the diagonal. The convention is that
370 capital matrices are for the data the model is built from, lower
371 case matrices are for arbitrary points to evaluate the model at,
372 the Z and XR matrices are member variables so they don't need to be
373 passed in */
374 MtxDbl& gen_Z_matrix();
375
376 /** the order of the derivatives this Kriging Model was built for
377 buildDerOrder=0 means function values only (regular Kriging)
378 buildDerOrder=1 means function values plus gradients (Gradient Enhanced
379 Kriging)
380 buildDerOrder>=2 is currently not allowed, a later developer could
381 implement Hessian Enhanced Kriging but KRD did not
382 do this */
383 short buildDerOrder;
384
385 /** number of derivatives used to construct the Kriging/GEK model
386 the zeroth-order derivative, i.e. function value itself counts
387 as a derivative, so for Kriging nDer=1, for GEK nder=1+numVarsr */
388 int nDer;
389
390 /** a "Poly" style matrix (see "Poly" below) that stores derivatives
391 orders used to construct the Kriging/GEK model. Der is a
392 numVarsr by nDer matrix. For Kriging Der is a numVarsr by 1 matrix
393 of zeros. For GEK Der is a numVarsr by 1+numVarsr matrix whose
394 first (index zero) column is all zeros and columns with index 1
395 through numVarsr hold the identiy matrix which is the mixed partial
396 derivative order representation of the gradient. */
397 MtxInt Der;
398
399 /** stores which correlation function we using, major choices are
400 powered exponential with 1<=power<=2 and
401 matern with nu=0.5,1.5,2.5 or "infinity"
402 There are 2 special cases that belong to both families and are
403 pulled out for efficient implementation these are
404 gaussian correlation function
405 equals powered exponential with power=2
406 equals matern with nu="infinity"
407 exponential correlation function
408 equals powered exponential with power=1
409 equals matern with nu = 0.5
410 after these are pulled out we have
411 powered exponential with 1<power<2 and
412 matern with nu = 1.5 or 2.5
413 corrFunc stores an enumerated type
414 */
415 short corrFunc;
416
417 ///the power of the powered exponential family of correlation functions
418 double powExpCorrFuncPow;
419
420 /// the "nu" parameter of the Matern family of correlation functions
421 double maternCorrFuncNu;
422
423 /** used to determine the "small feasible region" that we need to search to
424 find good correlation lengths for the chosen correlation function */
425 double aveDistBetweenPts;
426
427 /// the upper bound of the small feasible region of correlation lengths
428 double maxNatLogCorrLen;
429
430 /// the lower bound of the small feasible region of correlation lengths
431 double minNatLogCorrLen;
432
433 /** the chosen natural log of the correlation LENGTHS (NOT correlation
434 PARAMETERS) */
435 MtxDbl natLogCorrLen;
436
437 /** the vector of correlation parameters (NOT correlation LENGTHS), these
438 are the values determinened by the maximum likelihood optimization, the
439 temporary in process version is called "theta" */
440 MtxDbl correlations;
441
442 /// NUMber of Real input VARiableS => NUMRVARS => NUMVARSR => numVarsr
443 int numVarsr;
444
445 /// NUMber of THETA for Real input variables... this is the number of correlation parameters (for real input variables), for Kriging numTheta=numVarsr, for radial basis functions numTheta=1 (radial basis functions, intended to be a derived class, are independent of direction)
446 int numTheta;
447
448 /** what optimization method did the user want us to use to determine the
449 correlation lengths of the Gaussian Process error model */
450 std::string optimizationMethod;
451
452 /** did the user specify correlation Lengths (either to use directly or
453 to use as the starting location of local optimization) */
454 bool ifUserSpecifiedCorrLengths;
455
456 /// number of starting locations for (possibly multistart) local optimization to try
457 int numStarts;
458
459 /// maximum number of sets of roughness parameters to try
460 int maxTrials;
461
462 ///used if optimization_method = global_local
463 int maxTrialsGlobal;
464
465 ///used if optimization_method = global_local
466 int maxTrialsLocal;
467
468 //"rcond" is now the only allowed constraint type, the eig option was removed
469 //std::string constraintType;
470
471 /** the number of constraint FUNCTIONS (typically these are nonlinear),
472 this number does NOT include box edge constraints for the inputs,
473 those are handled separately, the method of computing analytical
474 derivatives of eigenvalues was questionable also the eigenvalue
475 approach was for the 2 norm condition number of the R matrix when
476 what we care about is actually the 1 norm condition number of the
477 R matrix, so the option for analytical constraints were removed which
478 means numConFunc should be exactly 1 */
479 int numConFunc;
480
481 /** the correlation matrix R is considered to be "ill-conditioned" if
482 if it's condition number exceeds this value. The constraint is that
483 R not be ill conditioned */
484 double maxCondNum;
485
486 /** ifChooseNug==true tells KrigingModel to choose the smallest nugget it
487 needs to fix ill conditioning, ifChooseNug=0 tells KrigingModel not
488 to choose one (the default value for the nugget is zero) but the
489 user still has the option to prescribe a nugget the Nugget must be
490 a positive number */
491 bool ifChooseNug;
492
493 /** iff ifChooseNug==true then ifAssumeRcondZero matters
494 * for normal operations (ifAssumeRcondZero==false) a LAPACK Cholesky
495 factorization is performed, from that rcondR is calculated
496 * if ifAssumeRcondZero==true then we skip this first LAPACK Cholesky and
497 assume rcondR=0.0 (to speed things up)
498 If rcondR says R is ill conditioned, then the minimum size nugget that
499 is guaranteed to fix worst case ill-conditioning for the rcondR is
500 chosen, that nugget is applied to the diagonal of R, and a "second"
501 LAPACK Cholesky (this time of R with the nugget) is performed.
502 ifAssumeRcondZero==true is a way to speed things up by always adding
503 a still very small nugget, it can be particulary useful for Gradient
504 Enhanced Kriging if you would like to add a nugget.*/
505 bool ifAssumeRcondZero;
506
507 /** if ifPrescribedNug==true then the user has prescribed a nugget,
508 (think of the nugget a measurement noise term, it should be
509 roughly the variance of the measurement noise divided by the
510 variance of the output at the build data points) this will
511 cause the GP to smooth the data. Typically this will be more
512 than sufficient to handle ill conditioning, but if not the bound
513 on rcond will be used to restrict the allowable range of correlation
514 parameters so that all data points will be used */
515 bool ifPrescribedNug;
516
517 /** the nugget value sets the ammount of smoothing (approximation instead
518 of interpolation) that the KrigingModel will use, it can also be used
519 to fix ill conditioning, setting ifChooseNug=1 tells KrigingModel to
520 choose the smallest nugget needed to fix ill conditioning */
521 double nug;
522
523 /// the number of build points available
524 int numPoints;
525
526 /** which points are we keeping, for when using Pivoted Cholesky to select
527 an optimal subset of points to retain, if we're not selecting a subset
528 of points then this is ALL points. When GEK is used all but the last
529 point is required to be a whole point */
530 MtxInt iPtsKeep;
531
532 /** the number of points retained after using Pivoted Cholesky to select
533 an optimal subset of points. For GEK partial points are included in
534 this number */
535 int numPointsKeep;
536
537 /** only meaningful is GEK is used,
538 for Kriging numWholePointsKeep=numPointsKeep
539 for GEK numWholePointsKeep is either numPointKeep or numPointsKeep-1 */
540 int numWholePointsKeep;
541
542 /** only meaningful if GEK is used, the number of derivatives the last
543 retained point */
544 int numExtraDerKeep;
545
546 /** the number of equations available to build the Kriging Model
547 for regular Kriging numEqnAvail=numPoints;
548 for Gradient Enhanced Kriging (GEK) numEqnAvail=(1+numVarsr)*numPoints */
549 int numEqnAvail;
550
551 /** the number of rows (and columns) in the R matrix. For Kriging this
552 is identical to numPointsKeep. For GEK this is
553 (1+numVarsr)*numWholePointsKeep +
554 (numPointsKeep-numWholePointsKeep)*(1+numExtraDerKeep) */
555 int numRowsR;
556
557 /** do we have an anchor point, i.e. one point that the user has required us
558 to retain when we select a subset of points to build our Kriging model
559 from */
560 bool ifHaveAnchorPoint;
561
562 /// if we have an Anchor point what is its index?
563 int iAnchorPoint;
564
565 /** the input the model was constructed from; convention is capital
566 matrices are data model is built from, lower case matrices are
567 arbitrary points to evaluate model at, using XR instead of X in
568 anticipation of mixed real and integer input, use a reference XR
569 as shortcut/shorthand-notation to xr in the surfdata
570 */
571 MtxDbl& XR;
572 MtxDbl XRreorder; //a reordered (by pivoted cholesky subset selection)
573 //version of XR to make emulator EVALUATION fast
574
575 /** the output at ALL available build data points, reshaped to a vector.
576 If GEK is used the function value and derivatives at a point are
577 sequential (i.e. a "whole point" at a time) */
578 MtxDbl Yall;
579
580 /** the likely reorderd subset of build point output data that the model
581 was constructed from. If GEK is used, all but the last point is
582 guaranteed to be a whole point (a function value immediately followed
583 by the entire gradient in standard gradient order); the last retained
584 point can be partial (i.e. can be missing some or all derivatives, but is
585 guaranteed to have the function value, the derivatives in the final
586 gradient were not reorderd before trailing entries were dropped). The
587 convention is capital matrices are data model is built from, lower case
588 matrices are arbitrary points to evaluate model at */
589 MtxDbl Y;
590
591 /** the trend basis functions evaluated at all available build data points
592 in their original order. It has npoly rows. For Kriging it has numPoints
593 columns. For GEK it has (1+numVarsr)*numPoints columns with each "whole
594 point" appearing as as 1+numVarsr sequential columns */
595 MtxDbl Gall;
596
597 /** the transpose of the matrix of trend function evaluations at the
598 (likely reorderd) subset of points used to build the Kriging Model.
599 If GEK is used, it will also contain derivatives of the original
600 polynomial basis functions. The convention is that capital matrices
601 are for the data the model is built from, lower case matrices are
602 for arbitrary points to evaluate the model at */
603 MtxDbl Gtran;
604
605 /** true if the user said to use only the main effects (no interaction/mixed
606 terms) in the polynomial basis */
607 bool ifReducedPoly;
608
609 /** this is what the user asked for, highest total order of any term in the
610 polynomial basis */
611 int polyOrderRequested;
612
613 /** this is what was actually used, can be less than what the user asked for
614 if the correlation matrix was ill-conditioned and we had to drop points
615 to fix the ill-conditioning and had to drop trend order to make it less
616 than the remaining points. */
617 int polyOrder;
618
619 /** the number of equations needed for trend functions of order 0 through
620 polyOrderRequested */
621 MtxInt numTrend;
622
623 /// the number of terms in the trend function that was actually used
624 int nTrend;
625
626 /** the indexes of the subset trend basis functions that a pivoted
627 Cholesky factorization of G*R^-1*G^T selected for retention, these
628 indexes must be in "logical order" (monotonically increasing)*/
629 MtxInt iTrendKeep;
630
631 /** the polynomial powers for individual dimensions in each (trend) "basis
632 function" (for now the only choice of polynomial basis functions are
633 multidimensional monomials) in a multidimensional polynomial of
634 arbitrary order
635 For example if a multidimensional monomial is
636 coef * x0^0 * x1^2 * x2^0 * x3^1 *x4^0
637 It's "Poly" representation would be
638 [0 2 0 1 0]^T (matlab notation) with an associated coefficient
639 stored in betaHat
640 Each monomial in a polynomial is a column of the "Poly" matrix
641 This format has many features that make it good for a "permanent"
642 record of polynomials, for example it is convenient for taking
643 analytical derivatives. But it is not particularly efficient to
644 evaluate when a set of numPoints points is stored as a matrix with
645 numVarsr rows and numPoints columns (where each column is a point).
646 */
647 MtxInt Poly;
648
649 /** flyPoly is "work space" for an on the fly generated
650 represenation of polynomials that is concise/fast to evaluate,
651 particularly when a set of numPoints points is stored as a
652 matrix with numVarsr rows and numPoints columns (where each column
653 is a point). But the "flypoly" style representation is not a
654 convenient format for performing analytical derivatives of polynomials.
655 However, converting from a "poly" representation to a "flypoly"
656 representation is trivially easy. See comments in NKM_SurfPack.hpp
657 for a description of the "flypoly" format. */
658 MtxInt flyPoly;
659
660 /** the vector of coefficients of the trend functions (unadjusted mean)
661 betaHat=(G*R^-1*G^T)^-1*(G*R^-1*Y) i.e. the generalized by R^-1
662 least squares fit, the generalization makes it unbiased */
663 MtxDbl betaHat;
664
665 /// modified coefficients for on the fly derivatives of the trend functions
666 MtxDbl derivBetaHat;
667
668 /** the Z matrix, Z=Z(XR),
669 * for the Gaussian Correlation function
670 Z(ij,k)=-(XR(i,k)-XR(j,k))^2
671 * for the Exponential, Matern 1.5 and Matern 2.5 Correlation functions
672 Z(ij,k)=-|XR(i,k)-XR(j,k)|
673 * for the powered Exponential Correlation function (other than the
674 Exponential and Gaussian correlation functions)
675 Z(ij,k)=-(|XR(i,k)-XR(j,k)|^powExpCorrFuncPow)
676 The Z matrix facilitates the efficient evaluation of the correlation
677 matrix R and its derivatives with respect to theta (the vector of
678 correlation parameters);
679 i.e. R=coefficient*exp(Z*theta) the coefficient is 1.0 for the powered
680 Exponential family of correlation functions, it is not 1.0 for the
681 Matern 1.5 and Matern 2.5 correlation functions. The convention is
682 that capital matrices are for the data the model is built from,
683 lower case matrices are for arbitrary points to evaluate model at
684 size(Z)=[numVarsr nchoosek(numPoints,2)] */
685 MtxDbl Z;
686
687 /** working memory (so we don't constantly need to allocate and deallocate it)
688 used during the calculation of R, equals Z^T*theta (matrix multiplication
689 is used),
690 size(Ztran_theta)=[nchoosek(numPoints,2) 1] */
691 MtxDbl Ztran_theta;
692
693 /** deltaXR(ij,k)=XR(i,k)-XR(j,k) for the strictly lower triangular part of R
694 this is useful for efficient computation of derivative enhanced R matrix
695 for Gradient Enhanced Kriging, if Kriging is used this will be an
696 empty matrix (i.e. space will not be allocated for it)
697 size(deltaXR)=[nchoosek(numPoints,2) numVarsr]
698 having a transposed order of Z should make it faster to compute the GEK R
699 matrix sine it is "blocked" into (1+numVarsr) by (1+numVarsr) submatrices.
700 The size of each submatrix is numPoints by numPoints (i.e. the size of
701 the Kriging R matrix) */
702 MtxDbl deltaXR;
703
704 /** the "correlation matrix," for either regular Kriging or Gradient Enhanced
705 Kriging, after possible inclusion of a nugget, use of a nugget causes
706 the KrigingModel to smooth i.e. approximate (which is useful if you would
707 like to account for known measurement noise) rather than interpolate and
708 can be used to fix ill-conditioning. Technically R is only a Correlation
709 Matrix if all of it's diagonal elements = 1.0, this is not the case if
710 you've added a nugget or if you're using Gradient Enhanced Kriging */
711 MtxDbl R;
712
713 //we will need Rinv if we want to evaluate the INTEGRAL of the adjusted variance; if that gets implemented we should compute Rinv as the "last step" of the construction of the Kriging/GEK model
714 //MtxDbl Rinv;
715 //Rinv_Gtran*inv(G_Rinv_Gtran)*Rinv_Gtran^T would also be needed for analtyical integration of the adjusted variance, we should calculate this ONCE after the optimization in complete (when we clear stuff, or maybe we should just go ahead and calculate the integral (mean) of adjusted variance and store the answer in case anyone ever asks for it, then we wouldn't have to store Rinv or the longer matrix forever)
716
717 /** The lower triangular part of the Cholesky decomposition of R (the
718 correlation matrix after possible modification by the inclusion of
719 a nugget). Keep this around to evaluate the adjusted variance.
720 The convention is that capital matrices are for the data the model
721 is built from, lower case matrices are for arbitrary points to
722 evaluate the model at */
723 MtxDbl RChol;
724
725 /** working memory for the factorization of R so that the equilibrated
726 Cholesky Lapack wrapper won't have to allocate memory each time it is
727 called, done for computational efficiency */
728 MtxDbl scaleRChol;
729
730 /** working memory for efficient computation of rcond when using pivoted
731 Choleksy to select an optimal subset of points to retain */
732 MtxDbl sumAbsColR;
733
734 /** working memory for efficient computation of rcond when using pivoted
735 Choleksy to select an optimal subset of points to retain */
736 MtxDbl oneNormR;
737
738 /** working memory for the bisection search for the maximum number of
739 points that can be retained after using Pivoted Cholesky to peform
740 an optimal (in terms of unique information content) reordering
741 of points/equations, used when using Pivoted Cholesky to select an
742 optimal subset of points to retain */
743 MtxDbl lapackRcondR;
744
745 /** working memory for efficient computation of rcond when using pivoted
746 Choleksy to select an optimal subset of points to retain */
747 MtxDbl rcondDblWork;
748
749 /** working memory for efficient computation of rcond when using pivoted
750 Choleksy to select an optimal subset of points to retain */
751 MtxInt rcondIntWork;
752
753 /** the rcond (estimated reciprocal of the condition number) of the
754 modified correlation matrix, R */
755 double rcondR;
756
757 /// rcond of (G^T*R^-1*G)
758 double rcond_G_Rinv_Gtran;
759
760 /// keep around to evaluate adjusted variance
761 MtxDbl Rinv_Gtran;
762
763 /// don't keep arround
764 MtxDbl G_Rinv_Gtran;
765
766 /// keep around to evaluate adjusted variance
767 MtxDbl G_Rinv_Gtran_Chol;
768
769 /** working memory used to calculate G_Rinv_Gtran_Chol efficiently (so we
770 don't have to constantly allocate/deallocate memory) */
771 MtxDbl G_Rinv_Gtran_Chol_Scale;
772
773 /** working memory used to calculate G_Rinv_Gtran_Chol efficiently (so we
774 don't have to constantly allocate/deallocate memory) */
775 MtxDbl G_Rinv_Gtran_Chol_DblWork;
776
777 /** working memory used to calculate G_Rinv_Gtran_Chol efficiently (so we
778 don't have to constantly allocate/deallocate memory) */
779 MtxInt G_Rinv_Gtran_Chol_IntWork;
780
781 /// working memory G*R^-1*Y
782 MtxDbl G_Rinv_Y;
783
784 /// working memory eps=epsilon=Y-G^T*betaHat
785 MtxDbl eps;
786
787 /** rhs = right hand side, rhs=Rinv*(Y-G^T*betaHat); The convention is
788 that capital matrices are for the data the model is built from, lower
789 case matrices are for arbitrary points to evaluate the model at */
790 MtxDbl rhs;
791
792 /// need keep around to evaluate adjusted variance
793 double estVarianceMLE;
794
795 /// the per equation log(likelihood) of the Kriging Model
796 double likelihood;
797
798 /// part of infrastructure to allow masterObjectivesAndConstraints to just "return" (have copied out) the answer if the same point is used in sequential calls
799 int prevObjDerMode;
800
801 /// part of infrastructure to allow masterObjectivesAndConstraints to just "return" (have copied out) the answer if the same point is used in sequential calls
802 int prevConDerMode;
803
804 /// part of infrastructure to allow masterObjectivesAndConstraints to just "return" (have copied out) the answer if the same point is used in sequential calls
805 MtxDbl prevTheta; //(numTheta,1)
806
807 /// part of infrastructure to allow masterObjectivesAndConstraints to just "return" (have copied out) the answer if the same point is used in sequential calls
808 int maxObjDerMode;
809
810 /// part of infrastructure to allow masterObjectivesAndConstraints to just "return" (have copied out) the answer if the same point is used in sequential calls
811 int maxConDerMode;
812
813 /// the objective function for the optimization of correlation lengths, it's the negative "per equation" log likelihood function
814 double obj;
815
816 /// the vector (a numConFunc by 1 matrix) of constraint functions for the optimization of the correlation lengths, it only needs to be a vector for compatibility with the nkm::OptimizationProblem class, otherwise it could be a double
817 MtxDbl con;
818 };
819
820 } // end namespace nkm
821
822 #ifdef SURFPACK_HAVE_BOOST_SERIALIZATION
823 template< class Archive >
serialize(Archive & archive,const unsigned int version)824 void nkm::KrigingModel::serialize(Archive & archive,
825 const unsigned int version)
826 {
827
828 archive & boost::serialization::base_object<nkm::SurfPackModel>(*this);
829 archive & buildDerOrder;
830 archive & nDer;
831 archive & Der;
832 archive & corrFunc;
833 archive & powExpCorrFuncPow;
834 archive & maternCorrFuncNu;
835 archive & aveDistBetweenPts;
836 archive & maxNatLogCorrLen;
837 archive & minNatLogCorrLen;
838 archive & natLogCorrLen;
839 archive & correlations;
840 archive & numVarsr;
841 archive & numTheta;
842 archive & optimizationMethod;
843 archive & ifUserSpecifiedCorrLengths;
844 archive & numStarts;
845 archive & maxTrials;
846 archive & maxTrialsGlobal;
847 archive & maxTrialsLocal;
848 archive & numConFunc;
849 archive & maxCondNum;
850 archive & ifChooseNug;
851 archive & ifAssumeRcondZero;
852 archive & ifPrescribedNug;
853 archive & nug;
854 archive & iPtsKeep;
855 archive & numPoints;
856 archive & numPointsKeep;
857 archive & numWholePointsKeep;
858 archive & numExtraDerKeep;
859 archive & numEqnAvail;
860 archive & numRowsR;
861 archive & ifHaveAnchorPoint;
862 archive & iAnchorPoint;
863 //don't archive XR since it's only a reference into SurfData .xr
864 archive & XRreorder;
865 //archive & Yall; //need this during the construction of a model but not afterward so don't archive
866 archive & Y;
867 //don't archive Gall we need this during the construction of a model but not afterward
868 //don't archive Gtran we need this during the construction of a model but not afterward
869 archive & ifReducedPoly;
870 archive & polyOrderRequested;
871 archive & polyOrder;
872 archive & numTrend;
873 archive & nTrend;
874 //don't archive iTrendKeep, it's not needed because at the discarded terms in the trend basis function are removed from Poly at the end of create()
875 archive & Poly;
876 //don't archive flyPoly, it's work space needed to efficiently evaluate a polynomial basis "on the fly" (it's a member variable only to avoid constant allocation and deallocation)
877 archive & betaHat;
878 //don't archive derivBeta, it's work space needed to efficiently evaluate derivatives of a polynomial (it's a member variable only to avoid constant allocation and deallocation)
879 //don't archive Z, we need it during the construction of a model but not afterward
880 //don't archive Ztran_theta, we need it during the construction of a model but not afterward
881 //don't archive deltaXR, we need it during the construction of a model but not afterward
882 //don't archive R, we need it during the construction of a model but not afterward, once we have ranking of candidate points by pivoteted cholesky we will use it as temporary variable space after the model is constructed but it still won't be something we want to retain
883 //archive & Rinv; //not used, would be used for analytic integral of adjusted variance
884 archive & RChol;
885 //don't archive scaleRChol, we need it during the construction of a model but not afterward
886 //don't archive sumAbsColR, we need it during the construction of a model but not afterward
887 //don't archive oneNormR, we need it during the construction of a model but not afterward
888 //don't archive lapackRcondR, weneed it during the construction of a model but not afterward
889 //don't archive rcondDblWork, we need it during the construction of a model but not afterward
890 //don't archive rcondIntWork, we need it during the construction of a model but not afterward
891 archive & rcondR;
892 archive & rcond_G_Rinv_Gtran;
893 archive & Rinv_Gtran;
894 //don't archive G_Rinv_Gtran, we need it during the construction of a model but not afterward
895 archive & G_Rinv_Gtran_Chol;
896 //don't archive G_Rinv_Gtran_Chol_Scale, we need it during the construction of a model but not afterward
897 //don't archive G_Rinv_Gtran_Chol_DblWork, we need it during the construction of a model but not afterward
898 //don't archive G_Rinv_Gtran_Chol_IntWork, we need it during the construction of a model but not afterward
899 //don't archive G_Rinv_Y, we need it during the construction of a model but not afterward
900 //don't archive eps, we need it during the construction of a model but not afterward
901 archive & rhs;
902 archive & estVarianceMLE;
903 archive & likelihood;
904 //don't archive prevObjDerMode, we need it during the construction of a model but not afterward
905 //don't archive prevConDerMode, we need it during the construction of a model but not afterward
906 //don't archive prevTheta, we need it during the construction of a model but not afterward
907 archive & maxObjDerMode;
908 archive & maxConDerMode;
909 archive & obj;
910 //don't archive con, we need it during the construction of a model but not afterward
911 }
912 #endif
913
914 #endif
915