1 /*  _______________________________________________________________________
2 
3     Surfpack: A Software Library of Multidimensional Surface Fitting Methods
4     Copyright (c) 2006, Sandia National Laboratories.
5     This software is distributed under the GNU Lesser General Public License.
6     For more information, see the README file in the top Surfpack directory.
7     _______________________________________________________________________ */
8 
9 #ifndef __KRIGING_MODEL_HPP__
10 #define __KRIGING_MODEL_HPP__
11 
12 //#include "surfpack_system_headers.h"
13 #include "NKM_SurfPack.hpp"
14 #include "NKM_SurfData.hpp"
15 #include "NKM_SurfPackModel.hpp"
16 #include "NKM_Optimize.hpp"
17 //#include "NKM_LinearRegressionModel.hpp"
18 #include <map>
19 #include <string>
20 
21 namespace nkm {
22 
23 typedef std::map< std::string, std::string> ParamMap;
24 
25 // enumerated type stored in nkm::KrigingModel::corrFunc see below for more details
26 enum {DEFAULT_CORR_FUNC, GAUSSIAN_CORR_FUNC, EXP_CORR_FUNC, POW_EXP_CORR_FUNC, MATERN_CORR_FUNC};
27 
28 // BMA TODO: Use more descriptive names for variables?
29 
30 /**
31     KrigingModel: a class for creating and evaluating Gaussian process
32     emulators with constant, linear, or quadratic trend function.
33     Options for:
34 
35     * choice of optimizer
36     * coordinate rotation
37     * evaluation with gradients
38     * nugget to control ill-conditioning.
39     * optimal subset selection to control ill-conditioning
40     * correlation Function (powered exponential or matern families)
41 */
42 class KrigingModel: public SurfPackModel
43 {
44 
45 public:
46 
47   // MtxDbl& makeGuessFeasible(MtxDbl& nat_log_corr_len, OptimizationProblem *opt);
48 
49   //returns a string indicating the correlation function
50   std::string get_corr_func() const;
51 
52   std::string model_summary_string() const;
53 
54   // Return a string containing the model in "algebraic" format
55   std::string asString() const;
56 
57   // BMA TODO: can we redesign so these need not be public?
58   void set_conmin_parameters(OptimizationProblem& opt) const;
59 
60   void set_direct_parameters(OptimizationProblem& opt) const;
61 
62   // Creating KrigingModels
63 
64   /// Default constructor
KrigingModel()65   KrigingModel() : ifChooseNug(false), ifAssumeRcondZero(false), ifPrescribedNug(false), nug(0.0), XR(sdBuild.xr)
66   { /* empty constructor */ };
67 
68   /// Standard KrigingModel constructor
69   KrigingModel(const SurfData& sd, const ParamMap& params);
70 
71   /// After construction a Kriging model must be created with this
72   /// function (TODO: add builtFlag for safety)
73   void create();
74 
75 
76   // Evaluating Kriging Models
77 
78   /// evaluate (y) the Kriging Model at a single point (xr is a Real row vector)
79   double evaluate(const MtxDbl& xr);
80 
81   /// evaluate (y) the Kriging Model at a collection of points xr, one per row
82   MtxDbl& evaluate(MtxDbl& y, const MtxDbl& xr);
83 
84   /// evaluate the KrigingModel's adjusted variance at a single point
85   double eval_variance(const MtxDbl& xr);
86 
87   /// evaluate the KrigingModel's adjusted variance at a collection of points xr, one per row
88   MtxDbl& eval_variance(MtxDbl& adj_var, const MtxDbl& xr);
89 
90   //double get_unadjusted_variance(){return (estVarianceMLE*scaler.unScaleFactorVarY());};
91 
92   /// evaluate the partial first derivatives with respect to xr of the models adjusted mean
93   MtxDbl& evaluate_d1y(MtxDbl& d1y, const MtxDbl& xr);
94 
95   /// evaluate the partial second derivatives with respect to xr of the models adjusted mean... this gives you the lower triangular, including diagonal, part of the Hessian(s), with each evaluation point being a row in both xr (input) and d2y(output)
96   MtxDbl& evaluate_d2y(MtxDbl& d2y, const MtxDbl& xr);
97 
98   // Helpers for solving correlation optimization problems
99 
100   /// the objective function, i.e. the negative log(likelihood);
101   /// minimizing this produces a "KrigingModel" good)
objective(const MtxDbl & nat_log_corr_len)102   inline double objective(const MtxDbl& nat_log_corr_len) {
103     MtxDbl corr_len(numTheta,1);
104     for(int i=0; i<numTheta; ++i)
105       corr_len(i,0)=std::exp(nat_log_corr_len(i,0));
106     correlations.newSize(numTheta,1);
107     get_theta_from_corr_len(correlations,corr_len);
108     masterObjectiveAndConstraints(correlations, 1, 0);
109     //printf("[objective]");
110     return obj;
111   };
112 
113   /// objective plus condition number constraints
114   //void objectiveAndConstraints(double& obj_out, MtxDbl& con_out,
objectiveAndConstraints(double & obj_out,MtxDbl & con_out,const MtxDbl & nat_log_corr_len)115   inline void objectiveAndConstraints(double& obj_out, MtxDbl& con_out,
116 				      const MtxDbl& nat_log_corr_len) {
117     //printf("entered objectiveAndConstraints\n");  fflush(stdout);
118     MtxDbl corr_len(numTheta,1);
119     for(int i=0; i<numTheta; ++i)
120       corr_len(i,0)=std::exp(nat_log_corr_len(i,0));
121     correlations.newSize(numTheta,1);
122     get_theta_from_corr_len(correlations,corr_len);
123     con_out.newSize(numConFunc,1);
124     //MtxDbl theta(1,numTheta);
125     for(int i=0; i<numTheta; ++i)
126       correlations(i,0)=0.5*std::exp(-2.0*nat_log_corr_len(i,0));
127     //printf("about to enter masterObjectiveAndConstraints\n"); fflush(stdout);
128     masterObjectiveAndConstraints(correlations, 1, 1);
129     //printf("left masterObjectiveAndConstraints\n"); fflush(stdout);
130     obj_out=obj;
131     for(int i=0; i<numConFunc; i++){
132       //printf("i=%d ",i); fflush(stdout);
133       con_out(i,0)=con(i,0);
134     }
135     //con_out.copy(con);
136     //printf("[objectiveAndConstraints]");
137     //printf("leaving objectiveAndConstraints\n");  fflush(stdout);
138     return;
139   };
140 
141   /// return the Number of Trend functions, the trend is represented by an
142   /// arbitrary order multidimensional polynomial, individual trend functions
143   /// are the separate additive terms in that multidimensional polynomial
getNTrend() const144   inline int getNTrend() const
145   { return (Poly.getNCols());   }
146 
147   // return the likelihood of this model
getLikelihood()148   inline double getLikelihood()
149   { return likelihood; }
150 
min_coefficients(int nvars,int poly_order)151   static int min_coefficients(int nvars, int poly_order)
152   {
153     return num_multi_dim_poly_coef(nvars,poly_order)+nvars;
154   };
155 
156   void getRandGuess(MtxDbl& guess) const;
157 
158 private:
159 
160 #ifdef SURFPACK_HAVE_BOOST_SERIALIZATION
161   // allow serializers access to private data
162   friend class boost::serialization::access;
163   /// serializer for derived class SurfPoint data
164   template<class Archive>
165   void serialize(Archive & archive, const unsigned int version);
166 #endif
167 
168   // helper functions
169   void preAllocateMaxMemory();
170   void reorderCopyRtoRChol();
171   void nuggetSelectingCholR();
172   void equationSelectingCholR();
173   void trendSelectingPivotedCholesky();
174 
175   /// this function calculates the objective function (negative log
176   /// likelihood) and/or the constraint functions and/or their analytical
177   /// gradients and/or the hessian of the objective function using a
178   /// precompute and store (store across sequential calls to this function)
179   /// strategy to reduce the computational cost.  To ensure that precomputed
180   /// and stored values are not changed externally this function return no
181   /// output, instead member variables must be copied out by wrapper functions.
182   /// The objective and contraint derivative modes are bit flags, i.e.
183   /// each is the sum of 2^(all orders of derivative you want). KRD 2010.05.13
184   void masterObjectiveAndConstraints(const MtxDbl& theta, int obj_der_mode,
185 				     int con_der_mode);
186 
187   //void set_conmin_parameters(OptimizationProblem& opt) const;
188 
189   /// evaluate the trend function g(xr), using class member Poly
eval_trend_fn(MtxDbl & g,const MtxDbl & xr)190   inline MtxDbl& eval_trend_fn(MtxDbl& g, const MtxDbl& xr) {
191     return (evaluate_poly_basis(g, flyPoly, Poly, xr));
192   }
193 
eval_der_trend_fn(MtxDbl & dg,const MtxInt & der,const MtxDbl & xr)194   inline MtxDbl& eval_der_trend_fn(MtxDbl& dg, const MtxInt& der,
195 				   const MtxDbl& xr) {
196     return (evaluate_poly_der_basis(dg, flyPoly, derivBetaHat, Poly, der, xr));
197     return dg;
198   }
199 
200 
201   //the following matern_1pt5_... and matern_2pt5_... functions don't really need to be member functions as they don't access any data members
202 
203   /// multiply exponential corr func by this to get matern 1.5 corr func
matern_1pt5_coef(double theta_abs_dx) const204   inline double matern_1pt5_coef(double theta_abs_dx) const {
205     return 1.0+theta_abs_dx;
206   };
207   /// multiply matern 1.5 corr func by this to get d1 of matern 1.5 corr func
matern_1pt5_d1_mult_r(double theta,double dx) const208   inline double matern_1pt5_d1_mult_r(double theta, double dx) const {
209     return -theta*theta*dx/matern_1pt5_coef(theta*std::fabs(dx));
210   };
211   /** multiply matern 1.5 corr func by this to get d2 of matern 1.5 corr func
212       1D MATERN_CORR_FUNC 1.5 r(x1,x2) is twice+ differential except
213       where x1==x2 this is correct for x1!=x2 */
matern_1pt5_d2_mult_r(double theta,double dx) const214   inline double matern_1pt5_d2_mult_r(double theta, double dx) const{
215     return theta*theta*(1.0-2.0/matern_1pt5_coef(theta*std::fabs(dx)));
216   };
217 
218 
219   /// multiply exponential corr func by this to get matern 2.5 corr func
matern_2pt5_coef(double theta_abs_dx) const220   inline double matern_2pt5_coef(double theta_abs_dx) const{
221     return 1.0+theta_abs_dx+theta_abs_dx*theta_abs_dx/3.0;
222   };
223   /// multiply matern 2.5 corr func by this to get d1 of matern 2.5 corr func
matern_2pt5_d1_mult_r(double theta,double dx) const224   inline double matern_2pt5_d1_mult_r(double theta, double dx) const{
225     double theta_abs_dx=theta*std::fabs(dx);
226     return (-theta*theta*dx*(1.0+theta_abs_dx)/
227 	    (3.0*matern_2pt5_coef(theta_abs_dx)));
228   };
229   /// multiply matern 2.5 corr func by this to get d2 of matern 2.5 corr func
matern_2pt5_d2_mult_r(double theta,double dx) const230   inline double matern_2pt5_d2_mult_r(double theta, double dx) const{
231     double theta_abs_dx=theta*std::fabs(dx);
232     return -theta*theta*(1.0+theta_abs_dx-theta_abs_dx*theta_abs_dx)/
233       (3.0*matern_2pt5_coef(theta_abs_dx));
234   };
235 
236 
237   // BMA TODO: these docs need updating
238 
239   /** converts from correlation lengths to theta
240       for powered exponential (including exponential and Gaussian)
241           theta=1/(powExpCorrLenPow*corr_len^powExpCorrLenPow)
242       for matern (excluding Gaussian)
243           theta= sqrt(2*maternCorrFuncNu)/corr_len  */
244   MtxDbl& get_theta_from_corr_len(MtxDbl& theta, const MtxDbl& corr_len) const;
245   /** converts from theta to correlation lengths
246       for powered exponential (including exponential and Gaussian)
247           theta=1/(powExpCorrLenPow*corr_len^powExpCorrLenPow)
248       for matern (excluding Gaussian)
249           theta= sqrt(2*maternCorrFuncNu)/corr_len  */
250   MtxDbl& get_corr_len_from_theta(MtxDbl& corr_len, const MtxDbl& theta) const;
251 
252   MtxDbl& eval_kriging_correlation_matrix(MtxDbl& r, const MtxDbl& xr) const;
253   MtxDbl& eval_gek_correlation_matrix(MtxDbl& r, const MtxDbl& xr) const;
254   /** r(i,j)=corr_func(xr(i,:),XR(j,:);theta(:)) choices for correlation
255       function are gaussian, exponential, powered exponential with 1<power<2,
256       and matern with nu=1.5 or 2.5 (gaussian and exponential are pulled out
257       for efficient implementation and because they belong to both families).
258       (note that only matern 1.5, matern 2.5, and gaussian are avaliable for
259       Gradient Enhanced Kriging) The convention is that capital matrices are
260       for the data the model is built from, lower case matrices are for
261       arbitrary points to evaluate the model at. This function calls either
262       the eval_kriging_* or eval_gek_* version of the same function depending
263       on wheter Kriging or Gradient Enhanced Kriging (GEK) is being used) */
correlation_matrix(MtxDbl & r,const MtxDbl & xr) const264   inline MtxDbl& correlation_matrix(MtxDbl& r, const MtxDbl& xr) const {
265     if(buildDerOrder==0)
266       return eval_kriging_correlation_matrix(r,xr);
267     else if(buildDerOrder==1)
268       return eval_gek_correlation_matrix(r,xr);
269     else{
270       std::cerr << "unsupported derivative order in\n  inline MtxDbl& correlation_matrix(MtxDbl& r, const MtxDble& xr) const\n";
271       assert(false);
272     }
273   };
274 
275   MtxDbl& eval_kriging_dcorrelation_matrix_dxI(MtxDbl& dr, const MtxDbl& r, const MtxDbl& xr, int Ider) const;
276   MtxDbl& eval_gek_dcorrelation_matrix_dxI(MtxDbl& dr, const MtxDbl& r, const MtxDbl& xr, int Ider) const;
277   /** if r(i,j) is the evaluation of the corr_func(XR(:,i),xr(:,j)) then this
278       function returns the matrix dr which is the derivative of matrix r with
279       respect to dimension Ider of xr i.e.
280       dr(i,j)=d(r(i,j))/d(xr(Ider,j))
281       combining repeated calls can be used to get arbitrary (mixed) higher
282       order derivatives BUT this doesn't work when the two or more
283       derivatives are with respect to the same input variable; any unmixed
284       higher order derivatives (including when it is a component of an even
285       higher order mixed order derivative) requires special treatment. This
286       function calls eitherthe eval_kriging_* or eval_gek_* version of the
287       same function depending on wheter Kriging or Gradient Enhanced Kriging
288       (GEK) is being used) */
dcorrelation_matrix_dxI(MtxDbl & dr,const MtxDbl & r,const MtxDbl & xr,int Ider) const289   inline MtxDbl& dcorrelation_matrix_dxI(MtxDbl& dr, const MtxDbl& r,
290 					 const MtxDbl& xr, int Ider) const
291   {
292     if(buildDerOrder==0)
293       return eval_kriging_dcorrelation_matrix_dxI(dr, r, xr, Ider);
294     else if(buildDerOrder==1)
295       return eval_gek_dcorrelation_matrix_dxI(dr, r, xr, Ider);
296     else{
297       std::cerr << "unsupported derivative order in\n inline MtxDbl& dcorrelation_matrix_dxI(MtxDbl& dr, const MtxDbl& r, const MtxDbl& xr, int Ider) const\n";
298       assert(false);
299     }
300   };
301 
302   MtxDbl& eval_kriging_d2correlation_matrix_dxIdxJ(MtxDbl& d2r, const MtxDbl& drI, const MtxDbl& r, const MtxDbl& xr, int Ider, int Jder) const;
303   MtxDbl& eval_gek_d2correlation_matrix_dxIdxJ(MtxDbl& d2r, const MtxDbl& drI, const MtxDbl& r, const MtxDbl& xr, int Ider, int Jder) const;
304   /** d2r(i,j)= d^2r(i,j)/dxr(Ider,j)dxr(Jder,j) where j is the point
305       index for xr, i is the point index for XR, and Ider and Jder are
306       the dimensions with respect to which the derivatives are being
307       taken, drI is the first derivative with respect to Ider (which
308       must be precomputed and passed  in), this function calls either
309       the eval_kriging_* or eval_gek_* version of the same function
310       depending on whether Kriging OR Gradient Enhanced Kriging (GEK)
311       is being used. */
d2correlation_matrix_dxIdxJ(MtxDbl & d2r,const MtxDbl & drI,const MtxDbl & r,const MtxDbl & xr,int Ider,int Jder) const312   inline MtxDbl& d2correlation_matrix_dxIdxJ(MtxDbl& d2r, const MtxDbl& drI, const MtxDbl& r, const MtxDbl& xr, int Ider, int Jder) const
313   {
314     if(buildDerOrder==0)
315       return eval_kriging_d2correlation_matrix_dxIdxJ(d2r,drI,r,xr,Ider,Jder);
316     else if(buildDerOrder==1)
317       return eval_gek_d2correlation_matrix_dxIdxJ(d2r,drI,r,xr,Ider,Jder);
318     else{
319       std::cerr << "unsupported derivative order in\ninline MtxDbl& d2correlation_matrix_dxIdxJ(MtxDbl& d2r, const MtxDbl& drI, const MtxDbl& r, const MtxDbl& xr, int Ider, int Jder) const\n";
320       assert(false);
321     }
322   };
323 
324   /** R(i,j)=corr_func(XR(i,:),XR(j,:);theta(:)) where choices for for
325       correlation function are gaussian, exponential, powered exponential
326       with 1<power<2, and matern with nu=1.5 or 2.5 (gaussian and exponential
327       are pulled out for efficient implementation and because they belong
328       to both families).  All of the preceeding correlation functions are
329       available for Kriging, only Gaussian, Matern 1.5 and Matern 2.5 are
330       available for GEK.  All correlation functions are implemented as
331       Kriging R=something.*exp(Z*theta) (with reshapes) the something
332       depends on the correlation function (for gaussian, exponential, and
333       powered exponential that something is 1, and the multiplication by
334       1 is not actually done for the sake of efficiency) for the matern
335       function that something is matern_1pt5_coef or matern_2pt5_coef) of
336       course the definition of Z and theta differs for different correlation
337       functions.  Note that Z only stores what is needed to compute the
338       strictly lower (BELOW the diagonal) part of the Kriging R to save
339       memory and computation.  The Kriging R is symmetric and has ones on
340       the diagonal. The GEK R matrix is blocked into (1+numVarsr) by
341       (1+numVarsr) submatrices.  Each submatric has numPoints by numPoints
342       elements.  The upper-left-most submatrix is the Kriging R matrix
343       the others submatrices are derivatives of the Kriging R with respect
344       to the various dimensions of the first (rows) and second (columns)
345       inputs of the correlation function */
346   void correlation_matrix(const MtxDbl& corr_vec);
347 
348   /** this function applies the nugget to the R matrix (a member variable)
349       and stores the result in R (another member variable), i.e. it adds
350       nug to the diagonal of R. The convention is that capital matrices
351       are for the data the model is built from, lower case matrices are for
352       arbitrary points to evaluate the model at.  Once the nugget is added
353       R is not strictly a correlation matrix */
354   void apply_nugget_build();
355 
356   /** the Z matrix, Z=Z(XR), its definitition depends on the correlation
357       function
358           for the gaussian correlation function
359               Z(ij,k)=-(XR(i,k)-XR(j,k))^2,
360           for the exponetial and matern correlation functions
361               Z(ij,k)=-|XR(i,k)-XR(j,k)|
362 	  for the powered exponential correlation function
363 	      Z(ii,k)=-|XR(i,k)-XR(j,k)|^powExpCorrFuncPow
364 	      where 1<powExpCorrFuncPow<2
365       the Z matrix facilitates the efficient evaluation of the correlation
366       matrix R... R=something.*exp(Z*theta)
367       note that Z only holds what is needed to compute the strictly lower
368       (below the diagonal) portion of R to save memory and computation.
369       R is symmetric and has ones on the diagonal.  The convention is that
370       capital matrices are for the data the model is built from, lower
371       case matrices are for arbitrary points to evaluate the model at,
372       the Z and XR matrices are member variables so they don't need to be
373       passed in */
374   MtxDbl& gen_Z_matrix();
375 
376   /** the order of the derivatives this Kriging Model was built for
377       buildDerOrder=0  means function values only (regular Kriging)
378       buildDerOrder=1  means function values plus gradients (Gradient Enhanced
379                        Kriging)
380       buildDerOrder>=2 is currently not allowed, a later developer could
381                        implement Hessian Enhanced Kriging but KRD did not
382 		       do this  */
383   short buildDerOrder;
384 
385   /** number of derivatives used to construct the Kriging/GEK model
386       the zeroth-order derivative, i.e. function value itself counts
387       as a derivative, so for Kriging nDer=1, for GEK nder=1+numVarsr */
388   int nDer;
389 
390   /** a "Poly" style matrix (see "Poly" below) that stores derivatives
391       orders used to construct the Kriging/GEK model.  Der is a
392       numVarsr by nDer matrix. For Kriging Der is a numVarsr by 1 matrix
393       of zeros.  For GEK Der is a numVarsr by 1+numVarsr matrix whose
394       first (index zero) column is all zeros and columns with index 1
395       through numVarsr hold the identiy matrix which is the mixed partial
396       derivative order representation of the gradient. */
397   MtxInt Der;
398 
399   /** stores which correlation function we using, major choices are
400           powered exponential with 1<=power<=2 and
401           matern with nu=0.5,1.5,2.5 or "infinity"
402       There are 2 special cases that belong to both families and are
403       pulled out for efficient implementation these are
404           gaussian correlation function
405               equals powered exponential with power=2
406               equals matern with nu="infinity"
407           exponential correlation function
408               equals powered exponential with power=1
409 	      equals matern with nu = 0.5
410       after these are pulled out we have
411       powered exponential with 1<power<2 and
412       matern with nu = 1.5 or 2.5
413       corrFunc stores an enumerated type
414   */
415   short corrFunc;
416 
417   ///the power of the powered exponential family of correlation functions
418   double powExpCorrFuncPow;
419 
420   /// the "nu" parameter of the Matern family of correlation functions
421   double maternCorrFuncNu;
422 
423   /** used to determine the "small feasible region" that we need to search to
424       find good correlation lengths for the chosen correlation function */
425   double aveDistBetweenPts;
426 
427   /// the upper bound of the small feasible region of correlation lengths
428   double maxNatLogCorrLen;
429 
430   /// the lower bound of the small feasible region of correlation lengths
431   double minNatLogCorrLen;
432 
433   /** the chosen natural log of the correlation LENGTHS (NOT correlation
434       PARAMETERS) */
435   MtxDbl natLogCorrLen;
436 
437   /** the vector of correlation parameters (NOT correlation LENGTHS), these
438       are the values determinened by the maximum likelihood optimization, the
439       temporary in process version is called "theta" */
440   MtxDbl correlations;
441 
442   /// NUMber of Real input VARiableS => NUMRVARS => NUMVARSR => numVarsr
443   int numVarsr;
444 
445   /// NUMber of THETA for Real input variables... this is the number of correlation parameters (for real input variables), for Kriging numTheta=numVarsr, for radial basis functions numTheta=1 (radial basis functions, intended to be a derived class, are independent of direction)
446   int numTheta;
447 
448   /** what optimization method did the user want us to use to determine the
449       correlation lengths of the Gaussian Process error model */
450   std::string optimizationMethod;
451 
452   /** did the user specify correlation Lengths (either to use directly or
453       to use as the starting location of local optimization) */
454   bool ifUserSpecifiedCorrLengths;
455 
456   /// number of starting locations for (possibly multistart) local optimization to try
457   int numStarts;
458 
459   /// maximum number of sets of roughness parameters to try
460   int maxTrials;
461 
462   ///used if optimization_method = global_local
463   int maxTrialsGlobal;
464 
465   ///used if optimization_method = global_local
466   int maxTrialsLocal;
467 
468   //"rcond" is now the only allowed constraint type, the eig option was removed
469   //std::string constraintType;
470 
471   /** the number of constraint FUNCTIONS (typically these are nonlinear),
472       this number does NOT include box edge constraints for the inputs,
473       those are handled separately, the method of computing analytical
474       derivatives of eigenvalues was questionable also the eigenvalue
475       approach was for the 2 norm condition number of the R matrix when
476       what we care about is actually the 1 norm condition number of the
477       R matrix, so the option for analytical constraints were removed which
478       means numConFunc should be exactly 1 */
479   int numConFunc;
480 
481   /** the correlation matrix R is considered to be "ill-conditioned" if
482       if it's condition number exceeds this value. The constraint is that
483       R not be ill conditioned */
484   double maxCondNum;
485 
486   /** ifChooseNug==true tells KrigingModel to choose the smallest nugget it
487       needs to fix ill conditioning, ifChooseNug=0 tells KrigingModel not
488       to choose one (the default value for the nugget is zero) but the
489       user still has the option to prescribe a nugget the Nugget must be
490       a positive number */
491   bool ifChooseNug;
492 
493   /** iff ifChooseNug==true then ifAssumeRcondZero matters
494       * for normal operations (ifAssumeRcondZero==false) a LAPACK Cholesky
495         factorization is performed, from that rcondR is calculated
496       * if ifAssumeRcondZero==true then we skip this first LAPACK Cholesky and
497         assume rcondR=0.0 (to speed things up)
498       If rcondR says R is ill conditioned, then the minimum size nugget that
499       is guaranteed to fix worst case ill-conditioning for the rcondR is
500       chosen, that nugget is applied to the diagonal of R, and a "second"
501       LAPACK Cholesky (this time of R with the nugget) is performed.
502       ifAssumeRcondZero==true is a way to speed things up by always adding
503       a still very small nugget, it can be particulary useful for Gradient
504       Enhanced Kriging if you would like to add a nugget.*/
505   bool ifAssumeRcondZero;
506 
507   /** if ifPrescribedNug==true then the user has prescribed a nugget,
508       (think of the nugget a measurement noise term, it should be
509       roughly the variance of the measurement noise divided by the
510       variance of the output at the build data points) this will
511       cause the GP to smooth the data.  Typically this will be more
512       than sufficient to handle ill conditioning, but if not the bound
513       on rcond will be used to restrict the allowable range of correlation
514       parameters so that all data points will be used */
515   bool ifPrescribedNug;
516 
517   /** the nugget value sets the ammount of smoothing (approximation instead
518       of interpolation) that the KrigingModel will use, it can also be used
519       to fix ill conditioning, setting ifChooseNug=1 tells KrigingModel to
520       choose the smallest nugget needed to fix ill conditioning */
521   double nug;
522 
523   /// the number of build points available
524   int numPoints;
525 
526   /** which points are we keeping, for when using Pivoted Cholesky to select
527       an optimal subset of points to retain, if we're not selecting a subset
528       of points then this is ALL points.  When GEK is used all but the last
529       point is required to be a whole point */
530   MtxInt iPtsKeep;
531 
532   /** the number of points retained after using Pivoted Cholesky to select
533       an optimal subset of points. For GEK partial points are included in
534       this number */
535   int numPointsKeep;
536 
537   /** only meaningful is GEK is used,
538       for Kriging numWholePointsKeep=numPointsKeep
539       for GEK numWholePointsKeep is either numPointKeep or numPointsKeep-1 */
540   int numWholePointsKeep;
541 
542   /** only meaningful if GEK is used, the number of derivatives the last
543       retained point */
544   int numExtraDerKeep;
545 
546   /** the number of equations available to build the Kriging Model
547       for regular Kriging numEqnAvail=numPoints;
548       for Gradient Enhanced Kriging (GEK) numEqnAvail=(1+numVarsr)*numPoints */
549   int numEqnAvail;
550 
551   /** the number of rows (and columns) in the R matrix.  For Kriging this
552       is identical to numPointsKeep.  For GEK this is
553       (1+numVarsr)*numWholePointsKeep +
554       (numPointsKeep-numWholePointsKeep)*(1+numExtraDerKeep) */
555   int numRowsR;
556 
557   /** do we have an anchor point, i.e. one point that the user has required us
558       to retain when we select a subset of points to build our Kriging model
559       from */
560   bool ifHaveAnchorPoint;
561 
562   /// if we have an Anchor point what is its index?
563   int  iAnchorPoint;
564 
565   /** the input the model was constructed from; convention is capital
566       matrices are data model is built from, lower case matrices are
567       arbitrary points to evaluate model at, using XR instead of X in
568       anticipation of mixed real and integer input, use a reference XR
569       as shortcut/shorthand-notation to xr in the surfdata
570   */
571   MtxDbl& XR;
572   MtxDbl XRreorder;  //a reordered (by pivoted cholesky subset selection)
573   //version of XR to make emulator EVALUATION fast
574 
575   /** the output at ALL available build data points, reshaped to a vector.
576       If GEK is used the function value and derivatives at a point are
577       sequential (i.e. a "whole point" at a time) */
578   MtxDbl Yall;
579 
580   /** the likely reorderd subset of build point output data that the model
581       was constructed from. If GEK is used, all but the last point is
582       guaranteed to be a whole point (a function value immediately followed
583       by the entire gradient in standard gradient order); the last retained
584       point can be partial (i.e. can be missing some or all derivatives, but is
585       guaranteed to have the function value, the derivatives in the final
586       gradient were not reorderd before trailing entries were dropped). The
587       convention is capital matrices are data model is built from, lower case
588       matrices are arbitrary points to evaluate model at */
589   MtxDbl Y;
590 
591   /** the trend basis functions evaluated at all available build data points
592       in their original order.  It has npoly rows. For Kriging it has numPoints
593       columns.  For GEK it has (1+numVarsr)*numPoints columns with each "whole
594       point" appearing as as 1+numVarsr sequential columns */
595   MtxDbl Gall;
596 
597   /** the transpose of the matrix of trend function evaluations at the
598       (likely reorderd) subset of points used to build the Kriging Model.
599       If GEK is used, it will also contain derivatives of the original
600       polynomial basis functions. The convention is that capital matrices
601       are for the data the model is built from, lower case matrices are
602       for arbitrary points to evaluate the model at */
603   MtxDbl Gtran;
604 
605   /** true if the user said to use only the main effects (no interaction/mixed
606       terms) in the polynomial basis */
607   bool ifReducedPoly;
608 
609   /** this is what the user asked for, highest total order of any term in the
610       polynomial basis */
611   int polyOrderRequested;
612 
613   /** this is what was actually used, can be less than what the user asked for
614       if the correlation matrix was ill-conditioned and we had to drop points
615       to fix the ill-conditioning and had to drop trend order to make it less
616       than the remaining points. */
617   int polyOrder;
618 
619   /** the number of equations needed for trend functions of order 0 through
620       polyOrderRequested */
621   MtxInt numTrend;
622 
623   /// the number of terms in the trend function that was actually used
624   int nTrend;
625 
626   /** the indexes of the subset trend basis functions that a pivoted
627       Cholesky factorization of G*R^-1*G^T selected for retention, these
628       indexes must be in "logical order" (monotonically increasing)*/
629   MtxInt iTrendKeep;
630 
631   /** the polynomial powers for individual dimensions in each (trend) "basis
632       function" (for now the only choice of polynomial basis functions are
633       multidimensional monomials) in a multidimensional polynomial of
634       arbitrary order
635       For example if a multidimensional monomial is
636           coef * x0^0 * x1^2 * x2^0 * x3^1 *x4^0
637       It's "Poly" representation would be
638 	  [0 2 0 1 0]^T (matlab notation) with an associated coefficient
639 	  stored in betaHat
640       Each monomial in a polynomial is a column of the "Poly" matrix
641       This format has many features that make it good for a "permanent"
642       record of polynomials, for example it is convenient for taking
643       analytical derivatives.  But it is not particularly efficient to
644       evaluate when a set of numPoints points is stored as a matrix with
645       numVarsr rows and numPoints columns (where each column is a point).
646   */
647   MtxInt Poly;
648 
649   /** flyPoly is "work space" for an on the fly generated
650       represenation of polynomials that is concise/fast to evaluate,
651       particularly when a set of numPoints points is stored as a
652       matrix with numVarsr rows and numPoints columns (where each column
653       is a point).  But the "flypoly" style representation is not a
654       convenient format for performing analytical derivatives of polynomials.
655       However, converting from a "poly" representation to a "flypoly"
656       representation is trivially easy. See comments in NKM_SurfPack.hpp
657       for a description of the "flypoly" format. */
658   MtxInt flyPoly;
659 
660   /** the vector of coefficients of the trend functions (unadjusted mean)
661       betaHat=(G*R^-1*G^T)^-1*(G*R^-1*Y) i.e. the generalized by R^-1
662       least squares fit, the generalization makes it unbiased */
663   MtxDbl betaHat;
664 
665   /// modified coefficients for on the fly derivatives of the trend functions
666   MtxDbl derivBetaHat;
667 
668   /** the Z matrix, Z=Z(XR),
669       * for the Gaussian Correlation function
670         Z(ij,k)=-(XR(i,k)-XR(j,k))^2
671       * for the Exponential, Matern 1.5 and Matern 2.5 Correlation functions
672         Z(ij,k)=-|XR(i,k)-XR(j,k)|
673       * for the powered Exponential Correlation function (other than the
674         Exponential and Gaussian correlation functions)
675         Z(ij,k)=-(|XR(i,k)-XR(j,k)|^powExpCorrFuncPow)
676       The Z matrix facilitates the efficient evaluation of the correlation
677       matrix R and its derivatives with respect to theta (the vector of
678       correlation parameters);
679       i.e. R=coefficient*exp(Z*theta) the coefficient is 1.0 for the powered
680       Exponential family of correlation functions, it is not 1.0 for the
681       Matern 1.5 and Matern 2.5 correlation functions.  The convention is
682       that capital matrices are for the data the model is built from,
683       lower case matrices are for arbitrary points to evaluate model at
684       size(Z)=[numVarsr nchoosek(numPoints,2)] */
685   MtxDbl Z;
686 
687   /** working memory (so we don't constantly need to allocate and deallocate it)
688       used during the calculation of R, equals Z^T*theta (matrix multiplication
689       is used),
690       size(Ztran_theta)=[nchoosek(numPoints,2) 1] */
691   MtxDbl Ztran_theta;
692 
693   /** deltaXR(ij,k)=XR(i,k)-XR(j,k) for the strictly lower triangular part of R
694       this is useful for efficient computation of derivative enhanced R matrix
695       for Gradient Enhanced Kriging, if Kriging is used this will be an
696       empty matrix (i.e. space will not be allocated for it)
697       size(deltaXR)=[nchoosek(numPoints,2) numVarsr]
698       having a transposed order of Z should make it faster to compute the GEK R
699       matrix sine it is "blocked" into (1+numVarsr) by (1+numVarsr) submatrices.
700       The size of each submatrix is numPoints by numPoints (i.e. the size of
701       the Kriging R matrix) */
702   MtxDbl deltaXR;
703 
704   /** the "correlation matrix," for either regular Kriging or Gradient Enhanced
705       Kriging, after possible inclusion of a nugget, use of a nugget causes
706       the KrigingModel to smooth i.e. approximate (which is useful if you would
707       like to account for known measurement noise) rather than interpolate and
708       can be used to fix ill-conditioning. Technically R is only a Correlation
709       Matrix if all of it's diagonal elements = 1.0, this is not the case if
710       you've added a nugget or if you're using Gradient Enhanced Kriging */
711   MtxDbl R;
712 
713   //we will need Rinv if we want to evaluate the INTEGRAL of the adjusted variance; if that gets implemented we should compute Rinv as the "last step" of the construction of the Kriging/GEK model
714   //MtxDbl Rinv;
715   //Rinv_Gtran*inv(G_Rinv_Gtran)*Rinv_Gtran^T would also be needed for analtyical integration of the adjusted variance, we should calculate this ONCE after the optimization in complete (when we clear stuff, or maybe we should just go ahead and calculate the integral (mean) of adjusted variance and store the answer in case anyone ever asks for it, then we wouldn't have to store Rinv or the longer matrix forever)
716 
717   /** The lower triangular part of the Cholesky decomposition of R (the
718       correlation matrix after possible modification by the inclusion of
719       a nugget).  Keep this around to evaluate the adjusted variance.
720       The convention is that capital matrices are for the data the model
721       is built from, lower case matrices are for arbitrary points to
722       evaluate the model at */
723   MtxDbl RChol;
724 
725   /** working memory for the factorization of R so that the equilibrated
726       Cholesky Lapack wrapper won't have to allocate memory each time it is
727       called, done for computational efficiency */
728   MtxDbl scaleRChol;
729 
730   /** working memory for efficient computation of rcond when using pivoted
731       Choleksy to select an optimal subset of points to retain */
732   MtxDbl sumAbsColR;
733 
734   /** working memory for efficient computation of rcond when using pivoted
735       Choleksy to select an optimal subset of points to retain */
736   MtxDbl oneNormR;
737 
738   /** working memory for the bisection search for the maximum number of
739       points that can be retained after using Pivoted Cholesky to peform
740       an optimal (in terms of unique information content) reordering
741       of points/equations, used when using Pivoted Cholesky to select an
742       optimal subset of points to retain */
743   MtxDbl lapackRcondR;
744 
745   /** working memory for efficient computation of rcond when using pivoted
746       Choleksy to select an optimal subset of points to retain */
747   MtxDbl rcondDblWork;
748 
749   /** working memory for efficient computation of rcond when using pivoted
750       Choleksy to select an optimal subset of points to retain */
751   MtxInt rcondIntWork;
752 
753   /** the rcond (estimated reciprocal of the condition number) of the
754       modified correlation matrix, R */
755   double rcondR;
756 
757   /// rcond of (G^T*R^-1*G)
758   double rcond_G_Rinv_Gtran;
759 
760   /// keep around to evaluate adjusted variance
761   MtxDbl Rinv_Gtran;
762 
763   /// don't keep arround
764   MtxDbl G_Rinv_Gtran;
765 
766   /// keep around to evaluate adjusted variance
767   MtxDbl G_Rinv_Gtran_Chol;
768 
769   /** working memory used to calculate G_Rinv_Gtran_Chol efficiently (so we
770       don't have to constantly allocate/deallocate memory) */
771   MtxDbl G_Rinv_Gtran_Chol_Scale;
772 
773   /** working memory used to calculate G_Rinv_Gtran_Chol efficiently (so we
774       don't have to constantly allocate/deallocate memory) */
775   MtxDbl G_Rinv_Gtran_Chol_DblWork;
776 
777   /** working memory used to calculate G_Rinv_Gtran_Chol efficiently (so we
778       don't have to constantly allocate/deallocate memory) */
779   MtxInt G_Rinv_Gtran_Chol_IntWork;
780 
781   /// working memory G*R^-1*Y
782   MtxDbl G_Rinv_Y;
783 
784   /// working memory eps=epsilon=Y-G^T*betaHat
785   MtxDbl eps;
786 
787   /** rhs = right hand side, rhs=Rinv*(Y-G^T*betaHat); The convention is
788       that capital matrices are for the data the model is built from, lower
789       case matrices are for arbitrary points to evaluate the model at */
790   MtxDbl rhs;
791 
792   /// need keep around to evaluate adjusted variance
793   double estVarianceMLE;
794 
795   /// the per equation log(likelihood) of the Kriging Model
796   double likelihood;
797 
798   /// part of infrastructure to allow masterObjectivesAndConstraints to just "return" (have copied out) the answer if the same point is used in sequential calls
799   int prevObjDerMode;
800 
801   /// part of infrastructure to allow masterObjectivesAndConstraints to just "return" (have copied out) the answer if the same point is used in sequential calls
802   int prevConDerMode;
803 
804   /// part of infrastructure to allow masterObjectivesAndConstraints to just "return" (have copied out) the answer if the same point is used in sequential calls
805   MtxDbl prevTheta; //(numTheta,1)
806 
807   /// part of infrastructure to allow masterObjectivesAndConstraints to just "return" (have copied out) the answer if the same point is used in sequential calls
808   int maxObjDerMode;
809 
810   /// part of infrastructure to allow masterObjectivesAndConstraints to just "return" (have copied out) the answer if the same point is used in sequential calls
811   int maxConDerMode;
812 
813   /// the objective function for the optimization of correlation lengths, it's the negative "per equation" log likelihood function
814   double obj;
815 
816   /// the vector (a numConFunc by 1 matrix) of constraint functions for the optimization of the correlation lengths, it only needs to be a vector for compatibility with the nkm::OptimizationProblem class, otherwise it could be a double
817   MtxDbl con;
818 };
819 
820 } // end namespace nkm
821 
822 #ifdef SURFPACK_HAVE_BOOST_SERIALIZATION
823 template< class Archive >
serialize(Archive & archive,const unsigned int version)824 void nkm::KrigingModel::serialize(Archive & archive,
825 			  const unsigned int version)
826 {
827 
828   archive & boost::serialization::base_object<nkm::SurfPackModel>(*this);
829   archive & buildDerOrder;
830   archive & nDer;
831   archive & Der;
832   archive & corrFunc;
833   archive & powExpCorrFuncPow;
834   archive & maternCorrFuncNu;
835   archive & aveDistBetweenPts;
836   archive & maxNatLogCorrLen;
837   archive & minNatLogCorrLen;
838   archive & natLogCorrLen;
839   archive & correlations;
840   archive & numVarsr;
841   archive & numTheta;
842   archive & optimizationMethod;
843   archive & ifUserSpecifiedCorrLengths;
844   archive & numStarts;
845   archive & maxTrials;
846   archive & maxTrialsGlobal;
847   archive & maxTrialsLocal;
848   archive & numConFunc;
849   archive & maxCondNum;
850   archive & ifChooseNug;
851   archive & ifAssumeRcondZero;
852   archive & ifPrescribedNug;
853   archive & nug;
854   archive & iPtsKeep;
855   archive & numPoints;
856   archive & numPointsKeep;
857   archive & numWholePointsKeep;
858   archive & numExtraDerKeep;
859   archive & numEqnAvail;
860   archive & numRowsR;
861   archive & ifHaveAnchorPoint;
862   archive & iAnchorPoint;
863   //don't archive XR since it's only a reference into SurfData .xr
864   archive & XRreorder;
865   //archive & Yall; //need this during the construction of a model but not afterward so don't archive
866   archive & Y;
867   //don't archive Gall we need this during the construction of a model but not afterward
868   //don't archive Gtran we need this during the construction of a model but not afterward
869   archive & ifReducedPoly;
870   archive & polyOrderRequested;
871   archive & polyOrder;
872   archive & numTrend;
873   archive & nTrend;
874   //don't archive iTrendKeep, it's not needed because at the discarded terms in the trend basis function are removed from Poly at the end of create()
875   archive & Poly;
876   //don't archive flyPoly, it's work space needed to efficiently evaluate a polynomial basis "on the fly" (it's a member variable only to avoid constant allocation and deallocation)
877   archive & betaHat;
878   //don't archive derivBeta, it's work space needed to efficiently evaluate derivatives of a polynomial (it's a member variable only to avoid constant allocation and deallocation)
879   //don't archive Z, we need it during the construction of a model but not afterward
880   //don't archive Ztran_theta, we need it during the construction of a model but not afterward
881   //don't archive deltaXR, we need it during the construction of a model but not afterward
882   //don't archive R, we need it during the construction of a model but not afterward, once we have ranking of candidate points by pivoteted cholesky we will use it as temporary variable space after the model is constructed but it still won't be something we want to retain
883   //archive & Rinv; //not used, would be used for analytic integral of adjusted variance
884   archive & RChol;
885   //don't archive scaleRChol, we need it during the construction of a model but not afterward
886   //don't archive sumAbsColR, we need it during the construction of a model but not afterward
887   //don't archive oneNormR, we need it during the construction of a model but not afterward
888   //don't archive lapackRcondR, weneed it during the construction of a model but not afterward
889   //don't archive rcondDblWork, we need it during the construction of a model but not afterward
890   //don't archive rcondIntWork, we need it during the construction of a model but not afterward
891   archive & rcondR;
892   archive & rcond_G_Rinv_Gtran;
893   archive & Rinv_Gtran;
894   //don't archive G_Rinv_Gtran, we need it during the construction of a model but not afterward
895   archive & G_Rinv_Gtran_Chol;
896   //don't archive G_Rinv_Gtran_Chol_Scale, we need it during the construction of a model but not afterward
897   //don't archive G_Rinv_Gtran_Chol_DblWork, we need it during the construction of a model but not afterward
898   //don't archive G_Rinv_Gtran_Chol_IntWork, we need it during the construction of a model but not afterward
899   //don't archive G_Rinv_Y, we need it during the construction of a model but not afterward
900   //don't archive eps, we need it during the construction of a model but not afterward
901   archive & rhs;
902   archive & estVarianceMLE;
903   archive & likelihood;
904   //don't archive prevObjDerMode, we need it during the construction of a model but not afterward
905   //don't archive prevConDerMode, we need it during the construction of a model but not afterward
906   //don't archive prevTheta, we need it during the construction of a model but not afterward
907   archive & maxObjDerMode;
908   archive & maxConDerMode;
909   archive & obj;
910   //don't archive con, we need it during the construction of a model but not afterward
911 }
912 #endif
913 
914 #endif
915