/*
  This file is part of MADNESS.

  Copyright (C) 2007,2010 Oak Ridge National Laboratory

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

  For more information please contact:

  Robert J. Harrison
  Oak Ridge National Laboratory
  One Bethel Valley Road
  P.O. Box 2008, MS-6367

  email: harrisonrj@ornl.gov
  tel:   865-241-3937
  fax:   865-572-0680

  $Id$
*/
#include <madness/tensor/solvers.h>

#include <algorithm>
#include <cmath>
#include <cstdio>

namespace madness {

    // Compare the analytic gradient against central finite differences of
    // the value, returning the largest absolute discrepancy found.
    double OptimizationTargetInterface::test_gradient(Tensor<double>& x, double value_precision, bool doprint) {
        const double eps = pow(value_precision,0.3333); // ~cube root of the value precision: near-optimal central-difference step
        if (doprint) {
            printf("\n");
            printf("Testing gradient eps=%.1e\n----------------\n", eps);
            printf("     i               f-                  f+             ganalytic            gnumeric      err\n");
            printf("  ----  ------------------  ------------------  ------------------  ------------------  -------\n");
        }
        Tensor<double> tt = gradient(x);
        int n = int(tt.dim(0));
        double maxerr = 0.0;
        for (int i=0; i<n; ++i) {
            x[i] += eps;
            double fp = value(x);
            x[i] -= 2.0*eps;
            double fm = value(x);
            x[i] += eps; // restore the original coordinate

            double gg = 0.5*(fp-fm)/eps;
            if (doprint)
                printf("% 5d%20.12e%20.12e%20.12e%20.12e  %.1e\n", i, fm, fp, tt(i), gg, std::abs(tt(i)-gg));
            maxerr = std::max(std::abs(gg-tt(i)),maxerr);
        }
        if (doprint) printf("\n");

        return maxerr;
    }

    SteepestDescent::SteepestDescent(const std::shared_ptr<OptimizationTargetInterface>& tar,
                                     double tol,
                                     double value_precision,
                                     double gradient_precision)
        : target(tar)
        , tol(tol)
        , f(tol*1e16)
        , gnorm(tol*1e16)
    {
        if (!target->provides_gradient()) throw "Steepest descent requires the gradient";
    }

    bool SteepestDescent::optimize(Tensor<double>& x) {
        double step = 10.0;
        double fnew;
        Tensor<double> g;
        target->value_and_gradient(x,f,g);
        gnorm = g.normf();
        for (int i=0; i<100; ++i) { // at most 100 iterations
            // Back-track the step length until the value decreases
            while (1) {
                Tensor<double> gnew;
                x.gaxpy(1.0, g, -step); // trial step x <- x - step*g
                target->value_and_gradient(x,fnew,gnew);
                if (fnew < f) {
                    f = fnew;
                    g = gnew;
                    break;
                }
                x.gaxpy(1.0, g, step); // undo the rejected step
                step *= 0.5;
                print("reducing step size", f, fnew, step);
            }
            // The gradient at the accepted point was already computed above
            gnorm = g.normf();
            print("iteration",i,"value",f,"gradient",gnorm);
            if (converged()) break;
        }
        return converged();
    }

    bool SteepestDescent::converged() const {return gnorm < tol;}

    double SteepestDescent::gradient_norm() const {return gnorm;}

    double SteepestDescent::value() const {return f;}
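
    // The line search below fits a parabola through the data it has in hand:
    // with f(0) = f0, f'(0) = dxgrad, and f(a1) = f1, the quadratic model
    //
    //     f(a) ~= f0 + dxgrad*a + 0.5*hess*a^2
    //
    // gives hess = 2*(f1 - f0 - a1*dxgrad)/(a1*a1) and is minimized (for
    // hess > 0) at a2 = -dxgrad/hess. That stationary point is the step
    // returned, after the restrictions annotated by lsmode are applied.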
    double QuasiNewton::line_search(double a1, double f0, double dxgrad,
                                    const Tensor<double>& x, const Tensor<double>& dx,
                                    std::shared_ptr<OptimizationTargetInterface> target1,
                                    double value_precision) {
        double f1, f2p;
        double hess, a2;
        const char* lsmode = "";

        if (dxgrad*a1 > 0.0) {
            print(" line search gradient +ve ", a1, dxgrad);
            a1 = -a1; // ensure the trial step is downhill
        }

        f1 = target1->value(x + a1 * dx);

        // Fit a parabola using f0, dxgrad, and f1
        hess = 2.0*(f1-f0-a1*dxgrad)/(a1*a1);
        a2 = -dxgrad/hess;

        if (std::abs(f1-f0) < value_precision) { // Insufficient precision
            a2 = a1;
            lsmode = "fixed";
        }
        else if (hess > 0.0) { // Positive curvature
            if ((f1 - f0) <= -value_precision) { // Downhill
                lsmode = "downhill";
                if (std::abs(a2) > 4.0*std::abs(a1)) {
                    lsmode = "restrict";
                    a2 = 4.0*a1;
                }
            }
            else { // Uphill
                lsmode = "bracket";
            }
        }
        else { // Negative curvature
            if ((f1 - f0) < value_precision) { // Downhill
                lsmode = "negative";
                a2 = 2.0*a1;
            }
            else {
                lsmode = "punt";
                a2 = a1;
            }
        }

        f2p = f0 + dxgrad*a2 + 0.5*hess*a2*a2; // value predicted by the model
        printf("   line search grad=%.2e hess=%.2e mode=%s newstep=%.3f\n", dxgrad, hess, lsmode, a2);
        printf("                  predicted %.12e\n", f2p);

        return a2;
    }

    void QuasiNewton::hessian_update_sr1(const Tensor<double>& s,
                                         const Tensor<double>& y, Tensor<double>& hessian) {
        Tensor<double> q = y - inner(hessian,s);
        double qds = q.trace(s);
        // Skip the rank-one update when the denominator is too small to be reliable
        if (std::abs(qds) > 1e-8 * s.normf() * q.normf()) {
            hessian += outer(q,q).scale(1.0/qds);
        }
        else {
            printf("   SR1 not updating\n");
        }
    }
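
    // The BFGS update applied below is the standard rank-two correction
    //
    //     H <- H + (dg dg^T)/(dg.dx) - (H dx)(H dx)^T/(dx.H.dx)
    //
    // which keeps H symmetric and, provided the curvature condition
    // dg.dx > 0 holds, preserves positive definiteness.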
    void QuasiNewton::hessian_update_bfgs(const Tensor<double>& dx,
                                          const Tensor<double>& dg, Tensor<double>& hessian) {
        /*
          Apply the BFGS update to the approximate Hessian h[][].

          h[][] = Hessian matrix from previous iteration
          dx[]  = Step from previous iteration
                  (dx[] = x[] - xp[] where xp[] is the previous point)
          dg[]  = gradient difference (dg = g - gp)
        */

        Tensor<double> hdx = inner(hessian,dx);

        double dxhdx = dx.trace(hdx);
        double dxdx  = dx.trace(dx);
        double dxdg  = dx.trace(dg);
        double dgdg  = dg.trace(dg);

        const int n = hessian.dim(0);
        // Update only if the curvature condition is satisfied to reasonable precision
        if ( (dxdx > 0.0) && (dgdg > 0.0) && (std::abs(dxdg/std::sqrt(dxdx*dgdg)) > 1e-8) ) {
            for (int i=0; i<n; ++i) {
                for (int j=0; j<n; ++j) {
                    hessian(i,j) += dg[i]*dg[j]/dxdg - hdx[i]*hdx[j]/dxhdx;
                }
            }
        }
        else {
            printf("   BFGS not updating dxdg (%e), dgdg (%e), dxhdx (%e), dxdx (%e)\n", dxdg, dgdg, dxhdx, dxdx);
        }
    }

    Tensor<double> QuasiNewton::new_search_direction(const Tensor<double>& g) const {
        double tol = gradient_precision;
        double trust = 1.0; // This is applied in the spectral basis

        // Diagonalize the approximate Hessian
        Tensor<double> v, e;
        syev(h, v, e);

        // Transform the gradient into the spectral basis
        Tensor<double> gv = inner(g,v);

        // Take the Newton step, applying restrictions
        int nneg=0, nsmall=0, nrestrict=0;
        for (int i=0; i<n; ++i) {
            if (e[i] < -tol) {
                if (printtest) printf("   forcing negative eigenvalue to be positive %d %.1e\n", i, e[i]);
                nneg++;
                //e[i] = -2.0*e[i]; // Enforce positive search direction
                e[i] = -0.1*e[i]; // Enforce positive search direction
            }
            else if (e[i] < tol) {
                if (printtest) printf("   forcing small eigenvalue to be positive %d %.1e\n", i, e[i]);
                nsmall++;
                e[i] = tol;
            }

            gv[i] = -gv[i] / e[i];
            if (std::abs(gv[i]) > trust) { // Step restriction
                double gvnew = trust*std::abs(gv[i])/gv[i]; // keep the sign, cap the magnitude
                if (printtest) printf("   restricting step in spectral direction %d %.1e --> %.1e\n", i, gv[i], gvnew);
                nrestrict++;
                gv[i] = gvnew;
            }
        }
        if (nneg || nsmall || nrestrict) printf("   nneg=%d nsmall=%d nrestrict=%d\n", nneg, nsmall, nrestrict);

        // Transform back from the spectral basis
        return inner(v,gv);
    }

    QuasiNewton::QuasiNewton(const std::shared_ptr<OptimizationTargetInterface>& tar,
                             int maxiter,
                             double tol,
                             double value_precision,
                             double gradient_precision)
        : update("BFGS")
        , target(tar)
        , maxiter(maxiter)
        , tol(tol)
        , value_precision(value_precision)
        , gradient_precision(gradient_precision)
        , f(tol*1e16)
        , gnorm(tol*1e16)
        , n(0)
        , printtest(false)
    {
        if (!target->provides_gradient()) throw "QuasiNewton requires the gradient";
    }

    void QuasiNewton::set_update(const std::string& method) {
        if (method == "BFGS" || method == "SR1") update = method;
        else throw "QuasiNewton: unknown update method";
    }

    void QuasiNewton::set_test(const bool& test_level) {
        printtest = test_level;
    }
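
    // Usage sketch (illustrative only; the Quadratic target and the tolerance
    // values below are hypothetical, not part of this file). A caller
    // subclasses OptimizationTargetInterface and hands it to the solver:
    //
    //     struct Quadratic : public OptimizationTargetInterface {
    //         bool provides_gradient() const {return true;}
    //         double value(const Tensor<double>& x) {return 0.5*x.trace(x);}
    //         Tensor<double> gradient(const Tensor<double>& x) {return copy(x);}
    //     };
    //
    //     Tensor<double> x(10L);
    //     x.fillrandom();
    //     QuasiNewton solver(std::shared_ptr<OptimizationTargetInterface>(new Quadratic()),
    //                        20, 1e-6, 1e-12, 1e-8);
    //     solver.optimize(x); // returns true if gnorm < tol on exit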
    bool QuasiNewton::optimize(Tensor<double>& x) {
        // Reset the Hessian if the problem dimension has changed
        if (n != x.dim(0)) {
            n = x.dim(0);
            h = Tensor<double>();
        }

        if (printtest) target->test_gradient(x, value_precision);

        bool h_is_identity = (h.size() == 0);
        if (h_is_identity) {
            h = Tensor<double>(n,n);
            for (int i=0; i<n; ++i) h(i,i) = 1.0;
        }

        // the previous gradient
        Tensor<double> gp;
        // the displacement
        Tensor<double> dx;

        for (int iter=0; iter<maxiter; ++iter) {
            Tensor<double> g;
            target->value_and_gradient(x, f, g);
            gnorm = g.normf();
            printf(" QuasiNewton iteration %2d value %.12e gradient %.2e\n",iter,f,gnorm);
            if (converged()) break;

            if (iter == 1 && h_is_identity) {
                // Default initial Hessian is scaled identity but
                // prefer to reuse any existing approximation.
                // Rescale using the curvature observed along the first
                // step, (dg.dx)/(dx.dx).
                h.scale(dx.trace(g-gp)/dx.trace(dx));
            }

            if (iter > 0) {
                if (update == "BFGS") hessian_update_bfgs(dx, g-gp, h);
                else hessian_update_sr1(dx, g-gp, h);
            }

            dx = new_search_direction(g);

            double step = line_search(1.0, f, dx.trace(g), x, dx, target,
                                      value_precision);

            dx.scale(step);
            x += dx;
            gp = g;
        }

        if (printtest) {
            print("final hessian");
            print(h);
        }
        return converged();
    }

    bool QuasiNewton::converged() const {return gnorm < tol;}

    double QuasiNewton::value() const {return f;}

    double QuasiNewton::gradient_norm() const {return gnorm;}
} // namespace madness