/*
  This file is part of MADNESS.

  Copyright (C) 2007,2010 Oak Ridge National Laboratory

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

  For more information please contact:

  Robert J. Harrison
  Oak Ridge National Laboratory
  One Bethel Valley Road
  P.O. Box 2008, MS-6367

  email: harrisonrj@ornl.gov
  tel:   865-241-3937
  fax:   865-572-0680

  $Id$
*/
#include <madness/tensor/solvers.h>

namespace madness {


    double OptimizationTargetInterface::test_gradient(Tensor<double>& x, double value_precision, bool doprint) {
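        // For a central finite difference the truncation error grows like
        // eps^2 while the round-off error grows like value_precision/eps,
        // so the cube-root scaling below roughly balances the two.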
        const double eps = pow(value_precision,0.3333);
        if (doprint) {
            printf("\n");
            printf("Testing gradient eps=%.1e\n----------------\n", eps);
            printf("  i            f-                 f+               ganalytic           gnumeric         err\n");
            printf(" ----  ------------------  ------------------  ------------------  ------------------  -------\n");
        }
        Tensor<double> tt = gradient(x);
        int n = int(tt.dim(0));
        double maxerr = 0.0;
        for (int i=0; i<n; ++i) {
            x[i] += eps;
            double fp = value(x);
            x[i] -= 2.0*eps;
            double fm = value(x);
            x[i] += eps;

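            // Second-order central difference for the i'th gradient
            // component: (f(x+eps*e_i) - f(x-eps*e_i)) / (2*eps).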
            double gg = 0.5*(fp-fm)/eps;
            if (doprint)
                printf("% 5d%20.12e%20.12e%20.12e%20.12e  %.1e\n", i, fm, fp, tt(i), gg, std::abs(tt(i)-gg));
            maxerr = std::max(std::abs(gg-tt(i)),maxerr);
        }
        if (doprint) printf("\n");

        return maxerr;
    }


    SteepestDescent::SteepestDescent(const std::shared_ptr<OptimizationTargetInterface>& tar,
                                     double tol,
                                     double value_precision,
                                     double gradient_precision)
        : target(tar)
        , tol(tol)
        , f(tol*1e16)
        , gnorm(tol*1e16)
    {
        if (!target->provides_gradient()) throw "Steepest descent requires the gradient";
    }

    bool SteepestDescent::optimize(Tensor<double>& x) {
        double step = 10.0;
        double fnew;
        Tensor<double> g;
        target->value_and_gradient(x,f,g);
        gnorm = g.normf();
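        // Steepest descent with backtracking: take x <- x - step*g and
        // halve the step whenever the value fails to decrease.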
        for (int i=0; i<100; ++i) {
            while (1) {
                Tensor<double> gnew;
                x.gaxpy(1.0, g, -step);
                target->value_and_gradient(x,fnew,gnew);
                if (fnew < f) {
                    f = fnew;
                    g = gnew;
                    break;
                }
                x.gaxpy(1.0, g, step);
                step *= 0.5;
                print("reducing step size", f, fnew, step);
            }
            // g already holds the gradient at the accepted point (set from
            // gnew above), so avoid shadowing it with a redundant
            // gradient evaluation.
            gnorm = g.normf();
            print("iteration",i,"value",f,"gradient",gnorm);
            if (converged()) break;
        }
        return converged();
    }

    bool SteepestDescent::converged() const {return gnorm < tol;}

    double SteepestDescent::gradient_norm() const {return gnorm;}

    double SteepestDescent::value() const {return f;}

    double QuasiNewton::line_search(double a1, double f0, double dxgrad,
            const Tensor<double>& x, const Tensor<double>& dx,
            std::shared_ptr<OptimizationTargetInterface> target1,
            double value_precision) {
        double f1, f2p;
        double hess, a2;
        const char* lsmode = "";

        if (dxgrad*a1 > 0.0) {
            print("    line search gradient +ve ", a1, dxgrad);
            a1 = -a1;
        }

        f1 = target1->value(x + a1 * dx);

        // Fit to a parabola using f0, g0, f1
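        // Quadratic model along dx: f(a) ~ f0 + dxgrad*a + 0.5*hess*a^2.
        // Matching f(a1) = f1 gives the curvature estimate below, and the
        // model minimum lies at a2 = -dxgrad/hess.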
        hess = 2.0*(f1-f0-a1*dxgrad)/(a1*a1);
        a2 = -dxgrad/hess;

        if (std::abs(f1-f0) < value_precision) { // Insufficient precision
            a2 = a1;
            lsmode = "fixed";
        }
        else if (hess > 0.0) { // Positive curvature
            if ((f1 - f0) <= -value_precision) { // Downhill
                lsmode = "downhill";
                if (std::abs(a2) > 4.0*std::abs(a1)) {
                    lsmode = "restrict";
                    a2 = 4.0*a1;
                }
            }
            else { // Uphill
                lsmode = "bracket";
            }
        }
        else { // Negative curvature
            if ((f1 - f0) < value_precision) { // Downhill
                lsmode = "negative";
                a2 = 2.0*a1;
            }
            else {
                lsmode = "punt";
                a2 = a1;
            }
        }

        f2p = f0 + dxgrad*a2 + 0.5*hess*a2*a2;
        printf("   line search grad=%.2e hess=%.2e mode=%s newstep=%.3f\n", dxgrad, hess, lsmode, a2);
        printf("                      predicted %.12e\n", f2p);

        return a2;
    }

    void QuasiNewton::hessian_update_sr1(const Tensor<double>& s,
            const Tensor<double>& y, Tensor<double>& hessian) {
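        // Symmetric rank-1 (SR1) update: H <- H + q q^T / (q.s) with
        // q = y - H s; skipped when |q.s| is tiny relative to |q||s| to
        // avoid an ill-conditioned update.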
        Tensor<double> q = y - inner(hessian,s);
        double qds = q.trace(s);
        if (std::abs(qds) > 1e-8 * s.normf() * q.normf()) {
            hessian += outer(q,q).scale(1.0/qds);
        }
        else {
            printf("   SR1 not updating\n");
        }
    }


    void QuasiNewton::hessian_update_bfgs(const Tensor<double>& dx,
                const Tensor<double>& dg, Tensor<double>& hessian) {
        /*
          Apply the BFGS update to the approximate Hessian h[][].

          h[][] = Hessian matrix from previous iteration
          dx[]  = Step from previous iteration
          .       (dx[] = x[] - xp[] where xp[] is the previous point)
          dg[]  = gradient difference (dg = g - gp)
        */
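        // BFGS update applied below:
        //   H <- H + (dg dg^T)/(dx.dg) - (H dx)(H dx)^T/(dx.H.dx),
        // skipped when dx.dg is too small relative to |dx||dg|.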

        Tensor<double> hdx  = inner(hessian,dx);

        double dxhdx = dx.trace(hdx);
        double dxdx  = dx.trace(dx);
        double dxdg  = dx.trace(dg);
        double dgdg  = dg.trace(dg);

        const int n=hessian.dim(0);
        if ( (dxdx > 0.0) && (dgdg > 0.0) && (std::abs(dxdg/std::sqrt(dxdx*dgdg)) > 1.e-8) ) {
            for (int i=0; i<n; ++i) {
                for (int j=0; j<n; ++j) {
                    hessian(i,j) += dg[i]*dg[j]/dxdg - hdx[i]*hdx[j]/dxhdx;
                }
            }
        }
        else {
            printf("   BFGS not updating dxdg (%e), dgdg (%e), dxhdx (%e), dxdx (%e)\n", dxdg, dgdg, dxhdx, dxdx);
        }
    }

    Tensor<double> QuasiNewton::new_search_direction(const Tensor<double>& g) const {
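        // Newton-like step in the eigenbasis of the approximate Hessian h:
        // with h = V diag(e) V^T the step is dx = -V diag(1/e) V^T g, after
        // forcing negative or small eigenvalues to be positive and
        // restricting each spectral component to the trust radius.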
        double tol = gradient_precision;
        double trust = 1.0; // This is applied in the spectral basis

        Tensor<double> v, e;
        syev(h, v, e);

        // Transform gradient into spectral basis
        Tensor<double> gv = inner(g,v);

        // Take step applying restriction
        int nneg=0, nsmall=0, nrestrict=0;
        for (int i=0; i<n; ++i) {
            if (e[i] < -tol) {
                if (printtest) printf("   forcing negative eigenvalue to be positive %d %.1e\n", i, e[i]);
                nneg++;
                //e[i] = -2.0*e[i]; // Enforce positive search direction
                e[i] = -0.1*e[i]; // Enforce positive search direction
            }
            else if (e[i] < tol) {
                if (printtest) printf("   forcing small eigenvalue to be positive %d %.1e\n", i, e[i]);
                nsmall++;
                e[i] = tol;
            }

            gv[i] = -gv[i] / e[i];
            if (std::abs(gv[i]) > trust) { // Step restriction
                double gvnew = trust*std::abs(gv[i])/gv[i];
                if (printtest) printf("   restricting step in spectral direction %d %.1e --> %.1e\n", i, gv[i], gvnew);
                nrestrict++;
                gv[i] = gvnew;
            }
        }
        if (nneg || nsmall || nrestrict) printf("   nneg=%d nsmall=%d nrestrict=%d\n", nneg, nsmall, nrestrict);

        // Transform back from spectral basis
        return inner(v,gv);
    }

    QuasiNewton::QuasiNewton(const std::shared_ptr<OptimizationTargetInterface>& tar,
                             int maxiter,
                             double tol,
                             double value_precision,
                             double gradient_precision)
        : update("BFGS")
        , target(tar)
        , maxiter(maxiter)
        , tol(tol)
        , value_precision(value_precision)
        , gradient_precision(gradient_precision)
        , f(tol*1e16)
        , gnorm(tol*1e16)
        , n(0)
        , printtest(false)
    {
        if (!target->provides_gradient()) throw "QuasiNewton requires the gradient";
    }

    void QuasiNewton::set_update(const std::string& method) {
        if (method == "BFGS" || method == "SR1") update=method;
        else throw "QuasiNewton: unknown update method";
    }

    void QuasiNewton::set_test(const bool& test_level) {
        printtest = test_level;
    }

    bool QuasiNewton::optimize(Tensor<double>& x) {
//        int maxiter = 20; // !!!!!!!!! dumb
        if (n != x.dim(0)) {
            n = x.dim(0);
            h = Tensor<double>();
        }

        if (printtest) target->test_gradient(x, value_precision);

        bool h_is_identity = (h.size() == 0);
        if (h_is_identity) {
            h = Tensor<double>(n,n);
            for (int i=0; i<n; ++i) h(i,i) = 1.0;
        }

        // the previous gradient
        Tensor<double> gp;
        // the displacement
        Tensor<double> dx;

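        // Main quasi-Newton loop: evaluate f and g, update the Hessian
        // approximation from the previous step (BFGS or SR1), compute a
        // restricted Newton-like search direction, line-search along it,
        // and take the step.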
        for (int iter=0; iter<maxiter; ++iter) {
            Tensor<double> g;
            target->value_and_gradient(x, f, g);
            gnorm = g.normf();
            printf(" QuasiNewton iteration %2d value %.12e gradient %.2e\n",iter,f,gnorm);
            if (converged()) break;

            if (iter == 1 && h_is_identity) {
                // The default initial Hessian is the identity; rescale it
                // using the first step and gradient change.  A Hessian
                // approximation carried over from a previous call is left
                // untouched.
                h.scale(g.trace(gp)/gp.trace(dx));
            }

            if (iter > 0) {
                if (update == "BFGS") hessian_update_bfgs(dx, g-gp, h);
                else hessian_update_sr1(dx, g-gp, h);
            }

            dx = new_search_direction(g);

            double step = line_search(1.0, f, dx.trace(g), x, dx, target,
                    value_precision);

            dx.scale(step);
            x += dx;
            gp = g;
        }

        if (printtest) {
            print("final hessian");
            print(h);
        }
        return converged();
    }

    bool QuasiNewton::converged() const {return gnorm < tol;}

    double QuasiNewton::value() const {return f;}

    double QuasiNewton::gradient_norm() const {return gnorm;}
}