1 #ifndef STAN_OPTIMIZATION_BFGS_LINESEARCH_HPP
2 #define STAN_OPTIMIZATION_BFGS_LINESEARCH_HPP
3 
4 #include <algorithm>
5 #include <cmath>
6 #include <cstdlib>
7 #include <string>
8 #include <limits>
9 
10 namespace stan {
11 namespace optimization {
12 /**
13  * Find the minima in an interval [loX, hiX] of a cubic function which
14  * interpolates the points, function values and gradients provided.
15  *
16  * Implicitly, this function constructs an interpolating polynomial
17  *     g(x) = a_3 x^3 + a_2 x^2 + a_1 x + a_0
18  * such that g(0) = 0, g(x1) = f1, g'(0) = df0, g'(x1) = df1 where
19  *     g'(x) = 3 a_3 x^2 + 2 a_2 x + a_1
20  * is the derivative of g(x).  It then computes the roots of g'(x) and
21  * finds the minimal value of g(x) on the interval [loX,hiX] including
22  * the end points.
23  *
24  * This function implements the full parameter version of CubicInterp().
25  *
26  * @param df0 First derivative value, f'(x0)
27  * @param x1 Second point
28  * @param f1 Second function value, f(x1)
29  * @param df1 Second derivative value, f'(x1)
30  * @param loX Lower bound on the interval of solutions
31  * @param hiX Upper bound on the interval of solutions
32  **/
33 template <typename Scalar>
CubicInterp(const Scalar & df0,const Scalar & x1,const Scalar & f1,const Scalar & df1,const Scalar & loX,const Scalar & hiX)34 Scalar CubicInterp(const Scalar &df0, const Scalar &x1, const Scalar &f1,
35                    const Scalar &df1, const Scalar &loX, const Scalar &hiX) {
36   const Scalar c3((-12 * f1 + 6 * x1 * (df0 + df1)) / (x1 * x1 * x1));
37   const Scalar c2(-(4 * df0 + 2 * df1) / x1 + 6 * f1 / (x1 * x1));
38   const Scalar &c1(df0);
39 
40   const Scalar t_s = std::sqrt(c2 * c2 - 2.0 * c1 * c3);
41   const Scalar s1 = -(c2 + t_s) / c3;
42   const Scalar s2 = -(c2 - t_s) / c3;
43 
44   Scalar tmpF;
45   Scalar minF, minX;
46 
47   // Check value at lower bound
48   minF = loX * (loX * (loX * c3 / 3.0 + c2) / 2.0 + c1);
49   minX = loX;
50 
51   // Check value at upper bound
52   tmpF = hiX * (hiX * (hiX * c3 / 3.0 + c2) / 2.0 + c1);
53   if (tmpF < minF) {
54     minF = tmpF;
55     minX = hiX;
56   }
57 
58   // Check value of first root
59   if (loX < s1 && s1 < hiX) {
60     tmpF = s1 * (s1 * (s1 * c3 / 3.0 + c2) / 2.0 + c1);
61     if (tmpF < minF) {
62       minF = tmpF;
63       minX = s1;
64     }
65   }
66 
67   // Check value of second root
68   if (loX < s2 && s2 < hiX) {
69     tmpF = s2 * (s2 * (s2 * c3 / 3.0 + c2) / 2.0 + c1);
70     if (tmpF < minF) {
71       minF = tmpF;
72       minX = s2;
73     }
74   }
75 
76   return minX;
77 }
78 
79 /**
80  * Find the minima in an interval [loX, hiX] of a cubic function which
81  * interpolates the points, function values and gradients provided.
82  *
83  * Implicitly, this function constructs an interpolating polynomial
84  *     g(x) = a_3 x^3 + a_2 x^2 + a_1 x + a_0
85  * such that g(x0) = f0, g(x1) = f1, g'(x0) = df0, g'(x1) = df1 where
86  *     g'(x) = 3 a_3 x^2 + 2 a_2 x + a_1
87  * is the derivative of g(x).  It then computes the roots of g'(x) and
88  * finds the minimal value of g(x) on the interval [loX,hiX] including
89  * the end points.
90  *
91  * @param x0 First point
92  * @param f0 First function value, f(x0)
93  * @param df0 First derivative value, f'(x0)
94  * @param x1 Second point
95  * @param f1 Second function value, f(x1)
96  * @param df1 Second derivative value, f'(x1)
97  * @param loX Lower bound on the interval of solutions
98  * @param hiX Upper bound on the interval of solutions
99  **/
100 template <typename Scalar>
CubicInterp(const Scalar & x0,const Scalar & f0,const Scalar & df0,const Scalar & x1,const Scalar & f1,const Scalar & df1,const Scalar & loX,const Scalar & hiX)101 Scalar CubicInterp(const Scalar &x0, const Scalar &f0, const Scalar &df0,
102                    const Scalar &x1, const Scalar &f1, const Scalar &df1,
103                    const Scalar &loX, const Scalar &hiX) {
104   return x0 + CubicInterp(df0, x1 - x0, f1 - f0, df1, loX - x0, hiX - x0);
105 }
106 
107 /**
108  * An internal utility function for implementing WolfeLineSearch()
109  **/
110 template <typename FunctorType, typename Scalar, typename XType>
WolfLSZoom(Scalar & alpha,XType & newX,Scalar & newF,XType & newDF,FunctorType & func,const XType & x,const Scalar & f,const Scalar & dfp,const Scalar & c1dfp,const Scalar & c2dfp,const XType & p,Scalar alo,Scalar aloF,Scalar aloDFp,Scalar ahi,Scalar ahiF,Scalar ahiDFp,const Scalar & min_range)111 int WolfLSZoom(Scalar &alpha, XType &newX, Scalar &newF, XType &newDF,
112                FunctorType &func, const XType &x, const Scalar &f,
113                const Scalar &dfp, const Scalar &c1dfp, const Scalar &c2dfp,
114                const XType &p, Scalar alo, Scalar aloF, Scalar aloDFp,
115                Scalar ahi, Scalar ahiF, Scalar ahiDFp,
116                const Scalar &min_range) {
117   Scalar d1, d2, newDFp;
118   int itNum(0);
119 
120   while (1) {
121     itNum++;
122 
123     if (std::fabs(alo - ahi) < min_range)
124       return 1;
125 
126     if (itNum % 5 == 0) {
127       alpha = 0.5 * (alo + ahi);
128     } else {
129       // Perform cubic interpolation to determine next point to try
130       d1 = aloDFp + ahiDFp - 3 * (aloF - ahiF) / (alo - ahi);
131       d2 = std::sqrt(d1 * d1 - aloDFp * ahiDFp);
132       if (ahi < alo)
133         d2 = -d2;
134       alpha
135           = ahi - (ahi - alo) * (ahiDFp + d2 - d1) / (ahiDFp - aloDFp + 2 * d2);
136       if (!std::isfinite(alpha)
137           || alpha < std::min(alo, ahi) + 0.01 * std::fabs(alo - ahi)
138           || alpha > std::max(alo, ahi) - 0.01 * std::fabs(alo - ahi))
139         alpha = 0.5 * (alo + ahi);
140     }
141 
142     newX = x + alpha * p;
143     while (func(newX, newF, newDF)) {
144       alpha = 0.5 * (alpha + std::min(alo, ahi));
145       if (std::fabs(std::min(alo, ahi) - alpha) < min_range)
146         return 1;
147       newX = x + alpha * p;
148     }
149     newDFp = newDF.dot(p);
150     if (newF > (f + alpha * c1dfp) || newF >= aloF) {
151       ahi = alpha;
152       ahiF = newF;
153       ahiDFp = newDFp;
154     } else {
155       if (std::fabs(newDFp) <= -c2dfp)
156         break;
157       if (newDFp * (ahi - alo) >= 0) {
158         ahi = alo;
159         ahiF = aloF;
160         ahiDFp = aloDFp;
161       }
162       alo = alpha;
163       aloF = newF;
164       aloDFp = newDFp;
165     }
166   }
167   return 0;
168 }
169 
170 /**
171  * Perform a line search which finds an approximate solution to:
172  * \f[
173  *       \min_\alpha f(x_0 + \alpha p)
174  * \f]
175  * satisfying the strong Wolfe conditions:
176  *  1) \f$ f(x_0 + \alpha p) \leq f(x_0) + c_1 \alpha p^T g(x_0) \f$
177  *  2) \f$ \vert p^T g(x_0 + \alpha p) \vert \leq c_2 \vert p^T g(x_0) \vert \f$
178  * where \f$g(x) = \frac{\partial f}{\partial x}\f$ is the gradient of f(x).
179  *
180  * @tparam FunctorType A type which supports being called as
181  *        ret = func(x,f,g)
182  * where x is the input point, f and g are the function value and
183  * gradient at x and ret is non-zero if function evaluation fails.
184  *
185  * @param func Function which is being minimized.
186  *
187  * @param alpha First value of \f$ \alpha \f$ to try.  Upon return this
188  * contains the final value of the \f$ \alpha \f$.
189  *
190  * @param x1 Final point, equal to \f$ x_0 + \alpha p \f$.
191  *
192  * @param f1 Final point function value, equal to \f$ f(x_0 + \alpha p) \f$.
193  *
194  * @param gradx1 Final point gradient, equal to \f$ g(x_0 + \alpha p) \f$.
195  *
196  * @param p Search direction.  It is assumed to be a descent direction such
197  * that \f$ p^T g(x_0) < 0 \f$.
198  *
199  * @param x0 Value of starting point, \f$ x_0 \f$.
200  *
201  * @param f0 Value of function at starting point, \f$ f(x_0) \f$.
202  *
203  * @param gradx0 Value of function gradient at starting point,
204  *    \f$ g(x_0) \f$.
205  *
206  * @param c1 Parameter of the Wolfe conditions. \f$ 0 < c_1 < c_2 < 1 \f$
207  * Typically c1 = 1e-4.
208  *
209  * @param c2 Parameter of the Wolfe conditions. \f$ 0 < c_1 < c_2 < 1 \f$
210  * Typically c2 = 0.9.
211  *
212  * @param minAlpha Smallest allowable step-size.
213  *
214  * @param maxLSIts Maximum number line search iterations.
215  *
216  * @param maxLSRestarts Maximum number of times line search will
217  * restart with \f$ f() \f$ failing.
218  *
219  * @return Returns zero on success, non-zero otherwise.
220  **/
221 template <typename FunctorType, typename Scalar, typename XType>
WolfeLineSearch(FunctorType & func,Scalar & alpha,XType & x1,Scalar & f1,XType & gradx1,const XType & p,const XType & x0,const Scalar & f0,const XType & gradx0,const Scalar & c1,const Scalar & c2,const Scalar & minAlpha,const Scalar & maxLSIts,const Scalar & maxLSRestarts)222 int WolfeLineSearch(FunctorType &func, Scalar &alpha, XType &x1, Scalar &f1,
223                     XType &gradx1, const XType &p, const XType &x0,
224                     const Scalar &f0, const XType &gradx0, const Scalar &c1,
225                     const Scalar &c2, const Scalar &minAlpha,
226                     const Scalar &maxLSIts, const Scalar &maxLSRestarts) {
227   const Scalar dfp(gradx0.dot(p));
228   const Scalar c1dfp(c1 * dfp);
229   const Scalar c2dfp(c2 * dfp);
230 
231   Scalar alpha0(minAlpha);
232   Scalar alpha1(alpha);
233 
234   Scalar prevF(f0);
235   XType prevDF(gradx0);
236   Scalar prevDFp(dfp);
237   Scalar newDFp;
238 
239   int retCode = 0, nits = 0, lsRestarts = 0, ret;
240 
241   while (1) {
242     if (nits >= maxLSIts) {
243       retCode = 1;
244       break;
245     }
246 
247     x1.noalias() = x0 + alpha1 * p;
248     ret = func(x1, f1, gradx1);
249     if (ret != 0) {
250       if (lsRestarts >= maxLSRestarts) {
251         retCode = 1;
252         break;
253       }
254 
255       alpha1 = 0.5 * (alpha0 + alpha1);
256       lsRestarts++;
257       continue;
258     }
259     lsRestarts = 0;
260 
261     newDFp = gradx1.dot(p);
262     if ((f1 > f0 + alpha * c1dfp) || (f1 >= prevF && nits > 0)) {
263       retCode
264           = WolfLSZoom(alpha, x1, f1, gradx1, func, x0, f0, dfp, c1dfp, c2dfp,
265                        p, alpha0, prevF, prevDFp, alpha1, f1, newDFp, 1e-16);
266       break;
267     }
268     if (std::fabs(newDFp) <= -c2dfp) {
269       alpha = alpha1;
270       break;
271     }
272     if (newDFp >= 0) {
273       retCode
274           = WolfLSZoom(alpha, x1, f1, gradx1, func, x0, f0, dfp, c1dfp, c2dfp,
275                        p, alpha1, f1, newDFp, alpha0, prevF, prevDFp, 1e-16);
276       break;
277     }
278 
279     alpha0 = alpha1;
280     prevF = f1;
281     std::swap(prevDF, gradx1);
282     prevDFp = newDFp;
283 
284     alpha1 *= 10.0;
285 
286     nits++;
287   }
288   return retCode;
289 }
290 }  // namespace optimization
291 }  // namespace stan
292 
293 #endif
294