1 #ifndef STAN_OPTIMIZATION_BFGS_LINESEARCH_HPP
2 #define STAN_OPTIMIZATION_BFGS_LINESEARCH_HPP
3
4 #include <algorithm>
5 #include <cmath>
6 #include <cstdlib>
7 #include <string>
8 #include <limits>
9
10 namespace stan {
11 namespace optimization {
12 /**
13 * Find the minima in an interval [loX, hiX] of a cubic function which
14 * interpolates the points, function values and gradients provided.
15 *
16 * Implicitly, this function constructs an interpolating polynomial
17 * g(x) = a_3 x^3 + a_2 x^2 + a_1 x + a_0
18 * such that g(0) = 0, g(x1) = f1, g'(0) = df0, g'(x1) = df1 where
19 * g'(x) = 3 a_3 x^2 + 2 a_2 x + a_1
20 * is the derivative of g(x). It then computes the roots of g'(x) and
21 * finds the minimal value of g(x) on the interval [loX,hiX] including
22 * the end points.
23 *
24 * This function implements the full parameter version of CubicInterp().
25 *
26 * @param df0 First derivative value, f'(x0)
27 * @param x1 Second point
28 * @param f1 Second function value, f(x1)
29 * @param df1 Second derivative value, f'(x1)
30 * @param loX Lower bound on the interval of solutions
31 * @param hiX Upper bound on the interval of solutions
32 **/
33 template <typename Scalar>
CubicInterp(const Scalar & df0,const Scalar & x1,const Scalar & f1,const Scalar & df1,const Scalar & loX,const Scalar & hiX)34 Scalar CubicInterp(const Scalar &df0, const Scalar &x1, const Scalar &f1,
35 const Scalar &df1, const Scalar &loX, const Scalar &hiX) {
36 const Scalar c3((-12 * f1 + 6 * x1 * (df0 + df1)) / (x1 * x1 * x1));
37 const Scalar c2(-(4 * df0 + 2 * df1) / x1 + 6 * f1 / (x1 * x1));
38 const Scalar &c1(df0);
39
40 const Scalar t_s = std::sqrt(c2 * c2 - 2.0 * c1 * c3);
41 const Scalar s1 = -(c2 + t_s) / c3;
42 const Scalar s2 = -(c2 - t_s) / c3;
43
44 Scalar tmpF;
45 Scalar minF, minX;
46
47 // Check value at lower bound
48 minF = loX * (loX * (loX * c3 / 3.0 + c2) / 2.0 + c1);
49 minX = loX;
50
51 // Check value at upper bound
52 tmpF = hiX * (hiX * (hiX * c3 / 3.0 + c2) / 2.0 + c1);
53 if (tmpF < minF) {
54 minF = tmpF;
55 minX = hiX;
56 }
57
58 // Check value of first root
59 if (loX < s1 && s1 < hiX) {
60 tmpF = s1 * (s1 * (s1 * c3 / 3.0 + c2) / 2.0 + c1);
61 if (tmpF < minF) {
62 minF = tmpF;
63 minX = s1;
64 }
65 }
66
67 // Check value of second root
68 if (loX < s2 && s2 < hiX) {
69 tmpF = s2 * (s2 * (s2 * c3 / 3.0 + c2) / 2.0 + c1);
70 if (tmpF < minF) {
71 minF = tmpF;
72 minX = s2;
73 }
74 }
75
76 return minX;
77 }
78
79 /**
80 * Find the minima in an interval [loX, hiX] of a cubic function which
81 * interpolates the points, function values and gradients provided.
82 *
83 * Implicitly, this function constructs an interpolating polynomial
84 * g(x) = a_3 x^3 + a_2 x^2 + a_1 x + a_0
85 * such that g(x0) = f0, g(x1) = f1, g'(x0) = df0, g'(x1) = df1 where
86 * g'(x) = 3 a_3 x^2 + 2 a_2 x + a_1
87 * is the derivative of g(x). It then computes the roots of g'(x) and
88 * finds the minimal value of g(x) on the interval [loX,hiX] including
89 * the end points.
90 *
91 * @param x0 First point
92 * @param f0 First function value, f(x0)
93 * @param df0 First derivative value, f'(x0)
94 * @param x1 Second point
95 * @param f1 Second function value, f(x1)
96 * @param df1 Second derivative value, f'(x1)
97 * @param loX Lower bound on the interval of solutions
98 * @param hiX Upper bound on the interval of solutions
99 **/
100 template <typename Scalar>
CubicInterp(const Scalar & x0,const Scalar & f0,const Scalar & df0,const Scalar & x1,const Scalar & f1,const Scalar & df1,const Scalar & loX,const Scalar & hiX)101 Scalar CubicInterp(const Scalar &x0, const Scalar &f0, const Scalar &df0,
102 const Scalar &x1, const Scalar &f1, const Scalar &df1,
103 const Scalar &loX, const Scalar &hiX) {
104 return x0 + CubicInterp(df0, x1 - x0, f1 - f0, df1, loX - x0, hiX - x0);
105 }
106
107 /**
108 * An internal utility function for implementing WolfeLineSearch()
109 **/
110 template <typename FunctorType, typename Scalar, typename XType>
WolfLSZoom(Scalar & alpha,XType & newX,Scalar & newF,XType & newDF,FunctorType & func,const XType & x,const Scalar & f,const Scalar & dfp,const Scalar & c1dfp,const Scalar & c2dfp,const XType & p,Scalar alo,Scalar aloF,Scalar aloDFp,Scalar ahi,Scalar ahiF,Scalar ahiDFp,const Scalar & min_range)111 int WolfLSZoom(Scalar &alpha, XType &newX, Scalar &newF, XType &newDF,
112 FunctorType &func, const XType &x, const Scalar &f,
113 const Scalar &dfp, const Scalar &c1dfp, const Scalar &c2dfp,
114 const XType &p, Scalar alo, Scalar aloF, Scalar aloDFp,
115 Scalar ahi, Scalar ahiF, Scalar ahiDFp,
116 const Scalar &min_range) {
117 Scalar d1, d2, newDFp;
118 int itNum(0);
119
120 while (1) {
121 itNum++;
122
123 if (std::fabs(alo - ahi) < min_range)
124 return 1;
125
126 if (itNum % 5 == 0) {
127 alpha = 0.5 * (alo + ahi);
128 } else {
129 // Perform cubic interpolation to determine next point to try
130 d1 = aloDFp + ahiDFp - 3 * (aloF - ahiF) / (alo - ahi);
131 d2 = std::sqrt(d1 * d1 - aloDFp * ahiDFp);
132 if (ahi < alo)
133 d2 = -d2;
134 alpha
135 = ahi - (ahi - alo) * (ahiDFp + d2 - d1) / (ahiDFp - aloDFp + 2 * d2);
136 if (!std::isfinite(alpha)
137 || alpha < std::min(alo, ahi) + 0.01 * std::fabs(alo - ahi)
138 || alpha > std::max(alo, ahi) - 0.01 * std::fabs(alo - ahi))
139 alpha = 0.5 * (alo + ahi);
140 }
141
142 newX = x + alpha * p;
143 while (func(newX, newF, newDF)) {
144 alpha = 0.5 * (alpha + std::min(alo, ahi));
145 if (std::fabs(std::min(alo, ahi) - alpha) < min_range)
146 return 1;
147 newX = x + alpha * p;
148 }
149 newDFp = newDF.dot(p);
150 if (newF > (f + alpha * c1dfp) || newF >= aloF) {
151 ahi = alpha;
152 ahiF = newF;
153 ahiDFp = newDFp;
154 } else {
155 if (std::fabs(newDFp) <= -c2dfp)
156 break;
157 if (newDFp * (ahi - alo) >= 0) {
158 ahi = alo;
159 ahiF = aloF;
160 ahiDFp = aloDFp;
161 }
162 alo = alpha;
163 aloF = newF;
164 aloDFp = newDFp;
165 }
166 }
167 return 0;
168 }
169
170 /**
171 * Perform a line search which finds an approximate solution to:
172 * \f[
173 * \min_\alpha f(x_0 + \alpha p)
174 * \f]
175 * satisfying the strong Wolfe conditions:
176 * 1) \f$ f(x_0 + \alpha p) \leq f(x_0) + c_1 \alpha p^T g(x_0) \f$
177 * 2) \f$ \vert p^T g(x_0 + \alpha p) \vert \leq c_2 \vert p^T g(x_0) \vert \f$
178 * where \f$g(x) = \frac{\partial f}{\partial x}\f$ is the gradient of f(x).
179 *
180 * @tparam FunctorType A type which supports being called as
181 * ret = func(x,f,g)
182 * where x is the input point, f and g are the function value and
183 * gradient at x and ret is non-zero if function evaluation fails.
184 *
185 * @param func Function which is being minimized.
186 *
187 * @param alpha First value of \f$ \alpha \f$ to try. Upon return this
188 * contains the final value of the \f$ \alpha \f$.
189 *
190 * @param x1 Final point, equal to \f$ x_0 + \alpha p \f$.
191 *
192 * @param f1 Final point function value, equal to \f$ f(x_0 + \alpha p) \f$.
193 *
194 * @param gradx1 Final point gradient, equal to \f$ g(x_0 + \alpha p) \f$.
195 *
196 * @param p Search direction. It is assumed to be a descent direction such
197 * that \f$ p^T g(x_0) < 0 \f$.
198 *
199 * @param x0 Value of starting point, \f$ x_0 \f$.
200 *
201 * @param f0 Value of function at starting point, \f$ f(x_0) \f$.
202 *
203 * @param gradx0 Value of function gradient at starting point,
204 * \f$ g(x_0) \f$.
205 *
206 * @param c1 Parameter of the Wolfe conditions. \f$ 0 < c_1 < c_2 < 1 \f$
207 * Typically c1 = 1e-4.
208 *
209 * @param c2 Parameter of the Wolfe conditions. \f$ 0 < c_1 < c_2 < 1 \f$
210 * Typically c2 = 0.9.
211 *
212 * @param minAlpha Smallest allowable step-size.
213 *
214 * @param maxLSIts Maximum number line search iterations.
215 *
216 * @param maxLSRestarts Maximum number of times line search will
217 * restart with \f$ f() \f$ failing.
218 *
219 * @return Returns zero on success, non-zero otherwise.
220 **/
221 template <typename FunctorType, typename Scalar, typename XType>
WolfeLineSearch(FunctorType & func,Scalar & alpha,XType & x1,Scalar & f1,XType & gradx1,const XType & p,const XType & x0,const Scalar & f0,const XType & gradx0,const Scalar & c1,const Scalar & c2,const Scalar & minAlpha,const Scalar & maxLSIts,const Scalar & maxLSRestarts)222 int WolfeLineSearch(FunctorType &func, Scalar &alpha, XType &x1, Scalar &f1,
223 XType &gradx1, const XType &p, const XType &x0,
224 const Scalar &f0, const XType &gradx0, const Scalar &c1,
225 const Scalar &c2, const Scalar &minAlpha,
226 const Scalar &maxLSIts, const Scalar &maxLSRestarts) {
227 const Scalar dfp(gradx0.dot(p));
228 const Scalar c1dfp(c1 * dfp);
229 const Scalar c2dfp(c2 * dfp);
230
231 Scalar alpha0(minAlpha);
232 Scalar alpha1(alpha);
233
234 Scalar prevF(f0);
235 XType prevDF(gradx0);
236 Scalar prevDFp(dfp);
237 Scalar newDFp;
238
239 int retCode = 0, nits = 0, lsRestarts = 0, ret;
240
241 while (1) {
242 if (nits >= maxLSIts) {
243 retCode = 1;
244 break;
245 }
246
247 x1.noalias() = x0 + alpha1 * p;
248 ret = func(x1, f1, gradx1);
249 if (ret != 0) {
250 if (lsRestarts >= maxLSRestarts) {
251 retCode = 1;
252 break;
253 }
254
255 alpha1 = 0.5 * (alpha0 + alpha1);
256 lsRestarts++;
257 continue;
258 }
259 lsRestarts = 0;
260
261 newDFp = gradx1.dot(p);
262 if ((f1 > f0 + alpha * c1dfp) || (f1 >= prevF && nits > 0)) {
263 retCode
264 = WolfLSZoom(alpha, x1, f1, gradx1, func, x0, f0, dfp, c1dfp, c2dfp,
265 p, alpha0, prevF, prevDFp, alpha1, f1, newDFp, 1e-16);
266 break;
267 }
268 if (std::fabs(newDFp) <= -c2dfp) {
269 alpha = alpha1;
270 break;
271 }
272 if (newDFp >= 0) {
273 retCode
274 = WolfLSZoom(alpha, x1, f1, gradx1, func, x0, f0, dfp, c1dfp, c2dfp,
275 p, alpha1, f1, newDFp, alpha0, prevF, prevDFp, 1e-16);
276 break;
277 }
278
279 alpha0 = alpha1;
280 prevF = f1;
281 std::swap(prevDF, gradx1);
282 prevDFp = newDFp;
283
284 alpha1 *= 10.0;
285
286 nits++;
287 }
288 return retCode;
289 }
290 } // namespace optimization
291 } // namespace stan
292
293 #endif
294