/**
 * @file spalera_stepsize.hpp
 * @author Marcus Edel
 *
 * Definition of the SPALeRA stepsize technique as described in:
 * "Stochastic Gradient Descent: Going As Fast As Possible But Not Faster" by
 * A. Schoenauer Sebag et al.
 *
 * ensmallen is free software; you may redistribute it and/or modify it under
 * the terms of the 3-clause BSD license.  You should have received a copy of
 * the 3-clause BSD license along with ensmallen.  If not, see
 * http://www.opensource.org/licenses/BSD-3-Clause for more information.
 */

#ifndef ENSMALLEN_SPALERA_SGD_SPALERA_STEPSIZE_HPP
#define ENSMALLEN_SPALERA_SGD_SPALERA_STEPSIZE_HPP

namespace ens {
/**
 * Definition of the SPALeRA stepsize technique, which implements a change
 * detection mechanism with an agnostic adaptation scheme.
 *
 * For more information, please refer to:
 *
 * @code
 * @misc{Schoenauer2017,
 *   title   = {Stochastic Gradient Descent:
 *              Going As Fast As Possible But Not Faster},
 *   author  = {Schoenauer-Sebag, Alice and Schoenauer, Marc and
 *              Sebag, Michele},
 *   journal = {CoRR},
 *   year    = {2017},
 *   url     = {https://arxiv.org/abs/1709.01427},
 * }
 * @endcode
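 *
 * In practice this stepsize technique is used through the SPALeRASGD
 * optimizer rather than constructed directly.  A minimal usage sketch
 * follows; the SPALeRASGD constructor argument order shown here is an
 * assumption, so refer to the optimizer documentation for the exact
 * signature:
 *
 * @code
 * // Any differentiable, separable function works here; RosenbrockFunction
 * // is used purely as an illustrative example.
 * RosenbrockFunction f;
 * arma::mat coordinates = f.GetInitialPoint();
 *
 * // Assumed argument order: step size, batch size, maximum iterations,
 * // tolerance.
 * SPALeRASGD<> optimizer(0.05, 1, 10000, 1e-4);
 * optimizer.Optimize(f, coordinates);
 * @endcode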
 */
class SPALeRAStepsize
{
 public:
  /**
   * Construct the SPALeRAStepsize object with the given parameters.  The
   * defaults here are not necessarily good for the given problem, so it is
   * suggested that the values used be tailored to the task at hand.
   *
   * @param alpha Memory parameter of the agnostic learning rate adaptation.
   * @param epsilon Numerical stability parameter.
   * @param adaptRate Agnostic learning rate update rate.
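   *
   * For example (parameter values purely illustrative):
   *
   * @code
   * // alpha = 0.01, epsilon = 1e-8, adaptRate = 3.10e-8.
   * SPALeRAStepsize stepsize(0.01, 1e-8, 3.10e-8);
   * @endcode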
   */
  SPALeRAStepsize(const double alpha = 0.001,
                  const double epsilon = 1e-6,
                  const double adaptRate = 3.10e-8) :
      alpha(alpha),
      epsilon(epsilon),
      adaptRate(adaptRate),
      lambda(0)
  {
    /* Nothing to do here. */
  }

  //! Get the agnostic learning rate adaptation parameter.
  double Alpha() const { return alpha; }
  //! Modify the agnostic learning rate adaptation parameter.
  double& Alpha() { return alpha; }

  //! Get the numerical stability parameter.
  double Epsilon() const { return epsilon; }
  //! Modify the numerical stability parameter.
  double& Epsilon() { return epsilon; }

  //! Get the agnostic learning rate update rate.
  double AdaptRate() const { return adaptRate; }
  //! Modify the agnostic learning rate update rate.
  double& AdaptRate() { return adaptRate; }

  //! Get the Page-Hinkley update parameter lambda.
  double Lambda() const { return lambda; }
  //! Modify the Page-Hinkley update parameter lambda.
  double& Lambda() { return lambda; }
  /**
   * The UpdatePolicyType policy classes must contain an internal 'Policy'
   * template class with two template arguments: MatType and GradType.  This is
   * instantiated at the start of the optimization.
   */
  template<typename MatType, typename GradType>
  class Policy
  {
   public:
    /**
     * This is called by the optimizer method before the start of the iteration
     * update process.
     *
     * @param parent Instantiated parent class.
     * @param rows Number of rows in the gradient matrix.
     * @param cols Number of columns in the gradient matrix.
     * @param lambda Page-Hinkley update parameter.
     */
    Policy(SPALeRAStepsize& parent,
           const size_t rows,
           const size_t cols,
           const double lambda) :
        parent(parent),
        mu0(0),
        un(0),
        mn(0),
        relaxedObjective(0),
        phCounter(0),
        eveCounter(0)
    {
      learningRates.ones(rows, cols);
      relaxedSums.zeros(rows, cols);

      parent.lambda = lambda;
    }
    /**
     * This function is called in each iteration.
     *
     * @param stepSize Step size to be used for the given iteration.
     * @param objective The current function loss.
     * @param batchSize Batch size to be used for the given iteration.
     * @param numFunctions The number of functions.
     * @param iterate Parameters that minimize the function.
     * @param gradient The gradient matrix.
     *
     * @return true to continue the learning process, or false to stop it.
     */
    bool Update(const double stepSize,
                const typename MatType::elem_type objective,
                const size_t batchSize,
                const size_t numFunctions,
                MatType& iterate,
                const GradType& gradient)
    {
      // The ratio of mini-batch size to training set size; needed for the
      // Page-Hinkley relaxed objective computations.
      const double mbRatio = batchSize / (double) numFunctions;

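      // Page-Hinkley change detection in brief: maintain a running mean mu0
      // of the relaxed objective, the cumulative deviation
      //   un = sum_k (relaxedObjective_k - mu0_k),
      // and its running minimum mn.  When (un - mn) exceeds the threshold
      // lambda, the loss is deemed to have stopped decreasing, and the
      // backtracking branch below fires.
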
      // Page-Hinkley iteration; check if we have to reset the parameters and
      // adjust the step size.
      if (phCounter > (1 / mbRatio))
      {
        relaxedObjective = (1 - mbRatio) * relaxedObjective + mbRatio *
            objective;
      }
      else
      {
        relaxedObjective = phCounter * relaxedObjective + objective;
        relaxedObjective /= (phCounter + 1);
      }

      // Update the mu0 parameter.
      mu0 = phCounter * mu0 + relaxedObjective;
      mu0 = mu0 / (phCounter + 1);

      // Update the un parameter.
      un += relaxedObjective - mu0;

      // Update the mn parameter.
      if (un < mn)
        mn = un;

      // If the Page-Hinkley test fires, backtrack and reset the parameters.
      if ((un - mn) > parent.lambda)
      {
        // Backtracking: reset the parameters.
        iterate = previousIterate;

        // Dividing learning rates by 2 as proposed in:
        // Stochastic Gradient Descent: Going As Fast As Possible But Not
        // Faster.
        learningRates /= 2;

        if (arma::any(arma::vectorise(learningRates) <= 1e-15))
        {
          // Stop because the learning rate is too low.
          return false;
        }

        // Reset the evaluation and Page-Hinkley counters and statistics.
        mu0 = un = mn = relaxedObjective = phCounter = eveCounter = 0;
      }
      else
      {
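        // Agnostic learning rate adaptation: each parameter keeps an
        // exponentially relaxed sum of normalized gradients.  Its squared
        // value is compared against the mean (paramMean) and deviation scale
        // (paramStd) expected if successive gradient directions were
        // independent; rates grow for parameters whose gradients agree in
        // direction and shrink for oscillating ones.
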
        const double paramMean = (parent.alpha / (2 - parent.alpha) *
            (1 - std::pow(1 - parent.alpha, 2 * (eveCounter + 1)))) /
            iterate.n_elem;

        const double paramStd = (parent.alpha / std::sqrt(iterate.n_elem)) /
            std::sqrt(iterate.n_elem);

        const typename MatType::elem_type normGradient =
            std::sqrt(arma::accu(arma::pow(gradient, 2)));

        relaxedSums *= (1 - parent.alpha);
        if (normGradient > parent.epsilon)
          relaxedSums += gradient * (parent.alpha / normGradient);

        learningRates %= arma::exp((arma::pow(relaxedSums, 2) - paramMean) *
            (parent.adaptRate / paramStd));

        previousIterate = iterate;

        iterate -= stepSize * (learningRates % gradient);

        // Keep track of the number of evaluations and Page-Hinkley steps.
        eveCounter++;
        phCounter++;
      }

      return true;
    }

   private:
    //! Instantiated parent object.
    SPALeRAStepsize& parent;

    //! Page-Hinkley running mean of the relaxed objective.
    double mu0;

    //! Page-Hinkley cumulative deviation from the running mean.
    double un;

    //! Page-Hinkley running minimum of the cumulative deviation.
    double mn;

    //! Relaxed (smoothed) objective value for the Page-Hinkley test.
    typename MatType::elem_type relaxedObjective;

    //! Page-Hinkley step counter.
    size_t phCounter;

    //! Evaluations step counter.
    size_t eveCounter;

    //! Locally-stored parameter-wise learning rates.
    MatType learningRates;

    //! Locally-stored parameter-wise sums.
    MatType relaxedSums;

    //! Locally-stored previous parameter matrix (backtracking).
    MatType previousIterate;
  };

 private:
  //! Memory parameter of the agnostic learning rate adaptation.
  double alpha;

  //! Numerical stability parameter.
  double epsilon;

  //! Agnostic learning rate update rate.
  double adaptRate;

  //! Page-Hinkley update parameter.
  double lambda;
};

} // namespace ens

#endif // ENSMALLEN_SPALERA_SGD_SPALERA_STEPSIZE_HPP