/**
 * @file spalera_stepsize.hpp
 * @author Marcus Edel
 *
 * Definition of the SPALeRA stepsize technique as described in:
 * "Stochastic Gradient Descent: Going As Fast As Possible But Not Faster" by
 * A. Schoenauer Sebag et al.
 *
 * ensmallen is free software; you may redistribute it and/or modify it under
 * the terms of the 3-clause BSD license.  You should have received a copy of
 * the 3-clause BSD license along with ensmallen.  If not, see
 * http://www.opensource.org/licenses/BSD-3-Clause for more information.
 */

#ifndef ENSMALLEN_SPALERA_SGD_SPALERA_STEPSIZE_HPP
#define ENSMALLEN_SPALERA_SGD_SPALERA_STEPSIZE_HPP

namespace ens {

/**
 * Definition of the SPALeRA stepsize technique, which implements a change
 * detection mechanism (Page-Hinkley) combined with an agnostic learning rate
 * adaptation scheme.
 *
 * For more information, please refer to:
 *
 * @code
 * @misc{Schoenauer2017,
 *   title   = {Stochastic Gradient Descent:
 *              Going As Fast As Possible But Not Faster},
 *   author  = {Schoenauer-Sebag, Alice and Schoenauer, Marc and
 *              Sebag, Michele},
 *   journal = {CoRR},
 *   year    = {2017},
 *   url     = {https://arxiv.org/abs/1709.01427},
 * }
 * @endcode
 */
class SPALeRAStepsize
{
 public:
  /**
   * Construct the SPALeRAStepsize object with the given parameters.  The
   * defaults here are not necessarily good for the given problem, so it is
   * suggested that the values used be tailored to the task at hand.
   *
   * @param alpha Memory parameter of the agnostic learning rate adaptation.
   * @param epsilon Numerical stability parameter.
   * @param adaptRate Agnostic learning rate update rate.
   */
  SPALeRAStepsize(const double alpha = 0.001,
                  const double epsilon = 1e-6,
                  const double adaptRate = 3.10e-8) :
      alpha(alpha),
      epsilon(epsilon),
      adaptRate(adaptRate),
      lambda(0)
  { /* Nothing to do here. */ }

  //! Get the agnostic learning rate adaptation parameter.
  double Alpha() const { return alpha; }
  //! Modify the agnostic learning rate adaptation parameter.
  double& Alpha() { return alpha; }

  //! Get the numerical stability parameter.
  double Epsilon() const { return epsilon; }
  //! Modify the numerical stability parameter.
  double& Epsilon() { return epsilon; }

  //! Get the agnostic learning rate update rate.
  double AdaptRate() const { return adaptRate; }
  //! Modify the agnostic learning rate update rate.
  double& AdaptRate() { return adaptRate; }

  //! Get the Page-Hinkley update parameter lambda.
  double Lambda() const { return lambda; }
  //! Modify the Page-Hinkley update parameter lambda.
  double& Lambda() { return lambda; }
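
  // Illustrative usage sketch (an assumption, not part of this header): the
  // policy is meant to be driven by the SPALeRA SGD optimizer declared in
  // spalera_sgd.hpp; `f` and `coordinates` are hypothetical placeholders for
  // an ensmallen-style separable differentiable function and its parameters:
  //
  //   SPALeRASGD<> optimizer(0.05 /* stepSize */, 32 /* batchSize */);
  //   optimizer.Optimize(f, coordinates);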

  /**
   * The UpdatePolicyType policy classes must contain an internal 'Policy'
   * template class with two template arguments: MatType and GradType.  This
   * is instantiated at the start of the optimization.
   */
  template<typename MatType, typename GradType>
  class Policy
  {
   public:
    /**
     * This constructor is called by the optimizer before the start of the
     * iteration update process.
     *
     * @param parent Instantiated parent class.
     * @param rows Number of rows in the gradient matrix.
     * @param cols Number of columns in the gradient matrix.
     * @param lambda Page-Hinkley update parameter.
     */
    Policy(SPALeRAStepsize& parent,
           const size_t rows,
           const size_t cols,
           const double lambda) :
        parent(parent),
        mu0(0),
        un(0),
        mn(0),
        relaxedObjective(0),
        phCounter(0),
        eveCounter(0)
    {
      learningRates.ones(rows, cols);
      relaxedSums.zeros(rows, cols);

      parent.lambda = lambda;
    }

    /**
     * This function is called in each iteration.
     *
     * @param stepSize Step size to be used for the given iteration.
     * @param objective The current function loss.
     * @param batchSize Batch size to be used for the given iteration.
     * @param numFunctions The number of functions.
     * @param iterate Parameters that minimize the function.
     * @param gradient The gradient matrix.
     *
     * @return true to continue the learning process, or false to stop it
     *     because the learning rates have decayed too far.
     */
    bool Update(const double stepSize,
                const typename MatType::elem_type objective,
                const size_t batchSize,
                const size_t numFunctions,
                MatType& iterate,
                const GradType& gradient)
    {
      // The ratio of mini-batch size to training set size; needed for the
      // Page-Hinkley relaxed objective computations.
      const double mbRatio = batchSize / (double) numFunctions;

      // Page-Hinkley iteration: update the relaxed objective, then check
      // whether the parameters have to be reset and the step size adjusted.
      if (phCounter > (1 / mbRatio))
      {
        relaxedObjective = (1 - mbRatio) * relaxedObjective + mbRatio *
            objective;
      }
      else
      {
        relaxedObjective = phCounter * relaxedObjective + objective;
        relaxedObjective /= (phCounter + 1);
      }

      // Update the running mean mu0 of the relaxed objective.
      mu0 = phCounter * mu0 + relaxedObjective;
      mu0 = mu0 / (phCounter + 1);

      // Update the cumulative deviation un from the running mean.
      un += relaxedObjective - mu0;

      // Track the minimum mn of the cumulative deviation.
      if (un < mn)
        mn = un;

      // If the Page-Hinkley test fires, backtrack to the previous parameters
      // and halve the learning rates.
      if ((un - mn) > parent.lambda)
      {
        // Backtracking: reset the parameters.
        iterate = previousIterate;

        // Divide the learning rates by 2, as proposed in: "Stochastic
        // Gradient Descent: Going As Fast As Possible But Not Faster".
        learningRates /= 2;

        if (arma::any(arma::vectorise(learningRates) <= 1e-15))
        {
          // Stop because the learning rates are too low.
          return false;
        }

        // Reset the evaluation and Page-Hinkley counter parameters.
        mu0 = un = mn = relaxedObjective = phCounter = eveCounter = 0;
      }
      else
      {
        // Statistics of the relaxed gradient sums under the agnostic
        // (uninformative gradient) hypothesis.
        const double paramMean = (parent.alpha / (2 - parent.alpha) *
            (1 - std::pow(1 - parent.alpha, 2 * (eveCounter + 1)))) /
            iterate.n_elem;

        const double paramStd = (parent.alpha / std::sqrt(iterate.n_elem)) /
            std::sqrt(iterate.n_elem);

        // L2 norm of the current gradient.
        const typename MatType::elem_type normGradient =
            std::sqrt(arma::accu(arma::pow(gradient, 2)));

        // Exponentially relax the per-parameter sums with the normalized
        // gradient; skip the gradient term if its norm is numerically
        // negligible.
        relaxedSums *= (1 - parent.alpha);
        if (normGradient > parent.epsilon)
          relaxedSums += gradient * (parent.alpha / normGradient);

        // Agnostic per-parameter learning rate update.
        learningRates %= arma::exp((arma::pow(relaxedSums, 2) - paramMean) *
            (parent.adaptRate / paramStd));

        previousIterate = iterate;

        iterate -= stepSize * (learningRates % gradient);

        // Keep track of the number of evaluations and Page-Hinkley steps.
        eveCounter++;
        phCounter++;
      }

      return true;
    }
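
    // For reference, a worked form of the agnostic update performed above:
    // each per-parameter learning rate eta_i is multiplied by
    //
    //   exp(adaptRate * (s_i^2 - paramMean) / paramStd),
    //
    // where s_i is the exponentially relaxed sum of normalized gradient
    // components (relaxedSums), and paramMean = alpha / (2 - alpha) *
    // (1 - (1 - alpha)^(2 (t + 1))) / n is the expected value of s_i^2 after
    // t + 1 = eveCounter + 1 steps under direction-agnostic (random-sign)
    // gradients with n = iterate.n_elem parameters.  Rates therefore grow
    // for parameters whose gradient components keep the same sign across
    // iterations, and shrink for those that oscillate.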

   private:
    //! Instantiated parent object.
    SPALeRAStepsize& parent;

    //! Page-Hinkley running mean of the relaxed objective.
    double mu0;

    //! Page-Hinkley cumulative deviation from the running mean.
    double un;

    //! Page-Hinkley minimum of the cumulative deviation.
    double mn;

    //! Page-Hinkley relaxed (smoothed) objective.
    typename MatType::elem_type relaxedObjective;

    //! Page-Hinkley step counter.
    size_t phCounter;

    //! Evaluation step counter.
    size_t eveCounter;

    //! Locally-stored parameter-wise learning rates.
    MatType learningRates;

    //! Locally-stored parameter-wise relaxed gradient sums.
    MatType relaxedSums;

    //! Locally-stored previous parameter matrix (for backtracking).
    MatType previousIterate;
  };

 private:
  //! Memory parameter of the agnostic learning rate adaptation.
  double alpha;

  //! Numerical stability parameter.
  double epsilon;

  //! Agnostic learning rate update rate.
  double adaptRate;

  //! Page-Hinkley update parameter.
  double lambda;
};

} // namespace ens

#endif // ENSMALLEN_SPALERA_SGD_SPALERA_STEPSIZE_HPP
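
// A minimal sketch of how an optimizer's inner loop could drive the Policy
// class above (illustrative only; `f`, `iterate`, `gradient`, `stepSize`,
// `batchStart`, and `batchSize` are hypothetical names following ensmallen's
// separable function interface):
//
//   ens::SPALeRAStepsize stepsize;
//   ens::SPALeRAStepsize::Policy<arma::mat, arma::mat> policy(
//       stepsize, iterate.n_rows, iterate.n_cols, 0.01 /* lambda */);
//
//   while (/* not converged */)
//   {
//     const double objective = f.Evaluate(iterate, batchStart, batchSize);
//     f.Gradient(iterate, batchStart, gradient, batchSize);
//
//     // Update() backtracks and halves the rates when the Page-Hinkley test
//     // fires; it returns false once all rates have decayed below 1e-15.
//     if (!policy.Update(stepSize, objective, batchSize, f.NumFunctions(),
//         iterate, gradient))
//       break;  // Learning rates too low; stop.
//   }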