/**
 * @file padam_update.hpp
 * @author Marcus Edel
 *
 * Implementation of the partially adaptive momentum estimation method
 * (Padam).
 *
 * ensmallen is free software; you may redistribute it and/or modify it under
 * the terms of the 3-clause BSD license. You should have received a copy of
 * the 3-clause BSD license along with ensmallen. If not, see
 * http://www.opensource.org/licenses/BSD-3-Clause for more information.
 */
#ifndef ENSMALLEN_PADAM_PADAM_UPDATE_HPP
#define ENSMALLEN_PADAM_PADAM_UPDATE_HPP

namespace ens {

/**
 * Partially adaptive momentum estimation method (Padam), which uses
 * historical gradient information to automatically adjust the learning rate.
 *
 * For more information, see the following.
 *
 * @code
 * @article{chen2018closing,
 *   title   = {Closing the Generalization Gap of Adaptive Gradient Methods in
 *              Training Deep Neural Networks},
 *   author  = {{Chen}, J. and {Gu}, Q.},
 *   journal = {ArXiv e-prints},
 *   url     = {https://arxiv.org/abs/1806.06763},
 *   year    = {2018}
 * }
 * @endcode
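 *
 * With the partially adaptive parameter p (in (0, 1/2]), one Padam step for
 * gradient g_t takes the form
 *
 *   m_t     = beta1 * m_{t-1} + (1 - beta1) * g_t
 *   v_t     = beta2 * v_{t-1} + (1 - beta2) * g_t^2
 *   vHat_t  = max(vHat_{t-1}, v_t)                    (element-wise)
 *   x_{t+1} = x_t - stepSize * m_t / (vHat_t + epsilon)^p
 *
 * with the usual Adam-style bias corrections folded into the step size, as in
 * the implementation below. Choosing p = 1/2 recovers AMSGrad, while p -> 0
 * approaches SGD with momentum.
 *
 * The sketch below shows typical usage; it assumes ensmallen's SGD optimizer
 * and the RosenbrockFunction test function that ships with the library, so it
 * is illustrative rather than part of this header:
 *
 * @code
 * ens::test::RosenbrockFunction f;
 * arma::mat coordinates = f.GetInitialPoint();
 *
 * // Plug the Padam update policy into SGD.
 * ens::SGD<ens::PadamUpdate> optimizer(0.001, 1, 500000, 1e-9, true,
 *     ens::PadamUpdate(1e-8, 0.9, 0.999, 0.25));
 * optimizer.Optimize(f, coordinates);
 * @endcode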
 */
class PadamUpdate
{
 public:
  /**
   * Construct the Padam update policy with the given parameters.
   *
   * @param epsilon Small value added to the squared gradient estimate to
   *     avoid division by zero.
   * @param beta1 The smoothing parameter.
   * @param beta2 The second moment coefficient.
   * @param partial Partially adaptive parameter, in (0, 0.5].
   */
  PadamUpdate(const double epsilon = 1e-8,
              const double beta1 = 0.9,
              const double beta2 = 0.999,
              const double partial = 0.25) :
      epsilon(epsilon),
      beta1(beta1),
      beta2(beta2),
      partial(partial),
      iteration(0)
  {
    // Nothing to do.
  }

  //! Get the epsilon value used to avoid division by zero.
  double Epsilon() const { return epsilon; }
  //! Modify the epsilon value used to avoid division by zero.
  double& Epsilon() { return epsilon; }

  //! Get the smoothing parameter.
  double Beta1() const { return beta1; }
  //! Modify the smoothing parameter.
  double& Beta1() { return beta1; }

  //! Get the second moment coefficient.
  double Beta2() const { return beta2; }
  //! Modify the second moment coefficient.
  double& Beta2() { return beta2; }

  //! Get the partially adaptive parameter.
  double Partial() const { return partial; }
  //! Modify the partially adaptive parameter.
  double& Partial() { return partial; }

  //! Get the current iteration number.
  size_t Iteration() const { return iteration; }
  //! Modify the current iteration number.
  size_t& Iteration() { return iteration; }

  /**
   * The UpdatePolicyType policy classes must contain an internal 'Policy'
   * template class with two template arguments: MatType and GradType. This is
   * instantiated at the start of the optimization, and holds parameters
   * specific to an individual optimization.
   */
  template<typename MatType, typename GradType>
  class Policy
  {
   public:
    /**
     * This constructor is called by the SGD Optimize() method before the
     * start of the iteration update process.
     *
     * @param parent Instantiated PadamUpdate parent object.
     * @param rows Number of rows in the gradient matrix.
     * @param cols Number of columns in the gradient matrix.
     */
    Policy(PadamUpdate& parent, const size_t rows, const size_t cols) :
        parent(parent)
    {
      m.zeros(rows, cols);
      v.zeros(rows, cols);
      vImproved.zeros(rows, cols);
    }

    /**
     * Update step for Padam.
     *
     * @param iterate Parameters that minimize the function.
     * @param stepSize Step size to be used for the given iteration.
     * @param gradient The gradient matrix.
     */
    void Update(MatType& iterate,
                const double stepSize,
                const GradType& gradient)
    {
      // Increment the iteration counter variable.
      ++parent.iteration;

      // Update the exponential moving averages of the gradient and the
      // squared gradient.
      m *= parent.beta1;
      m += (1 - parent.beta1) * gradient;

      v *= parent.beta2;
      v += (1 - parent.beta2) * (gradient % gradient);

      const double biasCorrection1 = 1.0 - std::pow(parent.beta1,
          parent.iteration);
      const double biasCorrection2 = 1.0 - std::pow(parent.beta2,
          parent.iteration);

      // Element-wise maximum of past and present squared gradients, as in
      // AMSGrad; this keeps the denominator non-decreasing.
      vImproved = arma::max(vImproved, v);

      // Apply the bias-corrected, partially adaptive step.
      iterate -= (stepSize * std::sqrt(biasCorrection2) / biasCorrection1) *
          m / arma::pow(vImproved + parent.epsilon, parent.partial);
    }

   private:
    //! Instantiated parent object.
    PadamUpdate& parent;

    //! The exponential moving average of gradient values.
    GradType m;

    //! The exponential moving average of squared gradient values.
    GradType v;

    //! The element-wise maximum of the squared gradient estimates seen so
    //! far.
    GradType vImproved;
  };

 private:
  //! The epsilon value used to avoid division by zero.
  double epsilon;

  //! The smoothing parameter.
  double beta1;

  //! The second moment coefficient.
  double beta2;

  //! Partially adaptive parameter.
  double partial;

  //! The number of iterations.
  size_t iteration;
};

} // namespace ens

#endif
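// A standalone sketch of driving the update policy by hand, for illustration
// only (not part of the header). It assumes Armadillo and this file are
// available, and minimises the toy quadratic f(x) = 0.5 * ||x||^2, whose
// gradient is simply x:
//
//   #include <armadillo>
//   #include "padam_update.hpp"
//
//   int main()
//   {
//     ens::PadamUpdate update(1e-8, 0.9, 0.999, 0.25);
//     // The optimizer would normally construct this policy internally.
//     ens::PadamUpdate::Policy<arma::mat, arma::mat> policy(update, 2, 1);
//
//     arma::mat iterate = arma::ones<arma::mat>(2, 1);
//     for (size_t i = 0; i < 2000; ++i)
//     {
//       const arma::mat gradient = iterate;  // Gradient of 0.5 * ||x||^2.
//       policy.Update(iterate, 0.01, gradient);
//     }
//     iterate.print("minimiser (should be close to zero)");
//   }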