/**
 * @file qhadam_update.hpp
 * @author Niteya Shah
 *
 * Implements the QHAdam optimizer.  QHAdam is a variant of Adam which
 * introduces quasi-hyperbolic moment terms to improve parameterisation and
 * performance.
 *
 * ensmallen is free software; you may redistribute it and/or modify it under
 * the terms of the 3-clause BSD license.  You should have received a copy of
 * the 3-clause BSD license along with ensmallen.  If not, see
 * http://www.opensource.org/licenses/BSD-3-Clause for more information.
 */
#ifndef ENSMALLEN_ADAM_QHADAM_UPDATE_HPP
#define ENSMALLEN_ADAM_QHADAM_UPDATE_HPP

namespace ens {

/**
 * QHAdam is an optimisation strategy based on applying the quasi-hyperbolic
 * step to the Adam optimiser.  A QH update can be considered a weighted
 * average of the momentum update and the plain gradient update.  Depending on
 * its parameterisation, QHAdam can recover many other algorithms, such as
 * NAdam and RMSProp.
 *
 * For more information, see the following.
 *
 * @code
 * @inproceedings{ma2019qh,
 *   title     = {Quasi-hyperbolic momentum and Adam for deep learning},
 *   author    = {Jerry Ma and Denis Yarats},
 *   booktitle = {International Conference on Learning Representations},
 *   year      = {2019}
 * }
 * @endcode
 */
class QHAdamUpdate
{
 public:
  /**
   * Construct the QHAdam update policy with the given parameters.
   *
   * @param epsilon The epsilon value used to initialise the squared gradient
   *     parameter.
   * @param beta1 The smoothing parameter.
   * @param beta2 The second moment coefficient.
   * @param v1 The first quasi-hyperbolic term.
   * @param v2 The second quasi-hyperbolic term.
   */
  QHAdamUpdate(const double epsilon = 1e-8,
               const double beta1 = 0.9,
               const double beta2 = 0.999,
               const double v1 = 0.7,
               const double v2 = 1) :
      epsilon(epsilon),
      beta1(beta1),
      beta2(beta2),
      v1(v1),
      v2(v2),
      iteration(0)
  {
    // Nothing to do.
  }

  //! Get the value used to initialise the squared gradient parameter.
  double Epsilon() const { return epsilon; }
  //! Modify the value used to initialise the squared gradient parameter.
  double& Epsilon() { return epsilon; }

  //! Get the smoothing parameter.
  double Beta1() const { return beta1; }
  //! Modify the smoothing parameter.
  double& Beta1() { return beta1; }

  //! Get the second moment coefficient.
  double Beta2() const { return beta2; }
  //! Modify the second moment coefficient.
  double& Beta2() { return beta2; }

  //! Get the current iteration number.
  size_t Iteration() const { return iteration; }
  //! Modify the current iteration number.
  size_t& Iteration() { return iteration; }

  //! Get the first quasi-hyperbolic term.
  double V1() const { return v1; }
  //! Modify the first quasi-hyperbolic term.
  double& V1() { return v1; }

  //! Get the second quasi-hyperbolic term.
  double V2() const { return v2; }
  //! Modify the second quasi-hyperbolic term.
  double& V2() { return v2; }
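
  /*
   * Reference sketch of the step that the Policy class below implements,
   * derived from its Update() method; with gradient g, iterate theta, and
   * iteration t, in the notation of this class's members:
   *
   *   m <- beta1 * m + (1 - beta1) * g
   *   v <- beta2 * v + (1 - beta2) * g^2
   *   mHat = m / (1 - beta1^t),  vHat = v / (1 - beta2^t)
   *   theta <- theta - stepSize * ((1 - v1) * g + v1 * mHat) /
   *                    (sqrt((1 - v2) * g^2 + v2 * vHat) + epsilon)
   *
   * Setting v1 = v2 = 1 recovers Adam.  A minimal usage sketch, assuming
   * ensmallen's SGD wrapper and some differentiable function f (this class is
   * not used directly, but as the update policy of an optimizer):
   *
   *   ens::SGD<ens::QHAdamUpdate> optimizer(0.001, 32, 100000, 1e-5);
   *   optimizer.Optimize(f, coordinates);
   */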
  /**
   * The UpdatePolicyType policy classes must contain an internal 'Policy'
   * template class with two template arguments: MatType and GradType.  This
   * is instantiated at the start of the optimization, and holds parameters
   * specific to an individual optimization.
   */
  template<typename MatType, typename GradType>
  class Policy
  {
   public:
    /**
     * This constructor is called by the SGD Optimize() method before the
     * start of the iteration update process.
     *
     * @param parent QHAdamUpdate object.
     * @param rows Number of rows in the gradient matrix.
     * @param cols Number of columns in the gradient matrix.
     */
    Policy(QHAdamUpdate& parent, const size_t rows, const size_t cols) :
        parent(parent)
    {
      m.zeros(rows, cols);
      v.zeros(rows, cols);
    }

    /**
     * Update step for QHAdam.
     *
     * @param iterate Parameters that minimize the function.
     * @param stepSize Step size to be used for the given iteration.
     * @param gradient The gradient matrix.
     */
    void Update(MatType& iterate,
                const double stepSize,
                const GradType& gradient)
    {
      // Increment the iteration counter variable.
      ++parent.iteration;

      // Update the exponential moving averages of the gradient and the
      // squared gradient.
      m *= parent.beta1;
      m += (1 - parent.beta1) * gradient;

      v *= parent.beta2;
      v += (1 - parent.beta2) * (gradient % gradient);

      // Bias correction for the first and second moment estimates.
      const double biasCorrection1 = 1.0 - std::pow(parent.beta1,
          parent.iteration);
      const double biasCorrection2 = 1.0 - std::pow(parent.beta2,
          parent.iteration);

      GradType mDash = m / biasCorrection1;
      GradType vDash = v / biasCorrection2;

      // And update the iterate.  QHAdam recovers Adam when v2 = v1 = 1.
      iterate -= stepSize *
          ((((1 - parent.v1) * gradient) + parent.v1 * mDash) /
          (arma::sqrt(((1 - parent.v2) * (gradient % gradient)) +
          parent.v2 * vDash) + parent.epsilon));
    }

   private:
    //! Instantiated parent object.
    QHAdamUpdate& parent;

    //! The exponential moving average of gradient values.
    GradType m;

    //! The exponential moving average of squared gradient values.
    GradType v;
  };

 private:
  // The epsilon value used to initialise the squared gradient parameter.
  double epsilon;

  // The smoothing parameter.
  double beta1;

  // The second moment coefficient.
  double beta2;

  // The first quasi-hyperbolic term.
  double v1;

  // The second quasi-hyperbolic term.
  double v2;

  // The number of iterations.
  size_t iteration;
};

} // namespace ens

#endif