1 /** 2 * @file nadamax_update.hpp 3 * @author Sourabh Varshney 4 * 5 * NadaMax update rule. NadaMax is an optimizer that combines the effect of 6 * Adamax and NAG to the gradient descent to improve its Performance. 7 * 8 * ensmallen is free software; you may redistribute it and/or modify it under 9 * the terms of the 3-clause BSD license. You should have received a copy of 10 * the 3-clause BSD license along with ensmallen. If not, see 11 * http://www.opensource.org/licenses/BSD-3-Clause for more information. 12 */ 13 #ifndef ENSMALLEN_ADAM_NADAMAX_UPDATE_HPP 14 #define ENSMALLEN_ADAM_NADAMAX_UPDATE_HPP 15 16 namespace ens { 17 18 /** 19 * NadaMax is an optimizer that combines the AdaMax and NAG. 20 * 21 * For more information, see the following. 22 * 23 * @code 24 * @techreport{Dozat2015, 25 * title = {Incorporating Nesterov momentum into Adam}, 26 * author = {Timothy Dozat}, 27 * institution = {Stanford University}, 28 * address = {Stanford}, 29 * year = {2015}, 30 * url = {https://openreview.net/pdf?id=OM0jvwB8jIp57ZJjtNEZ} 31 * } 32 * @endcode 33 */ 34 class NadaMaxUpdate 35 { 36 public: 37 /** 38 * Construct the NadaMax update policy with the given parameters. 39 * 40 * @param epsilon The epsilon value used to initialise the squared gradient 41 * parameter. 42 * @param beta1 The smoothing parameter. 43 * @param beta2 The second moment coefficient 44 * @param scheduleDecay The decay parameter for decay coefficients 45 */ NadaMaxUpdate(const double epsilon=1e-8,const double beta1=0.9,const double beta2=0.99,const double scheduleDecay=4e-3)46 NadaMaxUpdate(const double epsilon = 1e-8, 47 const double beta1 = 0.9, 48 const double beta2 = 0.99, 49 const double scheduleDecay = 4e-3) : 50 epsilon(epsilon), 51 beta1(beta1), 52 beta2(beta2), 53 scheduleDecay(scheduleDecay), 54 iteration(0) 55 { 56 // Nothing to do. 57 } 58 59 //! Get the value used to initialise the squared gradient parameter. Epsilon() const60 double Epsilon() const { return epsilon; } 61 //! Modify the value used to initialise the squared gradient parameter. Epsilon()62 double& Epsilon() { return epsilon; } 63 64 //! Get the smoothing parameter. Beta1() const65 double Beta1() const { return beta1; } 66 //! Modify the smoothing parameter. Beta1()67 double& Beta1() { return beta1; } 68 69 //! Get the second moment coefficient. Beta2() const70 double Beta2() const { return beta2; } 71 //! Modify the second moment coefficient. Beta2()72 double& Beta2() { return beta2; } 73 74 //! Get the decay parameter for decay coefficients ScheduleDecay() const75 double ScheduleDecay() const { return scheduleDecay; } 76 //! Modify the decay parameter for decay coefficients ScheduleDecay()77 double& ScheduleDecay() { return scheduleDecay; } 78 79 //! Get the current iteration number. Iteration() const80 size_t Iteration() const { return iteration; } 81 //! Modify the current iteration number. Iteration()82 size_t& Iteration() { return iteration; } 83 84 /** 85 * The UpdatePolicyType policy classes must contain an internal 'Policy' 86 * template class with two template arguments: MatType and GradType. This is 87 * instantiated at the start of the optimization, and holds parameters 88 * specific to an individual optimization. 89 */ 90 template<typename MatType, typename GradType> 91 class Policy 92 { 93 public: 94 /** 95 * This constructor method is called by the optimizer before the start of 96 * the iteration update process. 97 * 98 * @param parent Instantiated NadaMaxUpdate parent object. 99 * @param rows Number of rows in the gradient matrix. 100 * @param cols Number of columns in the gradient matrix. 101 */ Policy(NadaMaxUpdate & parent,const size_t rows,const size_t cols)102 Policy(NadaMaxUpdate& parent, const size_t rows, const size_t cols) : 103 parent(parent), 104 cumBeta1(1) 105 { 106 m.zeros(rows, cols); 107 u.zeros(rows, cols); 108 } 109 110 /** 111 * Update step for NadaMax. 112 * 113 * @param iterate Parameters that minimize the function. 114 * @param stepSize Step size to be used for the given iteration. 115 * @param gradient The gradient matrix. 116 */ Update(MatType & iterate,const double stepSize,const GradType & gradient)117 void Update(MatType& iterate, 118 const double stepSize, 119 const GradType& gradient) 120 { 121 // Increment the iteration counter variable. 122 ++parent.iteration; 123 124 // And update the iterate. 125 m *= parent.beta1; 126 m += (1 - parent.beta1) * gradient; 127 128 u = arma::max(u * parent.beta2, arma::abs(gradient)); 129 130 double beta1T = parent.beta1 * (1 - (0.5 * 131 std::pow(0.96, parent.iteration * parent.scheduleDecay))); 132 133 double beta1T1 = parent.beta1 * (1 - (0.5 * 134 std::pow(0.96, (parent.iteration + 1) * parent.scheduleDecay))); 135 136 cumBeta1 *= beta1T; 137 138 const double biasCorrection1 = 1.0 - cumBeta1; 139 140 const double biasCorrection2 = 1.0 - (cumBeta1 * beta1T1); 141 142 if ((biasCorrection1 != 0) && (biasCorrection2 != 0)) 143 { 144 iterate -= (stepSize * (((1 - beta1T) / biasCorrection1) * gradient 145 + (beta1T1 / biasCorrection2) * m)) / (u + parent.epsilon); 146 } 147 } 148 149 private: 150 // Instantiated parent object. 151 NadaMaxUpdate& parent; 152 153 // The exponential moving average of gradient values. 154 GradType m; 155 156 // The exponentially weighted infinity norm. 157 GradType u; 158 159 // The cumulative product of decay coefficients. 160 double cumBeta1; 161 }; 162 163 private: 164 // The epsilon value used to initialise the squared gradient parameter. 165 double epsilon; 166 167 // The smoothing parameter. 168 double beta1; 169 170 // The second moment coefficient. 171 double beta2; 172 173 // The decay parameter for decay coefficients. 174 double scheduleDecay; 175 176 // The number of iterations. 177 size_t iteration; 178 }; 179 180 } // namespace ens 181 182 #endif 183