1 /**
2  * @file nadamax_update.hpp
3  * @author Sourabh Varshney
4  *
 * NadaMax update rule. NadaMax is an optimizer that combines AdaMax with
 * Nesterov's accelerated gradient (NAG) to improve the performance of gradient
 * descent.
7  *
8  * ensmallen is free software; you may redistribute it and/or modify it under
9  * the terms of the 3-clause BSD license.  You should have received a copy of
10  * the 3-clause BSD license along with ensmallen.  If not, see
11  * http://www.opensource.org/licenses/BSD-3-Clause for more information.
12  */
13 #ifndef ENSMALLEN_ADAM_NADAMAX_UPDATE_HPP
14 #define ENSMALLEN_ADAM_NADAMAX_UPDATE_HPP
15 
16 namespace ens {
17 
18 /**
 * NadaMax is an optimizer that combines AdaMax with Nesterov momentum (NAG).
20  *
21  * For more information, see the following.
22  *
23  * @code
24  * @techreport{Dozat2015,
25  *   title       = {Incorporating Nesterov momentum into Adam},
26  *   author      = {Timothy Dozat},
27  *   institution = {Stanford University},
28  *   address     = {Stanford},
29  *   year        = {2015},
30  *   url         = {https://openreview.net/pdf?id=OM0jvwB8jIp57ZJjtNEZ}
31  * }
32  * @endcode
33  */
34 class NadaMaxUpdate
35 {
36  public:
37   /**
38    * Construct the NadaMax update policy with the given parameters.
39    *
40    * @param epsilon The epsilon value used to initialise the squared gradient
41    *        parameter.
42    * @param beta1 The smoothing parameter.
43    * @param beta2 The second moment coefficient
44    * @param scheduleDecay The decay parameter for decay coefficients
45    */
NadaMaxUpdate(const double epsilon=1e-8,const double beta1=0.9,const double beta2=0.99,const double scheduleDecay=4e-3)46   NadaMaxUpdate(const double epsilon = 1e-8,
47                 const double beta1 = 0.9,
48                 const double beta2 = 0.99,
49                 const double scheduleDecay = 4e-3) :
50       epsilon(epsilon),
51       beta1(beta1),
52       beta2(beta2),
53       scheduleDecay(scheduleDecay),
54       iteration(0)
55   {
56     // Nothing to do.
57   }
58 
59   //! Get the value used to initialise the squared gradient parameter.
Epsilon() const60   double Epsilon() const { return epsilon; }
61   //! Modify the value used to initialise the squared gradient parameter.
Epsilon()62   double& Epsilon() { return epsilon; }
63 
64   //! Get the smoothing parameter.
Beta1() const65   double Beta1() const { return beta1; }
66   //! Modify the smoothing parameter.
Beta1()67   double& Beta1() { return beta1; }
68 
69   //! Get the second moment coefficient.
Beta2() const70   double Beta2() const { return beta2; }
71   //! Modify the second moment coefficient.
Beta2()72   double& Beta2() { return beta2; }
73 
74   //! Get the decay parameter for decay coefficients
ScheduleDecay() const75   double ScheduleDecay() const { return scheduleDecay; }
76   //! Modify the decay parameter for decay coefficients
ScheduleDecay()77   double& ScheduleDecay() { return scheduleDecay; }
78 
79   //! Get the current iteration number.
Iteration() const80   size_t Iteration() const { return iteration; }
81   //! Modify the current iteration number.
Iteration()82   size_t& Iteration() { return iteration; }
83 
84   /**
85    * The UpdatePolicyType policy classes must contain an internal 'Policy'
86    * template class with two template arguments: MatType and GradType.  This is
87    * instantiated at the start of the optimization, and holds parameters
88    * specific to an individual optimization.
89    */
90   template<typename MatType, typename GradType>
91   class Policy
92   {
93    public:
94     /**
95      * This constructor method is called by the optimizer before the start of
96      * the iteration update process.
97      *
98      * @param parent Instantiated NadaMaxUpdate parent object.
99      * @param rows Number of rows in the gradient matrix.
100      * @param cols Number of columns in the gradient matrix.
101      */
Policy(NadaMaxUpdate & parent,const size_t rows,const size_t cols)102     Policy(NadaMaxUpdate& parent, const size_t rows, const size_t cols) :
103         parent(parent),
104         cumBeta1(1)
105     {
106       m.zeros(rows, cols);
107       u.zeros(rows, cols);
108     }
109 
110     /**
111      * Update step for NadaMax.
112      *
113      * @param iterate Parameters that minimize the function.
114      * @param stepSize Step size to be used for the given iteration.
115      * @param gradient The gradient matrix.
116      */
Update(MatType & iterate,const double stepSize,const GradType & gradient)117     void Update(MatType& iterate,
118                 const double stepSize,
119                 const GradType& gradient)
120     {
121       // Increment the iteration counter variable.
122       ++parent.iteration;
123 
124       // And update the iterate.
125       m *= parent.beta1;
126       m += (1 - parent.beta1) * gradient;
127 
128       u = arma::max(u * parent.beta2, arma::abs(gradient));
129 
130       double beta1T = parent.beta1 * (1 - (0.5 *
131           std::pow(0.96, parent.iteration * parent.scheduleDecay)));
132 
133       double beta1T1 = parent.beta1 * (1 - (0.5 *
134           std::pow(0.96, (parent.iteration + 1) * parent.scheduleDecay)));
135 
136       cumBeta1 *= beta1T;
137 
138       const double biasCorrection1 = 1.0 - cumBeta1;
139 
140       const double biasCorrection2 = 1.0 - (cumBeta1 * beta1T1);
141 
142       if ((biasCorrection1 != 0) && (biasCorrection2 != 0))
143       {
144          iterate -= (stepSize * (((1 - beta1T) / biasCorrection1) * gradient
145              + (beta1T1 / biasCorrection2) * m)) / (u + parent.epsilon);
146       }
147     }
148 
149    private:
150     // Instantiated parent object.
151     NadaMaxUpdate& parent;
152 
153     // The exponential moving average of gradient values.
154     GradType m;
155 
156     // The exponentially weighted infinity norm.
157     GradType u;
158 
159     // The cumulative product of decay coefficients.
160     double cumBeta1;
161   };
162 
163  private:
164   // The epsilon value used to initialise the squared gradient parameter.
165   double epsilon;
166 
167   // The smoothing parameter.
168   double beta1;
169 
170   // The second moment coefficient.
171   double beta2;
172 
173   // The decay parameter for decay coefficients.
174   double scheduleDecay;
175 
176   // The number of iterations.
177   size_t iteration;
178 };
179 
180 } // namespace ens
181 
182 #endif
183