/**
 * @file qhadam_update.hpp
 * @author Niteya Shah
 *
 * Implements the QHAdam optimizer. QHAdam is a variant of Adam which
 * introduces quasi-hyperbolic moment terms to improve parameterisation and
 * performance.
 *
 * ensmallen is free software; you may redistribute it and/or modify it under
 * the terms of the 3-clause BSD license.  You should have received a copy of
 * the 3-clause BSD license along with ensmallen.  If not, see
 * http://www.opensource.org/licenses/BSD-3-Clause for more information.
 */
#ifndef ENSMALLEN_ADAM_QHADAM_UPDATE_HPP
#define ENSMALLEN_ADAM_QHADAM_UPDATE_HPP

namespace ens {

/**
 * QHAdam is an optimising strategy that applies the quasi-hyperbolic step to
 * the Adam optimiser. The quasi-hyperbolic update can be considered a
 * weighted average of the momentum term and the plain gradient. Depending on
 * its parameterisation, QHAdam can recover many other algorithms, such as
 * NAdam and RMSProp.
 *
 * For more information, see the following.
 *
 * @code
 * @inproceedings{ma2019qh,
 *   title={Quasi-hyperbolic momentum and Adam for deep learning},
 *   author={Jerry Ma and Denis Yarats},
 *   booktitle={International Conference on Learning Representations},
 *   year={2019}
 * }
 * @endcode
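 *
 * In each iteration, given the gradient g, the bias-corrected first moment
 * estimate m', and the bias-corrected second moment estimate v', the iterate
 * is updated as (see Update() below)
 *
 *   iterate -= stepSize * ((1 - v1) * g + v1 * m') /
 *       (sqrt((1 - v2) * (g % g) + v2 * v') + epsilon),
 *
 * where % denotes the element-wise product.
 *
 * A minimal usage sketch (assumptions: ensmallen's SGD optimizer accepts an
 * update policy instance, and RosenbrockFunction is available among
 * ensmallen's test functions):
 *
 * @code
 * RosenbrockFunction f;
 * arma::mat coordinates = f.GetInitialPoint();
 *
 * // SGD with the QHAdam update policy and explicit hyperparameters.
 * SGD<QHAdamUpdate> optimizer(0.001, 32, 100000, 1e-5, true,
 *     QHAdamUpdate(1e-8, 0.9, 0.999, 0.7, 1.0));
 * optimizer.Optimize(f, coordinates);
 * @endcode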
 */
class QHAdamUpdate
{
 public:
  /**
   * Construct the QHAdam update policy with the given parameters.
   *
   * @param epsilon The epsilon value used to initialise the squared gradient
   *        parameter.
   * @param beta1 The smoothing parameter.
   * @param beta2 The second moment coefficient.
   * @param v1 The first quasi-hyperbolic term.
   * @param v2 The second quasi-hyperbolic term.
   */
  QHAdamUpdate(const double epsilon = 1e-8,
               const double beta1 = 0.9,
               const double beta2 = 0.999,
               const double v1 = 0.7,
               const double v2 = 1) :
    epsilon(epsilon),
    beta1(beta1),
    beta2(beta2),
    v1(v1),
    v2(v2),
    iteration(0)
  {
    // Nothing to do.
  }

  //! Get the value used to initialise the squared gradient parameter.
  double Epsilon() const { return epsilon; }
  //! Modify the value used to initialise the squared gradient parameter.
  double& Epsilon() { return epsilon; }

  //! Get the smoothing parameter.
  double Beta1() const { return beta1; }
  //! Modify the smoothing parameter.
  double& Beta1() { return beta1; }

  //! Get the second moment coefficient.
  double Beta2() const { return beta2; }
  //! Modify the second moment coefficient.
  double& Beta2() { return beta2; }

  //! Get the current iteration number.
  size_t Iteration() const { return iteration; }
  //! Modify the current iteration number.
  size_t& Iteration() { return iteration; }

  //! Get the first quasi-hyperbolic term.
  double V1() const { return v1; }
  //! Modify the first quasi-hyperbolic term.
  double& V1() { return v1; }

  //! Get the second quasi-hyperbolic term.
  double V2() const { return v2; }
  //! Modify the second quasi-hyperbolic term.
  double& V2() { return v2; }

  /**
   * The UpdatePolicyType policy classes must contain an internal 'Policy'
   * template class with two template arguments: MatType and GradType.  This is
   * instantiated at the start of the optimization, and holds parameters
   * specific to an individual optimization.
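   *
   * As a sketch of how an optimizer drives this policy (iterate, stepSize,
   * and gradient are hypothetical variables, and this only mirrors what the
   * optimizer does internally):
   *
   * @code
   * QHAdamUpdate update;
   * QHAdamUpdate::Policy<arma::mat, arma::mat> policy(update,
   *     iterate.n_rows, iterate.n_cols);
   * policy.Update(iterate, stepSize, gradient);
   * @endcode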
   */
  template<typename MatType, typename GradType>
  class Policy
  {
   public:
    /**
     * This constructor is called by the SGD Optimize() method before the start
     * of the iteration update process.
     *
     * @param parent QHAdamUpdate object.
     * @param rows Number of rows in the gradient matrix.
     * @param cols Number of columns in the gradient matrix.
     */
    Policy(QHAdamUpdate& parent, const size_t rows, const size_t cols) :
        parent(parent)
    {
      m.zeros(rows, cols);
      v.zeros(rows, cols);
    }

    /**
     * Update step for QHAdam.
     *
     * @param iterate Parameters that minimize the function.
     * @param stepSize Step size to be used for the given iteration.
     * @param gradient The gradient matrix.
     */
    void Update(MatType& iterate,
                const double stepSize,
                const GradType& gradient)
    {
      // Increment the iteration counter variable.
      ++parent.iteration;

      // Update the exponentially weighted moving averages of the gradient
      // and of the squared gradient.
      m *= parent.beta1;
      m += (1 - parent.beta1) * gradient;

      v *= parent.beta2;
      v += (1 - parent.beta2) * (gradient % gradient);

      const double biasCorrection1 = 1.0 - std::pow(parent.beta1,
          parent.iteration);
      const double biasCorrection2 = 1.0 - std::pow(parent.beta2,
          parent.iteration);

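      // Bias-corrected estimates of the first and second moments.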
      GradType mDash = m / biasCorrection1;
      GradType vDash = v / biasCorrection2;

      // QHAdam recovers Adam when v2 = v1 = 1.
      iterate -= stepSize *
          ((((1 - parent.v1) * gradient) + parent.v1 * mDash) /
           (arma::sqrt(((1 - parent.v2) * (gradient % gradient)) +
            parent.v2 * vDash) + parent.epsilon));
    }

   private:
    //! Instantiated parent object.
    QHAdamUpdate& parent;

    //! The exponential moving average of gradient values.
    GradType m;

    //! The exponential moving average of squared gradient values.
    GradType v;
  };

 private:
  // The epsilon value used to initialise the squared gradient parameter.
  double epsilon;

  // The smoothing parameter.
  double beta1;

  // The second moment coefficient.
  double beta2;

  // The first quasi-hyperbolic term.
  double v1;

  // The second quasi-hyperbolic term.
  double v2;

  // The number of iterations.
  size_t iteration;
};

} // namespace ens

#endif