/**
 * @file padam_update.hpp
 * @author Marcus Edel
 *
 * Implementation of the partially adaptive momentum estimation method
 * (Padam).
 *
 * ensmallen is free software; you may redistribute it and/or modify it under
 * the terms of the 3-clause BSD license.  You should have received a copy of
 * the 3-clause BSD license along with ensmallen.  If not, see
 * http://www.opensource.org/licenses/BSD-3-Clause for more information.
 */
#ifndef ENSMALLEN_PADAM_PADAM_UPDATE_HPP
#define ENSMALLEN_PADAM_PADAM_UPDATE_HPP

namespace ens {
/**
 * The partially adaptive momentum estimation method (Padam) adopts
 * historical gradient information to automatically adjust the learning
 * rate.  Its partially adaptive parameter p interpolates between fully
 * adaptive methods and SGD: p = 1/2 recovers AMSGrad, while p -> 0
 * approaches SGD with momentum.
 *
 * For more information, see the following.
 *
 * @code
 * @article{chen2018closing,
 *   title   = {Closing the Generalization Gap of Adaptive Gradient Methods in
 *              Training Deep Neural Networks},
 *   author  = {{Chen}, J. and {Gu}, Q.},
 *   journal = {ArXiv e-prints},
 *   url     = {https://arxiv.org/abs/1806.06763},
 *   year    = {2018}
 * }
 * @endcode
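 *
 * Padam is used as the update policy of an SGD-style ensmallen optimizer.
 * A minimal usage sketch follows; the SGD constructor arguments shown
 * (step size, batch size, maximum iterations, tolerance) are assumptions
 * drawn from the SGD documentation rather than guaranteed by this file:
 *
 * @code
 * // f is a differentiable (separable) function; see the ensmallen
 * // function-type documentation for the exact required methods.
 * SGD<PadamUpdate> optimizer(0.01, 32, 100000, 1e-5);
 * optimizer.Optimize(f, coordinates);
 * @endcode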
 */
class PadamUpdate
{
 public:
  /**
   * Construct the Padam update policy with the given parameters.
   *
   * @param epsilon Small constant added to the denominator for numerical
   *        stability (avoids division by zero).
   * @param beta1 The smoothing parameter.
   * @param beta2 The second moment coefficient.
   * @param partial Partially adaptive parameter.
   */
  PadamUpdate(const double epsilon = 1e-8,
              const double beta1 = 0.9,
              const double beta2 = 0.999,
              const double partial = 0.25) :
      epsilon(epsilon),
      beta1(beta1),
      beta2(beta2),
      partial(partial),
      iteration(0)
  {
    // Nothing to do.
  }

  //! Get the epsilon value used for numerical stability.
  double Epsilon() const { return epsilon; }
  //! Modify the epsilon value used for numerical stability.
  double& Epsilon() { return epsilon; }

  //! Get the smoothing parameter.
  double Beta1() const { return beta1; }
  //! Modify the smoothing parameter.
  double& Beta1() { return beta1; }

  //! Get the second moment coefficient.
  double Beta2() const { return beta2; }
  //! Modify the second moment coefficient.
  double& Beta2() { return beta2; }

  //! Get the partial adaptive parameter.
  double Partial() const { return partial; }
  //! Modify the partial adaptive parameter.
  double& Partial() { return partial; }

  //! Get the current iteration number.
  size_t Iteration() const { return iteration; }
  //! Modify the current iteration number.
  size_t& Iteration() { return iteration; }

  /**
   * The UpdatePolicyType policy classes must contain an internal 'Policy'
   * template class with two template arguments: MatType and GradType.  This is
   * instantiated at the start of the optimization, and holds parameters
   * specific to an individual optimization.
   */
  template<typename MatType, typename GradType>
  class Policy
  {
   public:
    /**
     * This constructor is called by the SGD Optimize() method before the start
     * of the iteration update process.
     *
     * @param parent Instantiated PadamUpdate parent object.
     * @param rows Number of rows in the gradient matrix.
     * @param cols Number of columns in the gradient matrix.
     */
    Policy(PadamUpdate& parent, const size_t rows, const size_t cols) :
        parent(parent)
    {
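      // Start both moment estimates, and the running element-wise maximum of
      // the second moment estimate, from zero.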
      m.zeros(rows, cols);
      v.zeros(rows, cols);
      vImproved.zeros(rows, cols);
    }

    /**
     * Update step for Padam.
     *
     * @param iterate Parameters that minimize the function.
     * @param stepSize Step size to be used for the given iteration.
     * @param gradient The gradient matrix.
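     *
     * For reference, each call performs one AMSGrad-style step with the
     * partially adaptive exponent p (the partial parameter), matching the
     * code below:
     *
     *   m    = beta1 * m + (1 - beta1) * g
     *   v    = beta2 * v + (1 - beta2) * (g % g)
     *   vHat = max(vHat, v)                  (element-wise maximum)
     *   iterate -= stepSize' * m / (vHat + epsilon)^p
     *
     * where g is the current gradient, g % g its element-wise square, and
     * stepSize' is stepSize with the usual Adam bias corrections folded in.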
     */
    void Update(MatType& iterate,
                const double stepSize,
                const GradType& gradient)
    {
      // Increment the iteration counter variable.
      ++parent.iteration;

      // And update the iterate.
      m *= parent.beta1;
      m += (1 - parent.beta1) * gradient;

      v *= parent.beta2;
      v += (1 - parent.beta2) * (gradient % gradient);

      const double biasCorrection1 = 1.0 - std::pow(parent.beta1,
          parent.iteration);
      const double biasCorrection2 = 1.0 - std::pow(parent.beta2,
          parent.iteration);

      // Element-wise maximum of past and present squared gradients.
      vImproved = arma::max(vImproved, v);

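      // Both bias corrections are folded into the step size; epsilon is added
      // before exponentiation to avoid division by zero.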
      iterate -= (stepSize * std::sqrt(biasCorrection2) / biasCorrection1) *
          m / arma::pow(vImproved + parent.epsilon, parent.partial);
    }

   private:
    //! Instantiated parent object.
    PadamUpdate& parent;

    //! The exponential moving average of gradient values.
    GradType m;

    //! The exponential moving average of squared gradient values.
    GradType v;

    //! The element-wise maximum of past squared gradient estimates (as in
    //! AMSGrad).
    GradType vImproved;
  };

 private:
  //! The epsilon value used for numerical stability.
  double epsilon;

  //! The smoothing parameter.
  double beta1;

  //! The second moment coefficient.
  double beta2;

  //! Partial adaptive parameter.
  double partial;

  //! The number of iterations.
  size_t iteration;
};

} // namespace ens

#endif