1"""Stochastic optimization methods for MLP
2"""
3
4# Authors: Jiyuan Qian <jq401@nyu.edu>
5# License: BSD 3 clause
6
7import numpy as np
8
9
10class BaseOptimizer:
11    """Base (Stochastic) gradient descent optimizer
12
13    Parameters
14    ----------
15    learning_rate_init : float, default=0.1
16        The initial learning rate used. It controls the step-size in updating
17        the weights
18
19    Attributes
20    ----------
21    learning_rate : float
22        the current learning rate
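
    Examples
    --------
    A minimal sketch of the subclass contract, assuming ``_get_updates``
    returns one update array per parameter (``PlainSGD`` below is purely
    illustrative and not part of this module):

    >>> import numpy as np
    >>> class PlainSGD(BaseOptimizer):
    ...     def _get_updates(self, grads):
    ...         # plain gradient descent: step against the gradient
    ...         return [-self.learning_rate * grad for grad in grads]
    >>> params = [np.array([1.0, 2.0])]
    >>> opt = PlainSGD(learning_rate_init=0.1)
    >>> opt.update_params(params, [np.array([0.5, 0.5])])
    >>> params[0]
    array([0.95, 1.95])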
    """

    def __init__(self, learning_rate_init=0.1):
        self.learning_rate_init = learning_rate_init
        self.learning_rate = float(learning_rate_init)

    def update_params(self, params, grads):
        """Update parameters with given gradients

        Parameters
        ----------
        params : list of length = len(coefs_) + len(intercepts_)
            The concatenated list containing coefs_ and intercepts_ in MLP
            model. Used for initializing velocities and updating params

        grads : list of length = len(params)
            Gradients with respect to coefs_ and intercepts_ in MLP model,
            so its length must be aligned with params
        """
        updates = self._get_updates(grads)
        # The ndarrays in `params` are modified in place.
        for param, update in zip(params, updates):
            param += update

    def iteration_ends(self, time_step):
        """Perform updates to the learning rate and potentially other states
        at the end of an iteration
        """
        pass

    def trigger_stopping(self, msg, verbose):
        """Decides whether it is time to stop training

        Parameters
        ----------
        msg : str
            Message passed in for verbose output

        verbose : bool
            Print message to stdout if True

        Returns
        -------
        is_stopping : bool
            True if training needs to stop
        """
        if verbose:
            print(msg + " Stopping.")
        return True


class SGDOptimizer(BaseOptimizer):
    """Stochastic gradient descent optimizer with momentum

    Parameters
    ----------
    params : list, length = len(coefs_) + len(intercepts_)
        The concatenated list containing coefs_ and intercepts_ in MLP model.
        Used for initializing velocities and updating params

    learning_rate_init : float, default=0.1
        The initial learning rate used. It controls the step-size in updating
        the weights

    lr_schedule : {'constant', 'adaptive', 'invscaling'}, default='constant'
        Learning rate schedule for weight updates.

        -'constant' is a constant learning rate given by
         'learning_rate_init'.

        -'invscaling' gradually decreases the learning rate 'learning_rate_' at
          each time step 't' using an inverse scaling exponent of 'power_t'.
          learning_rate_ = learning_rate_init / pow(t, power_t)

        -'adaptive' keeps the learning rate constant to
         'learning_rate_init' as long as training loss keeps decreasing.
         Each time 2 consecutive epochs fail to decrease the training loss by
         tol, or fail to increase validation score by tol if 'early_stopping'
         is on, the current learning rate is divided by 5.

    momentum : float, default=0.9
        Value of momentum used, must be larger than or equal to 0

    nesterov : bool, default=True
        Whether to use Nesterov's momentum or not. Use Nesterov's momentum if
        True

    power_t : float, default=0.5
        Power of time step 't' in inverse scaling. See `lr_schedule` for
        more details.

    Attributes
    ----------
    learning_rate : float
        the current learning rate

    velocities : list, length = len(params)
        velocities that are used to update params
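
    Examples
    --------
    A small illustrative run of a single momentum step (the doctest below is
    assumed to run in this module's namespace, e.g. via ``doctest``):

    >>> import numpy as np
    >>> params = [np.zeros(2)]
    >>> opt = SGDOptimizer(params, learning_rate_init=0.1, momentum=0.9,
    ...                    nesterov=False)
    >>> opt.update_params(params, [np.array([1.0, -1.0])])
    >>> params[0]
    array([-0.1,  0.1])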
    """

    def __init__(
        self,
        params,
        learning_rate_init=0.1,
        lr_schedule="constant",
        momentum=0.9,
        nesterov=True,
        power_t=0.5,
    ):
        super().__init__(learning_rate_init)

        self.lr_schedule = lr_schedule
        self.momentum = momentum
        self.nesterov = nesterov
        self.power_t = power_t
        self.velocities = [np.zeros_like(param) for param in params]

    def iteration_ends(self, time_step):
        """Perform updates to learning rate and potentially other states at
        the end of an iteration

        Parameters
        ----------
        time_step : int
            number of training samples trained on so far, used to update
            learning rate for 'invscaling'
        """
        if self.lr_schedule == "invscaling":
            self.learning_rate = (
                float(self.learning_rate_init) / (time_step + 1) ** self.power_t
            )

    def trigger_stopping(self, msg, verbose):
        # Any schedule other than 'adaptive' stops training immediately.
        if self.lr_schedule != "adaptive":
            if verbose:
                print(msg + " Stopping.")
            return True

        if self.learning_rate <= 1e-6:
            if verbose:
                print(msg + " Learning rate too small. Stopping.")
            return True

        # 'adaptive' schedule: shrink the learning rate by a factor of 5 and
        # keep training until it becomes too small (<= 1e-6).
        self.learning_rate /= 5.0
        if verbose:
            print(msg + " Setting learning rate to %f" % self.learning_rate)
        return False

    def _get_updates(self, grads):
        """Get the values used to update params with given gradients

        Parameters
        ----------
        grads : list, length = len(coefs_) + len(intercepts_)
            Gradients with respect to coefs_ and intercepts_ in MLP model,
            so its length must be aligned with params

        Returns
        -------
        updates : list, length = len(grads)
            The values to add to params
        """
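        # Classical momentum: each velocity is an exponentially decaying sum
        # of past gradients scaled by the learning rate,
        #     v <- momentum * v - learning_rate * grad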
        updates = [
            self.momentum * velocity - self.learning_rate * grad
            for velocity, grad in zip(self.velocities, grads)
        ]
        self.velocities = updates

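        # Nesterov momentum: look ahead by re-applying the momentum and
        # gradient step on top of the freshly updated velocities.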
        if self.nesterov:
            updates = [
                self.momentum * velocity - self.learning_rate * grad
                for velocity, grad in zip(self.velocities, grads)
            ]

        return updates


class AdamOptimizer(BaseOptimizer):
    """Stochastic gradient descent optimizer with Adam

    Note: All default values are from the original Adam paper

    Parameters
    ----------
    params : list, length = len(coefs_) + len(intercepts_)
        The concatenated list containing coefs_ and intercepts_ in MLP model.
        Used for initializing velocities and updating params

    learning_rate_init : float, default=0.001
        The initial learning rate used. It controls the step-size in updating
        the weights

    beta_1 : float, default=0.9
        Exponential decay rate for estimates of first moment vector, should be
        in [0, 1)

    beta_2 : float, default=0.999
        Exponential decay rate for estimates of second moment vector, should be
        in [0, 1)

    epsilon : float, default=1e-8
        Value for numerical stability

    Attributes
    ----------
    learning_rate : float
        The current learning rate

    t : int
        Timestep

    ms : list, length = len(params)
        First moment vectors

    vs : list, length = len(params)
        Second moment vectors

    References
    ----------
    Kingma, Diederik, and Jimmy Ba.
    "Adam: A method for stochastic optimization."
    arXiv preprint arXiv:1412.6980 (2014).
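
    Examples
    --------
    A single illustrative Adam step (the doctest below is assumed to run in
    this module's namespace, e.g. via ``doctest``; with zero-initialized
    moments, the first bias-corrected step has magnitude close to
    ``learning_rate_init``):

    >>> import numpy as np
    >>> params = [np.zeros(2)]
    >>> opt = AdamOptimizer(params, learning_rate_init=0.001)
    >>> opt.update_params(params, [np.array([1.0, -1.0])])
    >>> np.round(params[0], 3)
    array([-0.001,  0.001])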
    """

    def __init__(
        self, params, learning_rate_init=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8
    ):
        super().__init__(learning_rate_init)

        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.t = 0
        self.ms = [np.zeros_like(param) for param in params]
        self.vs = [np.zeros_like(param) for param in params]

    def _get_updates(self, grads):
        """Get the values used to update params with given gradients

        Parameters
        ----------
        grads : list, length = len(coefs_) + len(intercepts_)
            Gradients with respect to coefs_ and intercepts_ in MLP model,
            so its length must be aligned with params

        Returns
        -------
        updates : list, length = len(grads)
            The values to add to params
        """
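        # Adam update (Kingma and Ba, 2014): maintain biased first and second
        # moment estimates of the gradients and fold the bias correction for
        # timestep t into an effective learning rate.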
        self.t += 1
        # First moment estimate: exponential moving average of the gradients.
        self.ms = [
            self.beta_1 * m + (1 - self.beta_1) * grad
            for m, grad in zip(self.ms, grads)
        ]
        # Second moment estimate: moving average of the squared gradients.
        self.vs = [
            self.beta_2 * v + (1 - self.beta_2) * (grad ** 2)
            for v, grad in zip(self.vs, grads)
        ]
        # Effective step size with the bias correction for timestep t.
        self.learning_rate = (
            self.learning_rate_init
            * np.sqrt(1 - self.beta_2 ** self.t)
            / (1 - self.beta_1 ** self.t)
        )
        updates = [
            -self.learning_rate * m / (np.sqrt(v) + self.epsilon)
            for m, v in zip(self.ms, self.vs)
        ]
        return updates