1"""Stochastic optimization methods for MLP 2""" 3 4# Authors: Jiyuan Qian <jq401@nyu.edu> 5# License: BSD 3 clause 6 7import numpy as np 8 9 10class BaseOptimizer: 11 """Base (Stochastic) gradient descent optimizer 12 13 Parameters 14 ---------- 15 learning_rate_init : float, default=0.1 16 The initial learning rate used. It controls the step-size in updating 17 the weights 18 19 Attributes 20 ---------- 21 learning_rate : float 22 the current learning rate 23 """ 24 25 def __init__(self, learning_rate_init=0.1): 26 self.learning_rate_init = learning_rate_init 27 self.learning_rate = float(learning_rate_init) 28 29 def update_params(self, params, grads): 30 """Update parameters with given gradients 31 32 Parameters 33 ---------- 34 params : list of length = len(coefs_) + len(intercepts_) 35 The concatenated list containing coefs_ and intercepts_ in MLP 36 model. Used for initializing velocities and updating params 37 38 grads : list of length = len(params) 39 Containing gradients with respect to coefs_ and intercepts_ in MLP 40 model. So length should be aligned with params 41 """ 42 updates = self._get_updates(grads) 43 for param, update in zip((p for p in params), updates): 44 param += update 45 46 def iteration_ends(self, time_step): 47 """Perform update to learning rate and potentially other states at the 48 end of an iteration 49 """ 50 pass 51 52 def trigger_stopping(self, msg, verbose): 53 """Decides whether it is time to stop training 54 55 Parameters 56 ---------- 57 msg : str 58 Message passed in for verbose output 59 60 verbose : bool 61 Print message to stdin if True 62 63 Returns 64 ------- 65 is_stopping : bool 66 True if training needs to stop 67 """ 68 if verbose: 69 print(msg + " Stopping.") 70 return True 71 72 73class SGDOptimizer(BaseOptimizer): 74 """Stochastic gradient descent optimizer with momentum 75 76 Parameters 77 ---------- 78 params : list, length = len(coefs_) + len(intercepts_) 79 The concatenated list containing coefs_ and intercepts_ in MLP model. 80 Used for initializing velocities and updating params 81 82 learning_rate_init : float, default=0.1 83 The initial learning rate used. It controls the step-size in updating 84 the weights 85 86 lr_schedule : {'constant', 'adaptive', 'invscaling'}, default='constant' 87 Learning rate schedule for weight updates. 88 89 -'constant', is a constant learning rate given by 90 'learning_rate_init'. 91 92 -'invscaling' gradually decreases the learning rate 'learning_rate_' at 93 each time step 't' using an inverse scaling exponent of 'power_t'. 94 learning_rate_ = learning_rate_init / pow(t, power_t) 95 96 -'adaptive', keeps the learning rate constant to 97 'learning_rate_init' as long as the training keeps decreasing. 98 Each time 2 consecutive epochs fail to decrease the training loss by 99 tol, or fail to increase validation score by tol if 'early_stopping' 100 is on, the current learning rate is divided by 5. 101 102 momentum : float, default=0.9 103 Value of momentum used, must be larger than or equal to 0 104 105 nesterov : bool, default=True 106 Whether to use nesterov's momentum or not. Use nesterov's if True 107 108 power_t : float, default=0.5 109 Power of time step 't' in inverse scaling. See `lr_schedule` for 110 more details. 111 112 Attributes 113 ---------- 114 learning_rate : float 115 the current learning rate 116 117 velocities : list, length = len(params) 118 velocities that are used to update params 119 """ 120 121 def __init__( 122 self, 123 params, 124 learning_rate_init=0.1, 125 lr_schedule="constant", 126 momentum=0.9, 127 nesterov=True, 128 power_t=0.5, 129 ): 130 super().__init__(learning_rate_init) 131 132 self.lr_schedule = lr_schedule 133 self.momentum = momentum 134 self.nesterov = nesterov 135 self.power_t = power_t 136 self.velocities = [np.zeros_like(param) for param in params] 137 138 def iteration_ends(self, time_step): 139 """Perform updates to learning rate and potential other states at the 140 end of an iteration 141 142 Parameters 143 ---------- 144 time_step : int 145 number of training samples trained on so far, used to update 146 learning rate for 'invscaling' 147 """ 148 if self.lr_schedule == "invscaling": 149 self.learning_rate = ( 150 float(self.learning_rate_init) / (time_step + 1) ** self.power_t 151 ) 152 153 def trigger_stopping(self, msg, verbose): 154 if self.lr_schedule != "adaptive": 155 if verbose: 156 print(msg + " Stopping.") 157 return True 158 159 if self.learning_rate <= 1e-6: 160 if verbose: 161 print(msg + " Learning rate too small. Stopping.") 162 return True 163 164 self.learning_rate /= 5.0 165 if verbose: 166 print(msg + " Setting learning rate to %f" % self.learning_rate) 167 return False 168 169 def _get_updates(self, grads): 170 """Get the values used to update params with given gradients 171 172 Parameters 173 ---------- 174 grads : list, length = len(coefs_) + len(intercepts_) 175 Containing gradients with respect to coefs_ and intercepts_ in MLP 176 model. So length should be aligned with params 177 178 Returns 179 ------- 180 updates : list, length = len(grads) 181 The values to add to params 182 """ 183 updates = [ 184 self.momentum * velocity - self.learning_rate * grad 185 for velocity, grad in zip(self.velocities, grads) 186 ] 187 self.velocities = updates 188 189 if self.nesterov: 190 updates = [ 191 self.momentum * velocity - self.learning_rate * grad 192 for velocity, grad in zip(self.velocities, grads) 193 ] 194 195 return updates 196 197 198class AdamOptimizer(BaseOptimizer): 199 """Stochastic gradient descent optimizer with Adam 200 201 Note: All default values are from the original Adam paper 202 203 Parameters 204 ---------- 205 params : list, length = len(coefs_) + len(intercepts_) 206 The concatenated list containing coefs_ and intercepts_ in MLP model. 207 Used for initializing velocities and updating params 208 209 learning_rate_init : float, default=0.001 210 The initial learning rate used. It controls the step-size in updating 211 the weights 212 213 beta_1 : float, default=0.9 214 Exponential decay rate for estimates of first moment vector, should be 215 in [0, 1) 216 217 beta_2 : float, default=0.999 218 Exponential decay rate for estimates of second moment vector, should be 219 in [0, 1) 220 221 epsilon : float, default=1e-8 222 Value for numerical stability 223 224 Attributes 225 ---------- 226 learning_rate : float 227 The current learning rate 228 229 t : int 230 Timestep 231 232 ms : list, length = len(params) 233 First moment vectors 234 235 vs : list, length = len(params) 236 Second moment vectors 237 238 References 239 ---------- 240 Kingma, Diederik, and Jimmy Ba. 241 "Adam: A method for stochastic optimization." 242 arXiv preprint arXiv:1412.6980 (2014). 243 """ 244 245 def __init__( 246 self, params, learning_rate_init=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8 247 ): 248 super().__init__(learning_rate_init) 249 250 self.beta_1 = beta_1 251 self.beta_2 = beta_2 252 self.epsilon = epsilon 253 self.t = 0 254 self.ms = [np.zeros_like(param) for param in params] 255 self.vs = [np.zeros_like(param) for param in params] 256 257 def _get_updates(self, grads): 258 """Get the values used to update params with given gradients 259 260 Parameters 261 ---------- 262 grads : list, length = len(coefs_) + len(intercepts_) 263 Containing gradients with respect to coefs_ and intercepts_ in MLP 264 model. So length should be aligned with params 265 266 Returns 267 ------- 268 updates : list, length = len(grads) 269 The values to add to params 270 """ 271 self.t += 1 272 self.ms = [ 273 self.beta_1 * m + (1 - self.beta_1) * grad 274 for m, grad in zip(self.ms, grads) 275 ] 276 self.vs = [ 277 self.beta_2 * v + (1 - self.beta_2) * (grad ** 2) 278 for v, grad in zip(self.vs, grads) 279 ] 280 self.learning_rate = ( 281 self.learning_rate_init 282 * np.sqrt(1 - self.beta_2 ** self.t) 283 / (1 - self.beta_1 ** self.t) 284 ) 285 updates = [ 286 -self.learning_rate * m / (np.sqrt(v) + self.epsilon) 287 for m, v in zip(self.ms, self.vs) 288 ] 289 return updates 290