### Tests for stochastic gradient ascent (maxSGA).
###
### Do not run unless 'NOT_CRAN' is explicitly defined
### (suggested by Sebastian Meyer and others).
if (!identical(Sys.getenv("NOT_CRAN"), "true")) {
   message("We are on CRAN: skipping slow optimizer tests")
   q("no")
}
if (!requireNamespace("tinytest", quietly = TRUE)) {
   message("These tests require 'tinytest' package\n")
   q("no")
}
library(maxLik)

### Test the following things:
###
### 1. basic 2-D SGA
###    SGA without function, only gradient
###    SGA neither function nor gradient
###    SGA in 1-D case
### 2. SGA w/momentum
### 3. SGA full batch
### 4. SGA, no gradient supplied
###    SGA, return numeric hessian, gradient provided
###    SGA, return numeric hessian, no gradient provided
###    SGA, printlevel 1, storeValues
###    SGA, NA as iterlim: should give informative error
###    SGA, storeValues but no fn (should fail)
###
### using highly unequally scaled data
###    SGA without gradient clipping (fails)
###    SGA with gradient clipping (works, although does not converge)

## ---------- OLS
## Log-likelihood: (negative) mean squared residual on VALIDATION data.
## Only the value is reported/stored by maxSGA; it does not drive the updates.
loglik <- function(beta, index) {
   e <- yValid - XValid %*% beta
   ## NOTE(review): denominator is length(y) (full sample), not
   ## length(yValid); harmless here because no test asserts exact
   ## objective values -- confirm intended
   -crossprod(e)/length(y)
}
## Gradient of the (negative) mean squared residual on the TRAINING
## minibatch selected by 'index'; returns a 1 x k row vector.
gradlik <- function(beta, index) {
   e <- yTrain[index] - XTrain[index,,drop=FALSE] %*% beta
   g <- t(-2*t(XTrain[index,,drop=FALSE]) %*% e)
   -g/length(index)
}

### create random data
set.seed(1)
N <- 1000
x <- rnorm(N)
X <- cbind(1, x)
y <- 100 + 100*x + rnorm(N)
## training-validation split: 80% training, 20% validation
iTrain <- sample(N, 0.8*N)
XTrain <- X[iTrain,,drop=FALSE]
XValid <- X[-iTrain,,drop=FALSE]
yTrain <- y[iTrain]
yValid <- y[-iTrain]
## Analytic OLS solution on the training data: the benchmark for SGA
start <- c(const=10, x=10)
b0 <- drop(solve(crossprod(XTrain)) %*% crossprod(XTrain, yTrain))
names(b0) <- names(start)
tol <- 1e-3  # coefficient tolerance

## ---------- 1. working example
res <- maxSGA(loglik, gradlik, start=start,
              control=list(printLevel=0, iterlim=200,
                           SG_batchSize=100, SG_learningRate=0.1,
                           storeValues=TRUE),
              nObs=length(yTrain))
expect_equal(coef(res), b0, tolerance=tol)
   # SGA usually ends with gradient not equal to 0 so we don't test that

## ---------- store parameters
res <- maxSGA(loglik, gradlik, start=start,
              control=list(printLevel=0, iterlim=20,
                           SG_batchSize=100, SG_learningRate=0.1,
                           storeParameters=TRUE),
              nObs=length(yTrain))
## one stored row per iteration, plus one for the start value
expect_equal(dim(storedParameters(res)), c(1 + nIter(res), 2))

## ---------- no function, only gradient
expect_silent(
   res <- maxSGA(grad=gradlik, start=start,
                 control=list(printLevel=0, iterlim=10, SG_batchSize=100),
                 nObs=length(yTrain))
)

## ---------- neither function nor gradient
expect_error(
   res <- maxSGA(start=start,
                 control=list(printLevel=0, iterlim=10, SG_batchSize=100),
                 nObs=length(yTrain))
)

## ---------- 1D case: exponential sample, MLE of the rate parameter
N1 <- 1000
tObs <- rexp(N1, 2)  # named 'tObs', not 't', to avoid masking base::t()
loglik1 <- function(theta, index) sum(log(theta) - theta*tObs[index])
gradlik1 <- function(theta, index) sum(1/theta - tObs[index])
expect_silent(
   res <- maxSGA(loglik1, gradlik1, start=1,
                 control=list(iterlim=300, SG_batchSize=20), nObs=length(tObs))
)
expect_equal(coef(res), 1/mean(tObs), tolerance=0.2)  # MLE of rate is 1/mean
expect_null(hessian(res))

## ---------- 2. SGA with momentum
expect_silent(
   res <- maxSGA(loglik, gradlik, start=start,
                 control=list(printLevel=0, iterlim=200,
                              SG_batchSize=100, SG_learningRate=0.1, SGA_momentum=0.9),
                 nObs=length(yTrain))
)
expect_equal(coef(res), b0, tolerance=tol)

## ---------- 3. full batch (SG_batchSize=NULL uses all observations)
expect_silent(
   res <- maxSGA(loglik, gradlik, start=start,
                 control=list(printLevel=0, iterlim=200,
                              SG_batchSize=NULL, SG_learningRate=0.1),
                 nObs=length(yTrain))
)
expect_equal(coef(res), b0, tolerance=tol)

## ---------- 4. no gradient supplied (numeric gradient of loglik)
expect_silent(
   res <- maxSGA(loglik, start=start,
                 control=list(iterlim=1000, SG_learningRate=0.02), nObs=length(yTrain))
)
expect_equal(coef(res), b0, tolerance=tol)

## ---------- return Hessian, gradient provided
expect_silent(
   res <- maxSGA(loglik, gradlik, start=start,
                 control=list(iterlim=1000, SG_learningRate=0.02),
                 nObs=length(yTrain),
                 finalHessian=TRUE)
)
expect_equal(coef(res), b0, tolerance=tol)
expect_equal(dim(hessian(res)), c(2,2))

## ---------- return Hessian, no gradient
expect_silent(
   res <- maxSGA(loglik, start=start,
                 control=list(iterlim=1000, SG_learningRate=0.02),
                 nObs=length(yTrain),
                 finalHessian=TRUE)
)
expect_equal(coef(res), b0, tolerance=tol)
expect_equal(dim(hessian(res)), c(2,2))

### ---------- SGA, printlevel 1, storeValues ----------
### it should just work
expect_silent(
   res <- maxSGA(loglik, gradlik, start=start,
                 control=list(iterlim=2, storeValues=TRUE, printLevel=1),
                 nObs=length(yTrain),
                 finalHessian=TRUE)
)

### ---------- SGA, NA as iterlim ----------
### should give informative error
expect_error(
   res <- maxSGA(loglik, gradlik, start=start,
                 control=list(iterlim=NA),
                 nObs=length(yTrain),
                 finalHessian=TRUE),
   pattern = "invalid class \"MaxControl\" object: NA in 'iterlim'"
)

### ---------- SGA, fn missing but storeValues=TRUE
### should give informative error
expect_error(
   res <- maxSGA(grad=gradlik, start=start,
                 control=list(iterlim=10, storeValues=TRUE),
                 nObs=length(yTrain)),
   pattern = "Cannot compute the objective function value: no objective function supplied"
)

## ---------- gradient by observations: returns an n x k matrix,
## one gradient row per minibatch observation
gradlikO <- function(beta, index) {
   e <- yTrain[index] - XTrain[index,,drop=FALSE] %*% beta
   g <- -2*drop(e)*XTrain[index,,drop=FALSE]
   -g/length(index)
}
expect_silent(
   res <- maxSGA(grad=gradlikO, start=start,
                 control=list(printLevel=0, iterlim=100,
                              SG_batchSize=100),
                 nObs=length(yTrain))
)
expect_equal(coef(res), b0, tolerance=tol)

## ---------- 0 iterations
expect_silent(
   res <- maxSGA(grad=gradlik, start=start,
                 control=list(iterlim=0),
                 nObs=length(yTrain))
)
expect_equal(coef(res), start)
   # should return start values exactly

### -------------------- create unequally scaled data
set.seed(1)
N <- 1000
x <- rnorm(N, sd=100)
XTrain <- cbind(1, x)
yTrain <- 1 + x + rnorm(N)
start <- c(const=10, x=10)
## NOTE(review): XValid/yValid are deliberately NOT rebuilt here, so
## loglik still evaluates on the earlier validation data; only gradlik
## (training data) drives the divergence/clipping behavior tested
## below -- confirm intended

## ---------- no gradient clipping:
## should fail with informative "NA/Inf in gradient" message
expect_error(
   res <- maxSGA(loglik, gradlik, start=start,
                 control=list(iterlim=100, SG_learningRate=0.5),
                 nObs=length(yTrain)),
   pattern = "NA/Inf in gradient"
)

## ---------- gradient clipping: should not fail
expect_silent(
   res <- maxSGA(loglik, gradlik, start=start,
                 control=list(iterlim=100, SG_learningRate=0.5,
                              SG_clip=1e6),
                 nObs=length(yTrain)
                 )
)