### tests for stochastic gradient ascent
###
### do not run unless 'NOT_CRAN' is explicitly defined
### (suggested by Sebastian Meyer and others)
if(!identical(Sys.getenv("NOT_CRAN"), "true")) {
    message("We are on CRAN: skipping slow optimizer tests")
    q("no")
}
if(!requireNamespace("tinytest", quietly = TRUE)) {
    message("These tests require the 'tinytest' package")
    q("no")
}
library(maxLik)

### Test the following things:
###
### 1. basic 2-D SGA
###    SGA without objective function, only gradient
###    SGA with neither objective function nor gradient (error)
###    SGA in the 1-D case
### 2. SGA with momentum
### 3. SGA with full batch
### 4. SGA, no gradient supplied
###    SGA, return numeric Hessian, gradient provided
###    SGA, return numeric Hessian, no gradient provided
###    SGA, printLevel=1, storeValues
###    SGA, NA as iterlim: should give an informative error
###    SGA, storeValues but no fn (should fail)
###
### using highly unequally scaled data:
###    SGA without gradient clipping (fails)
###    SGA with gradient clipping (works, although it does not converge)

## ---------- OLS
## log-likelihood function(s):
## loglik: negative MSE on the validation data (a quasi-log-likelihood)
loglik <- function(beta, index) {
   e <- yValid - XValid %*% beta
   -crossprod(e)/length(yValid)
}
## gradlik: gradient on the training data, minibatch selected by 'index'
gradlik <- function(beta, index) {
   e <- yTrain[index] - XTrain[index,,drop=FALSE] %*% beta
   g <- t(-2*t(XTrain[index,,drop=FALSE]) %*% e)
   -g/length(index)
}

### create random data
set.seed(1)
N <- 1000
x <- rnorm(N)
X <- cbind(1, x)
y <- 100 + 100*x + rnorm(N)
## 80/20 training-validation split
iTrain <- sample(N, 0.8*N)
XTrain <- X[iTrain,,drop=FALSE]
XValid <- X[-iTrain,,drop=FALSE]
yTrain <- y[iTrain]
yValid <- y[-iTrain]
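
## cross-check the analytic gradient against a numeric one on a small
## minibatch; 'fTrain' and 'i0' are local helpers for this check only,
## and fTrain is the training-data analogue of 'loglik'
## (numericGradient is exported by maxLik)
fTrain <- function(beta, index) {
   e <- yTrain[index] - XTrain[index,,drop=FALSE] %*% beta
   -crossprod(e)/length(index)
}
i0 <- 1:5
expect_equivalent(drop(gradlik(c(1, 1), i0)),
                  drop(numericGradient(fTrain, c(1, 1), index=i0)),
                  tolerance=1e-6)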
## Analytic solution (training data): b0 = (X'X)^{-1} X'y
start <- c(const=10, x=10)
b0 <- drop(solve(crossprod(XTrain)) %*% crossprod(XTrain, yTrain))
names(b0) <- names(start)
tol <- 1e-3  # coefficient tolerance

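## the same solution via QR decomposition (lm.fit), as a cross-check of b0
expect_equivalent(b0, lm.fit(XTrain, yTrain)$coefficients)
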
## ---------- 1. working example
res <- maxSGA(loglik, gradlik, start=start,
              control=list(printLevel=0, iterlim=200,
                           SG_batchSize=100, SG_learningRate=0.1,
                           storeValues=TRUE),
              nObs=length(yTrain))
expect_equal(coef(res), b0, tolerance=tol)
                           # SGA usually ends with a nonzero gradient,
                           # so we do not test the gradient here

## ---------- store parameters
res <- maxSGA(loglik, gradlik, start=start,
              control=list(printLevel=0, iterlim=20,
                           SG_batchSize=100, SG_learningRate=0.1,
                           storeParameters=TRUE),
              nObs=length(yTrain))
## one stored row per iteration, plus one for the initial value
expect_equal(dim(storedParameters(res)), c(1 + nIter(res), 2))

## ---------- no objective function, only gradient
expect_silent(
   res <- maxSGA(grad=gradlik, start=start,
                 control=list(printLevel=0, iterlim=10, SG_batchSize=100),
                 nObs=length(yTrain))
)

## ---------- neither objective function nor gradient: error
expect_error(
   res <- maxSGA(start=start,
                 control=list(printLevel=0, iterlim=10, SG_batchSize=100),
                 nObs=length(yTrain))
)

## ---------- 1D case: exponential distribution with rate 2
## (the analytic MLE is 1/mean(t))
N1 <- 1000
t <- rexp(N1, 2)
loglik1 <- function(theta, index) sum(log(theta) - theta*t[index])
gradlik1 <- function(theta, index) sum(1/theta - t[index])
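
## consistency check: the full-sample score n/theta - sum(t) is zero
## at the analytic MLE 1/mean(t)
expect_equivalent(gradlik1(1/mean(t), seq_along(t)), 0)
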
expect_silent(
   res <- maxSGA(loglik1, gradlik1, start=1,
                 control=list(iterlim=300, SG_batchSize=20), nObs=length(t))
)
expect_equal(coef(res), 1/mean(t), tolerance=0.2)
expect_null(hessian(res))  # no Hessian computed unless requested

## ---------- 2. SGA with momentum
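## momentum accumulates a velocity vector v; schematically (a sketch of
## the classical momentum update, not necessarily maxLik's exact code):
##    v <- SGA_momentum*v + SG_learningRate*gradient(theta)
##    theta <- theta + v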
expect_silent(
   res <- maxSGA(loglik, gradlik, start=start,
                 control=list(printLevel=0, iterlim=200,
                              SG_batchSize=100, SG_learningRate=0.1,
                              SGA_momentum=0.9),
                 nObs=length(yTrain))
)
expect_equal(coef(res), b0, tolerance=tol)

## ---------- 3. full batch
## SG_batchSize=NULL uses all observations at every iteration
expect_silent(
   res <- maxSGA(loglik, gradlik, start=start,
                 control=list(printLevel=0, iterlim=200,
                              SG_batchSize=NULL, SG_learningRate=0.1),
                 nObs=length(yTrain))
)
expect_equal(coef(res), b0, tolerance=tol)

## ---------- 4. no gradient supplied: computed numerically from loglik
expect_silent(
   res <- maxSGA(loglik, start=start,
                 control=list(iterlim=1000, SG_learningRate=0.02),
                 nObs=length(yTrain))
)
expect_equal(coef(res), b0, tolerance=tol)

## ---------- return numeric Hessian, gradient provided
expect_silent(
   res <- maxSGA(loglik, gradlik, start=start,
                 control=list(iterlim=1000, SG_learningRate=0.02),
                 nObs=length(yTrain),
                 finalHessian=TRUE)
)
expect_equal(coef(res), b0, tolerance=tol)
expect_equal(dim(hessian(res)), c(2,2))

## ---------- return numeric Hessian, no gradient provided
expect_silent(
   res <- maxSGA(loglik, start=start,
                 control=list(iterlim=1000, SG_learningRate=0.02),
                 nObs=length(yTrain),
                 finalHessian=TRUE)
)
expect_equal(coef(res), b0, tolerance=tol)
expect_equal(dim(hessian(res)), c(2,2))

### ---------- SGA, printLevel=1, storeValues ----------
### should run without errors or warnings
expect_silent(
   res <- maxSGA(loglik, gradlik, start=start,
                 control=list(iterlim=2, storeValues=TRUE, printLevel=1),
                 nObs=length(yTrain),
                 finalHessian=TRUE)
)

### ---------- SGA, NA as iterlim ----------
### should give an informative error
expect_error(
   res <- maxSGA(loglik, gradlik, start=start,
                 control=list(iterlim=NA),
                 nObs=length(yTrain),
                 finalHessian=TRUE),
   pattern = "invalid class \"MaxControl\" object: NA in 'iterlim'"
)

### ---------- SGA, fn missing but storeValues=TRUE ----------
### should give an informative error
expect_error(
   res <- maxSGA(grad=gradlik, start=start,
                 control=list(iterlim=10, storeValues=TRUE),
                 nObs=length(yTrain)),
   pattern = "Cannot compute the objective function value: no objective function supplied"
)

## ---------- gradient by observations: one row per minibatch observation
gradlikO <- function(beta, index) {
   e <- yTrain[index] - XTrain[index,,drop=FALSE] %*% beta
   g <- -2*drop(e)*XTrain[index,,drop=FALSE]
   -g/length(index)
}
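
## the observation-wise gradients should sum to the aggregate minibatch
## gradient from gradlik (illustrative consistency check)
i0 <- 1:10
expect_equivalent(colSums(gradlikO(start, i0)), drop(gradlik(start, i0)))
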
expect_silent(
   res <- maxSGA(grad=gradlikO, start=start,
                 control=list(printLevel=0, iterlim=100,
                              SG_batchSize=100),
                 nObs=length(yTrain))
)
expect_equal(coef(res), b0, tolerance=tol)

## ---------- 0 iterations
expect_silent(
   res <- maxSGA(grad=gradlik, start=start,
                 control=list(iterlim=0),
                 nObs=length(yTrain))
)
expect_equal(coef(res), start)
                           # should return the start values exactly

### -------------------- create unequally scaled data
### the intercept column and x now differ in scale by a factor of ~100;
### gradlik picks up the new XTrain/yTrain from the global environment
### (loglik still evaluates on the old validation set, which is harmless
### here: only the gradient drives SGA)
set.seed(1)
N <- 1000
x <- rnorm(N, sd=100)
XTrain <- cbind(1, x)
yTrain <- 1 + x + rnorm(N)
start <- c(const=10, x=10)

## ---------- no gradient clipping:
## with this learning rate the iterates diverge and the computation
## should fail with an informative "NA/Inf in gradient" message
expect_error(
   res <- maxSGA(loglik, gradlik, start=start,
                 control=list(iterlim=100, SG_learningRate=0.5),
                 nObs=length(yTrain)),
   pattern = "NA/Inf in gradient"
)

## ---------- gradient clipping: should not fail
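## SG_clip acts as a clipping threshold, here taken as a bound on the
## squared L2 norm of the gradient; schematically (a sketch of the
## clipping rule, not necessarily maxLik's exact code):
##    if(sum(g^2) > SG_clip) g <- g*sqrt(SG_clip/sum(g^2))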
expect_silent(
   res <- maxSGA(loglik, gradlik, start=start,
                 control=list(iterlim=100, SG_learningRate=0.5,
                              SG_clip=1e6),
                 nObs=length(yTrain))
)