1# data is a list
2# x: either a list or a matrix containing the data rowwise
3# y: vector of labels/values
4constructData = function(x, y) {
5  stopifnot(is.list(x) || is.vector(x) || is.matrix(x))
6  stopifnot(is.list(y) || is.vector(y) || is.factor(y))
7  data = list(x=x, y=y)
8  class(data) = "CVST.data"
9  return(data)
10}
11
12getN = function(data) {
13  stopifnot(class(data) == "CVST.data")
14  if (is.list(data$x) || is.vector(data$x)) {
15    N = length(data$x)
16  }
17  else {
18    N = nrow(data$x)
19  }
20  return(N)
21}
22
23shuffleData = function(data) {
24  stopifnot(class(data) == "CVST.data")
25  shuffle = sample.int(getN(data))
26  return(getSubset(data, shuffle))
27}
28
29getSubset = function(data, subset) {
30  stopifnot(class(data) == "CVST.data")
31  x = getX(data, subset)
32  y = data$y[subset]
33  ret = constructData(x=x, y=y)
34  return(ret)
35}
36
37getX = function(data, subset=NULL) {
38  stopifnot(class(data) == "CVST.data")
39  if (is.null(subset)) {
40    ret = data$x
41  }
42  else {
43    if (is.list(data$x) || is.vector(data$x)) {
44      ret = data$x[subset]
45    }
46    else {
47      ret = data$x[subset, ,drop=FALSE]
48    }
49  }
50  return(ret)
51}
52
53isClassification = function(data) {
54  stopifnot(class(data) == "CVST.data")
55  return(is.factor(data$y))
56}
57
58isRegression = function(data) {
59  stopifnot(class(data) == "CVST.data")
60  return(!isClassification(data))
61}
62
63constructLearner = function(learn, predict) {
64  stopifnot(is.function(learn) && is.function(predict))
65  learner = list(learn=learn, predict=predict)
66  class(learner) = "CVST.learner"
67  return(learner)
68}
69
70constructCVSTModel = function(steps=10, beta=.1, alpha=.01, similaritySignificance=.05, earlyStoppingSignificance=.05, earlyStoppingWindow=3, regressionSimilarityViaOutliers=FALSE) {
71  ret = list(steps=steps,
72    beta=beta,
73    alpha=alpha,
74    similaritySignificance=similaritySignificance,
75    earlyStoppingSignificance=earlyStoppingSignificance,
76    earlyStoppingWindow=earlyStoppingWindow,
77    regressionSimilarityViaOutliers=regressionSimilarityViaOutliers)
78  class(ret) = "CVST.setup"
79  return(ret)
80}
81
82constructParams = function(...) {
83  pn = names(substitute(c(...)))[-1]
84  ret = expand.grid(..., stringsAsFactors=FALSE, KEEP.OUT.ATTRS = FALSE)
85  params = lapply(1:nrow(ret), function(ind) as.list(ret[ind, ]))
86  paramNames = lapply(1:nrow(ret), function(ind) paste(pn, ret[ind, ], sep="=", collapse=" "))
87  names(params) = paramNames
88  class(params) = "CVST.params"
89  return(params)
90}
91
92
93.getResult = function(train, test, learner, param, squared=TRUE) {
94  stopifnot(class(learner) == "CVST.learner" && class(train) == "CVST.data" && class(test) == "CVST.data")
95  model = try(learner$learn(train, param))
96  if (class(model) == "try-error") {
97    pred = rep(NA, length(test$y))
98  }
99  else {
100    pred = try(learner$predict(model, test))
101    if (class(pred) == "try-error") {
102      pred = rep(NA, length(test$y))
103    }
104  }
105  if (isClassification(test)) {
106    res = (test$y != pred)
107  }
108  else {
109    if (squared) {
110      res = (pred - test$y)^2
111    }
112    else {
113      res = (pred - test$y)
114    }
115  }
116  return(res)
117}
118
119cochranq.test = function(mat) {
120  cochransQtest = list(statistic = 0, parameter = 0, p.value = 1,
121    method = "Cochran's Q Test",
122    data.name = deparse(substitute(mat)))
123  class(cochransQtest) = "htest"
124
125  if (is.vector(mat) || any(dim(mat) <= 1)) {
126    return(cochransQtest)
127  }
128
129  # we expect the individuals in the rows, repetitions/treatments in the columns
130  m = ncol(mat)
131  df = m - 1
132  L = apply(mat, 1, sum)
133  index = (L > 0 & L < m)
134  if (sum(index) <= 1) {
135    # all rows are either one or zero... no effect!
136    return(cochransQtest)
137  }
138
139  if (sum(index) * m <= 24) {
140    return(.perm.cochranq.test(mat[index, ]))
141  }
142
143  L = L[index]
144  T = apply(mat[index, ], 2, sum)
145  Q = ((m-1) * (m * sum(T^2) - sum(T)^2)) / (m * sum(L) - sum(L^2))
146  names(df) = "df"
147  names(Q) = "Cochran's Q"
148
149  if (is.nan(Q)) {
150    p.val = 1.0
151  }
152  else {
153    p.val = pchisq(Q, df, lower.tail=FALSE)
154  }
155  cochransQtest$statistic = Q
156  cochransQtest$parameter = df
157  cochransQtest$p.value = p.val
158  return(cochransQtest)
159}
160
161.perm.cochranq.test = function(mat, nperm=1000) {
162  if (is.vector(mat) || any(dim(mat) <= 1)) {
163    cochransQtest = list(statistic = 0, parameter = 0, p.value = 1,
164      method = "Cochran's Q Test",
165      data.name = deparse(substitute(mat)))
166    class(cochransQtest) = "htest"
167    return(cochransQtest)
168  }
169  # we expect no straight zero or one-rows in mat
170  m = ncol(mat)
171  df = m - 1
172  L = apply(mat, 1, sum)
173  T = apply(mat, 2, sum)
174  quot = (m * sum(L) - sum(L^2))
175  Q = ((m-1) * (m * sum(T^2) - sum(T)^2)) / quot
176  names(df) = "df"
177  names(Q) = "Cochran's Q"
178
179  permFun = function() {
180    newPerm = mat
181    for (i in 1:nrow(mat)) {
182        newPerm[i, ] = mat[i, sample(m)]
183      }
184    T = apply(newPerm, 2, sum)
185    Q = ((m-1) * (m * sum(T^2) - sum(T)^2)) / quot
186    return(Q)
187  }
188
189  QS = replicate(nperm, permFun())
190  p.value = mean(QS >= Q)
191  cochransQtest = list(statistic = Q, parameter = df, p.value = p.value,
192    method = "Cochran's Q Test (monte-carlo)",
193    data.name = deparse(substitute(mat)))
194  class(cochransQtest) = "htest"
195  return(cochransQtest)
196}
197
198constructSequentialTest = function(piH0=.5, piH1=.9, beta, alpha) {
199  a1 = log((1 - beta) / alpha) / (log(piH1 / piH0) + log((1 - piH0) / (1 - piH1)))
200  a0 = -log(beta / (1 - alpha)) / (log(piH1 / piH0) + log((1 - piH0) / (1 - piH1)))
201  b = log((1 - piH0) / (1 - piH1)) / (log(piH1 / piH0) + log((1 - piH0) / (1 - piH1)))
202  ret = list(a1=a1, a0=a0, b=b, piH0=piH0, piH1=piH1, alpha=alpha, beta=beta)
203  class(ret) = "CVST.sequentialTest"
204  return(ret)
205}
206
207plotSequence = function(st, s) {
208  y = cumsum(s)
209  if (!is.null(st$steps)) {
210    plot(y, xlim=c(1, st$steps), ylim=c(1, st$steps))
211  }
212  else {
213    plot(y)
214  }
215  abline(a=st$a1, b=st$b, col="red")
216  abline(a=-st$a0, b=st$b, col="red", lty=2)
217
218  abline(h=0)
219  abline(a=0, b=1)
220  title(sprintf("one-sided H0:%0.2f; H1:%0.2f", st$piH0, st$piH1))
221}
222
223
224testSequence = function(st, s) {
225  stopifnot(class(st) == "CVST.sequentialTest")
226  n = length(s)
227  y = cumsum(s)
228  ret = 0
229  if (y[n] >= st$b * n + st$a1) {
230    ret = 1
231  }
232  else if (y[n] <= st$b * n - st$a0) {
233    ret = -1
234  }
235  return(ret)
236}
237
238noisySinc = function(n, dim=2, sigma=0.1) {
239  if (length(n) > 1) {
240    x = n
241  }
242  else {
243    x = runif(n, -pi, pi)
244  }
245  sinc = function(d) sin(d) / (d)
246  y = sinc(4 * x) + 0.2 * sin(15 * x * dim) + sigma*rnorm(n)
247  y[is.nan(y)] = 1
248  return(constructData(x=as.matrix(x), y=y))
249}
250
251noisySine = function(n, dim=5, sigma=.25) {
252  x = runif(n, 0, 2 * pi * dim)
253  y = sin(x)
254  if (!is.null(sigma) && sigma > 0) {
255    y = y + rnorm(n, sd=sigma)
256  }
257  label = factor(y == abs(y))
258  return(constructData(x=as.matrix(x), y=label))
259}
260
261noisyDonoho = function(n, fun=doppler, sigma=1) {
262  x = matrix(runif(n, 0, 1), n, 1)
263  y = as.vector(fun(x)) + rnorm(n, sd=sigma)
264  return(constructData(x=x, y=y))
265}
266
267blocks = function(x, scale=3.656993) {
268  t = c(0.1, 0.13, 0.15, 0.23, 0.25, 0.40, 0.44, 0.65, 0.76, 0.78, 0.81)
269  h = c(4, -5, 3, -4, 5, -4.2, 2.1, 4.3, -3.1, 2.1, -4.2)
270  ret = t(sapply(x, function(xx) (1 + sign(xx - t)) / 2)) %*% h
271
272  ret = ret * scale
273  return(ret)
274}
275
276bumps = function(x, scale=10.52884) {
277  t = c(0.1, 0.13, 0.15, 0.23, 0.25, 0.40, 0.44, 0.65, 0.76, 0.78, 0.81)
278  h = c(4, 5, 3, 4, 5, 4.2, 2.1, 4.3, 3.1, 5.1, 4.2)
279  w = c(0.005, 0.005, 0.006, 0.01, 0.01, 0.03, 0.01, 0.01, 0.005, 0.008, 0.005)
280  ret = t(sapply(x, function(xx) (1 + abs((xx - t) / w))^-4 )) %*% h
281  ret = ret * scale
282  return(ret)
283}
284
285heavisine = function(x, scale=2.356934) {
286  ret = 4 * sin(4 * pi * x) - sign(x - 0.3) - sign(0.72 - x)
287  ret = ret * scale
288  return(ret)
289}
290
291doppler = function(x, scale=24.22172) {
292  ret = sqrt(x * (1 - x)) * sin((2.1 * pi) / (x + 0.05))
293  ret = ret * scale
294  return(ret)
295}
296