1# data is a list 2# x: either a list or a matrix containing the data rowwise 3# y: vector of labels/values 4constructData = function(x, y) { 5 stopifnot(is.list(x) || is.vector(x) || is.matrix(x)) 6 stopifnot(is.list(y) || is.vector(y) || is.factor(y)) 7 data = list(x=x, y=y) 8 class(data) = "CVST.data" 9 return(data) 10} 11 12getN = function(data) { 13 stopifnot(class(data) == "CVST.data") 14 if (is.list(data$x) || is.vector(data$x)) { 15 N = length(data$x) 16 } 17 else { 18 N = nrow(data$x) 19 } 20 return(N) 21} 22 23shuffleData = function(data) { 24 stopifnot(class(data) == "CVST.data") 25 shuffle = sample.int(getN(data)) 26 return(getSubset(data, shuffle)) 27} 28 29getSubset = function(data, subset) { 30 stopifnot(class(data) == "CVST.data") 31 x = getX(data, subset) 32 y = data$y[subset] 33 ret = constructData(x=x, y=y) 34 return(ret) 35} 36 37getX = function(data, subset=NULL) { 38 stopifnot(class(data) == "CVST.data") 39 if (is.null(subset)) { 40 ret = data$x 41 } 42 else { 43 if (is.list(data$x) || is.vector(data$x)) { 44 ret = data$x[subset] 45 } 46 else { 47 ret = data$x[subset, ,drop=FALSE] 48 } 49 } 50 return(ret) 51} 52 53isClassification = function(data) { 54 stopifnot(class(data) == "CVST.data") 55 return(is.factor(data$y)) 56} 57 58isRegression = function(data) { 59 stopifnot(class(data) == "CVST.data") 60 return(!isClassification(data)) 61} 62 63constructLearner = function(learn, predict) { 64 stopifnot(is.function(learn) && is.function(predict)) 65 learner = list(learn=learn, predict=predict) 66 class(learner) = "CVST.learner" 67 return(learner) 68} 69 70constructCVSTModel = function(steps=10, beta=.1, alpha=.01, similaritySignificance=.05, earlyStoppingSignificance=.05, earlyStoppingWindow=3, regressionSimilarityViaOutliers=FALSE) { 71 ret = list(steps=steps, 72 beta=beta, 73 alpha=alpha, 74 similaritySignificance=similaritySignificance, 75 earlyStoppingSignificance=earlyStoppingSignificance, 76 earlyStoppingWindow=earlyStoppingWindow, 77 regressionSimilarityViaOutliers=regressionSimilarityViaOutliers) 78 class(ret) = "CVST.setup" 79 return(ret) 80} 81 82constructParams = function(...) { 83 pn = names(substitute(c(...)))[-1] 84 ret = expand.grid(..., stringsAsFactors=FALSE, KEEP.OUT.ATTRS = FALSE) 85 params = lapply(1:nrow(ret), function(ind) as.list(ret[ind, ])) 86 paramNames = lapply(1:nrow(ret), function(ind) paste(pn, ret[ind, ], sep="=", collapse=" ")) 87 names(params) = paramNames 88 class(params) = "CVST.params" 89 return(params) 90} 91 92 93.getResult = function(train, test, learner, param, squared=TRUE) { 94 stopifnot(class(learner) == "CVST.learner" && class(train) == "CVST.data" && class(test) == "CVST.data") 95 model = try(learner$learn(train, param)) 96 if (class(model) == "try-error") { 97 pred = rep(NA, length(test$y)) 98 } 99 else { 100 pred = try(learner$predict(model, test)) 101 if (class(pred) == "try-error") { 102 pred = rep(NA, length(test$y)) 103 } 104 } 105 if (isClassification(test)) { 106 res = (test$y != pred) 107 } 108 else { 109 if (squared) { 110 res = (pred - test$y)^2 111 } 112 else { 113 res = (pred - test$y) 114 } 115 } 116 return(res) 117} 118 119cochranq.test = function(mat) { 120 cochransQtest = list(statistic = 0, parameter = 0, p.value = 1, 121 method = "Cochran's Q Test", 122 data.name = deparse(substitute(mat))) 123 class(cochransQtest) = "htest" 124 125 if (is.vector(mat) || any(dim(mat) <= 1)) { 126 return(cochransQtest) 127 } 128 129 # we expect the individuals in the rows, repetitions/treatments in the columns 130 m = ncol(mat) 131 df = m - 1 132 L = apply(mat, 1, sum) 133 index = (L > 0 & L < m) 134 if (sum(index) <= 1) { 135 # all rows are either one or zero... no effect! 136 return(cochransQtest) 137 } 138 139 if (sum(index) * m <= 24) { 140 return(.perm.cochranq.test(mat[index, ])) 141 } 142 143 L = L[index] 144 T = apply(mat[index, ], 2, sum) 145 Q = ((m-1) * (m * sum(T^2) - sum(T)^2)) / (m * sum(L) - sum(L^2)) 146 names(df) = "df" 147 names(Q) = "Cochran's Q" 148 149 if (is.nan(Q)) { 150 p.val = 1.0 151 } 152 else { 153 p.val = pchisq(Q, df, lower.tail=FALSE) 154 } 155 cochransQtest$statistic = Q 156 cochransQtest$parameter = df 157 cochransQtest$p.value = p.val 158 return(cochransQtest) 159} 160 161.perm.cochranq.test = function(mat, nperm=1000) { 162 if (is.vector(mat) || any(dim(mat) <= 1)) { 163 cochransQtest = list(statistic = 0, parameter = 0, p.value = 1, 164 method = "Cochran's Q Test", 165 data.name = deparse(substitute(mat))) 166 class(cochransQtest) = "htest" 167 return(cochransQtest) 168 } 169 # we expect no straight zero or one-rows in mat 170 m = ncol(mat) 171 df = m - 1 172 L = apply(mat, 1, sum) 173 T = apply(mat, 2, sum) 174 quot = (m * sum(L) - sum(L^2)) 175 Q = ((m-1) * (m * sum(T^2) - sum(T)^2)) / quot 176 names(df) = "df" 177 names(Q) = "Cochran's Q" 178 179 permFun = function() { 180 newPerm = mat 181 for (i in 1:nrow(mat)) { 182 newPerm[i, ] = mat[i, sample(m)] 183 } 184 T = apply(newPerm, 2, sum) 185 Q = ((m-1) * (m * sum(T^2) - sum(T)^2)) / quot 186 return(Q) 187 } 188 189 QS = replicate(nperm, permFun()) 190 p.value = mean(QS >= Q) 191 cochransQtest = list(statistic = Q, parameter = df, p.value = p.value, 192 method = "Cochran's Q Test (monte-carlo)", 193 data.name = deparse(substitute(mat))) 194 class(cochransQtest) = "htest" 195 return(cochransQtest) 196} 197 198constructSequentialTest = function(piH0=.5, piH1=.9, beta, alpha) { 199 a1 = log((1 - beta) / alpha) / (log(piH1 / piH0) + log((1 - piH0) / (1 - piH1))) 200 a0 = -log(beta / (1 - alpha)) / (log(piH1 / piH0) + log((1 - piH0) / (1 - piH1))) 201 b = log((1 - piH0) / (1 - piH1)) / (log(piH1 / piH0) + log((1 - piH0) / (1 - piH1))) 202 ret = list(a1=a1, a0=a0, b=b, piH0=piH0, piH1=piH1, alpha=alpha, beta=beta) 203 class(ret) = "CVST.sequentialTest" 204 return(ret) 205} 206 207plotSequence = function(st, s) { 208 y = cumsum(s) 209 if (!is.null(st$steps)) { 210 plot(y, xlim=c(1, st$steps), ylim=c(1, st$steps)) 211 } 212 else { 213 plot(y) 214 } 215 abline(a=st$a1, b=st$b, col="red") 216 abline(a=-st$a0, b=st$b, col="red", lty=2) 217 218 abline(h=0) 219 abline(a=0, b=1) 220 title(sprintf("one-sided H0:%0.2f; H1:%0.2f", st$piH0, st$piH1)) 221} 222 223 224testSequence = function(st, s) { 225 stopifnot(class(st) == "CVST.sequentialTest") 226 n = length(s) 227 y = cumsum(s) 228 ret = 0 229 if (y[n] >= st$b * n + st$a1) { 230 ret = 1 231 } 232 else if (y[n] <= st$b * n - st$a0) { 233 ret = -1 234 } 235 return(ret) 236} 237 238noisySinc = function(n, dim=2, sigma=0.1) { 239 if (length(n) > 1) { 240 x = n 241 } 242 else { 243 x = runif(n, -pi, pi) 244 } 245 sinc = function(d) sin(d) / (d) 246 y = sinc(4 * x) + 0.2 * sin(15 * x * dim) + sigma*rnorm(n) 247 y[is.nan(y)] = 1 248 return(constructData(x=as.matrix(x), y=y)) 249} 250 251noisySine = function(n, dim=5, sigma=.25) { 252 x = runif(n, 0, 2 * pi * dim) 253 y = sin(x) 254 if (!is.null(sigma) && sigma > 0) { 255 y = y + rnorm(n, sd=sigma) 256 } 257 label = factor(y == abs(y)) 258 return(constructData(x=as.matrix(x), y=label)) 259} 260 261noisyDonoho = function(n, fun=doppler, sigma=1) { 262 x = matrix(runif(n, 0, 1), n, 1) 263 y = as.vector(fun(x)) + rnorm(n, sd=sigma) 264 return(constructData(x=x, y=y)) 265} 266 267blocks = function(x, scale=3.656993) { 268 t = c(0.1, 0.13, 0.15, 0.23, 0.25, 0.40, 0.44, 0.65, 0.76, 0.78, 0.81) 269 h = c(4, -5, 3, -4, 5, -4.2, 2.1, 4.3, -3.1, 2.1, -4.2) 270 ret = t(sapply(x, function(xx) (1 + sign(xx - t)) / 2)) %*% h 271 272 ret = ret * scale 273 return(ret) 274} 275 276bumps = function(x, scale=10.52884) { 277 t = c(0.1, 0.13, 0.15, 0.23, 0.25, 0.40, 0.44, 0.65, 0.76, 0.78, 0.81) 278 h = c(4, 5, 3, 4, 5, 4.2, 2.1, 4.3, 3.1, 5.1, 4.2) 279 w = c(0.005, 0.005, 0.006, 0.01, 0.01, 0.03, 0.01, 0.01, 0.005, 0.008, 0.005) 280 ret = t(sapply(x, function(xx) (1 + abs((xx - t) / w))^-4 )) %*% h 281 ret = ret * scale 282 return(ret) 283} 284 285heavisine = function(x, scale=2.356934) { 286 ret = 4 * sin(4 * pi * x) - sign(x - 0.3) - sign(0.72 - x) 287 ret = ret * scale 288 return(ret) 289} 290 291doppler = function(x, scale=24.22172) { 292 ret = sqrt(x * (1 - x)) * sin((2.1 * pi) / (x + 0.05)) 293 ret = ret * scale 294 return(ret) 295} 296