1## Gaussian Processes implementation. Laplace approximation for classification.
2## author : alexandros karatzoglou
3
4setGeneric("gausspr", function(x, ...) standardGeneric("gausspr"))
5setMethod("gausspr",signature(x="formula"),
6function (x, data=NULL, ..., subset, na.action = na.omit, scaled = TRUE){
7  cl <- match.call()
8  m <- match.call(expand.dots = FALSE)
9  if (is.matrix(eval(m$data, parent.frame())))
10    m$data <- as.data.frame(data)
11  m$... <- NULL
12  m$formula <- m$x
13  m$x <- NULL
14  m[[1L]] <- quote(stats::model.frame)
15  m <- eval(m, parent.frame())
16  Terms <- attr(m, "terms")
17  attr(Terms, "intercept") <- 0
18  x <- model.matrix(Terms, m)
19  y <- model.extract(m, "response")
20
21  if (length(scaled) == 1)
22    scaled <- rep(scaled, ncol(x))
23  if (any(scaled)) {
24    remove <- unique(c(which(labels(Terms) %in% names(attr(x, "contrasts"))),
25                       which(!scaled)
26                       )
27                     )
28    scaled <- !attr(x, "assign") %in% remove
29  }
30
31  ret <- gausspr(x, y, scaled = scaled, ...)
32  kcall(ret) <- cl
33  terms(ret) <- Terms
34  if (!is.null(attr(m, "na.action")))
35    n.action(ret) <- attr(m, "na.action")
36  return (ret)
37})
38
39setMethod("gausspr",signature(x="vector"),
40function(x,...)
41  {
42    x <- t(t(x))
43    ret <- gausspr(x, ...)
44    ret
45  })
46
47setMethod("gausspr",signature(x="matrix"),
48function (x,
49          y,
50          scaled    = TRUE,
51          type      = NULL,
52          kernel    = "rbfdot",
53          kpar      = "automatic",
54          var       = 1,
55          variance.model = FALSE,
56          tol       = 0.0005,
57          cross     = 0,
58          fit       = TRUE,
59          ...
60          ,subset
61         ,na.action = na.omit)
62{
63
64## should become an option
65  reduced <- FALSE
66## subsetting and na-handling for matrices
67  ret <- new("gausspr")
68  if (!missing(subset)) x <- x[subset,]
69  if (is.null(y))
70    x <- na.action(x)
71  else {
72    df <- na.action(data.frame(y, x))
73    y <- df[,1]
74    x <- as.matrix(df[,-1])
75  }
76  ncols <- ncol(x)
77  m <- nrows <- nrow(x)
78
79 if (is.null (type)) type(ret) <-
80   if (is.factor(y)) "classification"
81    else "regression"
82  else type(ret) <- type
83
84  x.scale <- y.scale <- NULL
85 ## scaling
86  if (length(scaled) == 1)
87    scaled <- rep(scaled, ncol(x))
88  if (any(scaled)) {
89    co <- !apply(x[,scaled, drop = FALSE], 2, var)
90    if (any(co)) {
91      scaled <- rep(FALSE, ncol(x))
92      warning(paste("Variable(s)",
93                    paste("`",colnames(x[,scaled, drop = FALSE])[co],
94                          "'", sep="", collapse=" and "),
95                    "constant. Cannot scale data.")
96              )
97    } else {
98      xtmp <- scale(x[,scaled])
99      x[,scaled] <- xtmp
100      x.scale <- attributes(xtmp)[c("scaled:center","scaled:scale")]
101      if (is.numeric(y)&&(type(ret)!="classification")) {
102        y <- scale(y)
103        y.scale <- attributes(y)[c("scaled:center","scaled:scale")]
104        y <- as.vector(y)
105      }
106      tmpsc <- list(scaled = scaled, x.scale = x.scale, y.scale = y.scale)
107    }
108  }
109
110
111  if (var < 10^-3)
112    stop("Noise variance parameter var has to be greater than 10^-3")
113
114 # in case of classification: transform factors into integers
115  if (is.factor(y)) {
116    lev(ret) <- levels (y)
117    y <- as.integer (y)
118  }
119  else {
120    if (type(ret) == "classification" && any(as.integer (y) != y))
121        stop ("dependent variable has to be of factor or integer type for classification mode.")
122    if(type(ret) == "classification")
123      lev(ret) <- unique (y)
124    }
125 # initialize
126  nclass(ret) <- length (lev(ret))
127
128  if(!is.null(type))
129    type(ret) <- match.arg(type,c("classification", "regression"))
130
131if(is.character(kernel)){
132    kernel <- match.arg(kernel,c("rbfdot","polydot","tanhdot","vanilladot","laplacedot","besseldot","anovadot","splinedot"))
133
134    if(is.character(kpar))
135       if((kernel == "tanhdot" || kernel == "vanilladot" || kernel == "polydot"|| kernel == "besseldot" || kernel== "anovadot"|| kernel=="splinedot") &&  kpar=="automatic" )
136       {
137         cat (" Setting default kernel parameters ","\n")
138         kpar <- list()
139       }
140     }
141
142  if (!is.function(kernel))
143  if (!is.list(kpar)&&is.character(kpar)&&(class(kernel)=="rbfkernel" || class(kernel) =="laplacedot" || kernel == "laplacedot"|| kernel=="rbfdot")){
144    kp <- match.arg(kpar,"automatic")
145    if(kp=="automatic")
146      kpar <- list(sigma=mean(sigest(x,scaled=FALSE)[c(1,3)]))
147   cat("Using automatic sigma estimation (sigest) for RBF or laplace kernel","\n")
148
149  }
150
151  if(!is(kernel,"kernel"))
152    {
153      if(is(kernel,"function")) kernel <- deparse(substitute(kernel))
154      kernel <- do.call(kernel, kpar)
155    }
156  if(!is(kernel,"kernel")) stop("kernel must inherit from class `kernel'")
157
158  p <- 0
159
160  if (type(ret) == "classification")
161    {
162      indexes <- lapply(1:nclass(ret), function(kk) which(y == kk))
163      for (i in 1:(nclass(ret)-1)) {
164        jj <- i+1
165        for(j in jj:nclass(ret)) {
166          p <- p+1
167          ##prepare data
168          li <- length(indexes[[i]])
169          lj <- length(indexes[[j]])
170          xd <- matrix(0,(li+lj),dim(x)[2])
171          xdi <- 1:(li+lj) <= li
172          xd[xdi,rep(TRUE,dim(x)[2])] <- x[indexes[[i]],]
173          xd[xdi == FALSE,rep(TRUE,dim(x)[2])] <- x[indexes[[j]],]
174          if(y[indexes[[i]][1]] < y[indexes[[j]]][1])
175            yd <- c(rep(1,li),rep(-1,lj))
176          else
177            yd <- c(rep(-1,li),rep(1,lj))
178          if(reduced == FALSE){
179            K <- kernelMatrix(kernel,xd)
180            gradnorm <- 1
181            alphag <- solut <- rep(0,li+lj)
182            while (gradnorm > tol)
183              {
184                f <- crossprod(K,alphag)
185                grad <- -yd/(1 + exp(yd*f))
186                hess <- exp(yd*f)
187                hess <- hess / ((1 + hess)^2)
188
189                ## We use solveiter instead of solve to speed up things
190                ## A <- t(t(K)*as.vector(hess))
191                ## diag(A) <- diag(A) + 1
192                ## alphag <- alphag - solve(A,(grad + alphag))
193
194                solut <- solveiter(K, hess, (grad + alphag), solut)
195                alphag <- alphag - solut
196                gradnorm <- sqrt(sum((grad + alphag)^2))
197              }
198          }
199          else if (reduced ==TRUE)
200            {
201
202              yind <- t(matrix(unique(yd),2,length(yd)))
203              ymat <- matrix(0, length(yd), 2)
204              ymat[yind==yd] <- 1
205              ##Z <- csi(xd, ymat, kernel = kernel, rank = dim(yd)[1])
206              ##Z <- Z[sort(pivots(Z),index.return = TRUE)$ix, ,drop=FALSE]
207              Z <- inchol(xd, kernel = kernel)
208              gradnorm <- 1
209              alphag <- rep(0,li+lj)
210              m1 <- dim(Z)[1]
211              n1 <- dim(Z)[2]
212              Ksub <- diag(rep(1,n1))
213
214           while (gradnorm > tol)
215              {
216                f <- drop(Z%*%crossprod(Z,alphag))
217                f[which(f>20)] <- 20
218                grad <- -yd/(1 + exp(yd*f))
219                hess <- exp(yd*f)
220                hess <- as.vector(hess / ((1 + hess)^2))
221
222                alphag <- alphag - (- Z %*%solve(Ksub + (t(Z)*hess)%*%Z) %*% (t(Z)*hess))%*%(grad + alphag) + (grad + alphag)
223
224                gradnorm <- sqrt(sum((grad + alphag)^2))
225              }
226
227            }
228              alpha(ret)[[p]] <- alphag
229              alphaindex(ret)[[p]] <- c(indexes[[i]],indexes[[j]])
230        }
231      }
232    }
233
234  if (type(ret) == "regression")
235    {
236      K <- kernelMatrix(kernel,x)
237      if(variance.model)
238        {
239          sol <- solve(K + diag(rep(var, length = m)))
240          rm(K)
241          alpha(ret) <- sol%*%y
242        }
243      else
244        alpha(ret) <- solve(K + diag(rep(var, length = m))) %*% y
245
246    }
247
248  kcall(ret) <- match.call()
249  kernelf(ret) <- kernel
250  xmatrix(ret) <- x
251  if(variance.model)
252      sol(ret) <- sol
253
254  fitted(ret)  <- if (fit)
255    predict(ret, x) else NA
256
257  if (fit){
258    if(type(ret)=="classification")
259      error(ret) <- 1 - .classAgreement(table(y,as.integer(fitted(ret))))
260    if(type(ret)=="regression"){
261      if (!is.null(scaling(ret)$y.scale))
262        fitted(ret) <- fitted(ret) * tmpsc$y.scale$"scaled:scale" + tmpsc$y.scale$"scaled:center"
263
264    error(ret) <- drop(crossprod(fitted(ret) - y)/m)
265    }
266  }
267  if(any(scaled))
268    scaling(ret) <- tmpsc
269
270  cross(ret) <- -1
271  if(cross == 1)
272    cat("\n","cross should be >1 no cross-validation done!","\n","\n")
273  else if (cross > 1)
274    {
275      cerror <- 0
276      suppressWarnings(vgr<-split(sample(1:m,m),1:cross))
277      for(i in 1:cross)
278        {
279          cind <- unsplit(vgr[-i],factor(rep((1:cross)[-i],unlist(lapply(vgr[-i],length)))))
280          if(type(ret)=="classification")
281            {
282              cret <- gausspr(x[cind,], y[cind], scaled = FALSE, type=type(ret),kernel=kernel,var = var, cross = 0, fit = FALSE)
283               cres <- predict(cret, x[vgr[[i]],])
284            cerror <- (1 - .classAgreement(table(y[vgr[[i]]],as.integer(cres))))/cross + cerror
285            }
286          if(type(ret)=="regression")
287            {
288              cret <- gausspr(x[cind,],y[cind],type=type(ret),scaled = FALSE, kernel=kernel,var = var,tol=tol, cross = 0, fit = FALSE)
289              cres <- predict(cret, x[vgr[[i]],])
290              if (!is.null(scaling(ret)$y.scale))
291                scal <- scaling(ret)$y.scale$"scaled:scale"
292              cerror <- drop((scal^2)*crossprod(cres - y[vgr[[i]]])/m) + cerror
293            }
294        }
295      cross(ret) <- cerror
296    }
297
298
299
300  return(ret)
301})
302
303
304setMethod("predict", signature(object = "gausspr"),
305function (object, newdata, type = "response", coupler = "minpair")
306{
307  sc <- 0
308  type <- match.arg(type,c("response","probabilities","votes", "variance", "sdeviation"))
309  if (missing(newdata) && type!="response")
310    return(fitted(object))
311  else if(missing(newdata))
312    {
313      newdata <- xmatrix(object)
314      sc <- 1
315    }
316
317  ncols <- ncol(xmatrix(object))
318  nrows <- nrow(xmatrix(object))
319  oldco <- ncols
320
321  if (!is.null(terms(object)))
322  {
323  newdata <- model.matrix(delete.response(terms(object)), as.data.frame(newdata), na.action = na.action)
324   }
325  else
326    newdata  <- if (is.vector (newdata)) t(t(newdata)) else as.matrix(newdata)
327
328  newcols  <- 0
329  newnrows <- nrow(newdata)
330  newncols <- ncol(newdata)
331  newco    <- newncols
332
333  if (oldco != newco) stop ("test vector does not match model !")
334
335   if (is.list(scaling(object)) && sc != 1)
336    newdata[,scaling(object)$scaled] <-
337      scale(newdata[,scaling(object)$scaled, drop = FALSE],
338            center = scaling(object)$x.scale$"scaled:center",
339            scale  = scaling(object)$x.scale$"scaled:scale"
340            )
341
342  p <- 0
343  if(type == "response")
344    {
345  if(type(object)=="classification")
346    {
347      predres <- 1:newnrows
348      votematrix <- matrix(0,nclass(object),nrows)
349      for(i in 1:(nclass(object)-1))
350        {
351        jj <- i+1
352        for(j in jj:nclass(object))
353          {
354            p <- p+1
355            ret <- kernelMult(kernelf(object),newdata,xmatrix(object)[alphaindex(object)[[p]],],alpha(object)[[p]])
356            votematrix[i,ret>0] <- votematrix[i,ret>0] + 1
357            votematrix[j,ret<0] <- votematrix[j,ret<0] + 1
358          }
359      }
360      predres <- sapply(predres, function(x) which.max(votematrix[,x]))
361    }
362
363}
364
365  if(type == "probabilities")
366    {
367      if(type(object)=="classification")
368        {
369          binprob <- matrix(0, newnrows, nclass(object)*(nclass(object) - 1)/2)
370          for(i in 1:(nclass(object)-1))
371            {
372              jj <- i+1
373              for(j in jj:nclass(object))
374                {
375                  p <- p+1
376                  binprob[,p] <-  1/(1+exp(-kernelMult(kernelf(object),newdata,xmatrix(object)[alphaindex(object)[[p]],],alpha(object)[[p]])))
377
378                }
379            }
380          ## multiprob <- sapply(1:newnrows, function(x) couple(binprob[x ,],coupler = coupler))
381          multiprob <- couple(binprob, coupler = coupler)
382        }
383    }
384
385
386  if(type(object) == "regression")
387    {
388      if (type == "variance"||type == "sdeviation")
389        {
390          Ktest <- kernelMatrix(kernelf(object),xmatrix(object), newdata)
391          predres <- diag(kernelMatrix(kernelf(object),newdata) - t(Ktest)  %*% sol(object) %*% Ktest)
392          if (type== "sdeviation")
393            predres <- sqrt(predres)
394          if (!is.null(scaling(object)$y.scale))
395           predres <- predres * scaling(object)$y.scale$"scaled:scale" + scaling(object)$y.scale$"scaled:center"
396        }
397
398      else
399        {
400
401      predres <- kernelMult(kernelf(object),newdata,xmatrix(object),as.matrix(alpha(object)))
402
403      if (!is.null(scaling(object)$y.scale))
404        predres <- predres * scaling(object)$y.scale$"scaled:scale" + scaling(object)$y.scale$"scaled:center"
405    }
406
407   }
408
409
410 if (is.character(lev(object)))
411    {
412      ##classification & probabilities : return probabilitie matrix
413      if(type == "probabilities")
414        {
415          colnames(multiprob) <- lev(object)
416          return(multiprob)
417        }
418      ##classification & type response: return factors
419      if(type == "response")
420        return(factor (lev(object)[predres], levels = lev(object)))
421      ##classification & votes : return votematrix
422      if(type == "votes")
423        return(votematrix)
424    }
425  else
426    ##else: return raw values
427    return(predres)
428
429})
430
431
432setMethod("show","gausspr",
433function(object){
434  cat("Gaussian Processes object of class \"gausspr\"","\n")
435  cat(paste("Problem type:", type(object),"\n"))
436  cat("\n")
437  show(kernelf(object))
438  cat(paste("\nNumber of training instances learned :", dim(xmatrix(object))[1],"\n"))
439  if(!is.null(fitted(object)))
440    cat(paste("Train error :", round(error(object),9),"\n"))
441  ##train error & loss
442  if(cross(object)!=-1)
443    cat("Cross validation error :",round(cross(object),9),"\n")
444})
445
446
447solveiter <- function(B,noiseproc,b,x,itmax = 50,tol = 10e-4 ,verbose = FALSE) {
448
449## ----------------------------
450## Preconditioned Biconjugate Gradient method
451## solves linear system Ax <- b for general A
452## ------------------------------------------
453## x : initial guess
454## itmax : max # iterations
455## iterates while mean(abs(Ax-b)) > tol
456##
457## Simplified form of Numerical Recipes: linbcg
458##
459## The preconditioned matrix is set to inv(diag(A))
460## A defined through A <- I + N*B
461
462diagA <- matrix(1,dim(B)[1],1) + colSums(B)+ diag(B)*(noiseproc-1)
463## diags of A
464
465cont <- 0
466iter <- 0
467r <- .Amul2(x,B,noiseproc)
468r <- b - r
469rr <- r
470znrm <- 1
471
472bnrm <- sqrt(sum((b)^2))
473z <- r/diagA
474
475err <- sqrt(sum((.Amul2(x,B,noiseproc) - b)^2))/bnrm
476
477while (iter <= itmax){
478  iter <- iter + 1
479  zm1nrm <- znrm
480  zz <- rr/diagA
481  bknum<- drop(crossprod(z,rr))
482  if (iter == 1)
483    {
484      p <- z
485      pp <- zz
486    }
487  else
488    {
489      bk <- bknum/bkden
490      p <- bk*p + z
491      pp <- bk*pp + zz
492    }
493
494  bkden <- bknum
495  z <- .Amul2(p,B,noiseproc)
496  akden <- drop(crossprod(z,pp))
497  ak <- bknum/akden
498  zz <- .Amul2T(pp,B,noiseproc)
499
500  x <- x + ak*p
501  r <- r - ak*z
502  rr <- rr - ak*zz
503  z <- r/diagA
504  znrm <- 1
505
506  err <- mean(abs(r))
507
508  if (err<tol)
509    break
510}
511return(x)
512}
513
514.Amul2 <- function(d, B, noiseproc){
515ee <- B%*%d
516return(d + noiseproc*ee)
517}
518.Amul2T <- function(d, B, noiseproc){
519ee <- noiseproc*d
520return(d + B%*%ee)
521}
522