\name{ksvm}
\alias{ksvm}
\alias{ksvm,formula-method}
\alias{ksvm,vector-method}
\alias{ksvm,matrix-method}
\alias{ksvm,kernelMatrix-method}
\alias{ksvm,list-method}
\alias{show,ksvm-method}
\alias{coef,ksvm-method}
\title{Support Vector Machines}
\description{
  Support Vector Machines are an excellent tool for classification,
  novelty detection, and regression. \code{ksvm} supports the
  well-known C-svc and nu-svc (classification), one-class-svc (novelty
  detection), and eps-svr and nu-svr (regression) formulations, along
  with native multi-class classification formulations and
  the bound-constraint SVM formulations.\cr
  \code{ksvm} also supports class-probability output and
  confidence intervals for regression.
}
\usage{
\S4method{ksvm}{formula}(x, data = NULL, ..., subset, na.action = na.omit, scaled = TRUE)

\S4method{ksvm}{vector}(x, ...)

\S4method{ksvm}{matrix}(x, y = NULL, scaled = TRUE, type = NULL,
     kernel = "rbfdot", kpar = "automatic",
     C = 1, nu = 0.2, epsilon = 0.1, prob.model = FALSE,
     class.weights = NULL, cross = 0, fit = TRUE, cache = 40,
     tol = 0.001, shrinking = TRUE, ...,
     subset, na.action = na.omit)

\S4method{ksvm}{kernelMatrix}(x, y = NULL, type = NULL,
     C = 1, nu = 0.2, epsilon = 0.1, prob.model = FALSE,
     class.weights = NULL, cross = 0, fit = TRUE, cache = 40,
     tol = 0.001, shrinking = TRUE, ...)

\S4method{ksvm}{list}(x, y = NULL, type = NULL,
     kernel = "stringdot", kpar = list(length = 4, lambda = 0.5),
     C = 1, nu = 0.2, epsilon = 0.1, prob.model = FALSE,
     class.weights = NULL, cross = 0, fit = TRUE, cache = 40,
     tol = 0.001, shrinking = TRUE, ...,
     na.action = na.omit)

}

\arguments{
  \item{x}{a symbolic description of the model to be fit.  When not
    using a formula, \code{x} can be a matrix or vector containing the
    training data, a kernel matrix of class \code{kernelMatrix} of the
    training data, or a list of character vectors (for use with the
    string kernel). Note that the intercept is always excluded, whether
    given in the formula or not.}

  \item{data}{an optional data frame containing the training data, when using a formula.
          By default the data is taken from the environment from which
          \code{ksvm} is called.}

  \item{y}{a response vector with one label for each row/component of \code{x}. Can be either
    a factor (for classification tasks) or a numeric vector (for
    regression).}

  \item{scaled}{A logical vector indicating the variables to be
    scaled. If \code{scaled} is of length 1, the value is recycled as
    many times as needed and all non-binary variables are scaled.
    By default, data are scaled internally (both \code{x} and \code{y}
    variables) to zero mean and unit variance. The center and scale
    values are returned and used for later predictions.}

  \item{type}{\code{ksvm} can be used for classification,
    for regression, or for novelty detection.
    Depending on whether \code{y} is
    a factor or not, the default setting for \code{type} is \code{C-svc}
    or \code{eps-svr},
    respectively, but it can be overridden by setting an explicit value.\cr
    Valid options are:

    \itemize{
      \item \code{C-svc}   C classification

      \item \code{nu-svc}  nu classification

      \item \code{C-bsvc}  bound-constraint svm classification

      \item \code{spoc-svc}  Crammer, Singer native multi-class

      \item \code{kbb-svc}  Weston, Watkins native multi-class

      \item \code{one-svc}  novelty detection

      \item \code{eps-svr}  epsilon regression

      \item \code{nu-svr}   nu regression

      \item \code{eps-bsvr}  bound-constraint svm regression
    }
  }

  \item{kernel}{the kernel function used in training and predicting.
    This parameter can be set to any function of class \code{kernel}
    which computes the inner product in feature space between two
    vector arguments (see \code{\link{kernels}}). \cr
    kernlab provides the most popular kernel functions,
    which can be used by setting the kernel parameter to the following
    strings:

    \itemize{
      \item \code{rbfdot} Radial Basis kernel "Gaussian"

      \item \code{polydot} Polynomial kernel

      \item \code{vanilladot} Linear kernel

      \item \code{tanhdot} Hyperbolic tangent kernel

      \item \code{laplacedot} Laplacian kernel

      \item \code{besseldot} Bessel kernel

      \item \code{anovadot} ANOVA RBF kernel

      \item \code{splinedot} Spline kernel

      \item \code{stringdot} String kernel
    }

    Setting the kernel parameter to "matrix" treats \code{x} as a kernel
    matrix and calls the \code{kernelMatrix} interface.\cr

    The kernel parameter can also be set to a user-defined function of
    class kernel by passing the function name as an argument.
  }

  \item{kpar}{the list of hyper-parameters (kernel parameters).
    This is a list which contains the parameters to be used with the
    kernel function. The valid parameters for the existing kernels are:

    \itemize{
      \item \code{sigma} inverse kernel width for the Radial Basis
      kernel function "rbfdot" and the Laplacian kernel "laplacedot".

      \item \code{degree, scale, offset} for the Polynomial kernel "polydot"

      \item \code{scale, offset} for the Hyperbolic tangent kernel
      function "tanhdot"

      \item \code{sigma, order, degree} for the Bessel kernel "besseldot".

      \item \code{sigma, degree} for the ANOVA kernel "anovadot".

      \item \code{length, lambda, normalized} for the "stringdot" kernel,
      where length is the length of the strings considered, lambda the
      decay factor, and normalized a logical parameter determining if the
      kernel evaluations should be normalized.
    }

    Hyper-parameters for user-defined kernels can be passed through the
    kpar parameter as well. In the case of a Radial Basis kernel function
    (Gaussian), kpar can also be set to the string "automatic", which uses
    the heuristics in \code{\link{sigest}} to calculate a good \code{sigma}
    value for the Gaussian RBF or Laplace kernel from the data
    (default = "automatic").}

  \item{C}{cost of constraints violation (default: 1). This is the
    `C'-constant of the regularization term in the Lagrange
    formulation.}

  \item{nu}{parameter needed for \code{nu-svc},
    \code{one-svc}, and \code{nu-svr}. The \code{nu}
    parameter sets the upper bound on the training error and the lower
    bound on the fraction of data points that become Support Vectors (default: 0.2).}

  \item{epsilon}{epsilon in the insensitive-loss function used for
    \code{eps-svr}, \code{nu-svr} and \code{eps-bsvr} (default: 0.1)}

  \item{prob.model}{if set to \code{TRUE}, builds a model for calculating class
    probabilities or, in case of regression, calculates the scaling
    parameter of the Laplacian distribution fitted on the residuals.
    Fitting is done on output data created by performing a
    3-fold cross-validation on the training data. For details see
    references. (default: \code{FALSE})}

  \item{class.weights}{a named vector of weights for the different
    classes, used for asymmetric class sizes. Not all factor levels have
    to be supplied (default weight: 1). All components have to be named
    (see the examples below).}

  \item{cache}{cache memory in MB (default 40)}

  \item{tol}{tolerance of termination criterion (default: 0.001)}

  \item{shrinking}{option whether to use the shrinking-heuristics
    (default: \code{TRUE})}

  \item{cross}{if an integer value k > 0 is specified, a k-fold cross
    validation on the training data is performed to assess the quality
    of the model: the accuracy rate for classification and the Mean
    Squared Error for regression}

  \item{fit}{indicates whether the fitted values should be computed
    and included in the model or not (default: \code{TRUE})}

  \item{\dots}{additional parameters for the low level fitting function}

  \item{subset}{An index vector specifying the cases to be used in the
          training sample.  (NOTE: If given, this argument must be
          named.)}

   \item{na.action}{A function to specify the action to be taken if \code{NA}s are
          found. The default action is \code{na.omit}, which leads to rejection of cases
          with missing values on any required variable. An alternative
          is \code{na.fail}, which causes an error if \code{NA} cases
          are found. (NOTE: If given, this argument must be named.)}
      }

    \value{
      An S4 object of class \code{"ksvm"} containing the fitted model.
  Accessor functions can be used to access the slots of the object (see
  examples), which include:
  \item{alpha}{The resulting support vectors (alpha vector), possibly scaled.}
  \item{alphaindex}{The index of the resulting support vectors in the data
    matrix. Note that this index refers to the pre-processed data (after
    the possible effect of \code{na.omit} and \code{subset}).}
  \item{coef}{The corresponding coefficients times the training labels.}
  \item{b}{The negative intercept.}
  \item{nSV}{The number of Support Vectors.}
  \item{obj}{The value of the objective function. In case of one-against-one classification this is a vector of values.}
  \item{error}{Training error.}
  \item{cross}{Cross validation error (when cross > 0).}
  \item{prob.model}{Contains the width of the Laplacian fitted on the
    residuals in case of regression, or the parameters of the sigmoid
    fitted on the decision values in case of classification.}
}


\details{
  \code{ksvm} uses John Platt's SMO algorithm for solving the SVM QP problem in
  most SVM formulations. For the \code{spoc-svc}, \code{kbb-svc}, \code{C-bsvc} and
  \code{eps-bsvr} formulations a chunking algorithm based on the TRON QP
  solver is used. \cr
  For multiclass-classification with \eqn{k} classes, \eqn{k > 2}, \code{ksvm} uses the
  `one-against-one'-approach, in which \eqn{k(k-1)/2} binary classifiers are
  trained; the appropriate class is found by a voting scheme.
  The \code{spoc-svc} and the \code{kbb-svc} formulations deal with the
  multiclass-classification problems by solving a single quadratic problem involving all the classes.\cr
  If the predictor variables include factors, the formula interface must be used to get a
  correct model matrix. \cr
  In classification, when \code{prob.model} is \code{TRUE}, a 3-fold cross validation is
  performed on the data and a sigmoid function is fitted on the
  resulting decision values \eqn{f}.
  The data can be passed to the \code{ksvm} function in a \code{matrix} or a
  \code{data.frame}; in addition, \code{ksvm} also supports input in the form of a
  kernel matrix of class \code{kernelMatrix} or as a list of character
  vectors where a string kernel has to be used.\cr
  The \code{plot} function for binary classification \code{ksvm} objects
  displays a contour plot of the decision values with the corresponding
  support vectors highlighted.\cr
  The predict function can return class probabilities for
  classification problems by setting the \code{type} parameter to
  "probabilities". \cr
  The problem of model selection is partially addressed by an empirical
  observation for the RBF kernels (Gaussian, Laplace) where the optimal values of the
  \eqn{\sigma} width parameter are shown to lie in between the 0.1 and 0.9
  quantile of the \eqn{\|x - x'\|} statistics. When using an RBF kernel
  and setting \code{kpar} to "automatic", \code{ksvm} uses the \code{sigest} function
  to estimate the quantiles and uses the median of the values.
}
\note{Data is scaled internally by default, usually yielding better results.}
\references{
  \itemize{
    \item
      Chang Chih-Chung, Lin Chih-Jen\cr
      \emph{LIBSVM: a library for Support Vector Machines}\cr
      \url{http://www.csie.ntu.edu.tw/~cjlin/libsvm}

   \item
      Chih-Wei Hsu, Chih-Jen Lin\cr
       \emph{BSVM}\cr
       \url{http://www.csie.ntu.edu.tw/~cjlin/bsvm/}

     \item
     J. Platt\cr
     \emph{Probabilistic outputs for support vector machines and comparison to regularized likelihood methods}\cr
     Advances in Large Margin Classifiers, A. Smola, P. Bartlett, B. Schoelkopf and D. Schuurmans, Eds. Cambridge, MA: MIT Press, 2000.\cr
     \url{http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.41.1639}

    \item
    H.-T. Lin, C.-J. Lin and R. C. Weng\cr
    \emph{A note on Platt's probabilistic outputs for support vector machines}\cr
    \url{http://www.csie.ntu.edu.tw/~htlin/paper/doc/plattprob.pdf}

    \item
     C.-W. Hsu and C.-J. Lin\cr
     \emph{A comparison of methods for multi-class support vector machines}\cr
     IEEE Transactions on Neural Networks, 13 (2002) 415-425.\cr
     \url{http://www.csie.ntu.edu.tw/~cjlin/papers/multisvm.ps.gz}

     \item
     K. Crammer, Y. Singer\cr
     \emph{On the learnability and design of output codes for multiclass problems}\cr
     Computational Learning Theory, 35-46, 2000.\cr
     \url{http://webee.technion.ac.il/people/koby/publications/ecoc-mlj02.pdf}

     \item
     J. Weston, C. Watkins\cr
     \emph{Multi-class support vector machines}\cr
     In M. Verleysen, Proceedings of ESANN99, Brussels, 1999\cr
     \url{http://citeseer.ist.psu.edu/8884.html}
  }
}
\author{
  Alexandros Karatzoglou (SMO optimizers in C++ by Chih-Chung Chang & Chih-Jen Lin)\cr
  \email{alexandros.karatzoglou@ci.tuwien.ac.at}
}
\seealso{\code{\link{predict.ksvm}}, \code{\link{ksvm-class}}, \code{\link{couple}} }

\keyword{methods}
\keyword{regression}
\keyword{nonlinear}
\keyword{classif}
\keyword{neural}

\examples{

## simple example using the spam data set
data(spam)

## create test and training set
index <- sample(1:dim(spam)[1])
spamtrain <- spam[index[1:floor(dim(spam)[1]/2)], ]
spamtest <- spam[index[((ceiling(dim(spam)[1]/2)) + 1):dim(spam)[1]], ]

## train a support vector machine
filter <- ksvm(type~.,data=spamtrain,kernel="rbfdot",
               kpar=list(sigma=0.05),C=5,cross=3)
filter

## predict mail type on the test set
mailtype <- predict(filter,spamtest[,-58])

## Check results
table(mailtype,spamtest[,58])
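
## Illustrative sketch of the class.weights argument described above:
## a named weight vector penalizes errors on the chosen class more
## heavily. The level names "spam"/"nonspam" are assumed to match
## levels(spamtrain$type).
wfilter <- ksvm(type~.,data=spamtrain,kernel="rbfdot",
                kpar=list(sigma=0.05),C=5,
                class.weights=c(spam=2, nonspam=1))
table(predict(wfilter,spamtest[,-58]),spamtest[,58])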


## Another example with the famous iris data
data(iris)

## Create a kernel function using the built-in rbfdot function
rbf <- rbfdot(sigma=0.1)
rbf

## train a bound constraint support vector machine
irismodel <- ksvm(Species~.,data=iris,type="C-bsvc",
                  kernel=rbf,C=10,prob.model=TRUE)

irismodel

## get fitted values
fitted(irismodel)

## Test on the training set with probabilities as output
predict(irismodel, iris[,-5], type="probabilities")
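
## A short sketch of the accessor functions mentioned in the Value
## section, applied to the fitted object above
alphaindex(irismodel)   # indices of the support vectors
nSV(irismodel)          # number of support vectors
b(irismodel)            # negative intercept(s)
error(irismodel)        # training error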


## Demo of the plot function
x <- rbind(matrix(rnorm(120),,2),matrix(rnorm(120,mean=3),,2))
y <- matrix(c(rep(1,60),rep(-1,60)))

svp <- ksvm(x,y,type="C-svc")
plot(svp,data=x)
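
## A minimal novelty-detection sketch using the one-svc formulation
## listed under type above; predict() then flags points that fall
## inside the estimated support of the data
ocsvm <- ksvm(x,type="one-svc",kernel="rbfdot",nu=0.1)
ocsvm
predict(ocsvm,x)[1:10]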


### Use kernelMatrix
K <- as.kernelMatrix(crossprod(t(x)))

svp2 <- ksvm(K, y, type="C-svc")

svp2

# test data
xtest <- rbind(matrix(rnorm(20),,2),matrix(rnorm(20,mean=3),,2))
# test kernel matrix i.e. inner/kernel product of test data with
# Support Vectors

Ktest <- as.kernelMatrix(crossprod(t(xtest),t(x[SVindex(svp2), ])))

predict(svp2, Ktest)


#### Use custom kernel

k <- function(x,y) {(sum(x*y) +1)*exp(-0.001*sum((x-y)^2))}
class(k) <- "kernel"

data(promotergene)

## train svm using custom kernel
gene <- ksvm(Class~.,data=promotergene[c(1:20, 80:100),],kernel=k,
             C=5,cross=5)

gene


#### Use text with string kernels
data(reuters)
is(reuters)
tsv <- ksvm(reuters,rlabels,kernel="stringdot",
            kpar=list(length=5),cross=3,C=10)
tsv


## regression
# create data
x <- seq(-20,20,0.1)
y <- sin(x)/x + rnorm(401,sd=0.03)

# train support vector machine
regm <- ksvm(x,y,epsilon=0.01,kpar=list(sigma=16),cross=3)
plot(x,y,type="l")
lines(x,predict(regm,x),col="red")
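
## A sketch of the "automatic" kpar setting described in the arguments
## and details sections: sigma is chosen via the sigest heuristic
## instead of being fixed by hand
regm2 <- ksvm(x,y,epsilon=0.01,kpar="automatic",cross=3)
lines(x,predict(regm2,x),col="blue")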
}