\name{glmnet}
\alias{glmnet}
\title{fit a GLM with lasso or elasticnet regularization}
\description{
  Fit a generalized linear model via penalized maximum likelihood. The
  regularization path is computed for the lasso or elasticnet penalty at a grid
  of values for the regularization parameter lambda. Can deal with all
  shapes of data, including very large sparse data matrices. Fits
  linear, logistic, multinomial, poisson, and Cox regression models.
  }
\usage{
glmnet(x, y, family=c("gaussian","binomial","poisson","multinomial","cox","mgaussian"),
  weights, offset=NULL, alpha=1, nlambda=100,
  lambda.min.ratio=ifelse(nobs<nvars,0.01,0.0001), lambda=NULL,
  standardize=TRUE, thresh=1e-07, dfmax=nvars+1, pmax=min(dfmax*2+20,nvars),
  exclude, penalty.factor=rep(1,nvars), maxit=100000,
  type.gaussian=ifelse(nvars<500,"covariance","naive"),
  standardize.response=FALSE, type.multinomial=c("ungrouped","grouped"))
}

\arguments{
  \item{x}{input matrix, of dimension nobs x nvars; each row is an
  observation vector. Can be in sparse matrix format (inherit from class
  \code{"sparseMatrix"} as in package \code{Matrix}; not yet available for
  \code{family="cox"})}
  \item{y}{response variable. Quantitative for \code{family="gaussian"},
  or \code{family="poisson"} (non-negative counts). For
  \code{family="binomial"} should be either a factor with two levels, or
  a two-column matrix of counts or proportions. For
  \code{family="multinomial"}, can be a \code{nc>=2} level factor, or a
  matrix with \code{nc} columns of counts or proportions. For
  \code{family="cox"}, \code{y} should be a two-column matrix with
  columns named 'time' and 'status'. The latter is a binary variable,
  with '1' indicating death, and '0' indicating right censored. The
  function \code{Surv()} in package \pkg{survival} produces such a
  matrix. For \code{family="mgaussian"}, \code{y} is a matrix of
  quantitative responses.}
  \item{family}{Response type (see above)}
  \item{weights}{observation weights. Can be total counts if responses are
  proportion matrices. Default is 1 for each observation (see the binomial
  example below).}
  \item{offset}{A vector of length \code{nobs} that is included in the
  linear predictor (a \code{nobs x nc} matrix for the \code{"multinomial"}
  family). Useful for the \code{"poisson"} family (e.g. log of exposure
  time), or for refining a model by starting at a current fit. Default is
  \code{NULL}. If supplied, then values must also be supplied to the
  \code{predict} function (see the poisson example below).}
  \item{alpha}{The elasticnet mixing parameter, with
    \eqn{0\le\alpha\le 1}. The penalty is defined
    as \deqn{(1-\alpha)/2||\beta||_2^2+\alpha||\beta||_1.} \code{alpha=1}
    is the lasso penalty, and \code{alpha=0} the ridge penalty.}
  \item{nlambda}{The number of \code{lambda} values; default is 100.}
  \item{lambda.min.ratio}{Smallest value for \code{lambda}, as a fraction of
    \code{lambda.max}, the (data derived) entry value (i.e. the smallest
  value for which all coefficients are zero). The default depends on the
  sample size \code{nobs} relative to the number of variables
  \code{nvars}. If \code{nobs > nvars}, the default is \code{0.0001},
  close to zero. If \code{nobs < nvars}, the default is \code{0.01}.
  A very small value of \code{lambda.min.ratio} will lead to a saturated
  fit in the \code{nobs < nvars} case. This is undefined for
  \code{"binomial"} and \code{"multinomial"} models, and \code{glmnet}
  will exit gracefully when the percentage deviance explained is almost 1.}
  \item{lambda}{A user supplied \code{lambda} sequence. Typical usage
    is to have the program compute its own \code{lambda} sequence based on
    \code{nlambda} and \code{lambda.min.ratio}. Supplying a value of
    \code{lambda} overrides this. WARNING: use with care. Do not supply
    a single value for \code{lambda} (for predictions after CV use
    \code{predict()} instead). Supply instead a decreasing sequence of
    \code{lambda} values (see the gaussian examples below). \code{glmnet}
    relies on its warm starts for speed, and it is often faster to fit a
    whole path than to compute a single fit.}
  \item{standardize}{Logical flag for x variable standardization, prior to
    fitting the model sequence. The coefficients are always returned on
    the original scale. Default is \code{standardize=TRUE}.
    If variables are in the same units already, you might not wish to
    standardize. See details below for y standardization with
    \code{family="gaussian"}.}
  \item{thresh}{Convergence threshold for coordinate descent. Each inner
  coordinate-descent loop continues until the maximum change in the
  objective after any coefficient update is less than \code{thresh}
  times the null deviance. Default value is \code{1E-7}.}
  \item{dfmax}{Limit the maximum number of variables in the
    model. Useful for very large \code{nvars}, if a partial path is desired.}
  \item{pmax}{Limit the maximum number of variables ever to be nonzero.}
  \item{exclude}{Indices of variables to be excluded from the
    model. Default is none. Equivalent to an infinite penalty factor
    (next item).}
  \item{penalty.factor}{Separate penalty factors can be applied to each
    coefficient. This is a number that multiplies \code{lambda} to allow
    differential shrinkage. Can be 0 for some variables, which implies
    no shrinkage, and that variable is always included in the
    model. Default is 1 for all variables (and implicitly infinity for
    variables listed in \code{exclude}). Note: the penalty factors are
    internally rescaled to sum to \code{nvars}, and the lambda sequence
    will reflect this change.}
  \item{maxit}{Maximum number of passes over the data for all lambda
    values; default is 10^5.}
  \item{type.gaussian}{Two algorithm types are supported for (only)
    \code{family="gaussian"}. The default when \code{nvars < 500} is
    \code{type.gaussian="covariance"}, which saves all
    inner-products ever computed. This can be much faster than
    \code{type.gaussian="naive"}, which loops through \code{nobs} every
    time an inner-product is computed. The latter can be far more efficient
    for \code{nvars >> nobs} situations, or when \code{nvars > 500}.}
  \item{standardize.response}{This is for the \code{family="mgaussian"}
    family, and allows the user to standardize the response variables.}
  \item{type.multinomial}{If \code{"grouped"} then a grouped lasso penalty
    is used on the multinomial coefficients for a variable. This ensures
    they are all in or out together. The default is \code{"ungrouped"}.}
}
\details{
  The sequence of models implied by \code{lambda} is fit by coordinate
  descent. For \code{family="gaussian"} this is the lasso sequence if
  \code{alpha=1}, else it is the elasticnet sequence.
  For the other families, this is a lasso or elasticnet regularization path
  for fitting the generalized linear model, obtained by maximizing the
  appropriate penalized log-likelihood (partial likelihood for the "cox"
  model). Sometimes the sequence is truncated before \code{nlambda}
  values of \code{lambda} have been used, because of instabilities in
  the inverse link functions near a saturated fit.
  \code{glmnet(...,family="binomial")} fits a traditional logistic
  regression model for the log-odds.
  \code{glmnet(...,family="multinomial")} fits a symmetric multinomial
  model, where each class is represented by a linear model (on the
  log-scale). The penalties take care of redundancies. A two-class
  \code{"multinomial"} model will produce the same fit as the corresponding
  \code{"binomial"} model, except the pair of coefficient matrices will be
  equal in magnitude and opposite in sign, and half the \code{"binomial"}
  values (see the sketch at the end of this section).
  Note that the objective function for \code{"gaussian"} is \deqn{1/2
  RSS/nobs + \lambda*penalty,} and for the other models it is
  \deqn{-loglik/nobs + \lambda*penalty.} Note also that for
  \code{"gaussian"}, \code{glmnet} standardizes y to have unit variance
  before computing its lambda sequence (and then unstandardizes the
  resulting coefficients); if you wish to reproduce/compare results with
  other software, it is best to supply a standardized y.
  The two latest features in glmnet are the \code{family="mgaussian"}
  family and the \code{type.multinomial="grouped"} option for
  multinomial fitting. The former allows a multi-response gaussian model
  to be fit, using a "group-lasso" penalty on the coefficients for each
  variable. Tying the responses together like this is called
  "multi-task" learning in some domains. The grouped multinomial allows
  the same penalty for the \code{family="multinomial"} model, which is
  also multi-response. For both of these the penalty on the coefficient
  vector for variable j is
  \deqn{(1-\alpha)/2||\beta_j||_2^2+\alpha||\beta_j||_2.} When
  \code{alpha=1} this is a group-lasso penalty, and otherwise it mixes
  with the quadratic just like elasticnet.
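
  A minimal sketch of the two-class equivalence noted above (random data,
  default settings; illustrative only):
  \preformatted{
x=matrix(rnorm(100*10),100,10)
g2=sample(1:2,100,replace=TRUE)
fitb=glmnet(x,g2,family="binomial")
fitm=glmnet(x,g2,family="multinomial")
# compare fitm$beta[[2]] with fitb$beta/2, and
# fitm$beta[[1]] with -fitb$beta/2
  }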
  }
\value{
An object with S3 class \code{"glmnet","*"}, where \code{"*"} is
\code{"elnet"}, \code{"lognet"}, \code{"multnet"}, \code{"fishnet"}
(poisson), \code{"coxnet"} or \code{"mrelnet"} for the various types of
models.
  \item{call}{the call that produced this object}
  \item{a0}{Intercept sequence of length \code{length(lambda)}}
  \item{beta}{For \code{"elnet"}, \code{"lognet"}, \code{"fishnet"} and
    \code{"coxnet"} models, a \code{nvars x length(lambda)} matrix of
    coefficients, stored in sparse column format
    (\code{"CsparseMatrix"}). For \code{"multnet"} and \code{"mrelnet"},
    a list of \code{nc} such matrices, one for each class.}
  \item{lambda}{The actual sequence of \code{lambda} values used}
  \item{dev.ratio}{The fraction of (null) deviance explained (for
    \code{"elnet"}, this is the R-square). The deviance calculations
    incorporate weights if present in the model. The deviance is defined
    to be 2*(loglike_sat - loglike), where loglike_sat is the
    log-likelihood for the saturated model (a model with a free parameter
    per observation). Hence dev.ratio=1-dev/nulldev.}
  \item{nulldev}{Null deviance (per observation). This is defined to
    be 2*(loglike_sat - loglike(Null)); the Null model refers to the
    intercept model, except for the Cox, where it is the 0 model.}
  \item{df}{The number of nonzero coefficients for each value of
    \code{lambda}. For \code{"multnet"}, this is the number of variables
    with a nonzero coefficient for \emph{any} class.}
  \item{dfmat}{For \code{"multnet"} and \code{"mrelnet"} only. A matrix
    consisting of the number of nonzero coefficients per class.}
  \item{dim}{dimension of the coefficient matrix (or matrices)}
  \item{nobs}{number of observations}
  \item{npasses}{total passes over the data summed over all lambda values}
  \item{offset}{a logical variable indicating whether an offset was
    included in the model}
  \item{jerr}{error flag, for warnings and errors (largely for internal
    debugging).}
}
\references{Friedman, J., Hastie, T. and Tibshirani, R. (2008)
  \emph{Regularization Paths for Generalized Linear Models via Coordinate
    Descent}, \url{http://www.stanford.edu/~hastie/Papers/glmnet.pdf}\cr
  \emph{Journal of Statistical Software, Vol. 33(1), 1-22 Feb 2010}\cr
  \url{http://www.jstatsoft.org/v33/i01/}\cr
  Simon, N., Friedman, J., Hastie, T. and Tibshirani, R. (2011)
  \emph{Regularization Paths for Cox's Proportional Hazards Model via
    Coordinate Descent, Journal of Statistical Software, Vol. 39(5),
    1-13}\cr
  \url{http://www.jstatsoft.org/v39/i05/}\cr
  Tibshirani, Robert, Bien, J., Friedman, J., Hastie, T., Simon, N.,
  Taylor, J. and Tibshirani, Ryan. (2010)
  \emph{Strong Rules for Discarding Predictors in Lasso-type Problems},
  \emph{Stanford Statistics Technical Report},\cr
  \url{http://www-stat.stanford.edu/~tibs/ftp/strong.pdf}
  }
\author{Jerome Friedman, Trevor Hastie and Rob Tibshirani\cr
Noah Simon helped develop the 'cox' feature.\cr
Maintainer: Trevor Hastie \email{hastie@stanford.edu}}
\seealso{\code{print}, \code{predict}, \code{coef} and \code{plot} methods, and the \code{cv.glmnet} function.}
\examples{
# Gaussian
x=matrix(rnorm(100*20),100,20)
y=rnorm(100)
fit1=glmnet(x,y)
print(fit1)
coef(fit1,s=0.01) # extract coefficients at a single value of lambda
predict(fit1,newx=x[1:10,],s=c(0.01,0.005)) # make predictions
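# A few more gaussian sketches for arguments described above
# (penalty.factor, exclude, lambda, type.gaussian); the particular
# values are arbitrary, for illustration only
fit1p=glmnet(x,y,penalty.factor=c(rep(0,2),rep(1,18))) # first two variables unpenalized, always in the model
fit1e=glmnet(x,y,exclude=c(3,5)) # variables 3 and 5 get an implicit infinite penalty
# supply a whole decreasing lambda sequence rather than a single value;
# warm starts make fitting the path fast
fit1l=glmnet(x,y,lambda=exp(seq(log(1),log(0.001),length.out=30)))
fit1n=glmnet(x,y,type.gaussian="naive") # same model, "naive" algorithm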

#multivariate gaussian
y=matrix(rnorm(100*3),100,3)
fit1m=glmnet(x,y,family="mgaussian")
plot(fit1m,type.coef="2norm")
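# Sketch: also standardize the responses (standardize.response argument above)
fit1ms=glmnet(x,y,family="mgaussian",standardize.response=TRUE)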

#binomial
g2=sample(1:2,100,replace=TRUE)
fit2=glmnet(x,g2,family="binomial")
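# Sketch: observation weights (weights argument above); upweight the
# first 50 observations, purely for illustration
fit2w=glmnet(x,g2,family="binomial",weights=rep(c(2,1),each=50))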

#multinomial
g4=sample(1:4,100,replace=TRUE)
fit3=glmnet(x,g4,family="multinomial")
fit3a=glmnet(x,g4,family="multinomial",type.multinomial="grouped")
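# Sketch: fitted class probabilities at a single lambda, via the
# predict method with type="response"
predict(fit3,newx=x[1:5,],s=0.01,type="response")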
#poisson
N=500; p=20
nzc=5
x=matrix(rnorm(N*p),N,p)
beta=rnorm(nzc)
f = x[,seq(nzc)]\%*\%beta
mu=exp(f)
y=rpois(N,mu)
fit=glmnet(x,y,family="poisson")
plot(fit)
pfit = predict(fit,x,s=0.001,type="response")
plot(pfit,y)
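# Sketch: an offset, e.g. log exposure time (offset argument above);
# the exposures here are made up. A model fit with an offset needs one
# at prediction time too (the newoffset argument of predict)
expos=runif(N,1,10)
fito=glmnet(x,y,family="poisson",offset=log(expos))
predict(fito,newx=x[1:5,],s=0.01,type="response",newoffset=log(expos[1:5]))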

#Cox
set.seed(10101)
N=1000;p=30
nzc=p/3
x=matrix(rnorm(N*p),N,p)
beta=rnorm(nzc)
fx=x[,seq(nzc)]\%*\%beta/3
hx=exp(fx)
ty=rexp(N,hx)
tcens=rbinom(n=N,prob=.3,size=1)# censoring indicator
y=cbind(time=ty,status=1-tcens) # y=Surv(ty,1-tcens) with library(survival)
fit=glmnet(x,y,family="cox")
plot(fit)

# Sparse
n=10000;p=200
nzc=trunc(p/10)
x=matrix(rnorm(n*p),n,p)
iz=sample(1:(n*p),size=n*p*.85,replace=FALSE)
x[iz]=0
sx=Matrix(x,sparse=TRUE)
inherits(sx,"sparseMatrix")#confirm that it is sparse
beta=rnorm(nzc)
fx=x[,seq(nzc)]\%*\%beta
eps=rnorm(n)
y=fx+eps
px=exp(fx)
px=px/(1+px)
ly=rbinom(n=length(px),prob=px,size=1)
system.time(fit1<-glmnet(sx,y))
system.time(fit2n<-glmnet(x,y))
# use the binary response generated above for sparse logistic fits
system.time(fit3<-glmnet(sx,ly,family="binomial"))
system.time(fit4n<-glmnet(x,ly,family="binomial"))
}
\keyword{models}
\keyword{regression}