1\name{glmnet} 2\alias{glmnet} 3\title{fit a GLM with lasso or elasticnet regularization} 4\description{ 5 Fit a generalized linear model via penalized maximum likelihood. The 6 regularization path is computed for the lasso or elasticnet penalty at a grid 7 of values for the regularization parameter lambda. Can deal with all 8 shapes of data, including very large sparse data matrices. Fits 9 linear, logistic and multinomial, poisson, and Cox regression models. 10 } 11\usage{ 12glmnet(x, y, 13family=c("gaussian","binomial","poisson","multinomial","cox","mgaussian"), weights, offset=NULL, 14alpha = 1, nlambda = 100, lambda.min.ratio = ifelse(nobs<nvars,0.01,0.0001), lambda=NULL, 15standardize = TRUE, thresh = 1e-07, dfmax = nvars + 1, pmax = min(dfmax * 2+20, nvars), 16 exclude, penalty.factor = rep(1, nvars), maxit=100000, type.gaussian=ifelse(nvars<500,"covariance","naive"), 17 standardize.response=FALSE,type.multinomial=c("ungrouped","grouped")) 18} 19 20\arguments{ 21 \item{x}{input matrix, of dimension nobs x nvars; each row is an 22 observation vector. Can be in sparse matrix format (inherit from class \code{"sparseMatrix"} as in package \code{Matrix}; not yet available for \code{family="cox"})} 23 \item{y}{response variable. Quantitative for \code{family="gaussian"}, 24 or \code{family="poisson"} (non-negative counts). For 25 \code{family="binomial"} should be either a factor with two levels, or 26 a two-column matrix of counts or proportions. For 27 \code{family="multinomial"}, can be a \code{nc>=2} level factor, or a 28 matrix with \code{nc} columns of counts or proportions. For 29 \code{family="cox"}, \code{y} should be a two-column matrix with 30 columns named 'time' and 'status'. The latter is a binary variable, 31 with '1' indicating death, and '0' indicating right censored. The 32 function \code{Surv()} in package \pkg{survival} produces such a 33 matrix. 
For \code{family="mgaussian"}, \code{y} is a matrix of quantitative responses.} 34 \item{family}{Response type (see above)} 35 \item{weights}{observation weights. Can be total counts if responses are proportion matrices. Default is 1 for each observation} 36 \item{offset}{A vector of length \code{nobs} that is included in the linear predictor (a \code{nobs x nc} matrix for the \code{"multinomial"} family). Useful for the \code{"poisson"} family (e.g. log of exposure time), or for refining a model by starting at a current fit. Default is \code{NULL}. If supplied, then values must also be supplied to the \code{predict} function.} 37 \item{alpha}{The elasticnet mixing parameter, with 38 \eqn{0\le\alpha\le 1}. The penalty is defined 39 as \deqn{(1-\alpha)/2||\beta||_2^2+\alpha||\beta||_1.} \code{alpha=1} 40 is the lasso penalty, and \code{alpha=0} the ridge penalty.} 41 \item{nlambda}{The number of \code{lambda} values - default is 100.} 42 \item{lambda.min.ratio}{Smallest value for \code{lambda}, as a fraction of 43 \code{lambda.max}, the (data derived) entry value (i.e. the smallest 44 value for which all coefficients are zero). The default depends on the 45 sample size \code{nobs} relative to the number of variables 46 \code{nvars}. If \code{nobs > nvars}, the default is \code{0.0001}, 47 close to zero. If \code{nobs < nvars}, the default is \code{0.01}. 48 A very small value of 49 \code{lambda.min.ratio} will lead to a saturated fit in the \code{nobs < 50 nvars} case. This is undefined for 51 \code{"binomial"} and \code{"multinomial"} models, and \code{glmnet} 52 will exit gracefully when the percentage deviance explained is almost 53 1.} 54 \item{lambda}{A user supplied \code{lambda} sequence. Typical usage 55 is to have the 56 program compute its own \code{lambda} sequence based on 57 \code{nlambda} and \code{lambda.min.ratio}. Supplying a value of 58 \code{lambda} overrides this. WARNING: use with care. 
Do not supply 59 a single value for \code{lambda} (for predictions after CV use \code{predict()} 60 instead). Supply instead 61 a decreasing sequence of \code{lambda} values. \code{glmnet} relies 62 on its warm starts for speed, and it is often faster to fit a whole 63 path than compute a single fit.} 64 \item{standardize}{Logical flag for x variable standardization, prior to 65 fitting the model sequence. The coefficients are always returned on 66 the original scale. Default is \code{standardize=TRUE}. 67 If variables are in the same units already, you might not wish to 68 standardize. See details below for y standardization with \code{family="gaussian"}.} 69 \item{thresh}{Convergence threshold for coordinate descent. Each inner 70 coordinate-descent loop continues until the maximum change in the 71 objective after any coefficient update is less than \code{thresh} 72 times the null deviance. Default value is \code{1E-7}.} 73 \item{dfmax}{Limit the maximum number of variables in the 74 model. Useful for very large \code{nvars}, if a partial path is desired.} 75 \item{pmax}{Limit the maximum number of variables ever to be nonzero} 76 \item{exclude}{Indices of variables to be excluded from the 77 model. Default is none. Equivalent to an infinite penalty factor 78 (next item).} 79 \item{penalty.factor}{Separate penalty factors can be applied to each 80 coefficient. This is a number that multiplies \code{lambda} to allow 81 differential shrinkage. Can be 0 for some variables, which implies 82 no shrinkage, and that variable is always included in the 83 model. Default is 1 for all variables (and implicitly infinity for 84 variables listed in \code{exclude}). 
Note: the penalty factors are 85 internally rescaled to sum to nobs, and the lambda sequence will 86 reflect this change.} 87 \item{maxit}{Maximum number of passes over the data for all lambda 88 values; default is 10^5.} 89 \item{type.gaussian}{Two algorithm types are supported for (only) 90 \code{family="gaussian"}. The default when \code{nvar<500} is 91 \code{type.gaussian="covariance"}, and saves all 92 inner-products ever computed. This can be much faster than 93 \code{type.gaussian="naive"}, which loops through \code{nobs} every 94 time an inner-product is computed. The latter can be far more efficient for \code{nvar >> 95 nobs} situations, or when \code{nvar > 500}.} 96\item{standardize.response}{This is for the \code{family="mgaussian"} 97 family, and allows the user to standardize the response variables} 98\item{type.multinomial}{If \code{"grouped"} then a grouped lasso penalty 99 is used on the multinomial coefficients for a variable. This ensures 100 they are all in or out together. The default is \code{"ungrouped"}} 101} 102\details{ 103 The sequence of models implied by \code{lambda} is fit by coordinate 104 descent. For \code{family="gaussian"} this is the lasso sequence if 105 \code{alpha=1}, else it is the elasticnet sequence. 106 For the other families, this is a lasso or elasticnet regularization path 107 for fitting the generalized linear regression 108 paths, by maximizing the appropriate penalized log-likelihood (partial likelihood for the "cox" model). Sometimes the sequence is truncated before \code{nlambda} 109 values of \code{lambda} have been used, because of instabilities in 110 the inverse link functions near a saturated fit. \code{glmnet(...,family="binomial")} 111 fits a traditional logistic regression model for the 112 log-odds. \code{glmnet(...,family="multinomial")} fits a symmetric multinomial model, where 113 each class is represented by a linear model (on the log-scale). The 114 penalties take care of redundancies. 
A two-class \code{"multinomial"} model 115 will produce the same fit as the corresponding \code{"binomial"} model, 116 except the pair of coefficient matrices will be equal in magnitude and 117 opposite in sign, and half the \code{"binomial"} values. 118 Note that the objective function for \code{"gaussian"} is \deqn{1/2 119 RSS/nobs + \lambda*penalty,} and for the other models it is 120 \deqn{-loglik/nobs + \lambda*penalty.} Note also that for 121 \code{"gaussian"}, \code{glmnet} standardizes y to have unit variance 122 before computing its lambda sequence (and then unstandardizes the 123 resulting coefficients); if you wish to reproduce/compare results with other 124 software, best to supply a standardized y. 125 The latest two features in glmnet are the \code{family="mgaussian"} 126 family and the \code{type.multinomial="grouped"} option for 127 multinomial fitting. The former allows a multi-response gaussian model 128 to be fit, using a "group-lasso" penalty on the coefficients for each 129 variable. Tying the responses together like this is called 130 "multi-task" learning in some domains. The grouped multinomial allows the same penalty for the 131 \code{family="multinomial"} model, which is also multi-response. For 132 both of these the penalty on the coefficient vector for variable j is 133 \deqn{(1-\alpha)/2||\beta_j||_2^2+\alpha||\beta_j||_2.} When 134 \code{alpha=1} this is a group-lasso penalty, and otherwise it mixes 135 with quadratic just like elasticnet. 136 } 137\value{ 138An object with S3 class \code{"glmnet","*" }, where \code{"*"} is 139\code{"elnet"}, \code{"lognet"}, 140\code{"multnet"}, \code{"fishnet"} (poisson), \code{"coxnet"} or \code{"mrelnet"} for the various types of models. 
141 \item{call}{the call that produced this object} 142 \item{a0}{Intercept sequence of length \code{length(lambda)}} 143 \item{beta}{For \code{"elnet"}, \code{"lognet"}, \code{"fishnet"} and \code{"coxnet"} models, a \code{nvars x 144 length(lambda)} matrix of coefficients, stored in sparse column 145 format (\code{"CsparseMatrix"}). For \code{"multnet"} and \code{"mgaussian"}, a list of \code{nc} such 146 matrices, one for each class.} 147 \item{lambda}{The actual sequence of \code{lambda} values used} 148 \item{dev.ratio}{The fraction of (null) deviance explained (for \code{"elnet"}, this 149 is the R-square). The deviance calculations incorporate weights if 150 present in the model. The deviance is defined to be 2*(loglike_sat - 151 loglike), where loglike_sat is the log-likelihood for the saturated 152 model (a model with a free parameter per observation). Hence dev.ratio=1-dev/nulldev.} 153 \item{nulldev}{Null deviance (per observation). This is defined to 154 be 2*(loglike_sat -loglike(Null)); The NULL model refers to the 155 intercept model, except for the Cox, where it is the 0 model.} 156 \item{df}{The number of nonzero coefficients for each value of 157 \code{lambda}. For \code{"multnet"}, this is the number of variables 158 with a nonzero coefficient for \emph{any} class.} 159 \item{dfmat}{For \code{"multnet"} and \code{"mrelnet"} only. A matrix consisting of the 160 number of nonzero coefficients per class} 161 \item{dim}{dimension of coefficient matrix (ices)} 162 \item{nobs}{number of observations} 163 \item{npasses}{total passes over the data summed over all lambda 164 values} 165 \item{offset}{a logical variable indicating whether an offset was included in the model} 166 \item{jerr}{error flag, for warnings and errors (largely for internal debugging).} 167} 168\references{Friedman, J., Hastie, T. and Tibshirani, R. 
(2008) 169 \emph{Regularization Paths for Generalized Linear Models via Coordinate 170 Descent}, \url{http://www.stanford.edu/~hastie/Papers/glmnet.pdf}\cr 171 \emph{Journal of Statistical Software, Vol. 33(1), 1-22 Feb 2010}\cr 172 \url{http://www.jstatsoft.org/v33/i01/}\cr 173 Simon, N., Friedman, J., Hastie, T., Tibshirani, R. (2011) 174 \emph{Regularization Paths for Cox's Proportional Hazards Model via 175 Coordinate Descent, Journal of Statistical Software, Vol. 39(5) 176 1-13}\cr 177 \url{http://www.jstatsoft.org/v39/i05/}\cr 178 Tibshirani, Robert., Bien, J., Friedman, J.,Hastie, T.,Simon, 179 N.,Taylor, J. and Tibshirani, Ryan. (2010) 180 \emph{Strong Rules for Discarding Predictors in Lasso-type Problems},\cr 181 \url{http://www-stat.stanford.edu/~tibs/ftp/strong.pdf}\cr 182 \emph{Stanford Statistics Technical Report} 183 } 184\author{Jerome Friedman, Trevor Hastie and Rob Tibshirani\cr 185Noah Simon helped develop the 'cox' feature.\cr 186Maintainer: Trevor Hastie \email{hastie@stanford.edu}} 187\seealso{\code{print}, \code{predict}, \code{coef} and \code{plot} methods, and the \code{cv.glmnet} function.} 188\examples{ 189# Gaussian 190x=matrix(rnorm(100*20),100,20) 191y=rnorm(100) 192fit1=glmnet(x,y) 193print(fit1) 194coef(fit1,s=0.01) # extract coefficients at a single value of lambda 195predict(fit1,newx=x[1:10,],s=c(0.01,0.005)) # make predictions 196 197#multivariate gaussian 198y=matrix(rnorm(100*3),100,3) 199fit1m=glmnet(x,y,family="mgaussian") 200plot(fit1m,type.coef="2norm") 201 202#binomial 203g2=sample(1:2,100,replace=TRUE) 204fit2=glmnet(x,g2,family="binomial") 205 206#multinomial 207g4=sample(1:4,100,replace=TRUE) 208fit3=glmnet(x,g4,family="multinomial") 209fit3a=glmnet(x,g4,family="multinomial",type.multinomial="grouped") 210#poisson 211N=500; p=20 212nzc=5 213x=matrix(rnorm(N*p),N,p) 214beta=rnorm(nzc) 215f = x[,seq(nzc)]\%*\%beta 216mu=exp(f) 217y=rpois(N,mu) 218fit=glmnet(x,y,family="poisson") 219plot(fit) 220pfit = 
predict(fit,x,s=0.001,type="response") 221plot(pfit,y) 222 223#Cox 224set.seed(10101) 225N=1000;p=30 226nzc=p/3 227x=matrix(rnorm(N*p),N,p) 228beta=rnorm(nzc) 229fx=x[,seq(nzc)]\%*\%beta/3 230hx=exp(fx) 231ty=rexp(N,hx) 232tcens=rbinom(n=N,prob=.3,size=1)# censoring indicator 233y=cbind(time=ty,status=1-tcens) # y=Surv(ty,1-tcens) with library(survival) 234fit=glmnet(x,y,family="cox") 235plot(fit) 236 237# Sparse 238n=10000;p=200 239nzc=trunc(p/10) 240x=matrix(rnorm(n*p),n,p) 241iz=sample(1:(n*p),size=n*p*.85,replace=FALSE) 242x[iz]=0 243sx=Matrix(x,sparse=TRUE) 244inherits(sx,"sparseMatrix")#confirm that it is sparse 245beta=rnorm(nzc) 246fx=x[,seq(nzc)]\%*\%beta 247eps=rnorm(n) 248y=fx+eps 249px=exp(fx) 250px=px/(1+px) 251ly=rbinom(n=length(px),prob=px,size=1) 252system.time(fit1<-glmnet(sx,y)) 253system.time(fit2n<-glmnet(x,y)) 254} 255\keyword{models} 256\keyword{regression} 257 258 259