1# functions to aid in detecting linear dependent columns in the (transformed) 2# model matrix or estimated plm models: 3# * detect.lindep 4# * alias (the latter is a wrapper around alias.lm) 5# * detect_lin_dep for backward compatibility 6# 7# doc file provides an extensive example how linear dependence can arise after 8# the data transformation, e. g., for within transformation 9 10### detect.lindep.matrix, .data.frame, .plm 11 12 13 14 15#' Functions to detect linear dependence 16#' 17#' Little helper functions to aid users to detect linear dependent columns in a 18#' two-dimensional data structure, especially in a (transformed) model matrix - 19#' typically useful in interactive mode during model building phase. 20#' 21#' 22#' Linear dependence of columns/variables is (usually) readily avoided when 23#' building one's model. However, linear dependence is sometimes not obvious 24#' and harder to detect for less experienced applied statisticians. The so 25#' called "dummy variable trap" is a common and probably the best--known 26#' fallacy of this kind (see e. g. Wooldridge (2016), sec. 7-2.). When building 27#' linear models with `lm` or `plm`'s `pooling` model, linear 28#' dependence in one's model is easily detected, at times post hoc. 29#' 30#' However, linear dependence might also occur after some transformations of 31#' the data, albeit it is not present in the untransformed data. The within 32#' transformation (also called fixed effect transformation) used in the 33#' `"within"` model can result in such linear dependence and this is 34#' harder to come to mind when building a model. See **Examples** for two 35#' examples of linear dependent columns after the within transformation: ex. 1) 36#' the transformed variables have the opposite sign of one another; ex. 2) the 37#' transformed variables are identical. 38#' 39#' During `plm`'s model estimation, linear dependent columns and their 40#' corresponding coefficients in the resulting object are silently dropped, 41#' while the corresponding model frame and model matrix still contain the 42#' affected columns. The plm object contains an element `aliased` which 43#' indicates any such aliased coefficients by a named logical. 44#' 45#' Both functions, `detect.lindep` and `alias`, help to 46#' detect linear dependence and accomplish almost the same: 47#' `detect.lindep` is a stand alone implementation while 48#' `alias` is a wrapper around 49#' [stats::alias.lm()], extending the `alias` 50#' generic to classes `"plm"` and `"pdata.frame"`. 51#' `alias` hinges on the availability of the package 52#' \CRANpkg{MASS} on the system. Not all arguments of `alias.lm` 53#' are supported. Output of `alias` is more informative as it 54#' gives the linear combination of dependent columns (after data 55#' transformations, i. e. after (quasi)-demeaning) while 56#' `detect.lindep` only gives columns involved in the linear 57#' dependence in a simple format (thus being more suited for automatic 58#' post--processing of the information). 59#' 60#' @aliases detect.lindep 61#' @param object for `detect.lindep`: an object which should be checked 62#' for linear dependence (of class `"matrix"`, `"data.frame"`, or 63#' `"plm"`); for `alias`: either an estimated model of class 64#' `"plm"` or a `"pdata.frame"`. Usually, one wants to input a model 65#' matrix here or check an already estimated plm model, 66#' @param suppressPrint for `detect.lindep` only: logical indicating 67#' whether a message shall be printed; defaults to printing the message, i. e. 68#' to `suppressPrint = FALSE`, 69#' @param model (see `plm`), 70#' @param effect (see `plm`), 71#' @param \dots further arguments. 72#' @return For `detect.lindep`: A named numeric vector containing column 73#' numbers of the linear dependent columns in the object after data 74#' transformation, if any are present. `NULL` if no linear dependent 75#' columns are detected. 76#' 77#' For `alias`: return value of [stats::alias.lm()] run on the 78#' (quasi-)demeaned model, i. e. the information outputted applies to 79#' the transformed model matrix, not the original data. 80#' @note function `detect.lindep` was called `detect_lin_dep` 81#' initially but renamed for naming consistency later with a 82#' back-compatible solution. 83#' @export 84#' @author Kevin Tappe 85#' @seealso [stats::alias()], [stats::model.matrix()] and especially 86#' `plm`'s [model.matrix()] for (transformed) model matrices, 87#' plm's [model.frame()]. 88#' @references 89#' 90#' \insertRef{WOOL:13}{plm} 91#' 92#' @keywords manip array 93#' @examples 94#' 95#' ### Example 1 ### 96#' # prepare the data 97#' data("Cigar" , package = "plm") 98#' Cigar[ , "fact1"] <- c(0,1) 99#' Cigar[ , "fact2"] <- c(1,0) 100#' Cigar.p <- pdata.frame(Cigar) 101#' 102#' # setup a formula and a model frame 103#' form <- price ~ 0 + cpi + fact1 + fact2 104#' mf <- model.frame(Cigar.p, form) 105#' # no linear dependence in the pooling model's model matrix 106#' # (with intercept in the formula, there would be linear depedence) 107#' detect.lindep(model.matrix(mf, model = "pooling")) 108#' # linear dependence present in the FE transformed model matrix 109#' modmat_FE <- model.matrix(mf, model = "within") 110#' detect.lindep(modmat_FE) 111#' mod_FE <- plm(form, data = Cigar.p, model = "within") 112#' detect.lindep(mod_FE) 113#' alias(mod_FE) # => fact1 == -1*fact2 114#' plm(form, data = mf, model = "within")$aliased # "fact2" indicated as aliased 115#' 116#' # look at the data: after FE transformation fact1 == -1*fact2 117#' head(modmat_FE) 118#' all.equal(modmat_FE[ , "fact1"], -1*modmat_FE[ , "fact2"]) 119#' 120#' ### Example 2 ### 121#' # Setup the data: 122#' # Assume CEOs stay with the firms of the Grunfeld data 123#' # for the firm's entire lifetime and assume some fictional 124#' # data about CEO tenure and age in year 1935 (first observation 125#' # in the data set) to be at 1 to 10 years and 38 to 55 years, respectively. 126#' # => CEO tenure and CEO age increase by same value (+1 year per year). 127#' data("Grunfeld", package = "plm") 128#' set.seed(42) 129#' # add fictional data 130#' Grunfeld$CEOtenure <- c(replicate(10, seq(from=s<-sample(1:10, 1), to=s+19, by=1))) 131#' Grunfeld$CEOage <- c(replicate(10, seq(from=s<-sample(38:65, 1), to=s+19, by=1))) 132#' 133#' # look at the data 134#' head(Grunfeld, 50) 135#' 136#' form <- inv ~ value + capital + CEOtenure + CEOage 137#' mf <- model.frame(pdata.frame(Grunfeld), form) 138#' # no linear dependent columns in original data/pooling model 139#' modmat_pool <- model.matrix(mf, model="pooling") 140#' detect.lindep(modmat_pool) 141#' mod_pool <- plm(form, data = Grunfeld, model = "pooling") 142#' alias(mod_pool) 143#' 144#' # CEOtenure and CEOage are linear dependent after FE transformation 145#' # (demeaning per individual) 146#' modmat_FE <- model.matrix(mf, model="within") 147#' detect.lindep(modmat_FE) 148#' mod_FE <- plm(form, data = Grunfeld, model = "within") 149#' detect.lindep(mod_FE) 150#' alias(mod_FE) 151#' 152#' # look at the transformed data: after FE transformation CEOtenure == 1*CEOage 153#' head(modmat_FE, 50) 154#' all.equal(modmat_FE[ , "CEOtenure"], modmat_FE[ , "CEOage"]) 155#' 156detect.lindep <- function(object, ...) { 157 UseMethod("detect.lindep") 158} 159 160#' @rdname plm-deprecated 161#' @export 162detect_lin_dep <- function(object, ...) { 163 .Deprecated(old = "detect_lin_dep", new = "detect.lindep", 164 msg = "Function name 'detect_lin_dep' deprecated, please use 'detect.lindep'") 165 UseMethod("detect.lindep") 166} 167 168#' @rdname detect.lindep 169#' @method detect.lindep matrix 170#' @export 171detect.lindep.matrix <- function(object, suppressPrint = FALSE, ...) { 172 if (!inherits(object, "matrix")) { 173 stop("Input 'object' must be a matrix. Presumably, one wants a model matrix 174 generated by some 'model.matrix' function.")} 175 176 # do rank reduction to detect lin. dep. columns 177 rank_rec <- sapply(1:ncol(object), function(col) qr(object[ , -col])$rank) 178 179 if (diff(range(rank_rec)) == 0) { 180 num <- NULL # return NULL if there is no linear dep. 181 } else { 182 num <- which(rank_rec == max(rank_rec)) 183 names(num) <- colnames(object)[num] 184 } 185 186 if(!suppressPrint) { 187 if(is.null(num)) { 188 print("No linear dependent column(s) detected.") 189 } else { 190 print(paste0("Suspicious column number(s): ", paste(num, collapse = ", "))) 191 print(paste0("Suspicious column name(s): ", paste(names(num), collapse = ", "))) 192 } 193 return(invisible(num)) 194 } 195 return(num) 196} 197 198#' @rdname detect.lindep 199#' @method detect.lindep data.frame 200#' @export 201detect.lindep.data.frame <- function(object, suppressPrint = FALSE, ...) { 202 if (!inherits(object, "data.frame")) { 203 stop("Input 'object' must be a data.frame")} 204 205 return(detect.lindep.matrix(as.matrix(object), suppressPrint = suppressPrint, ...)) 206} 207 208#' @rdname detect.lindep 209#' @method detect.lindep plm 210#' @export 211detect.lindep.plm <- function(object, suppressPrint = FALSE, ...) { 212 if (!inherits(object, "plm")) { 213 stop("Input 'object' must be of class \"plm\"")} 214 215 return(detect.lindep.matrix(model.matrix(object), suppressPrint = suppressPrint, ...)) 216} 217 218 219### alias.plm, alias.pFormula 220# This is just a wrapper function to allow to apply the generic stats::alias on 221# plm objects and pFormulas with the _transformed data_ (the transformed model.matrix). 222# NB: arguments 'model' and 'effect' are not treated here. 223 224 225#' @rdname detect.lindep 226#' @export 227alias.plm <- function(object, ...) { 228 dots <- list(...) 229 if (!is.null(dots$inst.method)) stop("alias.plm/alias.pFormula: IV not supported") 230 if (length(formula(object))[2] == 2) stop("alias.plm/alias.pFormula: IV not supported") 231 232 # catch unsupported alias.lm args and convert 233 if (!is.null(dots[["partial"]])) { 234 if (dots[["partial"]]) { 235 dots[["partial"]] <- FALSE 236 warning("alias.plm/alias.pFormula: arg partial = TRUE not supported, changed to FALSE") 237 } 238 } 239 if (!is.null(dots[["partial.pattern"]])) { 240 if (dots[["partial.pattern"]]) { 241 dots[["partial.pattern"]] <- FALSE 242 warning("alias.plm/alias.pFormula: arg partial.pattern = TRUE not supported, changed to FALSE") 243 } 244 } 245 246 X <- model.matrix(object) 247 y <- pmodel.response(object) 248 249 lm.fit.obj <- lm.fit(X, y) 250 class(lm.fit.obj) <- "lm" 251 lm.fit.obj$terms <- deparse(object$formula) 252 253 ## use lm.fit rather than lm(): 254 ## could estimate lm model with lm(), but takes more resources and 255 ## need to remove extra classes "formula" for lm to prevent warning 256 # form <- object$formula 257 # form <- setdiff(class(form), c("pFormula", "Formula")) 258 # Xdf <- as.data.frame(X) 259 # ydf <- as.data.frame(y) 260 # names(ydf) <- names(object$model)[1] 261 # data <- cbind(ydf, Xdf) 262 # lmobj <- lm(form, data = data) 263 # return(stats::alias(lmobj)) 264 265 return(stats::alias(lm.fit.obj, ... = dots)) 266} 267 268## alias.pFormula <- function(object, data, 269## model = c("pooling", "within", "Between", "between", 270## "mean", "random", "fd"), 271## effect = c("individual", "time", "twoways"), 272## ...) { 273## dots <- list(...) 274## if (!is.null(dots$inst.method)) stop("alias.plm/alias.pFormula: IV not supported") 275## model <- match.arg(model) 276## effect <- match.arg(effect) 277## formula <- object 278 279## # check if object is already pFormula, try to convert if not 280## if (!inherits(formula, "pFormula")) formula <- pFormula(formula) 281 282## # check if data is already a model frame, convert to if not 283## if (is.null(attr(data, "terms"))) { 284## data <- model.frame.pFormula(pFormula(formula), data) 285## } 286 287## plmobj <- plm(formula, data = data, model = model, effect = effect, ...) 288## # print(summary(plmobj)) 289## return(alias(plmobj, ...)) 290## } 291 292 293#' @rdname detect.lindep 294#' @export 295alias.pdata.frame <- function(object, 296 model = c("pooling", "within", "Between", "between", 297 "mean", "random", "fd"), 298 effect = c("individual", "time", "twoways"), 299 ...) { 300 dots <- list(...) 301 if (!is.null(dots$inst.method)) stop("alias.plm/alias.pFormula: IV not supported") 302 model <- match.arg(model) 303 effect <- match.arg(effect) 304 # check if data is already a model frame, if not exit 305 if (is.null(attr(object, "terms"))) 306 stop("the argument must be a model.frame") 307 formula <- attr(object, "formula") 308 plmobj <- plm(formula, data = object, model = model, effect = effect, ...) 309 return(alias(plmobj, ...)) 310} 311