1# functions to aid in detecting linear dependent columns in the (transformed)
2# model matrix or estimated plm models:
3#  * detect.lindep
4#  * alias (the latter is a wrapper around alias.lm)
5#  * detect_lin_dep for backward compatibility
6#
7# doc file provides an extensive example how linear dependence can arise after
8# the data transformation, e. g., for within transformation
9
10### detect.lindep.matrix, .data.frame, .plm
11
12
13
14
15#' Functions to detect linear dependence
16#'
17#' Little helper functions to aid users to detect linear dependent columns in a
18#' two-dimensional data structure, especially in a (transformed) model matrix -
19#' typically useful in interactive mode during model building phase.
20#'
21#'
22#' Linear dependence of columns/variables is (usually) readily avoided when
23#' building one's model.  However, linear dependence is sometimes not obvious
24#' and harder to detect for less experienced applied statisticians. The so
25#' called "dummy variable trap" is a common and probably the best--known
26#' fallacy of this kind (see e. g. Wooldridge (2016), sec. 7-2.). When building
27#' linear models with `lm` or `plm`'s `pooling` model, linear
28#' dependence in one's model is easily detected, at times post hoc.
29#'
30#' However, linear dependence might also occur after some transformations of
31#' the data, albeit it is not present in the untransformed data. The within
32#' transformation (also called fixed effect transformation) used in the
33#' `"within"` model can result in such linear dependence and this is
34#' harder to come to mind when building a model. See **Examples** for two
35#' examples of linear dependent columns after the within transformation: ex. 1)
36#' the transformed variables have the opposite sign of one another; ex. 2) the
37#' transformed variables are identical.
38#'
39#' During `plm`'s model estimation, linear dependent columns and their
40#' corresponding coefficients in the resulting object are silently dropped,
41#' while the corresponding model frame and model matrix still contain the
42#' affected columns.  The plm object contains an element `aliased` which
43#' indicates any such aliased coefficients by a named logical.
44#'
45#' Both functions, `detect.lindep` and `alias`, help to
46#' detect linear dependence and accomplish almost the same:
47#' `detect.lindep` is a stand alone implementation while
48#' `alias` is a wrapper around
49#' [stats::alias.lm()], extending the `alias`
50#' generic to classes `"plm"` and `"pdata.frame"`.
51#' `alias` hinges on the availability of the package
52#' \CRANpkg{MASS} on the system. Not all arguments of `alias.lm`
53#' are supported.  Output of `alias` is more informative as it
54#' gives the linear combination of dependent columns (after data
55#' transformations, i. e. after (quasi)-demeaning) while
56#' `detect.lindep` only gives columns involved in the linear
57#' dependence in a simple format (thus being more suited for automatic
58#' post--processing of the information).
59#'
60#' @aliases detect.lindep
61#' @param object for `detect.lindep`: an object which should be checked
62#' for linear dependence (of class `"matrix"`, `"data.frame"`, or
63#' `"plm"`); for `alias`: either an estimated model of class
64#' `"plm"` or a `"pdata.frame"`. Usually, one wants to input a model
65#' matrix here or check an already estimated plm model,
66#' @param suppressPrint for `detect.lindep` only: logical indicating
67#' whether a message shall be printed; defaults to printing the message, i. e.
68#' to `suppressPrint = FALSE`,
69#' @param model (see `plm`),
70#' @param effect (see `plm`),
71#' @param \dots further arguments.
72#' @return For `detect.lindep`: A named numeric vector containing column
73#' numbers of the linear dependent columns in the object after data
74#' transformation, if any are present. `NULL` if no linear dependent
75#' columns are detected.
76#'
77#' For `alias`: return value of [stats::alias.lm()] run on the
78#' (quasi-)demeaned model, i. e. the information outputted applies to
79#' the transformed model matrix, not the original data.
80#' @note function `detect.lindep` was called `detect_lin_dep`
81#'     initially but renamed for naming consistency later with a
82#'     back-compatible solution.
83#' @export
84#' @author Kevin Tappe
85#' @seealso [stats::alias()], [stats::model.matrix()] and especially
86#'     `plm`'s [model.matrix()] for (transformed) model matrices,
87#'     plm's [model.frame()].
88#' @references
89#'
90#' \insertRef{WOOL:13}{plm}
91#'
92#' @keywords manip array
93#' @examples
94#'
95#' ### Example 1 ###
96#' # prepare the data
97#' data("Cigar" , package = "plm")
98#' Cigar[ , "fact1"] <- c(0,1)
99#' Cigar[ , "fact2"] <- c(1,0)
100#' Cigar.p <- pdata.frame(Cigar)
101#'
102#' # setup a formula and a model frame
103#' form <- price ~ 0 + cpi + fact1 + fact2
104#' mf <- model.frame(Cigar.p, form)
105#' # no linear dependence in the pooling model's model matrix
106#' # (with intercept in the formula, there would be linear depedence)
107#' detect.lindep(model.matrix(mf, model = "pooling"))
108#' # linear dependence present in the FE transformed model matrix
109#' modmat_FE <- model.matrix(mf, model = "within")
110#' detect.lindep(modmat_FE)
111#' mod_FE <- plm(form, data = Cigar.p, model = "within")
112#' detect.lindep(mod_FE)
113#' alias(mod_FE) # => fact1 == -1*fact2
114#' plm(form, data = mf, model = "within")$aliased # "fact2" indicated as aliased
115#'
116#' # look at the data: after FE transformation fact1 == -1*fact2
117#' head(modmat_FE)
118#' all.equal(modmat_FE[ , "fact1"], -1*modmat_FE[ , "fact2"])
119#'
120#' ### Example 2 ###
121#' # Setup the data:
122#' # Assume CEOs stay with the firms of the Grunfeld data
123#' # for the firm's entire lifetime and assume some fictional
124#' # data about CEO tenure and age in year 1935 (first observation
125#' # in the data set) to be at 1 to 10 years and 38 to 55 years, respectively.
126#' # => CEO tenure and CEO age increase by same value (+1 year per year).
127#' data("Grunfeld", package = "plm")
128#' set.seed(42)
129#' # add fictional data
130#' Grunfeld$CEOtenure <- c(replicate(10, seq(from=s<-sample(1:10,  1), to=s+19, by=1)))
131#' Grunfeld$CEOage    <- c(replicate(10, seq(from=s<-sample(38:65, 1), to=s+19, by=1)))
132#'
133#' # look at the data
134#' head(Grunfeld, 50)
135#'
136#' form <- inv ~ value + capital + CEOtenure + CEOage
137#' mf <- model.frame(pdata.frame(Grunfeld), form)
138#' # no linear dependent columns in original data/pooling model
139#' modmat_pool <- model.matrix(mf, model="pooling")
140#' detect.lindep(modmat_pool)
141#' mod_pool <- plm(form, data = Grunfeld, model = "pooling")
142#' alias(mod_pool)
143#'
144#' # CEOtenure and CEOage are linear dependent after FE transformation
145#' # (demeaning per individual)
146#' modmat_FE <- model.matrix(mf, model="within")
147#' detect.lindep(modmat_FE)
148#' mod_FE <- plm(form, data = Grunfeld, model = "within")
149#' detect.lindep(mod_FE)
150#' alias(mod_FE)
151#'
152#' # look at the transformed data: after FE transformation CEOtenure == 1*CEOage
153#' head(modmat_FE, 50)
154#' all.equal(modmat_FE[ , "CEOtenure"], modmat_FE[ , "CEOage"])
155#'
156detect.lindep <- function(object, ...) {
157  UseMethod("detect.lindep")
158}
159
160#' @rdname plm-deprecated
161#' @export
162detect_lin_dep <- function(object, ...) {
163  .Deprecated(old = "detect_lin_dep", new = "detect.lindep",
164              msg = "Function name 'detect_lin_dep' deprecated, please use 'detect.lindep'")
165  UseMethod("detect.lindep")
166}
167
168#' @rdname detect.lindep
169#' @method detect.lindep matrix
170#' @export
171detect.lindep.matrix <- function(object, suppressPrint = FALSE, ...) {
172  if (!inherits(object, "matrix")) {
173    stop("Input 'object' must be a matrix. Presumably, one wants a model matrix
174         generated by some 'model.matrix' function.")}
175
176  # do rank reduction to detect lin. dep. columns
177  rank_rec <- sapply(1:ncol(object), function(col) qr(object[ , -col])$rank)
178
179  if (diff(range(rank_rec)) == 0) {
180    num <- NULL # return NULL if there is no linear dep.
181  } else {
182    num <- which(rank_rec == max(rank_rec))
183    names(num) <- colnames(object)[num]
184  }
185
186  if(!suppressPrint) {
187    if(is.null(num)) {
188      print("No linear dependent column(s) detected.")
189    } else {
190      print(paste0("Suspicious column number(s): ", paste(num,        collapse = ", ")))
191      print(paste0("Suspicious column name(s):   ", paste(names(num), collapse = ", ")))
192    }
193    return(invisible(num))
194  }
195  return(num)
196}
197
198#' @rdname detect.lindep
199#' @method detect.lindep data.frame
200#' @export
201detect.lindep.data.frame <- function(object, suppressPrint = FALSE, ...) {
202  if (!inherits(object, "data.frame")) {
203    stop("Input 'object' must be a data.frame")}
204
205  return(detect.lindep.matrix(as.matrix(object), suppressPrint = suppressPrint, ...))
206}
207
208#' @rdname detect.lindep
209#' @method detect.lindep plm
210#' @export
211detect.lindep.plm <- function(object, suppressPrint = FALSE, ...) {
212  if (!inherits(object, "plm")) {
213    stop("Input 'object' must be of class \"plm\"")}
214
215  return(detect.lindep.matrix(model.matrix(object), suppressPrint = suppressPrint, ...))
216}
217
218
219### alias.plm, alias.pFormula
220# This is just a wrapper function to allow to apply the generic stats::alias on
221# plm objects and pFormulas with the _transformed data_ (the transformed model.matrix).
222# NB: arguments 'model' and 'effect' are not treated here.
223
224
225#' @rdname detect.lindep
226#' @export
227alias.plm <- function(object, ...) {
228  dots <- list(...)
229  if (!is.null(dots$inst.method)) stop("alias.plm/alias.pFormula: IV not supported")
230  if (length(formula(object))[2] == 2) stop("alias.plm/alias.pFormula: IV not supported")
231
232  # catch unsupported alias.lm args and convert
233  if (!is.null(dots[["partial"]])) {
234    if (dots[["partial"]]) {
235      dots[["partial"]] <- FALSE
236      warning("alias.plm/alias.pFormula: arg partial = TRUE not supported, changed to FALSE")
237    }
238  }
239  if (!is.null(dots[["partial.pattern"]])) {
240    if (dots[["partial.pattern"]]) {
241      dots[["partial.pattern"]] <- FALSE
242      warning("alias.plm/alias.pFormula: arg partial.pattern = TRUE not supported, changed to FALSE")
243    }
244  }
245
246  X <- model.matrix(object)
247  y <- pmodel.response(object)
248
249  lm.fit.obj <- lm.fit(X, y)
250  class(lm.fit.obj) <- "lm"
251  lm.fit.obj$terms <- deparse(object$formula)
252
253  ## use lm.fit rather than lm():
254  ## could estimate lm model with lm(), but takes more resources and
255  ## need to remove extra classes "formula" for lm to prevent warning
256  # form <- object$formula
257  # form <- setdiff(class(form), c("pFormula", "Formula"))
258  # Xdf <- as.data.frame(X)
259  # ydf <- as.data.frame(y)
260  # names(ydf) <- names(object$model)[1]
261  # data <- cbind(ydf, Xdf)
262  # lmobj <- lm(form, data = data)
263  # return(stats::alias(lmobj))
264
265  return(stats::alias(lm.fit.obj, ... = dots))
266}
267
268## alias.pFormula <- function(object, data,
269##                            model = c("pooling", "within", "Between", "between",
270##                                      "mean", "random", "fd"),
271##                            effect = c("individual", "time", "twoways"),
272##                            ...) {
273##   dots <- list(...)
274##   if (!is.null(dots$inst.method)) stop("alias.plm/alias.pFormula: IV not supported")
275##   model <- match.arg(model)
276##   effect <- match.arg(effect)
277##   formula <- object
278
279##   # check if object is already pFormula, try to convert if not
280##   if (!inherits(formula, "pFormula")) formula <- pFormula(formula)
281
282##   # check if data is already a model frame, convert to if not
283##   if (is.null(attr(data, "terms"))) {
284##     data <- model.frame.pFormula(pFormula(formula), data)
285##   }
286
287##   plmobj <- plm(formula, data = data, model = model, effect = effect, ...)
288## #  print(summary(plmobj))
289##   return(alias(plmobj, ...))
290## }
291
292
293#' @rdname detect.lindep
294#' @export
295alias.pdata.frame <- function(object,
296                              model = c("pooling", "within", "Between", "between",
297                                        "mean", "random", "fd"),
298                              effect = c("individual", "time", "twoways"),
299                              ...) {
300    dots <- list(...)
301    if (!is.null(dots$inst.method)) stop("alias.plm/alias.pFormula: IV not supported")
302    model <- match.arg(model)
303    effect <- match.arg(effect)
304      # check if data is already a model frame, if not exit
305    if (is.null(attr(object, "terms")))
306        stop("the argument must be a model.frame")
307    formula <- attr(object, "formula")
308    plmobj <- plm(formula, data = object, model = model, effect = effect, ...)
309    return(alias(plmobj, ...))
310}
311