1## This file contains the relevant transformations used for panel data,
2## namely of course Within and between/Between, but also Sum (useful for
3## unbalanced panels).
4
5## They are all generics and have default, pseries and matrix
6## methods. The effect argument is an index vector for the default method
7## and a character ("individual", "time", "group", "twoways") for the
8## pseries method. It can be any of the two for the matrix method (the
9## second one only if the matrix argument has an index attribute
10
11## diff, lag and lead methods for pseries are also provided (lead is a
12## generic exported by plm, lag and diff being generic exported by
13## stats). All of them have a shift argument which can be either "time"
14## or "row".
15
16
17
18#' panel series
19#'
20#' A class for panel series for which several useful computations and
21#' data transformations are available.
22#'
23#' The functions `between`, `Between`, `Within`, and `Sum` perform specific
24#' data transformations, i. e., the between, within, and sum transformation,
25#' respectively.
26#'
27#' `between` returns a vector/matrix containing the individual means (over
28#' time) with the length of the vector equal to the number of
29#' individuals (if `effect = "individual"` (default); if `effect = "time"`,
30#' it returns the time means (over individuals)). `Between`
31#' duplicates the values and returns a vector/matrix which length/number of rows
32#' is the number of total observations. `Within` returns a vector/matrix
33#' containing the values in deviation from the individual means
34#' (if `effect = "individual"`, from time means if `effect = "time"`), the so
35#' called demeaned data. `Sum` returns a vector/matrix with sum per individual
36#' (over time) or the sum per time period (over individuals) with
37#' `effect = "individual"` or `effect = "time"`, respectively, and has length/
38#' number of rows of the total observations (like `Between`).
39#'
40#' For `between`, `Between`, `Within`, and `Sum` in presence of NA values it
41#' can be useful to supply `na.rm = TRUE` as an additional argument to
42#' keep as many observations as possible in the resulting transformation.
43#' na.rm is passed on to the mean()/sum() function used by these transformations
44#' (i.e., it does not remove NAs prior to any processing!), see also
45#' **Examples**.
46#'
47#' @name pseries
48#' @aliases pseries
49#' @param x,object a `pseries` or a matrix; or a `summary.pseries` object,
50#' @param effect for the pseries methods: character string indicating the
51#'     `"individual"`, `"time"`, or `"group"` effect, for `Within`
52#'     `"twoways"` additionally; for non-pseries methods, `effect` is a factor
53#'     specifying the dimension (`"twoways"` is not possible),
54#' @param idbyrow if `TRUE` in the `as.matrix` method, the lines of
55#'     the matrix are the individuals,
56#' @param rm.null if `TRUE`, for the `Within.matrix` method, remove
57#'     the empty columns,
58#' @param plot,scale,transparency,col,lwd plot arguments,
59#' @param \dots further arguments, e. g., `na.rm = TRUE` for
60#'     transformation functions like `beetween`, see **Details**
61#'     and **Examples**.
62#' @return All these functions return an object of class `pseries` or a matrix,
63#'     except:\cr `between`, which returns a numeric vector or a matrix;
64#'     `as.matrix`, which returns a matrix.
65#' @author Yves Croissant
66#' @seealso [is.pseries()] to check if an object is a pseries. For
67#'     more functions on class 'pseries' see [lag()], [lead()],
68#'     [diff()] for lagging values, leading values (negative lags) and
69#'     differencing.
70#' @keywords classes
71#' @examples
72#'
73#' # First, create a pdata.frame
74#' data("EmplUK", package = "plm")
75#' Em <- pdata.frame(EmplUK)
76#'
77#' # Then extract a series, which becomes additionally a pseries
78#' z <- Em$output
79#' class(z)
80#'
81#' # obtain the matrix representation
82#' as.matrix(z)
83#'
84#' # compute the between and within transformations
85#' between(z)
86#' Within(z)
87#'
88#' # Between and Sum replicate the values for each time observation
89#' Between(z)
90#' Sum(z)
91#'
92#' # between, Between, Within, and Sum transformations on other dimension
93#' between(z, effect = "time")
94#' Between(z, effect = "time")
95#' Within(z, effect = "time")
96#' Sum(z, effect = "time")
97#'
98#' # NA treatment for between, Between, Within, and Sum
99#' z2 <- z
100#' z2[length(z2)] <- NA # set last value to NA
101#' between(z2, na.rm = TRUE) # non-NA value for last individual
102#' Between(z2, na.rm = TRUE) # only the NA observation is lost
103#' Within(z2, na.rm = TRUE)  # only the NA observation is lost
104#' Sum(z2, na.rm = TRUE)     # only the NA observation is lost
105#'
106#' sum(is.na(Between(z2))) # 9 observations lost due to one NA value
107#' sum(is.na(Between(z2, na.rm = TRUE))) # only the NA observation is lost
108#' sum(is.na(Within(z2))) # 9 observations lost due to one NA value
109#' sum(is.na(Within(z2, na.rm = TRUE))) # only the NA observation is lost
110#' sum(is.na(Sum(z2))) # 9 observations lost due to one NA value
111#' sum(is.na(Sum(z2, na.rm = TRUE))) # only the NA observation is lost
112#'
113NULL
114
115
116
117#' @rdname pseries
118#' @export
119print.pseries <- function(x, ...){
120  x.orig <- x
121  attr(x, "index") <- NULL
122  attr(x, "class") <- base::setdiff(attr(x, "class"), "pseries")
123  if(length(attr(x, "class")) == 1L && class(x) %in% c("character", "logical", "numeric", "integer", "complex")) {
124    attr(x, "class") <- NULL
125  }
126  print(x, ...)
127  x.orig
128}
129
130#' @rdname pseries
131#' @export
132as.matrix.pseries <- function(x, idbyrow = TRUE, ...){
133    index <- unclass(attr(x, "index")) # unclass for speed
134    id <- index[[1L]]
135    time <- index[[2L]]
136    time.names <- levels(time)
137    x <- split(data.frame(x, time), id)
138    x <- lapply(x, function(x){
139        rownames(x) <- x[ , 2L]
140        x[ , -2L, drop = FALSE]
141    })
142    x <- lapply(x, function(x){
143        x <- x[time.names, , drop = FALSE]
144        rownames(x) <- time.names
145        x
146    }
147    )
148    id.names <- names(x)
149    x <- as.matrix(as.data.frame((x)))
150    colnames(x) <- id.names
151    if(idbyrow) x <- t(x)
152    x
153}
154
155## plots a panel series by time index
156##
157## can supply any panel function, e.g., a loess smoother
158## > mypanel<-function(x,...) {
159## + panel.xyplot(x,...)
160## + panel.loess(x, col="red", ...)}
161## >
162## > plot(pres(mod), panel=mypanel)
163
164#' @rdname pseries
165#' @importFrom lattice xyplot
166#' @export
167plot.pseries <- function(x, plot = c("lattice", "superposed"),
168                         scale = FALSE, transparency = TRUE,
169                         col = "blue", lwd = 1, ...) {
170
171    if(scale) {
172        scalefun <- function(x) scale(x)
173    } else {
174        scalefun <- function(x) return(x)}
175
176    nx <- as.numeric(x)
177    ind <- attr(x, "index")[[1L]]
178    tind <- attr(x, "index")[[2L]] # possibly as.numeric():
179                                   # activates autom. tick
180                                   # but loses time labels
181
182    xdata <- data.frame(nx = nx, ind = ind, tind = tind)
183
184    switch(match.arg(plot),
185           "lattice" = {
186               ##require(lattice) # make a ggplot2 version
187               xyplot(nx ~ tind | ind, data = xdata, type = "l", col = col, ...)
188           },
189           "superposed" = {
190               ylim <- c(min(tapply(scalefun(nx), ind, min, na.rm = TRUE)),
191                         max(tapply(scalefun(nx), ind, max, na.rm = TRUE)))
192               unind <- unique(ind)
193               nx1 <- nx[ind == unind[1L]]
194               tind1 <- as.numeric(tind[ind == unind[1L]])
195               ## plot empty plot to provide frame
196               plot(NA, xlim = c(min(as.numeric(tind)),
197                                 max(as.numeric(tind))),
198                    ylim = ylim, xlab = "", ylab = "", xaxt = "n", ...)
199               axis(1, at = as.numeric(unique(tind)),
200                    labels = unique(tind))
201
202               ## determine lwd and transparency level as a function
203               ## of n
204               if(transparency) {
205                   alpha <- 5 / length(unind)
206                   col <- heat.colors(1, alpha = alpha)
207                   lwd <- length(unind) / 10
208               }
209               ## plot lines (notice: tind. are factors, so they
210               ## retain the correct labels which would be lost if
211               ## using as.numeric
212               for(i in 1:length(unind)) {
213                   nxi <- nx[ind == unind[i]]
214                   tindi <- tind[ind == unind[i]]
215                   lines(x = tindi, y = scalefun(nxi),
216                         col = col, lwd = lwd, ...)
217               }
218           })
219}
220
221#' @rdname pseries
222#' @export
223summary.pseries <- function(object, ...) {
224    if(!inherits(object, c("factor", "logical", "character"))) {
225        index <- unclass(attr(object, "index")) # unclass for speed
226        id   <- index[[1L]]
227        time <- index[[2L]]
228        Bid   <- Between(object, na.rm = TRUE)
229        Btime <- Between(object, effect = "time", na.rm = TRUE)
230        ## res <- structure(c(total = sumsq(object),
231        ##                    between_id = sumsq(Bid),
232        ##                    between_time = sumsq(Btime)),
233        ##                  class = c("summary.pseries", "numeric"))
234        res <- structure(c(total        = sum( (na.omit(object) - mean(object, na.rm = TRUE)) ^ 2),
235                           between_id   = sum( (na.omit(Bid)    - mean(Bid,    na.rm = TRUE)) ^ 2),
236                           between_time = sum( (na.omit(Btime)  - mean(Btime,  na.rm = TRUE)) ^ 2)),
237                          class = c("summary.pseries", "numeric"))
238    } else {
239        class(object) <- setdiff(class(object), c("pseries"))
240        res <- summary(object, ...)
241        class(res) <- c("summary.pseries", class(object), class(res))
242    }
243    return(res)
244}
245
246#' @rdname pseries
247#' @export
248plot.summary.pseries <- function(x, ...){
249    x <- as.numeric(x)
250    share <- x[-1L]/x[1L] # vec with length == 2
251    names(share) <- c("id", "time")
252    barplot(share, ...)
253}
254
255#' @rdname pseries
256#' @export
257print.summary.pseries <- function(x, ...){
258    x.orig <- x
259    digits <- getOption("digits")
260    special_treatment_vars <- c("factor", "logical", "character")
261    if(!inherits(x, special_treatment_vars)) {
262        x <- as.numeric(x)
263        share <- x[-1L]/x[1L] # vec with length == 2
264        names(share) <- c("id", "time")
265        cat(paste("total sum of squares:", signif(x[1L], digits = digits),"\n"))
266        print.default(share, ...)
267    } else {
268        class(x) <- setdiff(class(x), c("summary.pseries", special_treatment_vars))
269        print(x, ...)
270    }
271    invisible(x.orig)
272}
273
274
275Tapply <- function(x, ...) {
276    UseMethod("Tapply")
277}
278
279myave <- function(x, ...) {
280  UseMethod("myave")
281}
282
283Tapply.default <- function(x, effect, func, ...) {
284    # argument 'effect' is assumed to be a factor
285    na.x <- is.na(x)
286    uniqval <- tapply(x, effect, func, ...)
287    nms <- attr(uniqval, "dimnames")[[1L]]
288    attr(uniqval, "dimnames") <- attr(uniqval, "dim") <- NULL
289    names(uniqval) <- nms
290    result <- uniqval[as.character(effect)]
291    result[na.x] <- NA
292    return(result)
293}
294
295#' @importFrom stats ave
296myave.default <- function(x, effect, func, ...) {
297  # argument 'effect' is assumed to be a factor
298  na.x <- is.na(x)
299  res <- ave(x, effect, FUN = function(x) func(x, ...))
300  names(res) <- as.character(effect)
301  res[na.x] <- NA
302  return(res)
303}
304
305Tapply.pseries <- function(x, effect = c("individual", "time", "group"), func, ...){
306    effect <- match.arg(effect)
307    xindex <- unclass(attr(x, "index")) # unclass for speed
308    checkNA.index(xindex) # index may not contain any NA
309    effect <- switch(effect,
310                     "individual"= xindex[[1L]],
311                     "time"      = xindex[[2L]],
312                     "group"     = xindex[[3L]]
313                     )
314    z <- as.numeric(x)
315    z <- Tapply.default(z, effect, func, ...)
316    attr(z, "index") <- attr(x, "index") # insert original index
317    class(z) <- c("pseries", class(z))
318    return(z)
319}
320
321myave.pseries <- function(x, effect = c("individual", "time", "group"), func, ...) {
322  effect <- match.arg(effect)
323  xindex <- unclass(attr(x, "index")) # unclass for speed
324  checkNA.index(xindex) # index may not contain any NA
325  eff.fac <- switch(effect,
326                   "individual"= xindex[[1L]],
327                   "time"      = xindex[[2L]],
328                   "group"     = xindex[[3L]]
329  )
330  z <- as.numeric(x)
331  z <- myave.default(z, eff.fac, func, ...)
332  attr(z, "index") <- attr(x, "index") # insert original index
333  class(z) <- c("pseries", class(z))
334  return(z)
335}
336
337Tapply.matrix <- function(x, effect, func, ...) {
338    # argument 'effect' is assumed to be a factor
339    na.x <- is.na(x)
340    uniqval <- apply(x, 2, tapply, effect, func, ...)
341    result <- uniqval[as.character(effect), , drop = FALSE]
342    result[na.x] <- NA_real_
343    return(result)
344}
345
346myave.matrix <- function(x, effect, func, ...) {
347    # argument 'effect' is assumed to be a factor
348    na.x <- is.na(x)
349    result <- apply(x, 2, FUN = function(x) ave(x, effect, FUN = function(y) func(y, ...)))
350    rownames(result) <- as.character(effect)
351    result[na.x] <- NA_real_
352    return(result)
353}
354
355## non-exported
356Mean <- function(x) matrix(.colMeans(x, nrow(x), ncol(x)),
357                           nrow(x), ncol(x), byrow = TRUE)
358
359#' @rdname pseries
360#' @export
361Sum <- function(x, ...) {
362    UseMethod("Sum")
363}
364
365#' @rdname pseries
366#' @export
367Sum.default <- function(x, effect, ...) {
368# print("Sum.default(.baseR)")
369# browser()
370
371    # argument 'effect' is assumed to be a factor
372    if(!is.numeric(x)) stop("The Sum function only applies to numeric vectors")
373    #   Tapply(x, effect, sum, ...)
374    return(myave(x, droplevels(effect), sum, ...))
375}
376
377#' @rdname pseries
378#' @export
379Sum.pseries <- function(x, effect = c("individual", "time", "group"), ...) {
380# print("Sum.pseries(.baseR)")
381# browser()
382
383    effect <- match.arg(effect)
384    #   Tapply(x, effect, sum, ...)
385    # myave.pseries takes care of checking the index for NAs
386    return(myave(x, effect, sum, ...))
387}
388
389#' @rdname pseries
390#' @export
391Sum.matrix <- function(x, effect, ...) {
392# print("Sum.matrix(.baseR)")
393# browser()
394
395  # if no index attribute, argument 'effect' is assumed to be a factor
396  eff.fac <- if(is.null(xindex <- attr(x, "index"))) {
397    droplevels(effect)
398  } else {
399    if(!is.character(effect) && length(effect) > 1L)
400      stop("for matrices with index attributes, the effect argument must be a character")
401    if(! effect %in% c("individual", "time", "group"))
402      stop("irrelevant effect for a between transformation")
403    eff.no <- switch(effect,
404                     "individual" = 1L,
405                     "time"       = 2L,
406                     "group"      = 3L,
407                     stop("unknown value of argument 'effect'"))
408    xindex <- unclass(xindex) # unclass for speed
409    checkNA.index(xindex) # index may not contain any NA
410    xindex[[eff.no]]
411  }
412  return(myave(x, eff.fac, sum, ...))
413}
414
415#' @rdname pseries
416#' @export
417Between <- function(x, ...) {
418    UseMethod("Between")
419}
420
421#' @rdname pseries
422#' @export
423Between.default <- function(x, effect, ...) {
424# print("Between.default(.baseR)")
425# browser()
426
427    # argument 'effect' is assumed to be a factor
428    if(!is.numeric(x)) stop("The Between function only applies to numeric vectors")
429    #   Tapply(x, effect, mean, ...)
430    return(myave(x, droplevels(effect), mean, ...))
431}
432
433#' @rdname pseries
434#' @export
435Between.pseries <- function(x, effect = c("individual", "time", "group"), ...) {
436# print("Between.pseries(.baseR)")
437# browser()
438
439    effect <- match.arg(effect)
440    #   Tapply(x, effect = effect, mean, ...)
441    # myave.pseries takes care of checking the index for NAs
442    return(myave(x, effect = effect, mean, ...))
443}
444
445#' @rdname pseries
446#' @export
447Between.matrix <- function(x, effect, ...) {
448# print("Between.matrix(.baseR)")
449# browser()
450
451  # if no index attribute, argument 'effect' is assumed to be a factor
452  eff.fac <- if(is.null(xindex <- attr(x, "index"))) {
453    droplevels(effect)
454  } else {
455    if(!is.character(effect) && length(effect) > 1L)
456      stop("for matrices with index attributes, the effect argument must be a character")
457    if(! effect %in% c("individual", "time", "group"))
458      stop("irrelevant effect for a between transformation")
459    eff.no <- switch(effect,
460                     "individual" = 1L,
461                     "time"       = 2L,
462                     "group"      = 3L,
463                     stop("unknown value of argument 'effect'"))
464    xindex <- unclass(xindex)
465    checkNA.index(xindex) # index may not contain any NA
466    xindex[[eff.no]]
467  }
468  return(myave.matrix(x, eff.fac, mean, ...))
469}
470
471#' @rdname pseries
472#' @export
473between <- function(x, ...) {
474    UseMethod("between")
475}
476
477#' @rdname pseries
478#' @export
479between.default <- function(x, effect, ...) {
480# print("between.default(.baseR)")
481# browser()
482
483    # argument 'effect' is assumed to be a factor
484    if(!is.numeric(x)) stop("The between function only applies to numeric vectors")
485
486    # use tapply here as tapply's output is sorted by levels factor effect (unlike ave's output)
487    # difference is only relevant for between (small "b") as data is compressed down to # levels
488    res <- tapply(x, droplevels(effect), mean, ...)
489    nms <- attr(res, "dimnames")[[1L]]
490    attr(res, "dimnames") <- attr(res, "dim") <- NULL
491    names(res) <- nms
492    return(res)
493}
494
495#' @rdname pseries
496#' @export
497between.pseries <- function(x, effect = c("individual", "time", "group"), ...) {
498# print("between.pseries(.baseR)")
499# browser()
500
501    effect <- match.arg(effect)
502    xindex <- unclass(attr(x, "index")) # unclass for speed
503    checkNA.index(xindex) # index may not contain any NA
504    eff.fac <- switch(effect,
505                     "individual" = xindex[[1L]],
506                     "time"       = xindex[[2L]],
507                     "group"      = xindex[[3L]],
508                     )
509    res <- between.default(x, effect = eff.fac, ...)
510    # data compressed by transformation, so pseries features, esp. index, do not make sense
511    res <- remove_pseries_features(res)
512    return(res)
513}
514
515#' @rdname pseries
516#' @export
517between.matrix <- function(x, effect, ...) {
518# print("between.matrix(.baseR)")
519# browser()
520
521  # if no index attribute, argument 'effect' is assumed to be a factor
522  eff.fac <- if(is.null(xindex <- attr(x, "index"))) {
523    droplevels(effect)
524  } else {
525    if(!is.character(effect) && length(effect) > 1L)
526      stop("for matrices with index attributes, the effect argument must be a character")
527    if(! effect %in% c("individual", "time", "group"))
528      stop("irrelevant effect for a between transformation")
529    eff.no <- switch(effect,
530                     "individual" = 1L,
531                     "time"       = 2L,
532                     "group"      = 3L,
533                     stop("unknown value of argument 'effect'"))
534    xindex <- unclass(xindex) # unclass for speed
535    checkNA.index(xindex) # index may not contain any NA
536    xindex[[eff.no]]
537  }
538
539  # use tapply here as tapply's output is sorted by levels factor effect (unlike ave's output)
540  # difference is only relevant for between (small "b") as data is compressed down to # levels
541  res <- apply(x, 2, tapply, eff.fac, mean, ...)
542  return(res)
543}
544
545#' @rdname pseries
546#' @export
547Within <- function(x, ...) {
548    UseMethod("Within")
549}
550
551#' @rdname pseries
552#' @export
553Within.default <- function(x, effect, ...) {
554# print("Within.default(.baseR)")
555# browser()
556
557  # arg 'effect' is assumed to be a factor
558
559  # NB: Contrary to the other Within.* methods, Within.default does not handle
560  #     twoways effects
561  # TODO: could add support for twoways by supplying a list containing two factors
562    if(!is.numeric(x)) stop("the within function only applies to numeric vectors")
563    return(x - Between(x, droplevels(effect), ...))
564}
565
566#' @rdname pseries
567#' @export
568Within.pseries <- function(x, effect = c("individual", "time", "group", "twoways"), ...) {
569# print("Within.pseries(.baseR)")
570# browser()
571
572    effect <- match.arg(effect)
573    xindex <- unclass(attr(x, "index")) # unclass for speed
574    checkNA.index(xindex) # index may not contain any NA
575    if(effect != "twoways") result <- x - Between(x, effect, ...)
576    else {
577        if(is.pbalanced(x)) result <- x - Between(x, "individual", ...) - Between(x, "time") + mean(x, ...)
578        else {
579            time <- xindex[[2L]]
580            Dmu <- model.matrix(~ time - 1)
581            attr(Dmu, "index") <- attr(x, "index") # need original index
582            W1   <- Within(x,   "individual", ...)
583            WDmu <- Within(Dmu, "individual", ...)
584            W2 <- lm.fit(WDmu, x)$fitted.values
585            result <- W1 - W2
586        }
587    }
588    return(result)
589}
590
591#' @rdname pseries
592#' @export
593Within.matrix <- function(x, effect, rm.null = TRUE, ...) {
594# print("Within.matrix(.baseR)")
595# browser()
596
597    if(is.null(xindex <- unclass(attr(x, "index")))) { # unclass for speed
598      # non-index case
599        result <- Within.default(x, effect, ...)
600        othervar <- colSums(abs(x)) > sqrt(.Machine$double.eps)
601        if(rm.null) {
602            result <- result[ , othervar, drop = FALSE]
603            attr(result, "constant") <- character(0)
604        }
605        else attr(result, "constant") <- colnames(x)[! othervar]
606        return(result)
607    }
608    else {
609      # index case
610        if(effect %in% c("individual", "time", "group")) result <- x - Between(x, effect, ...)
611        if(effect == "twoways") {
612            checkNA.index(xindex) # index may not contain any NA
613            if(is.pbalanced(xindex[[1L]], xindex[[2L]])) {
614                result <- x - Between(x, "individual", ...) - Between(x, "time", ...) +
615                    matrix(colMeans(x, ...), nrow = nrow(x), ncol = ncol(x), byrow = TRUE)
616            }
617            else { # unbalanced twoways
618                time <- xindex[[2L]]
619                Dmu <- model.matrix(~ time - 1)
620                attr(Dmu, "index") <- attr(x, "index") # need orig. index here
621                W1   <- Within(x,   "individual", rm.null = FALSE, ...)
622                WDmu <- Within(Dmu, "individual", ...)
623                W2 <- lm.fit(WDmu, x)$fitted.values
624                result <- W1 - W2
625            }
626        }
627    }
628    return(result)
629}
630
631############### LAG and DIFF
632#
633# lag/lead/diff for pseries are a wrappers for lagt, leadt, difft (if shift = "time") and
634#                                          for lagr, leadr, diffr (if shift = "row")
635#
636# The "t" and "r" methods are not exported (by intention).
637#
638# The "t" methods perform shifting while taking the time period into
639# account (they "look" at the value in the time dimension).
640#
641# The "r" methods perform shifting row-wise (without taking the value
642# in the time dimension into account).
643#
644# Generic needed only for lead (lag and diff generics are already included in base R)
645
646
647#' lag, lead, and diff for panel data
648#'
649#' lag, lead, and diff functions for class pseries.
650#'
651#' This set of functions perform lagging, leading (lagging in the
652#' opposite direction), and differencing operations on `pseries`
653#' objects, i. e., they take the panel structure of the data into
654#' account by performing the operations per individual.
655#'
656#' Argument `shift` controls the shifting of observations to be used
657#' by methods `lag`, `lead`, and `diff`:
658#'
659#' - `shift = "time"` (default): Methods respect the
660#' numerical value in the time dimension of the index. The time
661#' dimension needs to be interpretable as a sequence t, t+1, t+2,
662#' \ldots{} where t is an integer (from a technical viewpoint,
663#' `as.numeric(as.character(index(your_pdata.frame)[[2]]))` needs to
664#' result in a meaningful integer).
665#'
666#' - `shift = "row": `Methods perform the shifting operation based
667#' solely on the "physical position" of the observations,
668#' i.e., neighbouring rows are shifted per individual. The value in the
669#' time index is not relevant in this case.
670#'
671#' For consecutive time periods per individual, a switch of shifting
672#' behaviour results in no difference. Different return values will
673#' occur for non-consecutive time periods per individual
674#' ("holes in time"), see also Examples.
675#'
676#' @name lag.plm
677#' @aliases lag lead diff
678#' @importFrom stats lag
679#' @param x a `pseries` object,
680#' @param k an integer, the number of lags for the `lag` and `lead`
681#'     methods (can also be negative).  For the `lag` method, a
682#'     positive (negative) `k` gives lagged (leading) values.  For the
683#'     `lead` method, a positive (negative) `k` gives leading (lagged)
684#'     values, thus, `lag(x, k = -1L)` yields the same as `lead(x, k = 1L)`.
685#'     If `k` is an integer with length > 1 (`k = c(k1, k2, ...)`), a
686#'     `matrix` with multiple lagged `pseries` is returned,
687#' @param lag integer, the number of lags for the `diff` method, can also be of
688#'     length > 1 (see argument `k`) (only non--negative values in
689#'     argument `lag` are allowed for `diff`),
690#' @param shift character, either `"time"` (default) or `"row"`
691#'     determining how the shifting in the `lag`/`lead`/`diff`
692#'     functions is performed (see Details and Examples).
693#' @param ... further arguments (currently none evaluated).
694#' @return
695#'
696#' - An object of class `pseries`, if the argument specifying the lag
697#'     has length 1 (argument `k` in functions `lag` and `lead`,
698#'     argument `lag` in function `diff`).
699#'
700#' - A matrix containing the various series in its columns, if the
701#'     argument specifying the lag has length > 1.
702#'
703#' @note The sign of `k` in `lag.pseries` results in inverse behaviour
704#'     compared to [stats::lag()] and [zoo::lag.zoo()].
705#' @author Yves Croissant and Kevin Tappe
706#' @seealso To check if the time periods are consecutive per
707#'     individual, see [is.pconsecutive()].
708#'
709#' For further function for 'pseries' objects: [between()],
710#' [Between()], [Within()], [summary.pseries()],
711#' [print.summary.pseries()], [as.matrix.pseries()].
712#' @keywords classes
713#' @examples
714#'
715#' # First, create a pdata.frame
716#' data("EmplUK", package = "plm")
717#' Em <- pdata.frame(EmplUK)
718#'
719#' # Then extract a series, which becomes additionally a pseries
720#' z <- Em$output
721#' class(z)
722#'
723#' # compute the first and third lag, and the difference lagged twice
724#' lag(z)
725#' lag(z, 3L)
726#' diff(z, 2L)
727#'
728#' # compute negative lags (= leading values)
729#' lag(z, -1L)
730#' lead(z, 1L) # same as line above
731#' identical(lead(z, 1L), lag(z, -1L)) # TRUE
732#'
733#' # compute more than one lag and diff at once (matrix returned)
734#' lag(z, c(1L,2L))
735#' diff(z, c(1L,2L))
736#'
737#' ## demonstrate behaviour of shift = "time" vs. shift = "row"
738#' # delete 2nd time period for first individual (1978 is missing (not NA)):
739#' Em_hole <- Em[-2L, ]
740#' is.pconsecutive(Em_hole) # check: non-consecutive for 1st individual now
741#'
742#' # original non-consecutive data:
743#' head(Em_hole$emp, 10)
744#' # for shift = "time", 1-1979 contains the value of former 1-1977 (2 periods lagged):
745#' head(lag(Em_hole$emp, k = 2L, shift = "time"), 10L)
746#' # for shift = "row", 1-1979 contains NA (2 rows lagged (and no entry for 1976):
747#' head(lag(Em_hole$emp, k = 2L, shift = "row"), 10L)
748#'
749NULL
750
751#' @rdname lag.plm
752#' @export
753lead <- function(x, k = 1L, ...) {
754  UseMethod("lead")
755}
756
757#' @rdname lag.plm
758#' @exportS3Method
759#' @export lag
760lag.pseries <- function(x, k = 1L, shift = c("time", "row"), ...) {
761  shift <- match.arg(shift)
762  res <- if(shift == "time") lagt.pseries(x = x, k = k, ...) else lagr.pseries(x = x, k = k, ...)
763  return(res)
764}
765
766#' @rdname lag.plm
767#' @export
768lead.pseries <- function(x, k = 1L, shift = c("time", "row"), ...) {
769  shift <- match.arg(shift)
770  res <- if(shift == "time") leadt.pseries(x = x, k = k, ...) else leadr.pseries(x = x, k = k, ...)
771  return(res)
772}
773
774#' @rdname lag.plm
775#' @exportS3Method
776diff.pseries <- function(x, lag = 1L, shift = c("time", "row"), ...) {
777  shift <- match.arg(shift)
778  res <- if(shift == "time") difft.pseries(x = x, lag = lag, ...) else diffr.pseries(x = x, lag = lag, ...)
779  return(res)
780}
781
782## lagt.pseries lagging taking the time variable into account
783lagt.pseries <- function(x, k = 1L, ...) {
784  index <- unclass(attr(x, "index")) # unclass for speed
785  id <- index[[1L]]
786  time <- index[[2L]]
787
788  if(length(k) > 1L) {
789    rval <- sapply(k, function(i) alagt(x, i))
790    colnames(rval) <- k
791  }
792  else {
793    rval <- alagt(x, k)
794  }
795  return(rval)
796}
797
798## leadt.pseries(x, k) is a wrapper for lagt.pseries(x, -k)
799leadt.pseries <- function(x, k = 1L, ...) {
800  ret <- lagt.pseries(x, k = -k)
801  if(length(k) > 1L) colnames(ret) <- k
802  return(ret)
803}
804
805## difft: diff-ing taking the time variable into account
806difft.pseries <- function(x, lag = 1L, ...){
807  ## copied/adapted from diffr.pseries except lines which use lagt() ("t") instead of lagr() ("r")
808  islogi <- is.logical(x)
809  if(! (is.numeric(x) || islogi)) stop("diff is only relevant for numeric or logical series")
810
811  non.int <- vapply(lag, function(l) round(l) != l, FUN.VALUE = TRUE, USE.NAMES = FALSE)
812  if(any(non.int)) stop("Lagging value(s) in 'lag' must be whole-numbered (and non-negative)")
813
814  # prevent input of negative values, because it will most likely confuse users
815  # what difft would do in this case
816  neg <- vapply(lag, function(l) l < 0L, FUN.VALUE = TRUE, USE.NAMES = FALSE)
817  if(any(neg)) stop("diff is only relevant for non-negative values in 'lag'")
818
819  lagtx <- lagt.pseries(x, k = lag) # use "time-based" lagging for difft
820
821  if(is.matrix(lagtx)) {
822    # if 'lagtx' is matrix (case length(lag) > 1):
823    # perform subtraction without pseries feature of 'x', because otherwise
824    # the result would be c("pseries", "matrix") which is not supported
825    res <- as.numeric(x) - lagtx
826  } else {
827    res <- x - lagtx
828  }
829
830  return(res)
831}
832
833## alagt: non-exported helper function for lagt (actual work horse),
834## performs shifting of observations while respecting the time dimension
835alagt <- function(x, ak) {
836  if(round(ak) != ak) stop("Lagging value 'k' must be whole-numbered (positive, negative or zero)")
837  if(ak != 0) {
838    index <- unclass(attr(x, "index")) # unclass for speed
839    id   <- index[[1L]]
840    time <- index[[2L]]
841
842    # Idea: split times in blocks per individuals and do lagging there
843    # by computation of correct time shifting
844
845    # need to convert to numeric, do this by coercing to character
846    # first (otherwise wrong results!)
847    #  see R FAQ 7.10 for coercing factors to numeric:
848    #      as.numeric(levels(factor_var))[as.integer(factor_var)] is
849    #      more efficient than
850    #      as.numeric(as.character(factor_var))
851
852    # YC 2019/08/29 only works if time values can be coerced to
853    ## numeric, ie integers like years. When year is period (ie 5 years),
854    ## values used to be 1950 for the 1950-54 period, time is now a
855    ## factor in the original data.frame with levels "1950-54",
856    ## "1955-59", ... In this case coercing the levels to a numeric gives
857    ## NA so coerce the *factor* to a numeric.
858
859    levtime <- levels(time)
860    numlevtime <- suppressWarnings(as.numeric(levtime))
861    if(! anyNA(numlevtime)) time <- as.numeric(levels(time))[as.integer(time)]
862    else time <- as.numeric(time)
863
864    list_id_timevar <- split(time, id, drop = TRUE)
865
866    index_lag_ak_all_list <- sapply(list_id_timevar,
867                                    function(x) match(x - ak, x, incomparables = NA),
868                                    simplify = FALSE, USE.NAMES = FALSE)
869
870    # translate block-wise positions to positions in full vector
871    index_lag_ak_all <- unlist(index_lag_ak_all_list, use.names = FALSE)
872
873    NApos <- is.na(index_lag_ak_all) # save NA positions for later
874    substitute_blockwise <- index_lag_ak_all
875
876    block_lengths <- vapply(index_lag_ak_all_list, length, FUN.VALUE = 0.0, USE.NAMES = FALSE)
877
878    # not needed but leave here for illustration:
879    #    startpos_block <- cumsum(block_lengths) - block_lengths + 1
880    #    endpos_block <- startpos_block + block_lengths - 1
881
882    indexes_blockwise <- unlist(sapply(block_lengths, function(x) seq(from = 1, to = x), simplify = FALSE), use.names = FALSE)
883
884    orig_pos_x <- seq.int(x) # make vector with indexes for original input
885    new_pos <- orig_pos_x - (indexes_blockwise - substitute_blockwise) # calc. new positions
886    new_pos[NApos] <- orig_pos_x[NApos] # fill NAs with arbitrary values to allow proper subsetting in next step
887
888    orig_attr <- attributes(x)
889    x <- x[new_pos] # re-arrange according to lagging
890    x[NApos] <- NA  # set NAs where necessary
891    attributes(x) <- orig_attr # restore original names and 'pseries' class (lost by subsetting x)
892  }
893  return(x)
894} # END alagt
895
896
897## lagr: lagging row-wise
898lagr.pseries <- function(x, k = 1L, ...) {
899    index <- unclass(attr(x, "index")) # unclass for speed
900    id <- index[[1L]]
901    time <- index[[2L]]
902
903    # catch the case when an index of pdata.frame shall be lagged
904    # (index variables are always factors) NB: this catches -
905    # unintentionally - also the case when a factor variable is the
906    # same "on the character level" as one of the corresponding index
907    # variables but not the index variable itself
908    #
909    # -> shall we prevent lagging of index variables at all? -> turned
910    # off for now, 2016-03-03 if(is.factor(x)) if
911    # (all(as.character(x) == as.character(id)) |
912    # all(as.character(x)==as.character(time))) stop("Lagged vector
913    # cannot be index.")
914
915    alagr <- function(x, ak){
916        if(round(ak) != ak) stop("Lagging value 'k' must be whole-numbered (positive, negative or zero)")
917        if(ak > 0L) {
918
919        # NB: this code does row-wise shifting
920        # delete first ak observations for each unit
921            isNAtime <- c(rep(TRUE, ak), (diff(as.numeric(time), lag = ak) != ak))
922            isNAid   <- c(rep(TRUE, ak), (diff(as.numeric(id),   lag = ak) != 0L))
923            isNA <- (isNAtime | isNAid)
924
925            result <- x                                             # copy x first ...
926            result[1:ak] <- NA                                      # ... then make first ak obs NA ...
927            result[(ak+1):length(result)] <- x[1:(length(x)-ak)]    # ... shift and ...
928            result[isNA] <- NA                                      # ... make more NAs in between: this way, we keep: all factor levels, names, classes
929
930        } else if(ak < 0L) { # => compute leading values
931
932        # delete last |ak| observations for each unit
933            num_time <- as.numeric(time)
934            num_id   <- as.numeric(id)
935            isNAtime <- c(c((num_time[1:(length(num_time)+ak)] - num_time[(-ak+1):length(num_time)]) != ak), rep(TRUE, -ak))
936            isNAid   <- c(c((num_id[1:(length(num_id)+ak)]     - num_id[(-ak+1):length(num_id)])     != 0L), rep(TRUE, -ak))
937            isNA <- (isNAtime | isNAid)
938
939            result <- x                                            # copy x first ...
940            result[(length(result)+ak+1):length(result)] <- NA     # ... then make last |ak| obs NA ...
941            result[1:(length(result)+ak)] <- x[(1-ak):(length(x))] # ... shift and ...
942            result[isNA] <- NA                                     # ... make more NAs in between: this way, we keep: all factor levels, names, classes
943
944        } else { # ak == 0 => nothing to do, return original pseries (no lagging/no leading)
945            result <- x
946        }
947
948        return(result)
949    } # END function alagr
950
951    if(length(k) > 1L) {
952        rval <- sapply(k, function(i) alagr(x, i))
953        colnames(rval) <- k
954    }
955    else {
956        rval <- alagr(x, k)
957    }
958    return(rval)
959}
960
961
962# leadr.pseries(x, k) is a wrapper for lagr.pseries(x, -k)
963leadr.pseries <- function(x, k = 1L, ...) {
964    ret <- lagr.pseries(x, k = -k)
965    if(length(k) > 1L) colnames(ret) <- k
966    return(ret)
967}
968
969## diffr: lagging row-wise
970diffr.pseries <- function(x, lag = 1L, ...) {
971    islogi <- is.logical(x)
972    if(! (is.numeric(x) || islogi)) stop("diff is only relevant for numeric or logical series")
973
974    non.int <- vapply(lag, function(l) round(l) != l, FUN.VALUE = TRUE, USE.NAMES = FALSE)
975    if(any(non.int)) stop("Lagging value(s) in 'lag' must be whole-numbered (and non-negative)")
976
977    # prevent input of negative values, because it will most likely confuse users
978    # what diff would do in this case
979    neg <- vapply(lag, function(l) l < 0L, FUN.VALUE = TRUE, USE.NAMES = FALSE)
980    if(any(neg)) stop("diff is only relevant for non-negative values in 'lag'")
981
982    lagrx <- lagr.pseries(x, k = lag)
983
984    if(is.matrix(lagrx)) {
985      # if 'lagrx' is matrix (case length(lag) > 1):
986      # perform subtraction without pseries feature of 'x', because otherwise
987      # the result would be c("pseries", "matrix") which is not supported
988      res <- as.numeric(x) - lagrx
989    } else {
990      res <- x - lagrx
991    }
992    return(res)
993}
994
995## pdiff is (only) used in model.matrix.pFormula to calculate the
996## model.matrix for FD models, works for effect = "individual" only,
997## see model.matrix on how to call pdiff. Result is in order (id,
998## time) for both effects
999##
1000## Performs row-wise shifting
1001pdiff <- function(x, effect = c("individual", "time"), has.intercept = FALSE){
1002  # NB: x is assumed to have an index attribute, e.g., a pseries
1003  #     can check with has.index(x)
1004    effect <- match.arg(effect)
1005    cond <- as.numeric(unclass(attr(x, "index"))[[1L]]) # unclass for speed
1006    n <- if(is.matrix(x)) nrow(x) else length(x)
1007    cond <- c(NA, cond[2:n] - cond[1:(n-1)]) # this assumes a certain ordering
1008    cond[cond != 0] <- NA
1009    if(! is.matrix(x)){
1010        result <- c(NA , x[2:n] - x[1:(n-1)])
1011        result[is.na(cond)] <- NA
1012        result <- na.omit(result)
1013    }
1014    else{
1015        result <- rbind(NA, x[2:n, , drop = FALSE] - x[1:(n-1), , drop = FALSE])
1016        result[is.na(cond), ] <- NA
1017        result <- na.omit(result)
1018        result <- result[ , apply(result, 2, var) > 1E-12, drop = FALSE]
1019        if(has.intercept){
1020            result <- cbind(1, result)
1021            colnames(result)[1L] <- "(Intercept)"
1022        }
1023    }
1024    attr(result, "na.action") <- NULL
1025    result
1026}
1027
1028