1########### is.pconsecutive ##############
2# little helper function to determine if the time periods of an object are consecutive per id.
3# By consecutive we mean "consecutive in the numbers", i.e., is.pconsecutive takes the numerical
4# value of the time variable into account: t, t+1, t+2, ... where t is an integer
5#
6# For this, we need as.numeric(as.character(time_var)) where as.character is a crucial part!
7# Equivalent but more efficient is as.numeric(levels(id_timevar))[as.integer(id_timevar)]
8# (see R FAQ 7.10 for coercing factors to numeric]
9# and the coerction of time_var in this manner needs to be meaningful numbers.
10#
11# see also in separate file make.pconsecutive.R:
12#   * make.pconsecutive
13#   * make.pbalanced
14
15
16#' Check if time periods are consecutive
17#'
18#' This function checks for each individual if its associated time periods are
19#' consecutive (no "gaps" in time dimension per individual)
20#'
21#' (p)data.frame, pseries and estimated panelmodel objects can be tested if
22#' their time periods are consecutive per individual.  For evaluation of
23#' consecutiveness, the time dimension is interpreted to be numeric, and the
24#' data are tested for being a regularly spaced sequence with distance 1
25#' between the time periods for each individual (for each individual the time
26#' dimension can be interpreted as sequence t, t+1, t+2, \ldots{} where t is an
27#' integer). As such, the "numerical content" of the time index variable is
28#' considered for consecutiveness, not the "physical position" of the various
29#' observations for an individuals in the (p)data.frame/pseries (it is not
30#' about "neighbouring" rows). If the object to be evaluated is a pseries or a
31#' pdata.frame, the time index is coerced from factor via as.character to
32#' numeric, i.e., the series
33#' `as.numeric(as.character(index(<pseries/pdata.frame>)[[2]]))]` is
34#' evaluated for gaps.
35#'
36#' The default method also works for argument `x` being an arbitrary
37#' vector (see **Examples**), provided one can supply arguments `id`
38#' and `time`, which need to ordered as stacked time series. As only
39#' `id` and `time` are really necessary for the default method to
40#' evaluate the consecutiveness, `x = NULL` is also possible. However, if
41#' the vector `x` is also supplied, additional input checking for equality
42#' of the lengths of `x`, `id` and `time` is performed, which is
43#' safer.
44#'
45#' For the data.frame interface, the data is ordered in the appropriate way
46#' (stacked time series) before the consecutiveness is evaluated. For the
47#' pdata.frame and pseries interface, ordering is not performed because both
48#' data types are already ordered in the appropriate way when created.
49#'
50#' Note: Only the presence of the time period itself in the object is tested,
51#' not if there are any other variables.  `NA` values in individual index
52#' are not examined but silently dropped - In this case, it is not clear which
53#' individual is meant by id value `NA`, thus no statement about
54#' consecutiveness of time periods for those "`NA`-individuals" is
55#' possible.
56#'
57#' @name is.pconsecutive
58#' @aliases is.pconsecutive
59#' @param x usually, an object of class `pdata.frame`,
60#'     `data.frame`, `pseries`, or an estimated
61#'     `panelmodel`; for the default method `x` can also be
62#'     an arbitrary vector or `NULL`, see **Details**,
63#' @param na.rm.tindex logical indicating whether any `NA` values
64#'     in the time index are removed before consecutiveness is
65#'     evaluated (defaults to `FALSE`),
66#' @param index only relevant for `data.frame` interface; if
67#'     `NULL`, the first two columns of the data.frame are
68#'     assumed to be the index variables; if not `NULL`, both
69#'     dimensions ('individual', 'time') need to be specified by
70#'     `index` for `is.pconsecutive` on data frames, for
71#'     further details see [pdata.frame()],
72#' @param id,time only relevant for default method: vectors specifying
73#'     the id and time dimensions, i. e. a sequence of individual and
74#'     time identifiers, each as stacked time series,
75#' @param \dots further arguments.
76#' @return A named `logical` vector (names are those of the
77#'     individuals). The i-th element of the returned vector
78#'     corresponds to the i-th individual. The values of the i-th
79#'     element can be: \item{TRUE}{if the i-th individual has
80#'     consecutive time periods,} \item{FALSE}{if the i-th
81#'     individual has non-consecutive time periods,}
82#'     \item{"NA"}{if there are any NA values in time index of
83#'     the i-th the individual; see also argument `na.rm.tindex`
84#'     to remove those.}
85#' @export
86#' @author Kevin Tappe
87#' @seealso [make.pconsecutive()] to make data consecutive
88#'     (and, as an option, balanced at the same time) and
89#'     [make.pbalanced()] to make data balanced.\cr
90#'     [pdim()] to check the dimensions of a 'pdata.frame'
91#'     (and other objects), [pvar()] to check for individual
92#'     and time variation of a 'pdata.frame' (and other objects),
93#'     [lag()] for lagged (and leading) values of a
94#'     'pseries' object.\cr
95#'
96#' [pseries()], [data.frame()], [pdata.frame()],
97#' for class 'panelmodel' see [plm()] and [pgmm()].
98#' @keywords attribute
99#' @examples
100#'
101#' data("Grunfeld", package = "plm")
102#' is.pconsecutive(Grunfeld)
103#' is.pconsecutive(Grunfeld, index=c("firm", "year"))
104#'
105#' # delete 2nd row (2nd time period for first individual)
106#' # -> non consecutive
107#' Grunfeld_missing_period <- Grunfeld[-2, ]
108#' is.pconsecutive(Grunfeld_missing_period)
109#' all(is.pconsecutive(Grunfeld_missing_period)) # FALSE
110#'
111#' # delete rows 1 and 2 (1st and 2nd time period for first individual)
112#' # -> consecutive
113#' Grunfeld_missing_period_other <- Grunfeld[-c(1,2), ]
114#' is.pconsecutive(Grunfeld_missing_period_other) # all TRUE
115#'
116#' # delete year 1937 (3rd period) for _all_ individuals
117#' Grunfeld_wo_1937 <- Grunfeld[Grunfeld$year != 1937, ]
118#' is.pconsecutive(Grunfeld_wo_1937) # all FALSE
119#'
120#' # pdata.frame interface
121#' pGrunfeld <- pdata.frame(Grunfeld)
122#' pGrunfeld_missing_period <- pdata.frame(Grunfeld_missing_period)
123#' is.pconsecutive(pGrunfeld) # all TRUE
124#' is.pconsecutive(pGrunfeld_missing_period) # first FALSE, others TRUE
125#'
126#'
127#' # panelmodel interface (first, estimate some models)
128#' mod_pGrunfeld <- plm(inv ~ value + capital, data = Grunfeld)
129#' mod_pGrunfeld_missing_period <- plm(inv ~ value + capital, data = Grunfeld_missing_period)
130#'
131#' is.pconsecutive(mod_pGrunfeld)
132#' is.pconsecutive(mod_pGrunfeld_missing_period)
133#'
134#' nobs(mod_pGrunfeld) # 200
135#' nobs(mod_pGrunfeld_missing_period) # 199
136#'
137#'
138#' # pseries interface
139#' pinv <- pGrunfeld$inv
140#' pinv_missing_period <- pGrunfeld_missing_period$inv
141#'
142#' is.pconsecutive(pinv)
143#' is.pconsecutive(pinv_missing_period)
144#'
145#' # default method for arbitrary vectors or NULL
146#' inv <- Grunfeld$inv
147#' inv_missing_period <- Grunfeld_missing_period$inv
148#' is.pconsecutive(inv, id = Grunfeld$firm, time = Grunfeld$year)
149#' is.pconsecutive(inv_missing_period, id = Grunfeld_missing_period$firm,
150#'                                     time = Grunfeld_missing_period$year)
151#'
152#' # (not run) demonstrate mismatch lengths of x, id, time
153#' # is.pconsecutive(x = inv_missing_period, id = Grunfeld$firm, time = Grunfeld$year)
154#'
155#' # only id and time are needed for evaluation
156#' is.pconsecutive(NULL, id = Grunfeld$firm, time = Grunfeld$year)
157#'
158is.pconsecutive <- function(x, ...){
159  UseMethod("is.pconsecutive")
160}
161
162#' @rdname is.pconsecutive
163#' @export
164is.pconsecutive.default <- function(x, id, time, na.rm.tindex = FALSE, ...) {
165  # argument 'x' just used for input check (if it is not NULL and is a vector)
166
167  # input checks
168  if(length(id) != length(time))
169    stop(paste0("arguments 'id' and 'time' must have same length: length(id): ", length(id), ", length(time) ", length(time)))
170
171  if(!is.null(x) && is.vector(x)) { # is.vector could be too strict? factor is not a vector
172    if(!(length(x) == length(id) && length(x) == length(time) && length(id) == length(time)))
173      stop(paste0("arguments 'x', 'id', 'time' must have same length: length(x): ",
174                  length(x), ", length(id): ", length(id), ", length(time): ", length(time)))
175
176  }
177
178  # NB: 'time' is assumed to be organised as stacked time series (sorted for each individual)
179  #     (successive blocks of individuals, each block being a time series for the respective individual))
180  #
181  #   'time' is in the correct order if is.pconsecutive.default is called by
182  #   is.pconsecutive.pdata.frame or is.pconsecutive.pseries as a pdata.frame (which is sorted) was constructed
183  #   in the first place; for data.frame interface the ordering is done in the respective function
184
185  if(na.rm.tindex) {
186    NA_tindex <- is.na(time)
187    time <- time[!NA_tindex]
188    id <- id[!NA_tindex]
189  }
190
191  # if time var is factor (as is TRUE for pdata.frames, pseries):
192  # need to convert to numeric, do this by coering to character first (otherwise wrong results!)
193  #  see R FAQ 7.10 for coercing factors to numeric:
194  #      as.numeric(levels(factor_var))[as.integer(factor_var)]   is more efficient than
195  #      as.numeric(as.character(factor_var))
196  if(!is.numeric(time) && is.factor(time)) time <- as.numeric(levels(time))[as.integer(time)]
197
198  list_id_timevar <- split(time, id, drop = TRUE)
199
200  res <- vapply(list_id_timevar, function(id_timevar) { if(anyNA(id_timevar)) {
201                                                           NA # return NA if NA found in the time periods for individual
202                                                          } else {
203                                                              begin <- id_timevar[1L]
204                                                              end   <- id_timevar[length(id_timevar)]
205
206                                                              # compare to length(original id_timevar) to find out if times are consecutive
207                                                              (end - begin + 1L) == length(id_timevar)
208
209                                                              # Alternative way of checking:
210                                                                # consecutive time periods from begin to end (if id_timevar were consecutive)
211                                                                # consecutive <- seq(from = begin, to = end, by = 1)
212                                                                # length(consecutive) == length(id_timevar)
213                                                          }
214                                                      }, FUN.VALUE = TRUE)
215
216  return(res)
217}
218
219#' @rdname is.pconsecutive
220#' @export
221is.pconsecutive.data.frame <- function(x, index = NULL, na.rm.tindex = FALSE, ...){
222  if (!is.null(index) && length(index) != 2L)
223    stop("if argument 'index' is not NULL, 'index' needs to specify
224         'individual' and 'time' dimension for is.pconsecutive to work on a data.frame")
225
226  # if index not provided, assume first two columns to be the index vars
227  index_orig_names <- if(is.null(index)) names(x)[1:2] else index
228
229  id   <- x[ , index_orig_names[1L]]
230  time <- x[ , index_orig_names[2L]]
231
232  # order as stacked time series (by id and time) first, otherwise default method does not work correctly!
233  ord <- order(id, time)
234  x_ordered    <- x[ord, ]
235  id_ordered   <- id[ord]
236  time_ordered <- time[ord]
237
238#  if (!identical(x, x_ordered))
239#    print("Note: for test of consecutiveness of time periods, the data.frame was ordered by index variables (id, time)")
240
241  return(is.pconsecutive.default(x_ordered, id_ordered, time_ordered, na.rm.tindex = na.rm.tindex, ...))
242}
243
244#' @rdname is.pconsecutive
245#' @export
246is.pconsecutive.pseries <- function(x, na.rm.tindex = FALSE, ...){
247  index <- unclass(attr(x, "index")) # unclass for speed
248  return(is.pconsecutive.default(x, index[[1L]], index[[2L]], na.rm.tindex = na.rm.tindex, ...))
249}
250
251
252#' @rdname is.pconsecutive
253#' @export
254is.pconsecutive.pdata.frame <- function(x, na.rm.tindex = FALSE, ...){
255  index <- unclass(attr(x, "index")) # unclass for speed
256  return(is.pconsecutive.default(x, index[[1L]], index[[2L]], na.rm.tindex = na.rm.tindex, ...))
257}
258
259#' @rdname is.pconsecutive
260#' @export
261is.pconsecutive.panelmodel <- function(x, na.rm.tindex = FALSE, ...){
262  index <- unclass(attr(x$model, "index")) # unclass for speed
263  return(is.pconsecutive.default(x, index[[1L]], index[[2L]], na.rm.tindex = na.rm.tindex, ...))
264}
265
266
267########### is.pbalanced ##############
268### for convenience and to be faster than pdim() for the purpose
269### of the determination of balancedness only, because it avoids
270### pdim()'s calculations which are unnecessary for balancedness.
271###
272### copied (and adapted) methods and code from pdim.*
273### (only relevant parts to determine balancedness)
274
275
276#' Check if data are balanced
277#'
278#' This function checks if the data are balanced, i.e., if each individual has
279#' the same time periods
280#'
281#' Balanced data are data for which each individual has the same time periods.
282#' The returned values of the `is.pbalanced(object)` methods are identical
283#' to `pdim(object)$balanced`.  `is.pbalanced` is provided as a short
284#' cut and is faster than `pdim(object)$balanced` because it avoids those
285#' computations performed by `pdim` which are unnecessary to determine the
286#' balancedness of the data.
287#'
288#' @aliases is.pbalanced
289#' @param x an object of class `pdata.frame`, `data.frame`,
290#'     `pseries`, `panelmodel`, or `pgmm`,
291#' @param y (only in default method) the time index variable (2nd index
292#' variable),
293#' @param index only relevant for `data.frame` interface; if
294#'     `NULL`, the first two columns of the data.frame are
295#'     assumed to be the index variables; if not `NULL`, both
296#'     dimensions ('individual', 'time') need to be specified by
297#'     `index` as character of length 2 for data frames, for
298#'     further details see [pdata.frame()],
299#' @param \dots further arguments.
300#' @return A logical indicating whether the data associated with
301#'     object `x` are balanced (`TRUE`) or not
302#'     (`FALSE`).
303#' @seealso [punbalancedness()] for two measures of
304#'     unbalancedness, [make.pbalanced()] to make data
305#'     balanced; [is.pconsecutive()] to check if data are
306#'     consecutive; [make.pconsecutive()] to make data
307#'     consecutive (and, optionally, also balanced).\cr
308#'     [pdim()] to check the dimensions of a 'pdata.frame'
309#'     (and other objects), [pvar()] to check for individual
310#'     and time variation of a 'pdata.frame' (and other objects),
311#'     [pseries()], [data.frame()],
312#'     [pdata.frame()].
313#' @export
314#' @keywords attribute
315#' @examples
316#'
317#' # take balanced data and make it unbalanced
318#' # by deletion of 2nd row (2nd time period for first individual)
319#' data("Grunfeld", package = "plm")
320#' Grunfeld_missing_period <- Grunfeld[-2, ]
321#' is.pbalanced(Grunfeld_missing_period)     # check if balanced: FALSE
322#' pdim(Grunfeld_missing_period)$balanced    # same
323#'
324#' # pdata.frame interface
325#' pGrunfeld_missing_period <- pdata.frame(Grunfeld_missing_period)
326#' is.pbalanced(Grunfeld_missing_period)
327#'
328#' # pseries interface
329#' is.pbalanced(pGrunfeld_missing_period$inv)
330#'
331is.pbalanced <- function(x, ...) {
332  UseMethod("is.pbalanced")
333}
334
335#' @rdname is.pbalanced
336#' @export
337is.pbalanced.default <- function(x, y, ...) {
338  if (length(x) != length(y)) stop("The length of the two inputs differs\n")
339  x <- x[drop = TRUE] # drop unused factor levels so that table
340  y <- y[drop = TRUE] # gives only needed combinations
341  z <- table(x, y)
342  balanced <- if(any(v <- as.vector(z) == 0L)) FALSE else TRUE
343  if (any(v > 1L)) warning("duplicate couples (id-time)\n")
344  return(balanced)
345}
346
347#' @rdname is.pbalanced
348#' @export
349is.pbalanced.data.frame <- function(x, index = NULL, ...) {
350  x <- pdata.frame(x, index)
351  index <- unclass(attr(x, "index")) # unclass for speed
352  return(is.pbalanced(index[[1L]], index[[2L]]))
353}
354
355#' @rdname is.pbalanced
356#' @export
357is.pbalanced.pdata.frame <- function(x, ...) {
358  index <- unclass(attr(x, "index")) # unclass for speed
359  return(is.pbalanced(index[[1L]], index[[2L]]))
360}
361
362#' @rdname is.pbalanced
363#' @export
364is.pbalanced.pseries <- function(x, ...) {
365  index <- unclass(attr(x, "index")) # unclass for speed
366  return(is.pbalanced(index[[1L]], index[[2L]]))
367}
368
369#' @rdname is.pbalanced
370#' @export
371is.pbalanced.panelmodel <- function(x, ...) {
372  x <- model.frame(x)
373  return(is.pbalanced(x))
374}
375
376#' @rdname is.pbalanced
377#' @export
378is.pbalanced.pgmm <- function(x, ...) {
379## pgmm is also class panelmodel, but take advantage of the pdim attribute in it
380  return(attr(x, "pdim")$balanced)
381}
382