1########### is.pconsecutive ############## 2# little helper function to determine if the time periods of an object are consecutive per id. 3# By consecutive we mean "consecutive in the numbers", i.e., is.pconsecutive takes the numerical 4# value of the time variable into account: t, t+1, t+2, ... where t is an integer 5# 6# For this, we need as.numeric(as.character(time_var)) where as.character is a crucial part! 7# Equivalent but more efficient is as.numeric(levels(id_timevar))[as.integer(id_timevar)] 8# (see R FAQ 7.10 for coercing factors to numeric] 9# and the coerction of time_var in this manner needs to be meaningful numbers. 10# 11# see also in separate file make.pconsecutive.R: 12# * make.pconsecutive 13# * make.pbalanced 14 15 16#' Check if time periods are consecutive 17#' 18#' This function checks for each individual if its associated time periods are 19#' consecutive (no "gaps" in time dimension per individual) 20#' 21#' (p)data.frame, pseries and estimated panelmodel objects can be tested if 22#' their time periods are consecutive per individual. For evaluation of 23#' consecutiveness, the time dimension is interpreted to be numeric, and the 24#' data are tested for being a regularly spaced sequence with distance 1 25#' between the time periods for each individual (for each individual the time 26#' dimension can be interpreted as sequence t, t+1, t+2, \ldots{} where t is an 27#' integer). As such, the "numerical content" of the time index variable is 28#' considered for consecutiveness, not the "physical position" of the various 29#' observations for an individuals in the (p)data.frame/pseries (it is not 30#' about "neighbouring" rows). If the object to be evaluated is a pseries or a 31#' pdata.frame, the time index is coerced from factor via as.character to 32#' numeric, i.e., the series 33#' `as.numeric(as.character(index(<pseries/pdata.frame>)[[2]]))]` is 34#' evaluated for gaps. 35#' 36#' The default method also works for argument `x` being an arbitrary 37#' vector (see **Examples**), provided one can supply arguments `id` 38#' and `time`, which need to ordered as stacked time series. As only 39#' `id` and `time` are really necessary for the default method to 40#' evaluate the consecutiveness, `x = NULL` is also possible. However, if 41#' the vector `x` is also supplied, additional input checking for equality 42#' of the lengths of `x`, `id` and `time` is performed, which is 43#' safer. 44#' 45#' For the data.frame interface, the data is ordered in the appropriate way 46#' (stacked time series) before the consecutiveness is evaluated. For the 47#' pdata.frame and pseries interface, ordering is not performed because both 48#' data types are already ordered in the appropriate way when created. 49#' 50#' Note: Only the presence of the time period itself in the object is tested, 51#' not if there are any other variables. `NA` values in individual index 52#' are not examined but silently dropped - In this case, it is not clear which 53#' individual is meant by id value `NA`, thus no statement about 54#' consecutiveness of time periods for those "`NA`-individuals" is 55#' possible. 56#' 57#' @name is.pconsecutive 58#' @aliases is.pconsecutive 59#' @param x usually, an object of class `pdata.frame`, 60#' `data.frame`, `pseries`, or an estimated 61#' `panelmodel`; for the default method `x` can also be 62#' an arbitrary vector or `NULL`, see **Details**, 63#' @param na.rm.tindex logical indicating whether any `NA` values 64#' in the time index are removed before consecutiveness is 65#' evaluated (defaults to `FALSE`), 66#' @param index only relevant for `data.frame` interface; if 67#' `NULL`, the first two columns of the data.frame are 68#' assumed to be the index variables; if not `NULL`, both 69#' dimensions ('individual', 'time') need to be specified by 70#' `index` for `is.pconsecutive` on data frames, for 71#' further details see [pdata.frame()], 72#' @param id,time only relevant for default method: vectors specifying 73#' the id and time dimensions, i. e. a sequence of individual and 74#' time identifiers, each as stacked time series, 75#' @param \dots further arguments. 76#' @return A named `logical` vector (names are those of the 77#' individuals). The i-th element of the returned vector 78#' corresponds to the i-th individual. The values of the i-th 79#' element can be: \item{TRUE}{if the i-th individual has 80#' consecutive time periods,} \item{FALSE}{if the i-th 81#' individual has non-consecutive time periods,} 82#' \item{"NA"}{if there are any NA values in time index of 83#' the i-th the individual; see also argument `na.rm.tindex` 84#' to remove those.} 85#' @export 86#' @author Kevin Tappe 87#' @seealso [make.pconsecutive()] to make data consecutive 88#' (and, as an option, balanced at the same time) and 89#' [make.pbalanced()] to make data balanced.\cr 90#' [pdim()] to check the dimensions of a 'pdata.frame' 91#' (and other objects), [pvar()] to check for individual 92#' and time variation of a 'pdata.frame' (and other objects), 93#' [lag()] for lagged (and leading) values of a 94#' 'pseries' object.\cr 95#' 96#' [pseries()], [data.frame()], [pdata.frame()], 97#' for class 'panelmodel' see [plm()] and [pgmm()]. 98#' @keywords attribute 99#' @examples 100#' 101#' data("Grunfeld", package = "plm") 102#' is.pconsecutive(Grunfeld) 103#' is.pconsecutive(Grunfeld, index=c("firm", "year")) 104#' 105#' # delete 2nd row (2nd time period for first individual) 106#' # -> non consecutive 107#' Grunfeld_missing_period <- Grunfeld[-2, ] 108#' is.pconsecutive(Grunfeld_missing_period) 109#' all(is.pconsecutive(Grunfeld_missing_period)) # FALSE 110#' 111#' # delete rows 1 and 2 (1st and 2nd time period for first individual) 112#' # -> consecutive 113#' Grunfeld_missing_period_other <- Grunfeld[-c(1,2), ] 114#' is.pconsecutive(Grunfeld_missing_period_other) # all TRUE 115#' 116#' # delete year 1937 (3rd period) for _all_ individuals 117#' Grunfeld_wo_1937 <- Grunfeld[Grunfeld$year != 1937, ] 118#' is.pconsecutive(Grunfeld_wo_1937) # all FALSE 119#' 120#' # pdata.frame interface 121#' pGrunfeld <- pdata.frame(Grunfeld) 122#' pGrunfeld_missing_period <- pdata.frame(Grunfeld_missing_period) 123#' is.pconsecutive(pGrunfeld) # all TRUE 124#' is.pconsecutive(pGrunfeld_missing_period) # first FALSE, others TRUE 125#' 126#' 127#' # panelmodel interface (first, estimate some models) 128#' mod_pGrunfeld <- plm(inv ~ value + capital, data = Grunfeld) 129#' mod_pGrunfeld_missing_period <- plm(inv ~ value + capital, data = Grunfeld_missing_period) 130#' 131#' is.pconsecutive(mod_pGrunfeld) 132#' is.pconsecutive(mod_pGrunfeld_missing_period) 133#' 134#' nobs(mod_pGrunfeld) # 200 135#' nobs(mod_pGrunfeld_missing_period) # 199 136#' 137#' 138#' # pseries interface 139#' pinv <- pGrunfeld$inv 140#' pinv_missing_period <- pGrunfeld_missing_period$inv 141#' 142#' is.pconsecutive(pinv) 143#' is.pconsecutive(pinv_missing_period) 144#' 145#' # default method for arbitrary vectors or NULL 146#' inv <- Grunfeld$inv 147#' inv_missing_period <- Grunfeld_missing_period$inv 148#' is.pconsecutive(inv, id = Grunfeld$firm, time = Grunfeld$year) 149#' is.pconsecutive(inv_missing_period, id = Grunfeld_missing_period$firm, 150#' time = Grunfeld_missing_period$year) 151#' 152#' # (not run) demonstrate mismatch lengths of x, id, time 153#' # is.pconsecutive(x = inv_missing_period, id = Grunfeld$firm, time = Grunfeld$year) 154#' 155#' # only id and time are needed for evaluation 156#' is.pconsecutive(NULL, id = Grunfeld$firm, time = Grunfeld$year) 157#' 158is.pconsecutive <- function(x, ...){ 159 UseMethod("is.pconsecutive") 160} 161 162#' @rdname is.pconsecutive 163#' @export 164is.pconsecutive.default <- function(x, id, time, na.rm.tindex = FALSE, ...) { 165 # argument 'x' just used for input check (if it is not NULL and is a vector) 166 167 # input checks 168 if(length(id) != length(time)) 169 stop(paste0("arguments 'id' and 'time' must have same length: length(id): ", length(id), ", length(time) ", length(time))) 170 171 if(!is.null(x) && is.vector(x)) { # is.vector could be too strict? factor is not a vector 172 if(!(length(x) == length(id) && length(x) == length(time) && length(id) == length(time))) 173 stop(paste0("arguments 'x', 'id', 'time' must have same length: length(x): ", 174 length(x), ", length(id): ", length(id), ", length(time): ", length(time))) 175 176 } 177 178 # NB: 'time' is assumed to be organised as stacked time series (sorted for each individual) 179 # (successive blocks of individuals, each block being a time series for the respective individual)) 180 # 181 # 'time' is in the correct order if is.pconsecutive.default is called by 182 # is.pconsecutive.pdata.frame or is.pconsecutive.pseries as a pdata.frame (which is sorted) was constructed 183 # in the first place; for data.frame interface the ordering is done in the respective function 184 185 if(na.rm.tindex) { 186 NA_tindex <- is.na(time) 187 time <- time[!NA_tindex] 188 id <- id[!NA_tindex] 189 } 190 191 # if time var is factor (as is TRUE for pdata.frames, pseries): 192 # need to convert to numeric, do this by coering to character first (otherwise wrong results!) 193 # see R FAQ 7.10 for coercing factors to numeric: 194 # as.numeric(levels(factor_var))[as.integer(factor_var)] is more efficient than 195 # as.numeric(as.character(factor_var)) 196 if(!is.numeric(time) && is.factor(time)) time <- as.numeric(levels(time))[as.integer(time)] 197 198 list_id_timevar <- split(time, id, drop = TRUE) 199 200 res <- vapply(list_id_timevar, function(id_timevar) { if(anyNA(id_timevar)) { 201 NA # return NA if NA found in the time periods for individual 202 } else { 203 begin <- id_timevar[1L] 204 end <- id_timevar[length(id_timevar)] 205 206 # compare to length(original id_timevar) to find out if times are consecutive 207 (end - begin + 1L) == length(id_timevar) 208 209 # Alternative way of checking: 210 # consecutive time periods from begin to end (if id_timevar were consecutive) 211 # consecutive <- seq(from = begin, to = end, by = 1) 212 # length(consecutive) == length(id_timevar) 213 } 214 }, FUN.VALUE = TRUE) 215 216 return(res) 217} 218 219#' @rdname is.pconsecutive 220#' @export 221is.pconsecutive.data.frame <- function(x, index = NULL, na.rm.tindex = FALSE, ...){ 222 if (!is.null(index) && length(index) != 2L) 223 stop("if argument 'index' is not NULL, 'index' needs to specify 224 'individual' and 'time' dimension for is.pconsecutive to work on a data.frame") 225 226 # if index not provided, assume first two columns to be the index vars 227 index_orig_names <- if(is.null(index)) names(x)[1:2] else index 228 229 id <- x[ , index_orig_names[1L]] 230 time <- x[ , index_orig_names[2L]] 231 232 # order as stacked time series (by id and time) first, otherwise default method does not work correctly! 233 ord <- order(id, time) 234 x_ordered <- x[ord, ] 235 id_ordered <- id[ord] 236 time_ordered <- time[ord] 237 238# if (!identical(x, x_ordered)) 239# print("Note: for test of consecutiveness of time periods, the data.frame was ordered by index variables (id, time)") 240 241 return(is.pconsecutive.default(x_ordered, id_ordered, time_ordered, na.rm.tindex = na.rm.tindex, ...)) 242} 243 244#' @rdname is.pconsecutive 245#' @export 246is.pconsecutive.pseries <- function(x, na.rm.tindex = FALSE, ...){ 247 index <- unclass(attr(x, "index")) # unclass for speed 248 return(is.pconsecutive.default(x, index[[1L]], index[[2L]], na.rm.tindex = na.rm.tindex, ...)) 249} 250 251 252#' @rdname is.pconsecutive 253#' @export 254is.pconsecutive.pdata.frame <- function(x, na.rm.tindex = FALSE, ...){ 255 index <- unclass(attr(x, "index")) # unclass for speed 256 return(is.pconsecutive.default(x, index[[1L]], index[[2L]], na.rm.tindex = na.rm.tindex, ...)) 257} 258 259#' @rdname is.pconsecutive 260#' @export 261is.pconsecutive.panelmodel <- function(x, na.rm.tindex = FALSE, ...){ 262 index <- unclass(attr(x$model, "index")) # unclass for speed 263 return(is.pconsecutive.default(x, index[[1L]], index[[2L]], na.rm.tindex = na.rm.tindex, ...)) 264} 265 266 267########### is.pbalanced ############## 268### for convenience and to be faster than pdim() for the purpose 269### of the determination of balancedness only, because it avoids 270### pdim()'s calculations which are unnecessary for balancedness. 271### 272### copied (and adapted) methods and code from pdim.* 273### (only relevant parts to determine balancedness) 274 275 276#' Check if data are balanced 277#' 278#' This function checks if the data are balanced, i.e., if each individual has 279#' the same time periods 280#' 281#' Balanced data are data for which each individual has the same time periods. 282#' The returned values of the `is.pbalanced(object)` methods are identical 283#' to `pdim(object)$balanced`. `is.pbalanced` is provided as a short 284#' cut and is faster than `pdim(object)$balanced` because it avoids those 285#' computations performed by `pdim` which are unnecessary to determine the 286#' balancedness of the data. 287#' 288#' @aliases is.pbalanced 289#' @param x an object of class `pdata.frame`, `data.frame`, 290#' `pseries`, `panelmodel`, or `pgmm`, 291#' @param y (only in default method) the time index variable (2nd index 292#' variable), 293#' @param index only relevant for `data.frame` interface; if 294#' `NULL`, the first two columns of the data.frame are 295#' assumed to be the index variables; if not `NULL`, both 296#' dimensions ('individual', 'time') need to be specified by 297#' `index` as character of length 2 for data frames, for 298#' further details see [pdata.frame()], 299#' @param \dots further arguments. 300#' @return A logical indicating whether the data associated with 301#' object `x` are balanced (`TRUE`) or not 302#' (`FALSE`). 303#' @seealso [punbalancedness()] for two measures of 304#' unbalancedness, [make.pbalanced()] to make data 305#' balanced; [is.pconsecutive()] to check if data are 306#' consecutive; [make.pconsecutive()] to make data 307#' consecutive (and, optionally, also balanced).\cr 308#' [pdim()] to check the dimensions of a 'pdata.frame' 309#' (and other objects), [pvar()] to check for individual 310#' and time variation of a 'pdata.frame' (and other objects), 311#' [pseries()], [data.frame()], 312#' [pdata.frame()]. 313#' @export 314#' @keywords attribute 315#' @examples 316#' 317#' # take balanced data and make it unbalanced 318#' # by deletion of 2nd row (2nd time period for first individual) 319#' data("Grunfeld", package = "plm") 320#' Grunfeld_missing_period <- Grunfeld[-2, ] 321#' is.pbalanced(Grunfeld_missing_period) # check if balanced: FALSE 322#' pdim(Grunfeld_missing_period)$balanced # same 323#' 324#' # pdata.frame interface 325#' pGrunfeld_missing_period <- pdata.frame(Grunfeld_missing_period) 326#' is.pbalanced(Grunfeld_missing_period) 327#' 328#' # pseries interface 329#' is.pbalanced(pGrunfeld_missing_period$inv) 330#' 331is.pbalanced <- function(x, ...) { 332 UseMethod("is.pbalanced") 333} 334 335#' @rdname is.pbalanced 336#' @export 337is.pbalanced.default <- function(x, y, ...) { 338 if (length(x) != length(y)) stop("The length of the two inputs differs\n") 339 x <- x[drop = TRUE] # drop unused factor levels so that table 340 y <- y[drop = TRUE] # gives only needed combinations 341 z <- table(x, y) 342 balanced <- if(any(v <- as.vector(z) == 0L)) FALSE else TRUE 343 if (any(v > 1L)) warning("duplicate couples (id-time)\n") 344 return(balanced) 345} 346 347#' @rdname is.pbalanced 348#' @export 349is.pbalanced.data.frame <- function(x, index = NULL, ...) { 350 x <- pdata.frame(x, index) 351 index <- unclass(attr(x, "index")) # unclass for speed 352 return(is.pbalanced(index[[1L]], index[[2L]])) 353} 354 355#' @rdname is.pbalanced 356#' @export 357is.pbalanced.pdata.frame <- function(x, ...) { 358 index <- unclass(attr(x, "index")) # unclass for speed 359 return(is.pbalanced(index[[1L]], index[[2L]])) 360} 361 362#' @rdname is.pbalanced 363#' @export 364is.pbalanced.pseries <- function(x, ...) { 365 index <- unclass(attr(x, "index")) # unclass for speed 366 return(is.pbalanced(index[[1L]], index[[2L]])) 367} 368 369#' @rdname is.pbalanced 370#' @export 371is.pbalanced.panelmodel <- function(x, ...) { 372 x <- model.frame(x) 373 return(is.pbalanced(x)) 374} 375 376#' @rdname is.pbalanced 377#' @export 378is.pbalanced.pgmm <- function(x, ...) { 379## pgmm is also class panelmodel, but take advantage of the pdim attribute in it 380 return(attr(x, "pdim")$balanced) 381} 382