1## Code for bootstrapped Amelia ported to R 2## 17/09/05 jh - Added "subset" routine for idvars and completely missing observations 3## 22/09/05 jh - Changed stack function to optionally fix column positions, changed bootx to reject some bootstraps, changed emarch to work when no data missing 4## 23/09/05 mb - Added "amcheck" function to change data and check for errors, "impdata" now in format given to amelia. 5## 24/09/05 jh - Modified "amcheck," added polynomials of time, added ability to impute "logicals" from data frames 6## 25/09/05 jh - Finalized plumbing for observational priors 7## 26/09/05 mb - Added "frontend" argument and screen box to amelia and emarch functions 8## 27/09/05 jh - Added observational and empirical priors 9## 28/09/05 mb - Fixed "frontend" to update the GUI after each print. 10## 30/09/05 mb - "amcheck" expanded, priors passed as individual matrices 11## 07/10/05 mb - Added passing of lags and multiple levels of polynomials; expanded "amcheck" to cover these 12## 08/10/05 jh - Enabled variable degree of polynomials of time, enabled interaction with cross-section 13## 14/10/05 mb - Put "amcheck" into its own file 14## 21/10/05 mb - Changed "stack" to "amstack" (and "unstack"); added log transformations in "amtransform"; adding "archive" option that saves a list of the settings 15## 21/10/05 mb - Added a soft-crash that will print and output the error number and message. 16## 24/10/05 mb - Added "sqrt" option for square root transformations, "lgstc" for logistic transformations 17## 27/10/05 mb - Enabled lags and leads 18## 9//11/05 mb - Enabled nominals; added "incheck" to allow skipping amcheck; moved dataframe->matrix conversion to framemat function. 19## 15/12/05 mb - new (fixed) impute function; 20## 21/02/06 mb - added positive definite check to "startvals", now defaults to identity if not pd. 21## 22/02/06 mb - penrose inverse function added in sweep; soft-crashes on a non invertible covariance matrix at the end of EM 22## 23/02/06 mb - empri increases if EM hits a non-monotonic section; added 'startvals' option; added iteration history to archive; 23## 21/03/06 mb - added "ords" option and added ordinal support in "unsubset"; fixed a bug in nominals that wouldn't fill in imputations; 24## 22/03/06 mb - character/factors can be specified and returned correctly as ordinals; 25## 08/04/06 jh - revised functions to handle large datasets, merged with parallel emb.r version 26## 10/04/06 mb - added "nametonumber" function that converts column names to numbers in all of the list options 27## 28/04/06 mb - extracted transformation functions to prep.r 28## 29/04/06 jh - changed screen output for "p2s", ivector and icap in "indxs", revised "diff" convergence monitor to upper triangular 29## 01/05/06 mb - removed "rbind" calls in emfred, impute. 30## 01/06/06 mb - added "allthetas" option to emarch for overdispersion diagnostic 31## 15/06/06 jh - merged with priors version changing all EM and impute procs, modified how lists are generated in indxs("icap") and amelia("impdata"). 32## 27/06/06 mb - added arglist argument to load in output from amelia or the gui. 33## 13/07/06 mb - moved gc() calls out of emfred into emarch 34## 02/08/06 mb - removed data.matrix() call when calling unsubset (moved to prep), fixed impfill for char. 35## 29/08/06 jh - changed tolerance defaults 36## 20/09/06 mb - new option (temp?) keep.data that will trash datasets from memory 37## 01/10/06 mb - added additional info to p2s=2. 38## 27/11/06 mb - new priors format 39## 15/01/07 jh/mb - final version changes, degrees of freedom messages,autoprior option, modified comments, rearranged core arguments 40## 10/05/07 mb - changed 'impute' to 'amelia.impute' 41## 04/07/07 jh - added "emburn" option to modify convergence criteria 42## 04/06/08 mb - changed the writing to GUI ('if (frontend)' calls) to remove globals 43## 17/07/08 mb - fixed frontend error bug (dumping output to screen 44## 22/07/08 mb - good coding update: T->TRUE/F->FALSE 45## 27/03/10 jh - small changes to arguments of functions to deal with "splinetime" option in same fashion as "polytime" 46 47 48## Draw from a multivariate normal distribution 49## n: number of draws 50## mu: vector of means 51## vcv: variance-covariance matrix 52rmvnorm <- function(n,mu,vcv){ 53 return(matrix(rnorm(n*length(mu)),n,length(mu)) %*% (chol(vcv)) + (matrix(1,n,1) %*% mu ) ) 54} 55 56## Returns the data matrix without the rows with missing values 57## (Same return as na.omit, without any attributes) 58## x: data matrix 59## can't send it a vector right now 60packr<-function(x) { 61 r<-is.na(x) 62 sumr<-rowSums(r) 63 x2<-x[sumr==0, , drop=FALSE] 64 return(x2) 65} 66 67## Create dataset bootstrapped from original dataset 68## Rejects Bootstraps where an entire variable becomes missing 69## x: data (matrix) 70## priors: matrix of priors about means for observations 71bootx<-function(x,priors=NULL, boot.type="np"){ 72 flag <- TRUE 73 AMn <- nrow(x) 74 if (!is.null(boot.type)) { 75 if (boot.type == "none") { 76 return(list(x=x,priors=priors)) 77 } 78 } 79 while (flag){ 80 order<-trunc(runif(nrow(x), min=1, max=nrow(x)+1)) 81 xboot<-x[order,] 82 if (!identical(priors,NULL)){ 83 sigPriors <- matrix(NA,nrow(x),ncol(x)) 84 muPriors <- matrix(NA,nrow(x),ncol(x)) 85 muPriors[priors[,1:2]] <- priors[,3] 86 sigPriors[priors[,1:2]] <- priors[,4] 87 muPriors <- muPriors[order,] 88 sigPriors <- sigPriors[order,] 89 prior.ind <- which(!is.na(muPriors), arr.ind = TRUE) 90 priors <- cbind(prior.ind, muPriors[prior.ind], sigPriors[prior.ind]) 91 # priors[,1]<-match(priors[,1],order) 92 #priors <- priors[!is.na(priors[,1]),,drop=FALSE] 93 } 94 95 flag<-any(colSums(is.na(xboot))==AMn & !((1:ncol(xboot)) %in% priors[,2])) 96 } 97 return(list(x=xboot,priors=priors)) 98} 99 100## Put imputations into the original data format 101## Converts integer values back to factors or characters 102impfill<-function(x.orig, x.imp, noms, ords, priors, overimp) { 103 if (!is.null(priors)) { 104 is.na(x.orig)[priors[,c(1,2)]] <- TRUE 105 } 106 107 if (!is.null(overimp)) { 108 is.na(x.orig)[overimp] <- TRUE 109 } 110 111 AMr1.orig <- is.na(x.orig) 112 orig.fact <- sapply(x.orig, is.factor) 113 orig.char <- sapply(x.orig, is.character) 114 x.imp <- as.data.frame(x.imp[,1:ncol(x.orig)]) 115 for (i in 1:ncol(x.orig)) { 116 if (is.logical(x.orig[,i]) & sum(!is.na(x.orig[,i])) > 0) { 117 x.imp[,i]<-as.logical(x.imp[,i]>0.5) 118 } 119 } 120 121 possibleFactors <- unique(c(noms,ords)) 122 123 if (!is.null(possibleFactors)) { 124 if (ncol(x.orig) > length(possibleFactors)) { 125 AMr1.orig <- is.na(x.orig[,-possibleFactors]) 126 x.orig[,-possibleFactors][AMr1.orig] <- x.imp[,-possibleFactors][AMr1.orig] 127 } 128 for (i in possibleFactors) { 129 if (orig.fact[i]) 130 x.orig[is.na(x.orig[,i]),i]<- levels(x.orig[,i])[x.imp[is.na(x.orig[,i]),i]] 131 else if (orig.char[i]) 132 x.orig[,i]<-levels(factor(x.orig[,i]))[x.imp[,i]] 133 else 134 x.orig[is.na(x.orig[,i]),i] <- x.imp[is.na(x.orig[,i]),i] 135 } 136 } else { 137 x.orig[AMr1.orig] <- x.imp[AMr1.orig] 138 } 139 new.char <- sapply(x.orig, is.character) 140 char.match <- orig.char!=new.char 141 if (sum(char.match)!=0) 142 for (i in 1:length(char.match)) 143 if (char.match[i]) 144 x.orig[,i]<-as.numeric(x.orig[,i]) 145 146 return(x.orig) 147} 148 149## Create Starting Values for EM Chain 150startval<-function(x,startvals=0,priors=NULL){ 151 152 AMp<-ncol(x) 153 if (!is.null(priors)) { 154 ## fill in prior means 155 x[(priors[,2]-1)*nrow(x)+priors[,1]] <- priors[,3] 156 } 157 if (ncol(as.matrix(startvals)) == AMp+1 && nrow(as.matrix(startvals)) == AMp+1) #checks for correct size of start value matrix 158 if (startvals[1,1]==-1) #checks for the -1 necessary for sweep 159 return(startvals) 160 161 thetast<-matrix(0,nrow=AMp+1,ncol=AMp+1) # Create matrix of zeros 162 thetast[row(thetast)==col(thetast)] <- 1 # Create Identity matrix 163 thetast[1,1]<-(-1) 164 165 if (startvals==0){ # Defaults to Identity if too few rows fully observed 166 cmpr<-packr(x) 167 if (nrow(cmpr)>AMp){ 168 means<-colMeans(cmpr) 169 if (all(eigen(cov(cmpr))$values > 10*.Machine$double.eps)) { #Checks for positive definiteness (positive eigenvalues) 170 thetast[2:(AMp+1),2:(AMp+1)]<-cov(cmpr) #.Machine$double.eps instead of 0 to account for rounding. 171 thetast[2:(AMp+1),1]<-means 172 thetast[1,2:(AMp+1)]<-means 173 } 174 } 175 } 176 return(thetast) 177} 178 179## Create certain indicies. Only needs to be called once, not every pattern. 180## o,m,icap come from omiindxs 181## ivector is i from indexm 182indxs<-function(x){ 183 184 AMn<-nrow(x) 185 AMr1<-is.na(x) # True if missing. 186 AMr2<-unique(AMr1) 187 o<- !AMr2 # (or o<-AMr2==1) Maybe == is not robust to small fluctuations 188 m<- AMr2 # so put in check procedure (m<-) 189 190 ## The following can be replaced by fortran .dll, although this has only moderate time savings ## 191 ivector<-1 192 for(i in 2:AMn){ 193 ischange<- !identical(AMr1[i,],AMr1[i-1,]) 194 if(ischange){ 195 ivector<-c(ivector,i) 196 } 197 } 198 ivector<-c(ivector,AMn+1) 199##################################################### 200 201 ## ivector<-.Fortran("indxs",1*AMr1,as.integer(AMn),as.integer(ncol(x)),as.integer(nrow(AMr2)+1),ivector=integer(nrow(AMr2)+1))$ivector 202 203 icap<-vector(mode="list",nrow(AMr2)) # This is a useful index, although no longer currently used 204 for (i in 2:length(ivector)){ 205 icap[[i]]<-seq(ivector[i-1],ivector[i]-1) 206 } 207 208 return(list(AMr1=AMr1,AMr2=AMr2,o=o,m=m,icap=icap,ivector=ivector)) 209} 210 211 212## EM chain architecture calls 213emarch<-function(x,p2s=TRUE,thetaold=NULL,startvals=0,tolerance=0.0001,priors=NULL,empri=NULL,frontend=FALSE,collect=FALSE,allthetas=FALSE,autopri=0.05,emburn=c(0,0)){ 214 if (p2s == 2) { 215 cat("setting up EM chain indicies\n") 216 flush.console() 217 } 218 219 iter.hist<-matrix(0,nrow=1,ncol=3) 220 if (sum(complete.cases(x)) < nrow(x)){ # Check for fully observed data 221 222 if (identical(thetaold,NULL)) thetaold<-startval(x,startvals=startvals,priors=priors) 223 indx<-indxs(x) # This needs x.NA 224 if (!identical(priors,NULL)){ 225 priors[,4]<-1/priors[,4] # change sd to 1/var 226 priors[,3]<-priors[,3]*priors[,4] # get the precision-weighted 227 # mus 228 priors <- priors[order(priors[,1],priors[,2]),,drop = FALSE] 229 230 } 231 232 x[is.na(x)]<-0 # Change x.NA to x.0s 233 AM1stln<-sum(indx$m[1,])==0 & nrow(indx$m) > 1 234 count<-0 235 diff<- 1+tolerance 236 AMr1 <- 1 * indx$AMr1 237 oo <- 1 * indx$o 238 mm <- 1 * indx$m 239 if (is.null(empri)) { 240 empri <- 0 241 } 242 theta <- .Call("emcore", x, AMr1, oo, mm, 243 indx$ivector, thetaold, tolerance, emburn, p2s, 244 empri,autopri, allthetas, priors, PACKAGE="Amelia") 245 } else { 246 if (p2s) cat("\n","No missing data in bootstrapped sample: EM chain unnecessary") 247 pp1<-ncol(x)+1 # p (the number of variables) plus one 248 means<-colMeans(x) 249 thetanew<-matrix(0,pp1,pp1) 250 thetanew[1,1]<-(-1) 251 thetanew[2:pp1,1]<-means 252 thetanew[1,2:pp1]<-means 253 thetanew[2:pp1,2:pp1]<-cov(x) # Need to consider Priors in these cases, 254 iter.hist<-NA # Although not 255 # currently necessary. 256 return(list(thetanew=thetanew,iter.hist=iter.hist)) 257 } 258 return(list(thetanew=theta$theta,iter.hist=theta$iter.hist)) 259} 260 261## Draw imputations for missing values from a given theta matrix 262amelia.impute<-function(x,thetareal,priors=NULL,bounds=NULL,max.resample=NULL){ 263 264 indx<-indxs(x) # This needs x.NA 265 if (!identical(priors,NULL)){ 266 priors[,4]<-1/priors[,4] 267 priors[,3]<-priors[,3]*priors[,4] 268 priors <- priors[order(priors[,1],priors[,2]),,drop = FALSE] 269 } 270 271 272 273 x[is.na(x)]<-0 # Change x.NA to x.0s 274 AM1stln<-sum(indx$m[1,])==0 & nrow(indx$m) > 1 # Create sundry simple indicators 275 i<-indx$ivector 276 iii<-indx$icap 277 AMp<-ncol(x) 278 AMn<-nrow(x) 279 AMr1 <- 1 * indx$AMr1 280 oo <- 1 * indx$o 281 mm <- 1 * indx$m 282 if (is.null(bounds)) max.resample <- NULL 283 imp <- .Call("ameliaImpute", x, AMr1, oo, mm, indx$ivector, thetareal, 284 priors, bounds, max.resample, PACKAGE="Amelia") 285 return(imp) 286 287} 288 289 290 291 292 293#' Combine multiple runs of Amelia 294#' 295#' Combines multiple runs of \code{amelia} with the same 296#' arguments and data into one \code{amelia} object. 297#' 298#' @param ... one or more objects of class \code{amelia} with the same 299#' arguments and created from the same data. 300#' 301#' @details \code{ameliabind} will combine multiple runs of \code{amelia} into one 302#' object so that you can utilize diagnostics and modelling on all the 303#' imputations together. This function is useful for combining multiple 304#' runs of \code{amelia} run on parallel machines. 305#' 306#' Note that \code{ameliabind} only checks that they arguments and the 307#' missingness matrix are identical. Thus, it could be fooled by two 308#' datasets that are identical up to a transformation of one variable. 309#' 310#' @return An object of class \code{amelia}. 311#' 312#' @seealso \code{\link{amelia}} 313#' 314#' @examples 315#' data(africa) 316#' a1.out <- amelia(x = africa, cs = "country", ts = "year", logs = "gdp_pc") 317#' a2.out <- amelia(x = africa, cs = "country", ts = "year", logs = "gdp_pc") 318#' all.out <- ameliabind(a1.out, a2.out) 319#' summary(all.out) 320#' plot(all.out) 321#' 322ameliabind <- function(...) { 323 args <- list(...) 324 325 if (any(!sapply(args, is, "amelia"))) 326 stop("All arguments must be amelia output.") 327 328 if (length(args) > 1) { 329 ## test that data is the same. we'll just compare the missMatrices. 330 ## this will allow datasets with the same size and missingness 331 ## matrix to be combined unintentionally, but this seems unlikely. 332 datacheck <- lapply(args, 333 function(x) isTRUE(identical(x$missMatrix,args[[1]]$missMatrix))) 334 if (any(!unlist(datacheck))) 335 stop("Non-compatible datasets.") 336 337 ## test that all the arguments are the same 338 check <- lapply(args, 339 function(x) isTRUE(identical(x$arguments, args[[1]]$arguments))) 340 if (any(!unlist(check))) 341 stop("Non-compatible amelia arguments") 342 343 check <- lapply(args, 344 function(x) isTRUE(identical(x$transform.calls, 345 args[[1]]$transform.calls))) 346 if (any(!unlist(check))) 347 stop("Non-compatible transformations on imputed datasets") 348 349 imps <- unlist(lapply(args, function(x) return(x$m))) 350 newm <- sum(imps) 351 impindex <- c(0,cumsum(imps)) 352 353 k <- nrow(args[[1]]$mu) 354 out <- list(imputations = list(), 355 m = integer(0), 356 missMatrix = matrix(NA,0,0), 357 overvalues = args[[1]]$overvalues, 358 theta = array(NA, dim = c(k+1,k+1,newm) ), 359 mu = matrix(NA, nrow = k, ncol = newm), 360 covMatrices = array(NA, dim = c(k,k,newm)), 361 code = integer(0), 362 message = character(0), 363 iterHist = list(), 364 arguments = list(), 365 orig.vars = args[[1]]$orig.vars) 366 367 out$m <- newm 368 out$missMatrix <- args[[1]]$missMatrix 369 out$arguments <- args[[1]]$arguments 370 out$transform.calls <- args[[1]]$transform.calls 371 out$transform.vars <- args[[1]]$transform.vars 372 373 ## since code==1 is good and code==2 means we have an NA, 374 ## then our new output should inherit a 2 if there are any 375 out$code <- max(unlist(lapply(args,function(x) return(x$code)))) 376 377 if (out$code > 2) 378 stop("Amelia output contains error.") 379 if (out$code==2) 380 out$message <- "One or more of the imputations resulted in a covariance matrix that was not invertible." 381 else 382 out$message <- "Normal EM convergence" 383 384 for (i in 1:length(args)) { 385 currimps <- (impindex[i]+1):impindex[i+1] 386 out$mu[,currimps] <- args[[i]]$mu 387 out$theta[,,currimps] <- args[[i]]$theta 388 out$covMatrices[,,currimps] <- args[[i]]$covMatrices 389 out$imputations <- c(out$imputations, args[[i]]$imputations) 390 out$iterHist <- c(out$iterHist, args[[i]]$iterHist) 391 392 } 393 names(out$imputations) <- paste("imp",1:length(out$imputations),sep="") 394 #or: names(out$imputations) <- paste("imp",1:impindex[i+1],sep="") 395 class(out) <- "amelia" 396 class(out$imputations) <- c("mi","list") 397 } else { 398 out <- args[[1]] 399 if (out$code > 2) 400 stop("Amelia output contains error.") 401 } 402 return(out) 403} 404 405getOriginalData <- function(obj) { 406 data <- obj$imputations[[1]] 407 is.na(data) <- obj$missMatrix 408 data <- data[, obj$orig.vars] 409 oi <- obj$arguments$overimp 410 if (!is.null(oi)) { 411 for (i in 1:nrow(oi)) { 412 data[oi[i,1], oi[i,2]] <- obj$overvalues[i] 413 } 414 } 415 return(data) 416} 417 418 419remove.imputations <- function(obj) { 420 data <- obj$imputations[[1]] 421 is.na(data) <- obj$missMatrix 422 oi <- obj$arguments$overimp 423 if (!is.null(oi)) { 424 for (i in 1:nrow(oi)) { 425 data[oi[i,1], oi[i,2]] <- obj$overvalues[i] 426 } 427 } 428 return(data) 429} 430 431## amelia - multiple imputation. core function 432## 433 434 435#' AMELIA: Multiple Imputation of Incomplete Multivariate Data 436#' 437#' Runs the bootstrap EM algorithm on incomplete data and creates 438#' imputed datasets. 439#' 440#' @author James Honaker 441#' @author Gary King 442#' @author Matt Blackwell 443#' 444#' 445#' @param x either a matrix, data.frame, a object of class 446#' "amelia", or an object of class "molist". The first two will call the 447#' default S3 method. The third a convenient way to perform more imputations 448#' with the same parameters. The fourth will impute based on the settings from 449#' \code{moPrep} and any additional arguments. 450#' @param m the number of imputed datasets to create. 451#' @param p2s an integer value taking either 0 for no screen output, 452#' 1 for normal screen printing of iteration numbers, and 2 for detailed 453#' screen output. See "Details" for specifics on output when p2s=2. 454#' @param frontend a logical value used internally for the GUI. 455#' @param idvars a vector of column numbers or column names that indicates 456#' identification variables. These will be dropped from the analysis but 457#' copied into the imputed datasets. 458#' @param ts column number or variable name indicating the variable identifying time 459#' in time series data. 460#' @param cs column number or variable name indicating the cross section variable. 461#' @param polytime integer between 0 and 3 indicating what 462#' power of polynomial should be included in the imputation model 463#' to account for the effects of time. A setting of 0 would 464#' indicate constant levels, 1 would indicate linear time 465#' effects, 2 would indicate squared effects, and 3 would 466#' indicate cubic time effects. 467#' @param splinetime interger value of 0 or greater to control cubic 468#' smoothing splines of time. Values between 0 and 3 create a simple 469#' polynomial of time (identical to the polytime argument). Values \code{k} greater 470#' than 3 create a spline with an additional \code{k-3} 471#' knotpoints. 472#' @param intercs a logical variable indicating if the 473#' time effects of \code{polytime} should vary across the 474#' cross-section. 475#' @param lags a vector of numbers or names indicating columns in the data 476#' that should have their lags included in the imputation model. 477#' @param leads a vector of numbers or names indicating columns in the data 478#' that should have their leads (future values) included in the imputation 479#' model. 480#' @param startvals starting values, 0 for the parameter matrix from 481#' listwise deletion, 1 for an identity matrix. 482#' @param tolerance the convergence threshold for the EM algorithm. 483#' @param logs a vector of column numbers or column names that refer 484#' to variables that require log-linear transformation. 485#' @param sqrts a vector of numbers or names indicating columns in the data 486#' that should be transformed by a sqaure root function. Data in this 487#' column cannot be less than zero. 488#' @param lgstc a vector of numbers or names indicating columns in the data 489#' that should be transformed by a logistic function for proportional data. 490#' Data in this column must be between 0 and 1. 491#' @param noms a vector of numbers or names indicating columns in the data 492#' that are nominal variables. 493#' @param ords a vector of numbers or names indicating columns in the 494#' data that should be treated as ordinal variables. 495#' @param incheck a logical indicating whether or not the inputs to the 496#' function should be checked before running \code{amelia}. This should 497#' only be set to \code{FALSE} if you are extremely confident that your 498#' settings are non-problematic and you are trying to save computational 499#' time. 500#' @param collect a logical value indicating whether or 501#' not the garbage collection frequency should be increased during the 502#' imputation model. Only set this to \code{TRUE} if you are experiencing memory 503#' issues as it can significantly slow down the imputation 504#' process 505#' @param arglist an object of class "ameliaArgs" from a previous run of 506#' Amelia. Including this object will use the arguments from that run. 507#' @param empri number indicating level of the empirical (or ridge) prior. 508#' This prior shrinks the covariances of the data, but keeps the means 509#' and variances the same for problems of high missingness, small N's or 510#' large correlations among the variables. Should be kept small, 511#' perhaps 0.5 to 1 percent of the rows of the data; a 512#' reasonable upper bound is around 10 percent of the rows of the 513#' data. 514#' @param priors a four or five column matrix containing the priors for 515#' either individual missing observations or variable-wide missing 516#' values. See "Details" for more information. 517#' @param autopri allows the EM chain to increase the empirical prior if 518#' the path strays into an nonpositive definite covariance matrix, up 519#' to a maximum empirical prior of the value of this argument times 520#' \code{n}, the number of observations. Must be between 0 and 1, and at 521#' zero this turns off this feature. 522#' @param emburn a numeric vector of length 2, where \code{emburn[1]} is 523#' a the minimum EM chain length and \code{emburn[2]} is the 524#' maximum EM chain length. These are ignored if they are less than 1. 525#' @param bounds a three column matrix to hold logical bounds on the 526#' imputations. Each row of the matrix should be of the form 527#' \code{c(column.number, lower.bound,upper.bound)} See Details below. 528#' @param max.resample an integer that specifies how many times Amelia 529#' should redraw the imputed values when trying to meet the logical 530#' constraints of \code{bounds}. After this value, imputed values are 531#' set to the bounds. 532#' @param overimp a two-column matrix describing which cells are to be 533#' overimputed. Each row of the matrix should be a \code{c(row,column)} pair. 534#' Each of these cells will be treated as missing and 535#' replaced with draws from the imputation model. 536#' @param boot.type choice of bootstrap, currently restricted to either 537#' \code{"ordinary"} for the usual non-parametric bootstrap and 538#' \code{"none"} for no bootstrap. 539#' @param parallel the type of parallel operation to be used (if any). If 540#' missing, the default is taken from the option 541#' \code{"amelia.parallel"} (and if that is not set, \code{"no"}). 542#' @param ncpus integer: the number of processes to be used in parallel 543#' operation: typically one would choose the number of available CPUs. 544#' @param cl an optional \pkg{parallel} or \pkg{snow} cluster for use if 545#' \code{parallel = "snow"}. If not supplied, a cluster on the local 546#' machine is created for the duration of the \code{amelia} call. 547#' @param ... further arguments to be passed. 548#' 549#' @details 550#' Multiple imputation is a method for analyzing incomplete 551#' multivariate data. This function will take an incomplete dataset in 552#' either data frame or matrix form and return \code{m} imputed datatsets 553#' with no missing values. The algorithm first creates a bootstrapped 554#' version of the original data, estimates the sufficient statistics 555#' (with priors if specified) by EM on this bootstrapped sample, and 556#' then imputes the missing values of the original data using the 557#' estimated sufficient statistics. It repeats this process \code{m} 558#' times to produce the \code{m} complete datasets where the 559#' observed values are the same and the unobserved values are drawn 560#' from their posterior distributions. 561#' 562#' The function will start a "fresh" run of the algorithm if \code{x} is 563#' either a incomplete matrix or data.frame. In this method, all of the 564#' options will be user-defined or set to their default. If \code{x} 565#' is the output of a previous Amelia run (that is, an object of 566#' class "amelia"), then Amelia will run with the options used in 567#' that previous run. This is a convenient way to run more 568#' imputations of the same model. 569#' 570#' You can provide Amelia with informational priors about the missing 571#' observations in your data. To specify priors, pass a four or five 572#' column matrix to the \code{priors} argument with each row specifying a 573#' different priors as such: 574#' 575#' \code{ one.prior <- c(row, column, mean,standard deviation)} 576#' 577#' or, 578#' 579#' \code{ one.prior <- c(row, column, minimum, maximum, confidence)}. 580#' 581#' So, in the first and second column of the priors matrix should be the 582#' row and column number of the prior being set. In the other columns 583#' should either be the mean and standard deviation of the prior, or a 584#' minimum, maximum and confidence level for the prior. You must specify 585#' your priors all as distributions or all as confidence ranges. Note 586#' that ranges are converted to distributions, so setting a confidence of 587#' 1 will generate an error. 588#' 589#' Setting a priors for the missing values of an entire variable is done 590#' in the same manner as above, but inputing a \code{0} for the row 591#' instead of the row number. If priors are set for both the entire 592#' variable and an individual observation, the individual prior takes 593#' precedence. 594#' 595#' In addition to priors, Amelia allows for logical bounds on 596#' variables. The \code{bounds} argument should be a matrix with 3 597#' columns, with each row referring to a logical bound on a variable. The 598#' first column should be the column number of the variable to be 599#' bounded, the second column should be the lower bounds for that 600#' variable, and the third column should be the upper bound for that 601#' variable. As Amelia enacts these bounds by resampling, particularly 602#' poor bounds will end up resampling forever. Amelia will stop 603#' resampling after \code{max.resample} attempts and simply set the 604#' imputation to the relevant bound. 605#' 606#' If each imputation is taking a long time to converge, you can increase 607#' the empirical prior, \code{empri}. This value has the effect of smoothing 608#' out the likelihood surface so that the EM algorithm can more easily find 609#' the maximum. It should be kept as low as possible and only used if needed. 610#' 611#' Amelia assumes the data is distributed multivariate normal. There are a 612#' number of variables that can break this assumption. Usually, though, a 613#' transformation can make any variable roughly continuous and unbounded. 614#' We have included a number of commonly needed transformations for data. 615#' Note that the data will not be transformed in the output datasets and the 616#' transformation is simply useful for climbing the likelihood. 617#' 618#' Amelia can run its imputations in parallel using the methods of the 619#' \pkg{parallel} package. The \code{parallel} argument names the 620#' parallel backend that Amelia should use. Users on Windows systems must 621#' use the \code{"snow"} option and users on Unix-like systems should use 622#' \code{"multicore"}. The \code{multicore} backend sets itself up 623#' automatically, but the \code{snow} backend requires more setup. You 624#' can pass a predefined cluster from the 625#' \code{parallel::makePSOCKcluster} function to the \code{cl} 626#' argument. Without this cluster, Amelia will attempt to create a 627#' reasonable default cluster and stop it once computation is 628#' complete. When using the parallel backend, users can set the number of 629#' CPUs to use with the \code{ncpus} argument. The defaults for these two 630#' arguments can be set with the options \code{"amelia.parallel"} and 631#' \code{"amelia.ncpus"}. 632#' 633#' Please refer to the Amelia manual for more information on the function 634#' or the options. 635#' 636#' @return An instance of S3 class "amelia" with the following objects: 637#' \item{imputations}{a list of length \code{m} with an imputed dataset in 638#' each entry. The class (matrix or data.frame) of these entries will 639#' match \code{x}.} 640#' \item{m}{an integer indicating the number of imputations run.} 641#' \item{missMatrix}{a matrix identical in size to the original dataset 642#' with 1 indicating a missing observation and a 0 indicating an observed 643#' observation.} 644#' \item{theta}{An array with dimensions \eqn{(p+1)} by \eqn{(p+1)} by \eqn{m} (where 645#' \eqn{p} is the number of variables in the imputations model) holding 646#' the converged parameters for each of the \code{m} EM chains.} 647#' \item{mu}{A \eqn{p} by \eqn{m} matrix of of the posterior modes for the 648#' complete-data means in each of the EM chains.} 649#' \item{covMatrices}{An array with dimensions \eqn{(p)} by \eqn{(p)} by 650#' \eqn{m} where the first two dimensions hold the posterior modes of the 651#' covariance matrix of the complete data for each of the EM chains.} 652#' \item{code}{a integer indicating the exit code of the Amelia run.} 653#' \item{message}{an exit message for the Amelia run} 654#' \item{iterHist}{a list of iteration histories for each EM chain. See 655#' documentation for details.} 656#' \item{arguments}{a instance of the class "ameliaArgs" which holds the 657#' arguments used in the Amelia run.} 658#' \item{overvalues}{a vector of values removed for overimputation. Used to 659#' reformulate the original data from the imputations. } 660#' 661#' Note that the \code{theta}, \code{mu} and \code{covMatrcies} objects 662#' refers to the data as seen by the EM algorithm and is thusly centered, 663#' scaled, stacked, tranformed and rearranged. See the manual for details 664#' and how to access this information. 665#' 666#' @references 667#' Honaker, J., King, G., Blackwell, M. (2011). 668#' Amelia II: A Program for Missing Data. 669#' \emph{Journal of Statistical Software}, \bold{45(7)}, 1--47. 670#' URL http://www.jstatsoft.org/v45/i07/. 671#' 672#' @seealso For imputation diagnostics, \code{\link{missmap}}, 673#' \code{\link{compare.density}}, 674#' \code{\link{overimpute}} and \code{\link{disperse}}. For time series 675#' plots, \code{\link{tscsPlot}}. Also: \code{\link{plot.amelia}}, 676#' \code{\link{write.amelia}}, and \code{\link{ameliabind}} 677#' 678#' @examples 679#' data(africa) 680#' a.out <- amelia(x = africa, cs = "country", ts = "year", logs = "gdp_pc") 681#' summary(a.out) 682#' plot(a.out) 683#' 684#' @keywords models 685amelia <- function(x, ...) { 686 UseMethod("amelia", x) 687} 688 689#' @describeIn amelia Run additional imputations for Amelia output 690amelia.amelia <- function(x, m = 5, p2s = 1, frontend = FALSE, ...) { 691 692 ## The original data is the imputed data with the 693 ## imputations marked to NA. These two lines do that 694 data <- x$imputations[[1]] 695 696 ## Only the variables in the missMatrix should be passed. This is 697 ## because the others are 698 data <- getOriginalData(x) 699 700 out <- amelia.default(x = data, m = m, arglist=x$arguments, p2s=p2s, 701 frontend = frontend, incheck=FALSE) 702 num.tcalls <- length(x$transform.calls) 703 if (num.tcalls > 0) { 704 for (i in 1:num.tcalls) { 705 tcall <- x$transform.calls[[i]] 706 tcall[[2]] <- as.name("out") 707 out <- eval(tcall) 708 } 709 out$transform.calls <- x$transform.calls 710 } 711 ret <- ameliabind(x, out) 712 return(ret) 713} 714 715#' @describeIn amelia Perform multiple overimputation from moPrep 716amelia.molist <- function(x, ...) { 717 m <- match.call(expand.dots=TRUE) 718 m$x <- x$data 719 m$priors <- x$priors 720 m$overimp <- x$overimp 721 m[[1]] <- quote(Amelia::amelia.default) 722 ret <- eval(m, parent.frame()) 723 724 return(ret) 725} 726 727#' @describeIn amelia Run core Amelia algorithm 728amelia.default <- function(x, m = 5, p2s = 1, frontend = FALSE, idvars = NULL, 729 ts = NULL, cs = NULL, polytime = NULL, 730 splinetime = NULL, intercs = FALSE, lags = NULL, 731 leads = NULL, startvals = 0, tolerance = 0.0001, 732 logs = NULL, sqrts = NULL, lgstc = NULL, noms = NULL, 733 ords = NULL, incheck = TRUE, collect = FALSE, 734 arglist = NULL, empri = NULL, priors = NULL, 735 autopri = 0.05, emburn = c(0,0), bounds = NULL, 736 max.resample = 100, overimp = NULL, 737 boot.type = "ordinary", 738 parallel = c("no", "multicore", "snow"), 739 ncpus = getOption("amelia.ncpus", 1L), cl = NULL, 740 ...) { 741 742 ## parellel infrastructure modeled off of 'boot' package 743 if (missing(parallel)) parallel <- getOption("amelia.parallel", "no") 744 parallel <- match.arg(parallel) 745 have_mc <- have_snow <- FALSE 746 if (parallel != "no" && ncpus > 1L) { 747 if (parallel == "multicore") have_mc <- .Platform$OS.type != "windows" 748 else if (parallel == "snow") have_snow <- TRUE 749 if (!have_mc && !have_snow) ncpus <- 1L 750 if (p2s == 2) { 751 cat("\nUsing '", parallel, "' parallel backend with", ncpus, "cores.") 752 } 753 } 754 755 756 if (p2s == 2) { 757 cat("\namelia starting\n") 758 flush.console() 759 } 760 am.call <- match.call(expand.dots = TRUE) 761 archv <- am.call 762 763 prepped<-amelia.prep(x = x, m = m, idvars = idvars, empri = empri, ts = ts, 764 cs = cs, tolerance = tolerance, polytime = polytime, 765 splinetime = splinetime, lags = lags, leads = leads, 766 logs = logs, sqrts = sqrts, lgstc = lgstc, p2s = p2s, 767 frontend = frontend, intercs = intercs, noms = noms, 768 startvals = startvals, ords = ords, incheck = incheck, 769 collect = collect, arglist = arglist, priors = priors, 770 autopri = autopri, bounds = bounds, 771 max.resample = max.resample, overimp = overimp, 772 emburn = emburn, boot.type = boot.type) 773 774 if (prepped$code != 1) { 775 cat("Amelia Error Code: ", prepped$code, "\n", prepped$message, "\n") 776 return(invisible(list(code = prepped$code, message = prepped$message))) 777 } 778 779 780 do.amelia <- function(X, ...) { 781 782 if (p2s == 2) { 783 cat("running bootstrap\n") 784 } 785 k <- ncol(prepped$x) 786 if (!is.null(colnames(x))) { 787 ovars <- colnames(x) 788 } else { 789 ovars <- 1:k 790 } 791 792 code <- 1 793 794 impdata <- list(imputations = list(), 795 m = 1, 796 missMatrix = prepped$missMatrix, 797 overvalues = prepped$overvalues, 798 theta = array(NA, dim = c(k+1,k+1,1) ), 799 mu = matrix(NA, nrow = k, ncol = 1), 800 covMatrices = array(NA, dim = c(k,k,1)), 801 code = integer(0), 802 message = character(0), 803 iterHist = list(), 804 arguments = list(), 805 orig.vars = ovars) 806 class(impdata) <- "amelia" 807 class(impdata$imputations) <- c("mi","list") 808 809 x.boot<-bootx(prepped$x,prepped$priors, boot.type) 810 # Don't reorder columns thetanew will not align with d.stacked$x 811 x.stacked<-amstack(x.boot$x,colorder=FALSE,x.boot$priors) 812 813 if (p2s) cat("-- Imputation", X, "--\n") 814 815 thetanew <- emarch(x.stacked$x, p2s = p2s, thetaold = NULL, 816 tolerance = tolerance, startvals = startvals, 817 priors = x.stacked$priors, empri = empri, 818 frontend = frontend, collect = collect, 819 autopri = prepped$autopri, emburn = emburn) 820 821 ##thetanew <- .Call("emarch", PACKAGE = "Amelia") 822 ## update the amelia ouptut 823 impdata$iterHist[[1]] <- thetanew$iter.hist 824 impdata$theta[,,1] <- thetanew$thetanew 825 impdata$mu[,1] <- thetanew$thetanew[-1,1] 826 impdata$covMatrices[,,1] <- thetanew$thetanew[-1,-1] 827 dimnames(impdata$covMatrices)[[1]] <- prepped$theta.names 828 dimnames(impdata$covMatrices)[[2]] <- prepped$theta.names 829 dimnames(impdata$mu)[[1]] <- prepped$theta.names 830 831 evs <- eigen(thetanew$thetanew[-1, -1, drop = FALSE], only.values=TRUE, 832 symmetric=TRUE) 833 if (any(evs$values < .Machine$double.eps)) { 834 impdata$imputations[[1]] <- NA 835 impdata$code <- 2 836 impdata$arguments <- prepped$archv 837 class(impdata$arguments) <- c("ameliaArgs", "list") 838 839 cat("\n\nThe resulting variance matrix was not invertible.", 840 " Please check your data for highly collinear variables.\n\n") 841 return(impdata) 842 } 843 844 ximp <- amelia.impute(prepped$x, thetanew$thetanew, priors = prepped$priors, 845 bounds = prepped$bounds, max.resample) 846 ximp <- amunstack(ximp, n.order = prepped$n.order, 847 p.order = prepped$p.order) 848 ximp <- unscale(ximp, mu = prepped$scaled.mu, sd = prepped$scaled.sd) 849 ximp <- unsubset(x.orig = prepped$trans.x, x.imp = ximp, 850 blanks = prepped$blanks, idvars = prepped$idvars, 851 ts = prepped$ts, cs = prepped$cs, polytime = polytime, 852 splinetime = splinetime, intercs = intercs, 853 noms = prepped$noms, index = prepped$index, 854 ords = prepped$ords) 855 ximp <- untransform(ximp, logs = prepped$logs, xmin = prepped$xmin, 856 sqrts = prepped$sqrts, lgstc = prepped$lgstc) 857 858 if (p2s==2) { 859 cat("\n saving and cleaning\n") 860 } 861 862 ## here we deal with the imputed matrix. 863 864 ## first, we put the data into the output list and name it 865 impdata$imputations[[1]] <- impfill(x.orig = x, x.imp = ximp, 866 noms = prepped$noms, 867 ords = prepped$ords, priors = priors, 868 overimp = overimp) 869 870 if (p2s) cat("\n") 871 if (frontend) { 872 requireNamespace("tcltk") 873 tcltk::tcl(getAmelia("runAmeliaProgress"), "step",(100/m -1)) 874 } 875 876 impdata$code <- code 877 impdata$arguments <- prepped$archv 878 names(impdata$imputations) <- paste("imp", X, sep = "") 879 class(impdata$arguments) <- c("ameliaArgs", "list") 880 881 return(impdata) 882 } 883 884 ## parallel infrastructure from the 'boot' package 885 impdata <- if (ncpus > 1L && (have_mc || have_snow)) { 886 if (have_mc) { 887 parallel::mclapply(seq_len(m), do.amelia, mc.cores = ncpus) 888 } else if (have_snow) { 889 list(...) # evaluate any promises 890 if (is.null(cl)) { 891 cl <- parallel::makePSOCKcluster(rep("localhost", ncpus)) 892 if(RNGkind()[1L] == "L'Ecuyer-CMRG") 893 parallel::clusterSetRNGStream(cl) 894 res <- parallel::parLapply(cl, seq_len(m), do.amelia) 895 parallel::stopCluster(cl) 896 res 897 } else parallel::parLapply(cl, seq_len(m), do.amelia) 898 } 899 } else lapply(seq_len(m), do.amelia) 900 901 if (all(sapply(impdata, is, class="amelia"))) { 902 if (!all(sapply(impdata, function(x) is.na(x$imputations)))) { 903 impdata <- do.call(ameliabind, impdata) 904 if (impdata$code == 2) { 905 impdata$message <- paste("One or more of the imputations resulted in a", 906 "covariance matrix that was not invertible.") 907 } else { 908 impdata$message <- paste("Normal EM convergence.") 909 } 910 } else { 911 impdata <- do.call(ameliabind, impdata) 912 impdata$code <- 2 913 impdata$message <- paste("All of the imputations resulted in a covariance", 914 "matrix that is not invertible.") 915 } 916 } 917 918 919 return(impdata) 920} 921