1## Code for bootstrapped Amelia ported to R
2## 17/09/05 jh - Added "subset" routine for idvars and completely missing observations
3## 22/09/05 jh - Changed stack function to optionally fix column positions, changed bootx to reject some bootstraps, changed emarch to work when no data missing
4## 23/09/05 mb - Added "amcheck" function to change data and check for errors, "impdata" now in format given to amelia.
5## 24/09/05 jh - Modified "amcheck," added polynomials of time, added ability to impute "logicals" from data frames
6## 25/09/05 jh - Finalized plumbing for observational priors
7## 26/09/05 mb - Added "frontend" argument and screen box to amelia and emarch functions
8## 27/09/05 jh - Added observational and empirical priors
9## 28/09/05 mb - Fixed "frontend" to update the GUI after each print.
10## 30/09/05 mb - "amcheck" expanded, priors passed as individual matrices
11## 07/10/05 mb - Added passing of lags and multiple levels of polynomials;  expanded "amcheck" to cover these
12## 08/10/05 jh - Enabled variable degree of polynomials of time, enabled interaction with cross-section
13## 14/10/05 mb - Put "amcheck" into its own file
14## 21/10/05 mb - Changed "stack" to "amstack" (and "unstack"); added log transformations in "amtransform"; adding "archive" option that saves a list of the settings
15## 21/10/05 mb - Added a soft-crash that will print and output the error number and message.
16## 24/10/05 mb - Added "sqrt" option for square root transformations, "lgstc" for logistic transformations
17## 27/10/05 mb - Enabled lags and leads
18## 9//11/05 mb - Enabled nominals;  added "incheck" to allow skipping amcheck;  moved dataframe->matrix conversion to framemat function.
19## 15/12/05 mb - new (fixed) impute function;
20## 21/02/06 mb - added positive definite check to "startvals", now defaults to identity if not pd.
21## 22/02/06 mb - penrose inverse function added in sweep; soft-crashes on a non invertible covariance matrix at the end of EM
22## 23/02/06 mb - empri increases if EM hits a non-monotonic section; added 'startvals' option; added iteration history to archive;
23## 21/03/06 mb - added "ords" option and added ordinal support in "unsubset"; fixed a bug in nominals that wouldn't fill in imputations;
24## 22/03/06 mb - character/factors can be specified and returned correctly as ordinals;
25## 08/04/06 jh - revised functions to handle large datasets, merged with parallel emb.r version
26## 10/04/06 mb - added "nametonumber" function that converts column names to numbers in all of the list options
27## 28/04/06 mb - extracted transformation functions to prep.r
28## 29/04/06 jh - changed screen output for "p2s", ivector and icap in "indxs", revised "diff" convergence monitor to upper triangular
29## 01/05/06 mb - removed "rbind" calls in emfred, impute.
30## 01/06/06 mb - added "allthetas" option to emarch for overdispersion diagnostic
31## 15/06/06 jh - merged with priors version changing all EM and impute procs, modified how lists are generated in indxs("icap") and amelia("impdata").
32## 27/06/06 mb - added arglist argument to load in output from amelia or the gui.
33## 13/07/06 mb - moved gc() calls out of emfred into emarch
34## 02/08/06 mb - removed data.matrix() call when calling unsubset (moved to prep), fixed impfill for char.
35## 29/08/06 jh - changed tolerance defaults
36## 20/09/06 mb - new option (temp?) keep.data that will trash datasets from memory
37## 01/10/06 mb - added additional info to p2s=2.
38## 27/11/06 mb - new priors format
39## 15/01/07 jh/mb - final version changes, degrees of freedom messages,autoprior option, modified comments, rearranged core arguments
40## 10/05/07 mb - changed 'impute' to 'amelia.impute'
41## 04/07/07 jh - added "emburn" option to modify convergence criteria
42## 04/06/08 mb - changed the writing to GUI ('if (frontend)' calls) to remove globals
43## 17/07/08 mb - fixed frontend error bug (dumping output to screen
44## 22/07/08 mb - good coding update: T->TRUE/F->FALSE
45## 27/03/10 jh - small changes to arguments of functions to deal with "splinetime" option in same fashion as "polytime"
46
47
48## Draw from a multivariate normal distribution
49##   n: number of draws
50##   mu: vector of means
51##   vcv: variance-covariance matrix
52rmvnorm <- function(n,mu,vcv){
53  return(matrix(rnorm(n*length(mu)),n,length(mu)) %*% (chol(vcv)) + (matrix(1,n,1) %*% mu ) )
54}
55
56## Returns the data matrix without the rows with missing values
57## (Same return as na.omit, without any attributes)
58##   x: data matrix
59##   can't send it a vector right now
60packr<-function(x) {
61  r<-is.na(x)
62  sumr<-rowSums(r)
63  x2<-x[sumr==0, , drop=FALSE]
64  return(x2)
65}
66
67## Create dataset bootstrapped from original dataset
68## Rejects Bootstraps where an entire variable becomes missing
69##   x:          data (matrix)
70##   priors:     matrix of priors about means for observations
71bootx<-function(x,priors=NULL, boot.type="np"){
72  flag <- TRUE
73  AMn <- nrow(x)
74  if (!is.null(boot.type)) {
75      if (boot.type == "none") {
76          return(list(x=x,priors=priors))
77      }
78  }
79  while (flag){
80    order<-trunc(runif(nrow(x), min=1, max=nrow(x)+1))
81    xboot<-x[order,]
82    if (!identical(priors,NULL)){
83      sigPriors <- matrix(NA,nrow(x),ncol(x))
84      muPriors <- matrix(NA,nrow(x),ncol(x))
85      muPriors[priors[,1:2]] <- priors[,3]
86      sigPriors[priors[,1:2]] <- priors[,4]
87      muPriors <- muPriors[order,]
88      sigPriors <- sigPriors[order,]
89      prior.ind <- which(!is.na(muPriors), arr.ind = TRUE)
90      priors <- cbind(prior.ind, muPriors[prior.ind], sigPriors[prior.ind])
91                                        # priors[,1]<-match(priors[,1],order)
92                                        #priors <- priors[!is.na(priors[,1]),,drop=FALSE]
93    }
94
95    flag<-any(colSums(is.na(xboot))==AMn & !((1:ncol(xboot)) %in% priors[,2]))
96  }
97  return(list(x=xboot,priors=priors))
98}
99
100## Put imputations into the original data format
101## Converts integer values back to factors or characters
102impfill<-function(x.orig, x.imp, noms, ords, priors, overimp) {
103  if (!is.null(priors)) {
104    is.na(x.orig)[priors[,c(1,2)]] <- TRUE
105  }
106
107  if (!is.null(overimp)) {
108    is.na(x.orig)[overimp] <- TRUE
109  }
110
111  AMr1.orig <- is.na(x.orig)
112  orig.fact <- sapply(x.orig, is.factor)
113  orig.char <- sapply(x.orig, is.character)
114  x.imp <- as.data.frame(x.imp[,1:ncol(x.orig)])
115  for (i in 1:ncol(x.orig)) {
116    if (is.logical(x.orig[,i]) & sum(!is.na(x.orig[,i])) > 0) {
117      x.imp[,i]<-as.logical(x.imp[,i]>0.5)
118    }
119  }
120
121  possibleFactors <- unique(c(noms,ords))
122
123  if (!is.null(possibleFactors)) {
124    if (ncol(x.orig) > length(possibleFactors)) {
125      AMr1.orig <- is.na(x.orig[,-possibleFactors])
126      x.orig[,-possibleFactors][AMr1.orig] <- x.imp[,-possibleFactors][AMr1.orig]
127    }
128    for (i in possibleFactors) {
129      if (orig.fact[i])
130        x.orig[is.na(x.orig[,i]),i]<- levels(x.orig[,i])[x.imp[is.na(x.orig[,i]),i]]
131      else if (orig.char[i])
132        x.orig[,i]<-levels(factor(x.orig[,i]))[x.imp[,i]]
133      else
134        x.orig[is.na(x.orig[,i]),i] <- x.imp[is.na(x.orig[,i]),i]
135    }
136  } else {
137    x.orig[AMr1.orig] <- x.imp[AMr1.orig]
138  }
139  new.char <- sapply(x.orig, is.character)
140  char.match <- orig.char!=new.char
141  if (sum(char.match)!=0)
142    for (i in 1:length(char.match))
143      if (char.match[i])
144        x.orig[,i]<-as.numeric(x.orig[,i])
145
146  return(x.orig)
147}
148
149## Create Starting Values for EM Chain
150startval<-function(x,startvals=0,priors=NULL){
151
152  AMp<-ncol(x)
153  if (!is.null(priors)) {
154    ## fill in prior means
155    x[(priors[,2]-1)*nrow(x)+priors[,1]] <- priors[,3]
156  }
157  if (ncol(as.matrix(startvals)) == AMp+1 && nrow(as.matrix(startvals)) == AMp+1)       #checks for correct size of start value matrix
158    if (startvals[1,1]==-1)                                       #checks for the -1 necessary for sweep
159      return(startvals)
160
161  thetast<-matrix(0,nrow=AMp+1,ncol=AMp+1)  # Create matrix of zeros
162  thetast[row(thetast)==col(thetast)] <- 1  # Create Identity matrix
163  thetast[1,1]<-(-1)
164
165  if (startvals==0){                            # Defaults to Identity if too few rows fully observed
166    cmpr<-packr(x)
167    if (nrow(cmpr)>AMp){
168      means<-colMeans(cmpr)
169      if (all(eigen(cov(cmpr))$values > 10*.Machine$double.eps)) {   #Checks for positive definiteness (positive eigenvalues)
170        thetast[2:(AMp+1),2:(AMp+1)]<-cov(cmpr)                   #.Machine$double.eps instead of 0 to account for rounding.
171        thetast[2:(AMp+1),1]<-means
172        thetast[1,2:(AMp+1)]<-means
173      }
174    }
175  }
176  return(thetast)
177}
178
179## Create certain indicies.  Only needs to be called once, not every pattern.
180## o,m,icap come from omiindxs
181## ivector is i from indexm
182indxs<-function(x){
183
184  AMn<-nrow(x)
185  AMr1<-is.na(x)       # True if missing.
186  AMr2<-unique(AMr1)
187  o<- !AMr2            # (or o<-AMr2==1)  Maybe == is not robust to small fluctuations
188  m<- AMr2             # so put in check procedure (m<-)
189
190  ## The following can be replaced by fortran .dll, although this has only moderate time savings ##
191  ivector<-1
192  for(i in 2:AMn){
193    ischange<- !identical(AMr1[i,],AMr1[i-1,])
194    if(ischange){
195      ivector<-c(ivector,i)
196    }
197  }
198  ivector<-c(ivector,AMn+1)
199#####################################################
200
201  ##  ivector<-.Fortran("indxs",1*AMr1,as.integer(AMn),as.integer(ncol(x)),as.integer(nrow(AMr2)+1),ivector=integer(nrow(AMr2)+1))$ivector
202
203  icap<-vector(mode="list",nrow(AMr2))                     # This is a useful index, although no longer currently used
204  for (i in 2:length(ivector)){
205    icap[[i]]<-seq(ivector[i-1],ivector[i]-1)
206  }
207
208  return(list(AMr1=AMr1,AMr2=AMr2,o=o,m=m,icap=icap,ivector=ivector))
209}
210
211
212## EM chain architecture calls
213emarch<-function(x,p2s=TRUE,thetaold=NULL,startvals=0,tolerance=0.0001,priors=NULL,empri=NULL,frontend=FALSE,collect=FALSE,allthetas=FALSE,autopri=0.05,emburn=c(0,0)){
214  if (p2s == 2) {
215    cat("setting up EM chain indicies\n")
216    flush.console()
217  }
218
219  iter.hist<-matrix(0,nrow=1,ncol=3)
220  if (sum(complete.cases(x)) < nrow(x)){          # Check for fully observed data
221
222    if (identical(thetaold,NULL)) thetaold<-startval(x,startvals=startvals,priors=priors)
223    indx<-indxs(x)                      # This needs x.NA
224    if (!identical(priors,NULL)){
225      priors[,4]<-1/priors[,4]          # change sd to 1/var
226      priors[,3]<-priors[,3]*priors[,4] # get the precision-weighted
227                                        # mus
228      priors <- priors[order(priors[,1],priors[,2]),,drop = FALSE]
229
230    }
231
232    x[is.na(x)]<-0                      # Change x.NA to x.0s
233    AM1stln<-sum(indx$m[1,])==0 & nrow(indx$m) > 1
234    count<-0
235    diff<- 1+tolerance
236    AMr1 <- 1 * indx$AMr1
237    oo <- 1 * indx$o
238    mm <- 1 * indx$m
239    if (is.null(empri)) {
240      empri <- 0
241    }
242    theta <- .Call("emcore", x, AMr1, oo, mm,
243                   indx$ivector, thetaold, tolerance, emburn, p2s,
244                   empri,autopri, allthetas, priors, PACKAGE="Amelia")
245  } else {
246    if (p2s) cat("\n","No missing data in bootstrapped sample:  EM chain unnecessary")
247    pp1<-ncol(x)+1                       # p (the number of variables) plus one
248    means<-colMeans(x)
249    thetanew<-matrix(0,pp1,pp1)
250    thetanew[1,1]<-(-1)
251    thetanew[2:pp1,1]<-means
252    thetanew[1,2:pp1]<-means
253    thetanew[2:pp1,2:pp1]<-cov(x)              # Need to consider Priors in these cases,
254    iter.hist<-NA                              # Although not
255                                        # currently necessary.
256    return(list(thetanew=thetanew,iter.hist=iter.hist))
257  }
258  return(list(thetanew=theta$theta,iter.hist=theta$iter.hist))
259}
260
261## Draw imputations for missing values from a given theta matrix
262amelia.impute<-function(x,thetareal,priors=NULL,bounds=NULL,max.resample=NULL){
263
264  indx<-indxs(x)                      # This needs x.NA
265  if (!identical(priors,NULL)){
266    priors[,4]<-1/priors[,4]
267    priors[,3]<-priors[,3]*priors[,4]
268    priors <- priors[order(priors[,1],priors[,2]),,drop = FALSE]
269  }
270
271
272
273  x[is.na(x)]<-0                      # Change x.NA to x.0s
274  AM1stln<-sum(indx$m[1,])==0 & nrow(indx$m) > 1  # Create sundry simple indicators
275  i<-indx$ivector
276  iii<-indx$icap
277  AMp<-ncol(x)
278  AMn<-nrow(x)
279  AMr1 <- 1 * indx$AMr1
280  oo <- 1 * indx$o
281  mm <- 1 * indx$m
282  if (is.null(bounds)) max.resample <- NULL
283  imp <- .Call("ameliaImpute", x, AMr1, oo, mm, indx$ivector, thetareal,
284                  priors, bounds, max.resample, PACKAGE="Amelia")
285  return(imp)
286
287}
288
289
290
291
292
293#' Combine multiple runs of Amelia
294#'
295#' Combines multiple runs of \code{amelia} with the same
296#' arguments and data into one \code{amelia} object.
297#'
298#' @param ... one or more objects of class \code{amelia} with the same
299#'        arguments and created from the same data.
300#'
301#' @details   \code{ameliabind} will combine multiple runs of \code{amelia} into one
302#' object so that you can utilize diagnostics and modelling on all the
303#' imputations together. This function is useful for combining multiple
304#' runs of \code{amelia} run on parallel machines.
305#'
306#' Note that \code{ameliabind} only checks that they arguments and the
307#' missingness matrix are identical. Thus, it could be fooled by two
308#' datasets that are identical up to a transformation of one variable.
309#'
310#' @return An object of class \code{amelia}.
311#'
312#' @seealso \code{\link{amelia}}
313#'
314#' @examples
315#' data(africa)
316#' a1.out <- amelia(x = africa, cs = "country", ts = "year", logs = "gdp_pc")
317#' a2.out <- amelia(x = africa, cs = "country", ts = "year", logs = "gdp_pc")
318#' all.out <- ameliabind(a1.out, a2.out)
319#' summary(all.out)
320#' plot(all.out)
321#'
322ameliabind <- function(...) {
323  args <- list(...)
324
325  if (any(!sapply(args, is, "amelia")))
326    stop("All arguments must be amelia output.")
327
328  if (length(args) > 1) {
329    ## test that data is the same. we'll just compare the missMatrices.
330    ## this will allow datasets with the same size and missingness
331    ## matrix to be combined unintentionally, but this seems unlikely.
332    datacheck <- lapply(args,
333                        function(x) isTRUE(identical(x$missMatrix,args[[1]]$missMatrix)))
334    if (any(!unlist(datacheck)))
335      stop("Non-compatible datasets.")
336
337    ## test that all the arguments are the same
338    check <- lapply(args,
339                    function(x) isTRUE(identical(x$arguments, args[[1]]$arguments)))
340    if (any(!unlist(check)))
341      stop("Non-compatible amelia arguments")
342
343    check <- lapply(args,
344                    function(x) isTRUE(identical(x$transform.calls,
345                                                 args[[1]]$transform.calls)))
346    if (any(!unlist(check)))
347      stop("Non-compatible transformations on imputed datasets")
348
349    imps <- unlist(lapply(args, function(x) return(x$m)))
350    newm <- sum(imps)
351    impindex <- c(0,cumsum(imps))
352
353    k <- nrow(args[[1]]$mu)
354    out  <- list(imputations = list(),
355                 m           = integer(0),
356                 missMatrix  = matrix(NA,0,0),
357                 overvalues  = args[[1]]$overvalues,
358                 theta       = array(NA, dim = c(k+1,k+1,newm) ),
359                 mu          = matrix(NA, nrow = k, ncol = newm),
360                 covMatrices = array(NA, dim = c(k,k,newm)),
361                 code        = integer(0),
362                 message     = character(0),
363                 iterHist    = list(),
364                 arguments   = list(),
365                 orig.vars   = args[[1]]$orig.vars)
366
367    out$m <- newm
368    out$missMatrix <- args[[1]]$missMatrix
369    out$arguments <- args[[1]]$arguments
370    out$transform.calls <- args[[1]]$transform.calls
371    out$transform.vars <- args[[1]]$transform.vars
372
373    ## since code==1 is good and code==2 means we have an NA,
374    ## then our new output should inherit a 2 if there are any
375    out$code <- max(unlist(lapply(args,function(x) return(x$code))))
376
377    if (out$code > 2)
378      stop("Amelia output contains error.")
379    if (out$code==2)
380      out$message <- "One or more of the imputations resulted in a covariance matrix that was not invertible."
381    else
382      out$message <- "Normal EM convergence"
383
384    for (i in 1:length(args)) {
385      currimps <- (impindex[i]+1):impindex[i+1]
386      out$mu[,currimps] <- args[[i]]$mu
387      out$theta[,,currimps] <- args[[i]]$theta
388      out$covMatrices[,,currimps] <- args[[i]]$covMatrices
389      out$imputations <- c(out$imputations, args[[i]]$imputations)
390      out$iterHist    <- c(out$iterHist, args[[i]]$iterHist)
391
392    }
393    names(out$imputations) <- paste("imp",1:length(out$imputations),sep="")
394    #or: names(out$imputations) <- paste("imp",1:impindex[i+1],sep="")
395    class(out) <- "amelia"
396    class(out$imputations) <- c("mi","list")
397  } else {
398    out <- args[[1]]
399    if (out$code > 2)
400      stop("Amelia output contains error.")
401  }
402  return(out)
403}
404
405getOriginalData <- function(obj) {
406  data <- obj$imputations[[1]]
407  is.na(data) <- obj$missMatrix
408  data <- data[, obj$orig.vars]
409  oi <- obj$arguments$overimp
410  if (!is.null(oi)) {
411    for (i in 1:nrow(oi)) {
412      data[oi[i,1], oi[i,2]] <- obj$overvalues[i]
413    }
414  }
415  return(data)
416}
417
418
419remove.imputations <- function(obj) {
420  data <- obj$imputations[[1]]
421  is.na(data) <- obj$missMatrix
422  oi <- obj$arguments$overimp
423  if (!is.null(oi)) {
424    for (i in 1:nrow(oi)) {
425      data[oi[i,1], oi[i,2]] <- obj$overvalues[i]
426    }
427  }
428  return(data)
429}
430
431## amelia - multiple imputation. core function
432##
433
434
435#' AMELIA: Multiple Imputation of Incomplete Multivariate Data
436#'
437#'   Runs the bootstrap EM algorithm on incomplete data and creates
438#' imputed datasets.
439#'
440#' @author James Honaker
441#' @author Gary King
442#' @author Matt Blackwell
443#'
444#'
445#' @param x either a matrix, data.frame, a object of class
446#'        "amelia", or an object of class "molist". The first two will call the
447#'        default S3 method. The third a convenient way to perform more imputations
448#'        with the same parameters. The fourth will impute based on the settings from
449#'        \code{moPrep} and any additional arguments.
450#' @param m the number of imputed datasets to create.
451#' @param p2s an integer value taking either 0 for no screen output,
452#'        1 for normal screen printing of iteration numbers, and 2 for detailed
453#'        screen output.  See "Details" for specifics on output when p2s=2.
454#' @param frontend a logical value used internally for the GUI.
455#' @param idvars a vector of column numbers or column names that indicates
456#'        identification variables.  These will be dropped from the analysis but
457#'        copied into the imputed datasets.
458#' @param ts column number or variable name indicating the variable identifying time
459#'        in time series data.
460#' @param cs column number or variable name indicating the cross section variable.
461#' @param polytime integer between 0 and 3 indicating what
462#'        power of polynomial should be included in the imputation model
463#'        to account for the effects of time.  A setting of 0 would
464#'        indicate constant levels, 1 would indicate linear time
465#'        effects, 2 would indicate squared effects, and 3 would
466#'        indicate cubic time effects.
467#' @param splinetime interger value of 0 or greater to control cubic
468#'        smoothing splines of time. Values between 0 and 3 create a simple
469#'        polynomial of time (identical to the polytime argument). Values \code{k} greater
470#'        than 3 create a spline with an additional \code{k-3}
471#'        knotpoints.
472#' @param intercs a logical variable indicating if the
473#'        time effects of \code{polytime} should vary across the
474#'        cross-section.
475#' @param lags a vector of numbers or names indicating columns in the data
476#'        that should have their lags included in the imputation model.
477#' @param leads a vector of numbers or names indicating columns in the data
478#'        that should have their leads (future values) included in the imputation
479#'        model.
480#' @param startvals starting values, 0 for the parameter matrix from
481#'        listwise deletion, 1 for an identity matrix.
482#' @param tolerance the convergence threshold for the EM algorithm.
483#' @param logs a vector of column numbers or column names that refer
484#'        to variables that require log-linear transformation.
485#' @param sqrts a vector of numbers or names indicating columns in the data
486#'        that should be transformed by a sqaure root function.  Data in this
487#'        column cannot be less than zero.
488#' @param lgstc a vector of numbers or names indicating columns in the data
489#'        that should be transformed by a logistic function for proportional data.
490#'        Data in this column must be between 0 and 1.
491#' @param noms a vector of numbers or names indicating columns in the data
492#'        that are nominal variables.
493#' @param ords a vector of numbers or names indicating columns in the
494#'        data that should be treated as ordinal variables.
495#' @param incheck a logical indicating whether or not the inputs to the
496#'        function should be checked before running \code{amelia}.  This should
497#'        only be set to \code{FALSE} if you are extremely confident that your
498#'        settings are non-problematic and you are trying to save computational
499#'        time.
500#' @param collect a logical value indicating whether or
501#'        not the garbage collection frequency should be increased during the
502#'        imputation model.  Only set this to \code{TRUE} if you are experiencing memory
503#'        issues as it can significantly slow down the imputation
504#'        process
505#' @param arglist an object of class "ameliaArgs" from a previous run of
506#'        Amelia. Including this object will use the arguments from that run.
507#' @param empri number indicating level of the empirical (or ridge) prior.
508#'        This prior shrinks the covariances of the data, but keeps the means
509#'        and variances the same for problems of high missingness, small N's or
510#'        large correlations among the variables.  Should be kept small,
511#'        perhaps 0.5 to 1 percent of the rows of the data; a
512#'        reasonable upper bound is around 10 percent of the rows of the
513#'        data.
514#' @param priors a four or five column matrix containing the priors for
515#'        either individual missing observations or variable-wide missing
516#'        values.  See "Details" for more information.
517#' @param autopri allows the EM chain to increase the empirical prior if
518#'        the path strays into an nonpositive definite covariance matrix, up
519#'        to a maximum empirical prior of the value of this argument times
520#'        \code{n}, the number of observations.  Must be between 0 and 1, and at
521#'        zero this turns off this feature.
522#' @param emburn a numeric vector of length 2, where \code{emburn[1]} is
523#'        a the minimum EM chain length and \code{emburn[2]} is the
524#'        maximum EM chain length. These are ignored if they are less than 1.
525#' @param bounds a three column matrix to hold logical bounds on the
526#'        imputations. Each row of the matrix should be of the form
527#'        \code{c(column.number, lower.bound,upper.bound)} See Details below.
528#' @param max.resample an integer that specifies how many times Amelia
529#'        should redraw the imputed values when trying to meet the logical
530#'        constraints of \code{bounds}. After this value, imputed values are
531#'        set to the bounds.
532#' @param overimp a two-column matrix describing which cells are to be
533#'        overimputed. Each row of the matrix should be a \code{c(row,column)} pair.
534#'        Each of these cells will be treated as missing and
535#'        replaced with draws from the imputation model.
536#' @param boot.type choice of bootstrap, currently restricted to either
537#'        \code{"ordinary"} for the usual non-parametric bootstrap and
538#'        \code{"none"} for no bootstrap.
539#' @param parallel the type of parallel operation to be used (if any). If
540#'        missing, the default is taken from the option
541#'        \code{"amelia.parallel"} (and if that is not set, \code{"no"}).
542#' @param ncpus integer: the number of processes to be used in parallel
543#'        operation: typically one would choose the number of available CPUs.
544#' @param cl an optional \pkg{parallel} or \pkg{snow} cluster for use if
545#'        \code{parallel = "snow"}. If not supplied, a cluster on the local
546#'        machine is created for the duration of the \code{amelia} call.
547#' @param ... further arguments to be passed.
548#'
549#' @details
550#' Multiple imputation is a method for analyzing incomplete
551#' multivariate data. This function will take an incomplete dataset in
552#' either data frame or matrix form and return \code{m} imputed datatsets
553#' with no missing values. The algorithm first creates a bootstrapped
554#' version of the original data, estimates the sufficient statistics
555#' (with priors if specified) by EM on this bootstrapped sample, and
556#'   then imputes the missing values of the original data using the
557#'   estimated sufficient statistics. It repeats this process \code{m}
558#'   times to produce the \code{m} complete datasets where the
559#'   observed values are the same and the unobserved values are drawn
560#'   from their posterior distributions.
561#'
562#' The function will start a "fresh" run of the algorithm if \code{x} is
563#' either a incomplete matrix or data.frame. In this method, all of the
564#' options will be user-defined or set to their default. If \code{x}
565#'   is  the output of a previous Amelia run (that is, an object of
566#'   class "amelia"), then Amelia will run with the options used in
567#'   that previous run. This is a convenient way to run more
568#'   imputations of the same model.
569#'
570#' You can provide Amelia with informational priors about the missing
571#' observations in your data.  To specify priors, pass a four or five
572#' column matrix to the \code{priors} argument with each row specifying a
573#' different priors as such:
574#'
575#'   \code{ one.prior <- c(row, column, mean,standard deviation)}
576#'
577#' or,
578#'
579#' \code{ one.prior <- c(row, column, minimum, maximum, confidence)}.
580#'
581#' So, in the first and second column of the priors matrix should be the
582#' row and column number of the prior being set.  In the other columns
583#' should either be the mean and standard deviation of the prior, or a
584#' minimum, maximum and confidence level for the prior. You must specify
585#' your priors all as distributions or all as confidence ranges.  Note
586#' that ranges are converted to distributions, so setting a confidence of
587#' 1 will generate an error.
588#'
589#' Setting a priors for the missing values of an entire variable is done
590#' in the same manner as above, but inputing a \code{0} for the row
591#' instead of the row number.  If priors are set for both the entire
592#' variable and an individual observation, the individual prior takes
593#' precedence.
594#'
595#' In addition to priors, Amelia allows for logical bounds on
596#' variables. The \code{bounds} argument should be a matrix with 3
597#' columns, with each row referring to a logical bound on a variable. The
598#' first column should be the column number of the variable to be
599#' bounded, the second column should be the lower bounds for that
600#' variable, and the third column should be the upper bound for that
601#' variable. As Amelia enacts these bounds by resampling, particularly
602#' poor bounds will end up resampling forever. Amelia will stop
603#' resampling after \code{max.resample} attempts and simply set the
604#' imputation to the relevant bound.
605#'
606#' If each imputation is taking a long time to converge, you can increase
607#' the empirical prior, \code{empri}.  This value has the effect of smoothing
608#' out the likelihood surface so that the EM algorithm can more easily find
609#' the maximum.  It should be kept as low as possible and only used if needed.
610#'
611#' Amelia assumes the data is distributed multivariate normal.  There are a
612#' number of variables that can break this assumption.  Usually, though, a
613#' transformation can make any variable roughly continuous and unbounded.
614#' We have included a number of commonly needed transformations for data.
615#' Note that the data will not be transformed in the output datasets and the
616#' transformation is simply useful for climbing the likelihood.
617#'
618#' Amelia can run its imputations in parallel using the methods of the
619#' \pkg{parallel} package. The \code{parallel} argument names the
620#' parallel backend that Amelia should use. Users on Windows systems must
621#' use the \code{"snow"} option and users on Unix-like systems should use
622#' \code{"multicore"}.  The \code{multicore} backend sets itself up
623#' automatically, but the \code{snow} backend requires more setup. You
624#' can pass a predefined cluster from the
625#' \code{parallel::makePSOCKcluster} function to the \code{cl}
626#' argument. Without this cluster, Amelia will attempt to create a
627#' reasonable default cluster and stop it once computation is
628#' complete. When using the parallel backend, users can set the number of
629#' CPUs to use with the \code{ncpus} argument. The defaults for these two
630#' arguments can be set with the options \code{"amelia.parallel"} and
631#' \code{"amelia.ncpus"}.
632#'
633#' Please refer to the Amelia manual for more information on the function
634#' or the options.
635#'
636#' @return An instance of S3 class "amelia" with the following objects:
637#' \item{imputations}{a list of length \code{m} with an imputed dataset in
638#'   each entry. The class (matrix or data.frame) of these entries will
639#'   match \code{x}.}
640#' \item{m}{an integer indicating the number of imputations run.}
641#' \item{missMatrix}{a matrix identical in size to the original dataset
642#'   with 1 indicating a missing observation and a 0 indicating an observed
643#'   observation.}
644#' \item{theta}{An array with dimensions \eqn{(p+1)} by \eqn{(p+1)} by \eqn{m}  (where
645#' \eqn{p} is the number of variables in the imputations model) holding
646#'   the converged parameters for each of the \code{m} EM chains.}
647#' \item{mu}{A \eqn{p} by \eqn{m} matrix of of the posterior modes for the
648#'   complete-data means in each of the EM chains.}
649#' \item{covMatrices}{An array with dimensions \eqn{(p)} by \eqn{(p)} by
650#'   \eqn{m} where the first two dimensions hold the posterior modes of the
651#'   covariance matrix of the complete data for each of the EM chains.}
652#' \item{code}{a integer indicating the exit code of the Amelia run.}
653#' \item{message}{an exit message for the Amelia run}
654#' \item{iterHist}{a list of iteration histories for each EM chain. See
655#'   documentation for details.}
656#' \item{arguments}{a instance of the class "ameliaArgs" which holds the
657#'   arguments used in the Amelia run.}
658#' \item{overvalues}{a vector of values removed for overimputation. Used to
659#'   reformulate the original data from the imputations. }
660#'
661#' Note that the \code{theta}, \code{mu} and \code{covMatrcies} objects
662#' refers to the data as seen by the EM algorithm and is thusly centered,
663#' scaled, stacked, tranformed and rearranged. See the manual for details
664#' and how to access this information.
665#'
666#' @references
667#' Honaker, J., King, G., Blackwell, M. (2011).
668#' Amelia II: A Program for Missing Data.
669#' \emph{Journal of Statistical Software}, \bold{45(7)}, 1--47.
670#' URL http://www.jstatsoft.org/v45/i07/.
671#'
672#' @seealso For imputation diagnostics, \code{\link{missmap}},
673#' \code{\link{compare.density}},
674#' \code{\link{overimpute}} and \code{\link{disperse}}. For time series
675#' plots, \code{\link{tscsPlot}}. Also: \code{\link{plot.amelia}},
676#' \code{\link{write.amelia}}, and \code{\link{ameliabind}}
677#'
678#' @examples
679#' data(africa)
680#' a.out <- amelia(x = africa, cs = "country", ts = "year", logs = "gdp_pc")
681#' summary(a.out)
682#' plot(a.out)
683#'
684#' @keywords models
685amelia <- function(x, ...) {
686  UseMethod("amelia", x)
687}
688
689#' @describeIn amelia Run additional imputations for Amelia output
690amelia.amelia <- function(x, m = 5, p2s = 1, frontend = FALSE, ...) {
691
692  ## The original data is the imputed data with the
693  ## imputations marked to NA. These two lines do that
694  data <- x$imputations[[1]]
695
696  ## Only the variables in the missMatrix should be passed. This is
697  ## because the others are
698  data <- getOriginalData(x)
699
700  out <- amelia.default(x = data, m = m, arglist=x$arguments, p2s=p2s,
701                        frontend = frontend, incheck=FALSE)
702  num.tcalls <- length(x$transform.calls)
703  if (num.tcalls > 0) {
704    for (i in 1:num.tcalls) {
705      tcall <- x$transform.calls[[i]]
706      tcall[[2]] <- as.name("out")
707      out <- eval(tcall)
708    }
709    out$transform.calls <- x$transform.calls
710  }
711  ret <- ameliabind(x, out)
712  return(ret)
713}
714
715#' @describeIn amelia Perform multiple overimputation from moPrep
716amelia.molist <- function(x, ...) {
717  m <- match.call(expand.dots=TRUE)
718  m$x <- x$data
719  m$priors <- x$priors
720  m$overimp <- x$overimp
721  m[[1]] <- quote(Amelia::amelia.default)
722  ret <- eval(m, parent.frame())
723
724  return(ret)
725}
726
727#' @describeIn amelia Run core Amelia algorithm
728amelia.default <- function(x, m = 5, p2s = 1, frontend = FALSE, idvars = NULL,
729                           ts = NULL, cs = NULL, polytime = NULL,
730                           splinetime = NULL, intercs = FALSE, lags = NULL,
731                           leads = NULL, startvals = 0, tolerance = 0.0001,
732                           logs = NULL, sqrts = NULL, lgstc = NULL, noms = NULL,
733                           ords = NULL, incheck = TRUE, collect = FALSE,
734                           arglist = NULL, empri = NULL, priors = NULL,
735                           autopri = 0.05, emburn = c(0,0), bounds = NULL,
736                           max.resample = 100, overimp = NULL,
737                           boot.type = "ordinary",
738                           parallel = c("no", "multicore", "snow"),
739                           ncpus = getOption("amelia.ncpus", 1L), cl = NULL,
740                           ...) {
741
742  ## parellel infrastructure modeled off of 'boot' package
743  if (missing(parallel)) parallel <- getOption("amelia.parallel", "no")
744  parallel <- match.arg(parallel)
745  have_mc <- have_snow <- FALSE
746  if (parallel != "no" && ncpus > 1L) {
747    if (parallel == "multicore") have_mc <- .Platform$OS.type != "windows"
748    else if (parallel == "snow") have_snow <- TRUE
749    if (!have_mc && !have_snow) ncpus <- 1L
750    if (p2s == 2) {
751      cat("\nUsing '", parallel, "' parallel backend with", ncpus, "cores.")
752    }
753  }
754
755
756  if (p2s == 2) {
757    cat("\namelia starting\n")
758    flush.console()
759  }
760  am.call <- match.call(expand.dots = TRUE)
761  archv <- am.call
762
763  prepped<-amelia.prep(x = x, m = m, idvars = idvars, empri = empri, ts = ts,
764                       cs = cs, tolerance = tolerance, polytime = polytime,
765                       splinetime = splinetime, lags = lags, leads = leads,
766                       logs = logs, sqrts = sqrts, lgstc = lgstc, p2s = p2s,
767                       frontend = frontend, intercs = intercs, noms = noms,
768                       startvals = startvals, ords = ords, incheck = incheck,
769                       collect = collect, arglist = arglist, priors = priors,
770                       autopri = autopri, bounds = bounds,
771                       max.resample = max.resample, overimp = overimp,
772                       emburn = emburn, boot.type = boot.type)
773
774  if (prepped$code != 1) {
775    cat("Amelia Error Code: ", prepped$code, "\n", prepped$message, "\n")
776    return(invisible(list(code = prepped$code, message = prepped$message)))
777  }
778
779
780  do.amelia <- function(X, ...) {
781
782    if (p2s == 2) {
783      cat("running bootstrap\n")
784    }
785    k <- ncol(prepped$x)
786    if (!is.null(colnames(x))) {
787      ovars <- colnames(x)
788    } else {
789      ovars <- 1:k
790    }
791
792    code <- 1
793
794    impdata <- list(imputations = list(),
795                    m           = 1,
796                    missMatrix  = prepped$missMatrix,
797                    overvalues  = prepped$overvalues,
798                    theta       = array(NA, dim = c(k+1,k+1,1) ),
799                    mu          = matrix(NA, nrow = k, ncol = 1),
800                    covMatrices = array(NA, dim = c(k,k,1)),
801                    code        = integer(0),
802                    message     = character(0),
803                    iterHist    = list(),
804                    arguments   = list(),
805                    orig.vars   = ovars)
806    class(impdata) <- "amelia"
807    class(impdata$imputations) <- c("mi","list")
808
809    x.boot<-bootx(prepped$x,prepped$priors, boot.type)
810    # Don't reorder columns thetanew will not align with d.stacked$x
811    x.stacked<-amstack(x.boot$x,colorder=FALSE,x.boot$priors)
812
813    if (p2s) cat("-- Imputation", X, "--\n")
814
815    thetanew <- emarch(x.stacked$x, p2s = p2s, thetaold = NULL,
816                       tolerance = tolerance, startvals = startvals,
817                       priors = x.stacked$priors, empri = empri,
818                       frontend = frontend, collect = collect,
819                       autopri = prepped$autopri, emburn = emburn)
820
821    ##thetanew <- .Call("emarch", PACKAGE = "Amelia")
822    ## update the amelia ouptut
823    impdata$iterHist[[1]]    <- thetanew$iter.hist
824    impdata$theta[,,1]       <- thetanew$thetanew
825    impdata$mu[,1]           <- thetanew$thetanew[-1,1]
826    impdata$covMatrices[,,1] <- thetanew$thetanew[-1,-1]
827    dimnames(impdata$covMatrices)[[1]] <- prepped$theta.names
828    dimnames(impdata$covMatrices)[[2]] <- prepped$theta.names
829    dimnames(impdata$mu)[[1]] <- prepped$theta.names
830
831    evs <- eigen(thetanew$thetanew[-1, -1, drop = FALSE], only.values=TRUE,
832                 symmetric=TRUE)
833    if (any(evs$values < .Machine$double.eps)) {
834      impdata$imputations[[1]] <- NA
835      impdata$code <- 2
836      impdata$arguments <- prepped$archv
837      class(impdata$arguments) <- c("ameliaArgs", "list")
838
839      cat("\n\nThe resulting variance matrix was not invertible.",
840          "  Please check your data for highly collinear variables.\n\n")
841      return(impdata)
842    }
843
844    ximp <- amelia.impute(prepped$x, thetanew$thetanew, priors = prepped$priors,
845                          bounds = prepped$bounds, max.resample)
846    ximp <- amunstack(ximp, n.order = prepped$n.order,
847                      p.order = prepped$p.order)
848    ximp <- unscale(ximp, mu = prepped$scaled.mu, sd = prepped$scaled.sd)
849    ximp <- unsubset(x.orig = prepped$trans.x, x.imp = ximp,
850                     blanks = prepped$blanks, idvars = prepped$idvars,
851                     ts = prepped$ts, cs = prepped$cs, polytime = polytime,
852                     splinetime = splinetime, intercs = intercs,
853                     noms = prepped$noms, index = prepped$index,
854                     ords = prepped$ords)
855    ximp <- untransform(ximp, logs = prepped$logs, xmin = prepped$xmin,
856                        sqrts = prepped$sqrts, lgstc = prepped$lgstc)
857
858    if (p2s==2) {
859      cat("\n saving and cleaning\n")
860    }
861
862    ## here we deal with the imputed matrix.
863
864    ## first, we put the data into the output list and name it
865    impdata$imputations[[1]] <- impfill(x.orig = x, x.imp = ximp,
866                                        noms = prepped$noms,
867                                        ords = prepped$ords, priors = priors,
868                                        overimp = overimp)
869
870    if (p2s) cat("\n")
871    if (frontend) {
872      requireNamespace("tcltk")
873      tcltk::tcl(getAmelia("runAmeliaProgress"), "step",(100/m -1))
874    }
875
876    impdata$code <- code
877    impdata$arguments <- prepped$archv
878    names(impdata$imputations) <- paste("imp", X, sep = "")
879    class(impdata$arguments) <- c("ameliaArgs", "list")
880
881    return(impdata)
882  }
883
884  ## parallel infrastructure from the 'boot' package
885  impdata <- if (ncpus > 1L && (have_mc || have_snow)) {
886    if (have_mc) {
887      parallel::mclapply(seq_len(m), do.amelia, mc.cores = ncpus)
888    } else if (have_snow) {
889      list(...) # evaluate any promises
890      if (is.null(cl)) {
891        cl <- parallel::makePSOCKcluster(rep("localhost", ncpus))
892        if(RNGkind()[1L] == "L'Ecuyer-CMRG")
893          parallel::clusterSetRNGStream(cl)
894        res <- parallel::parLapply(cl, seq_len(m), do.amelia)
895        parallel::stopCluster(cl)
896        res
897      } else parallel::parLapply(cl, seq_len(m), do.amelia)
898    }
899  } else lapply(seq_len(m), do.amelia)
900
901  if (all(sapply(impdata, is, class="amelia"))) {
902    if (!all(sapply(impdata, function(x) is.na(x$imputations)))) {
903      impdata <- do.call(ameliabind, impdata)
904      if (impdata$code == 2) {
905        impdata$message <- paste("One or more of the imputations resulted in a",
906                                 "covariance matrix that was not invertible.")
907      } else {
908        impdata$message <- paste("Normal EM convergence.")
909      }
910    } else {
911      impdata <- do.call(ameliabind, impdata)
912      impdata$code <- 2
913      impdata$message <- paste("All of the imputations resulted in a covariance",
914                               "matrix that is not invertible.")
915    }
916  }
917
918
919  return(impdata)
920}
921