1##  diag.r
2##  amelia diagnostic functins
3##
4##  05/05/06 mb - added amelia.arg compatibility
5##  07/05/06 jh - compare: changed how variable names are found, changed titles/labels, set x-axis values in matplot, colours for no imputations
6##                overimpute: added new m-name in output,
7##  09/05/06 mb - overimpute: added frontend check for overimpute.
8##  15/05/06 jh - overimpute: stacking of original data, and various graphics adjustments
9##  01/06/06 mb - added "gethull" and "disperse" for overdispersion diagnostic
10##  19/07/06 mb - moved handling of arglists to prep.
11##  01/12/06 mb - can't compare non-numerics, only use the relevant columns when
12##                building compare
13##  13/12/06 mb - changed for new priors.
14##  26/03/07 jh - overimpute: excluded polynomials of time from missingness count, reordered ploting of ci's (smallest last), allow variable name as var argument
15##  28/03/07 jh - disperse: changed tolerance and empri handling.
16##  03/04/07 jh - disperse: changed 1d plot settings, number of colors, minor edits to "patt" construction.
17##  10/04/07 jh - created sigalert function to view disperse principal components.
18##  22/07/08 mb - good coding update: T->TRUE/F->FALSE
19##  10/02/09 mb - compare: added lwd, col, main, lab, etc for user
20##                control, added scale so that users can control scaling,
21##                uses amelia class
22##                overimpute: uses amelia class, added lwd, col, main, lab, etc for user
23##                disperse: now uses amelia class
24##  02/21/12 jh - added mi.meld to combine multiply imputed quantities of interest and se's.
25##  10/30/12 jh - tscsPlot: expanded to allow to cycle through sets of cross sectional units efficiently.
26
27
28#' Compare observed versus imputed densities
29#'
30#' Plots smoothed density plots of observed and imputed values from output
31#' from the \code{amelia} function.
32#'
33#' @param output output from the function \code{amelia}.
34#' @param var column number or variable name of the variable to plot.
35#' @param col a vector of length 2 containing the color to plot the (1)
36#'        imputed density and (2) the observed density.
37#' @param scaled a logical indicating if the two densities should be
38#'        scaled to reflect the difference in number of units in each.
39#' @param lwd the line width of the density plots.
40#' @param main main title of the plot. The default is to title the plot
41#'        using the variable name.
42#' @param xlab the label for the x-axis. The default is the name of the
43#'        variable.
44#' @param ylab the label for the y-axis. The default is "Relative Density."
45#' @param legend a logical value indicating if a legend should be
46#'        plotted.
47#' @param frontend a logical value used internally for the Amelia GUI.
48#' @param ... further graphical parameters for the plot.
49#'
50#' @details   This function first plots a density plot of the observed units for the
51#' variable \code{var} in \code{col[2]}. The the function plots a density plot of the mean
52#' or modal imputations for the missing units in \code{col[1]}. If a
53#' variable is marked "ordinal" or "nominal" with the \code{ords} or
54#' \code{noms} options in \code{amelia}, then the modal imputation will
55#' be used. If \code{legend} is \code{TRUE}, then a legend is plotted as well.
56#'
57#' @references
58#' Abayomi, K. and Gelman, A. and Levy, M. 2005 "Diagnostics for
59#' Multivariate Imputations," \emph{Applied Statistics}. 57,3: 273--291.
60#'
61#' @examples
62#' data(africa)
63#'
64#' @seealso For more information on how densities are computed,
65#' \code{\link{density}}; Other imputation diagnostics are
66#' \code{\link{overimpute}}, \code{\link{disperse}}, and
67#' \code{\link{tscsPlot}}.
68#'
69compare.density <- function(output, var, col = c("indianred", "dodgerblue"),
70                            scaled = FALSE, lwd = 1, main, xlab, ylab,
71                            legend = TRUE, frontend = FALSE, ...) {
72
73  if (!("amelia" %in% class(output)))
74    stop("The 'output' is not Amelia output.")
75
76  ##data <- getOriginalData(output)
77  data <- remove.imputations(output)
78
79  ## Checks on if the variable makes sense to plot.
80  if (class(var) == "character")
81    if (!(var %in% names(data)))
82      stop("The variable name (var) doesn't correspond to a column in the data.")
83    else
84      var <- match(var,names(data))
85  if (any(var > ncol(data), var < 0, (var %% 1) != 0))
86    stop("The 'var' option points to a non-existant column.")
87  if (var %in% output$arguments$idvar)
88    stop("the variable selected was marked as an idvar")
89
90  ## We need to clean the data to make sure that
91  ## we're not going to run into NAs
92  mcount <- sum(!is.na(output$imputations))
93  imputed <- (1:output$m)[!is.na(output$imputations)]
94
95  ## create an empty vector to sum across
96  varimp <- matrix(NA, nrow(data), mcount)
97
98  for (i in 1:mcount) {
99    if (is.data.frame(data)) {
100      varimp[,i] <- output$imputations[[imputed[i]]][[var]]
101    } else {
102      varimp[,i] <- output$imputations[[imputed[i]]][,var]
103    }
104  }
105  if (var %in% c(output$arguments$noms, output$arguments$ords)) {
106    leg.text <- "Modal Imputations"
107    varimp <- apply(varimp, 1, function(x) as.numeric(names(which.max(table(x)))))
108  } else {
109    leg.text <- "Mean Imputations"
110    varimp <- rowMeans(varimp)
111  }
112
113  if (frontend) {
114    dev.new()
115  }
116
117  if (is.data.frame(data)) {
118    vars <- data[[var]]
119  } else {
120    vars <- data[,var]
121  }
122
123  if (scaled)
124    ratio <- sum(is.na(vars))/sum(!is.na(vars))
125  else
126    ratio <- 1
127  varnames <- dimnames(data)[[2]]            # This will work for both data.frames AND matricies.
128  vname <- varnames[var]                     # This will work for both data.frames AND matricies.
129
130
131  if (sum(is.na(vars)) > 1) {
132    oiDetect <- (sum(output$missMatrix[,var]) + sum(!is.na(vars))) > length(vars)
133    if (missing(main)) {
134      if (oiDetect) {
135        main <-  paste("Observed and Overimputed values of", vname)
136      } else {
137        main <- paste("Observed and Imputed values of", vname)
138      }
139    }
140    if (missing(xlab)) {
141      xlab <- paste(vname,"  --  Fraction Missing:", round(mean(is.na(vars)), digits = 3))
142
143    }
144    if (missing(ylab)) {
145      ylab <- "Relative Density"
146    }
147
148    xmiss <- density(varimp[output$missMatrix[, var]], na.rm = TRUE)
149    xobs  <- density(vars[!is.na(vars)], na.rm = TRUE)
150    compplot <- matplot(x = cbind(xmiss$x, xobs$x),
151                        y = cbind(ratio * xmiss$y, xobs$y),
152                        xlab = xlab, ylab = ylab, type = "l", lwd = lwd,
153                        lty = 1, main = main, col = col, ...)
154    if (legend) {
155      legend("topright", legend = c(leg.text, "Observed Values"),
156             col = col, lty = c(1,1), bg = 'gray90', lwd = lwd)
157    }
158  } else {
159    if (missing(main)) {
160      main <- paste("Observed values of",vname)
161    }
162    if (missing(xlab)) {
163      xlab <- vname
164    }
165    if (missing(ylab)) {
166      ylab <- "Relative Density"
167    }
168
169    compplot <- plot(density(varimp, na.rm = TRUE), col = col[2],
170                     main = main,...)
171    if (sum(is.na(vars)) == 1) {
172      abline(v = varimp[output$missMatrix[, var]], col = col[1])
173    }
174
175    if (legend) {
176      legend("topright", legend = c("Mean Imputations","Observed Values"),
177             col = col, lty = c(1,1), bg = 'gray90')
178    }
179  }
180
181  invisible()
182}
183
184
185
186#' Overimputation diagnostic plot
187#'
188#' Treats each observed value as missing and imputes from the imputation
189#' model from \code{amelia} output.
190#'
191#' @param output output from the function \code{amelia}.
192#' @param var column number or variable name of the variable to
193#'        overimpute.
194#' @param draws the number of draws per imputed dataset to generate
195#'   overimputations. Total number of simulations will \code{m *
196#'   draws} where \code{m} is the number of imputations.
197#' @param subset an optional vector specifying a subset of observations
198#'        to be used in the overimputation.
199#' @param legend a logical value indicating if a legend should be
200#'        plotted.
201#' @param xlab the label for the x-axis. The default is "Observed Values."
202#' @param ylab the label for the y-axis. The default is "Imputed Values."
203#' @param main main title of the plot. The default is to smartly title the plot
204#'        using the variable name.
205#' @param frontend a logical value used internally for the Amelia GUI.
206#' @param ... further graphical parameters for the plot.
207#'
208#' @details
209#' This function temporarily treats each observed value in
210#' \code{var} as missing and imputes that value based on the imputation
211#' model of \code{output}. The dots are the mean imputation and the
212#' vertical lines are the 90\% percent confidence intervals for
213#' imputations of each observed value. The diagonal line is the \eqn{y=x}
214#' line. If all of the imputations were perfect, then our points would
215#' all fall on the line. A good imputation model would have about 90\% of
216#' the confidence intervals containing the truth; that is, about 90\% of
217#' the vertical lines should cross the diagonal.
218#'
219#' The color of the vertical lines displays the fraction of missing
220#' observations in the pattern of missingness for that
221#' observation. The legend codes this information. Obviously, the
222#' imputations will be much tighter if there are more observed covariates
223#' to use to impute that observation.
224#'
225#' The \code{subset} argument evaluates in the environment of the
226#' data. That is, it can but is not required to refer to variables in the
227#' data frame as if it were attached.
228#'
229#' @return A list that contains (1) the row in the original data
230#'   (\code{row}), (2) the observed value of that observation
231#'   (\code{orig}), (2) the mean of the overimputations
232#'   (\code{mean.overimputed}), (3) the lower bound of the 95\%
233#'   confidence interval of the overimputations
234#'   (\code{lower.overimputed}), (4) the upper bound of the 95\%
235#'   confidence interval of the overimputations
236#'   (\code{upper.overimputed}), (5) the fraction of the variables
237#'   that were missing for that observation in the original data
238#'   (\code{prcntmiss}), and (6) a matrix of the raw overimputations,
239#'   with observations in rows and the different draws in columns (\code{overimps}).
240#'
241#' @seealso Other imputation diagnostics are
242#' \code{\link{compare.density}}, \code{\link{disperse}}, and
243#' \code{\link{tscsPlot}}.
244
245overimpute <- function(output, var, draws = 20, subset, legend = TRUE, xlab,
246                       ylab, main, frontend = FALSE, ...) {
247
248  if (!("amelia" %in% class(output)))
249    stop("The 'output' is not Amelia output.")
250
251
252  data <- getOriginalData(output)
253
254  ## via the subset.data.frame function
255  if (missing(subset)) {
256    r <- TRUE
257  } else {
258    e <- substitute(subset)
259    r <- eval(e, data, parent.frame())
260    if (!is.logical(r)) {
261      stop("'subset' must evaluate to logical")
262    }
263    r <- r & !is.na(r)
264    if (sum(r) == 0) {
265      stop("no observations in the subset")
266    }
267  }
268  data <- data[r,]
269
270  origAMr1 <- is.na(data)
271
272
273  ## Allow character names as arguments for "var" with data.frames
274
275  if(is.character(var)){
276    if(!is.data.frame(data)){
277      stop("var must be identified by column number as dataset is not a data frame.")
278    } else {
279      nomnames <- colnames(output$imputations[[1]])[output$arguments$noms]
280      if (var %in% nomnames) {
281        stop("Cannot overimpute variables set to be nominal")
282      }
283      varpos <- match(var, colnames(data))
284      if(is.na(varpos)){
285        stop("The name provided for var argument does not exist in the dataset provided.")
286      } else {
287        var <- varpos
288      }
289    }
290  }
291
292
293  ## The argument list for an amelia output is now
294  ## at "output$arguments"
295  prepped <- amelia.prep(x = data, arglist = output$arguments, incheck = FALSE)
296
297  stacked.var <- match(var, prepped$subset.index[prepped$p.order])
298  subset.var <- match(var, prepped$subset.index)
299  if (!is.null(prepped$blanks))
300    fully.missing <- origAMr1[-prepped$blanks, var][prepped$n.order]
301  else
302    fully.missing <- origAMr1[, var][prepped$n.order]
303
304  if (is.na(stacked.var)) {
305    if (frontend)
306      tcltk::tkmessageBox(message="The variable you selected doesn't exist in the Amelia output becuase it wasn't imputed.",icon="error",type="ok")
307    stop("var doesn't exist in the amelia output.  It either didn't get imputed or is out of the range of columns.")
308  }
309
310  means <- c()
311  lowers <- c()
312  uppers <- c()
313  pcnts <- c()
314  color <- c()
315  AMr1 <- is.na(prepped$x)
316  ## if (sum(!AMr1[,stacked.var]) == 0){
317  ##   if (frontend) {
318  ##     tkmessageBox(parent = getAmelia("gui"),
319  ##                  message="The variable needs to have at least one fully observed cell.",icon="error",type="ok")
320  ##   }
321  ##   stop("function needs at least one fully observed cell in 'var'.")
322  ## }
323  AMr1[,stacked.var] <- TRUE
324  AMp <- ncol(prepped$x)
325  imphold <- matrix(NA, nrow = nrow(prepped$x), ncol = output$m * draws)
326  for (i in 1:nrow(prepped$x)) {
327    if (fully.missing[i]) {
328      next()
329    }
330
331    x <- prepped$x[i,,drop=FALSE]
332    x[1, stacked.var] <- NA
333    o <- !is.na(x)
334    miss <- !o
335    x[is.na(x)] <- 0
336    oo <- 1 * o
337    mm <- 1 * miss
338                                        #o<-!AMr1[i,]
339                                        #o[stacked.var]<-FALSE
340
341    pcntmiss <- (sum(miss))/(length(miss)-sum(prepped$index==0))   # Does not include time polynomials (index==0) in the denominator
342    ## These are always fully observed by construction, but auxiliary.
343    ## Leaves constructed lags and
344    ## leads, and nominal variables
345    ## in count, however.
346
347    conf <- c()
348    for (k in 1:output$m) {
349
350      ## The theta matrix is now stored in an array with
351      ## dimensions c(vars+1,vars+1,m), so this grabs
352      ## the kth theta matrix.
353
354      thetareal <- output$theta[,,k]
355      xx <- matrix(x, draws, AMp, byrow = TRUE)
356      rr <- matrix(AMr1[i,], draws, AMp, byrow = TRUE)
357      xc <- .Call("ameliaImpute", xx, rr, oo, mm, c(1, nrow(xx) + 1), thetareal, NULL,
358                   NULL, NULL, PACKAGE = "Amelia")
359      conf <- c(conf, xc[, stacked.var])
360    }
361
362    scaled.conf <- (conf * prepped$scaled.sd[subset.var])  + prepped$scaled.mu[subset.var]
363    varlog <- match(var, prepped$logs)
364
365    if (!is.na(varlog)) {
366      scaled.conf <- untransform(as.matrix(scaled.conf), logs = 1,
367                                 xmin = prepped$xmin[varlog], sqrts = NULL,
368                                 lgstc = NULL)
369    }
370    if (!is.na(match(var,prepped$sqrts))) {
371      scaled.conf <- untransform(as.matrix(scaled.conf), logs = NULL,
372                                 xmin = NULL, sqrts = 1, lgstc = NULL)
373    }
374    if (!is.na(match(var,prepped$lgstc))) {
375      scaled.conf <- untransform(as.matrix(scaled.conf), logs = NULL,
376                                 xmin = NULL, sqrts = NULL, lgstc = 1)
377    }
378
379    ##colors are based on rainbow roygbiv l->r is higher missingness  \
380    blue <- rgb(0,0,1, alpha = 0.75)
381    green <- rgb(0,.75,0, alpha = 0.75)
382    orange <- rgb(1, 0.65,0, alpha = 0.75)
383    tomato <- rgb(1, 0.39, 0.28, alpha = 0.75)
384    red <- rgb(0.75, 0, 0, alpha = 0.75)
385    spectrum <- c(blue, green, orange, tomato, red)
386
387    if (pcntmiss < .20)
388      color <- c(color, spectrum[1])
389    else if (pcntmiss >= .20 && pcntmiss < .40)
390      color <- c(color, spectrum[2])
391    else if (pcntmiss >= .40 && pcntmiss < .60)
392      color <- c(color, spectrum[3])
393    else if (pcntmiss >= .60 && pcntmiss < .80)
394      color <- c(color, spectrum[4])
395    else if (pcntmiss >= .80)
396      color <- c(color, spectrum[5])
397
398    imphold[i,] <- scaled.conf
399    means <- c(means, mean(scaled.conf))
400    lowers <- c(lowers, sort(scaled.conf)[round(output$m * draws * 0.05)])
401    uppers <- c(uppers, sort(scaled.conf)[round(output$m * draws * 0.95)])
402    pcnts <- c(pcnts, pcntmiss)
403
404  }
405
406                                        #AMr1<-is.na(prepped$x[,stacked.var])
407                                        #partial.n.order<-prepped$n.order[!origAMr1]
408
409  if (is.data.frame(data)) {
410    xplot <- data[[var]]
411  } else {
412    xplot <- data[,var]
413  }
414  if (is.null(prepped$blanks)) {
415    xplot <- xplot[prepped$n.order][!fully.missing]
416  } else {
417    xplot <- xplot[-prepped$blanks][prepped$n.order][!fully.missing]
418  }
419  addedroom <- (max(uppers) - min(lowers)) * 0.1
420  if (!hasArg(log)) {
421    this.ylim <- range(c(lowers - addedroom, uppers))
422    legpos <- "bottomright"
423  } else {
424    this.ylim <- range(c(lowers[lowers > 0], uppers + addedroom))
425    legpos <- "topright"
426  }
427
428  if (missing(xlab)) {
429    xlab <- "Observed Values"
430  }
431  if (missing(ylab)) {
432    ylab <- "Imputed Values"
433  }
434  if (missing(main)) {
435    main <- paste("Observed versus Imputed Values of",colnames(data)[var])
436  }
437
438  if (frontend) {
439    dev.new()
440  }
441  ci.order <- order(uppers - lowers, decreasing = TRUE)     # Allows smallest CI's to be printed last, and thus not buried in the plot.
442  overplot <- plot(xplot[ci.order], means[ci.order], xlab = xlab, ylab = ylab,
443                   ylim = this.ylim, type = 'p', main = main,
444                   col = color[ci.order], pch = 19,...)
445  segments(xplot[ci.order], lowers[ci.order], xplot[ci.order], uppers[ci.order],
446           col = color[ci.order])
447  if (legend) {
448    legend(legpos, legend = c(" 0-.2",".2-.4",".4-.6",".6-.8",".8-1"),
449           col = spectrum, lty = c(1,1), horiz = TRUE, bty = "n")
450  }
451
452  abline(0,1)
453
454  out <- list(row = prepped$n.order[!fully.missing],
455              orig = xplot,
456              mean.overimputed = means,
457              lower.overimputed = lowers,
458              upper.overimputed = uppers,
459              prcntmiss = pcnts,
460              overimps = imphold[!is.na(imphold[,1]),])
461  invisible(out)
462}
463
464gethull <- function(st,tol,rots) {
465  stvec <- st
466  for (i in 1:length(st)) {
467    addedvec <- rep(0,length(st))
468    addedvec[i] <- tol * 100
469    newvec <- cbind(st + addedvec, st - addedvec)
470    stvec <- cbind(stvec, newvec)
471  }
472  reduced.hull <- t(rots) %*% stvec
473  return(reduced.hull)
474}
475
476
477#' Overdispersed starting values diagnostic for multiple imputation
478#'
479#' A visual diagnostic of EM convergence from multiple overdispersed
480#' starting values for an output from \code{amelia}.
481#'
482#' @param output output from the function \code{amelia}.
483#' @param m the number of EM chains to run from overdispersed starting values.
484#' @param dims the number of principle components of the parameters to
485#'        display and assess convergence on (up to 2).
486#' @param p2s an integer that controls printing to screen. 0 (default)
487#'        indicates no printing, 1 indicates normal screen output and 2
488#'        indicates diagnostic output.
489#' @param frontend a logical value used internally for the Amelia GUI.
490#' @param xlim limits of the plot in the horizontal dimension.
491#' @param ylim limits of the plot in vertical dimension.
492#' @param ... further graphical parameters for the plot.
493#'
494#' @details   This function tracks the convergence of \code{m} EM chains which start
495#' from various overdispersed starting values. This plot should give some
496#' indication of the sensitivity of the EM algorithm to the choice of
497#' starting values in the imputation model in \code{output}. If all of
498#' the lines converge to the same point, then we can be confident that
499#' starting values are not affecting the EM algorithm.
500#'
501#' As the parameter space of the imputation model is of a
502#' high-dimension, this plot tracks how the first (and second if
503#'                                                 \code{dims} is 2) principle component(s) change over the iterations of
504#' the EM algorithm. Thus, the plot is a lower dimensional summary of the
505#' convergence and is subject to all the drawbacks inherent in said
506#' summaries.
507#'
508#' For \code{dims==1}, the function plots a horizontal line at the
509#' position where the first EM chain converges. Thus, we are checking
510#' that the other chains converge close to that horizontal line. For
511#' \code{dims==2}, the function draws a convex hull around the point of
512#' convergence for the first EM chain. The hull is scaled to be within
513#' the tolerance of the EM algorithm. Thus, we should check that the
514#' other chains end up in this hull.
515#'
516#' @seealso Other imputation diagnostics are
517#' \code{\link{compare.density}}, \code{\link{disperse}}, and
518#' \code{\link{tscsPlot}}
519disperse <- function(output, m = 5, dims = 1, p2s = 0, frontend = FALSE, ...,
520                     xlim = NULL, ylim = NULL) {
521
522  if (!("amelia" %in% class(output)))
523    stop("The 'output' is not Amelia output.")
524
525  ## The original data is the imputed data with the
526  ## imputations marked to NA. These two lines do that
527  data <- getOriginalData(output)
528
529  if (frontend) {
530    requireNamespace("tcltk")
531    putAmelia("output.log", c(getAmelia("output.log"), "==== Overdispersion Output ====\n"))
532  }
533
534                                        # prep the data and arguments
535  prepped<-amelia.prep(x=data, arglist=output$arguments)
536
537  if (p2s) cat("-- Imputation", "1", "--")
538  if (frontend) {
539    putAmelia("output.log", c(getAmelia("output.log"), paste("-- Imputation","1","--\n")))
540  }
541  flush.console()
542
543                                        # run EM, but return it with the theta at each iteration
544  thetanew <- emarch(prepped$x, p2s = p2s, thetaold = NULL,
545                     tolerance = prepped$tolerance, startvals = 0,
546                     priors = prepped$priors, empri = prepped$empri,
547                     frontend = frontend, allthetas = TRUE, collect = FALSE)  #change 4
548
549                                        # thetanew is a matrix whose columns are vectorized upper triangles of theta
550                                        # matrices for each iteration. thus, there are k(k+1)/2 rows.
551  impdata <- thetanew$thetanew
552
553                                        # we'll put the theta of the last iteration into a new starting theta
554  startsmat <- matrix(0, ncol(prepped$x) + 1, ncol(prepped$x) + 1)
555  startsmat[upper.tri(startsmat, TRUE)] <- c(-1, impdata[, ncol(impdata)])
556  startsmat <- t(startsmat)
557  startsmat[upper.tri(startsmat, TRUE)] <- c(-1, impdata[, ncol(impdata)])
558  iters <- nrow(thetanew$iter.hist) + 1
559
560  for (i in 2:m) {
561    if (p2s) cat("-- Imputation", i, "--\n")
562    if (frontend) {
563      putAmelia("output.log", c(getAmelia("output.log"), paste("-- Imputation",i,"--\n")))
564    }
565
566                                        # get a noisy sample of data from the that starting value (which is the
567                                        # Amelia answer) and use that to estimate a new starting theta (mus/vcov)
568    newstarts <- rmvnorm(round(2.5 * ncol(prepped$x)),
569                         startsmat[1,2:ncol(startsmat)],
570                         startsmat[2:nrow(startsmat),2:nrow(startsmat)])
571    startcov <- var(newstarts)
572    startmus <- colMeans(newstarts)
573    newstartsmat <- matrix(-1, ncol(prepped$x) + 1, ncol(prepped$x) + 1)
574    newstartsmat[2:nrow(startsmat),2:nrow(startsmat)] <- startcov
575    newstartsmat[1,2:nrow(startsmat)] <- startmus
576    newstartsmat[2:nrow(startsmat),1] <- startmus
577
578                                        # grab the iteration history of the thetas
579    thetanew <- emarch(prepped$x, p2s = p2s, thetaold = newstartsmat,
580                       tolerance = prepped$tolerance, startvals = 0,
581                       priors = prepped$priors, empri = prepped$empri,
582                       frontend = frontend, allthetas = TRUE, collect = FALSE)  # change 5
583    impdata <- cbind(impdata, thetanew$thetanew)
584    iters <- c(iters, nrow(thetanew$iter.hist) + 1)
585  }
586  if (dims == 1)
587    comps <- c(1)
588  else
589    comps <- c(1,2)
590
591                                        # reduce the dimenionality from k(k+1)/2 to 1 or 2 via principle components
592  rotations <- prcomp(t(impdata))$rotation[, comps]
593  reduced.imps <- t(rotations) %*% impdata
594
595  cols <- rainbow(m)
596                                        # plot the imputations
597  if (frontend) {
598    dev.new()
599  }
600  if (dims == 1) {
601    addedroom <- (max(reduced.imps) - min(reduced.imps)) * 0.1
602    x <- seq(iters[1])
603    if (is.null(xlim)) xlim <- c(0, max(iters))
604    if (is.null(ylim)) ylim <- range(c(reduced.imps - addedroom, reduced.imps))
605    y <- reduced.imps[1, 1:iters[1]]
606    patt <- seq(1, length(x) - 1)
607    plot(x, y, col = 1, main = "Overdispersed Start Values",
608         xlab = "Number of Iterations", ylab = "Largest Principle Component",
609         xlim = xlim, ylim = ylim, type = "n")
610    segments(x[patt], y[patt], x[patt + 1], y[patt + 1], col = cols[1])
611    for (i in 2:length(iters)) {
612      x <- seq(iters[i])
613      y <- reduced.imps[1, (sum(iters[1:(i-1)])+1):(sum(iters[1:i]))]
614      patt <- seq(1, length(x)-1)
615      segments(x[patt], y[patt], x[patt+1], y[patt+1], col=cols[i])
616                                        #points(x,y,col=i)
617    }
618    abline(h = reduced.imps[iters[1]], lwd = 2)
619    legend("bottomright", legend = c("Convergence of original starting values"),
620           lwd = 2, bty = "n")
621  } else {
622    xrange <- c((min(reduced.imps[1,])), (max(reduced.imps[1,])))
623    yrange <- c((min(reduced.imps[2,])), (max(reduced.imps[2,])))
624    if (is.null(xlim)) xlim <- xrange
625    if (is.null(ylim)) ylim <- yrange
626    plot(reduced.imps[1,1:iters[1]], reduced.imps[2,1:iters[1]], type = "n",
627         main = "Overdispersed Starting Values",
628         xlab = "First Principle Component", ylab = "Second Principle Component",
629         col=cols[1], xlim = xlim, ylim = ylim)
630    for (i in 2:length(iters)) {
631      x <- reduced.imps[1, (sum(iters[1:(i-1)])+1):(sum(iters[1:i]))]
632      y <- reduced.imps[2, (sum(iters[1:(i-1)])+1):(sum(iters[1:i]))]
633      patt <- c()
634      xdiffs <- diff(x)
635      ydiffs <- diff(y)
636      veclength <- sqrt(xdiffs^2+ydiffs^2)
637      for (j in 1:length(xdiffs))
638        if (veclength[j] > xinch(1/500))
639          patt <- c(patt,j)
640      if (!is.null(patt))
641        arrows(x[patt], y[patt], x[patt + 1], y[patt + 1], length = .1,
642               col = cols[i])
643      patt <- seq(1, length(x) - 1)
644      segments(x[patt], y[patt], x[patt+1], y[patt+1], col = cols[i])
645    }
646    x <- reduced.imps[1,1:iters[1]]
647    y <- reduced.imps[2,1:iters[1]]
648    xdiffs <- diff(x)
649    ydiffs <- diff(y)
650    veclength <- sqrt(xdiffs^2+ydiffs^2)
651    inchlength <- sqrt(sum(xyinch(1/500)^2))
652    patt <- c()
653    for (j in 1:length(xdiffs))
654      if (veclength[j] > inchlength)
655        patt <- c(patt,j)
656                                        #if (!is.null(patt))
657                                        #  arrows(x[patt],y[patt],x[patt+1],y[patt+1],length=.15,col=1,lwd=5)
658    patt <- seq(1, length(x)  -1)
659    segments(x[patt], y[patt], x[patt + 1], y[patt + 1], col = cols[1], lwd = 1)
660    dists <- gethull(st = impdata[ ,iters[1]], tol = prepped$tolerance,
661                     rots = rotations)
662    convexhull <- chull(t(dists))
663    convexhull <- c(convexhull, convexhull[1])
664    lines(t(dists)[convexhull,], col = "orange", pch = 19, lwd = 2)
665    abline(h = 0, lty = 2)
666    abline(v = 0, lty = 2)
667  }
668                                        #if (frontend)
669                                        #  tkdestroy(getAmelia("tcl.window"))
670
671  out <- list(impdat = impdata, p.order = prepped$p.order, index = prepped$index,
672              iters = iters, rotations = rotations, dims = dims)
673  invisible(out)
674
675}
676
677
678sigalert <- function(data, disperse.list, output, notorious = 5){
679
680  k <- length(disperse.list$p.order) + 1
681
682                                        # Construct Variable Names for all variables constructed in Imputation Model.
683                                        # This uses the "index" which details all the variables included in the imputation model.
684                                        # The index is in the unstacked variable order.
685                                        # Possibly, if this is useful elsewhere, this might be moved to "prep.r".
686
687  varnm <- NULL
688  lag.count <- 0
689  lead.count <- 0
690  poly.count <- 0
691  unknown.count <- 0
692
693  for (i in 1:(k-1)) {
694    if (identical(disperse.list$index[i], -0.5)) {
695      lag.count <- lag.count + 1
696      varnm <- c(varnm, paste("lag", lag.count))
697    } else if (identical(disperse.list$index[i], 0.5)) {
698      lead.count <- lead.count + 1
699      varnm <- c(varnm, paste("lead", lead.count))
700    } else if (identical(disperse.list$index[i],0)) {
701      poly.count <- poly.count + 1
702      varnm <- c(varnm, paste("polytime", poly.count))
703    } else if(disperse.list$index[i] >= 1) {
704      varnm <- c(varnm, names(data[disperse.list$index[i]]))      # Check what this does with matricies?
705    } else {
706      unknown.count <- unknown.count + 1
707      varnm <- c(varnm, paste("unknown", unknown.count))
708    }
709  }
710
711                                        #  WARNING: Currently assumes rotations is a vector.  If dim=2, rotations is a matrix.
712                                        #  if(!identical(disperse.list$dims,1)){
713                                        #    disperse.list$rotations<-disperse.list$rotations[1,]
714                                        #  }
715
716                                        # This is a flag vector that identifies the largest values in the first principal component.
717
718  largest.rotations <- disperse.list$rotations * 0
719  largest.rotations[order(abs(disperse.list$rotations),decreasing = TRUE)[1:notorious]] <- 1
720
721                                        # This is a matrix of the size of theta, which has a 1 in the positions of the largest
722                                        # contributions to the first principal component.
723                                        # (largest corresponding elements of disperse.list$rotations)
724
725  map <- matrix(0, k, k)
726  map[upper.tri(map, TRUE)] <- c(0, largest.rotations)
727  map <- t(map)
728  map[upper.tri(map, TRUE)] <- c(0, largest.rotations)
729  map[c(1, disperse.list$p.order + 1), c(1, disperse.list$p.order + 1)] <- map                         # Rearrange to unstacked variable positions
730
731  print(abs(map))
732
733  gtz<-function(a)
734    return(sum(a) > 0)
735  row.keep <- apply(map, 1, gtz)
736  col.keep <- apply(map, 2, gtz)
737
738                                        # This is the submatrix of rotations, reshaped as a theta matrix, with the largest elements.
739
740  prcomp.matrix <- matrix(0,k,k)
741  prcomp.matrix[upper.tri(prcomp.matrix, TRUE)] <- c(0, disperse.list$rotations)
742  prcomp.matrix <- t(prcomp.matrix)
743  prcomp.matrix[upper.tri(prcomp.matrix, TRUE)] <- c(0, disperse.list$rotations)
744  prcomp.matrix[c(1,disperse.list$p.order+1),c(1,disperse.list$p.order+1)] <- prcomp.matrix     # Rearrange to unstacked variable positions
745
746                                        # This is the submatrix that we want to represent
747  portal <- prcomp.matrix[row.keep,col.keep]
748  portalsize <- ncol(portal)
749
750  portal.row.names <- varnm[row.keep]               # In symmetric matricies, these are the same.
751  portal.col.names <- varnm[col.keep]               # In symmetric matricies, these are the same.
752
753                                        # This is a matrix that gives the relative rank of every element.
754  col.map <- matrix(0, portalsize, portalsize)
755  col.portal <- rank(abs(portal[upper.tri(portal, TRUE)]))
756  col.map[upper.tri(col.map, TRUE)] <- col.portal
757  col.map <- t(col.map)
758  col.map[upper.tri(col.map, TRUE)] <- col.portal
759
760                                        # This creates a continuous color palette of the correct size.
761  n.unique <- sum(upper.tri(matrix(1, portalsize, portalsize), TRUE))
762  Lab.palette <- colorRampPalette(c("white", "yellow", "red"), space = "Lab")
763  my.palette <- Lab.palette(n.unique)
764
765                                        # Plot the submatrix to be represented.
766
767  plot.new()
768  plot.window(xlim = c(-2, portalsize + 1), ylim = c(1, portalsize + 3))
769  for(i in 1:portalsize){
770    text(x = 1, y = portalsize - i + 1 + 0.5, pos = 2,
771         labels = portal.row.names[i])        # Row variable names
772    for(j in 1:portalsize){
773      rect(xleft = j, ybottom = portalsize - i + 1, xright = j + 1,
774           ytop = portalsize - i + 2, density = NULL, angle = 45,
775           col = my.palette[col.map[i, j]], border = NULL, lty = par("lty"),
776           lwd = par("lwd"))
777      text(x = j + 0.5, y = portalsize - i + 1 + 0.5,
778           labels = as.character(round(portal[i,j]*100)/100) )      # SHOULD FIND BETTER SIG FIGS HACK
779    }
780  }
781  for(j in 1:portalsize){
782    text(x = j + 0.6, y = portalsize + 1.1, pos = 2,
783         labels = portal.col.names[j], srt = 270)  # Column variable names.
784  }
785
786  return(NULL)
787}
788
789
790#' Plot observed and imputed time-series for a single cross-section
791#'
792#' Plots a time series for a given variable in a given cross-section and
793#' provides confidence intervals for the imputed values.
794#'
795#' @param output output from the function \code{amelia}.
796#' @param var the column number or variable name of the variable to plot.
797#' @param cs the name (or level) of the cross-sectional unit to plot.
798#'           Maybe a vector of names which will panel a window of plots
799#' @param draws the number of imputations on which to base the confidence
800#'        intervals.
801#' @param conf the confidence level of the confidence intervals to plot
802#'        for the imputated values.
803#' @param misscol the color of the imputed values and their confidence
804#'        intervals.
805#' @param obscol the color of the points for observed units.
806#' @param xlab x axis label
807#' @param ylab y axis label
808#' @param main overall plot title
809#' @param pch point shapes for the plot.
810#' @param ylim y limits (y1, y2) of the plot.
811#' @param xlim x limits (x1, x2) of the plot.
812#' @param frontend a logical value for use with the \code{AmeliaView} GUI.
813#' @param plotall a logical value that provides a shortcut for ploting all unique values of the level.
814#'        A shortcut for the \code{cs} argument, a TRUE value overwrites any
815#'        \code{cs} argument.
816#' @param nr the number of rows of plots to use when ploting multiple cross-sectional
817#'        units.  The default value will try to minimize this value to create a roughly
818#'        square representation, up to a value of four.  If all plots do not fit on the
819#'        window, a new window will be started.
820#' @param nc the number of columns of plots to use.  See \code{nr}
821#' @param pdfstub a stub string used to write pdf copies of each window created by the
822#'        plot.  The default is not to write pdf output, but any string value will turn
823#'        on pdf output to the local working directory.  If the stub is \code{mystub},
824#'        then plots will be saved as \code{mystub1.pdf}, \code{mystub2.pdf}, etc.
825#' @param ... further graphical parameters for the plot.
826#'
827#' @details
828#'   The \code{cs} argument should be a value from the variable set to the
829#'  \code{cs} argument in the \code{amelia} function for this output. This
830#'  function will not work if the \code{ts} and \code{cs} arguments were
831#'  not set in the \code{amelia} function. If an observation has been
832#'  overimputed, \code{tscsPlot} will plot both an observed and an imputed
833#'  value.
834
835tscsPlot <- function(output, var, cs, draws = 100, conf = .90,
836                     misscol = "red", obscol = "black", xlab, ylab, main,
837                     pch, ylim, xlim, frontend = FALSE, plotall=FALSE, nr, nc, pdfstub, ...) {
838  if (missing(var))
839    stop("I don't know which variable (var) to plot")
840  if (missing(cs) && !plotall)
841    stop("case name (cs) is not specified")
842  if (is.null(output$arguments$ts) || is.null(output$arguments$cs))
843    stop("both 'ts' and 'cs' need to be set in the amelia output")
844  if (!("amelia" %in% class(output)))
845    stop("the 'output' is not Amelia output")
846
847  data <- getOriginalData(output)
848
849                                        # Allow character names as arguments for "var" with data.frames
850
851  if (is.character(var)) {
852    if (!is.data.frame(data)) {
853      stop("'var' must be identified by column number as dataset is not a data frame")
854    } else {
855      varpos <- match(var, colnames(data))
856      if (is.na(varpos)) {
857        stop("the name provided for 'var' argument does not exist in the dataset provided")
858      } else {
859        var <- varpos
860      }
861    }
862  }
863
864  csvarname <- output$arguments$cs
865  tsvarname <- output$arguments$ts
866  if (is.data.frame(data)) {
867    csvar <- data[[csvarname]]
868    tsvar <- data[[tsvarname]]
869  } else {
870    csvar <- data[,output$arguments$cs]
871    tsvar <- data[,output$arguments$ts]
872  }
873
874  if (is.factor(csvar)) {
875      units <- levels(csvar)
876  } else {
877      units <- unique(csvar)
878  }
879
880  if (plotall) {
881        cs <- units
882  } else {
883    if (!(all(cs %in% units)))
884      stop("some cross-section unit requested for the plot is not in the data")
885  }
886
887# Picks a number of rows and columns if not user defined.  Maxs out at 4-by-4, unless user defined
888  if (missing(nr)) {
889    nr <- min(4, ceiling(sqrt(length(cs))))
890  }
891  if (missing(nc)) {
892    nc <- min(4, ceiling(length(cs)/nr))
893  }
894
895  if (length(cs)>1) {
896    oldmfcol <- par()$mfcol
897    par(mfcol = c(nr, nc))
898  }
899
900  prepped <- amelia.prep(x = data, arglist = output$arguments)
901  if (!is.null(prepped$blanks)) {
902    data <- data[-prepped$blanks,]
903    unit.rows <- which(csvar %in% cs)
904    miss <- output$missMatrix[-prepped$blanks,][unit.rows, var] == 1
905  } else {
906    unit.rows <- which(csvar %in% cs)
907    miss <- output$missMatrix[unit.rows, var] == 1
908  }
909
910  time <- tsvar[unit.rows]   # These are the time values for rows appearing in some future plot
911  imps.cs <- csvar[unit.rows]   # These are the cs units for rows appearing in some future plot
912  cross.sec <- prepped$x[!is.na(match(prepped$n.order, unit.rows)),]
913  stacked.var <- match(var, prepped$subset.index[prepped$p.order])
914  subset.var <- match(var, prepped$subset.index)
915  imps <- array(NA, dim = c(nrow(cross.sec), draws))
916
917
918  drawsperimp <- draws/output$m
919  if (sum(miss) > 0) {
920    for (i in 1:draws) {
921      currtheta <- output$theta[,,ceiling(i/drawsperimp)]
922      imps[,i] <- amelia.impute(x = cross.sec, thetareal = currtheta,
923                                bounds = prepped$bounds,
924                                priors = prepped$priors,
925                                max.resample = output$arguments$max.resample)[,stacked.var]
926    }
927
928    imps <- imps*prepped$scaled.sd[subset.var] + prepped$scaled.mu[subset.var]
929    if (var %in% output$arguments$logs) {
930      imps <- exp(imps) + prepped$xmin[which(var == output$arguments$logs)]
931    }
932    if (var %in% output$arguments$sqrt) {
933      imps <- imps^2
934    }
935    if (var %in% output$arguments$lgstc) {
936      imps <- exp(imps)/(1 + exp(imps))
937    }
938
939    outoforder <- match(prepped$n.order, unit.rows)[!is.na(match(prepped$n.order, unit.rows))]
940    imps <- imps[order(outoforder),]
941  }
942
943  if (missing(pch)) pch <- 19
944  if (missing(xlab)) xlab <- "time"
945  if (missing(ylab)) ylab <- names(data)[var]
946
947  if (frontend) {
948    dev.new()
949  }
950
951  if (!missing(main)) {
952    main <- rep(main, length.out = length(cs))
953  }
954  count <- 0
955  for(i in 1:length(cs)){
956
957      current.rows <- which(csvar == cs[i])
958      current.time <- tsvar[current.rows]
959
960      flag <- imps.cs == cs[i]
961      current.miss <- miss[flag]
962
963      if (sum(current.miss) > 0) {
964        current.imps <- imps[flag,]
965        current.means <- rowMeans(current.imps)
966        current.uppers <- apply(current.imps, 1, quantile, probs = (conf + (1 - conf)/2))   # THIS IS LIKELY SLOW
967        current.lowers <- apply(current.imps, 1, quantile, probs = (1-conf)/2)              # THIS IS LIKELY SLOW
968      } else {
969        current.means <- data[[var]][current.rows]
970        current.uppers <- current.lowers <- current.means
971      }
972
973      cols <- ifelse(current.miss, misscol, obscol)
974      current.main <- ifelse(missing(main), as.character(cs[i]), main[i])  # Allow title to be rolling if not defined
975      if (missing(xlim)) {                                # Allow axes to vary by unit, if not defined
976        current.xlim <- range(current.time)
977      } else {
978        current.xlim <- xlim
979      }
980      if (missing(ylim)) {
981        current.ylim <- range(current.uppers,current.lowers,current.means)
982      } else {
983        current.ylim <- ylim
984      }
985
986    plot(x = current.time, y = current.means, col = cols, pch = pch,
987         ylim = current.ylim, xlim = current.xlim, ylab = ylab, xlab = xlab,
988         main = current.main, ...)
989    segments(x0 = current.time, x1 = current.time, y0 = current.lowers,
990             y1 = current.uppers, col = cols, ...)
991
992    oiDetect <- (sum(output$missMatrix[current.rows,var]) +
993                   sum(!is.na(data[current.rows, var]))) > length(current.rows)
994    if (oiDetect) {
995      points(x = current.time, y = data[current.rows, var], pch = pch,
996             col = obscol)
997    }
998
999    # print page if window full
1000    if ((!missing(pdfstub)) & (i %% (nr*nc) ==0)) {
1001      count <- count + 1
1002      dev.copy2pdf(file = paste(pdfstub, count, ".pdf", sep=""))
1003    }
1004  }
1005
1006  if (!missing(pdfstub)) {
1007    if ((i %% (nr*nc)) != 0) {           # print last page if not complete
1008      count <- count + 1
1009      dev.copy2pdf(file = paste(pdfstub, count, ".pdf", sep=""))
1010    }
1011    par(mfcol = oldmfcol)             # return to previous windowing
1012  }                                 # although always now fills by col even if previously by row
1013
1014  invisible(imps)
1015
1016}
1017
1018
1019#' Combine Multiple Results From Multiply Imputed Datasets
1020#'
1021#' Combine sets of estimates (and their standard errors) generated from
1022#' different multiply imputed datasets into one set of results.
1023#'
1024#' @param q A matrix or data frame of (k) quantities of interest (eg.
1025#'        coefficients, parameters, means) from (m) multiply imputed datasets.
1026#'        Default is to assume the matrix is m-by-k (see \code{byrow}), thus each
1027#'        row represents a set of results from one dataset, and each column
1028#'        represents the different values of a particular quantity of interest
1029#'        across the imputed datasets.
1030#' @param se A matrix or data frame of standard errors that correspond to each of the
1031#'        elements of the quantities of interest in \code{q}.  Should be the same
1032#'        dimensions as \code{q}.
1033#' @param byrow logical.  If \code{TRUE}, \code{q} and \code{se} are treated as
1034#'        though each row represents the set of results from one dataset
1035#'        (thus m-by-k).  If \code{FALSE}, each column represents results from one
1036#'        dataset (thus k-by-m).
1037#'
1038#' @details Uses Rubin's rules for combining a set of results from multiply imputed
1039#' datasets to reflect the average result, with standard errors that both average
1040#' uncertainty across models and account for disagreement in the estimated values
1041#' across the models.
1042#'
1043#' @return
1044#'   \item{q.mi}{Average value of each quantity of interest across the m models}
1045#'   \item{se.mi}{Standard errors of each quantity of interest}
1046#'
1047#' @references
1048#' Rubin, D. (1987). \emph{Multiple Imputation for Nonresponse in Surveys}.
1049#' New York: Wiley.
1050#'
1051#' Honaker, J., King, G., Honaker, J. Joseph, A. Scheve K. (2001). Analyzing
1052#' Incomplete Political Science Data: An Alternative Algorithm for Multiple
1053#' Imputation \emph{American Political Science Review}, \bold{95(1)}, 49--69. (p53)
1054#'
1055mi.meld<-function(q, se, byrow = TRUE) {
1056
1057  if (!byrow) {
1058    q <- t(q)
1059    se <- t(se)
1060  }
1061
1062  if (is.data.frame(q)) {
1063    q <- as.matrix(q)
1064  }
1065
1066  if (is.data.frame(se)) {
1067    se <- as.matrix(se)
1068  }
1069
1070  am.m <- nrow(q)
1071
1072  ones <- matrix(1, nrow = 1, ncol = am.m)
1073  imp.q <- (ones %*% q)/am.m        # Slightly faster than "apply(b,2,mean)"
1074  ave.se2 <- (ones %*% (se^2))/am.m # Similarly, faster than "apply(se^2,2,mean)"
1075  diff <- q - matrix(1, nrow = am.m, ncol = 1) %*% imp.q
1076  sq2 <- (ones %*% (diff^2))/(am.m - 1)
1077  imp.se <- sqrt(ave.se2 + sq2 * (1 + 1/am.m))
1078
1079  return(list(q.mi = imp.q, se.mi = imp.se))
1080
1081}
1082