1## diag.r 2## amelia diagnostic functins 3## 4## 05/05/06 mb - added amelia.arg compatibility 5## 07/05/06 jh - compare: changed how variable names are found, changed titles/labels, set x-axis values in matplot, colours for no imputations 6## overimpute: added new m-name in output, 7## 09/05/06 mb - overimpute: added frontend check for overimpute. 8## 15/05/06 jh - overimpute: stacking of original data, and various graphics adjustments 9## 01/06/06 mb - added "gethull" and "disperse" for overdispersion diagnostic 10## 19/07/06 mb - moved handling of arglists to prep. 11## 01/12/06 mb - can't compare non-numerics, only use the relevant columns when 12## building compare 13## 13/12/06 mb - changed for new priors. 14## 26/03/07 jh - overimpute: excluded polynomials of time from missingness count, reordered ploting of ci's (smallest last), allow variable name as var argument 15## 28/03/07 jh - disperse: changed tolerance and empri handling. 16## 03/04/07 jh - disperse: changed 1d plot settings, number of colors, minor edits to "patt" construction. 17## 10/04/07 jh - created sigalert function to view disperse principal components. 18## 22/07/08 mb - good coding update: T->TRUE/F->FALSE 19## 10/02/09 mb - compare: added lwd, col, main, lab, etc for user 20## control, added scale so that users can control scaling, 21## uses amelia class 22## overimpute: uses amelia class, added lwd, col, main, lab, etc for user 23## disperse: now uses amelia class 24## 02/21/12 jh - added mi.meld to combine multiply imputed quantities of interest and se's. 25## 10/30/12 jh - tscsPlot: expanded to allow to cycle through sets of cross sectional units efficiently. 26 27 28#' Compare observed versus imputed densities 29#' 30#' Plots smoothed density plots of observed and imputed values from output 31#' from the \code{amelia} function. 32#' 33#' @param output output from the function \code{amelia}. 34#' @param var column number or variable name of the variable to plot. 35#' @param col a vector of length 2 containing the color to plot the (1) 36#' imputed density and (2) the observed density. 37#' @param scaled a logical indicating if the two densities should be 38#' scaled to reflect the difference in number of units in each. 39#' @param lwd the line width of the density plots. 40#' @param main main title of the plot. The default is to title the plot 41#' using the variable name. 42#' @param xlab the label for the x-axis. The default is the name of the 43#' variable. 44#' @param ylab the label for the y-axis. The default is "Relative Density." 45#' @param legend a logical value indicating if a legend should be 46#' plotted. 47#' @param frontend a logical value used internally for the Amelia GUI. 48#' @param ... further graphical parameters for the plot. 49#' 50#' @details This function first plots a density plot of the observed units for the 51#' variable \code{var} in \code{col[2]}. The the function plots a density plot of the mean 52#' or modal imputations for the missing units in \code{col[1]}. If a 53#' variable is marked "ordinal" or "nominal" with the \code{ords} or 54#' \code{noms} options in \code{amelia}, then the modal imputation will 55#' be used. If \code{legend} is \code{TRUE}, then a legend is plotted as well. 56#' 57#' @references 58#' Abayomi, K. and Gelman, A. and Levy, M. 2005 "Diagnostics for 59#' Multivariate Imputations," \emph{Applied Statistics}. 57,3: 273--291. 60#' 61#' @examples 62#' data(africa) 63#' 64#' @seealso For more information on how densities are computed, 65#' \code{\link{density}}; Other imputation diagnostics are 66#' \code{\link{overimpute}}, \code{\link{disperse}}, and 67#' \code{\link{tscsPlot}}. 68#' 69compare.density <- function(output, var, col = c("indianred", "dodgerblue"), 70 scaled = FALSE, lwd = 1, main, xlab, ylab, 71 legend = TRUE, frontend = FALSE, ...) { 72 73 if (!("amelia" %in% class(output))) 74 stop("The 'output' is not Amelia output.") 75 76 ##data <- getOriginalData(output) 77 data <- remove.imputations(output) 78 79 ## Checks on if the variable makes sense to plot. 80 if (class(var) == "character") 81 if (!(var %in% names(data))) 82 stop("The variable name (var) doesn't correspond to a column in the data.") 83 else 84 var <- match(var,names(data)) 85 if (any(var > ncol(data), var < 0, (var %% 1) != 0)) 86 stop("The 'var' option points to a non-existant column.") 87 if (var %in% output$arguments$idvar) 88 stop("the variable selected was marked as an idvar") 89 90 ## We need to clean the data to make sure that 91 ## we're not going to run into NAs 92 mcount <- sum(!is.na(output$imputations)) 93 imputed <- (1:output$m)[!is.na(output$imputations)] 94 95 ## create an empty vector to sum across 96 varimp <- matrix(NA, nrow(data), mcount) 97 98 for (i in 1:mcount) { 99 if (is.data.frame(data)) { 100 varimp[,i] <- output$imputations[[imputed[i]]][[var]] 101 } else { 102 varimp[,i] <- output$imputations[[imputed[i]]][,var] 103 } 104 } 105 if (var %in% c(output$arguments$noms, output$arguments$ords)) { 106 leg.text <- "Modal Imputations" 107 varimp <- apply(varimp, 1, function(x) as.numeric(names(which.max(table(x))))) 108 } else { 109 leg.text <- "Mean Imputations" 110 varimp <- rowMeans(varimp) 111 } 112 113 if (frontend) { 114 dev.new() 115 } 116 117 if (is.data.frame(data)) { 118 vars <- data[[var]] 119 } else { 120 vars <- data[,var] 121 } 122 123 if (scaled) 124 ratio <- sum(is.na(vars))/sum(!is.na(vars)) 125 else 126 ratio <- 1 127 varnames <- dimnames(data)[[2]] # This will work for both data.frames AND matricies. 128 vname <- varnames[var] # This will work for both data.frames AND matricies. 129 130 131 if (sum(is.na(vars)) > 1) { 132 oiDetect <- (sum(output$missMatrix[,var]) + sum(!is.na(vars))) > length(vars) 133 if (missing(main)) { 134 if (oiDetect) { 135 main <- paste("Observed and Overimputed values of", vname) 136 } else { 137 main <- paste("Observed and Imputed values of", vname) 138 } 139 } 140 if (missing(xlab)) { 141 xlab <- paste(vname," -- Fraction Missing:", round(mean(is.na(vars)), digits = 3)) 142 143 } 144 if (missing(ylab)) { 145 ylab <- "Relative Density" 146 } 147 148 xmiss <- density(varimp[output$missMatrix[, var]], na.rm = TRUE) 149 xobs <- density(vars[!is.na(vars)], na.rm = TRUE) 150 compplot <- matplot(x = cbind(xmiss$x, xobs$x), 151 y = cbind(ratio * xmiss$y, xobs$y), 152 xlab = xlab, ylab = ylab, type = "l", lwd = lwd, 153 lty = 1, main = main, col = col, ...) 154 if (legend) { 155 legend("topright", legend = c(leg.text, "Observed Values"), 156 col = col, lty = c(1,1), bg = 'gray90', lwd = lwd) 157 } 158 } else { 159 if (missing(main)) { 160 main <- paste("Observed values of",vname) 161 } 162 if (missing(xlab)) { 163 xlab <- vname 164 } 165 if (missing(ylab)) { 166 ylab <- "Relative Density" 167 } 168 169 compplot <- plot(density(varimp, na.rm = TRUE), col = col[2], 170 main = main,...) 171 if (sum(is.na(vars)) == 1) { 172 abline(v = varimp[output$missMatrix[, var]], col = col[1]) 173 } 174 175 if (legend) { 176 legend("topright", legend = c("Mean Imputations","Observed Values"), 177 col = col, lty = c(1,1), bg = 'gray90') 178 } 179 } 180 181 invisible() 182} 183 184 185 186#' Overimputation diagnostic plot 187#' 188#' Treats each observed value as missing and imputes from the imputation 189#' model from \code{amelia} output. 190#' 191#' @param output output from the function \code{amelia}. 192#' @param var column number or variable name of the variable to 193#' overimpute. 194#' @param draws the number of draws per imputed dataset to generate 195#' overimputations. Total number of simulations will \code{m * 196#' draws} where \code{m} is the number of imputations. 197#' @param subset an optional vector specifying a subset of observations 198#' to be used in the overimputation. 199#' @param legend a logical value indicating if a legend should be 200#' plotted. 201#' @param xlab the label for the x-axis. The default is "Observed Values." 202#' @param ylab the label for the y-axis. The default is "Imputed Values." 203#' @param main main title of the plot. The default is to smartly title the plot 204#' using the variable name. 205#' @param frontend a logical value used internally for the Amelia GUI. 206#' @param ... further graphical parameters for the plot. 207#' 208#' @details 209#' This function temporarily treats each observed value in 210#' \code{var} as missing and imputes that value based on the imputation 211#' model of \code{output}. The dots are the mean imputation and the 212#' vertical lines are the 90\% percent confidence intervals for 213#' imputations of each observed value. The diagonal line is the \eqn{y=x} 214#' line. If all of the imputations were perfect, then our points would 215#' all fall on the line. A good imputation model would have about 90\% of 216#' the confidence intervals containing the truth; that is, about 90\% of 217#' the vertical lines should cross the diagonal. 218#' 219#' The color of the vertical lines displays the fraction of missing 220#' observations in the pattern of missingness for that 221#' observation. The legend codes this information. Obviously, the 222#' imputations will be much tighter if there are more observed covariates 223#' to use to impute that observation. 224#' 225#' The \code{subset} argument evaluates in the environment of the 226#' data. That is, it can but is not required to refer to variables in the 227#' data frame as if it were attached. 228#' 229#' @return A list that contains (1) the row in the original data 230#' (\code{row}), (2) the observed value of that observation 231#' (\code{orig}), (2) the mean of the overimputations 232#' (\code{mean.overimputed}), (3) the lower bound of the 95\% 233#' confidence interval of the overimputations 234#' (\code{lower.overimputed}), (4) the upper bound of the 95\% 235#' confidence interval of the overimputations 236#' (\code{upper.overimputed}), (5) the fraction of the variables 237#' that were missing for that observation in the original data 238#' (\code{prcntmiss}), and (6) a matrix of the raw overimputations, 239#' with observations in rows and the different draws in columns (\code{overimps}). 240#' 241#' @seealso Other imputation diagnostics are 242#' \code{\link{compare.density}}, \code{\link{disperse}}, and 243#' \code{\link{tscsPlot}}. 244 245overimpute <- function(output, var, draws = 20, subset, legend = TRUE, xlab, 246 ylab, main, frontend = FALSE, ...) { 247 248 if (!("amelia" %in% class(output))) 249 stop("The 'output' is not Amelia output.") 250 251 252 data <- getOriginalData(output) 253 254 ## via the subset.data.frame function 255 if (missing(subset)) { 256 r <- TRUE 257 } else { 258 e <- substitute(subset) 259 r <- eval(e, data, parent.frame()) 260 if (!is.logical(r)) { 261 stop("'subset' must evaluate to logical") 262 } 263 r <- r & !is.na(r) 264 if (sum(r) == 0) { 265 stop("no observations in the subset") 266 } 267 } 268 data <- data[r,] 269 270 origAMr1 <- is.na(data) 271 272 273 ## Allow character names as arguments for "var" with data.frames 274 275 if(is.character(var)){ 276 if(!is.data.frame(data)){ 277 stop("var must be identified by column number as dataset is not a data frame.") 278 } else { 279 nomnames <- colnames(output$imputations[[1]])[output$arguments$noms] 280 if (var %in% nomnames) { 281 stop("Cannot overimpute variables set to be nominal") 282 } 283 varpos <- match(var, colnames(data)) 284 if(is.na(varpos)){ 285 stop("The name provided for var argument does not exist in the dataset provided.") 286 } else { 287 var <- varpos 288 } 289 } 290 } 291 292 293 ## The argument list for an amelia output is now 294 ## at "output$arguments" 295 prepped <- amelia.prep(x = data, arglist = output$arguments, incheck = FALSE) 296 297 stacked.var <- match(var, prepped$subset.index[prepped$p.order]) 298 subset.var <- match(var, prepped$subset.index) 299 if (!is.null(prepped$blanks)) 300 fully.missing <- origAMr1[-prepped$blanks, var][prepped$n.order] 301 else 302 fully.missing <- origAMr1[, var][prepped$n.order] 303 304 if (is.na(stacked.var)) { 305 if (frontend) 306 tcltk::tkmessageBox(message="The variable you selected doesn't exist in the Amelia output becuase it wasn't imputed.",icon="error",type="ok") 307 stop("var doesn't exist in the amelia output. It either didn't get imputed or is out of the range of columns.") 308 } 309 310 means <- c() 311 lowers <- c() 312 uppers <- c() 313 pcnts <- c() 314 color <- c() 315 AMr1 <- is.na(prepped$x) 316 ## if (sum(!AMr1[,stacked.var]) == 0){ 317 ## if (frontend) { 318 ## tkmessageBox(parent = getAmelia("gui"), 319 ## message="The variable needs to have at least one fully observed cell.",icon="error",type="ok") 320 ## } 321 ## stop("function needs at least one fully observed cell in 'var'.") 322 ## } 323 AMr1[,stacked.var] <- TRUE 324 AMp <- ncol(prepped$x) 325 imphold <- matrix(NA, nrow = nrow(prepped$x), ncol = output$m * draws) 326 for (i in 1:nrow(prepped$x)) { 327 if (fully.missing[i]) { 328 next() 329 } 330 331 x <- prepped$x[i,,drop=FALSE] 332 x[1, stacked.var] <- NA 333 o <- !is.na(x) 334 miss <- !o 335 x[is.na(x)] <- 0 336 oo <- 1 * o 337 mm <- 1 * miss 338 #o<-!AMr1[i,] 339 #o[stacked.var]<-FALSE 340 341 pcntmiss <- (sum(miss))/(length(miss)-sum(prepped$index==0)) # Does not include time polynomials (index==0) in the denominator 342 ## These are always fully observed by construction, but auxiliary. 343 ## Leaves constructed lags and 344 ## leads, and nominal variables 345 ## in count, however. 346 347 conf <- c() 348 for (k in 1:output$m) { 349 350 ## The theta matrix is now stored in an array with 351 ## dimensions c(vars+1,vars+1,m), so this grabs 352 ## the kth theta matrix. 353 354 thetareal <- output$theta[,,k] 355 xx <- matrix(x, draws, AMp, byrow = TRUE) 356 rr <- matrix(AMr1[i,], draws, AMp, byrow = TRUE) 357 xc <- .Call("ameliaImpute", xx, rr, oo, mm, c(1, nrow(xx) + 1), thetareal, NULL, 358 NULL, NULL, PACKAGE = "Amelia") 359 conf <- c(conf, xc[, stacked.var]) 360 } 361 362 scaled.conf <- (conf * prepped$scaled.sd[subset.var]) + prepped$scaled.mu[subset.var] 363 varlog <- match(var, prepped$logs) 364 365 if (!is.na(varlog)) { 366 scaled.conf <- untransform(as.matrix(scaled.conf), logs = 1, 367 xmin = prepped$xmin[varlog], sqrts = NULL, 368 lgstc = NULL) 369 } 370 if (!is.na(match(var,prepped$sqrts))) { 371 scaled.conf <- untransform(as.matrix(scaled.conf), logs = NULL, 372 xmin = NULL, sqrts = 1, lgstc = NULL) 373 } 374 if (!is.na(match(var,prepped$lgstc))) { 375 scaled.conf <- untransform(as.matrix(scaled.conf), logs = NULL, 376 xmin = NULL, sqrts = NULL, lgstc = 1) 377 } 378 379 ##colors are based on rainbow roygbiv l->r is higher missingness \ 380 blue <- rgb(0,0,1, alpha = 0.75) 381 green <- rgb(0,.75,0, alpha = 0.75) 382 orange <- rgb(1, 0.65,0, alpha = 0.75) 383 tomato <- rgb(1, 0.39, 0.28, alpha = 0.75) 384 red <- rgb(0.75, 0, 0, alpha = 0.75) 385 spectrum <- c(blue, green, orange, tomato, red) 386 387 if (pcntmiss < .20) 388 color <- c(color, spectrum[1]) 389 else if (pcntmiss >= .20 && pcntmiss < .40) 390 color <- c(color, spectrum[2]) 391 else if (pcntmiss >= .40 && pcntmiss < .60) 392 color <- c(color, spectrum[3]) 393 else if (pcntmiss >= .60 && pcntmiss < .80) 394 color <- c(color, spectrum[4]) 395 else if (pcntmiss >= .80) 396 color <- c(color, spectrum[5]) 397 398 imphold[i,] <- scaled.conf 399 means <- c(means, mean(scaled.conf)) 400 lowers <- c(lowers, sort(scaled.conf)[round(output$m * draws * 0.05)]) 401 uppers <- c(uppers, sort(scaled.conf)[round(output$m * draws * 0.95)]) 402 pcnts <- c(pcnts, pcntmiss) 403 404 } 405 406 #AMr1<-is.na(prepped$x[,stacked.var]) 407 #partial.n.order<-prepped$n.order[!origAMr1] 408 409 if (is.data.frame(data)) { 410 xplot <- data[[var]] 411 } else { 412 xplot <- data[,var] 413 } 414 if (is.null(prepped$blanks)) { 415 xplot <- xplot[prepped$n.order][!fully.missing] 416 } else { 417 xplot <- xplot[-prepped$blanks][prepped$n.order][!fully.missing] 418 } 419 addedroom <- (max(uppers) - min(lowers)) * 0.1 420 if (!hasArg(log)) { 421 this.ylim <- range(c(lowers - addedroom, uppers)) 422 legpos <- "bottomright" 423 } else { 424 this.ylim <- range(c(lowers[lowers > 0], uppers + addedroom)) 425 legpos <- "topright" 426 } 427 428 if (missing(xlab)) { 429 xlab <- "Observed Values" 430 } 431 if (missing(ylab)) { 432 ylab <- "Imputed Values" 433 } 434 if (missing(main)) { 435 main <- paste("Observed versus Imputed Values of",colnames(data)[var]) 436 } 437 438 if (frontend) { 439 dev.new() 440 } 441 ci.order <- order(uppers - lowers, decreasing = TRUE) # Allows smallest CI's to be printed last, and thus not buried in the plot. 442 overplot <- plot(xplot[ci.order], means[ci.order], xlab = xlab, ylab = ylab, 443 ylim = this.ylim, type = 'p', main = main, 444 col = color[ci.order], pch = 19,...) 445 segments(xplot[ci.order], lowers[ci.order], xplot[ci.order], uppers[ci.order], 446 col = color[ci.order]) 447 if (legend) { 448 legend(legpos, legend = c(" 0-.2",".2-.4",".4-.6",".6-.8",".8-1"), 449 col = spectrum, lty = c(1,1), horiz = TRUE, bty = "n") 450 } 451 452 abline(0,1) 453 454 out <- list(row = prepped$n.order[!fully.missing], 455 orig = xplot, 456 mean.overimputed = means, 457 lower.overimputed = lowers, 458 upper.overimputed = uppers, 459 prcntmiss = pcnts, 460 overimps = imphold[!is.na(imphold[,1]),]) 461 invisible(out) 462} 463 464gethull <- function(st,tol,rots) { 465 stvec <- st 466 for (i in 1:length(st)) { 467 addedvec <- rep(0,length(st)) 468 addedvec[i] <- tol * 100 469 newvec <- cbind(st + addedvec, st - addedvec) 470 stvec <- cbind(stvec, newvec) 471 } 472 reduced.hull <- t(rots) %*% stvec 473 return(reduced.hull) 474} 475 476 477#' Overdispersed starting values diagnostic for multiple imputation 478#' 479#' A visual diagnostic of EM convergence from multiple overdispersed 480#' starting values for an output from \code{amelia}. 481#' 482#' @param output output from the function \code{amelia}. 483#' @param m the number of EM chains to run from overdispersed starting values. 484#' @param dims the number of principle components of the parameters to 485#' display and assess convergence on (up to 2). 486#' @param p2s an integer that controls printing to screen. 0 (default) 487#' indicates no printing, 1 indicates normal screen output and 2 488#' indicates diagnostic output. 489#' @param frontend a logical value used internally for the Amelia GUI. 490#' @param xlim limits of the plot in the horizontal dimension. 491#' @param ylim limits of the plot in vertical dimension. 492#' @param ... further graphical parameters for the plot. 493#' 494#' @details This function tracks the convergence of \code{m} EM chains which start 495#' from various overdispersed starting values. This plot should give some 496#' indication of the sensitivity of the EM algorithm to the choice of 497#' starting values in the imputation model in \code{output}. If all of 498#' the lines converge to the same point, then we can be confident that 499#' starting values are not affecting the EM algorithm. 500#' 501#' As the parameter space of the imputation model is of a 502#' high-dimension, this plot tracks how the first (and second if 503#' \code{dims} is 2) principle component(s) change over the iterations of 504#' the EM algorithm. Thus, the plot is a lower dimensional summary of the 505#' convergence and is subject to all the drawbacks inherent in said 506#' summaries. 507#' 508#' For \code{dims==1}, the function plots a horizontal line at the 509#' position where the first EM chain converges. Thus, we are checking 510#' that the other chains converge close to that horizontal line. For 511#' \code{dims==2}, the function draws a convex hull around the point of 512#' convergence for the first EM chain. The hull is scaled to be within 513#' the tolerance of the EM algorithm. Thus, we should check that the 514#' other chains end up in this hull. 515#' 516#' @seealso Other imputation diagnostics are 517#' \code{\link{compare.density}}, \code{\link{disperse}}, and 518#' \code{\link{tscsPlot}} 519disperse <- function(output, m = 5, dims = 1, p2s = 0, frontend = FALSE, ..., 520 xlim = NULL, ylim = NULL) { 521 522 if (!("amelia" %in% class(output))) 523 stop("The 'output' is not Amelia output.") 524 525 ## The original data is the imputed data with the 526 ## imputations marked to NA. These two lines do that 527 data <- getOriginalData(output) 528 529 if (frontend) { 530 requireNamespace("tcltk") 531 putAmelia("output.log", c(getAmelia("output.log"), "==== Overdispersion Output ====\n")) 532 } 533 534 # prep the data and arguments 535 prepped<-amelia.prep(x=data, arglist=output$arguments) 536 537 if (p2s) cat("-- Imputation", "1", "--") 538 if (frontend) { 539 putAmelia("output.log", c(getAmelia("output.log"), paste("-- Imputation","1","--\n"))) 540 } 541 flush.console() 542 543 # run EM, but return it with the theta at each iteration 544 thetanew <- emarch(prepped$x, p2s = p2s, thetaold = NULL, 545 tolerance = prepped$tolerance, startvals = 0, 546 priors = prepped$priors, empri = prepped$empri, 547 frontend = frontend, allthetas = TRUE, collect = FALSE) #change 4 548 549 # thetanew is a matrix whose columns are vectorized upper triangles of theta 550 # matrices for each iteration. thus, there are k(k+1)/2 rows. 551 impdata <- thetanew$thetanew 552 553 # we'll put the theta of the last iteration into a new starting theta 554 startsmat <- matrix(0, ncol(prepped$x) + 1, ncol(prepped$x) + 1) 555 startsmat[upper.tri(startsmat, TRUE)] <- c(-1, impdata[, ncol(impdata)]) 556 startsmat <- t(startsmat) 557 startsmat[upper.tri(startsmat, TRUE)] <- c(-1, impdata[, ncol(impdata)]) 558 iters <- nrow(thetanew$iter.hist) + 1 559 560 for (i in 2:m) { 561 if (p2s) cat("-- Imputation", i, "--\n") 562 if (frontend) { 563 putAmelia("output.log", c(getAmelia("output.log"), paste("-- Imputation",i,"--\n"))) 564 } 565 566 # get a noisy sample of data from the that starting value (which is the 567 # Amelia answer) and use that to estimate a new starting theta (mus/vcov) 568 newstarts <- rmvnorm(round(2.5 * ncol(prepped$x)), 569 startsmat[1,2:ncol(startsmat)], 570 startsmat[2:nrow(startsmat),2:nrow(startsmat)]) 571 startcov <- var(newstarts) 572 startmus <- colMeans(newstarts) 573 newstartsmat <- matrix(-1, ncol(prepped$x) + 1, ncol(prepped$x) + 1) 574 newstartsmat[2:nrow(startsmat),2:nrow(startsmat)] <- startcov 575 newstartsmat[1,2:nrow(startsmat)] <- startmus 576 newstartsmat[2:nrow(startsmat),1] <- startmus 577 578 # grab the iteration history of the thetas 579 thetanew <- emarch(prepped$x, p2s = p2s, thetaold = newstartsmat, 580 tolerance = prepped$tolerance, startvals = 0, 581 priors = prepped$priors, empri = prepped$empri, 582 frontend = frontend, allthetas = TRUE, collect = FALSE) # change 5 583 impdata <- cbind(impdata, thetanew$thetanew) 584 iters <- c(iters, nrow(thetanew$iter.hist) + 1) 585 } 586 if (dims == 1) 587 comps <- c(1) 588 else 589 comps <- c(1,2) 590 591 # reduce the dimenionality from k(k+1)/2 to 1 or 2 via principle components 592 rotations <- prcomp(t(impdata))$rotation[, comps] 593 reduced.imps <- t(rotations) %*% impdata 594 595 cols <- rainbow(m) 596 # plot the imputations 597 if (frontend) { 598 dev.new() 599 } 600 if (dims == 1) { 601 addedroom <- (max(reduced.imps) - min(reduced.imps)) * 0.1 602 x <- seq(iters[1]) 603 if (is.null(xlim)) xlim <- c(0, max(iters)) 604 if (is.null(ylim)) ylim <- range(c(reduced.imps - addedroom, reduced.imps)) 605 y <- reduced.imps[1, 1:iters[1]] 606 patt <- seq(1, length(x) - 1) 607 plot(x, y, col = 1, main = "Overdispersed Start Values", 608 xlab = "Number of Iterations", ylab = "Largest Principle Component", 609 xlim = xlim, ylim = ylim, type = "n") 610 segments(x[patt], y[patt], x[patt + 1], y[patt + 1], col = cols[1]) 611 for (i in 2:length(iters)) { 612 x <- seq(iters[i]) 613 y <- reduced.imps[1, (sum(iters[1:(i-1)])+1):(sum(iters[1:i]))] 614 patt <- seq(1, length(x)-1) 615 segments(x[patt], y[patt], x[patt+1], y[patt+1], col=cols[i]) 616 #points(x,y,col=i) 617 } 618 abline(h = reduced.imps[iters[1]], lwd = 2) 619 legend("bottomright", legend = c("Convergence of original starting values"), 620 lwd = 2, bty = "n") 621 } else { 622 xrange <- c((min(reduced.imps[1,])), (max(reduced.imps[1,]))) 623 yrange <- c((min(reduced.imps[2,])), (max(reduced.imps[2,]))) 624 if (is.null(xlim)) xlim <- xrange 625 if (is.null(ylim)) ylim <- yrange 626 plot(reduced.imps[1,1:iters[1]], reduced.imps[2,1:iters[1]], type = "n", 627 main = "Overdispersed Starting Values", 628 xlab = "First Principle Component", ylab = "Second Principle Component", 629 col=cols[1], xlim = xlim, ylim = ylim) 630 for (i in 2:length(iters)) { 631 x <- reduced.imps[1, (sum(iters[1:(i-1)])+1):(sum(iters[1:i]))] 632 y <- reduced.imps[2, (sum(iters[1:(i-1)])+1):(sum(iters[1:i]))] 633 patt <- c() 634 xdiffs <- diff(x) 635 ydiffs <- diff(y) 636 veclength <- sqrt(xdiffs^2+ydiffs^2) 637 for (j in 1:length(xdiffs)) 638 if (veclength[j] > xinch(1/500)) 639 patt <- c(patt,j) 640 if (!is.null(patt)) 641 arrows(x[patt], y[patt], x[patt + 1], y[patt + 1], length = .1, 642 col = cols[i]) 643 patt <- seq(1, length(x) - 1) 644 segments(x[patt], y[patt], x[patt+1], y[patt+1], col = cols[i]) 645 } 646 x <- reduced.imps[1,1:iters[1]] 647 y <- reduced.imps[2,1:iters[1]] 648 xdiffs <- diff(x) 649 ydiffs <- diff(y) 650 veclength <- sqrt(xdiffs^2+ydiffs^2) 651 inchlength <- sqrt(sum(xyinch(1/500)^2)) 652 patt <- c() 653 for (j in 1:length(xdiffs)) 654 if (veclength[j] > inchlength) 655 patt <- c(patt,j) 656 #if (!is.null(patt)) 657 # arrows(x[patt],y[patt],x[patt+1],y[patt+1],length=.15,col=1,lwd=5) 658 patt <- seq(1, length(x) -1) 659 segments(x[patt], y[patt], x[patt + 1], y[patt + 1], col = cols[1], lwd = 1) 660 dists <- gethull(st = impdata[ ,iters[1]], tol = prepped$tolerance, 661 rots = rotations) 662 convexhull <- chull(t(dists)) 663 convexhull <- c(convexhull, convexhull[1]) 664 lines(t(dists)[convexhull,], col = "orange", pch = 19, lwd = 2) 665 abline(h = 0, lty = 2) 666 abline(v = 0, lty = 2) 667 } 668 #if (frontend) 669 # tkdestroy(getAmelia("tcl.window")) 670 671 out <- list(impdat = impdata, p.order = prepped$p.order, index = prepped$index, 672 iters = iters, rotations = rotations, dims = dims) 673 invisible(out) 674 675} 676 677 678sigalert <- function(data, disperse.list, output, notorious = 5){ 679 680 k <- length(disperse.list$p.order) + 1 681 682 # Construct Variable Names for all variables constructed in Imputation Model. 683 # This uses the "index" which details all the variables included in the imputation model. 684 # The index is in the unstacked variable order. 685 # Possibly, if this is useful elsewhere, this might be moved to "prep.r". 686 687 varnm <- NULL 688 lag.count <- 0 689 lead.count <- 0 690 poly.count <- 0 691 unknown.count <- 0 692 693 for (i in 1:(k-1)) { 694 if (identical(disperse.list$index[i], -0.5)) { 695 lag.count <- lag.count + 1 696 varnm <- c(varnm, paste("lag", lag.count)) 697 } else if (identical(disperse.list$index[i], 0.5)) { 698 lead.count <- lead.count + 1 699 varnm <- c(varnm, paste("lead", lead.count)) 700 } else if (identical(disperse.list$index[i],0)) { 701 poly.count <- poly.count + 1 702 varnm <- c(varnm, paste("polytime", poly.count)) 703 } else if(disperse.list$index[i] >= 1) { 704 varnm <- c(varnm, names(data[disperse.list$index[i]])) # Check what this does with matricies? 705 } else { 706 unknown.count <- unknown.count + 1 707 varnm <- c(varnm, paste("unknown", unknown.count)) 708 } 709 } 710 711 # WARNING: Currently assumes rotations is a vector. If dim=2, rotations is a matrix. 712 # if(!identical(disperse.list$dims,1)){ 713 # disperse.list$rotations<-disperse.list$rotations[1,] 714 # } 715 716 # This is a flag vector that identifies the largest values in the first principal component. 717 718 largest.rotations <- disperse.list$rotations * 0 719 largest.rotations[order(abs(disperse.list$rotations),decreasing = TRUE)[1:notorious]] <- 1 720 721 # This is a matrix of the size of theta, which has a 1 in the positions of the largest 722 # contributions to the first principal component. 723 # (largest corresponding elements of disperse.list$rotations) 724 725 map <- matrix(0, k, k) 726 map[upper.tri(map, TRUE)] <- c(0, largest.rotations) 727 map <- t(map) 728 map[upper.tri(map, TRUE)] <- c(0, largest.rotations) 729 map[c(1, disperse.list$p.order + 1), c(1, disperse.list$p.order + 1)] <- map # Rearrange to unstacked variable positions 730 731 print(abs(map)) 732 733 gtz<-function(a) 734 return(sum(a) > 0) 735 row.keep <- apply(map, 1, gtz) 736 col.keep <- apply(map, 2, gtz) 737 738 # This is the submatrix of rotations, reshaped as a theta matrix, with the largest elements. 739 740 prcomp.matrix <- matrix(0,k,k) 741 prcomp.matrix[upper.tri(prcomp.matrix, TRUE)] <- c(0, disperse.list$rotations) 742 prcomp.matrix <- t(prcomp.matrix) 743 prcomp.matrix[upper.tri(prcomp.matrix, TRUE)] <- c(0, disperse.list$rotations) 744 prcomp.matrix[c(1,disperse.list$p.order+1),c(1,disperse.list$p.order+1)] <- prcomp.matrix # Rearrange to unstacked variable positions 745 746 # This is the submatrix that we want to represent 747 portal <- prcomp.matrix[row.keep,col.keep] 748 portalsize <- ncol(portal) 749 750 portal.row.names <- varnm[row.keep] # In symmetric matricies, these are the same. 751 portal.col.names <- varnm[col.keep] # In symmetric matricies, these are the same. 752 753 # This is a matrix that gives the relative rank of every element. 754 col.map <- matrix(0, portalsize, portalsize) 755 col.portal <- rank(abs(portal[upper.tri(portal, TRUE)])) 756 col.map[upper.tri(col.map, TRUE)] <- col.portal 757 col.map <- t(col.map) 758 col.map[upper.tri(col.map, TRUE)] <- col.portal 759 760 # This creates a continuous color palette of the correct size. 761 n.unique <- sum(upper.tri(matrix(1, portalsize, portalsize), TRUE)) 762 Lab.palette <- colorRampPalette(c("white", "yellow", "red"), space = "Lab") 763 my.palette <- Lab.palette(n.unique) 764 765 # Plot the submatrix to be represented. 766 767 plot.new() 768 plot.window(xlim = c(-2, portalsize + 1), ylim = c(1, portalsize + 3)) 769 for(i in 1:portalsize){ 770 text(x = 1, y = portalsize - i + 1 + 0.5, pos = 2, 771 labels = portal.row.names[i]) # Row variable names 772 for(j in 1:portalsize){ 773 rect(xleft = j, ybottom = portalsize - i + 1, xright = j + 1, 774 ytop = portalsize - i + 2, density = NULL, angle = 45, 775 col = my.palette[col.map[i, j]], border = NULL, lty = par("lty"), 776 lwd = par("lwd")) 777 text(x = j + 0.5, y = portalsize - i + 1 + 0.5, 778 labels = as.character(round(portal[i,j]*100)/100) ) # SHOULD FIND BETTER SIG FIGS HACK 779 } 780 } 781 for(j in 1:portalsize){ 782 text(x = j + 0.6, y = portalsize + 1.1, pos = 2, 783 labels = portal.col.names[j], srt = 270) # Column variable names. 784 } 785 786 return(NULL) 787} 788 789 790#' Plot observed and imputed time-series for a single cross-section 791#' 792#' Plots a time series for a given variable in a given cross-section and 793#' provides confidence intervals for the imputed values. 794#' 795#' @param output output from the function \code{amelia}. 796#' @param var the column number or variable name of the variable to plot. 797#' @param cs the name (or level) of the cross-sectional unit to plot. 798#' Maybe a vector of names which will panel a window of plots 799#' @param draws the number of imputations on which to base the confidence 800#' intervals. 801#' @param conf the confidence level of the confidence intervals to plot 802#' for the imputated values. 803#' @param misscol the color of the imputed values and their confidence 804#' intervals. 805#' @param obscol the color of the points for observed units. 806#' @param xlab x axis label 807#' @param ylab y axis label 808#' @param main overall plot title 809#' @param pch point shapes for the plot. 810#' @param ylim y limits (y1, y2) of the plot. 811#' @param xlim x limits (x1, x2) of the plot. 812#' @param frontend a logical value for use with the \code{AmeliaView} GUI. 813#' @param plotall a logical value that provides a shortcut for ploting all unique values of the level. 814#' A shortcut for the \code{cs} argument, a TRUE value overwrites any 815#' \code{cs} argument. 816#' @param nr the number of rows of plots to use when ploting multiple cross-sectional 817#' units. The default value will try to minimize this value to create a roughly 818#' square representation, up to a value of four. If all plots do not fit on the 819#' window, a new window will be started. 820#' @param nc the number of columns of plots to use. See \code{nr} 821#' @param pdfstub a stub string used to write pdf copies of each window created by the 822#' plot. The default is not to write pdf output, but any string value will turn 823#' on pdf output to the local working directory. If the stub is \code{mystub}, 824#' then plots will be saved as \code{mystub1.pdf}, \code{mystub2.pdf}, etc. 825#' @param ... further graphical parameters for the plot. 826#' 827#' @details 828#' The \code{cs} argument should be a value from the variable set to the 829#' \code{cs} argument in the \code{amelia} function for this output. This 830#' function will not work if the \code{ts} and \code{cs} arguments were 831#' not set in the \code{amelia} function. If an observation has been 832#' overimputed, \code{tscsPlot} will plot both an observed and an imputed 833#' value. 834 835tscsPlot <- function(output, var, cs, draws = 100, conf = .90, 836 misscol = "red", obscol = "black", xlab, ylab, main, 837 pch, ylim, xlim, frontend = FALSE, plotall=FALSE, nr, nc, pdfstub, ...) { 838 if (missing(var)) 839 stop("I don't know which variable (var) to plot") 840 if (missing(cs) && !plotall) 841 stop("case name (cs) is not specified") 842 if (is.null(output$arguments$ts) || is.null(output$arguments$cs)) 843 stop("both 'ts' and 'cs' need to be set in the amelia output") 844 if (!("amelia" %in% class(output))) 845 stop("the 'output' is not Amelia output") 846 847 data <- getOriginalData(output) 848 849 # Allow character names as arguments for "var" with data.frames 850 851 if (is.character(var)) { 852 if (!is.data.frame(data)) { 853 stop("'var' must be identified by column number as dataset is not a data frame") 854 } else { 855 varpos <- match(var, colnames(data)) 856 if (is.na(varpos)) { 857 stop("the name provided for 'var' argument does not exist in the dataset provided") 858 } else { 859 var <- varpos 860 } 861 } 862 } 863 864 csvarname <- output$arguments$cs 865 tsvarname <- output$arguments$ts 866 if (is.data.frame(data)) { 867 csvar <- data[[csvarname]] 868 tsvar <- data[[tsvarname]] 869 } else { 870 csvar <- data[,output$arguments$cs] 871 tsvar <- data[,output$arguments$ts] 872 } 873 874 if (is.factor(csvar)) { 875 units <- levels(csvar) 876 } else { 877 units <- unique(csvar) 878 } 879 880 if (plotall) { 881 cs <- units 882 } else { 883 if (!(all(cs %in% units))) 884 stop("some cross-section unit requested for the plot is not in the data") 885 } 886 887# Picks a number of rows and columns if not user defined. Maxs out at 4-by-4, unless user defined 888 if (missing(nr)) { 889 nr <- min(4, ceiling(sqrt(length(cs)))) 890 } 891 if (missing(nc)) { 892 nc <- min(4, ceiling(length(cs)/nr)) 893 } 894 895 if (length(cs)>1) { 896 oldmfcol <- par()$mfcol 897 par(mfcol = c(nr, nc)) 898 } 899 900 prepped <- amelia.prep(x = data, arglist = output$arguments) 901 if (!is.null(prepped$blanks)) { 902 data <- data[-prepped$blanks,] 903 unit.rows <- which(csvar %in% cs) 904 miss <- output$missMatrix[-prepped$blanks,][unit.rows, var] == 1 905 } else { 906 unit.rows <- which(csvar %in% cs) 907 miss <- output$missMatrix[unit.rows, var] == 1 908 } 909 910 time <- tsvar[unit.rows] # These are the time values for rows appearing in some future plot 911 imps.cs <- csvar[unit.rows] # These are the cs units for rows appearing in some future plot 912 cross.sec <- prepped$x[!is.na(match(prepped$n.order, unit.rows)),] 913 stacked.var <- match(var, prepped$subset.index[prepped$p.order]) 914 subset.var <- match(var, prepped$subset.index) 915 imps <- array(NA, dim = c(nrow(cross.sec), draws)) 916 917 918 drawsperimp <- draws/output$m 919 if (sum(miss) > 0) { 920 for (i in 1:draws) { 921 currtheta <- output$theta[,,ceiling(i/drawsperimp)] 922 imps[,i] <- amelia.impute(x = cross.sec, thetareal = currtheta, 923 bounds = prepped$bounds, 924 priors = prepped$priors, 925 max.resample = output$arguments$max.resample)[,stacked.var] 926 } 927 928 imps <- imps*prepped$scaled.sd[subset.var] + prepped$scaled.mu[subset.var] 929 if (var %in% output$arguments$logs) { 930 imps <- exp(imps) + prepped$xmin[which(var == output$arguments$logs)] 931 } 932 if (var %in% output$arguments$sqrt) { 933 imps <- imps^2 934 } 935 if (var %in% output$arguments$lgstc) { 936 imps <- exp(imps)/(1 + exp(imps)) 937 } 938 939 outoforder <- match(prepped$n.order, unit.rows)[!is.na(match(prepped$n.order, unit.rows))] 940 imps <- imps[order(outoforder),] 941 } 942 943 if (missing(pch)) pch <- 19 944 if (missing(xlab)) xlab <- "time" 945 if (missing(ylab)) ylab <- names(data)[var] 946 947 if (frontend) { 948 dev.new() 949 } 950 951 if (!missing(main)) { 952 main <- rep(main, length.out = length(cs)) 953 } 954 count <- 0 955 for(i in 1:length(cs)){ 956 957 current.rows <- which(csvar == cs[i]) 958 current.time <- tsvar[current.rows] 959 960 flag <- imps.cs == cs[i] 961 current.miss <- miss[flag] 962 963 if (sum(current.miss) > 0) { 964 current.imps <- imps[flag,] 965 current.means <- rowMeans(current.imps) 966 current.uppers <- apply(current.imps, 1, quantile, probs = (conf + (1 - conf)/2)) # THIS IS LIKELY SLOW 967 current.lowers <- apply(current.imps, 1, quantile, probs = (1-conf)/2) # THIS IS LIKELY SLOW 968 } else { 969 current.means <- data[[var]][current.rows] 970 current.uppers <- current.lowers <- current.means 971 } 972 973 cols <- ifelse(current.miss, misscol, obscol) 974 current.main <- ifelse(missing(main), as.character(cs[i]), main[i]) # Allow title to be rolling if not defined 975 if (missing(xlim)) { # Allow axes to vary by unit, if not defined 976 current.xlim <- range(current.time) 977 } else { 978 current.xlim <- xlim 979 } 980 if (missing(ylim)) { 981 current.ylim <- range(current.uppers,current.lowers,current.means) 982 } else { 983 current.ylim <- ylim 984 } 985 986 plot(x = current.time, y = current.means, col = cols, pch = pch, 987 ylim = current.ylim, xlim = current.xlim, ylab = ylab, xlab = xlab, 988 main = current.main, ...) 989 segments(x0 = current.time, x1 = current.time, y0 = current.lowers, 990 y1 = current.uppers, col = cols, ...) 991 992 oiDetect <- (sum(output$missMatrix[current.rows,var]) + 993 sum(!is.na(data[current.rows, var]))) > length(current.rows) 994 if (oiDetect) { 995 points(x = current.time, y = data[current.rows, var], pch = pch, 996 col = obscol) 997 } 998 999 # print page if window full 1000 if ((!missing(pdfstub)) & (i %% (nr*nc) ==0)) { 1001 count <- count + 1 1002 dev.copy2pdf(file = paste(pdfstub, count, ".pdf", sep="")) 1003 } 1004 } 1005 1006 if (!missing(pdfstub)) { 1007 if ((i %% (nr*nc)) != 0) { # print last page if not complete 1008 count <- count + 1 1009 dev.copy2pdf(file = paste(pdfstub, count, ".pdf", sep="")) 1010 } 1011 par(mfcol = oldmfcol) # return to previous windowing 1012 } # although always now fills by col even if previously by row 1013 1014 invisible(imps) 1015 1016} 1017 1018 1019#' Combine Multiple Results From Multiply Imputed Datasets 1020#' 1021#' Combine sets of estimates (and their standard errors) generated from 1022#' different multiply imputed datasets into one set of results. 1023#' 1024#' @param q A matrix or data frame of (k) quantities of interest (eg. 1025#' coefficients, parameters, means) from (m) multiply imputed datasets. 1026#' Default is to assume the matrix is m-by-k (see \code{byrow}), thus each 1027#' row represents a set of results from one dataset, and each column 1028#' represents the different values of a particular quantity of interest 1029#' across the imputed datasets. 1030#' @param se A matrix or data frame of standard errors that correspond to each of the 1031#' elements of the quantities of interest in \code{q}. Should be the same 1032#' dimensions as \code{q}. 1033#' @param byrow logical. If \code{TRUE}, \code{q} and \code{se} are treated as 1034#' though each row represents the set of results from one dataset 1035#' (thus m-by-k). If \code{FALSE}, each column represents results from one 1036#' dataset (thus k-by-m). 1037#' 1038#' @details Uses Rubin's rules for combining a set of results from multiply imputed 1039#' datasets to reflect the average result, with standard errors that both average 1040#' uncertainty across models and account for disagreement in the estimated values 1041#' across the models. 1042#' 1043#' @return 1044#' \item{q.mi}{Average value of each quantity of interest across the m models} 1045#' \item{se.mi}{Standard errors of each quantity of interest} 1046#' 1047#' @references 1048#' Rubin, D. (1987). \emph{Multiple Imputation for Nonresponse in Surveys}. 1049#' New York: Wiley. 1050#' 1051#' Honaker, J., King, G., Honaker, J. Joseph, A. Scheve K. (2001). Analyzing 1052#' Incomplete Political Science Data: An Alternative Algorithm for Multiple 1053#' Imputation \emph{American Political Science Review}, \bold{95(1)}, 49--69. (p53) 1054#' 1055mi.meld<-function(q, se, byrow = TRUE) { 1056 1057 if (!byrow) { 1058 q <- t(q) 1059 se <- t(se) 1060 } 1061 1062 if (is.data.frame(q)) { 1063 q <- as.matrix(q) 1064 } 1065 1066 if (is.data.frame(se)) { 1067 se <- as.matrix(se) 1068 } 1069 1070 am.m <- nrow(q) 1071 1072 ones <- matrix(1, nrow = 1, ncol = am.m) 1073 imp.q <- (ones %*% q)/am.m # Slightly faster than "apply(b,2,mean)" 1074 ave.se2 <- (ones %*% (se^2))/am.m # Similarly, faster than "apply(se^2,2,mean)" 1075 diff <- q - matrix(1, nrow = am.m, ncol = 1) %*% imp.q 1076 sq2 <- (ones %*% (diff^2))/(am.m - 1) 1077 imp.se <- sqrt(ave.se2 + sq2 * (1 + 1/am.m)) 1078 1079 return(list(q.mi = imp.q, se.mi = imp.se)) 1080 1081} 1082