1
2#' Missingness Map
3#'
4#' Plots a missingness map showing where missingness occurs in
5#' the dataset passed to \code{amelia}.
6#'
7#' @param obj an object of class "amelia"; typically output from the
8#'        function \code{amelia}, a matrix or a dataframe.
9#' @param vars a vector of column numbers or column names of the data
10#'   to include in the plot. The default is to plot all variables.
11#' @param legend should a legend be drawn? (True or False)
12#' @param col a vector of length two where the first element specifies
13#'        the color for missing cells and the second element specifies
14#"        the color for observed cells.
15#' @param main main title of the plot. Defaults to "Missingness Map".
16#' @param y.cex expansion for the variables names on the x-axis.
17#' @param x.cex expansion for the unit names on the y-axis.
18#' @param y.labels a vector of row labels to print on the y-axis
19#' @param y.at a vector of the same length as \code{y.labels} with row
20#'        nmumbers associated with the labels.
21#' @param csvar column number or name of the variable corresponding to
22#'        the unit indicator. Only used when the \code{obj} is not of class
23#'        \code{amelia}.
24#' @param tsvar column number or name of the variable corresponding to
25#'        the time indicator. Only used when the \code{obj} is not of class
26#'        \code{amelia}.
27#' @param rank.order a logical value. If \code{TRUE}, the default, then
28#'        the order of the variables along the the x-axis is sorted by the
29#'        percent missing (from highest to lowest). If \code{FALSE}, it is
30#'        simply the order of the variables in the data.
31#' @param margins a vector of length two that specifies the bottom and
32#'        left margins of the plot. Useful for when variable names or
33#'        row names are long.
34#' @param gap.xaxis value to pass to the \code{gap.axis} argument of
35#'   the \code{axis} function that plots the x-axis. See
36#'   \code{\link{axis}} for more details. Ignored on R versions less
37#'   than 4.0.0.
38#' @param x.las value of the \code{las} argument to pass to the
39#'   \code{\link{axis}} function creating the x-axis.
40#' @param ... further graphical arguments.
41#'
42#' @details \code{missmap} draws a map of the missingness in a dataset using the
43#' \code{image} function. The columns are reordered to put the most
44#' missing variable farthest to the left. The rows are reordered to a
45#' unit-period order if the \code{ts} and \code{cs} arguments were passed
46#' to \code{amelia}. If not, the rows are not reordered.
47#'
48#' The \code{y.labels} and \code{y.at} commands can be used to associate
49#' labels with rows in the data to identify them in the plot. The y-axis
50#' is internally inverted so that the first row of the data is associated
51#' with the top-most row of the missingness map. The values of
52#' \code{y.at} should refer to the rows of the data, not to any point on
53#' the plotting region.
54#'
55#' @seealso \code{\link{compare.density}}, \code{\link{overimpute}},
56#' \code{\link{tscsPlot}}, \code{\link{image}}, \code{\link{heatmap}}
57missmap <- function(obj, vars, legend = TRUE, col, main,
58                    y.cex = 0.8, x.cex = 0.8, y.labels, y.at, csvar = NULL,
59                    tsvar = NULL, rank.order = TRUE, margins = c(5, 5),
60                    gap.xaxis = 1, x.las = 2, ...) {
61
62
63  if (inherits(obj, "amelia")) {
64    vnames <- colnames(obj$imputations[[1]])
65    n <- nrow(obj$missMatrix)
66    p <- ncol(obj$missMatrix)
67    percent.missing <- colMeans(obj$missMatrix)
68    pmiss.all <- mean(obj$missMatrix)
69    r1 <- obj$missMatrix
70  } else {
71    vnames <- colnames(obj)
72    n <- nrow(obj)
73    p <- ncol(obj)
74    percent.missing <- colMeans(is.na(obj))
75    pmiss.all <- mean(is.na(obj))
76    r1 <- 1 * is.na(obj)
77  }
78
79  if (missing(col)) col <- c("#eff3ff", "#2171b5")
80  if (!missing(vars))  {
81    if (is.character(vars)) {
82      vars <- match(vars, vnames)
83      if (any(is.na(vars))) {
84        stop("vars not found in the data")
85      }
86    }
87    if (any(!(vars %in% 1:p))) {
88      stop("vars outside range of the data")
89    }
90    p <- length(vars)
91    r1 <- r1[, vars]
92    percent.missing <- percent.missing[vars]
93    pmiss.all <- mean(r1)
94  }
95
96  if (!missing(y.labels) &&
97      (missing(y.at) && (length(y.labels) != n))) {
98    stop("y.at must accompany y.labels if there is less than onefor each row")
99  }
100
101  if (is.null(csvar)) csvar <- obj$arguments$cs
102  if (is.null(tsvar)) tsvar <- obj$arguments$ts
103
104  if (missing(y.labels)) {
105    if (!is.null(csvar)) {
106      if (class(obj) == "amelia") {
107        cs <- obj$imputations[[1]][, csvar]
108      } else {
109        cs <- obj[, csvar]
110      }
111      y.labels <- cs
112      if (is.factor(y.labels)) y.labels <- levels(y.labels)[unclass(y.labels)]
113
114      cs.names <- y.labels
115
116
117      if (!is.numeric(cs)) cs <- as.numeric(as.factor(cs))
118      if (!is.null(tsvar)) {
119        if (class(obj) == "amelia") {
120          ts <- as.numeric(obj$imputations[[1]][, tsvar])
121        } else {
122          ts <- as.numeric(obj[, tsvar])
123        }
124        unit.period <- order(cs, ts)
125      } else {
126        unit.period <- 1:n
127      }
128
129      y.labels <- y.labels[unit.period]
130      r1 <- r1[unit.period, ]
131
132      brks <- c(TRUE,rep(FALSE, times = (n-1)))
133      for (i in 2:n) {
134        brks[i] <- (cs[unit.period][i] != cs[unit.period][i - 1])
135      }
136      y.at <- which(brks)
137
138      y.labels <- y.labels[brks]
139    } else {
140      y.labels <- row.names(obj$imputations[[1]])
141      y.at <- seq(1, n, by = 15)
142      y.labels <- y.labels[y.at]
143    }
144  } else {
145    if (missing(y.at))
146      y.at <- n:1
147  }
148  missrank <- rev(order(percent.missing))
149  if (rank.order) {
150    chess <- t(!r1[n:1, missrank])
151    vnames <- vnames[missrank]
152  } else {
153    chess <- t(!r1[n:1, ])
154  }
155  y.at <- (n:1)[y.at]
156
157  if (missing(main))
158    main <- "Missingness Map"
159
160  par(mar = c(margins, 2, 1) + 0.1)
161  ## here we fork for data/tscs type plots. users cant set this yet.
162  type <- "data"
163  if (legend) {
164    graphics::layout(matrix(c(1, 2), nrow = 1), widths = c(0.75, 0.25))
165    par(mar = c(margins, 2, 0) + 0.1, mgp = c(3, 0.25, 0))
166  }
167  if (type == "data") {
168
169    col.fix <- col
170    if (sum(!chess) == 0) {
171      col.fix <- col[2]
172    }
173    image(x = 1:(p), y = 1:n, z = chess, axes = FALSE,
174          col = col.fix, xlab = "", ylab = "", main = main)
175    if (getRversion() >= "4.0.0") {
176      axis(1, lwd = 0, labels = vnames, las = x.las, at = 1:p, cex.axis = x.cex,
177           gap.axis = gap.xaxis)
178    } else {
179      axis(1, lwd = 0, labels = vnames, las = x.las, at = 1:p, cex.axis = x.cex)
180    }
181    axis(2, lwd = 0, labels = y.labels, las = 1, at = y.at, cex.axis = y.cex)
182
183
184    if (legend) {
185      pm.lab <- paste("Missing (", round(100 * pmiss.all), "%)", sep = "")
186      po.lab <- paste("Observed (", 100 - round(100 * pmiss.all), "%)",
187                      sep = "")
188      par(mar = c(0, 0, 0, 0.3))
189      plot(0, 0, type = "n", axes = FALSE, ann = FALSE)
190      legend("left", col = col, bty = "n", xjust = 0, border = "grey",
191             legend = c(pm.lab, po.lab), fill = col, horiz = FALSE)
192    }
193  } else {
194    tscsdata <- data.frame(cs.names, ts, rowMeans(r1))
195    tscsdata <- reshape(tscsdata, idvar = "cs.names", timevar = "ts",
196                        direction = "wide")
197    rownames(tscsdata) <- tscsdata[, 1]
198    colnames(tscsdata) <- unique(ts)
199    tscsdata <- as.matrix(tscsdata[, -1])
200
201    cols <- rev(heat.colors(5))
202
203    image(z = t(tscsdata), axes = FALSE, col = cols, main = main,
204          ylab = "", xlab = "")
205    at.seq <- seq(from = 0, to = 1, length = ncol(tscsdata))
206    axis(1, labels = unique(ts), at = at.seq, tck = 0, lwd = 0, las = 2)
207    axis(2, labels = rownames(tscsdata), at = at.seq, tck = 0, lwd = 0,
208         las = 1, cex.axis = .8)
209
210    if (legend) {
211      leg.names <- c("0-0.2", "0.2-0.4", "0.4-0.6", "0.6-0.8", "0.8-1")
212      legend(x = 0.95, y = 1.01, col = cols, bty = "n", xjust = 1,
213             legend = leg.names, fill = cols, horiz = TRUE)
214    }
215  }
216
217  invisible(NULL)
218}
219