1##########################################################################
2## tobit regression model
3##
4## This software is distributed under the terms of the GNU GENERAL
5## PUBLIC LICENSE Version 2, June 1991.  See the package LICENSE
6## file for more information.
7##
8## Copyright (C) 2003-2007 Andrew D. Martin and Kevin M. Quinn
9## Copyright (C) 2007-present Andrew D. Martin, Kevin M. Quinn,
10##    and Jong Hee Park
11##########################################################################
12
13
14#' Markov Chain Monte Carlo for Gaussian Linear Regression with a Censored
15#' Dependent Variable
16#'
17#' This function generates a sample from the posterior distribution of a linear
18#' regression model with Gaussian errors using Gibbs sampling (with a
19#' multivariate Gaussian prior on the beta vector, and an inverse Gamma prior
20#' on the conditional error variance).  The dependent variable may be censored
21#' from below, from above, or both. The user supplies data and priors, and a
22#' sample from the posterior distribution is returned as an mcmc object, which
23#' can be subsequently analyzed with functions provided in the coda package.
24#'
25#' \code{MCMCtobit} simulates from the posterior distribution using standard
26#' Gibbs sampling (a multivariate Normal draw for the betas, and an inverse
27#' Gamma draw for the conditional error variance). \code{MCMCtobit} differs
28#' from \code{MCMCregress} in that the dependent variable may be censored from
29#' below, from above, or both. The simulation proper is done in compiled C++
30#' code to maximize efficiency.  Please consult the coda documentation for a
31#' comprehensive list of functions that can be used to analyze the posterior
32#' sample.
33#'
34#' The model takes the following form:
35#'
36#' \deqn{y_i = x_i ' \beta + \varepsilon_{i},}
37#'
38#' where the errors are assumed
39#' to be Gaussian:
40#'
41#' \deqn{\varepsilon_{i} \sim \mathcal{N}(0, \sigma^2).}
42#'
43#' Let \eqn{c_1} and \eqn{c_2} be the two censoring points, and let
44#' \eqn{y_i^\ast} be the partially observed dependent variable. Then,
45#'
46#' \deqn{y_i = y_i^{\ast} \texttt{ if } c_1 < y_i^{\ast} < c_2,}
47#'
48#' \deqn{y_i = c_1 \texttt{ if } c_1 \geq y_i^{\ast},}
49#'
50#' \deqn{y_i = c_2 \texttt{ if } c_2 \leq y_i^{\ast}.}
51#'
52#' We assume standard, semi-conjugate priors:
53#'
54#' \deqn{\beta \sim \mathcal{N}(b_0,B_0^{-1}),}
55#'
56#' and:
57#'
58#' \deqn{\sigma^{-2} \sim \mathcal{G}amma(c_0/2, d_0/2),}
59#'
60#' where \eqn{\beta} and \eqn{\sigma^{-2}} are
61#' assumed \emph{a priori} independent.  Note that only starting
62#' values for \eqn{\beta} are allowed because simulation is done
63#' using Gibbs sampling with the conditional error variance as the
64#' first block in the sampler.
65#'
66#' @param formula A model formula.
67#'
68#' @param data A dataframe.
69#'
70#' @param below The point at which the dependent variable is censored from
71#' below. The default is zero. To censor from above only, specify that below =
72#' -Inf.
73#' @param above The point at which the dependent variable is censored from
74#' above. To censor from below only, use the default value of Inf.
75#'
76#' @param burnin The number of burn-in iterations for the sampler.
77#'
78#' @param mcmc The number of MCMC iterations after burnin.
79#'
80#' @param thin The thinning interval used in the simulation.  The number of
81#' MCMC iterations must be divisible by this value.
82#'
83#' @param verbose A switch which determines whether or not the progress of the
84#' sampler is printed to the screen.  If \code{verbose} is greater than 0 the
85#' iteration number, the \eqn{\beta} vector, and the error variance is
86#' printed to the screen every \code{verbose}th iteration.
87#'
88#' @param seed The seed for the random number generator.  If NA, the Mersenne
89#' Twister generator is used with default seed 12345; if an integer is passed
90#' it is used to seed the Mersenne twister.  The user can also pass a list of
91#' length two to use the L'Ecuyer random number generator, which is suitable
92#' for parallel computation.  The first element of the list is the L'Ecuyer
93#' seed, which is a vector of length six or NA (if NA a default seed of
94#' \code{rep(12345,6)} is used).  The second element of list is a positive
95#' substream number. See the MCMCpack specification for more details.
96#'
97#' @param beta.start The starting values for the \eqn{\beta} vector.
98#' This can either be a scalar or a column vector with dimension equal to the
99#' number of betas. The default value of of NA will use the OLS estimate of
100#' \eqn{\beta} as the starting value.  If this is a scalar, that value
101#' will serve as the starting value mean for all of the betas.
102#'
103#' @param b0 The prior mean of \eqn{\beta}.  This can either be a scalar
104#' or a column vector with dimension equal to the number of betas. If this
105#' takes a scalar value, then that value will serve as the prior mean for all
106#' of the betas.
107#'
108#' @param B0 The prior precision of \eqn{\beta}.  This can either be a
109#' scalar or a square matrix with dimensions equal to the number of betas.  If
110#' this takes a scalar value, then that value times an identity matrix serves
111#' as the prior precision of beta. Default value of 0 is equivalent to an
112#' improper uniform prior for beta.
113#'
114#' @param c0 \eqn{c_0/2} is the shape parameter for the inverse Gamma
115#' prior on \eqn{\sigma^2} (the variance of the disturbances). The
116#' amount of information in the inverse Gamma prior is something like that from
117#' \eqn{c_0} pseudo-observations.
118#'
119#' @param d0 \eqn{d_0/2} is the scale parameter for the inverse Gamma
120#' prior on \eqn{\sigma^2} (the variance of the disturbances). In
121#' constructing the inverse Gamma prior, \eqn{d_0} acts like the sum of
122#' squared errors from the \eqn{c_0} pseudo-observations.
123#'
124#' @param ... further arguments to be passed
125#'
126#' @return An mcmc object that contains the posterior sample.  This object can
127#' be summarized by functions provided by the coda package.
128#'
129#' @export
130#'
131#' @author Ben Goodrich, \email{goodrich.ben@gmail.com},
132#' \url{http://www.columbia.edu/~bg2382/}
133#'
134#' @seealso \code{\link[coda]{plot.mcmc}}, \code{\link[coda]{summary.mcmc}},
135#' \code{\link[survival]{survreg}}, \code{\link[MCMCpack]{MCMCregress}}
136#'
137#' @references Andrew D. Martin, Kevin M. Quinn, and Jong Hee Park. 2011.
138#' ``MCMCpack: Markov Chain Monte Carlo in R.'', \emph{Journal of Statistical
139#' Software}. 42(9): 1-21.  \doi{10.18637/jss.v042.i09}.
140#'
141#' Daniel Pemstein, Kevin M. Quinn, and Andrew D. Martin.  2007.  \emph{Scythe
142#' Statistical Library 1.0.} \url{http://scythe.lsa.umich.edu}.
143#'
144#' Martyn Plummer, Nicky Best, Kate Cowles, and Karen Vines. 2006.  ``Output
145#' Analysis and Diagnostics for MCMC (CODA)'', \emph{R News}. 6(1): 7-11.
146#' \url{https://CRAN.R-project.org/doc/Rnews/Rnews_2006-1.pdf}.
147#'
148#' Siddhartha Chib. 1992. ``Bayes inference in the Tobit censored regression
149#' model."  \emph{Journal of Econometrics}. 51:79-99.
150#'
151#' James Tobin. 1958. ``Estimation of relationships for limited dependent
152#' variables." \emph{Econometrica.} 26:24-36.
153#'
154#' @keywords models
155#'
156#' @examples
157#'
158#' \dontrun{
159#' library(survival)
160#' example(tobin)
161#' summary(tfit)
162#' tfit.mcmc <- MCMCtobit(durable ~ age + quant, data=tobin, mcmc=30000,
163#'                         verbose=1000)
164#' plot(tfit.mcmc)
165#' raftery.diag(tfit.mcmc)
166#' summary(tfit.mcmc)
167#' }
168#'
169"MCMCtobit" <-
170function(formula, data=NULL, below = 0.0, above = Inf,
171           burnin = 1000, mcmc = 10000,
172           thin=1, verbose = 0, seed = NA, beta.start = NA,
173           b0 = 0, B0 = 0, c0 = 0.001, d0 = 0.001, ...) {
174
175    # checks
176    check.offset(list(...))
177    check.mcmc.parameters(burnin, mcmc, thin)
178    if (!is.numeric(below) | !is.numeric(above)) {
179        cat("Error: Censoring points must be numeric, which includes +-Inf.\n")
180        stop("Please respecify and call ", calling.function(), " again.",
181        call.=FALSE)
182    }
183    if (below >= above) {
184        cat("Error: Truncation points are logically inconsistent.\n")
185        stop("Please respecify and call ", calling.function(), " again.",
186        call.=FALSE)
187    }
188
189    # convert infinite values to finite approximations
190    if(is.infinite(below)) below <- .Machine$double.xmax*-1
191    if(is.infinite(above)) above <- .Machine$double.xmax
192
193    # seeds
194    seeds <- form.seeds(seed)
195    lecuyer <- seeds[[1]]
196    seed.array <- seeds[[2]]
197    lecuyer.stream <- seeds[[3]]
198
199    # form response and model matrices
200    holder <- parse.formula(formula, data=data)
201    Y <- holder[[1]]
202    X <- holder[[2]]
203    xnames <- holder[[3]]
204    K <- ncol(X)  # number of covariates
205
206    # starting values and priors
207    beta.start <- coef.start(beta.start, K, formula, family=gaussian, data)
208    mvn.prior <- form.mvn.prior(b0, B0, K)
209    b0 <- mvn.prior[[1]]
210    B0 <- mvn.prior[[2]]
211    check.ig.prior(c0, d0)
212
213    # define holder for posterior sample
214    sample <- matrix(data=0, mcmc/thin, K+1)
215    posterior <- NULL
216
217    # call C++ code to draw sample
218    auto.Scythe.call(output.object="posterior", cc.fun.name="cMCMCtobit",
219                     sample.nonconst=sample, Y=Y, X=X, below=as.double(below),
220                     above=as.double(above),
221                     burnin=as.integer(burnin), mcmc=as.integer(mcmc),
222                     thin=as.integer(thin),
223                     lecuyer=as.integer(lecuyer),
224                     seedarray=as.integer(seed.array),
225                     lecuyerstream=as.integer(lecuyer.stream),
226                     verbose=as.integer(verbose), betastart=beta.start,
227                     b0=b0, B0=B0, c0=as.double(c0), d0=as.double(d0))
228
229    # pull together matrix and build MCMC object to return
230    output <- form.mcmc.object(posterior,
231                               names=c(xnames, "sigma2"),
232                               title="MCMCtobit Posterior Sample")
233    return(output)
234  }
235