1\name{Matchby}
2\alias{Matchby}
3\title{Grouped Multivariate and Propensity Score Matching}
4\description{
5  This function is a wrapper for the \code{\link{Match}} function which
6  separates the matching problem into subgroups defined by a factor.
7  This is equivalent to conducting exact matching on each level of a factor.
8  Matches within each level are found as determined by the
9  usual matching options.  This function is much faster for large
10  datasets than the \code{\link{Match}} function itself.  For additional
11  speed, consider doing matching without replacement---see the
12  \code{replace} option.  This function is more limited than the
13  \code{\link{Match}} function.  For example, \code{Matchby} cannot be
14  used if the user wishes to provide observation specific weights.
15}
16
17\usage{
18Matchby(Y, Tr, X, by, estimand = "ATT", M = 1, ties=FALSE, replace=TRUE,
19        exact = NULL, caliper = NULL, AI=FALSE, Var.calc=0,
20        Weight = 1, Weight.matrix = NULL, distance.tolerance = 1e-05,
21        tolerance = sqrt(.Machine$double.eps), print.level=1, version="Matchby", ...)
22}
23\arguments{
24  \item{Y}{ A vector containing the outcome of interest. Missing values are not allowed.}
25  \item{Tr}{ A vector indicating the observations which are
26    in the treatment regime and those which are not.  This can either be a
27    logical vector or a real vector where 0 denotes control and 1 denotes
28    treatment.}
29  \item{X}{ A matrix containing the variables we wish to match on.
30    This matrix may contain the actual observed covariates or the
31    propensity score or a combination of both.}
32  \item{by}{A "factor" in the sense that \code{as.factor(by)} defines the
33    grouping, or a list of such factors in which case their
34    interaction is used for the grouping.}
35  \item{estimand}{ A character string for the estimand.  The default
36    estimand is "ATT", the sample average treatment effect for the
37    treated. "ATE" is the sample average treatment effect (for all), and
38    "ATC" is the sample average treatment effect for the controls.}
39  \item{M}{ A scalar for the number of matches which should be
40    found. The default is one-to-one matching. Also see the
41    \code{ties} option.}
42  \item{ties}{A logical flag for whether ties should be handled
43    deterministically.  By default \code{ties==FALSE}. If \code{ties==TRUE} and, for example, one
44    treated observation matches more than one control observation, the
45    matched dataset will include the multiple matched control observations
46    and the matched data will be weighted to reflect the multiple matches.
47    The sum of the weighted observations will still equal the original
48    number of observations. If \code{ties==FALSE}, ties will be randomly
49    broken.  \emph{If the dataset is large and there are many ties,
50      setting \code{ties=FALSE} often results in a large speedup.} Whether
51    two potential matches are close enough to be considered tied, is
52    controlled by the \code{distance.tolerance} option.}
53  \item{replace}{Whether matching should be done with replacement.  Note
54    that if \code{FALSE}, the order of matches generally matters.  Matches
55    will be found in the same order as the data is sorted.  Thus, the
56    match(es) for the first observation will be found first and then for
57    the second etc. Matching without replacement will generally increase
58    bias so it is not recommended.  \emph{But if the dataset is large and
59      there are many potential matches, setting \code{replace=FALSE} often
60      results in a large speedup and negligible or no bias.} Ties are
61    randomly broken when \code{replace==FALSE}---see the \code{ties}
62    option for details.}
63  \item{exact}{ A logical scalar or vector for whether exact matching
64    should be done.  If a logical scalar is provided, that logical value is
65    applied to all covariates of
66    \code{X}.  If a logical vector is provided, a logical value should
67    be provided for each covariate in \code{X}. Using a logical vector
68    allows the user to specify exact matching for some but not other
69    variables.  When exact matches are not found, observations are
70    dropped.  \code{distance.tolerance} determines what is considered to be an
71    exact match. The \code{exact} option takes precedence over the
72    \code{caliper} option.}
73  \item{caliper}{ A scalar or vector denoting the caliper(s) which
74    should be used when matching.  A caliper is the distance which is
75    acceptable for any match.  Observations which are outside of the
76    caliper are dropped. If a scalar caliper is provided, this caliper is
77    used for all covariates in \code{X}.  If a vector of calipers is
78    provided, a caliper value should be provided for each covariate in
79    \code{X}. The caliper is interpreted to be in standardized units.  For
80    example, \code{caliper=.25} means that all matches not equal to or
81    within .25 standard deviations of each covariate in \code{X} are
82    dropped.}
83  \item{AI}{A logical flag for if the Abadie-Imbens standard error
84    should be calculated. It is computationally expensive to calculate
85    with large datasets. \code{Matchby} can only calculate AI SEs for ATT.
86    To calculate AI errors with other estimands, please use the
87    \code{\link{Match}} function.  See the \code{Var.calc} option if one
88    does not want to assume homoscedasticity.}
89  \item{Var.calc}{A scalar for the variance estimate
90    that should be used.  By default \code{Var.calc=0} which means that
91    homoscedasticity is assumed.  For values of  \code{Var.calc > 0},
92    robust variances are calculated using \code{Var.calc} matches.}
93  \item{Weight}{ A scalar for the type of
94    weighting scheme the matching algorithm should use when weighting
95    each of the covariates in \code{X}.  The default value of
96    1 denotes that weights are equal to the inverse of the variances. 2
97    denotes the Mahalanobis distance metric, and 3 denotes
98    that the user will supply a weight matrix (\code{Weight.matrix}).  Note that
99    if the user supplies a \code{Weight.matrix}, \code{Weight} will be automatically
100    set to be equal to 3.}
101  \item{Weight.matrix}{ This matrix denotes the weights the matching
102    algorithm uses when weighting each of the covariates in \code{X}---see
103    the \code{Weight} option. This square matrix should have as many
104    columns as the number of columns of the \code{X} matrix. This matrix
105    is usually provided by a call to the \code{\link{GenMatch}} function
106    which finds the optimal weight each variable should be given so as to
107    achieve balance on the covariates. \cr
108
109    For most uses, this matrix has zeros in the off-diagonal
110    cells.  This matrix can be used to weight some variables more than
111    others.  For
112    example, if \code{X} contains three variables and we want to
113    match as best as we can on the first, the following would work well:
114    \cr
115    \code{> Weight.matrix <- diag(3)}\cr
116    \code{> Weight.matrix[1,1] <- 1000/var(X[,1])} \cr
117    \code{> Weight.matrix[2,2] <- 1/var(X[,2])} \cr
118    \code{> Weight.matrix[3,3] <- 1/var(X[,3])} \cr
119    This code changes the weights implied by the
120    inverse of the variances by multiplying the first variable by 1000
121    so that it is highly weighted.  In order to enforce exact matching
122    see the \code{exact} and \code{caliper} options.}
123  \item{distance.tolerance}{This is a scalar which is used to determine if distances
124    between two observations are different from zero.  Values less than
125    \code{distance.tolerance} are deemed to be equal to zero.  This
126    option can be used to perform a type of optimal matching.}
127  \item{tolerance}{ This is a scalar which is used to determine
128    numerical tolerances.  This option is used by numerical routines
129    such as those used to determine if a matrix is singular.}
130  \item{print.level}{The level of printing. Set to '0' to turn off printing.}
131  \item{version}{The version of the code to be used.  The "Matchby" C/C++
132    version of the code is the fastest, and the end-user should not
133    change this option.}
134  \item{...}{Additional arguments passed on to \code{\link{Match}}.}
135}
136\details{
137  \code{Matchby} is much faster for large datasets than
138  \code{\link{Match}}.  But \code{Matchby} only implements a subset of
139  the functionality of \code{\link{Match}}.  For example, the
140  \code{restrict} option cannot be used, Abadie-Imbens standard errors are
141  only available for the ATT estimand (see the \code{AI} option), and bias adjustment cannot be requested.
142  \code{Matchby} is a wrapper for the \code{\link{Match}} function which
143  separates the matching problem into subgroups defined by a factor.  This
144  is equivalent to doing exact matching on each level of the factor, and the
145  way in which matches are found within each level is determined by the
146  usual matching options. \cr
147
148  \emph{Note that by default \code{ties=FALSE} although the default for
149    the \code{Match} and \code{GenMatch} functions is \code{TRUE}.  This is
150    done because randomly breaking ties in large datasets often results in
151    a great speedup.}  For additional speed, consider doing matching
152  without replacement which is often much faster when the dataset is
153  large---see the \code{replace} option. \cr
154
155  There will be slight differences in the matches produced by
156  \code{Matchby} and \code{\link{Match}} because of how the covariates
157  are weighted.  When the data is broken up into separate groups (via
158  the \code{by} option), Mahalanobis distance and inverse variance
159  will imply different weights than when the data is taken as a whole.
160}
161
162\value{
163  \item{est}{The estimated average causal effect.}
164  \item{se.standard }{The usual standard error.  This is the standard error
165    calculated on the matched data using the usual method of calculating
166    the difference of means (between treated and control) weighted so
167    that ties are taken into account.}
168  \item{se }{The Abadie-Imbens standard error. This is only calculated
169    if the \code{AI} option is \code{TRUE}. This standard error has
170    correct coverage if \code{X} consists of either covariates or a
171    known propensity score because it takes into account the uncertainty
172    of the matching
173    procedure.  If an estimated propensity score is used, the
174    uncertainty involved in its estimation is not accounted for although the
175    uncertainty of the matching procedure itself still is.}
176  \item{index.treated }{A vector containing the observation numbers from
177    the original dataset for the treated observations in the
178    matched dataset.  This index in conjunction with \code{index.control}
179    can be used to recover the matched dataset produced by
180    \code{Matchby}.  For example, the \code{X} matrix used by \code{Matchby}
181    can be recovered by
182    \code{rbind(X[index.treated,],X[index.control,])}.}
183  \item{index.control }{A vector containing the observation numbers from
184    the original data for the control observations in the
185    matched data.  This index in conjunction with \code{index.treated}
186    can be used to recover the matched dataset produced by
187    \code{Matchby}.  For example, the \code{Y} vector for the matched dataset
188    can be recovered by
189    \code{c(Y[index.treated],Y[index.control])}.}
190  \item{weights}{The weights for each observation in the matched
191    dataset.}
192  \item{orig.nobs }{The original number of observations in the dataset.}
193  \item{nobs }{The number of observations in the matched dataset.}
194  \item{wnobs }{The number of weighted observations in the matched dataset.}
195  \item{orig.treated.nobs}{The original number of treated observations.}
196  \item{ndrops}{The number of matches which were dropped because there
197    were not enough observations in a given group and because of
198    caliper and exact matching.}
199  \item{estimand}{The estimand which was estimated.}
200  \item{version}{The version of \code{\link{Match}} which was used.}
201}
202\references{
203  Sekhon, Jasjeet S. 2011.  ``Multivariate and Propensity Score
204  Matching Software with Automated Balance Optimization.''
205  \emph{Journal of Statistical Software} 42(7): 1--52.
206  \doi{10.18637/jss.v042.i07}
207
208  Diamond, Alexis and Jasjeet S. Sekhon. 2013. ``Genetic
209  Matching for Estimating Causal Effects: A General Multivariate
210  Matching Method for Achieving Balance in Observational Studies.''
211  \emph{Review of Economics and Statistics}.  95 (3): 932--945.
212  \url{http://sekhon.berkeley.edu/papers/GenMatch.pdf}
213
214  Abadie, Alberto and Guido Imbens. 2006.
215  ``Large Sample Properties of Matching Estimators for Average
216  Treatment Effects.'' \emph{Econometrica} 74(1): 235--267.
217
218  Imbens, Guido. 2004. Matching Software for Matlab and
219  Stata.
220
221}
222\author{Jasjeet S. Sekhon, UC Berkeley, \email{sekhon@berkeley.edu},
223  \url{http://sekhon.berkeley.edu/}.
224}
225\seealso{ Also see \code{\link{Match}},
226    \code{\link{summary.Matchby}},
227    \code{\link{GenMatch}},
228    \code{\link{MatchBalance}},
229    \code{\link{balanceUV}},
230    \code{\link{qqstats}}, \code{\link{ks.boot}},
231    \code{\link{GerberGreenImai}}, \code{\link{lalonde}}
232}
233\examples{
234#
235# Match exactly by racial groups and then match using the propensity score within racial groups
236#
237
238data(lalonde)
239
240#
241# Estimate the Propensity Score
242#
243glm1  <- glm(treat~age + I(age^2) + educ + I(educ^2) +
244             hisp + married + nodegr + re74  + I(re74^2) + re75 + I(re75^2) +
245             u74 + u75, family=binomial, data=lalonde)
246
247
248#save data objects
249#
250X  <- glm1$fitted
251Y  <- lalonde$re78
252Tr <- lalonde$treat
253
254# one-to-one matching with replacement (the "M=1" option) after exactly
255# matching on race using the 'by' option.  Estimating the treatment
256# effect on the treated (the "estimand" option defaults to ATT).
257rr  <- Matchby(Y=Y, Tr=Tr, X=X, by=lalonde$black, M=1);
258summary(rr)
259
260# Let's check the covariate balance
261# 'nboots' is set to small values in the interest of speed.
262# Please increase to at least 500 each for publication quality p-values.
263mb  <- MatchBalance(treat~age + I(age^2) + educ + I(educ^2) + black +
264                    hisp + married + nodegr + re74  + I(re74^2) + re75 + I(re75^2) +
265                    u74 + u75, data=lalonde, match.out=rr, nboots=10)
266
267}
268\keyword{nonparametric}
269
270
271%  LocalWords:  MatchBalance GenMatch emph estimand ATT BiasAdjust calc dataset
272%  LocalWords:  ATC ecaliper cr diag homoscedasticity rbind GerberGreenImai se
273%  LocalWords:  DehejiaWahba AbadieImbens noadj cond mdata datasets wnobs url
274%  LocalWords:  ndrops Abadie Imbens Econometrica Matlab Stata UC seealso Wahba
275%  LocalWords:  balanceUV lalonde Dehejia psid Rajeev Sadek glm hisp
276%  LocalWords:  nodegr rr nboots nmc mb Matchby ret qqstats Mahalanobis SEs
277%  LocalWords:  estimands
278