1find.matches <- function(x, y, tol=rep(0,ncol(y)), scale=tol,
2                         maxmatch=10)
3{
4  if(.R.) rep.int <- rep
5
6  ##if(length(dim(x))==0) x <- matrix(x, nrow=1)  10may02
7  if(!is.matrix(x))
8    x <- as.matrix(x)
9
10  n <- nrow(x)
11  p <- ncol(x)
12  if(!is.matrix(y))
13    y <- as.matrix(y)  ## 10may02
14
15  if(p != ncol(y))
16    stop("number of columns of x and y must match")
17
18  ny <- nrow(y)
19  rown <- dimnames(x)[[1]]
20  ry <- dimnames(y)[[1]]
21  matches <- matrix(if(length(ry))
22                      ""
23                    else
24                      0,
25                    n, maxmatch,
26                    dimnames=list(rown,
27                                  paste("Match #",1:maxmatch,sep="")))
28
29  distance <- matrix(NA, n, maxmatch,
30                     dimnames=list(rown,
31                                   paste("Distance #",1:maxmatch,sep="")))
32
33  if(length(ry)==0)
34    ry <- 1:ny
35
36  scale <- ifelse(scale==0,1,tol)
37  ones <- rep(1,p)
38  mx <- 0
39  for(i in 1:n) {
40    dif <- abs(y - rep(x[i,], rep.int(ny,p)))
41    toll <- rep(tol, rep.int(nrow(dif),p))
42    which <- (1:ny)[((dif > toll) %*% ones)==0]
43    lw <- length(which)
44    if(lw) {
45      scaled <- dif[which,,drop=FALSE]/rep(scale, rep.int(lw,p))
46      dist <- (scaled^2) %*% ones
47      lw <- min(lw,maxmatch)
48      mx <- max(mx,lw)
49      d <- order(dist)[1:lw]
50      matches[i,1:lw] <- ry[which[d]]
51      distance[i,1:lw] <- dist[d]
52    }
53  }
54
55  structure(list(matches=matches[,1:mx], distance=distance[,1:mx]),
56            class="find.matches")
57}
58
59
60print.find.matches <- function(x, digits=.Options$digits, ...)
61{
62  cat("\nMatches:\n\n")
63  print(x$matches, quote=FALSE)
64  cat("\nDistances:\n\n")
65  print(x$distance, digits=digits)
66  invisible()
67}
68
69
70summary.find.matches <- function(object, ...)
71{
72  mat <- object$matches
73  dist <- object$distance
74  cat("Frequency table of number of matches found per observation\n\n")
75  m <- (!is.na(dist)) %*% rep(1,ncol(mat))
76  print(table(m))
77  cat("\nMedian minimum distance by number of matches\n\n")
78  print(tapply(dist[m>0,1], m[m>0], median))
79  ta <- table(mat[m>0,1])
80  ta <- ta[ta>1]
81  if(length(ta)) {
82    cat("\nObservations selected first more than once (with frequencies)\n\n")
83    print(ta)
84  } else cat("\nNo observations selected first more than once\n\n")
85
86  invisible()
87}
88
89
90matchCases <- function(xcase,    ycase,    idcase=names(ycase),
91                       xcontrol, ycontrol, idcontrol=names(ycontrol),
92                       tol=NULL,
93                       maxobs=max(length(ycase),length(ycontrol))*10,
94                       maxmatch=20, which=c('closest','random'))
95{
96  if(!length(tol))
97    stop('must specify tol')
98
99  if((length(xcase)!=length(ycase)) || (length(xcontrol)!=length(ycontrol)))
100    stop('lengths of xcase, ycase and of xcontrol, ycontrol must be same')
101
102  which <- match.arg(which)
103
104  ycase    <- as.matrix(ycase)
105  ycontrol <- as.matrix(ycontrol)
106  if(!length(idcase))
107    idcase <- 1:length(ycase)
108
109  if(!length(idcontrol))
110    idcontrol <- 1:length(ycontrol)
111
112  idcase    <- as.character(idcase)
113  idcontrol <- as.character(idcontrol)
114
115  j <- is.na(ycase %*% rep(1,ncol(ycase))) | is.na(xcase)
116  if(any(j)) {
117    warning(paste(sum(j),'cases removed due to NAs'))
118    ycase <- ycase[!j,,drop=FALSE]
119    xcase <- xcase[!j]
120    idcase <- idcase[!j]
121  }
122
123  j <- is.na(ycontrol %*% rep(1,ncol(ycontrol))) | is.na(xcontrol)
124  if(any(j)) {
125    warning(paste(sum(j),'controls removed due to NAs'))
126    ycontrol <- ycontrol[!j,,drop=FALSE]
127    xcontrol <- xcontrol[!j]
128    idcontrol <- idcontrol[!j]
129  }
130
131  idCase <- id <- character(maxobs)
132  type   <- factor(rep(NA,maxobs), c('case','control'))
133  x      <- numeric(maxobs)
134  y      <- matrix(NA, ncol=ncol(ycase), nrow=maxobs)
135
136  last <- 0
137  ncase <- length(ycase)
138  ncontrol <- length(ycontrol)
139  matches  <- integer(ncase)
140  for(i in 1:ncase) {
141    s <- abs(xcontrol-xcase[i]) <= tol
142    nmatch <- sum(s)
143    if(nmatch > maxmatch) {
144      s <- (1:ncontrol)[s]  ## next line was sample(j,...) 4jun02
145      if(which=="random")
146        s <- sample(s, maxmatch, replace=FALSE)
147      else {
148        errors <- abs(xcontrol[s]-xcase[i])
149        serrors <- order(errors)
150        s <- (s[serrors])[1:maxmatch]
151      }
152
153      nmatch <- maxmatch
154    }
155
156    matches[i] <- nmatch
157    if(!nmatch)
158      next
159
160    end <- last + nmatch + 1
161    if(end > maxobs)
162      stop(paste('needed maxobs >',maxobs))
163
164    start <- last+1
165    last <- end
166    idCase[start:end] <- rep(idcase[i], nmatch+1)
167    type[start:end]   <- c('case',rep('control',nmatch))
168    id[start:end]     <- c(idcase[i], idcontrol[s])
169    x[start:end]      <- c(xcase[i], xcontrol[s])
170    y[start:end,]     <- rbind(ycase[i,,drop=FALSE], ycontrol[s,,drop=FALSE])
171  }
172
173  cat('\nFrequencies of Number of Matched Controls per Case:\n\n')
174  print(table(matches))
175  cat('\n')
176  structure(list(idcase=idCase[1:end], type=type[1:end],
177                 id=id[1:end], x=x[1:end], y=drop(y[1:end,])),
178            row.names=as.character(1:end),
179            class='data.frame')
180}
181