1library(survey)
2
3## two-phase simple random sampling.
4data(pbc, package="survival")
5pbc$id<-1:nrow(pbc)
6pbc$randomized<-with(pbc, !is.na(trt) & trt>-9)
7(d2pbc<-twophase(id=list(~id,~id), data=pbc, subset=~I(!randomized)))
8m<-svymean(~bili, d2pbc)
9all.equal(as.vector(coef(m)),with(pbc, mean(bili[!randomized])))
10all.equal(as.vector(SE(m)),
11          with(pbc, sd(bili[!randomized])/sqrt(sum(!randomized))),
12          tolerance=0.002)
13
14## two-stage sampling as two-phase
15data(mu284)
16ii<-with(mu284, c(1:15, rep(1:5,n2[1:5]-3)))
17mu284.1<-mu284[ii,]
18mu284.1$id<-1:nrow(mu284.1)
19mu284.1$sub<-rep(c(TRUE,FALSE),c(15,34-15))
20dmu284<-svydesign(id=~id1+id2,fpc=~n1+n2, data=mu284)
21## first phase cluster sample, second phase stratified within cluster
22(d2mu284<-twophase(id=list(~id1,~id),strata=list(NULL,~id1),
23                   fpc=list(~n1,NULL),data=mu284.1,subset=~sub,method="approx"))
24(d22mu284<-twophase(id=list(~id1,~id),strata=list(NULL,~id1),
25                   fpc=list(~n1,NULL),data=mu284.1,subset=~sub,method="full"))
26summary(d2mu284)
27t1<-svytotal(~y1, dmu284)
28t2<-svytotal(~y1, d2mu284)
29t22<-svytotal(~y1,d22mu284)
30m1<-svymean(~y1, dmu284)
31m2<-svymean(~y1, d2mu284)
32m22<-svymean(~y1, d22mu284)
33all.equal(coef(t1),coef(t2))
34all.equal(coef(t1),coef(t22))
35all.equal(coef(m1),coef(m2))
36all.equal(coef(m1),coef(m22))
37all.equal(as.vector(SE(m1)),as.vector(SE(m2)))
38all.equal(as.vector(SE(m1)),as.vector(SE(m22)))
39all.equal(as.vector(SE(t1)),as.vector(SE(t2)))
40all.equal(as.vector(SE(t1)),as.vector(SE(t22)))
41
42## case-cohort design
43##this example requires R 2.3.1 or later for cch and data.
44library("survival")
45data(nwtco, package="survival")
46## unstratified, equivalent to Lin & Ying (1993)
47print(dcchs<-twophase(id=list(~seqno,~seqno), strata=list(NULL,~rel),
48                 subset=~I(in.subcohort | rel), data=nwtco))
49cch1<-svycoxph(Surv(edrel,rel)~factor(stage)+factor(histol)+I(age/12),
50               design=dcchs)
51dcchs2<-twophase(id=list(~seqno,~seqno), strata=list(NULL,~rel),
52                 subset=~I(in.subcohort | rel), data=nwtco,method="approx")
53cch1.2<-svycoxph(Surv(edrel,rel)~factor(stage)+factor(histol)+I(age/12),
54               design=dcchs)
55all.equal(coef(cch1),coef(cch1.2))
56all.equal(SE(cch1),SE(cch1.2))
57## Using survival::cch
58subcoh <- nwtco$in.subcohort
59selccoh <- with(nwtco, rel==1|subcoh==1)
60ccoh.data <- nwtco[selccoh,]
61ccoh.data$subcohort <- subcoh[selccoh]
62cch2<-cch(Surv(edrel, rel) ~ factor(stage) + factor(histol) + I(age/12),
63          data =ccoh.data, subcoh = ~subcohort, id=~seqno,
64          cohort.size=4028, method="LinYing", robust=TRUE)
65
66print(all.equal(as.vector(coef(cch1)),as.vector(coef(cch2))))
67## cch has smaller variances by a factor of 1.0005 because
68## there is a (n/(n-1)) in the survey phase1 varianace
69print(all.equal(as.vector(SE(cch1)), as.vector(SE(cch2)),tolerance=0.0006))
70
71
72## bug report from Takahiro Tsuchiya for version 3.4
73## We used to not match Sarndal exactly, because our old phase-one
74## estimator had a small bias for finite populations
75rei<-read.table(tmp<-textConnection(
76"  id   N n.a h n.ah n.h   sub  y
771   1 300  20 1   12   5  TRUE  1
782   2 300  20 1   12   5  TRUE  2
793   3 300  20 1   12   5  TRUE  3
804   4 300  20 1   12   5  TRUE  4
815   5 300  20 1   12   5  TRUE  5
826   6 300  20 1   12   5 FALSE NA
837   7 300  20 1   12   5 FALSE NA
848   8 300  20 1   12   5 FALSE NA
859   9 300  20 1   12   5 FALSE NA
8610 10 300  20 1   12   5 FALSE NA
8711 11 300  20 1   12   5 FALSE NA
8812 12 300  20 1   12   5 FALSE NA
8913 13 300  20 2    8   3  TRUE  6
9014 14 300  20 2    8   3  TRUE  7
9115 15 300  20 2    8   3  TRUE  8
9216 16 300  20 2    8   3 FALSE NA
9317 17 300  20 2    8   3 FALSE NA
9418 18 300  20 2    8   3 FALSE NA
9519 19 300  20 2    8   3 FALSE NA
9620 20 300  20 2    8   3 FALSE NA
97"), header=TRUE)
98close(tmp)
99
100des.rei <- twophase(id=list(~id,~id), strata=list(NULL,~h),
101                    fpc=list(~N,NULL), subset=~sub, data=rei, method="approx")
102tot<- svytotal(~y, des.rei)
103des.rei2 <- twophase(id=list(~id,~id), strata=list(NULL,~h),
104                    fpc=list(~N,NULL), subset=~sub, data=rei)
105tot2<- svytotal(~y, des.rei2)
106
107## based on Sarndal et al (9.4.14)
108rei$w.ah <- rei$n.ah / rei$n.a
109a.rei <- aggregate(rei, by=list(rei$h), mean, na.rm=TRUE)
110a.rei$S.ysh <- tapply(rei$y, rei$h, var, na.rm=TRUE)
111a.rei$y.u <- sum(a.rei$w.ah * a.rei$y)
112V <- with(a.rei, sum(N * (N-1) * ((n.ah-1)/(n.a-1) - (n.h-1)/(N-1)) * w.ah * S.ysh / n.h))
113V <- V + with(a.rei, sum(N * (N-n.a) * w.ah * (y - y.u)^2 / (n.a-1)))
114
115a.rei$f.h<-with(a.rei, n.h/n.ah)
116Vphase2<-with(a.rei, sum(N*N*w.ah^2* ((1-f.h)/n.h) *S.ysh))
117
118a.rei$f<-with(a.rei, n.a/N)
119a.rei$delta.h<-with(a.rei, (1/n.h)*(n.a-n.ah)/(n.a-1))
120Vphase1<-with(a.rei, sum(N*N*((1-f)/n.a)*( w.ah*(1-delta.h)*S.ysh+ ((n.a)/(n.a-1))*w.ah*(y-y.u)^2)))
121
122V
123Vphase1
124Vphase2
125vcov(tot)
126vcov(tot2)
127## phase 2 identical
128all.equal(Vphase2,drop(attr(vcov(tot),"phases")$phase2))
129all.equal(Vphase2,drop(attr(vcov(tot2),"phases")$phase2))
130## phase 1 differs by 2.6% for old twophase estimator
131Vphase1/attr(vcov(tot),"phases")$phase1
132all.equal(Vphase1,as.vector(attr(vcov(tot2),"phases")$phase1))
133
134