1library(survey) 2 3## two-phase simple random sampling. 4data(pbc, package="survival") 5pbc$id<-1:nrow(pbc) 6pbc$randomized<-with(pbc, !is.na(trt) & trt>-9) 7(d2pbc<-twophase(id=list(~id,~id), data=pbc, subset=~I(!randomized))) 8m<-svymean(~bili, d2pbc) 9all.equal(as.vector(coef(m)),with(pbc, mean(bili[!randomized]))) 10all.equal(as.vector(SE(m)), 11 with(pbc, sd(bili[!randomized])/sqrt(sum(!randomized))), 12 tolerance=0.002) 13 14## two-stage sampling as two-phase 15data(mu284) 16ii<-with(mu284, c(1:15, rep(1:5,n2[1:5]-3))) 17mu284.1<-mu284[ii,] 18mu284.1$id<-1:nrow(mu284.1) 19mu284.1$sub<-rep(c(TRUE,FALSE),c(15,34-15)) 20dmu284<-svydesign(id=~id1+id2,fpc=~n1+n2, data=mu284) 21## first phase cluster sample, second phase stratified within cluster 22(d2mu284<-twophase(id=list(~id1,~id),strata=list(NULL,~id1), 23 fpc=list(~n1,NULL),data=mu284.1,subset=~sub,method="approx")) 24(d22mu284<-twophase(id=list(~id1,~id),strata=list(NULL,~id1), 25 fpc=list(~n1,NULL),data=mu284.1,subset=~sub,method="full")) 26summary(d2mu284) 27t1<-svytotal(~y1, dmu284) 28t2<-svytotal(~y1, d2mu284) 29t22<-svytotal(~y1,d22mu284) 30m1<-svymean(~y1, dmu284) 31m2<-svymean(~y1, d2mu284) 32m22<-svymean(~y1, d22mu284) 33all.equal(coef(t1),coef(t2)) 34all.equal(coef(t1),coef(t22)) 35all.equal(coef(m1),coef(m2)) 36all.equal(coef(m1),coef(m22)) 37all.equal(as.vector(SE(m1)),as.vector(SE(m2))) 38all.equal(as.vector(SE(m1)),as.vector(SE(m22))) 39all.equal(as.vector(SE(t1)),as.vector(SE(t2))) 40all.equal(as.vector(SE(t1)),as.vector(SE(t22))) 41 42## case-cohort design 43##this example requires R 2.3.1 or later for cch and data. 44library("survival") 45data(nwtco, package="survival") 46## unstratified, equivalent to Lin & Ying (1993) 47print(dcchs<-twophase(id=list(~seqno,~seqno), strata=list(NULL,~rel), 48 subset=~I(in.subcohort | rel), data=nwtco)) 49cch1<-svycoxph(Surv(edrel,rel)~factor(stage)+factor(histol)+I(age/12), 50 design=dcchs) 51dcchs2<-twophase(id=list(~seqno,~seqno), strata=list(NULL,~rel), 52 subset=~I(in.subcohort | rel), data=nwtco,method="approx") 53cch1.2<-svycoxph(Surv(edrel,rel)~factor(stage)+factor(histol)+I(age/12), 54 design=dcchs) 55all.equal(coef(cch1),coef(cch1.2)) 56all.equal(SE(cch1),SE(cch1.2)) 57## Using survival::cch 58subcoh <- nwtco$in.subcohort 59selccoh <- with(nwtco, rel==1|subcoh==1) 60ccoh.data <- nwtco[selccoh,] 61ccoh.data$subcohort <- subcoh[selccoh] 62cch2<-cch(Surv(edrel, rel) ~ factor(stage) + factor(histol) + I(age/12), 63 data =ccoh.data, subcoh = ~subcohort, id=~seqno, 64 cohort.size=4028, method="LinYing", robust=TRUE) 65 66print(all.equal(as.vector(coef(cch1)),as.vector(coef(cch2)))) 67## cch has smaller variances by a factor of 1.0005 because 68## there is a (n/(n-1)) in the survey phase1 varianace 69print(all.equal(as.vector(SE(cch1)), as.vector(SE(cch2)),tolerance=0.0006)) 70 71 72## bug report from Takahiro Tsuchiya for version 3.4 73## We used to not match Sarndal exactly, because our old phase-one 74## estimator had a small bias for finite populations 75rei<-read.table(tmp<-textConnection( 76" id N n.a h n.ah n.h sub y 771 1 300 20 1 12 5 TRUE 1 782 2 300 20 1 12 5 TRUE 2 793 3 300 20 1 12 5 TRUE 3 804 4 300 20 1 12 5 TRUE 4 815 5 300 20 1 12 5 TRUE 5 826 6 300 20 1 12 5 FALSE NA 837 7 300 20 1 12 5 FALSE NA 848 8 300 20 1 12 5 FALSE NA 859 9 300 20 1 12 5 FALSE NA 8610 10 300 20 1 12 5 FALSE NA 8711 11 300 20 1 12 5 FALSE NA 8812 12 300 20 1 12 5 FALSE NA 8913 13 300 20 2 8 3 TRUE 6 9014 14 300 20 2 8 3 TRUE 7 9115 15 300 20 2 8 3 TRUE 8 9216 16 300 20 2 8 3 FALSE NA 9317 17 300 20 2 8 3 FALSE NA 9418 18 300 20 2 8 3 FALSE NA 9519 19 300 20 2 8 3 FALSE NA 9620 20 300 20 2 8 3 FALSE NA 97"), header=TRUE) 98close(tmp) 99 100des.rei <- twophase(id=list(~id,~id), strata=list(NULL,~h), 101 fpc=list(~N,NULL), subset=~sub, data=rei, method="approx") 102tot<- svytotal(~y, des.rei) 103des.rei2 <- twophase(id=list(~id,~id), strata=list(NULL,~h), 104 fpc=list(~N,NULL), subset=~sub, data=rei) 105tot2<- svytotal(~y, des.rei2) 106 107## based on Sarndal et al (9.4.14) 108rei$w.ah <- rei$n.ah / rei$n.a 109a.rei <- aggregate(rei, by=list(rei$h), mean, na.rm=TRUE) 110a.rei$S.ysh <- tapply(rei$y, rei$h, var, na.rm=TRUE) 111a.rei$y.u <- sum(a.rei$w.ah * a.rei$y) 112V <- with(a.rei, sum(N * (N-1) * ((n.ah-1)/(n.a-1) - (n.h-1)/(N-1)) * w.ah * S.ysh / n.h)) 113V <- V + with(a.rei, sum(N * (N-n.a) * w.ah * (y - y.u)^2 / (n.a-1))) 114 115a.rei$f.h<-with(a.rei, n.h/n.ah) 116Vphase2<-with(a.rei, sum(N*N*w.ah^2* ((1-f.h)/n.h) *S.ysh)) 117 118a.rei$f<-with(a.rei, n.a/N) 119a.rei$delta.h<-with(a.rei, (1/n.h)*(n.a-n.ah)/(n.a-1)) 120Vphase1<-with(a.rei, sum(N*N*((1-f)/n.a)*( w.ah*(1-delta.h)*S.ysh+ ((n.a)/(n.a-1))*w.ah*(y-y.u)^2))) 121 122V 123Vphase1 124Vphase2 125vcov(tot) 126vcov(tot2) 127## phase 2 identical 128all.equal(Vphase2,drop(attr(vcov(tot),"phases")$phase2)) 129all.equal(Vphase2,drop(attr(vcov(tot2),"phases")$phase2)) 130## phase 1 differs by 2.6% for old twophase estimator 131Vphase1/attr(vcov(tot),"phases")$phase1 132all.equal(Vphase1,as.vector(attr(vcov(tot2),"phases")$phase1)) 133 134