1 2R version 3.4.2 (2017-09-28) -- "Short Summer" 3Copyright (C) 2017 The R Foundation for Statistical Computing 4Platform: x86_64-apple-darwin15.6.0 (64-bit) 5 6R is free software and comes with ABSOLUTELY NO WARRANTY. 7You are welcome to redistribute it under certain conditions. 8Type 'license()' or 'licence()' for distribution details. 9 10R is a collaborative project with many contributors. 11Type 'contributors()' for more information and 12'citation()' on how to cite R or R packages in publications. 13 14Type 'demo()' for some demos, 'help()' for on-line help, or 15'help.start()' for an HTML browser interface to help. 16Type 'q()' to quit R. 17 18> library(survey) 19Loading required package: grid 20Loading required package: Matrix 21Loading required package: survival 22 23Attaching package: 'survey' 24 25The following object is masked from 'package:graphics': 26 27 dotchart 28 29> 30> ## two-phase simple random sampling. 31> data(pbc, package="survival") 32> pbc$id<-1:nrow(pbc) 33> pbc$randomized<-with(pbc, !is.na(trt) & trt>-9) 34> (d2pbc<-twophase(id=list(~id,~id), data=pbc, subset=~I(!randomized))) 35Two-phase sparse-matrix design: 36 twophase2(id = id, strata = strata, probs = probs, fpc = fpc, 37 subset = subset, data = data) 38Phase 1: 39Independent Sampling design (with replacement) 40svydesign(ids = ~id) 41Phase 2: 42Independent Sampling design 43svydesign(ids = ~id, fpc = `*phase1*`) 44> m<-svymean(~bili, d2pbc) 45> all.equal(as.vector(coef(m)),with(pbc, mean(bili[!randomized]))) 46[1] TRUE 47> all.equal(as.vector(SE(m)), 48+ with(pbc, sd(bili[!randomized])/sqrt(sum(!randomized))), 49+ tolerance=0.002) 50[1] TRUE 51> 52> ## two-stage sampling as two-phase 53> data(mu284) 54> ii<-with(mu284, c(1:15, rep(1:5,n2[1:5]-3))) 55> mu284.1<-mu284[ii,] 56> mu284.1$id<-1:nrow(mu284.1) 57> mu284.1$sub<-rep(c(TRUE,FALSE),c(15,34-15)) 58> dmu284<-svydesign(id=~id1+id2,fpc=~n1+n2, data=mu284) 59> ## first phase cluster sample, second phase stratified within cluster 60> (d2mu284<-twophase(id=list(~id1,~id),strata=list(NULL,~id1), 61+ fpc=list(~n1,NULL),data=mu284.1,subset=~sub,method="approx")) 62Two-phase design: twophase(id = list(~id1, ~id), strata = list(NULL, ~id1), fpc = list(~n1, 63 NULL), data = mu284.1, subset = ~sub, method = "approx") 64Phase 1: 651 - level Cluster Sampling design 66With (5) clusters. 67svydesign(ids = ~id1, fpc = ~n1) 68Phase 2: 69Stratified Independent Sampling design 70svydesign(ids = ~id, strata = ~id1, fpc = `*phase1*`) 71> (d22mu284<-twophase(id=list(~id1,~id),strata=list(NULL,~id1), 72+ fpc=list(~n1,NULL),data=mu284.1,subset=~sub,method="full")) 73Two-phase sparse-matrix design: 74 twophase2(id = id, strata = strata, probs = probs, fpc = fpc, 75 subset = subset, data = data) 76Phase 1: 771 - level Cluster Sampling design 78With (5) clusters. 79svydesign(ids = ~id1, fpc = ~n1) 80Phase 2: 81Stratified Independent Sampling design 82svydesign(ids = ~id, strata = ~id1, fpc = `*phase1*`) 83> summary(d2mu284) 84Two-phase design: twophase(id = list(~id1, ~id), strata = list(NULL, ~id1), fpc = list(~n1, 85 NULL), data = mu284.1, subset = ~sub, method = "approx") 86Phase 1: 871 - level Cluster Sampling design 88With (5) clusters. 89svydesign(ids = ~id1, fpc = ~n1) 90Probabilities: 91 Min. 1st Qu. Median Mean 3rd Qu. Max. 92 0.1 0.1 0.1 0.1 0.1 0.1 93Population size (PSUs): 50 94Phase 2: 95Stratified Independent Sampling design 96svydesign(ids = ~id, strata = ~id1, fpc = `*phase1*`) 97Probabilities: 98 Min. 1st Qu. Median Mean 3rd Qu. Max. 99 0.3333 0.3750 0.4286 0.4674 0.6000 0.6000 100Stratum Sizes: 101 19 31 45 47 50 102obs 3 3 3 3 3 103design.PSU 3 3 3 3 3 104actual.PSU 3 3 3 3 3 105Population stratum sizes (PSUs): 10619 31 45 47 50 107 5 7 8 5 9 108Data variables: 109[1] "id1" "n1" "id2" "y1" "n2" "id" "sub" 110> t1<-svytotal(~y1, dmu284) 111> t2<-svytotal(~y1, d2mu284) 112> t22<-svytotal(~y1,d22mu284) 113> m1<-svymean(~y1, dmu284) 114> m2<-svymean(~y1, d2mu284) 115> m22<-svymean(~y1, d22mu284) 116> all.equal(coef(t1),coef(t2)) 117[1] TRUE 118> all.equal(coef(t1),coef(t22)) 119[1] TRUE 120> all.equal(coef(m1),coef(m2)) 121[1] TRUE 122> all.equal(coef(m1),coef(m22)) 123[1] TRUE 124> all.equal(as.vector(SE(m1)),as.vector(SE(m2))) 125[1] TRUE 126> all.equal(as.vector(SE(m1)),as.vector(SE(m22))) 127[1] TRUE 128> all.equal(as.vector(SE(t1)),as.vector(SE(t2))) 129[1] TRUE 130> all.equal(as.vector(SE(t1)),as.vector(SE(t22))) 131[1] TRUE 132> 133> ## case-cohort design 134> ##this example requires R 2.3.1 or later for cch and data. 135> library("survival") 136> data(nwtco, package="survival") 137> ## unstratified, equivalent to Lin & Ying (1993) 138> print(dcchs<-twophase(id=list(~seqno,~seqno), strata=list(NULL,~rel), 139+ subset=~I(in.subcohort | rel), data=nwtco)) 140Two-phase sparse-matrix design: 141 twophase2(id = id, strata = strata, probs = probs, fpc = fpc, 142 subset = subset, data = data) 143Phase 1: 144Independent Sampling design (with replacement) 145svydesign(ids = ~seqno) 146Phase 2: 147Stratified Independent Sampling design 148svydesign(ids = ~seqno, strata = ~rel, fpc = `*phase1*`) 149> cch1<-svycoxph(Surv(edrel,rel)~factor(stage)+factor(histol)+I(age/12), 150+ design=dcchs) 151> dcchs2<-twophase(id=list(~seqno,~seqno), strata=list(NULL,~rel), 152+ subset=~I(in.subcohort | rel), data=nwtco,method="approx") 153> cch1.2<-svycoxph(Surv(edrel,rel)~factor(stage)+factor(histol)+I(age/12), 154+ design=dcchs) 155> all.equal(coef(cch1),coef(cch1.2)) 156[1] TRUE 157> all.equal(SE(cch1),SE(cch1.2)) 158[1] TRUE 159> ## Using survival::cch 160> subcoh <- nwtco$in.subcohort 161> selccoh <- with(nwtco, rel==1|subcoh==1) 162> ccoh.data <- nwtco[selccoh,] 163> ccoh.data$subcohort <- subcoh[selccoh] 164> cch2<-cch(Surv(edrel, rel) ~ factor(stage) + factor(histol) + I(age/12), 165+ data =ccoh.data, subcoh = ~subcohort, id=~seqno, 166+ cohort.size=4028, method="LinYing", robust=TRUE) 167> 168> print(all.equal(as.vector(coef(cch1)),as.vector(coef(cch2)))) 169[1] TRUE 170> ## cch has smaller variances by a factor of 1.0005 because 171> ## there is a (n/(n-1)) in the survey phase1 varianace 172> print(all.equal(as.vector(SE(cch1)), as.vector(SE(cch2)),tolerance=0.0006)) 173[1] TRUE 174> 175> 176> ## bug report from Takahiro Tsuchiya for version 3.4 177> ## We used to not match Sarndal exactly, because our old phase-one 178> ## estimator had a small bias for finite populations 179> rei<-read.table(tmp<-textConnection( 180+ " id N n.a h n.ah n.h sub y 181+ 1 1 300 20 1 12 5 TRUE 1 182+ 2 2 300 20 1 12 5 TRUE 2 183+ 3 3 300 20 1 12 5 TRUE 3 184+ 4 4 300 20 1 12 5 TRUE 4 185+ 5 5 300 20 1 12 5 TRUE 5 186+ 6 6 300 20 1 12 5 FALSE NA 187+ 7 7 300 20 1 12 5 FALSE NA 188+ 8 8 300 20 1 12 5 FALSE NA 189+ 9 9 300 20 1 12 5 FALSE NA 190+ 10 10 300 20 1 12 5 FALSE NA 191+ 11 11 300 20 1 12 5 FALSE NA 192+ 12 12 300 20 1 12 5 FALSE NA 193+ 13 13 300 20 2 8 3 TRUE 6 194+ 14 14 300 20 2 8 3 TRUE 7 195+ 15 15 300 20 2 8 3 TRUE 8 196+ 16 16 300 20 2 8 3 FALSE NA 197+ 17 17 300 20 2 8 3 FALSE NA 198+ 18 18 300 20 2 8 3 FALSE NA 199+ 19 19 300 20 2 8 3 FALSE NA 200+ 20 20 300 20 2 8 3 FALSE NA 201+ "), header=TRUE) 202> close(tmp) 203> 204> des.rei <- twophase(id=list(~id,~id), strata=list(NULL,~h), 205+ fpc=list(~N,NULL), subset=~sub, data=rei, method="approx") 206> tot<- svytotal(~y, des.rei) 207> des.rei2 <- twophase(id=list(~id,~id), strata=list(NULL,~h), 208+ fpc=list(~N,NULL), subset=~sub, data=rei) 209> tot2<- svytotal(~y, des.rei2) 210> 211> ## based on Sarndal et al (9.4.14) 212> rei$w.ah <- rei$n.ah / rei$n.a 213> a.rei <- aggregate(rei, by=list(rei$h), mean, na.rm=TRUE) 214> a.rei$S.ysh <- tapply(rei$y, rei$h, var, na.rm=TRUE) 215> a.rei$y.u <- sum(a.rei$w.ah * a.rei$y) 216> V <- with(a.rei, sum(N * (N-1) * ((n.ah-1)/(n.a-1) - (n.h-1)/(N-1)) * w.ah * S.ysh / n.h)) 217> V <- V + with(a.rei, sum(N * (N-n.a) * w.ah * (y - y.u)^2 / (n.a-1))) 218> 219> a.rei$f.h<-with(a.rei, n.h/n.ah) 220> Vphase2<-with(a.rei, sum(N*N*w.ah^2* ((1-f.h)/n.h) *S.ysh)) 221> 222> a.rei$f<-with(a.rei, n.a/N) 223> a.rei$delta.h<-with(a.rei, (1/n.h)*(n.a-n.ah)/(n.a-1)) 224> Vphase1<-with(a.rei, sum(N*N*((1-f)/n.a)*( w.ah*(1-delta.h)*S.ysh+ ((n.a)/(n.a-1))*w.ah*(y-y.u)^2))) 225> 226> V 227[1] 36522.63 228> Vphase1 229[1] 24072.63 230> Vphase2 231[1] 12450 232> vcov(tot) 233 y 234y 35911.05 235attr(,"phases") 236attr(,"phases")$phase1 237 [,1] 238[1,] 23461.05 239 240attr(,"phases")$phase2 241 y 242y 12450 243 244> vcov(tot2) 245 [,1] 246[1,] 36522.63 247attr(,"phases") 248attr(,"phases")$phase1 249 [,1] 250[1,] 24072.63 251 252attr(,"phases")$phase2 253 [,1] 254[1,] 12450 255 256> ## phase 2 identical 257> all.equal(Vphase2,drop(attr(vcov(tot),"phases")$phase2)) 258[1] TRUE 259> all.equal(Vphase2,drop(attr(vcov(tot2),"phases")$phase2)) 260[1] TRUE 261> ## phase 1 differs by 2.6% for old twophase estimator 262> Vphase1/attr(vcov(tot),"phases")$phase1 263 [,1] 264[1,] 1.026068 265> all.equal(Vphase1,as.vector(attr(vcov(tot2),"phases")$phase1)) 266[1] TRUE 267> 268> 269> proc.time() 270 user system elapsed 271 3.561 0.201 3.809 272