1
2R version 3.4.2 (2017-09-28) -- "Short Summer"
3Copyright (C) 2017 The R Foundation for Statistical Computing
4Platform: x86_64-apple-darwin15.6.0 (64-bit)
5
6R is free software and comes with ABSOLUTELY NO WARRANTY.
7You are welcome to redistribute it under certain conditions.
8Type 'license()' or 'licence()' for distribution details.
9
10R is a collaborative project with many contributors.
11Type 'contributors()' for more information and
12'citation()' on how to cite R or R packages in publications.
13
14Type 'demo()' for some demos, 'help()' for on-line help, or
15'help.start()' for an HTML browser interface to help.
16Type 'q()' to quit R.
17
18> library(survey)
19Loading required package: grid
20Loading required package: Matrix
21Loading required package: survival
22
23Attaching package: 'survey'
24
25The following object is masked from 'package:graphics':
26
27    dotchart
28
29>
30> ## two-phase simple random sampling.
31> data(pbc, package="survival")
32> pbc$id<-1:nrow(pbc)
33> pbc$randomized<-with(pbc, !is.na(trt) & trt>-9)
34> (d2pbc<-twophase(id=list(~id,~id), data=pbc, subset=~I(!randomized)))
35Two-phase sparse-matrix design:
36 twophase2(id = id, strata = strata, probs = probs, fpc = fpc,
37    subset = subset, data = data)
38Phase 1:
39Independent Sampling design (with replacement)
40svydesign(ids = ~id)
41Phase 2:
42Independent Sampling design
43svydesign(ids = ~id, fpc = `*phase1*`)
44> m<-svymean(~bili, d2pbc)
45> all.equal(as.vector(coef(m)),with(pbc, mean(bili[!randomized])))
46[1] TRUE
47> all.equal(as.vector(SE(m)),
48+           with(pbc, sd(bili[!randomized])/sqrt(sum(!randomized))),
49+           tolerance=0.002)
50[1] TRUE
51>
52> ## two-stage sampling as two-phase
53> data(mu284)
54> ii<-with(mu284, c(1:15, rep(1:5,n2[1:5]-3)))
55> mu284.1<-mu284[ii,]
56> mu284.1$id<-1:nrow(mu284.1)
57> mu284.1$sub<-rep(c(TRUE,FALSE),c(15,34-15))
58> dmu284<-svydesign(id=~id1+id2,fpc=~n1+n2, data=mu284)
59> ## first phase cluster sample, second phase stratified within cluster
60> (d2mu284<-twophase(id=list(~id1,~id),strata=list(NULL,~id1),
61+                    fpc=list(~n1,NULL),data=mu284.1,subset=~sub,method="approx"))
62Two-phase design: twophase(id = list(~id1, ~id), strata = list(NULL, ~id1), fpc = list(~n1,
63    NULL), data = mu284.1, subset = ~sub, method = "approx")
64Phase 1:
651 - level Cluster Sampling design
66With (5) clusters.
67svydesign(ids = ~id1, fpc = ~n1)
68Phase 2:
69Stratified Independent Sampling design
70svydesign(ids = ~id, strata = ~id1, fpc = `*phase1*`)
71> (d22mu284<-twophase(id=list(~id1,~id),strata=list(NULL,~id1),
72+                    fpc=list(~n1,NULL),data=mu284.1,subset=~sub,method="full"))
73Two-phase sparse-matrix design:
74 twophase2(id = id, strata = strata, probs = probs, fpc = fpc,
75    subset = subset, data = data)
76Phase 1:
771 - level Cluster Sampling design
78With (5) clusters.
79svydesign(ids = ~id1, fpc = ~n1)
80Phase 2:
81Stratified Independent Sampling design
82svydesign(ids = ~id, strata = ~id1, fpc = `*phase1*`)
83> summary(d2mu284)
84Two-phase design: twophase(id = list(~id1, ~id), strata = list(NULL, ~id1), fpc = list(~n1,
85    NULL), data = mu284.1, subset = ~sub, method = "approx")
86Phase 1:
871 - level Cluster Sampling design
88With (5) clusters.
89svydesign(ids = ~id1, fpc = ~n1)
90Probabilities:
91   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
92    0.1     0.1     0.1     0.1     0.1     0.1
93Population size (PSUs): 50
94Phase 2:
95Stratified Independent Sampling design
96svydesign(ids = ~id, strata = ~id1, fpc = `*phase1*`)
97Probabilities:
98   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
99 0.3333  0.3750  0.4286  0.4674  0.6000  0.6000
100Stratum Sizes:
101           19 31 45 47 50
102obs         3  3  3  3  3
103design.PSU  3  3  3  3  3
104actual.PSU  3  3  3  3  3
105Population stratum sizes (PSUs):
10619 31 45 47 50
107 5  7  8  5  9
108Data variables:
109[1] "id1" "n1"  "id2" "y1"  "n2"  "id"  "sub"
110> t1<-svytotal(~y1, dmu284)
111> t2<-svytotal(~y1, d2mu284)
112> t22<-svytotal(~y1,d22mu284)
113> m1<-svymean(~y1, dmu284)
114> m2<-svymean(~y1, d2mu284)
115> m22<-svymean(~y1, d22mu284)
116> all.equal(coef(t1),coef(t2))
117[1] TRUE
118> all.equal(coef(t1),coef(t22))
119[1] TRUE
120> all.equal(coef(m1),coef(m2))
121[1] TRUE
122> all.equal(coef(m1),coef(m22))
123[1] TRUE
124> all.equal(as.vector(SE(m1)),as.vector(SE(m2)))
125[1] TRUE
126> all.equal(as.vector(SE(m1)),as.vector(SE(m22)))
127[1] TRUE
128> all.equal(as.vector(SE(t1)),as.vector(SE(t2)))
129[1] TRUE
130> all.equal(as.vector(SE(t1)),as.vector(SE(t22)))
131[1] TRUE
132>
133> ## case-cohort design
134> ##this example requires R 2.3.1 or later for cch and data.
135> library("survival")
136> data(nwtco, package="survival")
137> ## unstratified, equivalent to Lin & Ying (1993)
138> print(dcchs<-twophase(id=list(~seqno,~seqno), strata=list(NULL,~rel),
139+                  subset=~I(in.subcohort | rel), data=nwtco))
140Two-phase sparse-matrix design:
141 twophase2(id = id, strata = strata, probs = probs, fpc = fpc,
142    subset = subset, data = data)
143Phase 1:
144Independent Sampling design (with replacement)
145svydesign(ids = ~seqno)
146Phase 2:
147Stratified Independent Sampling design
148svydesign(ids = ~seqno, strata = ~rel, fpc = `*phase1*`)
149> cch1<-svycoxph(Surv(edrel,rel)~factor(stage)+factor(histol)+I(age/12),
150+                design=dcchs)
151> dcchs2<-twophase(id=list(~seqno,~seqno), strata=list(NULL,~rel),
152+                  subset=~I(in.subcohort | rel), data=nwtco,method="approx")
153> cch1.2<-svycoxph(Surv(edrel,rel)~factor(stage)+factor(histol)+I(age/12),
154+                design=dcchs)
155> all.equal(coef(cch1),coef(cch1.2))
156[1] TRUE
157> all.equal(SE(cch1),SE(cch1.2))
158[1] TRUE
159> ## Using survival::cch
160> subcoh <- nwtco$in.subcohort
161> selccoh <- with(nwtco, rel==1|subcoh==1)
162> ccoh.data <- nwtco[selccoh,]
163> ccoh.data$subcohort <- subcoh[selccoh]
164> cch2<-cch(Surv(edrel, rel) ~ factor(stage) + factor(histol) + I(age/12),
165+           data =ccoh.data, subcoh = ~subcohort, id=~seqno,
166+           cohort.size=4028, method="LinYing", robust=TRUE)
167>
168> print(all.equal(as.vector(coef(cch1)),as.vector(coef(cch2))))
169[1] TRUE
170> ## cch has smaller variances by a factor of 1.0005 because
171> ## there is a (n/(n-1)) in the survey phase1 varianace
172> print(all.equal(as.vector(SE(cch1)), as.vector(SE(cch2)),tolerance=0.0006))
173[1] TRUE
174>
175>
176> ## bug report from Takahiro Tsuchiya for version 3.4
177> ## We used to not match Sarndal exactly, because our old phase-one
178> ## estimator had a small bias for finite populations
179> rei<-read.table(tmp<-textConnection(
180+ "  id   N n.a h n.ah n.h   sub  y
181+ 1   1 300  20 1   12   5  TRUE  1
182+ 2   2 300  20 1   12   5  TRUE  2
183+ 3   3 300  20 1   12   5  TRUE  3
184+ 4   4 300  20 1   12   5  TRUE  4
185+ 5   5 300  20 1   12   5  TRUE  5
186+ 6   6 300  20 1   12   5 FALSE NA
187+ 7   7 300  20 1   12   5 FALSE NA
188+ 8   8 300  20 1   12   5 FALSE NA
189+ 9   9 300  20 1   12   5 FALSE NA
190+ 10 10 300  20 1   12   5 FALSE NA
191+ 11 11 300  20 1   12   5 FALSE NA
192+ 12 12 300  20 1   12   5 FALSE NA
193+ 13 13 300  20 2    8   3  TRUE  6
194+ 14 14 300  20 2    8   3  TRUE  7
195+ 15 15 300  20 2    8   3  TRUE  8
196+ 16 16 300  20 2    8   3 FALSE NA
197+ 17 17 300  20 2    8   3 FALSE NA
198+ 18 18 300  20 2    8   3 FALSE NA
199+ 19 19 300  20 2    8   3 FALSE NA
200+ 20 20 300  20 2    8   3 FALSE NA
201+ "), header=TRUE)
202> close(tmp)
203>
204> des.rei <- twophase(id=list(~id,~id), strata=list(NULL,~h),
205+                     fpc=list(~N,NULL), subset=~sub, data=rei, method="approx")
206> tot<- svytotal(~y, des.rei)
207> des.rei2 <- twophase(id=list(~id,~id), strata=list(NULL,~h),
208+                     fpc=list(~N,NULL), subset=~sub, data=rei)
209> tot2<- svytotal(~y, des.rei2)
210>
211> ## based on Sarndal et al (9.4.14)
212> rei$w.ah <- rei$n.ah / rei$n.a
213> a.rei <- aggregate(rei, by=list(rei$h), mean, na.rm=TRUE)
214> a.rei$S.ysh <- tapply(rei$y, rei$h, var, na.rm=TRUE)
215> a.rei$y.u <- sum(a.rei$w.ah * a.rei$y)
216> V <- with(a.rei, sum(N * (N-1) * ((n.ah-1)/(n.a-1) - (n.h-1)/(N-1)) * w.ah * S.ysh / n.h))
217> V <- V + with(a.rei, sum(N * (N-n.a) * w.ah * (y - y.u)^2 / (n.a-1)))
218>
219> a.rei$f.h<-with(a.rei, n.h/n.ah)
220> Vphase2<-with(a.rei, sum(N*N*w.ah^2* ((1-f.h)/n.h) *S.ysh))
221>
222> a.rei$f<-with(a.rei, n.a/N)
223> a.rei$delta.h<-with(a.rei, (1/n.h)*(n.a-n.ah)/(n.a-1))
224> Vphase1<-with(a.rei, sum(N*N*((1-f)/n.a)*( w.ah*(1-delta.h)*S.ysh+ ((n.a)/(n.a-1))*w.ah*(y-y.u)^2)))
225>
226> V
227[1] 36522.63
228> Vphase1
229[1] 24072.63
230> Vphase2
231[1] 12450
232> vcov(tot)
233         y
234y 35911.05
235attr(,"phases")
236attr(,"phases")$phase1
237         [,1]
238[1,] 23461.05
239
240attr(,"phases")$phase2
241      y
242y 12450
243
244> vcov(tot2)
245         [,1]
246[1,] 36522.63
247attr(,"phases")
248attr(,"phases")$phase1
249         [,1]
250[1,] 24072.63
251
252attr(,"phases")$phase2
253      [,1]
254[1,] 12450
255
256> ## phase 2 identical
257> all.equal(Vphase2,drop(attr(vcov(tot),"phases")$phase2))
258[1] TRUE
259> all.equal(Vphase2,drop(attr(vcov(tot2),"phases")$phase2))
260[1] TRUE
261> ## phase 1 differs by 2.6% for old twophase estimator
262> Vphase1/attr(vcov(tot),"phases")$phase1
263         [,1]
264[1,] 1.026068
265> all.equal(Vphase1,as.vector(attr(vcov(tot2),"phases")$phase1))
266[1] TRUE
267>
268>
269> proc.time()
270   user  system elapsed
271  3.561   0.201   3.809
272