1 /*-------------------------------------------------------------------------
2 *
3 * sampling.c
4 * Relation block sampling routines.
5 *
6 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/utils/misc/sampling.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16 #include "postgres.h"
17
18 #include <math.h>
19
20 #include "utils/sampling.h"
21
22
23 /*
24 * BlockSampler_Init -- prepare for random sampling of blocknumbers
25 *
26 * BlockSampler provides algorithm for block level sampling of a relation
27 * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
28 * It selects a random sample of samplesize blocks out of
29 * the nblocks blocks in the table. If the table has less than
30 * samplesize blocks, all blocks are selected.
31 *
32 * Since we know the total number of blocks in advance, we can use the
33 * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
34 * algorithm.
35 */
36 void
BlockSampler_Init(BlockSampler bs,BlockNumber nblocks,int samplesize,long randseed)37 BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
38 long randseed)
39 {
40 bs->N = nblocks; /* measured table size */
41
42 /*
43 * If we decide to reduce samplesize for tables that have less or not much
44 * more than samplesize blocks, here is the place to do it.
45 */
46 bs->n = samplesize;
47 bs->t = 0; /* blocks scanned so far */
48 bs->m = 0; /* blocks selected so far */
49
50 sampler_random_init_state(randseed, bs->randstate);
51 }
52
53 bool
BlockSampler_HasMore(BlockSampler bs)54 BlockSampler_HasMore(BlockSampler bs)
55 {
56 return (bs->t < bs->N) && (bs->m < bs->n);
57 }
58
59 BlockNumber
BlockSampler_Next(BlockSampler bs)60 BlockSampler_Next(BlockSampler bs)
61 {
62 BlockNumber K = bs->N - bs->t; /* remaining blocks */
63 int k = bs->n - bs->m; /* blocks still to sample */
64 double p; /* probability to skip block */
65 double V; /* random */
66
67 Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
68
69 if ((BlockNumber) k >= K)
70 {
71 /* need all the rest */
72 bs->m++;
73 return bs->t++;
74 }
75
76 /*----------
77 * It is not obvious that this code matches Knuth's Algorithm S.
78 * Knuth says to skip the current block with probability 1 - k/K.
79 * If we are to skip, we should advance t (hence decrease K), and
80 * repeat the same probabilistic test for the next block. The naive
81 * implementation thus requires a sampler_random_fract() call for each
82 * block number. But we can reduce this to one sampler_random_fract()
83 * call per selected block, by noting that each time the while-test
84 * succeeds, we can reinterpret V as a uniform random number in the range
85 * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
86 * the appropriate fraction of its former value, and our next loop
87 * makes the appropriate probabilistic test.
88 *
89 * We have initially K > k > 0. If the loop reduces K to equal k,
90 * the next while-test must fail since p will become exactly zero
91 * (we assume there will not be roundoff error in the division).
92 * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
93 * to be doubly sure about roundoff error.) Therefore K cannot become
94 * less than k, which means that we cannot fail to select enough blocks.
95 *----------
96 */
97 V = sampler_random_fract(bs->randstate);
98 p = 1.0 - (double) k / (double) K;
99 while (V < p)
100 {
101 /* skip */
102 bs->t++;
103 K--; /* keep K == N - t */
104
105 /* adjust p to be new cutoff point in reduced range */
106 p *= 1.0 - (double) k / (double) K;
107 }
108
109 /* select */
110 bs->m++;
111 return bs->t++;
112 }
113
114 /*
115 * These two routines embody Algorithm Z from "Random sampling with a
116 * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
117 * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
118 * of the count S of records to skip before processing another record.
119 * It is computed primarily based on t, the number of records already read.
120 * The only extra state needed between calls is W, a random state variable.
121 *
122 * reservoir_init_selection_state computes the initial W value.
123 *
124 * Given that we've already read t records (t >= n), reservoir_get_next_S
125 * determines the number of records to skip before the next record is
126 * processed.
127 */
128 void
reservoir_init_selection_state(ReservoirState rs,int n)129 reservoir_init_selection_state(ReservoirState rs, int n)
130 {
131 /*
132 * Reservoir sampling is not used anywhere where it would need to return
133 * repeatable results so we can initialize it randomly.
134 */
135 sampler_random_init_state(random(), rs->randstate);
136
137 /* Initial value of W (for use when Algorithm Z is first applied) */
138 rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n);
139 }
140
141 double
reservoir_get_next_S(ReservoirState rs,double t,int n)142 reservoir_get_next_S(ReservoirState rs, double t, int n)
143 {
144 double S;
145
146 /* The magic constant here is T from Vitter's paper */
147 if (t <= (22.0 * n))
148 {
149 /* Process records using Algorithm X until t is large enough */
150 double V,
151 quot;
152
153 V = sampler_random_fract(rs->randstate); /* Generate V */
154 S = 0;
155 t += 1;
156 /* Note: "num" in Vitter's code is always equal to t - n */
157 quot = (t - (double) n) / t;
158 /* Find min S satisfying (4.1) */
159 while (quot > V)
160 {
161 S += 1;
162 t += 1;
163 quot *= (t - (double) n) / t;
164 }
165 }
166 else
167 {
168 /* Now apply Algorithm Z */
169 double W = rs->W;
170 double term = t - (double) n + 1;
171
172 for (;;)
173 {
174 double numer,
175 numer_lim,
176 denom;
177 double U,
178 X,
179 lhs,
180 rhs,
181 y,
182 tmp;
183
184 /* Generate U and X */
185 U = sampler_random_fract(rs->randstate);
186 X = t * (W - 1.0);
187 S = floor(X); /* S is tentatively set to floor(X) */
188 /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
189 tmp = (t + 1) / term;
190 lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
191 rhs = (((t + X) / (term + S)) * term) / t;
192 if (lhs <= rhs)
193 {
194 W = rhs / lhs;
195 break;
196 }
197 /* Test if U <= f(S)/cg(X) */
198 y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
199 if ((double) n < S)
200 {
201 denom = t;
202 numer_lim = term + S;
203 }
204 else
205 {
206 denom = t - (double) n + S;
207 numer_lim = t + 1;
208 }
209 for (numer = t + S; numer >= numer_lim; numer -= 1)
210 {
211 y *= numer / denom;
212 denom -= 1;
213 }
214 W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */
215 if (exp(log(y) / n) <= (t + X) / t)
216 break;
217 }
218 rs->W = W;
219 }
220 return S;
221 }
222
223
224 /*----------
225 * Random number generator used by sampling
226 *----------
227 */
228 void
sampler_random_init_state(long seed,SamplerRandomState randstate)229 sampler_random_init_state(long seed, SamplerRandomState randstate)
230 {
231 randstate[0] = 0x330e; /* same as pg_erand48, but could be anything */
232 randstate[1] = (unsigned short) seed;
233 randstate[2] = (unsigned short) (seed >> 16);
234 }
235
236 /* Select a random value R uniformly distributed in (0 - 1) */
237 double
sampler_random_fract(SamplerRandomState randstate)238 sampler_random_fract(SamplerRandomState randstate)
239 {
240 double res;
241
242 /* pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 */
243 do
244 {
245 res = pg_erand48(randstate);
246 } while (res == 0.0);
247 return res;
248 }
249
250
251 /*
252 * Backwards-compatible API for block sampling
253 *
254 * This code is now deprecated, but since it's still in use by many FDWs,
255 * we should keep it for awhile at least. The functionality is the same as
256 * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
257 * except that a common random state is used across all callers.
258 */
259 static ReservoirStateData oldrs;
260
261 double
anl_random_fract(void)262 anl_random_fract(void)
263 {
264 /* initialize if first time through */
265 if (oldrs.randstate[0] == 0)
266 sampler_random_init_state(random(), oldrs.randstate);
267
268 /* and compute a random fraction */
269 return sampler_random_fract(oldrs.randstate);
270 }
271
272 double
anl_init_selection_state(int n)273 anl_init_selection_state(int n)
274 {
275 /* initialize if first time through */
276 if (oldrs.randstate[0] == 0)
277 sampler_random_init_state(random(), oldrs.randstate);
278
279 /* Initial value of W (for use when Algorithm Z is first applied) */
280 return exp(-log(sampler_random_fract(oldrs.randstate)) / n);
281 }
282
283 double
anl_get_next_S(double t,int n,double * stateptr)284 anl_get_next_S(double t, int n, double *stateptr)
285 {
286 double result;
287
288 oldrs.W = *stateptr;
289 result = reservoir_get_next_S(&oldrs, t, n);
290 *stateptr = oldrs.W;
291 return result;
292 }
293