1 /*-------------------------------------------------------------------------
2  *
3  * sampling.c
4  *	  Relation block sampling routines.
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/utils/misc/sampling.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include <math.h>
19 
20 #include "utils/sampling.h"
21 
22 
23 /*
24  * BlockSampler_Init -- prepare for random sampling of blocknumbers
25  *
26  * BlockSampler provides algorithm for block level sampling of a relation
27  * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
28  * It selects a random sample of samplesize blocks out of
29  * the nblocks blocks in the table. If the table has less than
30  * samplesize blocks, all blocks are selected.
31  *
32  * Since we know the total number of blocks in advance, we can use the
33  * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
34  * algorithm.
35  */
36 void
BlockSampler_Init(BlockSampler bs,BlockNumber nblocks,int samplesize,long randseed)37 BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
38 				  long randseed)
39 {
40 	bs->N = nblocks;			/* measured table size */
41 
42 	/*
43 	 * If we decide to reduce samplesize for tables that have less or not much
44 	 * more than samplesize blocks, here is the place to do it.
45 	 */
46 	bs->n = samplesize;
47 	bs->t = 0;					/* blocks scanned so far */
48 	bs->m = 0;					/* blocks selected so far */
49 
50 	sampler_random_init_state(randseed, bs->randstate);
51 }
52 
53 bool
BlockSampler_HasMore(BlockSampler bs)54 BlockSampler_HasMore(BlockSampler bs)
55 {
56 	return (bs->t < bs->N) && (bs->m < bs->n);
57 }
58 
59 BlockNumber
BlockSampler_Next(BlockSampler bs)60 BlockSampler_Next(BlockSampler bs)
61 {
62 	BlockNumber K = bs->N - bs->t;	/* remaining blocks */
63 	int			k = bs->n - bs->m;	/* blocks still to sample */
64 	double		p;				/* probability to skip block */
65 	double		V;				/* random */
66 
67 	Assert(BlockSampler_HasMore(bs));	/* hence K > 0 and k > 0 */
68 
69 	if ((BlockNumber) k >= K)
70 	{
71 		/* need all the rest */
72 		bs->m++;
73 		return bs->t++;
74 	}
75 
76 	/*----------
77 	 * It is not obvious that this code matches Knuth's Algorithm S.
78 	 * Knuth says to skip the current block with probability 1 - k/K.
79 	 * If we are to skip, we should advance t (hence decrease K), and
80 	 * repeat the same probabilistic test for the next block.  The naive
81 	 * implementation thus requires a sampler_random_fract() call for each
82 	 * block number.  But we can reduce this to one sampler_random_fract()
83 	 * call per selected block, by noting that each time the while-test
84 	 * succeeds, we can reinterpret V as a uniform random number in the range
85 	 * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
86 	 * the appropriate fraction of its former value, and our next loop
87 	 * makes the appropriate probabilistic test.
88 	 *
89 	 * We have initially K > k > 0.  If the loop reduces K to equal k,
90 	 * the next while-test must fail since p will become exactly zero
91 	 * (we assume there will not be roundoff error in the division).
92 	 * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
93 	 * to be doubly sure about roundoff error.)  Therefore K cannot become
94 	 * less than k, which means that we cannot fail to select enough blocks.
95 	 *----------
96 	 */
97 	V = sampler_random_fract(bs->randstate);
98 	p = 1.0 - (double) k / (double) K;
99 	while (V < p)
100 	{
101 		/* skip */
102 		bs->t++;
103 		K--;					/* keep K == N - t */
104 
105 		/* adjust p to be new cutoff point in reduced range */
106 		p *= 1.0 - (double) k / (double) K;
107 	}
108 
109 	/* select */
110 	bs->m++;
111 	return bs->t++;
112 }
113 
114 /*
115  * These two routines embody Algorithm Z from "Random sampling with a
116  * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
117  * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
118  * of the count S of records to skip before processing another record.
119  * It is computed primarily based on t, the number of records already read.
120  * The only extra state needed between calls is W, a random state variable.
121  *
122  * reservoir_init_selection_state computes the initial W value.
123  *
124  * Given that we've already read t records (t >= n), reservoir_get_next_S
125  * determines the number of records to skip before the next record is
126  * processed.
127  */
128 void
reservoir_init_selection_state(ReservoirState rs,int n)129 reservoir_init_selection_state(ReservoirState rs, int n)
130 {
131 	/*
132 	 * Reservoir sampling is not used anywhere where it would need to return
133 	 * repeatable results so we can initialize it randomly.
134 	 */
135 	sampler_random_init_state(random(), rs->randstate);
136 
137 	/* Initial value of W (for use when Algorithm Z is first applied) */
138 	rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n);
139 }
140 
141 double
reservoir_get_next_S(ReservoirState rs,double t,int n)142 reservoir_get_next_S(ReservoirState rs, double t, int n)
143 {
144 	double		S;
145 
146 	/* The magic constant here is T from Vitter's paper */
147 	if (t <= (22.0 * n))
148 	{
149 		/* Process records using Algorithm X until t is large enough */
150 		double		V,
151 					quot;
152 
153 		V = sampler_random_fract(rs->randstate);	/* Generate V */
154 		S = 0;
155 		t += 1;
156 		/* Note: "num" in Vitter's code is always equal to t - n */
157 		quot = (t - (double) n) / t;
158 		/* Find min S satisfying (4.1) */
159 		while (quot > V)
160 		{
161 			S += 1;
162 			t += 1;
163 			quot *= (t - (double) n) / t;
164 		}
165 	}
166 	else
167 	{
168 		/* Now apply Algorithm Z */
169 		double		W = rs->W;
170 		double		term = t - (double) n + 1;
171 
172 		for (;;)
173 		{
174 			double		numer,
175 						numer_lim,
176 						denom;
177 			double		U,
178 						X,
179 						lhs,
180 						rhs,
181 						y,
182 						tmp;
183 
184 			/* Generate U and X */
185 			U = sampler_random_fract(rs->randstate);
186 			X = t * (W - 1.0);
187 			S = floor(X);		/* S is tentatively set to floor(X) */
188 			/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
189 			tmp = (t + 1) / term;
190 			lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
191 			rhs = (((t + X) / (term + S)) * term) / t;
192 			if (lhs <= rhs)
193 			{
194 				W = rhs / lhs;
195 				break;
196 			}
197 			/* Test if U <= f(S)/cg(X) */
198 			y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
199 			if ((double) n < S)
200 			{
201 				denom = t;
202 				numer_lim = term + S;
203 			}
204 			else
205 			{
206 				denom = t - (double) n + S;
207 				numer_lim = t + 1;
208 			}
209 			for (numer = t + S; numer >= numer_lim; numer -= 1)
210 			{
211 				y *= numer / denom;
212 				denom -= 1;
213 			}
214 			W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */
215 			if (exp(log(y) / n) <= (t + X) / t)
216 				break;
217 		}
218 		rs->W = W;
219 	}
220 	return S;
221 }
222 
223 
224 /*----------
225  * Random number generator used by sampling
226  *----------
227  */
228 void
sampler_random_init_state(long seed,SamplerRandomState randstate)229 sampler_random_init_state(long seed, SamplerRandomState randstate)
230 {
231 	randstate[0] = 0x330e;		/* same as pg_erand48, but could be anything */
232 	randstate[1] = (unsigned short) seed;
233 	randstate[2] = (unsigned short) (seed >> 16);
234 }
235 
236 /* Select a random value R uniformly distributed in (0 - 1) */
237 double
sampler_random_fract(SamplerRandomState randstate)238 sampler_random_fract(SamplerRandomState randstate)
239 {
240 	double		res;
241 
242 	/* pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 */
243 	do
244 	{
245 		res = pg_erand48(randstate);
246 	} while (res == 0.0);
247 	return res;
248 }
249 
250 
251 /*
252  * Backwards-compatible API for block sampling
253  *
254  * This code is now deprecated, but since it's still in use by many FDWs,
255  * we should keep it for awhile at least.  The functionality is the same as
256  * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
257  * except that a common random state is used across all callers.
258  */
259 static ReservoirStateData oldrs;
260 
261 double
anl_random_fract(void)262 anl_random_fract(void)
263 {
264 	/* initialize if first time through */
265 	if (oldrs.randstate[0] == 0)
266 		sampler_random_init_state(random(), oldrs.randstate);
267 
268 	/* and compute a random fraction */
269 	return sampler_random_fract(oldrs.randstate);
270 }
271 
272 double
anl_init_selection_state(int n)273 anl_init_selection_state(int n)
274 {
275 	/* initialize if first time through */
276 	if (oldrs.randstate[0] == 0)
277 		sampler_random_init_state(random(), oldrs.randstate);
278 
279 	/* Initial value of W (for use when Algorithm Z is first applied) */
280 	return exp(-log(sampler_random_fract(oldrs.randstate)) / n);
281 }
282 
283 double
anl_get_next_S(double t,int n,double * stateptr)284 anl_get_next_S(double t, int n, double *stateptr)
285 {
286 	double		result;
287 
288 	oldrs.W = *stateptr;
289 	result = reservoir_get_next_S(&oldrs, t, n);
290 	*stateptr = oldrs.W;
291 	return result;
292 }
293