1 /*-------------------------------------------------------------------------
2 *
3 * rangetypes_typanalyze.c
4 * Functions for gathering statistics from range columns
5 *
6 * For a range type column, histograms of lower and upper bounds, and
7 * the fraction of NULL and empty ranges are collected.
8 *
9 * Both histograms have the same length, and they are combined into a
10 * single array of ranges. This has the same shape as the histogram that
11 * std_typanalyze would collect, but the values are different. Each range
12 * in the array is a valid range, even though the lower and upper bounds
13 * come from different tuples. In theory, the standard scalar selectivity
14 * functions could be used with the combined histogram.
15 *
16 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
17 * Portions Copyright (c) 1994, Regents of the University of California
18 *
19 *
20 * IDENTIFICATION
21 * src/backend/utils/adt/rangetypes_typanalyze.c
22 *
23 *-------------------------------------------------------------------------
24 */
25 #include "postgres.h"
26
27 #include "catalog/pg_operator.h"
28 #include "commands/vacuum.h"
29 #include "utils/float.h"
30 #include "utils/fmgrprotos.h"
31 #include "utils/lsyscache.h"
32 #include "utils/rangetypes.h"
33 #include "utils/multirangetypes.h"
34
35 static int float8_qsort_cmp(const void *a1, const void *a2);
36 static int range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
37 static void compute_range_stats(VacAttrStats *stats,
38 AnalyzeAttrFetchFunc fetchfunc, int samplerows,
39 double totalrows);
40
41 /*
42 * range_typanalyze -- typanalyze function for range columns
43 */
44 Datum
range_typanalyze(PG_FUNCTION_ARGS)45 range_typanalyze(PG_FUNCTION_ARGS)
46 {
47 VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
48 TypeCacheEntry *typcache;
49 Form_pg_attribute attr = stats->attr;
50
51 /* Get information about range type; note column might be a domain */
52 typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid));
53
54 if (attr->attstattarget < 0)
55 attr->attstattarget = default_statistics_target;
56
57 stats->compute_stats = compute_range_stats;
58 stats->extra_data = typcache;
59 /* same as in std_typanalyze */
60 stats->minrows = 300 * attr->attstattarget;
61
62 PG_RETURN_BOOL(true);
63 }
64
65 /*
66 * multirange_typanalyze -- typanalyze function for multirange columns
67 *
68 * We do the same analysis as for ranges, but on the smallest range that
69 * completely includes the multirange.
70 */
71 Datum
multirange_typanalyze(PG_FUNCTION_ARGS)72 multirange_typanalyze(PG_FUNCTION_ARGS)
73 {
74 VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
75 TypeCacheEntry *typcache;
76 Form_pg_attribute attr = stats->attr;
77
78 /* Get information about multirange type; note column might be a domain */
79 typcache = multirange_get_typcache(fcinfo, getBaseType(stats->attrtypid));
80
81 if (attr->attstattarget < 0)
82 attr->attstattarget = default_statistics_target;
83
84 stats->compute_stats = compute_range_stats;
85 stats->extra_data = typcache;
86 /* same as in std_typanalyze */
87 stats->minrows = 300 * attr->attstattarget;
88
89 PG_RETURN_BOOL(true);
90 }
91
92 /*
93 * Comparison function for sorting float8s, used for range lengths.
94 */
95 static int
float8_qsort_cmp(const void * a1,const void * a2)96 float8_qsort_cmp(const void *a1, const void *a2)
97 {
98 const float8 *f1 = (const float8 *) a1;
99 const float8 *f2 = (const float8 *) a2;
100
101 if (*f1 < *f2)
102 return -1;
103 else if (*f1 == *f2)
104 return 0;
105 else
106 return 1;
107 }
108
109 /*
110 * Comparison function for sorting RangeBounds.
111 */
112 static int
range_bound_qsort_cmp(const void * a1,const void * a2,void * arg)113 range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
114 {
115 RangeBound *b1 = (RangeBound *) a1;
116 RangeBound *b2 = (RangeBound *) a2;
117 TypeCacheEntry *typcache = (TypeCacheEntry *) arg;
118
119 return range_cmp_bounds(typcache, b1, b2);
120 }
121
122 /*
123 * compute_range_stats() -- compute statistics for a range column
124 */
125 static void
compute_range_stats(VacAttrStats * stats,AnalyzeAttrFetchFunc fetchfunc,int samplerows,double totalrows)126 compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
127 int samplerows, double totalrows)
128 {
129 TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
130 TypeCacheEntry *mltrng_typcache = NULL;
131 bool has_subdiff;
132 int null_cnt = 0;
133 int non_null_cnt = 0;
134 int non_empty_cnt = 0;
135 int empty_cnt = 0;
136 int range_no;
137 int slot_idx;
138 int num_bins = stats->attr->attstattarget;
139 int num_hist;
140 float8 *lengths;
141 RangeBound *lowers,
142 *uppers;
143 double total_width = 0;
144
145 if (typcache->typtype == TYPTYPE_MULTIRANGE)
146 {
147 mltrng_typcache = typcache;
148 typcache = typcache->rngtype;
149 }
150 else
151 Assert(typcache->typtype == TYPTYPE_RANGE);
152 has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
153
154 /* Allocate memory to hold range bounds and lengths of the sample ranges. */
155 lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
156 uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
157 lengths = (float8 *) palloc(sizeof(float8) * samplerows);
158
159 /* Loop over the sample ranges. */
160 for (range_no = 0; range_no < samplerows; range_no++)
161 {
162 Datum value;
163 bool isnull,
164 empty;
165 MultirangeType *multirange;
166 RangeType *range;
167 RangeBound lower,
168 upper;
169 float8 length;
170
171 vacuum_delay_point();
172
173 value = fetchfunc(stats, range_no, &isnull);
174 if (isnull)
175 {
176 /* range is null, just count that */
177 null_cnt++;
178 continue;
179 }
180
181 /*
182 * XXX: should we ignore wide values, like std_typanalyze does, to
183 * avoid bloating the statistics table?
184 */
185 total_width += VARSIZE_ANY(DatumGetPointer(value));
186
187 /* Get range and deserialize it for further analysis. */
188 if (mltrng_typcache != NULL)
189 {
190 /* Treat multiranges like a big range without gaps. */
191 multirange = DatumGetMultirangeTypeP(value);
192 if (!MultirangeIsEmpty(multirange))
193 {
194 RangeBound tmp;
195
196 multirange_get_bounds(typcache, multirange, 0,
197 &lower, &tmp);
198 multirange_get_bounds(typcache, multirange,
199 multirange->rangeCount - 1,
200 &tmp, &upper);
201 empty = false;
202 }
203 else
204 {
205 empty = true;
206 }
207 }
208 else
209 {
210 range = DatumGetRangeTypeP(value);
211 range_deserialize(typcache, range, &lower, &upper, &empty);
212 }
213
214 if (!empty)
215 {
216 /* Remember bounds and length for further usage in histograms */
217 lowers[non_empty_cnt] = lower;
218 uppers[non_empty_cnt] = upper;
219
220 if (lower.infinite || upper.infinite)
221 {
222 /* Length of any kind of an infinite range is infinite */
223 length = get_float8_infinity();
224 }
225 else if (has_subdiff)
226 {
227 /*
228 * For an ordinary range, use subdiff function between upper
229 * and lower bound values.
230 */
231 length = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
232 typcache->rng_collation,
233 upper.val, lower.val));
234 }
235 else
236 {
237 /* Use default value of 1.0 if no subdiff is available. */
238 length = 1.0;
239 }
240 lengths[non_empty_cnt] = length;
241
242 non_empty_cnt++;
243 }
244 else
245 empty_cnt++;
246
247 non_null_cnt++;
248 }
249
250 slot_idx = 0;
251
252 /* We can only compute real stats if we found some non-null values. */
253 if (non_null_cnt > 0)
254 {
255 Datum *bound_hist_values;
256 Datum *length_hist_values;
257 int pos,
258 posfrac,
259 delta,
260 deltafrac,
261 i;
262 MemoryContext old_cxt;
263 float4 *emptyfrac;
264
265 stats->stats_valid = true;
266 /* Do the simple null-frac and width stats */
267 stats->stanullfrac = (double) null_cnt / (double) samplerows;
268 stats->stawidth = total_width / (double) non_null_cnt;
269
270 /* Estimate that non-null values are unique */
271 stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
272
273 /* Must copy the target values into anl_context */
274 old_cxt = MemoryContextSwitchTo(stats->anl_context);
275
276 /*
277 * Generate a bounds histogram slot entry if there are at least two
278 * values.
279 */
280 if (non_empty_cnt >= 2)
281 {
282 /* Sort bound values */
283 qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound),
284 range_bound_qsort_cmp, typcache);
285 qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound),
286 range_bound_qsort_cmp, typcache);
287
288 num_hist = non_empty_cnt;
289 if (num_hist > num_bins)
290 num_hist = num_bins + 1;
291
292 bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
293
294 /*
295 * The object of this loop is to construct ranges from first and
296 * last entries in lowers[] and uppers[] along with evenly-spaced
297 * values in between. So the i'th value is a range of lowers[(i *
298 * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
299 * (num_hist - 1)]. But computing that subscript directly risks
300 * integer overflow when the stats target is more than a couple
301 * thousand. Instead we add (nvals - 1) / (num_hist - 1) to pos
302 * at each step, tracking the integral and fractional parts of the
303 * sum separately.
304 */
305 delta = (non_empty_cnt - 1) / (num_hist - 1);
306 deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
307 pos = posfrac = 0;
308
309 for (i = 0; i < num_hist; i++)
310 {
311 bound_hist_values[i] = PointerGetDatum(range_serialize(typcache,
312 &lowers[pos],
313 &uppers[pos],
314 false));
315 pos += delta;
316 posfrac += deltafrac;
317 if (posfrac >= (num_hist - 1))
318 {
319 /* fractional part exceeds 1, carry to integer part */
320 pos++;
321 posfrac -= (num_hist - 1);
322 }
323 }
324
325 stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
326 stats->stavalues[slot_idx] = bound_hist_values;
327 stats->numvalues[slot_idx] = num_hist;
328
329 /* Store ranges even if we're analyzing a multirange column */
330 stats->statypid[slot_idx] = typcache->type_id;
331 stats->statyplen[slot_idx] = typcache->typlen;
332 stats->statypbyval[slot_idx] = typcache->typbyval;
333 stats->statypalign[slot_idx] = typcache->typalign;
334
335 slot_idx++;
336 }
337
338 /*
339 * Generate a length histogram slot entry if there are at least two
340 * values.
341 */
342 if (non_empty_cnt >= 2)
343 {
344 /*
345 * Ascending sort of range lengths for further filling of
346 * histogram
347 */
348 qsort(lengths, non_empty_cnt, sizeof(float8), float8_qsort_cmp);
349
350 num_hist = non_empty_cnt;
351 if (num_hist > num_bins)
352 num_hist = num_bins + 1;
353
354 length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
355
356 /*
357 * The object of this loop is to copy the first and last lengths[]
358 * entries along with evenly-spaced values in between. So the i'th
359 * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
360 * computing that subscript directly risks integer overflow when
361 * the stats target is more than a couple thousand. Instead we
362 * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
363 * the integral and fractional parts of the sum separately.
364 */
365 delta = (non_empty_cnt - 1) / (num_hist - 1);
366 deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
367 pos = posfrac = 0;
368
369 for (i = 0; i < num_hist; i++)
370 {
371 length_hist_values[i] = Float8GetDatum(lengths[pos]);
372 pos += delta;
373 posfrac += deltafrac;
374 if (posfrac >= (num_hist - 1))
375 {
376 /* fractional part exceeds 1, carry to integer part */
377 pos++;
378 posfrac -= (num_hist - 1);
379 }
380 }
381 }
382 else
383 {
384 /*
385 * Even when we don't create the histogram, store an empty array
386 * to mean "no histogram". We can't just leave stavalues NULL,
387 * because get_attstatsslot() errors if you ask for stavalues, and
388 * it's NULL. We'll still store the empty fraction in stanumbers.
389 */
390 length_hist_values = palloc(0);
391 num_hist = 0;
392 }
393 stats->staop[slot_idx] = Float8LessOperator;
394 stats->stacoll[slot_idx] = InvalidOid;
395 stats->stavalues[slot_idx] = length_hist_values;
396 stats->numvalues[slot_idx] = num_hist;
397 stats->statypid[slot_idx] = FLOAT8OID;
398 stats->statyplen[slot_idx] = sizeof(float8);
399 stats->statypbyval[slot_idx] = FLOAT8PASSBYVAL;
400 stats->statypalign[slot_idx] = 'd';
401
402 /* Store the fraction of empty ranges */
403 emptyfrac = (float4 *) palloc(sizeof(float4));
404 *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
405 stats->stanumbers[slot_idx] = emptyfrac;
406 stats->numnumbers[slot_idx] = 1;
407
408 stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
409 slot_idx++;
410
411 MemoryContextSwitchTo(old_cxt);
412 }
413 else if (null_cnt > 0)
414 {
415 /* We found only nulls; assume the column is entirely null */
416 stats->stats_valid = true;
417 stats->stanullfrac = 1.0;
418 stats->stawidth = 0; /* "unknown" */
419 stats->stadistinct = 0.0; /* "unknown" */
420 }
421
422 /*
423 * We don't need to bother cleaning up any of our temporary palloc's. The
424 * hashtable should also go away, as it used a child memory context.
425 */
426 }
427