1 /*-------------------------------------------------------------------------
2 *
3 * ragetypes_typanalyze.c
4 * Functions for gathering statistics from range columns
5 *
6 * For a range type column, histograms of lower and upper bounds, and
7 * the fraction of NULL and empty ranges are collected.
8 *
9 * Both histograms have the same length, and they are combined into a
10 * single array of ranges. This has the same shape as the histogram that
11 * std_typanalyze would collect, but the values are different. Each range
12 * in the array is a valid range, even though the lower and upper bounds
13 * come from different tuples. In theory, the standard scalar selectivity
14 * functions could be used with the combined histogram.
15 *
16 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
17 * Portions Copyright (c) 1994, Regents of the University of California
18 *
19 *
20 * IDENTIFICATION
21 * src/backend/utils/adt/rangetypes_typanalyze.c
22 *
23 *-------------------------------------------------------------------------
24 */
25 #include "postgres.h"
26
27 #include "catalog/pg_operator.h"
28 #include "commands/vacuum.h"
29 #include "utils/builtins.h"
30 #include "utils/lsyscache.h"
31 #include "utils/rangetypes.h"
32
33 static int float8_qsort_cmp(const void *a1, const void *a2);
34 static int range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
35 static void compute_range_stats(VacAttrStats *stats,
36 AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
37
38 /*
39 * range_typanalyze -- typanalyze function for range columns
40 */
41 Datum
range_typanalyze(PG_FUNCTION_ARGS)42 range_typanalyze(PG_FUNCTION_ARGS)
43 {
44 VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
45 TypeCacheEntry *typcache;
46 Form_pg_attribute attr = stats->attr;
47
48 /* Get information about range type; note column might be a domain */
49 typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid));
50
51 if (attr->attstattarget < 0)
52 attr->attstattarget = default_statistics_target;
53
54 stats->compute_stats = compute_range_stats;
55 stats->extra_data = typcache;
56 /* same as in std_typanalyze */
57 stats->minrows = 300 * attr->attstattarget;
58
59 PG_RETURN_BOOL(true);
60 }
61
62 /*
63 * Comparison function for sorting float8s, used for range lengths.
64 */
65 static int
float8_qsort_cmp(const void * a1,const void * a2)66 float8_qsort_cmp(const void *a1, const void *a2)
67 {
68 const float8 *f1 = (const float8 *) a1;
69 const float8 *f2 = (const float8 *) a2;
70
71 if (*f1 < *f2)
72 return -1;
73 else if (*f1 == *f2)
74 return 0;
75 else
76 return 1;
77 }
78
79 /*
80 * Comparison function for sorting RangeBounds.
81 */
82 static int
range_bound_qsort_cmp(const void * a1,const void * a2,void * arg)83 range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
84 {
85 RangeBound *b1 = (RangeBound *) a1;
86 RangeBound *b2 = (RangeBound *) a2;
87 TypeCacheEntry *typcache = (TypeCacheEntry *) arg;
88
89 return range_cmp_bounds(typcache, b1, b2);
90 }
91
92 /*
93 * compute_range_stats() -- compute statistics for a range column
94 */
95 static void
compute_range_stats(VacAttrStats * stats,AnalyzeAttrFetchFunc fetchfunc,int samplerows,double totalrows)96 compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
97 int samplerows, double totalrows)
98 {
99 TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
100 bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
101 int null_cnt = 0;
102 int non_null_cnt = 0;
103 int non_empty_cnt = 0;
104 int empty_cnt = 0;
105 int range_no;
106 int slot_idx;
107 int num_bins = stats->attr->attstattarget;
108 int num_hist;
109 float8 *lengths;
110 RangeBound *lowers,
111 *uppers;
112 double total_width = 0;
113
114 /* Allocate memory to hold range bounds and lengths of the sample ranges. */
115 lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
116 uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
117 lengths = (float8 *) palloc(sizeof(float8) * samplerows);
118
119 /* Loop over the sample ranges. */
120 for (range_no = 0; range_no < samplerows; range_no++)
121 {
122 Datum value;
123 bool isnull,
124 empty;
125 RangeType *range;
126 RangeBound lower,
127 upper;
128 float8 length;
129
130 vacuum_delay_point();
131
132 value = fetchfunc(stats, range_no, &isnull);
133 if (isnull)
134 {
135 /* range is null, just count that */
136 null_cnt++;
137 continue;
138 }
139
140 /*
141 * XXX: should we ignore wide values, like std_typanalyze does, to
142 * avoid bloating the statistics table?
143 */
144 total_width += VARSIZE_ANY(DatumGetPointer(value));
145
146 /* Get range and deserialize it for further analysis. */
147 range = DatumGetRangeTypeP(value);
148 range_deserialize(typcache, range, &lower, &upper, &empty);
149
150 if (!empty)
151 {
152 /* Remember bounds and length for further usage in histograms */
153 lowers[non_empty_cnt] = lower;
154 uppers[non_empty_cnt] = upper;
155
156 if (lower.infinite || upper.infinite)
157 {
158 /* Length of any kind of an infinite range is infinite */
159 length = get_float8_infinity();
160 }
161 else if (has_subdiff)
162 {
163 /*
164 * For an ordinary range, use subdiff function between upper
165 * and lower bound values.
166 */
167 length = DatumGetFloat8(FunctionCall2Coll(
168 &typcache->rng_subdiff_finfo,
169 typcache->rng_collation,
170 upper.val, lower.val));
171 }
172 else
173 {
174 /* Use default value of 1.0 if no subdiff is available. */
175 length = 1.0;
176 }
177 lengths[non_empty_cnt] = length;
178
179 non_empty_cnt++;
180 }
181 else
182 empty_cnt++;
183
184 non_null_cnt++;
185 }
186
187 slot_idx = 0;
188
189 /* We can only compute real stats if we found some non-null values. */
190 if (non_null_cnt > 0)
191 {
192 Datum *bound_hist_values;
193 Datum *length_hist_values;
194 int pos,
195 posfrac,
196 delta,
197 deltafrac,
198 i;
199 MemoryContext old_cxt;
200 float4 *emptyfrac;
201
202 stats->stats_valid = true;
203 /* Do the simple null-frac and width stats */
204 stats->stanullfrac = (double) null_cnt / (double) samplerows;
205 stats->stawidth = total_width / (double) non_null_cnt;
206
207 /* Estimate that non-null values are unique */
208 stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
209
210 /* Must copy the target values into anl_context */
211 old_cxt = MemoryContextSwitchTo(stats->anl_context);
212
213 /*
214 * Generate a bounds histogram slot entry if there are at least two
215 * values.
216 */
217 if (non_empty_cnt >= 2)
218 {
219 /* Sort bound values */
220 qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound),
221 range_bound_qsort_cmp, typcache);
222 qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound),
223 range_bound_qsort_cmp, typcache);
224
225 num_hist = non_empty_cnt;
226 if (num_hist > num_bins)
227 num_hist = num_bins + 1;
228
229 bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
230
231 /*
232 * The object of this loop is to construct ranges from first and
233 * last entries in lowers[] and uppers[] along with evenly-spaced
234 * values in between. So the i'th value is a range of lowers[(i *
235 * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
236 * (num_hist - 1)]. But computing that subscript directly risks
237 * integer overflow when the stats target is more than a couple
238 * thousand. Instead we add (nvals - 1) / (num_hist - 1) to pos
239 * at each step, tracking the integral and fractional parts of the
240 * sum separately.
241 */
242 delta = (non_empty_cnt - 1) / (num_hist - 1);
243 deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
244 pos = posfrac = 0;
245
246 for (i = 0; i < num_hist; i++)
247 {
248 bound_hist_values[i] = PointerGetDatum(range_serialize(
249 typcache, &lowers[pos], &uppers[pos], false));
250 pos += delta;
251 posfrac += deltafrac;
252 if (posfrac >= (num_hist - 1))
253 {
254 /* fractional part exceeds 1, carry to integer part */
255 pos++;
256 posfrac -= (num_hist - 1);
257 }
258 }
259
260 stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
261 stats->stavalues[slot_idx] = bound_hist_values;
262 stats->numvalues[slot_idx] = num_hist;
263 slot_idx++;
264 }
265
266 /*
267 * Generate a length histogram slot entry if there are at least two
268 * values.
269 */
270 if (non_empty_cnt >= 2)
271 {
272 /*
273 * Ascending sort of range lengths for further filling of
274 * histogram
275 */
276 qsort(lengths, non_empty_cnt, sizeof(float8), float8_qsort_cmp);
277
278 num_hist = non_empty_cnt;
279 if (num_hist > num_bins)
280 num_hist = num_bins + 1;
281
282 length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
283
284 /*
285 * The object of this loop is to copy the first and last lengths[]
286 * entries along with evenly-spaced values in between. So the i'th
287 * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
288 * computing that subscript directly risks integer overflow when
289 * the stats target is more than a couple thousand. Instead we
290 * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
291 * the integral and fractional parts of the sum separately.
292 */
293 delta = (non_empty_cnt - 1) / (num_hist - 1);
294 deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
295 pos = posfrac = 0;
296
297 for (i = 0; i < num_hist; i++)
298 {
299 length_hist_values[i] = Float8GetDatum(lengths[pos]);
300 pos += delta;
301 posfrac += deltafrac;
302 if (posfrac >= (num_hist - 1))
303 {
304 /* fractional part exceeds 1, carry to integer part */
305 pos++;
306 posfrac -= (num_hist - 1);
307 }
308 }
309 }
310 else
311 {
312 /*
313 * Even when we don't create the histogram, store an empty array
314 * to mean "no histogram". We can't just leave stavalues NULL,
315 * because get_attstatsslot() errors if you ask for stavalues, and
316 * it's NULL. We'll still store the empty fraction in stanumbers.
317 */
318 length_hist_values = palloc(0);
319 num_hist = 0;
320 }
321 stats->staop[slot_idx] = Float8LessOperator;
322 stats->stavalues[slot_idx] = length_hist_values;
323 stats->numvalues[slot_idx] = num_hist;
324 stats->statypid[slot_idx] = FLOAT8OID;
325 stats->statyplen[slot_idx] = sizeof(float8);
326 #ifdef USE_FLOAT8_BYVAL
327 stats->statypbyval[slot_idx] = true;
328 #else
329 stats->statypbyval[slot_idx] = false;
330 #endif
331 stats->statypalign[slot_idx] = 'd';
332
333 /* Store the fraction of empty ranges */
334 emptyfrac = (float4 *) palloc(sizeof(float4));
335 *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
336 stats->stanumbers[slot_idx] = emptyfrac;
337 stats->numnumbers[slot_idx] = 1;
338
339 stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
340 slot_idx++;
341
342 MemoryContextSwitchTo(old_cxt);
343 }
344 else if (null_cnt > 0)
345 {
346 /* We found only nulls; assume the column is entirely null */
347 stats->stats_valid = true;
348 stats->stanullfrac = 1.0;
349 stats->stawidth = 0; /* "unknown" */
350 stats->stadistinct = 0.0; /* "unknown" */
351 }
352
353 /*
354 * We don't need to bother cleaning up any of our temporary palloc's. The
355 * hashtable should also go away, as it used a child memory context.
356 */
357 }
358