1 /*-------------------------------------------------------------------------
2  *
3  * rangetypes_typanalyze.c
4  *	  Functions for gathering statistics from range columns
5  *
6  * For a range type column, histograms of lower and upper bounds, and
7  * the fraction of NULL and empty ranges are collected.
8  *
9  * Both histograms have the same length, and they are combined into a
10  * single array of ranges. This has the same shape as the histogram that
11  * std_typanalyze would collect, but the values are different. Each range
12  * in the array is a valid range, even though the lower and upper bounds
13  * come from different tuples. In theory, the standard scalar selectivity
14  * functions could be used with the combined histogram.
15  *
16  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
17  * Portions Copyright (c) 1994, Regents of the University of California
18  *
19  *
20  * IDENTIFICATION
21  *	  src/backend/utils/adt/rangetypes_typanalyze.c
22  *
23  *-------------------------------------------------------------------------
24  */
25 #include "postgres.h"
26 
27 #include "catalog/pg_operator.h"
28 #include "commands/vacuum.h"
29 #include "utils/float.h"
30 #include "utils/fmgrprotos.h"
31 #include "utils/lsyscache.h"
32 #include "utils/rangetypes.h"
33 #include "utils/multirangetypes.h"
34 
35 static int	float8_qsort_cmp(const void *a1, const void *a2);
36 static int	range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
37 static void compute_range_stats(VacAttrStats *stats,
38 								AnalyzeAttrFetchFunc fetchfunc, int samplerows,
39 								double totalrows);
40 
41 /*
42  * range_typanalyze -- typanalyze function for range columns
43  */
44 Datum
range_typanalyze(PG_FUNCTION_ARGS)45 range_typanalyze(PG_FUNCTION_ARGS)
46 {
47 	VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
48 	TypeCacheEntry *typcache;
49 	Form_pg_attribute attr = stats->attr;
50 
51 	/* Get information about range type; note column might be a domain */
52 	typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid));
53 
54 	if (attr->attstattarget < 0)
55 		attr->attstattarget = default_statistics_target;
56 
57 	stats->compute_stats = compute_range_stats;
58 	stats->extra_data = typcache;
59 	/* same as in std_typanalyze */
60 	stats->minrows = 300 * attr->attstattarget;
61 
62 	PG_RETURN_BOOL(true);
63 }
64 
65 /*
66  * multirange_typanalyze -- typanalyze function for multirange columns
67  *
68  * We do the same analysis as for ranges, but on the smallest range that
69  * completely includes the multirange.
70  */
71 Datum
multirange_typanalyze(PG_FUNCTION_ARGS)72 multirange_typanalyze(PG_FUNCTION_ARGS)
73 {
74 	VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
75 	TypeCacheEntry *typcache;
76 	Form_pg_attribute attr = stats->attr;
77 
78 	/* Get information about multirange type; note column might be a domain */
79 	typcache = multirange_get_typcache(fcinfo, getBaseType(stats->attrtypid));
80 
81 	if (attr->attstattarget < 0)
82 		attr->attstattarget = default_statistics_target;
83 
84 	stats->compute_stats = compute_range_stats;
85 	stats->extra_data = typcache;
86 	/* same as in std_typanalyze */
87 	stats->minrows = 300 * attr->attstattarget;
88 
89 	PG_RETURN_BOOL(true);
90 }
91 
92 /*
93  * Comparison function for sorting float8s, used for range lengths.
94  */
95 static int
float8_qsort_cmp(const void * a1,const void * a2)96 float8_qsort_cmp(const void *a1, const void *a2)
97 {
98 	const float8 *f1 = (const float8 *) a1;
99 	const float8 *f2 = (const float8 *) a2;
100 
101 	if (*f1 < *f2)
102 		return -1;
103 	else if (*f1 == *f2)
104 		return 0;
105 	else
106 		return 1;
107 }
108 
109 /*
110  * Comparison function for sorting RangeBounds.
111  */
112 static int
range_bound_qsort_cmp(const void * a1,const void * a2,void * arg)113 range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
114 {
115 	RangeBound *b1 = (RangeBound *) a1;
116 	RangeBound *b2 = (RangeBound *) a2;
117 	TypeCacheEntry *typcache = (TypeCacheEntry *) arg;
118 
119 	return range_cmp_bounds(typcache, b1, b2);
120 }
121 
122 /*
123  * compute_range_stats() -- compute statistics for a range column
124  */
125 static void
compute_range_stats(VacAttrStats * stats,AnalyzeAttrFetchFunc fetchfunc,int samplerows,double totalrows)126 compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
127 					int samplerows, double totalrows)
128 {
129 	TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
130 	TypeCacheEntry *mltrng_typcache = NULL;
131 	bool		has_subdiff;
132 	int			null_cnt = 0;
133 	int			non_null_cnt = 0;
134 	int			non_empty_cnt = 0;
135 	int			empty_cnt = 0;
136 	int			range_no;
137 	int			slot_idx;
138 	int			num_bins = stats->attr->attstattarget;
139 	int			num_hist;
140 	float8	   *lengths;
141 	RangeBound *lowers,
142 			   *uppers;
143 	double		total_width = 0;
144 
145 	if (typcache->typtype == TYPTYPE_MULTIRANGE)
146 	{
147 		mltrng_typcache = typcache;
148 		typcache = typcache->rngtype;
149 	}
150 	else
151 		Assert(typcache->typtype == TYPTYPE_RANGE);
152 	has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
153 
154 	/* Allocate memory to hold range bounds and lengths of the sample ranges. */
155 	lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
156 	uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
157 	lengths = (float8 *) palloc(sizeof(float8) * samplerows);
158 
159 	/* Loop over the sample ranges. */
160 	for (range_no = 0; range_no < samplerows; range_no++)
161 	{
162 		Datum		value;
163 		bool		isnull,
164 					empty;
165 		MultirangeType *multirange;
166 		RangeType  *range;
167 		RangeBound	lower,
168 					upper;
169 		float8		length;
170 
171 		vacuum_delay_point();
172 
173 		value = fetchfunc(stats, range_no, &isnull);
174 		if (isnull)
175 		{
176 			/* range is null, just count that */
177 			null_cnt++;
178 			continue;
179 		}
180 
181 		/*
182 		 * XXX: should we ignore wide values, like std_typanalyze does, to
183 		 * avoid bloating the statistics table?
184 		 */
185 		total_width += VARSIZE_ANY(DatumGetPointer(value));
186 
187 		/* Get range and deserialize it for further analysis. */
188 		if (mltrng_typcache != NULL)
189 		{
190 			/* Treat multiranges like a big range without gaps. */
191 			multirange = DatumGetMultirangeTypeP(value);
192 			if (!MultirangeIsEmpty(multirange))
193 			{
194 				RangeBound	tmp;
195 
196 				multirange_get_bounds(typcache, multirange, 0,
197 									  &lower, &tmp);
198 				multirange_get_bounds(typcache, multirange,
199 									  multirange->rangeCount - 1,
200 									  &tmp, &upper);
201 				empty = false;
202 			}
203 			else
204 			{
205 				empty = true;
206 			}
207 		}
208 		else
209 		{
210 			range = DatumGetRangeTypeP(value);
211 			range_deserialize(typcache, range, &lower, &upper, &empty);
212 		}
213 
214 		if (!empty)
215 		{
216 			/* Remember bounds and length for further usage in histograms */
217 			lowers[non_empty_cnt] = lower;
218 			uppers[non_empty_cnt] = upper;
219 
220 			if (lower.infinite || upper.infinite)
221 			{
222 				/* Length of any kind of an infinite range is infinite */
223 				length = get_float8_infinity();
224 			}
225 			else if (has_subdiff)
226 			{
227 				/*
228 				 * For an ordinary range, use subdiff function between upper
229 				 * and lower bound values.
230 				 */
231 				length = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
232 														  typcache->rng_collation,
233 														  upper.val, lower.val));
234 			}
235 			else
236 			{
237 				/* Use default value of 1.0 if no subdiff is available. */
238 				length = 1.0;
239 			}
240 			lengths[non_empty_cnt] = length;
241 
242 			non_empty_cnt++;
243 		}
244 		else
245 			empty_cnt++;
246 
247 		non_null_cnt++;
248 	}
249 
250 	slot_idx = 0;
251 
252 	/* We can only compute real stats if we found some non-null values. */
253 	if (non_null_cnt > 0)
254 	{
255 		Datum	   *bound_hist_values;
256 		Datum	   *length_hist_values;
257 		int			pos,
258 					posfrac,
259 					delta,
260 					deltafrac,
261 					i;
262 		MemoryContext old_cxt;
263 		float4	   *emptyfrac;
264 
265 		stats->stats_valid = true;
266 		/* Do the simple null-frac and width stats */
267 		stats->stanullfrac = (double) null_cnt / (double) samplerows;
268 		stats->stawidth = total_width / (double) non_null_cnt;
269 
270 		/* Estimate that non-null values are unique */
271 		stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
272 
273 		/* Must copy the target values into anl_context */
274 		old_cxt = MemoryContextSwitchTo(stats->anl_context);
275 
276 		/*
277 		 * Generate a bounds histogram slot entry if there are at least two
278 		 * values.
279 		 */
280 		if (non_empty_cnt >= 2)
281 		{
282 			/* Sort bound values */
283 			qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound),
284 					  range_bound_qsort_cmp, typcache);
285 			qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound),
286 					  range_bound_qsort_cmp, typcache);
287 
288 			num_hist = non_empty_cnt;
289 			if (num_hist > num_bins)
290 				num_hist = num_bins + 1;
291 
292 			bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
293 
294 			/*
295 			 * The object of this loop is to construct ranges from first and
296 			 * last entries in lowers[] and uppers[] along with evenly-spaced
297 			 * values in between. So the i'th value is a range of lowers[(i *
298 			 * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
299 			 * (num_hist - 1)]. But computing that subscript directly risks
300 			 * integer overflow when the stats target is more than a couple
301 			 * thousand.  Instead we add (nvals - 1) / (num_hist - 1) to pos
302 			 * at each step, tracking the integral and fractional parts of the
303 			 * sum separately.
304 			 */
305 			delta = (non_empty_cnt - 1) / (num_hist - 1);
306 			deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
307 			pos = posfrac = 0;
308 
309 			for (i = 0; i < num_hist; i++)
310 			{
311 				bound_hist_values[i] = PointerGetDatum(range_serialize(typcache,
312 																	   &lowers[pos],
313 																	   &uppers[pos],
314 																	   false));
315 				pos += delta;
316 				posfrac += deltafrac;
317 				if (posfrac >= (num_hist - 1))
318 				{
319 					/* fractional part exceeds 1, carry to integer part */
320 					pos++;
321 					posfrac -= (num_hist - 1);
322 				}
323 			}
324 
325 			stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
326 			stats->stavalues[slot_idx] = bound_hist_values;
327 			stats->numvalues[slot_idx] = num_hist;
328 
329 			/* Store ranges even if we're analyzing a multirange column */
330 			stats->statypid[slot_idx] = typcache->type_id;
331 			stats->statyplen[slot_idx] = typcache->typlen;
332 			stats->statypbyval[slot_idx] = typcache->typbyval;
333 			stats->statypalign[slot_idx] = typcache->typalign;
334 
335 			slot_idx++;
336 		}
337 
338 		/*
339 		 * Generate a length histogram slot entry if there are at least two
340 		 * values.
341 		 */
342 		if (non_empty_cnt >= 2)
343 		{
344 			/*
345 			 * Ascending sort of range lengths for further filling of
346 			 * histogram
347 			 */
348 			qsort(lengths, non_empty_cnt, sizeof(float8), float8_qsort_cmp);
349 
350 			num_hist = non_empty_cnt;
351 			if (num_hist > num_bins)
352 				num_hist = num_bins + 1;
353 
354 			length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
355 
356 			/*
357 			 * The object of this loop is to copy the first and last lengths[]
358 			 * entries along with evenly-spaced values in between. So the i'th
359 			 * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
360 			 * computing that subscript directly risks integer overflow when
361 			 * the stats target is more than a couple thousand.  Instead we
362 			 * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
363 			 * the integral and fractional parts of the sum separately.
364 			 */
365 			delta = (non_empty_cnt - 1) / (num_hist - 1);
366 			deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
367 			pos = posfrac = 0;
368 
369 			for (i = 0; i < num_hist; i++)
370 			{
371 				length_hist_values[i] = Float8GetDatum(lengths[pos]);
372 				pos += delta;
373 				posfrac += deltafrac;
374 				if (posfrac >= (num_hist - 1))
375 				{
376 					/* fractional part exceeds 1, carry to integer part */
377 					pos++;
378 					posfrac -= (num_hist - 1);
379 				}
380 			}
381 		}
382 		else
383 		{
384 			/*
385 			 * Even when we don't create the histogram, store an empty array
386 			 * to mean "no histogram". We can't just leave stavalues NULL,
387 			 * because get_attstatsslot() errors if you ask for stavalues, and
388 			 * it's NULL. We'll still store the empty fraction in stanumbers.
389 			 */
390 			length_hist_values = palloc(0);
391 			num_hist = 0;
392 		}
393 		stats->staop[slot_idx] = Float8LessOperator;
394 		stats->stacoll[slot_idx] = InvalidOid;
395 		stats->stavalues[slot_idx] = length_hist_values;
396 		stats->numvalues[slot_idx] = num_hist;
397 		stats->statypid[slot_idx] = FLOAT8OID;
398 		stats->statyplen[slot_idx] = sizeof(float8);
399 		stats->statypbyval[slot_idx] = FLOAT8PASSBYVAL;
400 		stats->statypalign[slot_idx] = 'd';
401 
402 		/* Store the fraction of empty ranges */
403 		emptyfrac = (float4 *) palloc(sizeof(float4));
404 		*emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
405 		stats->stanumbers[slot_idx] = emptyfrac;
406 		stats->numnumbers[slot_idx] = 1;
407 
408 		stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
409 		slot_idx++;
410 
411 		MemoryContextSwitchTo(old_cxt);
412 	}
413 	else if (null_cnt > 0)
414 	{
415 		/* We found only nulls; assume the column is entirely null */
416 		stats->stats_valid = true;
417 		stats->stanullfrac = 1.0;
418 		stats->stawidth = 0;	/* "unknown" */
419 		stats->stadistinct = 0.0;	/* "unknown" */
420 	}
421 
422 	/*
423 	 * We don't need to bother cleaning up any of our temporary palloc's. The
424 	 * hashtable should also go away, as it used a child memory context.
425 	 */
426 }
427