1 /*-------------------------------------------------------------------------
2  *
3  * ragetypes_typanalyze.c
4  *	  Functions for gathering statistics from range columns
5  *
6  * For a range type column, histograms of lower and upper bounds, and
7  * the fraction of NULL and empty ranges are collected.
8  *
9  * Both histograms have the same length, and they are combined into a
10  * single array of ranges. This has the same shape as the histogram that
11  * std_typanalyze would collect, but the values are different. Each range
12  * in the array is a valid range, even though the lower and upper bounds
13  * come from different tuples. In theory, the standard scalar selectivity
14  * functions could be used with the combined histogram.
15  *
16  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
17  * Portions Copyright (c) 1994, Regents of the University of California
18  *
19  *
20  * IDENTIFICATION
21  *	  src/backend/utils/adt/rangetypes_typanalyze.c
22  *
23  *-------------------------------------------------------------------------
24  */
25 #include "postgres.h"
26 
27 #include "catalog/pg_operator.h"
28 #include "commands/vacuum.h"
29 #include "utils/float.h"
30 #include "utils/fmgrprotos.h"
31 #include "utils/lsyscache.h"
32 #include "utils/rangetypes.h"
33 
34 static int	float8_qsort_cmp(const void *a1, const void *a2);
35 static int	range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
36 static void compute_range_stats(VacAttrStats *stats,
37 								AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
38 
39 /*
40  * range_typanalyze -- typanalyze function for range columns
41  */
42 Datum
range_typanalyze(PG_FUNCTION_ARGS)43 range_typanalyze(PG_FUNCTION_ARGS)
44 {
45 	VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
46 	TypeCacheEntry *typcache;
47 	Form_pg_attribute attr = stats->attr;
48 
49 	/* Get information about range type; note column might be a domain */
50 	typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid));
51 
52 	if (attr->attstattarget < 0)
53 		attr->attstattarget = default_statistics_target;
54 
55 	stats->compute_stats = compute_range_stats;
56 	stats->extra_data = typcache;
57 	/* same as in std_typanalyze */
58 	stats->minrows = 300 * attr->attstattarget;
59 
60 	PG_RETURN_BOOL(true);
61 }
62 
63 /*
64  * Comparison function for sorting float8s, used for range lengths.
65  */
66 static int
float8_qsort_cmp(const void * a1,const void * a2)67 float8_qsort_cmp(const void *a1, const void *a2)
68 {
69 	const float8 *f1 = (const float8 *) a1;
70 	const float8 *f2 = (const float8 *) a2;
71 
72 	if (*f1 < *f2)
73 		return -1;
74 	else if (*f1 == *f2)
75 		return 0;
76 	else
77 		return 1;
78 }
79 
80 /*
81  * Comparison function for sorting RangeBounds.
82  */
83 static int
range_bound_qsort_cmp(const void * a1,const void * a2,void * arg)84 range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
85 {
86 	RangeBound *b1 = (RangeBound *) a1;
87 	RangeBound *b2 = (RangeBound *) a2;
88 	TypeCacheEntry *typcache = (TypeCacheEntry *) arg;
89 
90 	return range_cmp_bounds(typcache, b1, b2);
91 }
92 
93 /*
94  * compute_range_stats() -- compute statistics for a range column
95  */
96 static void
compute_range_stats(VacAttrStats * stats,AnalyzeAttrFetchFunc fetchfunc,int samplerows,double totalrows)97 compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
98 					int samplerows, double totalrows)
99 {
100 	TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
101 	bool		has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
102 	int			null_cnt = 0;
103 	int			non_null_cnt = 0;
104 	int			non_empty_cnt = 0;
105 	int			empty_cnt = 0;
106 	int			range_no;
107 	int			slot_idx;
108 	int			num_bins = stats->attr->attstattarget;
109 	int			num_hist;
110 	float8	   *lengths;
111 	RangeBound *lowers,
112 			   *uppers;
113 	double		total_width = 0;
114 
115 	/* Allocate memory to hold range bounds and lengths of the sample ranges. */
116 	lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
117 	uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
118 	lengths = (float8 *) palloc(sizeof(float8) * samplerows);
119 
120 	/* Loop over the sample ranges. */
121 	for (range_no = 0; range_no < samplerows; range_no++)
122 	{
123 		Datum		value;
124 		bool		isnull,
125 					empty;
126 		RangeType  *range;
127 		RangeBound	lower,
128 					upper;
129 		float8		length;
130 
131 		vacuum_delay_point();
132 
133 		value = fetchfunc(stats, range_no, &isnull);
134 		if (isnull)
135 		{
136 			/* range is null, just count that */
137 			null_cnt++;
138 			continue;
139 		}
140 
141 		/*
142 		 * XXX: should we ignore wide values, like std_typanalyze does, to
143 		 * avoid bloating the statistics table?
144 		 */
145 		total_width += VARSIZE_ANY(DatumGetPointer(value));
146 
147 		/* Get range and deserialize it for further analysis. */
148 		range = DatumGetRangeTypeP(value);
149 		range_deserialize(typcache, range, &lower, &upper, &empty);
150 
151 		if (!empty)
152 		{
153 			/* Remember bounds and length for further usage in histograms */
154 			lowers[non_empty_cnt] = lower;
155 			uppers[non_empty_cnt] = upper;
156 
157 			if (lower.infinite || upper.infinite)
158 			{
159 				/* Length of any kind of an infinite range is infinite */
160 				length = get_float8_infinity();
161 			}
162 			else if (has_subdiff)
163 			{
164 				/*
165 				 * For an ordinary range, use subdiff function between upper
166 				 * and lower bound values.
167 				 */
168 				length = DatumGetFloat8(FunctionCall2Coll(
169 														  &typcache->rng_subdiff_finfo,
170 														  typcache->rng_collation,
171 														  upper.val, lower.val));
172 			}
173 			else
174 			{
175 				/* Use default value of 1.0 if no subdiff is available. */
176 				length = 1.0;
177 			}
178 			lengths[non_empty_cnt] = length;
179 
180 			non_empty_cnt++;
181 		}
182 		else
183 			empty_cnt++;
184 
185 		non_null_cnt++;
186 	}
187 
188 	slot_idx = 0;
189 
190 	/* We can only compute real stats if we found some non-null values. */
191 	if (non_null_cnt > 0)
192 	{
193 		Datum	   *bound_hist_values;
194 		Datum	   *length_hist_values;
195 		int			pos,
196 					posfrac,
197 					delta,
198 					deltafrac,
199 					i;
200 		MemoryContext old_cxt;
201 		float4	   *emptyfrac;
202 
203 		stats->stats_valid = true;
204 		/* Do the simple null-frac and width stats */
205 		stats->stanullfrac = (double) null_cnt / (double) samplerows;
206 		stats->stawidth = total_width / (double) non_null_cnt;
207 
208 		/* Estimate that non-null values are unique */
209 		stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
210 
211 		/* Must copy the target values into anl_context */
212 		old_cxt = MemoryContextSwitchTo(stats->anl_context);
213 
214 		/*
215 		 * Generate a bounds histogram slot entry if there are at least two
216 		 * values.
217 		 */
218 		if (non_empty_cnt >= 2)
219 		{
220 			/* Sort bound values */
221 			qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound),
222 					  range_bound_qsort_cmp, typcache);
223 			qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound),
224 					  range_bound_qsort_cmp, typcache);
225 
226 			num_hist = non_empty_cnt;
227 			if (num_hist > num_bins)
228 				num_hist = num_bins + 1;
229 
230 			bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
231 
232 			/*
233 			 * The object of this loop is to construct ranges from first and
234 			 * last entries in lowers[] and uppers[] along with evenly-spaced
235 			 * values in between. So the i'th value is a range of lowers[(i *
236 			 * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
237 			 * (num_hist - 1)]. But computing that subscript directly risks
238 			 * integer overflow when the stats target is more than a couple
239 			 * thousand.  Instead we add (nvals - 1) / (num_hist - 1) to pos
240 			 * at each step, tracking the integral and fractional parts of the
241 			 * sum separately.
242 			 */
243 			delta = (non_empty_cnt - 1) / (num_hist - 1);
244 			deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
245 			pos = posfrac = 0;
246 
247 			for (i = 0; i < num_hist; i++)
248 			{
249 				bound_hist_values[i] = PointerGetDatum(range_serialize(
250 																	   typcache, &lowers[pos], &uppers[pos], false));
251 				pos += delta;
252 				posfrac += deltafrac;
253 				if (posfrac >= (num_hist - 1))
254 				{
255 					/* fractional part exceeds 1, carry to integer part */
256 					pos++;
257 					posfrac -= (num_hist - 1);
258 				}
259 			}
260 
261 			stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
262 			stats->stavalues[slot_idx] = bound_hist_values;
263 			stats->numvalues[slot_idx] = num_hist;
264 			slot_idx++;
265 		}
266 
267 		/*
268 		 * Generate a length histogram slot entry if there are at least two
269 		 * values.
270 		 */
271 		if (non_empty_cnt >= 2)
272 		{
273 			/*
274 			 * Ascending sort of range lengths for further filling of
275 			 * histogram
276 			 */
277 			qsort(lengths, non_empty_cnt, sizeof(float8), float8_qsort_cmp);
278 
279 			num_hist = non_empty_cnt;
280 			if (num_hist > num_bins)
281 				num_hist = num_bins + 1;
282 
283 			length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
284 
285 			/*
286 			 * The object of this loop is to copy the first and last lengths[]
287 			 * entries along with evenly-spaced values in between. So the i'th
288 			 * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
289 			 * computing that subscript directly risks integer overflow when
290 			 * the stats target is more than a couple thousand.  Instead we
291 			 * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
292 			 * the integral and fractional parts of the sum separately.
293 			 */
294 			delta = (non_empty_cnt - 1) / (num_hist - 1);
295 			deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
296 			pos = posfrac = 0;
297 
298 			for (i = 0; i < num_hist; i++)
299 			{
300 				length_hist_values[i] = Float8GetDatum(lengths[pos]);
301 				pos += delta;
302 				posfrac += deltafrac;
303 				if (posfrac >= (num_hist - 1))
304 				{
305 					/* fractional part exceeds 1, carry to integer part */
306 					pos++;
307 					posfrac -= (num_hist - 1);
308 				}
309 			}
310 		}
311 		else
312 		{
313 			/*
314 			 * Even when we don't create the histogram, store an empty array
315 			 * to mean "no histogram". We can't just leave stavalues NULL,
316 			 * because get_attstatsslot() errors if you ask for stavalues, and
317 			 * it's NULL. We'll still store the empty fraction in stanumbers.
318 			 */
319 			length_hist_values = palloc(0);
320 			num_hist = 0;
321 		}
322 		stats->staop[slot_idx] = Float8LessOperator;
323 		stats->stacoll[slot_idx] = InvalidOid;
324 		stats->stavalues[slot_idx] = length_hist_values;
325 		stats->numvalues[slot_idx] = num_hist;
326 		stats->statypid[slot_idx] = FLOAT8OID;
327 		stats->statyplen[slot_idx] = sizeof(float8);
328 #ifdef USE_FLOAT8_BYVAL
329 		stats->statypbyval[slot_idx] = true;
330 #else
331 		stats->statypbyval[slot_idx] = false;
332 #endif
333 		stats->statypalign[slot_idx] = 'd';
334 
335 		/* Store the fraction of empty ranges */
336 		emptyfrac = (float4 *) palloc(sizeof(float4));
337 		*emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
338 		stats->stanumbers[slot_idx] = emptyfrac;
339 		stats->numnumbers[slot_idx] = 1;
340 
341 		stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
342 		slot_idx++;
343 
344 		MemoryContextSwitchTo(old_cxt);
345 	}
346 	else if (null_cnt > 0)
347 	{
348 		/* We found only nulls; assume the column is entirely null */
349 		stats->stats_valid = true;
350 		stats->stanullfrac = 1.0;
351 		stats->stawidth = 0;	/* "unknown" */
352 		stats->stadistinct = 0.0;	/* "unknown" */
353 	}
354 
355 	/*
356 	 * We don't need to bother cleaning up any of our temporary palloc's. The
357 	 * hashtable should also go away, as it used a child memory context.
358 	 */
359 }
360