1 /*-------------------------------------------------------------------------
2  *
3  * rangetypes_selfuncs.c
4  *	  Functions for selectivity estimation of range operators
5  *
6  * Estimates are based on histograms of lower and upper bounds, and the
7  * fraction of empty ranges.
8  *
9  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
10  * Portions Copyright (c) 1994, Regents of the University of California
11  *
12  *
13  * IDENTIFICATION
14  *	  src/backend/utils/adt/rangetypes_selfuncs.c
15  *
16  *-------------------------------------------------------------------------
17  */
18 #include "postgres.h"
19 
20 #include <math.h>
21 
22 #include "access/htup_details.h"
23 #include "catalog/pg_operator.h"
24 #include "catalog/pg_statistic.h"
25 #include "catalog/pg_type.h"
26 #include "utils/builtins.h"
27 #include "utils/lsyscache.h"
28 #include "utils/rangetypes.h"
29 #include "utils/selfuncs.h"
30 #include "utils/typcache.h"
31 
32 static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
33 			  RangeType *constval, Oid operator);
34 static double default_range_selectivity(Oid operator);
35 static double calc_hist_selectivity(TypeCacheEntry *typcache,
36 					  VariableStatData *vardata, RangeType *constval,
37 					  Oid operator);
38 static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
39 							 RangeBound *constbound,
40 							 RangeBound *hist, int hist_nvalues,
41 							 bool equal);
42 static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value,
43 			   RangeBound *hist, int hist_length, bool equal);
44 static float8 get_position(TypeCacheEntry *typcache, RangeBound *value,
45 			 RangeBound *hist1, RangeBound *hist2);
46 static float8 get_len_position(double value, double hist1, double hist2);
47 static float8 get_distance(TypeCacheEntry *typcache, RangeBound *bound1,
48 			 RangeBound *bound2);
49 static int length_hist_bsearch(Datum *length_hist_values,
50 					int length_hist_nvalues, double value, bool equal);
51 static double calc_length_hist_frac(Datum *length_hist_values,
52 					  int length_hist_nvalues, double length1, double length2, bool equal);
53 static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
54 								RangeBound *lower, RangeBound *upper,
55 								RangeBound *hist_lower, int hist_nvalues,
56 								Datum *length_hist_values, int length_hist_nvalues);
57 static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
58 							   RangeBound *lower, RangeBound *upper,
59 							   RangeBound *hist_lower, int hist_nvalues,
60 							   Datum *length_hist_values, int length_hist_nvalues);
61 
62 /*
63  * Returns a default selectivity estimate for given operator, when we don't
64  * have statistics or cannot use them for some reason.
65  */
66 static double
default_range_selectivity(Oid operator)67 default_range_selectivity(Oid operator)
68 {
69 	switch (operator)
70 	{
71 		case OID_RANGE_OVERLAP_OP:
72 			return 0.01;
73 
74 		case OID_RANGE_CONTAINS_OP:
75 		case OID_RANGE_CONTAINED_OP:
76 			return 0.005;
77 
78 		case OID_RANGE_CONTAINS_ELEM_OP:
79 		case OID_RANGE_ELEM_CONTAINED_OP:
80 
81 			/*
82 			 * "range @> elem" is more or less identical to a scalar
83 			 * inequality "A >= b AND A <= c".
84 			 */
85 			return DEFAULT_RANGE_INEQ_SEL;
86 
87 		case OID_RANGE_LESS_OP:
88 		case OID_RANGE_LESS_EQUAL_OP:
89 		case OID_RANGE_GREATER_OP:
90 		case OID_RANGE_GREATER_EQUAL_OP:
91 		case OID_RANGE_LEFT_OP:
92 		case OID_RANGE_RIGHT_OP:
93 		case OID_RANGE_OVERLAPS_LEFT_OP:
94 		case OID_RANGE_OVERLAPS_RIGHT_OP:
95 			/* these are similar to regular scalar inequalities */
96 			return DEFAULT_INEQ_SEL;
97 
98 		default:
99 			/* all range operators should be handled above, but just in case */
100 			return 0.01;
101 	}
102 }
103 
104 /*
105  * rangesel -- restriction selectivity for range operators
106  */
107 Datum
rangesel(PG_FUNCTION_ARGS)108 rangesel(PG_FUNCTION_ARGS)
109 {
110 	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
111 	Oid			operator = PG_GETARG_OID(1);
112 	List	   *args = (List *) PG_GETARG_POINTER(2);
113 	int			varRelid = PG_GETARG_INT32(3);
114 	VariableStatData vardata;
115 	Node	   *other;
116 	bool		varonleft;
117 	Selectivity selec;
118 	TypeCacheEntry *typcache = NULL;
119 	RangeType  *constrange = NULL;
120 
121 	/*
122 	 * If expression is not (variable op something) or (something op
123 	 * variable), then punt and return a default estimate.
124 	 */
125 	if (!get_restriction_variable(root, args, varRelid,
126 								  &vardata, &other, &varonleft))
127 		PG_RETURN_FLOAT8(default_range_selectivity(operator));
128 
129 	/*
130 	 * Can't do anything useful if the something is not a constant, either.
131 	 */
132 	if (!IsA(other, Const))
133 	{
134 		ReleaseVariableStats(vardata);
135 		PG_RETURN_FLOAT8(default_range_selectivity(operator));
136 	}
137 
138 	/*
139 	 * All the range operators are strict, so we can cope with a NULL constant
140 	 * right away.
141 	 */
142 	if (((Const *) other)->constisnull)
143 	{
144 		ReleaseVariableStats(vardata);
145 		PG_RETURN_FLOAT8(0.0);
146 	}
147 
148 	/*
149 	 * If var is on the right, commute the operator, so that we can assume the
150 	 * var is on the left in what follows.
151 	 */
152 	if (!varonleft)
153 	{
154 		/* we have other Op var, commute to make var Op other */
155 		operator = get_commutator(operator);
156 		if (!operator)
157 		{
158 			/* Use default selectivity (should we raise an error instead?) */
159 			ReleaseVariableStats(vardata);
160 			PG_RETURN_FLOAT8(default_range_selectivity(operator));
161 		}
162 	}
163 
164 	/*
165 	 * OK, there's a Var and a Const we're dealing with here.  We need the
166 	 * Const to be of same range type as the column, else we can't do anything
167 	 * useful. (Such cases will likely fail at runtime, but here we'd rather
168 	 * just return a default estimate.)
169 	 *
170 	 * If the operator is "range @> element", the constant should be of the
171 	 * element type of the range column. Convert it to a range that includes
172 	 * only that single point, so that we don't need special handling for that
173 	 * in what follows.
174 	 */
175 	if (operator == OID_RANGE_CONTAINS_ELEM_OP)
176 	{
177 		typcache = range_get_typcache(fcinfo, vardata.vartype);
178 
179 		if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
180 		{
181 			RangeBound	lower,
182 						upper;
183 
184 			lower.inclusive = true;
185 			lower.val = ((Const *) other)->constvalue;
186 			lower.infinite = false;
187 			lower.lower = true;
188 			upper.inclusive = true;
189 			upper.val = ((Const *) other)->constvalue;
190 			upper.infinite = false;
191 			upper.lower = false;
192 			constrange = range_serialize(typcache, &lower, &upper, false);
193 		}
194 	}
195 	else if (operator == OID_RANGE_ELEM_CONTAINED_OP)
196 	{
197 		/*
198 		 * Here, the Var is the elem, not the range.  For now we just punt and
199 		 * return the default estimate.  In future we could disassemble the
200 		 * range constant and apply scalarineqsel ...
201 		 */
202 	}
203 	else if (((Const *) other)->consttype == vardata.vartype)
204 	{
205 		/* Both sides are the same range type */
206 		typcache = range_get_typcache(fcinfo, vardata.vartype);
207 
208 		constrange = DatumGetRangeTypeP(((Const *) other)->constvalue);
209 	}
210 
211 	/*
212 	 * If we got a valid constant on one side of the operator, proceed to
213 	 * estimate using statistics. Otherwise punt and return a default constant
214 	 * estimate.  Note that calc_rangesel need not handle
215 	 * OID_RANGE_ELEM_CONTAINED_OP.
216 	 */
217 	if (constrange)
218 		selec = calc_rangesel(typcache, &vardata, constrange, operator);
219 	else
220 		selec = default_range_selectivity(operator);
221 
222 	ReleaseVariableStats(vardata);
223 
224 	CLAMP_PROBABILITY(selec);
225 
226 	PG_RETURN_FLOAT8((float8) selec);
227 }
228 
229 static double
calc_rangesel(TypeCacheEntry * typcache,VariableStatData * vardata,RangeType * constval,Oid operator)230 calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
231 			  RangeType *constval, Oid operator)
232 {
233 	double		hist_selec;
234 	double		selec;
235 	float4		empty_frac,
236 				null_frac;
237 
238 	/*
239 	 * First look up the fraction of NULLs and empty ranges from pg_statistic.
240 	 */
241 	if (HeapTupleIsValid(vardata->statsTuple))
242 	{
243 		Form_pg_statistic stats;
244 		AttStatsSlot sslot;
245 
246 		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
247 		null_frac = stats->stanullfrac;
248 
249 		/* Try to get fraction of empty ranges */
250 		if (get_attstatsslot(&sslot, vardata->statsTuple,
251 							 STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
252 							 InvalidOid,
253 							 ATTSTATSSLOT_NUMBERS))
254 		{
255 			if (sslot.nnumbers != 1)
256 				elog(ERROR, "invalid empty fraction statistic");	/* shouldn't happen */
257 			empty_frac = sslot.numbers[0];
258 			free_attstatsslot(&sslot);
259 		}
260 		else
261 		{
262 			/* No empty fraction statistic. Assume no empty ranges. */
263 			empty_frac = 0.0;
264 		}
265 	}
266 	else
267 	{
268 		/*
269 		 * No stats are available. Follow through the calculations below
270 		 * anyway, assuming no NULLs and no empty ranges. This still allows us
271 		 * to give a better-than-nothing estimate based on whether the
272 		 * constant is an empty range or not.
273 		 */
274 		null_frac = 0.0;
275 		empty_frac = 0.0;
276 	}
277 
278 	if (RangeIsEmpty(constval))
279 	{
280 		/*
281 		 * An empty range matches all ranges, all empty ranges, or nothing,
282 		 * depending on the operator
283 		 */
284 		switch (operator)
285 		{
286 				/* these return false if either argument is empty */
287 			case OID_RANGE_OVERLAP_OP:
288 			case OID_RANGE_OVERLAPS_LEFT_OP:
289 			case OID_RANGE_OVERLAPS_RIGHT_OP:
290 			case OID_RANGE_LEFT_OP:
291 			case OID_RANGE_RIGHT_OP:
292 				/* nothing is less than an empty range */
293 			case OID_RANGE_LESS_OP:
294 				selec = 0.0;
295 				break;
296 
297 				/* only empty ranges can be contained by an empty range */
298 			case OID_RANGE_CONTAINED_OP:
299 				/* only empty ranges are <= an empty range */
300 			case OID_RANGE_LESS_EQUAL_OP:
301 				selec = empty_frac;
302 				break;
303 
304 				/* everything contains an empty range */
305 			case OID_RANGE_CONTAINS_OP:
306 				/* everything is >= an empty range */
307 			case OID_RANGE_GREATER_EQUAL_OP:
308 				selec = 1.0;
309 				break;
310 
311 				/* all non-empty ranges are > an empty range */
312 			case OID_RANGE_GREATER_OP:
313 				selec = 1.0 - empty_frac;
314 				break;
315 
316 				/* an element cannot be empty */
317 			case OID_RANGE_CONTAINS_ELEM_OP:
318 			default:
319 				elog(ERROR, "unexpected operator %u", operator);
320 				selec = 0.0;	/* keep compiler quiet */
321 				break;
322 		}
323 	}
324 	else
325 	{
326 		/*
327 		 * Calculate selectivity using bound histograms. If that fails for
328 		 * some reason, e.g no histogram in pg_statistic, use the default
329 		 * constant estimate for the fraction of non-empty values. This is
330 		 * still somewhat better than just returning the default estimate,
331 		 * because this still takes into account the fraction of empty and
332 		 * NULL tuples, if we had statistics for them.
333 		 */
334 		hist_selec = calc_hist_selectivity(typcache, vardata, constval,
335 										   operator);
336 		if (hist_selec < 0.0)
337 			hist_selec = default_range_selectivity(operator);
338 
339 		/*
340 		 * Now merge the results for the empty ranges and histogram
341 		 * calculations, realizing that the histogram covers only the
342 		 * non-null, non-empty values.
343 		 */
344 		if (operator == OID_RANGE_CONTAINED_OP)
345 		{
346 			/* empty is contained by anything non-empty */
347 			selec = (1.0 - empty_frac) * hist_selec + empty_frac;
348 		}
349 		else
350 		{
351 			/* with any other operator, empty Op non-empty matches nothing */
352 			selec = (1.0 - empty_frac) * hist_selec;
353 		}
354 	}
355 
356 	/* all range operators are strict */
357 	selec *= (1.0 - null_frac);
358 
359 	/* result should be in range, but make sure... */
360 	CLAMP_PROBABILITY(selec);
361 
362 	return selec;
363 }
364 
365 /*
366  * Calculate range operator selectivity using histograms of range bounds.
367  *
368  * This estimate is for the portion of values that are not empty and not
369  * NULL.
370  */
371 static double
calc_hist_selectivity(TypeCacheEntry * typcache,VariableStatData * vardata,RangeType * constval,Oid operator)372 calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
373 					  RangeType *constval, Oid operator)
374 {
375 	AttStatsSlot hslot;
376 	AttStatsSlot lslot;
377 	int			nhist;
378 	RangeBound *hist_lower;
379 	RangeBound *hist_upper;
380 	int			i;
381 	RangeBound	const_lower;
382 	RangeBound	const_upper;
383 	bool		empty;
384 	double		hist_selec;
385 
386 	/* Can't use the histogram with insecure range support functions */
387 	if (!statistic_proc_security_check(vardata,
388 									   typcache->rng_cmp_proc_finfo.fn_oid))
389 		return -1;
390 	if (OidIsValid(typcache->rng_subdiff_finfo.fn_oid) &&
391 		!statistic_proc_security_check(vardata,
392 									   typcache->rng_subdiff_finfo.fn_oid))
393 		return -1;
394 
395 	/* Try to get histogram of ranges */
396 	if (!(HeapTupleIsValid(vardata->statsTuple) &&
397 		  get_attstatsslot(&hslot, vardata->statsTuple,
398 						   STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
399 						   ATTSTATSSLOT_VALUES)))
400 		return -1.0;
401 
402 	/* check that it's a histogram, not just a dummy entry */
403 	if (hslot.nvalues < 2)
404 	{
405 		free_attstatsslot(&hslot);
406 		return -1.0;
407 	}
408 
409 	/*
410 	 * Convert histogram of ranges into histograms of its lower and upper
411 	 * bounds.
412 	 */
413 	nhist = hslot.nvalues;
414 	hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
415 	hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
416 	for (i = 0; i < nhist; i++)
417 	{
418 		range_deserialize(typcache, DatumGetRangeTypeP(hslot.values[i]),
419 						  &hist_lower[i], &hist_upper[i], &empty);
420 		/* The histogram should not contain any empty ranges */
421 		if (empty)
422 			elog(ERROR, "bounds histogram contains an empty range");
423 	}
424 
425 	/* @> and @< also need a histogram of range lengths */
426 	if (operator == OID_RANGE_CONTAINS_OP ||
427 		operator == OID_RANGE_CONTAINED_OP)
428 	{
429 		if (!(HeapTupleIsValid(vardata->statsTuple) &&
430 			  get_attstatsslot(&lslot, vardata->statsTuple,
431 							   STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
432 							   InvalidOid,
433 							   ATTSTATSSLOT_VALUES)))
434 		{
435 			free_attstatsslot(&hslot);
436 			return -1.0;
437 		}
438 
439 		/* check that it's a histogram, not just a dummy entry */
440 		if (lslot.nvalues < 2)
441 		{
442 			free_attstatsslot(&lslot);
443 			free_attstatsslot(&hslot);
444 			return -1.0;
445 		}
446 	}
447 	else
448 		memset(&lslot, 0, sizeof(lslot));
449 
450 	/* Extract the bounds of the constant value. */
451 	range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
452 	Assert(!empty);
453 
454 	/*
455 	 * Calculate selectivity comparing the lower or upper bound of the
456 	 * constant with the histogram of lower or upper bounds.
457 	 */
458 	switch (operator)
459 	{
460 		case OID_RANGE_LESS_OP:
461 
462 			/*
463 			 * The regular b-tree comparison operators (<, <=, >, >=) compare
464 			 * the lower bounds first, and the upper bounds for values with
465 			 * equal lower bounds. Estimate that by comparing the lower bounds
466 			 * only. This gives a fairly accurate estimate assuming there
467 			 * aren't many rows with a lower bound equal to the constant's
468 			 * lower bound.
469 			 */
470 			hist_selec =
471 				calc_hist_selectivity_scalar(typcache, &const_lower,
472 											 hist_lower, nhist, false);
473 			break;
474 
475 		case OID_RANGE_LESS_EQUAL_OP:
476 			hist_selec =
477 				calc_hist_selectivity_scalar(typcache, &const_lower,
478 											 hist_lower, nhist, true);
479 			break;
480 
481 		case OID_RANGE_GREATER_OP:
482 			hist_selec =
483 				1 - calc_hist_selectivity_scalar(typcache, &const_lower,
484 												 hist_lower, nhist, false);
485 			break;
486 
487 		case OID_RANGE_GREATER_EQUAL_OP:
488 			hist_selec =
489 				1 - calc_hist_selectivity_scalar(typcache, &const_lower,
490 												 hist_lower, nhist, true);
491 			break;
492 
493 		case OID_RANGE_LEFT_OP:
494 			/* var << const when upper(var) < lower(const) */
495 			hist_selec =
496 				calc_hist_selectivity_scalar(typcache, &const_lower,
497 											 hist_upper, nhist, false);
498 			break;
499 
500 		case OID_RANGE_RIGHT_OP:
501 			/* var >> const when lower(var) > upper(const) */
502 			hist_selec =
503 				1 - calc_hist_selectivity_scalar(typcache, &const_upper,
504 												 hist_lower, nhist, true);
505 			break;
506 
507 		case OID_RANGE_OVERLAPS_RIGHT_OP:
508 			/* compare lower bounds */
509 			hist_selec =
510 				1 - calc_hist_selectivity_scalar(typcache, &const_lower,
511 												 hist_lower, nhist, false);
512 			break;
513 
514 		case OID_RANGE_OVERLAPS_LEFT_OP:
515 			/* compare upper bounds */
516 			hist_selec =
517 				calc_hist_selectivity_scalar(typcache, &const_upper,
518 											 hist_upper, nhist, true);
519 			break;
520 
521 		case OID_RANGE_OVERLAP_OP:
522 		case OID_RANGE_CONTAINS_ELEM_OP:
523 
524 			/*
525 			 * A && B <=> NOT (A << B OR A >> B).
526 			 *
527 			 * Since A << B and A >> B are mutually exclusive events we can
528 			 * sum their probabilities to find probability of (A << B OR A >>
529 			 * B).
530 			 *
531 			 * "range @> elem" is equivalent to "range && [elem,elem]". The
532 			 * caller already constructed the singular range from the element
533 			 * constant, so just treat it the same as &&.
534 			 */
535 			hist_selec =
536 				calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
537 											 nhist, false);
538 			hist_selec +=
539 				(1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
540 													nhist, true));
541 			hist_selec = 1.0 - hist_selec;
542 			break;
543 
544 		case OID_RANGE_CONTAINS_OP:
545 			hist_selec =
546 				calc_hist_selectivity_contains(typcache, &const_lower,
547 											   &const_upper, hist_lower, nhist,
548 											   lslot.values, lslot.nvalues);
549 			break;
550 
551 		case OID_RANGE_CONTAINED_OP:
552 			if (const_lower.infinite)
553 			{
554 				/*
555 				 * Lower bound no longer matters. Just estimate the fraction
556 				 * with an upper bound <= const upper bound
557 				 */
558 				hist_selec =
559 					calc_hist_selectivity_scalar(typcache, &const_upper,
560 												 hist_upper, nhist, true);
561 			}
562 			else if (const_upper.infinite)
563 			{
564 				hist_selec =
565 					1.0 - calc_hist_selectivity_scalar(typcache, &const_lower,
566 													   hist_lower, nhist, false);
567 			}
568 			else
569 			{
570 				hist_selec =
571 					calc_hist_selectivity_contained(typcache, &const_lower,
572 													&const_upper, hist_lower, nhist,
573 													lslot.values, lslot.nvalues);
574 			}
575 			break;
576 
577 		default:
578 			elog(ERROR, "unknown range operator %u", operator);
579 			hist_selec = -1.0;	/* keep compiler quiet */
580 			break;
581 	}
582 
583 	free_attstatsslot(&lslot);
584 	free_attstatsslot(&hslot);
585 
586 	return hist_selec;
587 }
588 
589 
590 /*
591  * Look up the fraction of values less than (or equal, if 'equal' argument
592  * is true) a given const in a histogram of range bounds.
593  */
594 static double
calc_hist_selectivity_scalar(TypeCacheEntry * typcache,RangeBound * constbound,RangeBound * hist,int hist_nvalues,bool equal)595 calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound,
596 							 RangeBound *hist, int hist_nvalues, bool equal)
597 {
598 	Selectivity selec;
599 	int			index;
600 
601 	/*
602 	 * Find the histogram bin the given constant falls into. Estimate
603 	 * selectivity as the number of preceding whole bins.
604 	 */
605 	index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
606 	selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
607 
608 	/* Adjust using linear interpolation within the bin */
609 	if (index >= 0 && index < hist_nvalues - 1)
610 		selec += get_position(typcache, constbound, &hist[index],
611 							  &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
612 
613 	return selec;
614 }
615 
616 /*
617  * Binary search on an array of range bounds. Returns greatest index of range
618  * bound in array which is less(less or equal) than given range bound. If all
619  * range bounds in array are greater or equal(greater) than given range bound,
620  * return -1. When "equal" flag is set conditions in brackets are used.
621  *
622  * This function is used in scalar operator selectivity estimation. Another
623  * goal of this function is to find a histogram bin where to stop
624  * interpolation of portion of bounds which are less than or equal to given bound.
625  */
626 static int
rbound_bsearch(TypeCacheEntry * typcache,RangeBound * value,RangeBound * hist,int hist_length,bool equal)627 rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
628 			   int hist_length, bool equal)
629 {
630 	int			lower = -1,
631 				upper = hist_length - 1,
632 				cmp,
633 				middle;
634 
635 	while (lower < upper)
636 	{
637 		middle = (lower + upper + 1) / 2;
638 		cmp = range_cmp_bounds(typcache, &hist[middle], value);
639 
640 		if (cmp < 0 || (equal && cmp == 0))
641 			lower = middle;
642 		else
643 			upper = middle - 1;
644 	}
645 	return lower;
646 }
647 
648 
649 /*
650  * Binary search on length histogram. Returns greatest index of range length in
651  * histogram which is less than (less than or equal) the given length value. If
652  * all lengths in the histogram are greater than (greater than or equal) the
653  * given length, returns -1.
654  */
655 static int
length_hist_bsearch(Datum * length_hist_values,int length_hist_nvalues,double value,bool equal)656 length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
657 					double value, bool equal)
658 {
659 	int			lower = -1,
660 				upper = length_hist_nvalues - 1,
661 				middle;
662 
663 	while (lower < upper)
664 	{
665 		double		middleval;
666 
667 		middle = (lower + upper + 1) / 2;
668 
669 		middleval = DatumGetFloat8(length_hist_values[middle]);
670 		if (middleval < value || (equal && middleval <= value))
671 			lower = middle;
672 		else
673 			upper = middle - 1;
674 	}
675 	return lower;
676 }
677 
678 /*
679  * Get relative position of value in histogram bin in [0,1] range.
680  */
681 static float8
get_position(TypeCacheEntry * typcache,RangeBound * value,RangeBound * hist1,RangeBound * hist2)682 get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
683 			 RangeBound *hist2)
684 {
685 	bool		has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
686 	float8		position;
687 
688 	if (!hist1->infinite && !hist2->infinite)
689 	{
690 		float8		bin_width;
691 
692 		/*
693 		 * Both bounds are finite. Assuming the subtype's comparison function
694 		 * works sanely, the value must be finite, too, because it lies
695 		 * somewhere between the bounds.  If it doesn't, arbitrarily return
696 		 * 0.5.
697 		 */
698 		if (value->infinite)
699 			return 0.5;
700 
701 		/* Can't interpolate without subdiff function */
702 		if (!has_subdiff)
703 			return 0.5;
704 
705 		/* Calculate relative position using subdiff function. */
706 		bin_width = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
707 													 typcache->rng_collation,
708 													 hist2->val,
709 													 hist1->val));
710 		if (isnan(bin_width) || bin_width <= 0.0)
711 			return 0.5;			/* punt for NaN or zero-width bin */
712 
713 		position = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
714 													typcache->rng_collation,
715 													value->val,
716 													hist1->val))
717 			/ bin_width;
718 
719 		if (isnan(position))
720 			return 0.5;			/* punt for NaN from subdiff, Inf/Inf, etc */
721 
722 		/* Relative position must be in [0,1] range */
723 		position = Max(position, 0.0);
724 		position = Min(position, 1.0);
725 		return position;
726 	}
727 	else if (hist1->infinite && !hist2->infinite)
728 	{
729 		/*
730 		 * Lower bin boundary is -infinite, upper is finite. If the value is
731 		 * -infinite, return 0.0 to indicate it's equal to the lower bound.
732 		 * Otherwise return 1.0 to indicate it's infinitely far from the lower
733 		 * bound.
734 		 */
735 		return ((value->infinite && value->lower) ? 0.0 : 1.0);
736 	}
737 	else if (!hist1->infinite && hist2->infinite)
738 	{
739 		/* same as above, but in reverse */
740 		return ((value->infinite && !value->lower) ? 1.0 : 0.0);
741 	}
742 	else
743 	{
744 		/*
745 		 * If both bin boundaries are infinite, they should be equal to each
746 		 * other, and the value should also be infinite and equal to both
747 		 * bounds. (But don't Assert that, to avoid crashing if a user creates
748 		 * a datatype with a broken comparison function).
749 		 *
750 		 * Assume the value to lie in the middle of the infinite bounds.
751 		 */
752 		return 0.5;
753 	}
754 }
755 
756 
757 /*
758  * Get relative position of value in a length histogram bin in [0,1] range.
759  */
760 static double
get_len_position(double value,double hist1,double hist2)761 get_len_position(double value, double hist1, double hist2)
762 {
763 	if (!is_infinite(hist1) && !is_infinite(hist2))
764 	{
765 		/*
766 		 * Both bounds are finite. The value should be finite too, because it
767 		 * lies somewhere between the bounds. If it doesn't, just return
768 		 * something.
769 		 */
770 		if (is_infinite(value))
771 			return 0.5;
772 
773 		return 1.0 - (hist2 - value) / (hist2 - hist1);
774 	}
775 	else if (is_infinite(hist1) && !is_infinite(hist2))
776 	{
777 		/*
778 		 * Lower bin boundary is -infinite, upper is finite. Return 1.0 to
779 		 * indicate the value is infinitely far from the lower bound.
780 		 */
781 		return 1.0;
782 	}
783 	else if (is_infinite(hist1) && is_infinite(hist2))
784 	{
785 		/* same as above, but in reverse */
786 		return 0.0;
787 	}
788 	else
789 	{
790 		/*
791 		 * If both bin boundaries are infinite, they should be equal to each
792 		 * other, and the value should also be infinite and equal to both
793 		 * bounds. (But don't Assert that, to avoid crashing unnecessarily if
794 		 * the caller messes up)
795 		 *
796 		 * Assume the value to lie in the middle of the infinite bounds.
797 		 */
798 		return 0.5;
799 	}
800 }
801 
802 /*
803  * Measure distance between two range bounds.
804  */
805 static float8
get_distance(TypeCacheEntry * typcache,RangeBound * bound1,RangeBound * bound2)806 get_distance(TypeCacheEntry *typcache, RangeBound *bound1, RangeBound *bound2)
807 {
808 	bool		has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
809 
810 	if (!bound1->infinite && !bound2->infinite)
811 	{
812 		/*
813 		 * Neither bound is infinite, use subdiff function or return default
814 		 * value of 1.0 if no subdiff is available.
815 		 */
816 		if (has_subdiff)
817 		{
818 			float8		res;
819 
820 			res = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
821 												   typcache->rng_collation,
822 												   bound2->val,
823 												   bound1->val));
824 			/* Reject possible NaN result, also negative result */
825 			if (isnan(res) || res < 0.0)
826 				return 1.0;
827 			else
828 				return res;
829 		}
830 		else
831 			return 1.0;
832 	}
833 	else if (bound1->infinite && bound2->infinite)
834 	{
835 		/* Both bounds are infinite */
836 		if (bound1->lower == bound2->lower)
837 			return 0.0;
838 		else
839 			return get_float8_infinity();
840 	}
841 	else
842 	{
843 		/* One bound is infinite, the other is not */
844 		return get_float8_infinity();
845 	}
846 }
847 
848 /*
849  * Calculate the average of function P(x), in the interval [length1, length2],
850  * where P(x) is the fraction of tuples with length < x (or length <= x if
851  * 'equal' is true).
852  */
853 static double
calc_length_hist_frac(Datum * length_hist_values,int length_hist_nvalues,double length1,double length2,bool equal)854 calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
855 					  double length1, double length2, bool equal)
856 {
857 	double		frac;
858 	double		A,
859 				B,
860 				PA,
861 				PB;
862 	double		pos;
863 	int			i;
864 	double		area;
865 
866 	Assert(length2 >= length1);
867 
868 	if (length2 < 0.0)
869 		return 0.0;				/* shouldn't happen, but doesn't hurt to check */
870 
871 	/* All lengths in the table are <= infinite. */
872 	if (is_infinite(length2) && equal)
873 		return 1.0;
874 
875 	/*----------
876 	 * The average of a function between A and B can be calculated by the
877 	 * formula:
878 	 *
879 	 *			B
880 	 *	  1		/
881 	 * -------	| P(x)dx
882 	 *	B - A	/
883 	 *			A
884 	 *
885 	 * The geometrical interpretation of the integral is the area under the
886 	 * graph of P(x). P(x) is defined by the length histogram. We calculate
887 	 * the area in a piecewise fashion, iterating through the length histogram
888 	 * bins. Each bin is a trapezoid:
889 	 *
890 	 *		 P(x2)
891 	 *		  /|
892 	 *		 / |
893 	 * P(x1)/  |
894 	 *	   |   |
895 	 *	   |   |
896 	 *	---+---+--
897 	 *	   x1  x2
898 	 *
899 	 * where x1 and x2 are the boundaries of the current histogram, and P(x1)
900 	 * and P(x1) are the cumulative fraction of tuples at the boundaries.
901 	 *
902 	 * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
903 	 *
904 	 * The first bin contains the lower bound passed by the caller, so we
905 	 * use linear interpolation between the previous and next histogram bin
906 	 * boundary to calculate P(x1). Likewise for the last bin: we use linear
907 	 * interpolation to calculate P(x2). For the bins in between, x1 and x2
908 	 * lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
909 	 * P(x1) =	  (bin index) / (number of bins)
910 	 * P(x2) = (bin index + 1 / (number of bins)
911 	 */
912 
913 	/* First bin, the one that contains lower bound */
914 	i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
915 	if (i >= length_hist_nvalues - 1)
916 		return 1.0;
917 
918 	if (i < 0)
919 	{
920 		i = 0;
921 		pos = 0.0;
922 	}
923 	else
924 	{
925 		/* interpolate length1's position in the bin */
926 		pos = get_len_position(length1,
927 							   DatumGetFloat8(length_hist_values[i]),
928 							   DatumGetFloat8(length_hist_values[i + 1]));
929 	}
930 	PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
931 	B = length1;
932 
933 	/*
934 	 * In the degenerate case that length1 == length2, simply return
935 	 * P(length1). This is not merely an optimization: if length1 == length2,
936 	 * we'd divide by zero later on.
937 	 */
938 	if (length2 == length1)
939 		return PB;
940 
941 	/*
942 	 * Loop through all the bins, until we hit the last bin, the one that
943 	 * contains the upper bound. (if lower and upper bounds are in the same
944 	 * bin, this falls out immediately)
945 	 */
946 	area = 0.0;
947 	for (; i < length_hist_nvalues - 1; i++)
948 	{
949 		double		bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
950 
951 		/* check if we've reached the last bin */
952 		if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
953 			break;
954 
955 		/* the upper bound of previous bin is the lower bound of this bin */
956 		A = B;
957 		PA = PB;
958 
959 		B = bin_upper;
960 		PB = (double) i / (double) (length_hist_nvalues - 1);
961 
962 		/*
963 		 * Add the area of this trapezoid to the total. The point of the
964 		 * if-check is to avoid NaN, in the corner case that PA == PB == 0,
965 		 * and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
966 		 * 0) is zero, regardless of the width (B - A).
967 		 */
968 		if (PA > 0 || PB > 0)
969 			area += 0.5 * (PB + PA) * (B - A);
970 	}
971 
972 	/* Last bin */
973 	A = B;
974 	PA = PB;
975 
976 	B = length2;				/* last bin ends at the query upper bound */
977 	if (i >= length_hist_nvalues - 1)
978 		pos = 0.0;
979 	else
980 	{
981 		if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
982 			pos = 0.0;
983 		else
984 			pos = get_len_position(length2, DatumGetFloat8(length_hist_values[i]), DatumGetFloat8(length_hist_values[i + 1]));
985 	}
986 	PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
987 
988 	if (PA > 0 || PB > 0)
989 		area += 0.5 * (PB + PA) * (B - A);
990 
991 	/*
992 	 * Ok, we have calculated the area, ie. the integral. Divide by width to
993 	 * get the requested average.
994 	 *
995 	 * Avoid NaN arising from infinite / infinite. This happens at least if
996 	 * length2 is infinite. It's not clear what the correct value would be in
997 	 * that case, so 0.5 seems as good as any value.
998 	 */
999 	if (is_infinite(area) && is_infinite(length2))
1000 		frac = 0.5;
1001 	else
1002 		frac = area / (length2 - length1);
1003 
1004 	return frac;
1005 }
1006 
1007 /*
1008  * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
1009  * of ranges that fall within the constant lower and upper bounds. This uses
1010  * the histograms of range lower bounds and range lengths, on the assumption
1011  * that the range lengths are independent of the lower bounds.
1012  *
1013  * The caller has already checked that constant lower and upper bounds are
1014  * finite.
1015  */
1016 static double
calc_hist_selectivity_contained(TypeCacheEntry * typcache,RangeBound * lower,RangeBound * upper,RangeBound * hist_lower,int hist_nvalues,Datum * length_hist_values,int length_hist_nvalues)1017 calc_hist_selectivity_contained(TypeCacheEntry *typcache,
1018 								RangeBound *lower, RangeBound *upper,
1019 								RangeBound *hist_lower, int hist_nvalues,
1020 								Datum *length_hist_values, int length_hist_nvalues)
1021 {
1022 	int			i,
1023 				upper_index;
1024 	float8		prev_dist;
1025 	double		bin_width;
1026 	double		upper_bin_width;
1027 	double		sum_frac;
1028 
1029 	/*
1030 	 * Begin by finding the bin containing the upper bound, in the lower bound
1031 	 * histogram. Any range with a lower bound > constant upper bound can't
1032 	 * match, ie. there are no matches in bins greater than upper_index.
1033 	 */
1034 	upper->inclusive = !upper->inclusive;
1035 	upper->lower = true;
1036 	upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
1037 								 false);
1038 
1039 	/*
1040 	 * If the upper bound value is below the histogram's lower limit, there
1041 	 * are no matches.
1042 	 */
1043 	if (upper_index < 0)
1044 		return 0.0;
1045 
1046 	/*
1047 	 * If the upper bound value is at or beyond the histogram's upper limit,
1048 	 * start our loop at the last actual bin, as though the upper bound were
1049 	 * within that bin; get_position will clamp its result to 1.0 anyway.
1050 	 * (This corresponds to assuming that the data population above the
1051 	 * histogram's upper limit is empty, exactly like what we just assumed for
1052 	 * the lower limit.)
1053 	 */
1054 	upper_index = Min(upper_index, hist_nvalues - 2);
1055 
1056 	/*
1057 	 * Calculate upper_bin_width, ie. the fraction of the (upper_index,
1058 	 * upper_index + 1) bin which is greater than upper bound of query range
1059 	 * using linear interpolation of subdiff function.
1060 	 */
1061 	upper_bin_width = get_position(typcache, upper,
1062 								   &hist_lower[upper_index],
1063 								   &hist_lower[upper_index + 1]);
1064 
1065 	/*
1066 	 * In the loop, dist and prev_dist are the distance of the "current" bin's
1067 	 * lower and upper bounds from the constant upper bound.
1068 	 *
1069 	 * bin_width represents the width of the current bin. Normally it is 1.0,
1070 	 * meaning a full width bin, but can be less in the corner cases: start
1071 	 * and end of the loop. We start with bin_width = upper_bin_width, because
1072 	 * we begin at the bin containing the upper bound.
1073 	 */
1074 	prev_dist = 0.0;
1075 	bin_width = upper_bin_width;
1076 
1077 	sum_frac = 0.0;
1078 	for (i = upper_index; i >= 0; i--)
1079 	{
1080 		double		dist;
1081 		double		length_hist_frac;
1082 		bool		final_bin = false;
1083 
1084 		/*
1085 		 * dist -- distance from upper bound of query range to lower bound of
1086 		 * the current bin in the lower bound histogram. Or to the lower bound
1087 		 * of the constant range, if this is the final bin, containing the
1088 		 * constant lower bound.
1089 		 */
1090 		if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
1091 		{
1092 			dist = get_distance(typcache, lower, upper);
1093 
1094 			/*
1095 			 * Subtract from bin_width the portion of this bin that we want to
1096 			 * ignore.
1097 			 */
1098 			bin_width -= get_position(typcache, lower, &hist_lower[i],
1099 									  &hist_lower[i + 1]);
1100 			if (bin_width < 0.0)
1101 				bin_width = 0.0;
1102 			final_bin = true;
1103 		}
1104 		else
1105 			dist = get_distance(typcache, &hist_lower[i], upper);
1106 
1107 		/*
1108 		 * Estimate the fraction of tuples in this bin that are narrow enough
1109 		 * to not exceed the distance to the upper bound of the query range.
1110 		 */
1111 		length_hist_frac = calc_length_hist_frac(length_hist_values,
1112 												 length_hist_nvalues,
1113 												 prev_dist, dist, true);
1114 
1115 		/*
1116 		 * Add the fraction of tuples in this bin, with a suitable length, to
1117 		 * the total.
1118 		 */
1119 		sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1120 
1121 		if (final_bin)
1122 			break;
1123 
1124 		bin_width = 1.0;
1125 		prev_dist = dist;
1126 	}
1127 
1128 	return sum_frac;
1129 }
1130 
1131 /*
1132  * Calculate selectivity of "var @> const" operator, ie. estimate the fraction
1133  * of ranges that contain the constant lower and upper bounds. This uses
1134  * the histograms of range lower bounds and range lengths, on the assumption
1135  * that the range lengths are independent of the lower bounds.
1136  */
1137 static double
calc_hist_selectivity_contains(TypeCacheEntry * typcache,RangeBound * lower,RangeBound * upper,RangeBound * hist_lower,int hist_nvalues,Datum * length_hist_values,int length_hist_nvalues)1138 calc_hist_selectivity_contains(TypeCacheEntry *typcache,
1139 							   RangeBound *lower, RangeBound *upper,
1140 							   RangeBound *hist_lower, int hist_nvalues,
1141 							   Datum *length_hist_values, int length_hist_nvalues)
1142 {
1143 	int			i,
1144 				lower_index;
1145 	double		bin_width,
1146 				lower_bin_width;
1147 	double		sum_frac;
1148 	float8		prev_dist;
1149 
1150 	/* Find the bin containing the lower bound of query range. */
1151 	lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
1152 								 true);
1153 
1154 	/*
1155 	 * If the lower bound value is below the histogram's lower limit, there
1156 	 * are no matches.
1157 	 */
1158 	if (lower_index < 0)
1159 		return 0.0;
1160 
1161 	/*
1162 	 * If the lower bound value is at or beyond the histogram's upper limit,
1163 	 * start our loop at the last actual bin, as though the upper bound were
1164 	 * within that bin; get_position will clamp its result to 1.0 anyway.
1165 	 * (This corresponds to assuming that the data population above the
1166 	 * histogram's upper limit is empty, exactly like what we just assumed for
1167 	 * the lower limit.)
1168 	 */
1169 	lower_index = Min(lower_index, hist_nvalues - 2);
1170 
1171 	/*
1172 	 * Calculate lower_bin_width, ie. the fraction of the of (lower_index,
1173 	 * lower_index + 1) bin which is greater than lower bound of query range
1174 	 * using linear interpolation of subdiff function.
1175 	 */
1176 	lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
1177 								   &hist_lower[lower_index + 1]);
1178 
1179 	/*
1180 	 * Loop through all the lower bound bins, smaller than the query lower
1181 	 * bound. In the loop, dist and prev_dist are the distance of the
1182 	 * "current" bin's lower and upper bounds from the constant upper bound.
1183 	 * We begin from query lower bound, and walk backwards, so the first bin's
1184 	 * upper bound is the query lower bound, and its distance to the query
1185 	 * upper bound is the length of the query range.
1186 	 *
1187 	 * bin_width represents the width of the current bin. Normally it is 1.0,
1188 	 * meaning a full width bin, except for the first bin, which is only
1189 	 * counted up to the constant lower bound.
1190 	 */
1191 	prev_dist = get_distance(typcache, lower, upper);
1192 	sum_frac = 0.0;
1193 	bin_width = lower_bin_width;
1194 	for (i = lower_index; i >= 0; i--)
1195 	{
1196 		float8		dist;
1197 		double		length_hist_frac;
1198 
1199 		/*
1200 		 * dist -- distance from upper bound of query range to current value
1201 		 * of lower bound histogram or lower bound of query range (if we've
1202 		 * reach it).
1203 		 */
1204 		dist = get_distance(typcache, &hist_lower[i], upper);
1205 
1206 		/*
1207 		 * Get average fraction of length histogram which covers intervals
1208 		 * longer than (or equal to) distance to upper bound of query range.
1209 		 */
1210 		length_hist_frac =
1211 			1.0 - calc_length_hist_frac(length_hist_values,
1212 										length_hist_nvalues,
1213 										prev_dist, dist, false);
1214 
1215 		sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1216 
1217 		bin_width = 1.0;
1218 		prev_dist = dist;
1219 	}
1220 
1221 	return sum_frac;
1222 }
1223