1 /*-------------------------------------------------------------------------
2  *
3  * rangetypes_selfuncs.c
4  *	  Functions for selectivity estimation of range operators
5  *
6  * Estimates are based on histograms of lower and upper bounds, and the
7  * fraction of empty ranges.
8  *
9  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
10  * Portions Copyright (c) 1994, Regents of the University of California
11  *
12  *
13  * IDENTIFICATION
14  *	  src/backend/utils/adt/rangetypes_selfuncs.c
15  *
16  *-------------------------------------------------------------------------
17  */
18 #include "postgres.h"
19 
20 #include <math.h>
21 
22 #include "access/htup_details.h"
23 #include "catalog/pg_operator.h"
24 #include "catalog/pg_statistic.h"
25 #include "catalog/pg_type.h"
26 #include "utils/builtins.h"
27 #include "utils/lsyscache.h"
28 #include "utils/rangetypes.h"
29 #include "utils/selfuncs.h"
30 #include "utils/typcache.h"
31 
32 static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
33 			  RangeType *constval, Oid operator);
34 static double default_range_selectivity(Oid operator);
35 static double calc_hist_selectivity(TypeCacheEntry *typcache,
36 					  VariableStatData *vardata, RangeType *constval,
37 					  Oid operator);
38 static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
39 							 RangeBound *constbound,
40 							 RangeBound *hist, int hist_nvalues,
41 							 bool equal);
42 static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value,
43 			   RangeBound *hist, int hist_length, bool equal);
44 static float8 get_position(TypeCacheEntry *typcache, RangeBound *value,
45 			 RangeBound *hist1, RangeBound *hist2);
46 static float8 get_len_position(double value, double hist1, double hist2);
47 static float8 get_distance(TypeCacheEntry *typcache, RangeBound *bound1,
48 			 RangeBound *bound2);
49 static int length_hist_bsearch(Datum *length_hist_values,
50 					int length_hist_nvalues, double value, bool equal);
51 static double calc_length_hist_frac(Datum *length_hist_values,
52 		int length_hist_nvalues, double length1, double length2, bool equal);
53 static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
54 								RangeBound *lower, RangeBound *upper,
55 								RangeBound *hist_lower, int hist_nvalues,
56 						 Datum *length_hist_values, int length_hist_nvalues);
57 static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
58 							   RangeBound *lower, RangeBound *upper,
59 							   RangeBound *hist_lower, int hist_nvalues,
60 						 Datum *length_hist_values, int length_hist_nvalues);
61 
62 /*
63  * Returns a default selectivity estimate for given operator, when we don't
64  * have statistics or cannot use them for some reason.
65  */
66 static double
default_range_selectivity(Oid operator)67 default_range_selectivity(Oid operator)
68 {
69 	switch (operator)
70 	{
71 		case OID_RANGE_OVERLAP_OP:
72 			return 0.01;
73 
74 		case OID_RANGE_CONTAINS_OP:
75 		case OID_RANGE_CONTAINED_OP:
76 			return 0.005;
77 
78 		case OID_RANGE_CONTAINS_ELEM_OP:
79 		case OID_RANGE_ELEM_CONTAINED_OP:
80 
81 			/*
82 			 * "range @> elem" is more or less identical to a scalar
83 			 * inequality "A >= b AND A <= c".
84 			 */
85 			return DEFAULT_RANGE_INEQ_SEL;
86 
87 		case OID_RANGE_LESS_OP:
88 		case OID_RANGE_LESS_EQUAL_OP:
89 		case OID_RANGE_GREATER_OP:
90 		case OID_RANGE_GREATER_EQUAL_OP:
91 		case OID_RANGE_LEFT_OP:
92 		case OID_RANGE_RIGHT_OP:
93 		case OID_RANGE_OVERLAPS_LEFT_OP:
94 		case OID_RANGE_OVERLAPS_RIGHT_OP:
95 			/* these are similar to regular scalar inequalities */
96 			return DEFAULT_INEQ_SEL;
97 
98 		default:
99 			/* all range operators should be handled above, but just in case */
100 			return 0.01;
101 	}
102 }
103 
104 /*
105  * rangesel -- restriction selectivity for range operators
106  */
107 Datum
rangesel(PG_FUNCTION_ARGS)108 rangesel(PG_FUNCTION_ARGS)
109 {
110 	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
111 	Oid			operator = PG_GETARG_OID(1);
112 	List	   *args = (List *) PG_GETARG_POINTER(2);
113 	int			varRelid = PG_GETARG_INT32(3);
114 	VariableStatData vardata;
115 	Node	   *other;
116 	bool		varonleft;
117 	Selectivity selec;
118 	TypeCacheEntry *typcache = NULL;
119 	RangeType  *constrange = NULL;
120 
121 	/*
122 	 * If expression is not (variable op something) or (something op
123 	 * variable), then punt and return a default estimate.
124 	 */
125 	if (!get_restriction_variable(root, args, varRelid,
126 								  &vardata, &other, &varonleft))
127 		PG_RETURN_FLOAT8(default_range_selectivity(operator));
128 
129 	/*
130 	 * Can't do anything useful if the something is not a constant, either.
131 	 */
132 	if (!IsA(other, Const))
133 	{
134 		ReleaseVariableStats(vardata);
135 		PG_RETURN_FLOAT8(default_range_selectivity(operator));
136 	}
137 
138 	/*
139 	 * All the range operators are strict, so we can cope with a NULL constant
140 	 * right away.
141 	 */
142 	if (((Const *) other)->constisnull)
143 	{
144 		ReleaseVariableStats(vardata);
145 		PG_RETURN_FLOAT8(0.0);
146 	}
147 
148 	/*
149 	 * If var is on the right, commute the operator, so that we can assume the
150 	 * var is on the left in what follows.
151 	 */
152 	if (!varonleft)
153 	{
154 		/* we have other Op var, commute to make var Op other */
155 		operator = get_commutator(operator);
156 		if (!operator)
157 		{
158 			/* Use default selectivity (should we raise an error instead?) */
159 			ReleaseVariableStats(vardata);
160 			PG_RETURN_FLOAT8(default_range_selectivity(operator));
161 		}
162 	}
163 
164 	/*
165 	 * OK, there's a Var and a Const we're dealing with here.  We need the
166 	 * Const to be of same range type as the column, else we can't do anything
167 	 * useful. (Such cases will likely fail at runtime, but here we'd rather
168 	 * just return a default estimate.)
169 	 *
170 	 * If the operator is "range @> element", the constant should be of the
171 	 * element type of the range column. Convert it to a range that includes
172 	 * only that single point, so that we don't need special handling for that
173 	 * in what follows.
174 	 */
175 	if (operator == OID_RANGE_CONTAINS_ELEM_OP)
176 	{
177 		typcache = range_get_typcache(fcinfo, vardata.vartype);
178 
179 		if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
180 		{
181 			RangeBound	lower,
182 						upper;
183 
184 			lower.inclusive = true;
185 			lower.val = ((Const *) other)->constvalue;
186 			lower.infinite = false;
187 			lower.lower = true;
188 			upper.inclusive = true;
189 			upper.val = ((Const *) other)->constvalue;
190 			upper.infinite = false;
191 			upper.lower = false;
192 			constrange = range_serialize(typcache, &lower, &upper, false);
193 		}
194 	}
195 	else if (operator == OID_RANGE_ELEM_CONTAINED_OP)
196 	{
197 		/*
198 		 * Here, the Var is the elem, not the range.  For now we just punt and
199 		 * return the default estimate.  In future we could disassemble the
200 		 * range constant and apply scalarineqsel ...
201 		 */
202 	}
203 	else if (((Const *) other)->consttype == vardata.vartype)
204 	{
205 		/* Both sides are the same range type */
206 		typcache = range_get_typcache(fcinfo, vardata.vartype);
207 
208 		constrange = DatumGetRangeType(((Const *) other)->constvalue);
209 	}
210 
211 	/*
212 	 * If we got a valid constant on one side of the operator, proceed to
213 	 * estimate using statistics. Otherwise punt and return a default constant
214 	 * estimate.  Note that calc_rangesel need not handle
215 	 * OID_RANGE_ELEM_CONTAINED_OP.
216 	 */
217 	if (constrange)
218 		selec = calc_rangesel(typcache, &vardata, constrange, operator);
219 	else
220 		selec = default_range_selectivity(operator);
221 
222 	ReleaseVariableStats(vardata);
223 
224 	CLAMP_PROBABILITY(selec);
225 
226 	PG_RETURN_FLOAT8((float8) selec);
227 }
228 
229 static double
calc_rangesel(TypeCacheEntry * typcache,VariableStatData * vardata,RangeType * constval,Oid operator)230 calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
231 			  RangeType *constval, Oid operator)
232 {
233 	double		hist_selec;
234 	double		selec;
235 	float4		empty_frac,
236 				null_frac;
237 
238 	/*
239 	 * First look up the fraction of NULLs and empty ranges from pg_statistic.
240 	 */
241 	if (HeapTupleIsValid(vardata->statsTuple))
242 	{
243 		Form_pg_statistic stats;
244 		float4	   *numbers;
245 		int			nnumbers;
246 
247 		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
248 		null_frac = stats->stanullfrac;
249 
250 		/* Try to get fraction of empty ranges */
251 		if (get_attstatsslot(vardata->statsTuple,
252 							 FLOAT8OID, -1,
253 							 STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
254 							 InvalidOid,
255 							 NULL,
256 							 NULL, NULL,
257 							 &numbers, &nnumbers))
258 		{
259 			if (nnumbers != 1)
260 				elog(ERROR, "invalid empty fraction statistic");		/* shouldn't happen */
261 			empty_frac = numbers[0];
262 			free_attstatsslot(FLOAT8OID, NULL, 0, numbers, nnumbers);
263 		}
264 		else
265 		{
266 			/* No empty fraction statistic. Assume no empty ranges. */
267 			empty_frac = 0.0;
268 		}
269 	}
270 	else
271 	{
272 		/*
273 		 * No stats are available. Follow through the calculations below
274 		 * anyway, assuming no NULLs and no empty ranges. This still allows us
275 		 * to give a better-than-nothing estimate based on whether the
276 		 * constant is an empty range or not.
277 		 */
278 		null_frac = 0.0;
279 		empty_frac = 0.0;
280 	}
281 
282 	if (RangeIsEmpty(constval))
283 	{
284 		/*
285 		 * An empty range matches all ranges, all empty ranges, or nothing,
286 		 * depending on the operator
287 		 */
288 		switch (operator)
289 		{
290 				/* these return false if either argument is empty */
291 			case OID_RANGE_OVERLAP_OP:
292 			case OID_RANGE_OVERLAPS_LEFT_OP:
293 			case OID_RANGE_OVERLAPS_RIGHT_OP:
294 			case OID_RANGE_LEFT_OP:
295 			case OID_RANGE_RIGHT_OP:
296 				/* nothing is less than an empty range */
297 			case OID_RANGE_LESS_OP:
298 				selec = 0.0;
299 				break;
300 
301 				/* only empty ranges can be contained by an empty range */
302 			case OID_RANGE_CONTAINED_OP:
303 				/* only empty ranges are <= an empty range */
304 			case OID_RANGE_LESS_EQUAL_OP:
305 				selec = empty_frac;
306 				break;
307 
308 				/* everything contains an empty range */
309 			case OID_RANGE_CONTAINS_OP:
310 				/* everything is >= an empty range */
311 			case OID_RANGE_GREATER_EQUAL_OP:
312 				selec = 1.0;
313 				break;
314 
315 				/* all non-empty ranges are > an empty range */
316 			case OID_RANGE_GREATER_OP:
317 				selec = 1.0 - empty_frac;
318 				break;
319 
320 				/* an element cannot be empty */
321 			case OID_RANGE_CONTAINS_ELEM_OP:
322 			default:
323 				elog(ERROR, "unexpected operator %u", operator);
324 				selec = 0.0;	/* keep compiler quiet */
325 				break;
326 		}
327 	}
328 	else
329 	{
330 		/*
331 		 * Calculate selectivity using bound histograms. If that fails for
332 		 * some reason, e.g no histogram in pg_statistic, use the default
333 		 * constant estimate for the fraction of non-empty values. This is
334 		 * still somewhat better than just returning the default estimate,
335 		 * because this still takes into account the fraction of empty and
336 		 * NULL tuples, if we had statistics for them.
337 		 */
338 		hist_selec = calc_hist_selectivity(typcache, vardata, constval,
339 										   operator);
340 		if (hist_selec < 0.0)
341 			hist_selec = default_range_selectivity(operator);
342 
343 		/*
344 		 * Now merge the results for the empty ranges and histogram
345 		 * calculations, realizing that the histogram covers only the
346 		 * non-null, non-empty values.
347 		 */
348 		if (operator == OID_RANGE_CONTAINED_OP)
349 		{
350 			/* empty is contained by anything non-empty */
351 			selec = (1.0 - empty_frac) * hist_selec + empty_frac;
352 		}
353 		else
354 		{
355 			/* with any other operator, empty Op non-empty matches nothing */
356 			selec = (1.0 - empty_frac) * hist_selec;
357 		}
358 	}
359 
360 	/* all range operators are strict */
361 	selec *= (1.0 - null_frac);
362 
363 	/* result should be in range, but make sure... */
364 	CLAMP_PROBABILITY(selec);
365 
366 	return selec;
367 }
368 
369 /*
370  * Calculate range operator selectivity using histograms of range bounds.
371  *
372  * This estimate is for the portion of values that are not empty and not
373  * NULL.
374  */
375 static double
calc_hist_selectivity(TypeCacheEntry * typcache,VariableStatData * vardata,RangeType * constval,Oid operator)376 calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
377 					  RangeType *constval, Oid operator)
378 {
379 	Datum	   *hist_values;
380 	int			nhist;
381 	Datum	   *length_hist_values = NULL;
382 	int			length_nhist = 0;
383 	RangeBound *hist_lower;
384 	RangeBound *hist_upper;
385 	int			i;
386 	RangeBound	const_lower;
387 	RangeBound	const_upper;
388 	bool		empty;
389 	double		hist_selec;
390 
391 	/* Can't use the histogram with insecure range support functions */
392 	if (!statistic_proc_security_check(vardata,
393 									   typcache->rng_cmp_proc_finfo.fn_oid))
394 		return -1;
395 	if (OidIsValid(typcache->rng_subdiff_finfo.fn_oid) &&
396 		!statistic_proc_security_check(vardata,
397 									   typcache->rng_subdiff_finfo.fn_oid))
398 		return -1;
399 
400 	/* Try to get histogram of ranges */
401 	if (!(HeapTupleIsValid(vardata->statsTuple) &&
402 		  get_attstatsslot(vardata->statsTuple,
403 						   vardata->atttype, vardata->atttypmod,
404 						   STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
405 						   NULL,
406 						   &hist_values, &nhist,
407 						   NULL, NULL)))
408 		return -1.0;
409 
410 	/* check that it's a histogram, not just a dummy entry */
411 	if (nhist < 2)
412 	{
413 		free_attstatsslot(vardata->atttype, hist_values, nhist, NULL, 0);
414 		return -1.0;
415 	}
416 
417 	/*
418 	 * Convert histogram of ranges into histograms of its lower and upper
419 	 * bounds.
420 	 */
421 	hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
422 	hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
423 	for (i = 0; i < nhist; i++)
424 	{
425 		range_deserialize(typcache, DatumGetRangeType(hist_values[i]),
426 						  &hist_lower[i], &hist_upper[i], &empty);
427 		/* The histogram should not contain any empty ranges */
428 		if (empty)
429 			elog(ERROR, "bounds histogram contains an empty range");
430 	}
431 
432 	/* @> and @< also need a histogram of range lengths */
433 	if (operator == OID_RANGE_CONTAINS_OP ||
434 		operator == OID_RANGE_CONTAINED_OP)
435 	{
436 		if (!(HeapTupleIsValid(vardata->statsTuple) &&
437 			  get_attstatsslot(vardata->statsTuple,
438 							   FLOAT8OID, -1,
439 							   STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
440 							   InvalidOid,
441 							   NULL,
442 							   &length_hist_values, &length_nhist,
443 							   NULL, NULL)))
444 		{
445 			free_attstatsslot(vardata->atttype, hist_values, nhist, NULL, 0);
446 			return -1.0;
447 		}
448 
449 		/* check that it's a histogram, not just a dummy entry */
450 		if (length_nhist < 2)
451 		{
452 			free_attstatsslot(FLOAT8OID,
453 							  length_hist_values, length_nhist, NULL, 0);
454 			free_attstatsslot(vardata->atttype, hist_values, nhist, NULL, 0);
455 			return -1.0;
456 		}
457 	}
458 
459 	/* Extract the bounds of the constant value. */
460 	range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
461 	Assert(!empty);
462 
463 	/*
464 	 * Calculate selectivity comparing the lower or upper bound of the
465 	 * constant with the histogram of lower or upper bounds.
466 	 */
467 	switch (operator)
468 	{
469 		case OID_RANGE_LESS_OP:
470 
471 			/*
472 			 * The regular b-tree comparison operators (<, <=, >, >=) compare
473 			 * the lower bounds first, and the upper bounds for values with
474 			 * equal lower bounds. Estimate that by comparing the lower bounds
475 			 * only. This gives a fairly accurate estimate assuming there
476 			 * aren't many rows with a lower bound equal to the constant's
477 			 * lower bound.
478 			 */
479 			hist_selec =
480 				calc_hist_selectivity_scalar(typcache, &const_lower,
481 											 hist_lower, nhist, false);
482 			break;
483 
484 		case OID_RANGE_LESS_EQUAL_OP:
485 			hist_selec =
486 				calc_hist_selectivity_scalar(typcache, &const_lower,
487 											 hist_lower, nhist, true);
488 			break;
489 
490 		case OID_RANGE_GREATER_OP:
491 			hist_selec =
492 				1 - calc_hist_selectivity_scalar(typcache, &const_lower,
493 												 hist_lower, nhist, false);
494 			break;
495 
496 		case OID_RANGE_GREATER_EQUAL_OP:
497 			hist_selec =
498 				1 - calc_hist_selectivity_scalar(typcache, &const_lower,
499 												 hist_lower, nhist, true);
500 			break;
501 
502 		case OID_RANGE_LEFT_OP:
503 			/* var << const when upper(var) < lower(const) */
504 			hist_selec =
505 				calc_hist_selectivity_scalar(typcache, &const_lower,
506 											 hist_upper, nhist, false);
507 			break;
508 
509 		case OID_RANGE_RIGHT_OP:
510 			/* var >> const when lower(var) > upper(const) */
511 			hist_selec =
512 				1 - calc_hist_selectivity_scalar(typcache, &const_upper,
513 												 hist_lower, nhist, true);
514 			break;
515 
516 		case OID_RANGE_OVERLAPS_RIGHT_OP:
517 			/* compare lower bounds */
518 			hist_selec =
519 				1 - calc_hist_selectivity_scalar(typcache, &const_lower,
520 												 hist_lower, nhist, false);
521 			break;
522 
523 		case OID_RANGE_OVERLAPS_LEFT_OP:
524 			/* compare upper bounds */
525 			hist_selec =
526 				calc_hist_selectivity_scalar(typcache, &const_upper,
527 											 hist_upper, nhist, true);
528 			break;
529 
530 		case OID_RANGE_OVERLAP_OP:
531 		case OID_RANGE_CONTAINS_ELEM_OP:
532 
533 			/*
534 			 * A && B <=> NOT (A << B OR A >> B).
535 			 *
536 			 * Since A << B and A >> B are mutually exclusive events we can
537 			 * sum their probabilities to find probability of (A << B OR A >>
538 			 * B).
539 			 *
540 			 * "range @> elem" is equivalent to "range && [elem,elem]". The
541 			 * caller already constructed the singular range from the element
542 			 * constant, so just treat it the same as &&.
543 			 */
544 			hist_selec =
545 				calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
546 											 nhist, false);
547 			hist_selec +=
548 				(1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
549 													nhist, true));
550 			hist_selec = 1.0 - hist_selec;
551 			break;
552 
553 		case OID_RANGE_CONTAINS_OP:
554 			hist_selec =
555 				calc_hist_selectivity_contains(typcache, &const_lower,
556 											 &const_upper, hist_lower, nhist,
557 										   length_hist_values, length_nhist);
558 			break;
559 
560 		case OID_RANGE_CONTAINED_OP:
561 			if (const_lower.infinite)
562 			{
563 				/*
564 				 * Lower bound no longer matters. Just estimate the fraction
565 				 * with an upper bound <= const upper bound
566 				 */
567 				hist_selec =
568 					calc_hist_selectivity_scalar(typcache, &const_upper,
569 												 hist_upper, nhist, true);
570 			}
571 			else if (const_upper.infinite)
572 			{
573 				hist_selec =
574 					1.0 - calc_hist_selectivity_scalar(typcache, &const_lower,
575 												   hist_lower, nhist, false);
576 			}
577 			else
578 			{
579 				hist_selec =
580 					calc_hist_selectivity_contained(typcache, &const_lower,
581 											 &const_upper, hist_lower, nhist,
582 										   length_hist_values, length_nhist);
583 			}
584 			break;
585 
586 		default:
587 			elog(ERROR, "unknown range operator %u", operator);
588 			hist_selec = -1.0;	/* keep compiler quiet */
589 			break;
590 	}
591 
592 	free_attstatsslot(FLOAT8OID,
593 					  length_hist_values, length_nhist, NULL, 0);
594 	free_attstatsslot(vardata->atttype, hist_values, nhist, NULL, 0);
595 
596 	return hist_selec;
597 }
598 
599 
600 /*
601  * Look up the fraction of values less than (or equal, if 'equal' argument
602  * is true) a given const in a histogram of range bounds.
603  */
604 static double
calc_hist_selectivity_scalar(TypeCacheEntry * typcache,RangeBound * constbound,RangeBound * hist,int hist_nvalues,bool equal)605 calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound,
606 							 RangeBound *hist, int hist_nvalues, bool equal)
607 {
608 	Selectivity selec;
609 	int			index;
610 
611 	/*
612 	 * Find the histogram bin the given constant falls into. Estimate
613 	 * selectivity as the number of preceding whole bins.
614 	 */
615 	index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
616 	selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
617 
618 	/* Adjust using linear interpolation within the bin */
619 	if (index >= 0 && index < hist_nvalues - 1)
620 		selec += get_position(typcache, constbound, &hist[index],
621 						&hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
622 
623 	return selec;
624 }
625 
626 /*
627  * Binary search on an array of range bounds. Returns greatest index of range
628  * bound in array which is less(less or equal) than given range bound. If all
629  * range bounds in array are greater or equal(greater) than given range bound,
630  * return -1. When "equal" flag is set conditions in brackets are used.
631  *
632  * This function is used in scalar operator selectivity estimation. Another
633  * goal of this function is to find a histogram bin where to stop
634  * interpolation of portion of bounds which are less than or equal to given bound.
635  */
636 static int
rbound_bsearch(TypeCacheEntry * typcache,RangeBound * value,RangeBound * hist,int hist_length,bool equal)637 rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
638 			   int hist_length, bool equal)
639 {
640 	int			lower = -1,
641 				upper = hist_length - 1,
642 				cmp,
643 				middle;
644 
645 	while (lower < upper)
646 	{
647 		middle = (lower + upper + 1) / 2;
648 		cmp = range_cmp_bounds(typcache, &hist[middle], value);
649 
650 		if (cmp < 0 || (equal && cmp == 0))
651 			lower = middle;
652 		else
653 			upper = middle - 1;
654 	}
655 	return lower;
656 }
657 
658 
659 /*
660  * Binary search on length histogram. Returns greatest index of range length in
661  * histogram which is less than (less than or equal) the given length value. If
662  * all lengths in the histogram are greater than (greater than or equal) the
663  * given length, returns -1.
664  */
665 static int
length_hist_bsearch(Datum * length_hist_values,int length_hist_nvalues,double value,bool equal)666 length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
667 					double value, bool equal)
668 {
669 	int			lower = -1,
670 				upper = length_hist_nvalues - 1,
671 				middle;
672 
673 	while (lower < upper)
674 	{
675 		double		middleval;
676 
677 		middle = (lower + upper + 1) / 2;
678 
679 		middleval = DatumGetFloat8(length_hist_values[middle]);
680 		if (middleval < value || (equal && middleval <= value))
681 			lower = middle;
682 		else
683 			upper = middle - 1;
684 	}
685 	return lower;
686 }
687 
688 /*
689  * Get relative position of value in histogram bin in [0,1] range.
690  */
691 static float8
get_position(TypeCacheEntry * typcache,RangeBound * value,RangeBound * hist1,RangeBound * hist2)692 get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
693 			 RangeBound *hist2)
694 {
695 	bool		has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
696 	float8		position;
697 
698 	if (!hist1->infinite && !hist2->infinite)
699 	{
700 		float8		bin_width;
701 
702 		/*
703 		 * Both bounds are finite. Assuming the subtype's comparison function
704 		 * works sanely, the value must be finite, too, because it lies
705 		 * somewhere between the bounds.  If it doesn't, arbitrarily return
706 		 * 0.5.
707 		 */
708 		if (value->infinite)
709 			return 0.5;
710 
711 		/* Can't interpolate without subdiff function */
712 		if (!has_subdiff)
713 			return 0.5;
714 
715 		/* Calculate relative position using subdiff function. */
716 		bin_width = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
717 													 typcache->rng_collation,
718 													 hist2->val,
719 													 hist1->val));
720 		if (isnan(bin_width) || bin_width <= 0.0)
721 			return 0.5;			/* punt for NaN or zero-width bin */
722 
723 		position = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
724 													typcache->rng_collation,
725 													value->val,
726 													hist1->val))
727 			/ bin_width;
728 
729 		if (isnan(position))
730 			return 0.5;			/* punt for NaN from subdiff, Inf/Inf, etc */
731 
732 		/* Relative position must be in [0,1] range */
733 		position = Max(position, 0.0);
734 		position = Min(position, 1.0);
735 		return position;
736 	}
737 	else if (hist1->infinite && !hist2->infinite)
738 	{
739 		/*
740 		 * Lower bin boundary is -infinite, upper is finite. If the value is
741 		 * -infinite, return 0.0 to indicate it's equal to the lower bound.
742 		 * Otherwise return 1.0 to indicate it's infinitely far from the lower
743 		 * bound.
744 		 */
745 		return ((value->infinite && value->lower) ? 0.0 : 1.0);
746 	}
747 	else if (!hist1->infinite && hist2->infinite)
748 	{
749 		/* same as above, but in reverse */
750 		return ((value->infinite && !value->lower) ? 1.0 : 0.0);
751 	}
752 	else
753 	{
754 		/*
755 		 * If both bin boundaries are infinite, they should be equal to each
756 		 * other, and the value should also be infinite and equal to both
757 		 * bounds. (But don't Assert that, to avoid crashing if a user creates
758 		 * a datatype with a broken comparison function).
759 		 *
760 		 * Assume the value to lie in the middle of the infinite bounds.
761 		 */
762 		return 0.5;
763 	}
764 }
765 
766 
767 /*
768  * Get relative position of value in a length histogram bin in [0,1] range.
769  */
770 static double
get_len_position(double value,double hist1,double hist2)771 get_len_position(double value, double hist1, double hist2)
772 {
773 	if (!is_infinite(hist1) && !is_infinite(hist2))
774 	{
775 		/*
776 		 * Both bounds are finite. The value should be finite too, because it
777 		 * lies somewhere between the bounds. If it doesn't, just return
778 		 * something.
779 		 */
780 		if (is_infinite(value))
781 			return 0.5;
782 
783 		return 1.0 - (hist2 - value) / (hist2 - hist1);
784 	}
785 	else if (is_infinite(hist1) && !is_infinite(hist2))
786 	{
787 		/*
788 		 * Lower bin boundary is -infinite, upper is finite. Return 1.0 to
789 		 * indicate the value is infinitely far from the lower bound.
790 		 */
791 		return 1.0;
792 	}
793 	else if (is_infinite(hist1) && is_infinite(hist2))
794 	{
795 		/* same as above, but in reverse */
796 		return 0.0;
797 	}
798 	else
799 	{
800 		/*
801 		 * If both bin boundaries are infinite, they should be equal to each
802 		 * other, and the value should also be infinite and equal to both
803 		 * bounds. (But don't Assert that, to avoid crashing unnecessarily if
804 		 * the caller messes up)
805 		 *
806 		 * Assume the value to lie in the middle of the infinite bounds.
807 		 */
808 		return 0.5;
809 	}
810 }
811 
812 /*
813  * Measure distance between two range bounds.
814  */
815 static float8
get_distance(TypeCacheEntry * typcache,RangeBound * bound1,RangeBound * bound2)816 get_distance(TypeCacheEntry *typcache, RangeBound *bound1, RangeBound *bound2)
817 {
818 	bool		has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
819 
820 	if (!bound1->infinite && !bound2->infinite)
821 	{
822 		/*
823 		 * Neither bound is infinite, use subdiff function or return default
824 		 * value of 1.0 if no subdiff is available.
825 		 */
826 		if (has_subdiff)
827 		{
828 			float8		res;
829 
830 			res = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
831 												   typcache->rng_collation,
832 												   bound2->val,
833 												   bound1->val));
834 			/* Reject possible NaN result, also negative result */
835 			if (isnan(res) || res < 0.0)
836 				return 1.0;
837 			else
838 				return res;
839 		}
840 		else
841 			return 1.0;
842 	}
843 	else if (bound1->infinite && bound2->infinite)
844 	{
845 		/* Both bounds are infinite */
846 		if (bound1->lower == bound2->lower)
847 			return 0.0;
848 		else
849 			return get_float8_infinity();
850 	}
851 	else
852 	{
853 		/* One bound is infinite, the other is not */
854 		return get_float8_infinity();
855 	}
856 }
857 
858 /*
859  * Calculate the average of function P(x), in the interval [length1, length2],
860  * where P(x) is the fraction of tuples with length < x (or length <= x if
861  * 'equal' is true).
862  */
863 static double
calc_length_hist_frac(Datum * length_hist_values,int length_hist_nvalues,double length1,double length2,bool equal)864 calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
865 					  double length1, double length2, bool equal)
866 {
867 	double		frac;
868 	double		A,
869 				B,
870 				PA,
871 				PB;
872 	double		pos;
873 	int			i;
874 	double		area;
875 
876 	Assert(length2 >= length1);
877 
878 	if (length2 < 0.0)
879 		return 0.0;				/* shouldn't happen, but doesn't hurt to check */
880 
881 	/* All lengths in the table are <= infinite. */
882 	if (is_infinite(length2) && equal)
883 		return 1.0;
884 
885 	/*----------
886 	 * The average of a function between A and B can be calculated by the
887 	 * formula:
888 	 *
889 	 *			B
890 	 *	  1		/
891 	 * -------	| P(x)dx
892 	 *	B - A	/
893 	 *			A
894 	 *
895 	 * The geometrical interpretation of the integral is the area under the
896 	 * graph of P(x). P(x) is defined by the length histogram. We calculate
897 	 * the area in a piecewise fashion, iterating through the length histogram
898 	 * bins. Each bin is a trapezoid:
899 	 *
900 	 *		 P(x2)
901 	 *		  /|
902 	 *		 / |
903 	 * P(x1)/  |
904 	 *	   |   |
905 	 *	   |   |
906 	 *	---+---+--
907 	 *	   x1  x2
908 	 *
909 	 * where x1 and x2 are the boundaries of the current histogram, and P(x1)
910 	 * and P(x1) are the cumulative fraction of tuples at the boundaries.
911 	 *
912 	 * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
913 	 *
914 	 * The first bin contains the lower bound passed by the caller, so we
915 	 * use linear interpolation between the previous and next histogram bin
916 	 * boundary to calculate P(x1). Likewise for the last bin: we use linear
917 	 * interpolation to calculate P(x2). For the bins in between, x1 and x2
918 	 * lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
919 	 * P(x1) =	  (bin index) / (number of bins)
920 	 * P(x2) = (bin index + 1 / (number of bins)
921 	 */
922 
923 	/* First bin, the one that contains lower bound */
924 	i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
925 	if (i >= length_hist_nvalues - 1)
926 		return 1.0;
927 
928 	if (i < 0)
929 	{
930 		i = 0;
931 		pos = 0.0;
932 	}
933 	else
934 	{
935 		/* interpolate length1's position in the bin */
936 		pos = get_len_position(length1,
937 							   DatumGetFloat8(length_hist_values[i]),
938 							   DatumGetFloat8(length_hist_values[i + 1]));
939 	}
940 	PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
941 	B = length1;
942 
943 	/*
944 	 * In the degenerate case that length1 == length2, simply return
945 	 * P(length1). This is not merely an optimization: if length1 == length2,
946 	 * we'd divide by zero later on.
947 	 */
948 	if (length2 == length1)
949 		return PB;
950 
951 	/*
952 	 * Loop through all the bins, until we hit the last bin, the one that
953 	 * contains the upper bound. (if lower and upper bounds are in the same
954 	 * bin, this falls out immediately)
955 	 */
956 	area = 0.0;
957 	for (; i < length_hist_nvalues - 1; i++)
958 	{
959 		double		bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
960 
961 		/* check if we've reached the last bin */
962 		if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
963 			break;
964 
965 		/* the upper bound of previous bin is the lower bound of this bin */
966 		A = B;
967 		PA = PB;
968 
969 		B = bin_upper;
970 		PB = (double) i / (double) (length_hist_nvalues - 1);
971 
972 		/*
973 		 * Add the area of this trapezoid to the total. The point of the
974 		 * if-check is to avoid NaN, in the corner case that PA == PB == 0,
975 		 * and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
976 		 * 0) is zero, regardless of the width (B - A).
977 		 */
978 		if (PA > 0 || PB > 0)
979 			area += 0.5 * (PB + PA) * (B - A);
980 	}
981 
982 	/* Last bin */
983 	A = B;
984 	PA = PB;
985 
986 	B = length2;				/* last bin ends at the query upper bound */
987 	if (i >= length_hist_nvalues - 1)
988 		pos = 0.0;
989 	else
990 	{
991 		if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
992 			pos = 0.0;
993 		else
994 			pos = get_len_position(length2, DatumGetFloat8(length_hist_values[i]), DatumGetFloat8(length_hist_values[i + 1]));
995 	}
996 	PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
997 
998 	if (PA > 0 || PB > 0)
999 		area += 0.5 * (PB + PA) * (B - A);
1000 
1001 	/*
1002 	 * Ok, we have calculated the area, ie. the integral. Divide by width to
1003 	 * get the requested average.
1004 	 *
1005 	 * Avoid NaN arising from infinite / infinite. This happens at least if
1006 	 * length2 is infinite. It's not clear what the correct value would be in
1007 	 * that case, so 0.5 seems as good as any value.
1008 	 */
1009 	if (is_infinite(area) && is_infinite(length2))
1010 		frac = 0.5;
1011 	else
1012 		frac = area / (length2 - length1);
1013 
1014 	return frac;
1015 }
1016 
1017 /*
1018  * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
1019  * of ranges that fall within the constant lower and upper bounds. This uses
1020  * the histograms of range lower bounds and range lengths, on the assumption
1021  * that the range lengths are independent of the lower bounds.
1022  *
1023  * The caller has already checked that constant lower and upper bounds are
1024  * finite.
1025  */
1026 static double
calc_hist_selectivity_contained(TypeCacheEntry * typcache,RangeBound * lower,RangeBound * upper,RangeBound * hist_lower,int hist_nvalues,Datum * length_hist_values,int length_hist_nvalues)1027 calc_hist_selectivity_contained(TypeCacheEntry *typcache,
1028 								RangeBound *lower, RangeBound *upper,
1029 								RangeBound *hist_lower, int hist_nvalues,
1030 						  Datum *length_hist_values, int length_hist_nvalues)
1031 {
1032 	int			i,
1033 				upper_index;
1034 	float8		prev_dist;
1035 	double		bin_width;
1036 	double		upper_bin_width;
1037 	double		sum_frac;
1038 
1039 	/*
1040 	 * Begin by finding the bin containing the upper bound, in the lower bound
1041 	 * histogram. Any range with a lower bound > constant upper bound can't
1042 	 * match, ie. there are no matches in bins greater than upper_index.
1043 	 */
1044 	upper->inclusive = !upper->inclusive;
1045 	upper->lower = true;
1046 	upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
1047 								 false);
1048 
1049 	/*
1050 	 * If the upper bound value is below the histogram's lower limit, there
1051 	 * are no matches.
1052 	 */
1053 	if (upper_index < 0)
1054 		return 0.0;
1055 
1056 	/*
1057 	 * If the upper bound value is at or beyond the histogram's upper limit,
1058 	 * start our loop at the last actual bin, as though the upper bound were
1059 	 * within that bin; get_position will clamp its result to 1.0 anyway.
1060 	 * (This corresponds to assuming that the data population above the
1061 	 * histogram's upper limit is empty, exactly like what we just assumed for
1062 	 * the lower limit.)
1063 	 */
1064 	upper_index = Min(upper_index, hist_nvalues - 2);
1065 
1066 	/*
1067 	 * Calculate upper_bin_width, ie. the fraction of the (upper_index,
1068 	 * upper_index + 1) bin which is greater than upper bound of query range
1069 	 * using linear interpolation of subdiff function.
1070 	 */
1071 	upper_bin_width = get_position(typcache, upper,
1072 								   &hist_lower[upper_index],
1073 								   &hist_lower[upper_index + 1]);
1074 
1075 	/*
1076 	 * In the loop, dist and prev_dist are the distance of the "current" bin's
1077 	 * lower and upper bounds from the constant upper bound.
1078 	 *
1079 	 * bin_width represents the width of the current bin. Normally it is 1.0,
1080 	 * meaning a full width bin, but can be less in the corner cases: start
1081 	 * and end of the loop. We start with bin_width = upper_bin_width, because
1082 	 * we begin at the bin containing the upper bound.
1083 	 */
1084 	prev_dist = 0.0;
1085 	bin_width = upper_bin_width;
1086 
1087 	sum_frac = 0.0;
1088 	for (i = upper_index; i >= 0; i--)
1089 	{
1090 		double		dist;
1091 		double		length_hist_frac;
1092 		bool		final_bin = false;
1093 
1094 		/*
1095 		 * dist -- distance from upper bound of query range to lower bound of
1096 		 * the current bin in the lower bound histogram. Or to the lower bound
1097 		 * of the constant range, if this is the final bin, containing the
1098 		 * constant lower bound.
1099 		 */
1100 		if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
1101 		{
1102 			dist = get_distance(typcache, lower, upper);
1103 
1104 			/*
1105 			 * Subtract from bin_width the portion of this bin that we want to
1106 			 * ignore.
1107 			 */
1108 			bin_width -= get_position(typcache, lower, &hist_lower[i],
1109 									  &hist_lower[i + 1]);
1110 			if (bin_width < 0.0)
1111 				bin_width = 0.0;
1112 			final_bin = true;
1113 		}
1114 		else
1115 			dist = get_distance(typcache, &hist_lower[i], upper);
1116 
1117 		/*
1118 		 * Estimate the fraction of tuples in this bin that are narrow enough
1119 		 * to not exceed the distance to the upper bound of the query range.
1120 		 */
1121 		length_hist_frac = calc_length_hist_frac(length_hist_values,
1122 												 length_hist_nvalues,
1123 												 prev_dist, dist, true);
1124 
1125 		/*
1126 		 * Add the fraction of tuples in this bin, with a suitable length, to
1127 		 * the total.
1128 		 */
1129 		sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1130 
1131 		if (final_bin)
1132 			break;
1133 
1134 		bin_width = 1.0;
1135 		prev_dist = dist;
1136 	}
1137 
1138 	return sum_frac;
1139 }
1140 
1141 /*
1142  * Calculate selectivity of "var @> const" operator, ie. estimate the fraction
1143  * of ranges that contain the constant lower and upper bounds. This uses
1144  * the histograms of range lower bounds and range lengths, on the assumption
1145  * that the range lengths are independent of the lower bounds.
1146  */
1147 static double
calc_hist_selectivity_contains(TypeCacheEntry * typcache,RangeBound * lower,RangeBound * upper,RangeBound * hist_lower,int hist_nvalues,Datum * length_hist_values,int length_hist_nvalues)1148 calc_hist_selectivity_contains(TypeCacheEntry *typcache,
1149 							   RangeBound *lower, RangeBound *upper,
1150 							   RangeBound *hist_lower, int hist_nvalues,
1151 						  Datum *length_hist_values, int length_hist_nvalues)
1152 {
1153 	int			i,
1154 				lower_index;
1155 	double		bin_width,
1156 				lower_bin_width;
1157 	double		sum_frac;
1158 	float8		prev_dist;
1159 
1160 	/* Find the bin containing the lower bound of query range. */
1161 	lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
1162 								 true);
1163 
1164 	/*
1165 	 * If the lower bound value is below the histogram's lower limit, there
1166 	 * are no matches.
1167 	 */
1168 	if (lower_index < 0)
1169 		return 0.0;
1170 
1171 	/*
1172 	 * If the lower bound value is at or beyond the histogram's upper limit,
1173 	 * start our loop at the last actual bin, as though the upper bound were
1174 	 * within that bin; get_position will clamp its result to 1.0 anyway.
1175 	 * (This corresponds to assuming that the data population above the
1176 	 * histogram's upper limit is empty, exactly like what we just assumed for
1177 	 * the lower limit.)
1178 	 */
1179 	lower_index = Min(lower_index, hist_nvalues - 2);
1180 
1181 	/*
1182 	 * Calculate lower_bin_width, ie. the fraction of the of (lower_index,
1183 	 * lower_index + 1) bin which is greater than lower bound of query range
1184 	 * using linear interpolation of subdiff function.
1185 	 */
1186 	lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
1187 								   &hist_lower[lower_index + 1]);
1188 
1189 	/*
1190 	 * Loop through all the lower bound bins, smaller than the query lower
1191 	 * bound. In the loop, dist and prev_dist are the distance of the
1192 	 * "current" bin's lower and upper bounds from the constant upper bound.
1193 	 * We begin from query lower bound, and walk backwards, so the first bin's
1194 	 * upper bound is the query lower bound, and its distance to the query
1195 	 * upper bound is the length of the query range.
1196 	 *
1197 	 * bin_width represents the width of the current bin. Normally it is 1.0,
1198 	 * meaning a full width bin, except for the first bin, which is only
1199 	 * counted up to the constant lower bound.
1200 	 */
1201 	prev_dist = get_distance(typcache, lower, upper);
1202 	sum_frac = 0.0;
1203 	bin_width = lower_bin_width;
1204 	for (i = lower_index; i >= 0; i--)
1205 	{
1206 		float8		dist;
1207 		double		length_hist_frac;
1208 
1209 		/*
1210 		 * dist -- distance from upper bound of query range to current value
1211 		 * of lower bound histogram or lower bound of query range (if we've
1212 		 * reach it).
1213 		 */
1214 		dist = get_distance(typcache, &hist_lower[i], upper);
1215 
1216 		/*
1217 		 * Get average fraction of length histogram which covers intervals
1218 		 * longer than (or equal to) distance to upper bound of query range.
1219 		 */
1220 		length_hist_frac =
1221 			1.0 - calc_length_hist_frac(length_hist_values,
1222 										length_hist_nvalues,
1223 										prev_dist, dist, false);
1224 
1225 		sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1226 
1227 		bin_width = 1.0;
1228 		prev_dist = dist;
1229 	}
1230 
1231 	return sum_frac;
1232 }
1233