1 /*-------------------------------------------------------------------------
2 *
3 * rangetypes_selfuncs.c
4 * Functions for selectivity estimation of range operators
5 *
6 * Estimates are based on histograms of lower and upper bounds, and the
7 * fraction of empty ranges.
8 *
9 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994, Regents of the University of California
11 *
12 *
13 * IDENTIFICATION
14 * src/backend/utils/adt/rangetypes_selfuncs.c
15 *
16 *-------------------------------------------------------------------------
17 */
18 #include "postgres.h"
19
20 #include <math.h>
21
22 #include "access/htup_details.h"
23 #include "catalog/pg_operator.h"
24 #include "catalog/pg_statistic.h"
25 #include "catalog/pg_type.h"
26 #include "utils/builtins.h"
27 #include "utils/lsyscache.h"
28 #include "utils/rangetypes.h"
29 #include "utils/selfuncs.h"
30 #include "utils/typcache.h"
31
32 static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
33 RangeType *constval, Oid operator);
34 static double default_range_selectivity(Oid operator);
35 static double calc_hist_selectivity(TypeCacheEntry *typcache,
36 VariableStatData *vardata, RangeType *constval,
37 Oid operator);
38 static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
39 RangeBound *constbound,
40 RangeBound *hist, int hist_nvalues,
41 bool equal);
42 static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value,
43 RangeBound *hist, int hist_length, bool equal);
44 static float8 get_position(TypeCacheEntry *typcache, RangeBound *value,
45 RangeBound *hist1, RangeBound *hist2);
46 static float8 get_len_position(double value, double hist1, double hist2);
47 static float8 get_distance(TypeCacheEntry *typcache, RangeBound *bound1,
48 RangeBound *bound2);
49 static int length_hist_bsearch(Datum *length_hist_values,
50 int length_hist_nvalues, double value, bool equal);
51 static double calc_length_hist_frac(Datum *length_hist_values,
52 int length_hist_nvalues, double length1, double length2, bool equal);
53 static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
54 RangeBound *lower, RangeBound *upper,
55 RangeBound *hist_lower, int hist_nvalues,
56 Datum *length_hist_values, int length_hist_nvalues);
57 static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
58 RangeBound *lower, RangeBound *upper,
59 RangeBound *hist_lower, int hist_nvalues,
60 Datum *length_hist_values, int length_hist_nvalues);
61
62 /*
63 * Returns a default selectivity estimate for given operator, when we don't
64 * have statistics or cannot use them for some reason.
65 */
66 static double
default_range_selectivity(Oid operator)67 default_range_selectivity(Oid operator)
68 {
69 switch (operator)
70 {
71 case OID_RANGE_OVERLAP_OP:
72 return 0.01;
73
74 case OID_RANGE_CONTAINS_OP:
75 case OID_RANGE_CONTAINED_OP:
76 return 0.005;
77
78 case OID_RANGE_CONTAINS_ELEM_OP:
79 case OID_RANGE_ELEM_CONTAINED_OP:
80
81 /*
82 * "range @> elem" is more or less identical to a scalar
83 * inequality "A >= b AND A <= c".
84 */
85 return DEFAULT_RANGE_INEQ_SEL;
86
87 case OID_RANGE_LESS_OP:
88 case OID_RANGE_LESS_EQUAL_OP:
89 case OID_RANGE_GREATER_OP:
90 case OID_RANGE_GREATER_EQUAL_OP:
91 case OID_RANGE_LEFT_OP:
92 case OID_RANGE_RIGHT_OP:
93 case OID_RANGE_OVERLAPS_LEFT_OP:
94 case OID_RANGE_OVERLAPS_RIGHT_OP:
95 /* these are similar to regular scalar inequalities */
96 return DEFAULT_INEQ_SEL;
97
98 default:
99 /* all range operators should be handled above, but just in case */
100 return 0.01;
101 }
102 }
103
104 /*
105 * rangesel -- restriction selectivity for range operators
106 */
107 Datum
rangesel(PG_FUNCTION_ARGS)108 rangesel(PG_FUNCTION_ARGS)
109 {
110 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
111 Oid operator = PG_GETARG_OID(1);
112 List *args = (List *) PG_GETARG_POINTER(2);
113 int varRelid = PG_GETARG_INT32(3);
114 VariableStatData vardata;
115 Node *other;
116 bool varonleft;
117 Selectivity selec;
118 TypeCacheEntry *typcache = NULL;
119 RangeType *constrange = NULL;
120
121 /*
122 * If expression is not (variable op something) or (something op
123 * variable), then punt and return a default estimate.
124 */
125 if (!get_restriction_variable(root, args, varRelid,
126 &vardata, &other, &varonleft))
127 PG_RETURN_FLOAT8(default_range_selectivity(operator));
128
129 /*
130 * Can't do anything useful if the something is not a constant, either.
131 */
132 if (!IsA(other, Const))
133 {
134 ReleaseVariableStats(vardata);
135 PG_RETURN_FLOAT8(default_range_selectivity(operator));
136 }
137
138 /*
139 * All the range operators are strict, so we can cope with a NULL constant
140 * right away.
141 */
142 if (((Const *) other)->constisnull)
143 {
144 ReleaseVariableStats(vardata);
145 PG_RETURN_FLOAT8(0.0);
146 }
147
148 /*
149 * If var is on the right, commute the operator, so that we can assume the
150 * var is on the left in what follows.
151 */
152 if (!varonleft)
153 {
154 /* we have other Op var, commute to make var Op other */
155 operator = get_commutator(operator);
156 if (!operator)
157 {
158 /* Use default selectivity (should we raise an error instead?) */
159 ReleaseVariableStats(vardata);
160 PG_RETURN_FLOAT8(default_range_selectivity(operator));
161 }
162 }
163
164 /*
165 * OK, there's a Var and a Const we're dealing with here. We need the
166 * Const to be of same range type as the column, else we can't do anything
167 * useful. (Such cases will likely fail at runtime, but here we'd rather
168 * just return a default estimate.)
169 *
170 * If the operator is "range @> element", the constant should be of the
171 * element type of the range column. Convert it to a range that includes
172 * only that single point, so that we don't need special handling for that
173 * in what follows.
174 */
175 if (operator == OID_RANGE_CONTAINS_ELEM_OP)
176 {
177 typcache = range_get_typcache(fcinfo, vardata.vartype);
178
179 if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
180 {
181 RangeBound lower,
182 upper;
183
184 lower.inclusive = true;
185 lower.val = ((Const *) other)->constvalue;
186 lower.infinite = false;
187 lower.lower = true;
188 upper.inclusive = true;
189 upper.val = ((Const *) other)->constvalue;
190 upper.infinite = false;
191 upper.lower = false;
192 constrange = range_serialize(typcache, &lower, &upper, false);
193 }
194 }
195 else if (operator == OID_RANGE_ELEM_CONTAINED_OP)
196 {
197 /*
198 * Here, the Var is the elem, not the range. For now we just punt and
199 * return the default estimate. In future we could disassemble the
200 * range constant and apply scalarineqsel ...
201 */
202 }
203 else if (((Const *) other)->consttype == vardata.vartype)
204 {
205 /* Both sides are the same range type */
206 typcache = range_get_typcache(fcinfo, vardata.vartype);
207
208 constrange = DatumGetRangeType(((Const *) other)->constvalue);
209 }
210
211 /*
212 * If we got a valid constant on one side of the operator, proceed to
213 * estimate using statistics. Otherwise punt and return a default constant
214 * estimate. Note that calc_rangesel need not handle
215 * OID_RANGE_ELEM_CONTAINED_OP.
216 */
217 if (constrange)
218 selec = calc_rangesel(typcache, &vardata, constrange, operator);
219 else
220 selec = default_range_selectivity(operator);
221
222 ReleaseVariableStats(vardata);
223
224 CLAMP_PROBABILITY(selec);
225
226 PG_RETURN_FLOAT8((float8) selec);
227 }
228
229 static double
calc_rangesel(TypeCacheEntry * typcache,VariableStatData * vardata,RangeType * constval,Oid operator)230 calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
231 RangeType *constval, Oid operator)
232 {
233 double hist_selec;
234 double selec;
235 float4 empty_frac,
236 null_frac;
237
238 /*
239 * First look up the fraction of NULLs and empty ranges from pg_statistic.
240 */
241 if (HeapTupleIsValid(vardata->statsTuple))
242 {
243 Form_pg_statistic stats;
244 float4 *numbers;
245 int nnumbers;
246
247 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
248 null_frac = stats->stanullfrac;
249
250 /* Try to get fraction of empty ranges */
251 if (get_attstatsslot(vardata->statsTuple,
252 FLOAT8OID, -1,
253 STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
254 InvalidOid,
255 NULL,
256 NULL, NULL,
257 &numbers, &nnumbers))
258 {
259 if (nnumbers != 1)
260 elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
261 empty_frac = numbers[0];
262 free_attstatsslot(FLOAT8OID, NULL, 0, numbers, nnumbers);
263 }
264 else
265 {
266 /* No empty fraction statistic. Assume no empty ranges. */
267 empty_frac = 0.0;
268 }
269 }
270 else
271 {
272 /*
273 * No stats are available. Follow through the calculations below
274 * anyway, assuming no NULLs and no empty ranges. This still allows us
275 * to give a better-than-nothing estimate based on whether the
276 * constant is an empty range or not.
277 */
278 null_frac = 0.0;
279 empty_frac = 0.0;
280 }
281
282 if (RangeIsEmpty(constval))
283 {
284 /*
285 * An empty range matches all ranges, all empty ranges, or nothing,
286 * depending on the operator
287 */
288 switch (operator)
289 {
290 /* these return false if either argument is empty */
291 case OID_RANGE_OVERLAP_OP:
292 case OID_RANGE_OVERLAPS_LEFT_OP:
293 case OID_RANGE_OVERLAPS_RIGHT_OP:
294 case OID_RANGE_LEFT_OP:
295 case OID_RANGE_RIGHT_OP:
296 /* nothing is less than an empty range */
297 case OID_RANGE_LESS_OP:
298 selec = 0.0;
299 break;
300
301 /* only empty ranges can be contained by an empty range */
302 case OID_RANGE_CONTAINED_OP:
303 /* only empty ranges are <= an empty range */
304 case OID_RANGE_LESS_EQUAL_OP:
305 selec = empty_frac;
306 break;
307
308 /* everything contains an empty range */
309 case OID_RANGE_CONTAINS_OP:
310 /* everything is >= an empty range */
311 case OID_RANGE_GREATER_EQUAL_OP:
312 selec = 1.0;
313 break;
314
315 /* all non-empty ranges are > an empty range */
316 case OID_RANGE_GREATER_OP:
317 selec = 1.0 - empty_frac;
318 break;
319
320 /* an element cannot be empty */
321 case OID_RANGE_CONTAINS_ELEM_OP:
322 default:
323 elog(ERROR, "unexpected operator %u", operator);
324 selec = 0.0; /* keep compiler quiet */
325 break;
326 }
327 }
328 else
329 {
330 /*
331 * Calculate selectivity using bound histograms. If that fails for
332 * some reason, e.g no histogram in pg_statistic, use the default
333 * constant estimate for the fraction of non-empty values. This is
334 * still somewhat better than just returning the default estimate,
335 * because this still takes into account the fraction of empty and
336 * NULL tuples, if we had statistics for them.
337 */
338 hist_selec = calc_hist_selectivity(typcache, vardata, constval,
339 operator);
340 if (hist_selec < 0.0)
341 hist_selec = default_range_selectivity(operator);
342
343 /*
344 * Now merge the results for the empty ranges and histogram
345 * calculations, realizing that the histogram covers only the
346 * non-null, non-empty values.
347 */
348 if (operator == OID_RANGE_CONTAINED_OP)
349 {
350 /* empty is contained by anything non-empty */
351 selec = (1.0 - empty_frac) * hist_selec + empty_frac;
352 }
353 else
354 {
355 /* with any other operator, empty Op non-empty matches nothing */
356 selec = (1.0 - empty_frac) * hist_selec;
357 }
358 }
359
360 /* all range operators are strict */
361 selec *= (1.0 - null_frac);
362
363 /* result should be in range, but make sure... */
364 CLAMP_PROBABILITY(selec);
365
366 return selec;
367 }
368
369 /*
370 * Calculate range operator selectivity using histograms of range bounds.
371 *
372 * This estimate is for the portion of values that are not empty and not
373 * NULL.
374 */
375 static double
calc_hist_selectivity(TypeCacheEntry * typcache,VariableStatData * vardata,RangeType * constval,Oid operator)376 calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
377 RangeType *constval, Oid operator)
378 {
379 Datum *hist_values;
380 int nhist;
381 Datum *length_hist_values = NULL;
382 int length_nhist = 0;
383 RangeBound *hist_lower;
384 RangeBound *hist_upper;
385 int i;
386 RangeBound const_lower;
387 RangeBound const_upper;
388 bool empty;
389 double hist_selec;
390
391 /* Can't use the histogram with insecure range support functions */
392 if (!statistic_proc_security_check(vardata,
393 typcache->rng_cmp_proc_finfo.fn_oid))
394 return -1;
395 if (OidIsValid(typcache->rng_subdiff_finfo.fn_oid) &&
396 !statistic_proc_security_check(vardata,
397 typcache->rng_subdiff_finfo.fn_oid))
398 return -1;
399
400 /* Try to get histogram of ranges */
401 if (!(HeapTupleIsValid(vardata->statsTuple) &&
402 get_attstatsslot(vardata->statsTuple,
403 vardata->atttype, vardata->atttypmod,
404 STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
405 NULL,
406 &hist_values, &nhist,
407 NULL, NULL)))
408 return -1.0;
409
410 /* check that it's a histogram, not just a dummy entry */
411 if (nhist < 2)
412 {
413 free_attstatsslot(vardata->atttype, hist_values, nhist, NULL, 0);
414 return -1.0;
415 }
416
417 /*
418 * Convert histogram of ranges into histograms of its lower and upper
419 * bounds.
420 */
421 hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
422 hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
423 for (i = 0; i < nhist; i++)
424 {
425 range_deserialize(typcache, DatumGetRangeType(hist_values[i]),
426 &hist_lower[i], &hist_upper[i], &empty);
427 /* The histogram should not contain any empty ranges */
428 if (empty)
429 elog(ERROR, "bounds histogram contains an empty range");
430 }
431
432 /* @> and @< also need a histogram of range lengths */
433 if (operator == OID_RANGE_CONTAINS_OP ||
434 operator == OID_RANGE_CONTAINED_OP)
435 {
436 if (!(HeapTupleIsValid(vardata->statsTuple) &&
437 get_attstatsslot(vardata->statsTuple,
438 FLOAT8OID, -1,
439 STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
440 InvalidOid,
441 NULL,
442 &length_hist_values, &length_nhist,
443 NULL, NULL)))
444 {
445 free_attstatsslot(vardata->atttype, hist_values, nhist, NULL, 0);
446 return -1.0;
447 }
448
449 /* check that it's a histogram, not just a dummy entry */
450 if (length_nhist < 2)
451 {
452 free_attstatsslot(FLOAT8OID,
453 length_hist_values, length_nhist, NULL, 0);
454 free_attstatsslot(vardata->atttype, hist_values, nhist, NULL, 0);
455 return -1.0;
456 }
457 }
458
459 /* Extract the bounds of the constant value. */
460 range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
461 Assert(!empty);
462
463 /*
464 * Calculate selectivity comparing the lower or upper bound of the
465 * constant with the histogram of lower or upper bounds.
466 */
467 switch (operator)
468 {
469 case OID_RANGE_LESS_OP:
470
471 /*
472 * The regular b-tree comparison operators (<, <=, >, >=) compare
473 * the lower bounds first, and the upper bounds for values with
474 * equal lower bounds. Estimate that by comparing the lower bounds
475 * only. This gives a fairly accurate estimate assuming there
476 * aren't many rows with a lower bound equal to the constant's
477 * lower bound.
478 */
479 hist_selec =
480 calc_hist_selectivity_scalar(typcache, &const_lower,
481 hist_lower, nhist, false);
482 break;
483
484 case OID_RANGE_LESS_EQUAL_OP:
485 hist_selec =
486 calc_hist_selectivity_scalar(typcache, &const_lower,
487 hist_lower, nhist, true);
488 break;
489
490 case OID_RANGE_GREATER_OP:
491 hist_selec =
492 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
493 hist_lower, nhist, false);
494 break;
495
496 case OID_RANGE_GREATER_EQUAL_OP:
497 hist_selec =
498 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
499 hist_lower, nhist, true);
500 break;
501
502 case OID_RANGE_LEFT_OP:
503 /* var << const when upper(var) < lower(const) */
504 hist_selec =
505 calc_hist_selectivity_scalar(typcache, &const_lower,
506 hist_upper, nhist, false);
507 break;
508
509 case OID_RANGE_RIGHT_OP:
510 /* var >> const when lower(var) > upper(const) */
511 hist_selec =
512 1 - calc_hist_selectivity_scalar(typcache, &const_upper,
513 hist_lower, nhist, true);
514 break;
515
516 case OID_RANGE_OVERLAPS_RIGHT_OP:
517 /* compare lower bounds */
518 hist_selec =
519 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
520 hist_lower, nhist, false);
521 break;
522
523 case OID_RANGE_OVERLAPS_LEFT_OP:
524 /* compare upper bounds */
525 hist_selec =
526 calc_hist_selectivity_scalar(typcache, &const_upper,
527 hist_upper, nhist, true);
528 break;
529
530 case OID_RANGE_OVERLAP_OP:
531 case OID_RANGE_CONTAINS_ELEM_OP:
532
533 /*
534 * A && B <=> NOT (A << B OR A >> B).
535 *
536 * Since A << B and A >> B are mutually exclusive events we can
537 * sum their probabilities to find probability of (A << B OR A >>
538 * B).
539 *
540 * "range @> elem" is equivalent to "range && [elem,elem]". The
541 * caller already constructed the singular range from the element
542 * constant, so just treat it the same as &&.
543 */
544 hist_selec =
545 calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
546 nhist, false);
547 hist_selec +=
548 (1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
549 nhist, true));
550 hist_selec = 1.0 - hist_selec;
551 break;
552
553 case OID_RANGE_CONTAINS_OP:
554 hist_selec =
555 calc_hist_selectivity_contains(typcache, &const_lower,
556 &const_upper, hist_lower, nhist,
557 length_hist_values, length_nhist);
558 break;
559
560 case OID_RANGE_CONTAINED_OP:
561 if (const_lower.infinite)
562 {
563 /*
564 * Lower bound no longer matters. Just estimate the fraction
565 * with an upper bound <= const upper bound
566 */
567 hist_selec =
568 calc_hist_selectivity_scalar(typcache, &const_upper,
569 hist_upper, nhist, true);
570 }
571 else if (const_upper.infinite)
572 {
573 hist_selec =
574 1.0 - calc_hist_selectivity_scalar(typcache, &const_lower,
575 hist_lower, nhist, false);
576 }
577 else
578 {
579 hist_selec =
580 calc_hist_selectivity_contained(typcache, &const_lower,
581 &const_upper, hist_lower, nhist,
582 length_hist_values, length_nhist);
583 }
584 break;
585
586 default:
587 elog(ERROR, "unknown range operator %u", operator);
588 hist_selec = -1.0; /* keep compiler quiet */
589 break;
590 }
591
592 free_attstatsslot(FLOAT8OID,
593 length_hist_values, length_nhist, NULL, 0);
594 free_attstatsslot(vardata->atttype, hist_values, nhist, NULL, 0);
595
596 return hist_selec;
597 }
598
599
600 /*
601 * Look up the fraction of values less than (or equal, if 'equal' argument
602 * is true) a given const in a histogram of range bounds.
603 */
604 static double
calc_hist_selectivity_scalar(TypeCacheEntry * typcache,RangeBound * constbound,RangeBound * hist,int hist_nvalues,bool equal)605 calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound,
606 RangeBound *hist, int hist_nvalues, bool equal)
607 {
608 Selectivity selec;
609 int index;
610
611 /*
612 * Find the histogram bin the given constant falls into. Estimate
613 * selectivity as the number of preceding whole bins.
614 */
615 index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
616 selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
617
618 /* Adjust using linear interpolation within the bin */
619 if (index >= 0 && index < hist_nvalues - 1)
620 selec += get_position(typcache, constbound, &hist[index],
621 &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
622
623 return selec;
624 }
625
626 /*
627 * Binary search on an array of range bounds. Returns greatest index of range
628 * bound in array which is less(less or equal) than given range bound. If all
629 * range bounds in array are greater or equal(greater) than given range bound,
630 * return -1. When "equal" flag is set conditions in brackets are used.
631 *
632 * This function is used in scalar operator selectivity estimation. Another
633 * goal of this function is to find a histogram bin where to stop
634 * interpolation of portion of bounds which are less than or equal to given bound.
635 */
636 static int
rbound_bsearch(TypeCacheEntry * typcache,RangeBound * value,RangeBound * hist,int hist_length,bool equal)637 rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
638 int hist_length, bool equal)
639 {
640 int lower = -1,
641 upper = hist_length - 1,
642 cmp,
643 middle;
644
645 while (lower < upper)
646 {
647 middle = (lower + upper + 1) / 2;
648 cmp = range_cmp_bounds(typcache, &hist[middle], value);
649
650 if (cmp < 0 || (equal && cmp == 0))
651 lower = middle;
652 else
653 upper = middle - 1;
654 }
655 return lower;
656 }
657
658
659 /*
660 * Binary search on length histogram. Returns greatest index of range length in
661 * histogram which is less than (less than or equal) the given length value. If
662 * all lengths in the histogram are greater than (greater than or equal) the
663 * given length, returns -1.
664 */
665 static int
length_hist_bsearch(Datum * length_hist_values,int length_hist_nvalues,double value,bool equal)666 length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
667 double value, bool equal)
668 {
669 int lower = -1,
670 upper = length_hist_nvalues - 1,
671 middle;
672
673 while (lower < upper)
674 {
675 double middleval;
676
677 middle = (lower + upper + 1) / 2;
678
679 middleval = DatumGetFloat8(length_hist_values[middle]);
680 if (middleval < value || (equal && middleval <= value))
681 lower = middle;
682 else
683 upper = middle - 1;
684 }
685 return lower;
686 }
687
688 /*
689 * Get relative position of value in histogram bin in [0,1] range.
690 */
691 static float8
get_position(TypeCacheEntry * typcache,RangeBound * value,RangeBound * hist1,RangeBound * hist2)692 get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
693 RangeBound *hist2)
694 {
695 bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
696 float8 position;
697
698 if (!hist1->infinite && !hist2->infinite)
699 {
700 float8 bin_width;
701
702 /*
703 * Both bounds are finite. Assuming the subtype's comparison function
704 * works sanely, the value must be finite, too, because it lies
705 * somewhere between the bounds. If it doesn't, arbitrarily return
706 * 0.5.
707 */
708 if (value->infinite)
709 return 0.5;
710
711 /* Can't interpolate without subdiff function */
712 if (!has_subdiff)
713 return 0.5;
714
715 /* Calculate relative position using subdiff function. */
716 bin_width = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
717 typcache->rng_collation,
718 hist2->val,
719 hist1->val));
720 if (isnan(bin_width) || bin_width <= 0.0)
721 return 0.5; /* punt for NaN or zero-width bin */
722
723 position = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
724 typcache->rng_collation,
725 value->val,
726 hist1->val))
727 / bin_width;
728
729 if (isnan(position))
730 return 0.5; /* punt for NaN from subdiff, Inf/Inf, etc */
731
732 /* Relative position must be in [0,1] range */
733 position = Max(position, 0.0);
734 position = Min(position, 1.0);
735 return position;
736 }
737 else if (hist1->infinite && !hist2->infinite)
738 {
739 /*
740 * Lower bin boundary is -infinite, upper is finite. If the value is
741 * -infinite, return 0.0 to indicate it's equal to the lower bound.
742 * Otherwise return 1.0 to indicate it's infinitely far from the lower
743 * bound.
744 */
745 return ((value->infinite && value->lower) ? 0.0 : 1.0);
746 }
747 else if (!hist1->infinite && hist2->infinite)
748 {
749 /* same as above, but in reverse */
750 return ((value->infinite && !value->lower) ? 1.0 : 0.0);
751 }
752 else
753 {
754 /*
755 * If both bin boundaries are infinite, they should be equal to each
756 * other, and the value should also be infinite and equal to both
757 * bounds. (But don't Assert that, to avoid crashing if a user creates
758 * a datatype with a broken comparison function).
759 *
760 * Assume the value to lie in the middle of the infinite bounds.
761 */
762 return 0.5;
763 }
764 }
765
766
767 /*
768 * Get relative position of value in a length histogram bin in [0,1] range.
769 */
770 static double
get_len_position(double value,double hist1,double hist2)771 get_len_position(double value, double hist1, double hist2)
772 {
773 if (!is_infinite(hist1) && !is_infinite(hist2))
774 {
775 /*
776 * Both bounds are finite. The value should be finite too, because it
777 * lies somewhere between the bounds. If it doesn't, just return
778 * something.
779 */
780 if (is_infinite(value))
781 return 0.5;
782
783 return 1.0 - (hist2 - value) / (hist2 - hist1);
784 }
785 else if (is_infinite(hist1) && !is_infinite(hist2))
786 {
787 /*
788 * Lower bin boundary is -infinite, upper is finite. Return 1.0 to
789 * indicate the value is infinitely far from the lower bound.
790 */
791 return 1.0;
792 }
793 else if (is_infinite(hist1) && is_infinite(hist2))
794 {
795 /* same as above, but in reverse */
796 return 0.0;
797 }
798 else
799 {
800 /*
801 * If both bin boundaries are infinite, they should be equal to each
802 * other, and the value should also be infinite and equal to both
803 * bounds. (But don't Assert that, to avoid crashing unnecessarily if
804 * the caller messes up)
805 *
806 * Assume the value to lie in the middle of the infinite bounds.
807 */
808 return 0.5;
809 }
810 }
811
812 /*
813 * Measure distance between two range bounds.
814 */
815 static float8
get_distance(TypeCacheEntry * typcache,RangeBound * bound1,RangeBound * bound2)816 get_distance(TypeCacheEntry *typcache, RangeBound *bound1, RangeBound *bound2)
817 {
818 bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
819
820 if (!bound1->infinite && !bound2->infinite)
821 {
822 /*
823 * Neither bound is infinite, use subdiff function or return default
824 * value of 1.0 if no subdiff is available.
825 */
826 if (has_subdiff)
827 {
828 float8 res;
829
830 res = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
831 typcache->rng_collation,
832 bound2->val,
833 bound1->val));
834 /* Reject possible NaN result, also negative result */
835 if (isnan(res) || res < 0.0)
836 return 1.0;
837 else
838 return res;
839 }
840 else
841 return 1.0;
842 }
843 else if (bound1->infinite && bound2->infinite)
844 {
845 /* Both bounds are infinite */
846 if (bound1->lower == bound2->lower)
847 return 0.0;
848 else
849 return get_float8_infinity();
850 }
851 else
852 {
853 /* One bound is infinite, the other is not */
854 return get_float8_infinity();
855 }
856 }
857
858 /*
859 * Calculate the average of function P(x), in the interval [length1, length2],
860 * where P(x) is the fraction of tuples with length < x (or length <= x if
861 * 'equal' is true).
862 */
863 static double
calc_length_hist_frac(Datum * length_hist_values,int length_hist_nvalues,double length1,double length2,bool equal)864 calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
865 double length1, double length2, bool equal)
866 {
867 double frac;
868 double A,
869 B,
870 PA,
871 PB;
872 double pos;
873 int i;
874 double area;
875
876 Assert(length2 >= length1);
877
878 if (length2 < 0.0)
879 return 0.0; /* shouldn't happen, but doesn't hurt to check */
880
881 /* All lengths in the table are <= infinite. */
882 if (is_infinite(length2) && equal)
883 return 1.0;
884
885 /*----------
886 * The average of a function between A and B can be calculated by the
887 * formula:
888 *
889 * B
890 * 1 /
891 * ------- | P(x)dx
892 * B - A /
893 * A
894 *
895 * The geometrical interpretation of the integral is the area under the
896 * graph of P(x). P(x) is defined by the length histogram. We calculate
897 * the area in a piecewise fashion, iterating through the length histogram
898 * bins. Each bin is a trapezoid:
899 *
900 * P(x2)
901 * /|
902 * / |
903 * P(x1)/ |
904 * | |
905 * | |
906 * ---+---+--
907 * x1 x2
908 *
909 * where x1 and x2 are the boundaries of the current histogram, and P(x1)
910 * and P(x1) are the cumulative fraction of tuples at the boundaries.
911 *
912 * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
913 *
914 * The first bin contains the lower bound passed by the caller, so we
915 * use linear interpolation between the previous and next histogram bin
916 * boundary to calculate P(x1). Likewise for the last bin: we use linear
917 * interpolation to calculate P(x2). For the bins in between, x1 and x2
918 * lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
919 * P(x1) = (bin index) / (number of bins)
920 * P(x2) = (bin index + 1 / (number of bins)
921 */
922
923 /* First bin, the one that contains lower bound */
924 i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
925 if (i >= length_hist_nvalues - 1)
926 return 1.0;
927
928 if (i < 0)
929 {
930 i = 0;
931 pos = 0.0;
932 }
933 else
934 {
935 /* interpolate length1's position in the bin */
936 pos = get_len_position(length1,
937 DatumGetFloat8(length_hist_values[i]),
938 DatumGetFloat8(length_hist_values[i + 1]));
939 }
940 PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
941 B = length1;
942
943 /*
944 * In the degenerate case that length1 == length2, simply return
945 * P(length1). This is not merely an optimization: if length1 == length2,
946 * we'd divide by zero later on.
947 */
948 if (length2 == length1)
949 return PB;
950
951 /*
952 * Loop through all the bins, until we hit the last bin, the one that
953 * contains the upper bound. (if lower and upper bounds are in the same
954 * bin, this falls out immediately)
955 */
956 area = 0.0;
957 for (; i < length_hist_nvalues - 1; i++)
958 {
959 double bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
960
961 /* check if we've reached the last bin */
962 if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
963 break;
964
965 /* the upper bound of previous bin is the lower bound of this bin */
966 A = B;
967 PA = PB;
968
969 B = bin_upper;
970 PB = (double) i / (double) (length_hist_nvalues - 1);
971
972 /*
973 * Add the area of this trapezoid to the total. The point of the
974 * if-check is to avoid NaN, in the corner case that PA == PB == 0,
975 * and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
976 * 0) is zero, regardless of the width (B - A).
977 */
978 if (PA > 0 || PB > 0)
979 area += 0.5 * (PB + PA) * (B - A);
980 }
981
982 /* Last bin */
983 A = B;
984 PA = PB;
985
986 B = length2; /* last bin ends at the query upper bound */
987 if (i >= length_hist_nvalues - 1)
988 pos = 0.0;
989 else
990 {
991 if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
992 pos = 0.0;
993 else
994 pos = get_len_position(length2, DatumGetFloat8(length_hist_values[i]), DatumGetFloat8(length_hist_values[i + 1]));
995 }
996 PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
997
998 if (PA > 0 || PB > 0)
999 area += 0.5 * (PB + PA) * (B - A);
1000
1001 /*
1002 * Ok, we have calculated the area, ie. the integral. Divide by width to
1003 * get the requested average.
1004 *
1005 * Avoid NaN arising from infinite / infinite. This happens at least if
1006 * length2 is infinite. It's not clear what the correct value would be in
1007 * that case, so 0.5 seems as good as any value.
1008 */
1009 if (is_infinite(area) && is_infinite(length2))
1010 frac = 0.5;
1011 else
1012 frac = area / (length2 - length1);
1013
1014 return frac;
1015 }
1016
1017 /*
1018 * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
1019 * of ranges that fall within the constant lower and upper bounds. This uses
1020 * the histograms of range lower bounds and range lengths, on the assumption
1021 * that the range lengths are independent of the lower bounds.
1022 *
1023 * The caller has already checked that constant lower and upper bounds are
1024 * finite.
1025 */
1026 static double
calc_hist_selectivity_contained(TypeCacheEntry * typcache,RangeBound * lower,RangeBound * upper,RangeBound * hist_lower,int hist_nvalues,Datum * length_hist_values,int length_hist_nvalues)1027 calc_hist_selectivity_contained(TypeCacheEntry *typcache,
1028 RangeBound *lower, RangeBound *upper,
1029 RangeBound *hist_lower, int hist_nvalues,
1030 Datum *length_hist_values, int length_hist_nvalues)
1031 {
1032 int i,
1033 upper_index;
1034 float8 prev_dist;
1035 double bin_width;
1036 double upper_bin_width;
1037 double sum_frac;
1038
1039 /*
1040 * Begin by finding the bin containing the upper bound, in the lower bound
1041 * histogram. Any range with a lower bound > constant upper bound can't
1042 * match, ie. there are no matches in bins greater than upper_index.
1043 */
1044 upper->inclusive = !upper->inclusive;
1045 upper->lower = true;
1046 upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
1047 false);
1048
1049 /*
1050 * If the upper bound value is below the histogram's lower limit, there
1051 * are no matches.
1052 */
1053 if (upper_index < 0)
1054 return 0.0;
1055
1056 /*
1057 * If the upper bound value is at or beyond the histogram's upper limit,
1058 * start our loop at the last actual bin, as though the upper bound were
1059 * within that bin; get_position will clamp its result to 1.0 anyway.
1060 * (This corresponds to assuming that the data population above the
1061 * histogram's upper limit is empty, exactly like what we just assumed for
1062 * the lower limit.)
1063 */
1064 upper_index = Min(upper_index, hist_nvalues - 2);
1065
1066 /*
1067 * Calculate upper_bin_width, ie. the fraction of the (upper_index,
1068 * upper_index + 1) bin which is greater than upper bound of query range
1069 * using linear interpolation of subdiff function.
1070 */
1071 upper_bin_width = get_position(typcache, upper,
1072 &hist_lower[upper_index],
1073 &hist_lower[upper_index + 1]);
1074
1075 /*
1076 * In the loop, dist and prev_dist are the distance of the "current" bin's
1077 * lower and upper bounds from the constant upper bound.
1078 *
1079 * bin_width represents the width of the current bin. Normally it is 1.0,
1080 * meaning a full width bin, but can be less in the corner cases: start
1081 * and end of the loop. We start with bin_width = upper_bin_width, because
1082 * we begin at the bin containing the upper bound.
1083 */
1084 prev_dist = 0.0;
1085 bin_width = upper_bin_width;
1086
1087 sum_frac = 0.0;
1088 for (i = upper_index; i >= 0; i--)
1089 {
1090 double dist;
1091 double length_hist_frac;
1092 bool final_bin = false;
1093
1094 /*
1095 * dist -- distance from upper bound of query range to lower bound of
1096 * the current bin in the lower bound histogram. Or to the lower bound
1097 * of the constant range, if this is the final bin, containing the
1098 * constant lower bound.
1099 */
1100 if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
1101 {
1102 dist = get_distance(typcache, lower, upper);
1103
1104 /*
1105 * Subtract from bin_width the portion of this bin that we want to
1106 * ignore.
1107 */
1108 bin_width -= get_position(typcache, lower, &hist_lower[i],
1109 &hist_lower[i + 1]);
1110 if (bin_width < 0.0)
1111 bin_width = 0.0;
1112 final_bin = true;
1113 }
1114 else
1115 dist = get_distance(typcache, &hist_lower[i], upper);
1116
1117 /*
1118 * Estimate the fraction of tuples in this bin that are narrow enough
1119 * to not exceed the distance to the upper bound of the query range.
1120 */
1121 length_hist_frac = calc_length_hist_frac(length_hist_values,
1122 length_hist_nvalues,
1123 prev_dist, dist, true);
1124
1125 /*
1126 * Add the fraction of tuples in this bin, with a suitable length, to
1127 * the total.
1128 */
1129 sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1130
1131 if (final_bin)
1132 break;
1133
1134 bin_width = 1.0;
1135 prev_dist = dist;
1136 }
1137
1138 return sum_frac;
1139 }
1140
1141 /*
1142 * Calculate selectivity of "var @> const" operator, ie. estimate the fraction
1143 * of ranges that contain the constant lower and upper bounds. This uses
1144 * the histograms of range lower bounds and range lengths, on the assumption
1145 * that the range lengths are independent of the lower bounds.
1146 */
1147 static double
calc_hist_selectivity_contains(TypeCacheEntry * typcache,RangeBound * lower,RangeBound * upper,RangeBound * hist_lower,int hist_nvalues,Datum * length_hist_values,int length_hist_nvalues)1148 calc_hist_selectivity_contains(TypeCacheEntry *typcache,
1149 RangeBound *lower, RangeBound *upper,
1150 RangeBound *hist_lower, int hist_nvalues,
1151 Datum *length_hist_values, int length_hist_nvalues)
1152 {
1153 int i,
1154 lower_index;
1155 double bin_width,
1156 lower_bin_width;
1157 double sum_frac;
1158 float8 prev_dist;
1159
1160 /* Find the bin containing the lower bound of query range. */
1161 lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
1162 true);
1163
1164 /*
1165 * If the lower bound value is below the histogram's lower limit, there
1166 * are no matches.
1167 */
1168 if (lower_index < 0)
1169 return 0.0;
1170
1171 /*
1172 * If the lower bound value is at or beyond the histogram's upper limit,
1173 * start our loop at the last actual bin, as though the upper bound were
1174 * within that bin; get_position will clamp its result to 1.0 anyway.
1175 * (This corresponds to assuming that the data population above the
1176 * histogram's upper limit is empty, exactly like what we just assumed for
1177 * the lower limit.)
1178 */
1179 lower_index = Min(lower_index, hist_nvalues - 2);
1180
1181 /*
1182 * Calculate lower_bin_width, ie. the fraction of the of (lower_index,
1183 * lower_index + 1) bin which is greater than lower bound of query range
1184 * using linear interpolation of subdiff function.
1185 */
1186 lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
1187 &hist_lower[lower_index + 1]);
1188
1189 /*
1190 * Loop through all the lower bound bins, smaller than the query lower
1191 * bound. In the loop, dist and prev_dist are the distance of the
1192 * "current" bin's lower and upper bounds from the constant upper bound.
1193 * We begin from query lower bound, and walk backwards, so the first bin's
1194 * upper bound is the query lower bound, and its distance to the query
1195 * upper bound is the length of the query range.
1196 *
1197 * bin_width represents the width of the current bin. Normally it is 1.0,
1198 * meaning a full width bin, except for the first bin, which is only
1199 * counted up to the constant lower bound.
1200 */
1201 prev_dist = get_distance(typcache, lower, upper);
1202 sum_frac = 0.0;
1203 bin_width = lower_bin_width;
1204 for (i = lower_index; i >= 0; i--)
1205 {
1206 float8 dist;
1207 double length_hist_frac;
1208
1209 /*
1210 * dist -- distance from upper bound of query range to current value
1211 * of lower bound histogram or lower bound of query range (if we've
1212 * reach it).
1213 */
1214 dist = get_distance(typcache, &hist_lower[i], upper);
1215
1216 /*
1217 * Get average fraction of length histogram which covers intervals
1218 * longer than (or equal to) distance to upper bound of query range.
1219 */
1220 length_hist_frac =
1221 1.0 - calc_length_hist_frac(length_hist_values,
1222 length_hist_nvalues,
1223 prev_dist, dist, false);
1224
1225 sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1226
1227 bin_width = 1.0;
1228 prev_dist = dist;
1229 }
1230
1231 return sum_frac;
1232 }
1233