1 /*-------------------------------------------------------------------------
2 *
3 * rangetypes_selfuncs.c
4 * Functions for selectivity estimation of range operators
5 *
6 * Estimates are based on histograms of lower and upper bounds, and the
7 * fraction of empty ranges.
8 *
9 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994, Regents of the University of California
11 *
12 *
13 * IDENTIFICATION
14 * src/backend/utils/adt/rangetypes_selfuncs.c
15 *
16 *-------------------------------------------------------------------------
17 */
18 #include "postgres.h"
19
20 #include <math.h>
21
22 #include "access/htup_details.h"
23 #include "catalog/pg_operator.h"
24 #include "catalog/pg_statistic.h"
25 #include "catalog/pg_type.h"
26 #include "utils/builtins.h"
27 #include "utils/lsyscache.h"
28 #include "utils/rangetypes.h"
29 #include "utils/selfuncs.h"
30 #include "utils/typcache.h"
31
32 static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
33 RangeType *constval, Oid operator);
34 static double default_range_selectivity(Oid operator);
35 static double calc_hist_selectivity(TypeCacheEntry *typcache,
36 VariableStatData *vardata, RangeType *constval,
37 Oid operator);
38 static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
39 RangeBound *constbound,
40 RangeBound *hist, int hist_nvalues,
41 bool equal);
42 static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value,
43 RangeBound *hist, int hist_length, bool equal);
44 static float8 get_position(TypeCacheEntry *typcache, RangeBound *value,
45 RangeBound *hist1, RangeBound *hist2);
46 static float8 get_len_position(double value, double hist1, double hist2);
47 static float8 get_distance(TypeCacheEntry *typcache, RangeBound *bound1,
48 RangeBound *bound2);
49 static int length_hist_bsearch(Datum *length_hist_values,
50 int length_hist_nvalues, double value, bool equal);
51 static double calc_length_hist_frac(Datum *length_hist_values,
52 int length_hist_nvalues, double length1, double length2, bool equal);
53 static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
54 RangeBound *lower, RangeBound *upper,
55 RangeBound *hist_lower, int hist_nvalues,
56 Datum *length_hist_values, int length_hist_nvalues);
57 static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
58 RangeBound *lower, RangeBound *upper,
59 RangeBound *hist_lower, int hist_nvalues,
60 Datum *length_hist_values, int length_hist_nvalues);
61
62 /*
63 * Returns a default selectivity estimate for given operator, when we don't
64 * have statistics or cannot use them for some reason.
65 */
66 static double
default_range_selectivity(Oid operator)67 default_range_selectivity(Oid operator)
68 {
69 switch (operator)
70 {
71 case OID_RANGE_OVERLAP_OP:
72 return 0.01;
73
74 case OID_RANGE_CONTAINS_OP:
75 case OID_RANGE_CONTAINED_OP:
76 return 0.005;
77
78 case OID_RANGE_CONTAINS_ELEM_OP:
79 case OID_RANGE_ELEM_CONTAINED_OP:
80
81 /*
82 * "range @> elem" is more or less identical to a scalar
83 * inequality "A >= b AND A <= c".
84 */
85 return DEFAULT_RANGE_INEQ_SEL;
86
87 case OID_RANGE_LESS_OP:
88 case OID_RANGE_LESS_EQUAL_OP:
89 case OID_RANGE_GREATER_OP:
90 case OID_RANGE_GREATER_EQUAL_OP:
91 case OID_RANGE_LEFT_OP:
92 case OID_RANGE_RIGHT_OP:
93 case OID_RANGE_OVERLAPS_LEFT_OP:
94 case OID_RANGE_OVERLAPS_RIGHT_OP:
95 /* these are similar to regular scalar inequalities */
96 return DEFAULT_INEQ_SEL;
97
98 default:
99 /* all range operators should be handled above, but just in case */
100 return 0.01;
101 }
102 }
103
104 /*
105 * rangesel -- restriction selectivity for range operators
106 */
107 Datum
rangesel(PG_FUNCTION_ARGS)108 rangesel(PG_FUNCTION_ARGS)
109 {
110 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
111 Oid operator = PG_GETARG_OID(1);
112 List *args = (List *) PG_GETARG_POINTER(2);
113 int varRelid = PG_GETARG_INT32(3);
114 VariableStatData vardata;
115 Node *other;
116 bool varonleft;
117 Selectivity selec;
118 TypeCacheEntry *typcache = NULL;
119 RangeType *constrange = NULL;
120
121 /*
122 * If expression is not (variable op something) or (something op
123 * variable), then punt and return a default estimate.
124 */
125 if (!get_restriction_variable(root, args, varRelid,
126 &vardata, &other, &varonleft))
127 PG_RETURN_FLOAT8(default_range_selectivity(operator));
128
129 /*
130 * Can't do anything useful if the something is not a constant, either.
131 */
132 if (!IsA(other, Const))
133 {
134 ReleaseVariableStats(vardata);
135 PG_RETURN_FLOAT8(default_range_selectivity(operator));
136 }
137
138 /*
139 * All the range operators are strict, so we can cope with a NULL constant
140 * right away.
141 */
142 if (((Const *) other)->constisnull)
143 {
144 ReleaseVariableStats(vardata);
145 PG_RETURN_FLOAT8(0.0);
146 }
147
148 /*
149 * If var is on the right, commute the operator, so that we can assume the
150 * var is on the left in what follows.
151 */
152 if (!varonleft)
153 {
154 /* we have other Op var, commute to make var Op other */
155 operator = get_commutator(operator);
156 if (!operator)
157 {
158 /* Use default selectivity (should we raise an error instead?) */
159 ReleaseVariableStats(vardata);
160 PG_RETURN_FLOAT8(default_range_selectivity(operator));
161 }
162 }
163
164 /*
165 * OK, there's a Var and a Const we're dealing with here. We need the
166 * Const to be of same range type as the column, else we can't do anything
167 * useful. (Such cases will likely fail at runtime, but here we'd rather
168 * just return a default estimate.)
169 *
170 * If the operator is "range @> element", the constant should be of the
171 * element type of the range column. Convert it to a range that includes
172 * only that single point, so that we don't need special handling for that
173 * in what follows.
174 */
175 if (operator == OID_RANGE_CONTAINS_ELEM_OP)
176 {
177 typcache = range_get_typcache(fcinfo, vardata.vartype);
178
179 if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
180 {
181 RangeBound lower,
182 upper;
183
184 lower.inclusive = true;
185 lower.val = ((Const *) other)->constvalue;
186 lower.infinite = false;
187 lower.lower = true;
188 upper.inclusive = true;
189 upper.val = ((Const *) other)->constvalue;
190 upper.infinite = false;
191 upper.lower = false;
192 constrange = range_serialize(typcache, &lower, &upper, false);
193 }
194 }
195 else if (operator == OID_RANGE_ELEM_CONTAINED_OP)
196 {
197 /*
198 * Here, the Var is the elem, not the range. For now we just punt and
199 * return the default estimate. In future we could disassemble the
200 * range constant and apply scalarineqsel ...
201 */
202 }
203 else if (((Const *) other)->consttype == vardata.vartype)
204 {
205 /* Both sides are the same range type */
206 typcache = range_get_typcache(fcinfo, vardata.vartype);
207
208 constrange = DatumGetRangeTypeP(((Const *) other)->constvalue);
209 }
210
211 /*
212 * If we got a valid constant on one side of the operator, proceed to
213 * estimate using statistics. Otherwise punt and return a default constant
214 * estimate. Note that calc_rangesel need not handle
215 * OID_RANGE_ELEM_CONTAINED_OP.
216 */
217 if (constrange)
218 selec = calc_rangesel(typcache, &vardata, constrange, operator);
219 else
220 selec = default_range_selectivity(operator);
221
222 ReleaseVariableStats(vardata);
223
224 CLAMP_PROBABILITY(selec);
225
226 PG_RETURN_FLOAT8((float8) selec);
227 }
228
229 static double
calc_rangesel(TypeCacheEntry * typcache,VariableStatData * vardata,RangeType * constval,Oid operator)230 calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
231 RangeType *constval, Oid operator)
232 {
233 double hist_selec;
234 double selec;
235 float4 empty_frac,
236 null_frac;
237
238 /*
239 * First look up the fraction of NULLs and empty ranges from pg_statistic.
240 */
241 if (HeapTupleIsValid(vardata->statsTuple))
242 {
243 Form_pg_statistic stats;
244 AttStatsSlot sslot;
245
246 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
247 null_frac = stats->stanullfrac;
248
249 /* Try to get fraction of empty ranges */
250 if (get_attstatsslot(&sslot, vardata->statsTuple,
251 STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
252 InvalidOid,
253 ATTSTATSSLOT_NUMBERS))
254 {
255 if (sslot.nnumbers != 1)
256 elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
257 empty_frac = sslot.numbers[0];
258 free_attstatsslot(&sslot);
259 }
260 else
261 {
262 /* No empty fraction statistic. Assume no empty ranges. */
263 empty_frac = 0.0;
264 }
265 }
266 else
267 {
268 /*
269 * No stats are available. Follow through the calculations below
270 * anyway, assuming no NULLs and no empty ranges. This still allows us
271 * to give a better-than-nothing estimate based on whether the
272 * constant is an empty range or not.
273 */
274 null_frac = 0.0;
275 empty_frac = 0.0;
276 }
277
278 if (RangeIsEmpty(constval))
279 {
280 /*
281 * An empty range matches all ranges, all empty ranges, or nothing,
282 * depending on the operator
283 */
284 switch (operator)
285 {
286 /* these return false if either argument is empty */
287 case OID_RANGE_OVERLAP_OP:
288 case OID_RANGE_OVERLAPS_LEFT_OP:
289 case OID_RANGE_OVERLAPS_RIGHT_OP:
290 case OID_RANGE_LEFT_OP:
291 case OID_RANGE_RIGHT_OP:
292 /* nothing is less than an empty range */
293 case OID_RANGE_LESS_OP:
294 selec = 0.0;
295 break;
296
297 /* only empty ranges can be contained by an empty range */
298 case OID_RANGE_CONTAINED_OP:
299 /* only empty ranges are <= an empty range */
300 case OID_RANGE_LESS_EQUAL_OP:
301 selec = empty_frac;
302 break;
303
304 /* everything contains an empty range */
305 case OID_RANGE_CONTAINS_OP:
306 /* everything is >= an empty range */
307 case OID_RANGE_GREATER_EQUAL_OP:
308 selec = 1.0;
309 break;
310
311 /* all non-empty ranges are > an empty range */
312 case OID_RANGE_GREATER_OP:
313 selec = 1.0 - empty_frac;
314 break;
315
316 /* an element cannot be empty */
317 case OID_RANGE_CONTAINS_ELEM_OP:
318 default:
319 elog(ERROR, "unexpected operator %u", operator);
320 selec = 0.0; /* keep compiler quiet */
321 break;
322 }
323 }
324 else
325 {
326 /*
327 * Calculate selectivity using bound histograms. If that fails for
328 * some reason, e.g no histogram in pg_statistic, use the default
329 * constant estimate for the fraction of non-empty values. This is
330 * still somewhat better than just returning the default estimate,
331 * because this still takes into account the fraction of empty and
332 * NULL tuples, if we had statistics for them.
333 */
334 hist_selec = calc_hist_selectivity(typcache, vardata, constval,
335 operator);
336 if (hist_selec < 0.0)
337 hist_selec = default_range_selectivity(operator);
338
339 /*
340 * Now merge the results for the empty ranges and histogram
341 * calculations, realizing that the histogram covers only the
342 * non-null, non-empty values.
343 */
344 if (operator == OID_RANGE_CONTAINED_OP)
345 {
346 /* empty is contained by anything non-empty */
347 selec = (1.0 - empty_frac) * hist_selec + empty_frac;
348 }
349 else
350 {
351 /* with any other operator, empty Op non-empty matches nothing */
352 selec = (1.0 - empty_frac) * hist_selec;
353 }
354 }
355
356 /* all range operators are strict */
357 selec *= (1.0 - null_frac);
358
359 /* result should be in range, but make sure... */
360 CLAMP_PROBABILITY(selec);
361
362 return selec;
363 }
364
365 /*
366 * Calculate range operator selectivity using histograms of range bounds.
367 *
368 * This estimate is for the portion of values that are not empty and not
369 * NULL.
370 */
371 static double
calc_hist_selectivity(TypeCacheEntry * typcache,VariableStatData * vardata,RangeType * constval,Oid operator)372 calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
373 RangeType *constval, Oid operator)
374 {
375 AttStatsSlot hslot;
376 AttStatsSlot lslot;
377 int nhist;
378 RangeBound *hist_lower;
379 RangeBound *hist_upper;
380 int i;
381 RangeBound const_lower;
382 RangeBound const_upper;
383 bool empty;
384 double hist_selec;
385
386 /* Can't use the histogram with insecure range support functions */
387 if (!statistic_proc_security_check(vardata,
388 typcache->rng_cmp_proc_finfo.fn_oid))
389 return -1;
390 if (OidIsValid(typcache->rng_subdiff_finfo.fn_oid) &&
391 !statistic_proc_security_check(vardata,
392 typcache->rng_subdiff_finfo.fn_oid))
393 return -1;
394
395 /* Try to get histogram of ranges */
396 if (!(HeapTupleIsValid(vardata->statsTuple) &&
397 get_attstatsslot(&hslot, vardata->statsTuple,
398 STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
399 ATTSTATSSLOT_VALUES)))
400 return -1.0;
401
402 /* check that it's a histogram, not just a dummy entry */
403 if (hslot.nvalues < 2)
404 {
405 free_attstatsslot(&hslot);
406 return -1.0;
407 }
408
409 /*
410 * Convert histogram of ranges into histograms of its lower and upper
411 * bounds.
412 */
413 nhist = hslot.nvalues;
414 hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
415 hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
416 for (i = 0; i < nhist; i++)
417 {
418 range_deserialize(typcache, DatumGetRangeTypeP(hslot.values[i]),
419 &hist_lower[i], &hist_upper[i], &empty);
420 /* The histogram should not contain any empty ranges */
421 if (empty)
422 elog(ERROR, "bounds histogram contains an empty range");
423 }
424
425 /* @> and @< also need a histogram of range lengths */
426 if (operator == OID_RANGE_CONTAINS_OP ||
427 operator == OID_RANGE_CONTAINED_OP)
428 {
429 if (!(HeapTupleIsValid(vardata->statsTuple) &&
430 get_attstatsslot(&lslot, vardata->statsTuple,
431 STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
432 InvalidOid,
433 ATTSTATSSLOT_VALUES)))
434 {
435 free_attstatsslot(&hslot);
436 return -1.0;
437 }
438
439 /* check that it's a histogram, not just a dummy entry */
440 if (lslot.nvalues < 2)
441 {
442 free_attstatsslot(&lslot);
443 free_attstatsslot(&hslot);
444 return -1.0;
445 }
446 }
447 else
448 memset(&lslot, 0, sizeof(lslot));
449
450 /* Extract the bounds of the constant value. */
451 range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
452 Assert(!empty);
453
454 /*
455 * Calculate selectivity comparing the lower or upper bound of the
456 * constant with the histogram of lower or upper bounds.
457 */
458 switch (operator)
459 {
460 case OID_RANGE_LESS_OP:
461
462 /*
463 * The regular b-tree comparison operators (<, <=, >, >=) compare
464 * the lower bounds first, and the upper bounds for values with
465 * equal lower bounds. Estimate that by comparing the lower bounds
466 * only. This gives a fairly accurate estimate assuming there
467 * aren't many rows with a lower bound equal to the constant's
468 * lower bound.
469 */
470 hist_selec =
471 calc_hist_selectivity_scalar(typcache, &const_lower,
472 hist_lower, nhist, false);
473 break;
474
475 case OID_RANGE_LESS_EQUAL_OP:
476 hist_selec =
477 calc_hist_selectivity_scalar(typcache, &const_lower,
478 hist_lower, nhist, true);
479 break;
480
481 case OID_RANGE_GREATER_OP:
482 hist_selec =
483 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
484 hist_lower, nhist, false);
485 break;
486
487 case OID_RANGE_GREATER_EQUAL_OP:
488 hist_selec =
489 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
490 hist_lower, nhist, true);
491 break;
492
493 case OID_RANGE_LEFT_OP:
494 /* var << const when upper(var) < lower(const) */
495 hist_selec =
496 calc_hist_selectivity_scalar(typcache, &const_lower,
497 hist_upper, nhist, false);
498 break;
499
500 case OID_RANGE_RIGHT_OP:
501 /* var >> const when lower(var) > upper(const) */
502 hist_selec =
503 1 - calc_hist_selectivity_scalar(typcache, &const_upper,
504 hist_lower, nhist, true);
505 break;
506
507 case OID_RANGE_OVERLAPS_RIGHT_OP:
508 /* compare lower bounds */
509 hist_selec =
510 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
511 hist_lower, nhist, false);
512 break;
513
514 case OID_RANGE_OVERLAPS_LEFT_OP:
515 /* compare upper bounds */
516 hist_selec =
517 calc_hist_selectivity_scalar(typcache, &const_upper,
518 hist_upper, nhist, true);
519 break;
520
521 case OID_RANGE_OVERLAP_OP:
522 case OID_RANGE_CONTAINS_ELEM_OP:
523
524 /*
525 * A && B <=> NOT (A << B OR A >> B).
526 *
527 * Since A << B and A >> B are mutually exclusive events we can
528 * sum their probabilities to find probability of (A << B OR A >>
529 * B).
530 *
531 * "range @> elem" is equivalent to "range && [elem,elem]". The
532 * caller already constructed the singular range from the element
533 * constant, so just treat it the same as &&.
534 */
535 hist_selec =
536 calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
537 nhist, false);
538 hist_selec +=
539 (1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
540 nhist, true));
541 hist_selec = 1.0 - hist_selec;
542 break;
543
544 case OID_RANGE_CONTAINS_OP:
545 hist_selec =
546 calc_hist_selectivity_contains(typcache, &const_lower,
547 &const_upper, hist_lower, nhist,
548 lslot.values, lslot.nvalues);
549 break;
550
551 case OID_RANGE_CONTAINED_OP:
552 if (const_lower.infinite)
553 {
554 /*
555 * Lower bound no longer matters. Just estimate the fraction
556 * with an upper bound <= const upper bound
557 */
558 hist_selec =
559 calc_hist_selectivity_scalar(typcache, &const_upper,
560 hist_upper, nhist, true);
561 }
562 else if (const_upper.infinite)
563 {
564 hist_selec =
565 1.0 - calc_hist_selectivity_scalar(typcache, &const_lower,
566 hist_lower, nhist, false);
567 }
568 else
569 {
570 hist_selec =
571 calc_hist_selectivity_contained(typcache, &const_lower,
572 &const_upper, hist_lower, nhist,
573 lslot.values, lslot.nvalues);
574 }
575 break;
576
577 default:
578 elog(ERROR, "unknown range operator %u", operator);
579 hist_selec = -1.0; /* keep compiler quiet */
580 break;
581 }
582
583 free_attstatsslot(&lslot);
584 free_attstatsslot(&hslot);
585
586 return hist_selec;
587 }
588
589
590 /*
591 * Look up the fraction of values less than (or equal, if 'equal' argument
592 * is true) a given const in a histogram of range bounds.
593 */
594 static double
calc_hist_selectivity_scalar(TypeCacheEntry * typcache,RangeBound * constbound,RangeBound * hist,int hist_nvalues,bool equal)595 calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound,
596 RangeBound *hist, int hist_nvalues, bool equal)
597 {
598 Selectivity selec;
599 int index;
600
601 /*
602 * Find the histogram bin the given constant falls into. Estimate
603 * selectivity as the number of preceding whole bins.
604 */
605 index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
606 selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
607
608 /* Adjust using linear interpolation within the bin */
609 if (index >= 0 && index < hist_nvalues - 1)
610 selec += get_position(typcache, constbound, &hist[index],
611 &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
612
613 return selec;
614 }
615
616 /*
617 * Binary search on an array of range bounds. Returns greatest index of range
618 * bound in array which is less(less or equal) than given range bound. If all
619 * range bounds in array are greater or equal(greater) than given range bound,
620 * return -1. When "equal" flag is set conditions in brackets are used.
621 *
622 * This function is used in scalar operator selectivity estimation. Another
623 * goal of this function is to find a histogram bin where to stop
624 * interpolation of portion of bounds which are less than or equal to given bound.
625 */
626 static int
rbound_bsearch(TypeCacheEntry * typcache,RangeBound * value,RangeBound * hist,int hist_length,bool equal)627 rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
628 int hist_length, bool equal)
629 {
630 int lower = -1,
631 upper = hist_length - 1,
632 cmp,
633 middle;
634
635 while (lower < upper)
636 {
637 middle = (lower + upper + 1) / 2;
638 cmp = range_cmp_bounds(typcache, &hist[middle], value);
639
640 if (cmp < 0 || (equal && cmp == 0))
641 lower = middle;
642 else
643 upper = middle - 1;
644 }
645 return lower;
646 }
647
648
649 /*
650 * Binary search on length histogram. Returns greatest index of range length in
651 * histogram which is less than (less than or equal) the given length value. If
652 * all lengths in the histogram are greater than (greater than or equal) the
653 * given length, returns -1.
654 */
655 static int
length_hist_bsearch(Datum * length_hist_values,int length_hist_nvalues,double value,bool equal)656 length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
657 double value, bool equal)
658 {
659 int lower = -1,
660 upper = length_hist_nvalues - 1,
661 middle;
662
663 while (lower < upper)
664 {
665 double middleval;
666
667 middle = (lower + upper + 1) / 2;
668
669 middleval = DatumGetFloat8(length_hist_values[middle]);
670 if (middleval < value || (equal && middleval <= value))
671 lower = middle;
672 else
673 upper = middle - 1;
674 }
675 return lower;
676 }
677
678 /*
679 * Get relative position of value in histogram bin in [0,1] range.
680 */
681 static float8
get_position(TypeCacheEntry * typcache,RangeBound * value,RangeBound * hist1,RangeBound * hist2)682 get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
683 RangeBound *hist2)
684 {
685 bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
686 float8 position;
687
688 if (!hist1->infinite && !hist2->infinite)
689 {
690 float8 bin_width;
691
692 /*
693 * Both bounds are finite. Assuming the subtype's comparison function
694 * works sanely, the value must be finite, too, because it lies
695 * somewhere between the bounds. If it doesn't, arbitrarily return
696 * 0.5.
697 */
698 if (value->infinite)
699 return 0.5;
700
701 /* Can't interpolate without subdiff function */
702 if (!has_subdiff)
703 return 0.5;
704
705 /* Calculate relative position using subdiff function. */
706 bin_width = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
707 typcache->rng_collation,
708 hist2->val,
709 hist1->val));
710 if (isnan(bin_width) || bin_width <= 0.0)
711 return 0.5; /* punt for NaN or zero-width bin */
712
713 position = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
714 typcache->rng_collation,
715 value->val,
716 hist1->val))
717 / bin_width;
718
719 if (isnan(position))
720 return 0.5; /* punt for NaN from subdiff, Inf/Inf, etc */
721
722 /* Relative position must be in [0,1] range */
723 position = Max(position, 0.0);
724 position = Min(position, 1.0);
725 return position;
726 }
727 else if (hist1->infinite && !hist2->infinite)
728 {
729 /*
730 * Lower bin boundary is -infinite, upper is finite. If the value is
731 * -infinite, return 0.0 to indicate it's equal to the lower bound.
732 * Otherwise return 1.0 to indicate it's infinitely far from the lower
733 * bound.
734 */
735 return ((value->infinite && value->lower) ? 0.0 : 1.0);
736 }
737 else if (!hist1->infinite && hist2->infinite)
738 {
739 /* same as above, but in reverse */
740 return ((value->infinite && !value->lower) ? 1.0 : 0.0);
741 }
742 else
743 {
744 /*
745 * If both bin boundaries are infinite, they should be equal to each
746 * other, and the value should also be infinite and equal to both
747 * bounds. (But don't Assert that, to avoid crashing if a user creates
748 * a datatype with a broken comparison function).
749 *
750 * Assume the value to lie in the middle of the infinite bounds.
751 */
752 return 0.5;
753 }
754 }
755
756
757 /*
758 * Get relative position of value in a length histogram bin in [0,1] range.
759 */
760 static double
get_len_position(double value,double hist1,double hist2)761 get_len_position(double value, double hist1, double hist2)
762 {
763 if (!is_infinite(hist1) && !is_infinite(hist2))
764 {
765 /*
766 * Both bounds are finite. The value should be finite too, because it
767 * lies somewhere between the bounds. If it doesn't, just return
768 * something.
769 */
770 if (is_infinite(value))
771 return 0.5;
772
773 return 1.0 - (hist2 - value) / (hist2 - hist1);
774 }
775 else if (is_infinite(hist1) && !is_infinite(hist2))
776 {
777 /*
778 * Lower bin boundary is -infinite, upper is finite. Return 1.0 to
779 * indicate the value is infinitely far from the lower bound.
780 */
781 return 1.0;
782 }
783 else if (is_infinite(hist1) && is_infinite(hist2))
784 {
785 /* same as above, but in reverse */
786 return 0.0;
787 }
788 else
789 {
790 /*
791 * If both bin boundaries are infinite, they should be equal to each
792 * other, and the value should also be infinite and equal to both
793 * bounds. (But don't Assert that, to avoid crashing unnecessarily if
794 * the caller messes up)
795 *
796 * Assume the value to lie in the middle of the infinite bounds.
797 */
798 return 0.5;
799 }
800 }
801
802 /*
803 * Measure distance between two range bounds.
804 */
805 static float8
get_distance(TypeCacheEntry * typcache,RangeBound * bound1,RangeBound * bound2)806 get_distance(TypeCacheEntry *typcache, RangeBound *bound1, RangeBound *bound2)
807 {
808 bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
809
810 if (!bound1->infinite && !bound2->infinite)
811 {
812 /*
813 * Neither bound is infinite, use subdiff function or return default
814 * value of 1.0 if no subdiff is available.
815 */
816 if (has_subdiff)
817 {
818 float8 res;
819
820 res = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
821 typcache->rng_collation,
822 bound2->val,
823 bound1->val));
824 /* Reject possible NaN result, also negative result */
825 if (isnan(res) || res < 0.0)
826 return 1.0;
827 else
828 return res;
829 }
830 else
831 return 1.0;
832 }
833 else if (bound1->infinite && bound2->infinite)
834 {
835 /* Both bounds are infinite */
836 if (bound1->lower == bound2->lower)
837 return 0.0;
838 else
839 return get_float8_infinity();
840 }
841 else
842 {
843 /* One bound is infinite, the other is not */
844 return get_float8_infinity();
845 }
846 }
847
848 /*
849 * Calculate the average of function P(x), in the interval [length1, length2],
850 * where P(x) is the fraction of tuples with length < x (or length <= x if
851 * 'equal' is true).
852 */
853 static double
calc_length_hist_frac(Datum * length_hist_values,int length_hist_nvalues,double length1,double length2,bool equal)854 calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
855 double length1, double length2, bool equal)
856 {
857 double frac;
858 double A,
859 B,
860 PA,
861 PB;
862 double pos;
863 int i;
864 double area;
865
866 Assert(length2 >= length1);
867
868 if (length2 < 0.0)
869 return 0.0; /* shouldn't happen, but doesn't hurt to check */
870
871 /* All lengths in the table are <= infinite. */
872 if (is_infinite(length2) && equal)
873 return 1.0;
874
875 /*----------
876 * The average of a function between A and B can be calculated by the
877 * formula:
878 *
879 * B
880 * 1 /
881 * ------- | P(x)dx
882 * B - A /
883 * A
884 *
885 * The geometrical interpretation of the integral is the area under the
886 * graph of P(x). P(x) is defined by the length histogram. We calculate
887 * the area in a piecewise fashion, iterating through the length histogram
888 * bins. Each bin is a trapezoid:
889 *
890 * P(x2)
891 * /|
892 * / |
893 * P(x1)/ |
894 * | |
895 * | |
896 * ---+---+--
897 * x1 x2
898 *
899 * where x1 and x2 are the boundaries of the current histogram, and P(x1)
900 * and P(x1) are the cumulative fraction of tuples at the boundaries.
901 *
902 * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
903 *
904 * The first bin contains the lower bound passed by the caller, so we
905 * use linear interpolation between the previous and next histogram bin
906 * boundary to calculate P(x1). Likewise for the last bin: we use linear
907 * interpolation to calculate P(x2). For the bins in between, x1 and x2
908 * lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
909 * P(x1) = (bin index) / (number of bins)
910 * P(x2) = (bin index + 1 / (number of bins)
911 */
912
913 /* First bin, the one that contains lower bound */
914 i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
915 if (i >= length_hist_nvalues - 1)
916 return 1.0;
917
918 if (i < 0)
919 {
920 i = 0;
921 pos = 0.0;
922 }
923 else
924 {
925 /* interpolate length1's position in the bin */
926 pos = get_len_position(length1,
927 DatumGetFloat8(length_hist_values[i]),
928 DatumGetFloat8(length_hist_values[i + 1]));
929 }
930 PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
931 B = length1;
932
933 /*
934 * In the degenerate case that length1 == length2, simply return
935 * P(length1). This is not merely an optimization: if length1 == length2,
936 * we'd divide by zero later on.
937 */
938 if (length2 == length1)
939 return PB;
940
941 /*
942 * Loop through all the bins, until we hit the last bin, the one that
943 * contains the upper bound. (if lower and upper bounds are in the same
944 * bin, this falls out immediately)
945 */
946 area = 0.0;
947 for (; i < length_hist_nvalues - 1; i++)
948 {
949 double bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
950
951 /* check if we've reached the last bin */
952 if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
953 break;
954
955 /* the upper bound of previous bin is the lower bound of this bin */
956 A = B;
957 PA = PB;
958
959 B = bin_upper;
960 PB = (double) i / (double) (length_hist_nvalues - 1);
961
962 /*
963 * Add the area of this trapezoid to the total. The point of the
964 * if-check is to avoid NaN, in the corner case that PA == PB == 0,
965 * and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
966 * 0) is zero, regardless of the width (B - A).
967 */
968 if (PA > 0 || PB > 0)
969 area += 0.5 * (PB + PA) * (B - A);
970 }
971
972 /* Last bin */
973 A = B;
974 PA = PB;
975
976 B = length2; /* last bin ends at the query upper bound */
977 if (i >= length_hist_nvalues - 1)
978 pos = 0.0;
979 else
980 {
981 if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
982 pos = 0.0;
983 else
984 pos = get_len_position(length2, DatumGetFloat8(length_hist_values[i]), DatumGetFloat8(length_hist_values[i + 1]));
985 }
986 PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
987
988 if (PA > 0 || PB > 0)
989 area += 0.5 * (PB + PA) * (B - A);
990
991 /*
992 * Ok, we have calculated the area, ie. the integral. Divide by width to
993 * get the requested average.
994 *
995 * Avoid NaN arising from infinite / infinite. This happens at least if
996 * length2 is infinite. It's not clear what the correct value would be in
997 * that case, so 0.5 seems as good as any value.
998 */
999 if (is_infinite(area) && is_infinite(length2))
1000 frac = 0.5;
1001 else
1002 frac = area / (length2 - length1);
1003
1004 return frac;
1005 }
1006
1007 /*
1008 * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
1009 * of ranges that fall within the constant lower and upper bounds. This uses
1010 * the histograms of range lower bounds and range lengths, on the assumption
1011 * that the range lengths are independent of the lower bounds.
1012 *
1013 * The caller has already checked that constant lower and upper bounds are
1014 * finite.
1015 */
1016 static double
calc_hist_selectivity_contained(TypeCacheEntry * typcache,RangeBound * lower,RangeBound * upper,RangeBound * hist_lower,int hist_nvalues,Datum * length_hist_values,int length_hist_nvalues)1017 calc_hist_selectivity_contained(TypeCacheEntry *typcache,
1018 RangeBound *lower, RangeBound *upper,
1019 RangeBound *hist_lower, int hist_nvalues,
1020 Datum *length_hist_values, int length_hist_nvalues)
1021 {
1022 int i,
1023 upper_index;
1024 float8 prev_dist;
1025 double bin_width;
1026 double upper_bin_width;
1027 double sum_frac;
1028
1029 /*
1030 * Begin by finding the bin containing the upper bound, in the lower bound
1031 * histogram. Any range with a lower bound > constant upper bound can't
1032 * match, ie. there are no matches in bins greater than upper_index.
1033 */
1034 upper->inclusive = !upper->inclusive;
1035 upper->lower = true;
1036 upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
1037 false);
1038
1039 /*
1040 * If the upper bound value is below the histogram's lower limit, there
1041 * are no matches.
1042 */
1043 if (upper_index < 0)
1044 return 0.0;
1045
1046 /*
1047 * If the upper bound value is at or beyond the histogram's upper limit,
1048 * start our loop at the last actual bin, as though the upper bound were
1049 * within that bin; get_position will clamp its result to 1.0 anyway.
1050 * (This corresponds to assuming that the data population above the
1051 * histogram's upper limit is empty, exactly like what we just assumed for
1052 * the lower limit.)
1053 */
1054 upper_index = Min(upper_index, hist_nvalues - 2);
1055
1056 /*
1057 * Calculate upper_bin_width, ie. the fraction of the (upper_index,
1058 * upper_index + 1) bin which is greater than upper bound of query range
1059 * using linear interpolation of subdiff function.
1060 */
1061 upper_bin_width = get_position(typcache, upper,
1062 &hist_lower[upper_index],
1063 &hist_lower[upper_index + 1]);
1064
1065 /*
1066 * In the loop, dist and prev_dist are the distance of the "current" bin's
1067 * lower and upper bounds from the constant upper bound.
1068 *
1069 * bin_width represents the width of the current bin. Normally it is 1.0,
1070 * meaning a full width bin, but can be less in the corner cases: start
1071 * and end of the loop. We start with bin_width = upper_bin_width, because
1072 * we begin at the bin containing the upper bound.
1073 */
1074 prev_dist = 0.0;
1075 bin_width = upper_bin_width;
1076
1077 sum_frac = 0.0;
1078 for (i = upper_index; i >= 0; i--)
1079 {
1080 double dist;
1081 double length_hist_frac;
1082 bool final_bin = false;
1083
1084 /*
1085 * dist -- distance from upper bound of query range to lower bound of
1086 * the current bin in the lower bound histogram. Or to the lower bound
1087 * of the constant range, if this is the final bin, containing the
1088 * constant lower bound.
1089 */
1090 if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
1091 {
1092 dist = get_distance(typcache, lower, upper);
1093
1094 /*
1095 * Subtract from bin_width the portion of this bin that we want to
1096 * ignore.
1097 */
1098 bin_width -= get_position(typcache, lower, &hist_lower[i],
1099 &hist_lower[i + 1]);
1100 if (bin_width < 0.0)
1101 bin_width = 0.0;
1102 final_bin = true;
1103 }
1104 else
1105 dist = get_distance(typcache, &hist_lower[i], upper);
1106
1107 /*
1108 * Estimate the fraction of tuples in this bin that are narrow enough
1109 * to not exceed the distance to the upper bound of the query range.
1110 */
1111 length_hist_frac = calc_length_hist_frac(length_hist_values,
1112 length_hist_nvalues,
1113 prev_dist, dist, true);
1114
1115 /*
1116 * Add the fraction of tuples in this bin, with a suitable length, to
1117 * the total.
1118 */
1119 sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1120
1121 if (final_bin)
1122 break;
1123
1124 bin_width = 1.0;
1125 prev_dist = dist;
1126 }
1127
1128 return sum_frac;
1129 }
1130
1131 /*
1132 * Calculate selectivity of "var @> const" operator, ie. estimate the fraction
1133 * of ranges that contain the constant lower and upper bounds. This uses
1134 * the histograms of range lower bounds and range lengths, on the assumption
1135 * that the range lengths are independent of the lower bounds.
1136 */
1137 static double
calc_hist_selectivity_contains(TypeCacheEntry * typcache,RangeBound * lower,RangeBound * upper,RangeBound * hist_lower,int hist_nvalues,Datum * length_hist_values,int length_hist_nvalues)1138 calc_hist_selectivity_contains(TypeCacheEntry *typcache,
1139 RangeBound *lower, RangeBound *upper,
1140 RangeBound *hist_lower, int hist_nvalues,
1141 Datum *length_hist_values, int length_hist_nvalues)
1142 {
1143 int i,
1144 lower_index;
1145 double bin_width,
1146 lower_bin_width;
1147 double sum_frac;
1148 float8 prev_dist;
1149
1150 /* Find the bin containing the lower bound of query range. */
1151 lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
1152 true);
1153
1154 /*
1155 * If the lower bound value is below the histogram's lower limit, there
1156 * are no matches.
1157 */
1158 if (lower_index < 0)
1159 return 0.0;
1160
1161 /*
1162 * If the lower bound value is at or beyond the histogram's upper limit,
1163 * start our loop at the last actual bin, as though the upper bound were
1164 * within that bin; get_position will clamp its result to 1.0 anyway.
1165 * (This corresponds to assuming that the data population above the
1166 * histogram's upper limit is empty, exactly like what we just assumed for
1167 * the lower limit.)
1168 */
1169 lower_index = Min(lower_index, hist_nvalues - 2);
1170
1171 /*
1172 * Calculate lower_bin_width, ie. the fraction of the of (lower_index,
1173 * lower_index + 1) bin which is greater than lower bound of query range
1174 * using linear interpolation of subdiff function.
1175 */
1176 lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
1177 &hist_lower[lower_index + 1]);
1178
1179 /*
1180 * Loop through all the lower bound bins, smaller than the query lower
1181 * bound. In the loop, dist and prev_dist are the distance of the
1182 * "current" bin's lower and upper bounds from the constant upper bound.
1183 * We begin from query lower bound, and walk backwards, so the first bin's
1184 * upper bound is the query lower bound, and its distance to the query
1185 * upper bound is the length of the query range.
1186 *
1187 * bin_width represents the width of the current bin. Normally it is 1.0,
1188 * meaning a full width bin, except for the first bin, which is only
1189 * counted up to the constant lower bound.
1190 */
1191 prev_dist = get_distance(typcache, lower, upper);
1192 sum_frac = 0.0;
1193 bin_width = lower_bin_width;
1194 for (i = lower_index; i >= 0; i--)
1195 {
1196 float8 dist;
1197 double length_hist_frac;
1198
1199 /*
1200 * dist -- distance from upper bound of query range to current value
1201 * of lower bound histogram or lower bound of query range (if we've
1202 * reach it).
1203 */
1204 dist = get_distance(typcache, &hist_lower[i], upper);
1205
1206 /*
1207 * Get average fraction of length histogram which covers intervals
1208 * longer than (or equal to) distance to upper bound of query range.
1209 */
1210 length_hist_frac =
1211 1.0 - calc_length_hist_frac(length_hist_values,
1212 length_hist_nvalues,
1213 prev_dist, dist, false);
1214
1215 sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1216
1217 bin_width = 1.0;
1218 prev_dist = dist;
1219 }
1220
1221 return sum_frac;
1222 }
1223