1 /*-------------------------------------------------------------------------
2 *
3 * array_selfuncs.c
4 * Functions for selectivity estimation of array operators
5 *
6 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/utils/adt/array_selfuncs.c
12 *
13 *-------------------------------------------------------------------------
14 */
15 #include "postgres.h"
16
17 #include <math.h>
18
19 #include "access/htup_details.h"
20 #include "catalog/pg_collation.h"
21 #include "catalog/pg_operator.h"
22 #include "catalog/pg_statistic.h"
23 #include "optimizer/clauses.h"
24 #include "utils/array.h"
25 #include "utils/builtins.h"
26 #include "utils/lsyscache.h"
27 #include "utils/selfuncs.h"
28 #include "utils/typcache.h"
29
30
31 /* Default selectivity constant for "@>" and "<@" operators */
32 #define DEFAULT_CONTAIN_SEL 0.005
33
34 /* Default selectivity constant for "&&" operator */
35 #define DEFAULT_OVERLAP_SEL 0.01
36
37 /* Default selectivity for given operator */
38 #define DEFAULT_SEL(operator) \
39 ((operator) == OID_ARRAY_OVERLAP_OP ? \
40 DEFAULT_OVERLAP_SEL : DEFAULT_CONTAIN_SEL)
41
42 static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval,
43 Oid elemtype, Oid operator);
44 static Selectivity mcelem_array_selec(ArrayType *array,
45 TypeCacheEntry *typentry,
46 Datum *mcelem, int nmcelem,
47 float4 *numbers, int nnumbers,
48 float4 *hist, int nhist,
49 Oid operator, FmgrInfo *cmpfunc);
50 static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
51 float4 *numbers, int nnumbers,
52 Datum *array_data, int nitems,
53 Oid operator, FmgrInfo *cmpfunc);
54 static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
55 float4 *numbers, int nnumbers,
56 Datum *array_data, int nitems,
57 float4 *hist, int nhist,
58 Oid operator, FmgrInfo *cmpfunc);
59 static float *calc_hist(const float4 *hist, int nhist, int n);
60 static float *calc_distr(const float *p, int n, int m, float rest);
61 static int floor_log2(uint32 n);
62 static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value,
63 int *index, FmgrInfo *cmpfunc);
64 static int element_compare(const void *key1, const void *key2, void *arg);
65 static int float_compare_desc(const void *key1, const void *key2);
66
67
68 /*
69 * scalararraysel_containment
70 * Estimate selectivity of ScalarArrayOpExpr via array containment.
71 *
72 * If we have const =/<> ANY/ALL (array_var) then we can estimate the
73 * selectivity as though this were an array containment operator,
74 * array_var op ARRAY[const].
75 *
76 * scalararraysel() has already verified that the ScalarArrayOpExpr's operator
77 * is the array element type's default equality or inequality operator, and
78 * has aggressively simplified both inputs to constants.
79 *
80 * Returns selectivity (0..1), or -1 if we fail to estimate selectivity.
81 */
82 Selectivity
scalararraysel_containment(PlannerInfo * root,Node * leftop,Node * rightop,Oid elemtype,bool isEquality,bool useOr,int varRelid)83 scalararraysel_containment(PlannerInfo *root,
84 Node *leftop, Node *rightop,
85 Oid elemtype, bool isEquality, bool useOr,
86 int varRelid)
87 {
88 Selectivity selec;
89 VariableStatData vardata;
90 Datum constval;
91 TypeCacheEntry *typentry;
92 FmgrInfo *cmpfunc;
93
94 /*
95 * rightop must be a variable, else punt.
96 */
97 examine_variable(root, rightop, varRelid, &vardata);
98 if (!vardata.rel)
99 {
100 ReleaseVariableStats(vardata);
101 return -1.0;
102 }
103
104 /*
105 * leftop must be a constant, else punt.
106 */
107 if (!IsA(leftop, Const))
108 {
109 ReleaseVariableStats(vardata);
110 return -1.0;
111 }
112 if (((Const *) leftop)->constisnull)
113 {
114 /* qual can't succeed if null on left */
115 ReleaseVariableStats(vardata);
116 return (Selectivity) 0.0;
117 }
118 constval = ((Const *) leftop)->constvalue;
119
120 /* Get element type's default comparison function */
121 typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
122 if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
123 {
124 ReleaseVariableStats(vardata);
125 return -1.0;
126 }
127 cmpfunc = &typentry->cmp_proc_finfo;
128
129 /*
130 * If the operator is <>, swap ANY/ALL, then invert the result later.
131 */
132 if (!isEquality)
133 useOr = !useOr;
134
135 /* Get array element stats for var, if available */
136 if (HeapTupleIsValid(vardata.statsTuple) &&
137 statistic_proc_security_check(&vardata, cmpfunc->fn_oid))
138 {
139 Form_pg_statistic stats;
140 AttStatsSlot sslot;
141 AttStatsSlot hslot;
142
143 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
144
145 /* MCELEM will be an array of same type as element */
146 if (get_attstatsslot(&sslot, vardata.statsTuple,
147 STATISTIC_KIND_MCELEM, InvalidOid,
148 ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
149 {
150 /* For ALL case, also get histogram of distinct-element counts */
151 if (useOr ||
152 !get_attstatsslot(&hslot, vardata.statsTuple,
153 STATISTIC_KIND_DECHIST, InvalidOid,
154 ATTSTATSSLOT_NUMBERS))
155 memset(&hslot, 0, sizeof(hslot));
156
157 /*
158 * For = ANY, estimate as var @> ARRAY[const].
159 *
160 * For = ALL, estimate as var <@ ARRAY[const].
161 */
162 if (useOr)
163 selec = mcelem_array_contain_overlap_selec(sslot.values,
164 sslot.nvalues,
165 sslot.numbers,
166 sslot.nnumbers,
167 &constval, 1,
168 OID_ARRAY_CONTAINS_OP,
169 cmpfunc);
170 else
171 selec = mcelem_array_contained_selec(sslot.values,
172 sslot.nvalues,
173 sslot.numbers,
174 sslot.nnumbers,
175 &constval, 1,
176 hslot.numbers,
177 hslot.nnumbers,
178 OID_ARRAY_CONTAINED_OP,
179 cmpfunc);
180
181 free_attstatsslot(&hslot);
182 free_attstatsslot(&sslot);
183 }
184 else
185 {
186 /* No most-common-elements info, so do without */
187 if (useOr)
188 selec = mcelem_array_contain_overlap_selec(NULL, 0,
189 NULL, 0,
190 &constval, 1,
191 OID_ARRAY_CONTAINS_OP,
192 cmpfunc);
193 else
194 selec = mcelem_array_contained_selec(NULL, 0,
195 NULL, 0,
196 &constval, 1,
197 NULL, 0,
198 OID_ARRAY_CONTAINED_OP,
199 cmpfunc);
200 }
201
202 /*
203 * MCE stats count only non-null rows, so adjust for null rows.
204 */
205 selec *= (1.0 - stats->stanullfrac);
206 }
207 else
208 {
209 /* No stats at all, so do without */
210 if (useOr)
211 selec = mcelem_array_contain_overlap_selec(NULL, 0,
212 NULL, 0,
213 &constval, 1,
214 OID_ARRAY_CONTAINS_OP,
215 cmpfunc);
216 else
217 selec = mcelem_array_contained_selec(NULL, 0,
218 NULL, 0,
219 &constval, 1,
220 NULL, 0,
221 OID_ARRAY_CONTAINED_OP,
222 cmpfunc);
223 /* we assume no nulls here, so no stanullfrac correction */
224 }
225
226 ReleaseVariableStats(vardata);
227
228 /*
229 * If the operator is <>, invert the results.
230 */
231 if (!isEquality)
232 selec = 1.0 - selec;
233
234 CLAMP_PROBABILITY(selec);
235
236 return selec;
237 }
238
239 /*
240 * arraycontsel -- restriction selectivity for array @>, &&, <@ operators
241 */
242 Datum
arraycontsel(PG_FUNCTION_ARGS)243 arraycontsel(PG_FUNCTION_ARGS)
244 {
245 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
246 Oid operator = PG_GETARG_OID(1);
247 List *args = (List *) PG_GETARG_POINTER(2);
248 int varRelid = PG_GETARG_INT32(3);
249 VariableStatData vardata;
250 Node *other;
251 bool varonleft;
252 Selectivity selec;
253 Oid element_typeid;
254
255 /*
256 * If expression is not (variable op something) or (something op
257 * variable), then punt and return a default estimate.
258 */
259 if (!get_restriction_variable(root, args, varRelid,
260 &vardata, &other, &varonleft))
261 PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
262
263 /*
264 * Can't do anything useful if the something is not a constant, either.
265 */
266 if (!IsA(other, Const))
267 {
268 ReleaseVariableStats(vardata);
269 PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
270 }
271
272 /*
273 * The "&&", "@>" and "<@" operators are strict, so we can cope with a
274 * NULL constant right away.
275 */
276 if (((Const *) other)->constisnull)
277 {
278 ReleaseVariableStats(vardata);
279 PG_RETURN_FLOAT8(0.0);
280 }
281
282 /*
283 * If var is on the right, commute the operator, so that we can assume the
284 * var is on the left in what follows.
285 */
286 if (!varonleft)
287 {
288 if (operator == OID_ARRAY_CONTAINS_OP)
289 operator = OID_ARRAY_CONTAINED_OP;
290 else if (operator == OID_ARRAY_CONTAINED_OP)
291 operator = OID_ARRAY_CONTAINS_OP;
292 }
293
294 /*
295 * OK, there's a Var and a Const we're dealing with here. We need the
296 * Const to be an array with same element type as column, else we can't do
297 * anything useful. (Such cases will likely fail at runtime, but here
298 * we'd rather just return a default estimate.)
299 */
300 element_typeid = get_base_element_type(((Const *) other)->consttype);
301 if (element_typeid != InvalidOid &&
302 element_typeid == get_base_element_type(vardata.vartype))
303 {
304 selec = calc_arraycontsel(&vardata, ((Const *) other)->constvalue,
305 element_typeid, operator);
306 }
307 else
308 {
309 selec = DEFAULT_SEL(operator);
310 }
311
312 ReleaseVariableStats(vardata);
313
314 CLAMP_PROBABILITY(selec);
315
316 PG_RETURN_FLOAT8((float8) selec);
317 }
318
319 /*
320 * arraycontjoinsel -- join selectivity for array @>, &&, <@ operators
321 */
322 Datum
arraycontjoinsel(PG_FUNCTION_ARGS)323 arraycontjoinsel(PG_FUNCTION_ARGS)
324 {
325 /* For the moment this is just a stub */
326 Oid operator = PG_GETARG_OID(1);
327
328 PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
329 }
330
331 /*
332 * Calculate selectivity for "arraycolumn @> const", "arraycolumn && const"
333 * or "arraycolumn <@ const" based on the statistics
334 *
335 * This function is mainly responsible for extracting the pg_statistic data
336 * to be used; we then pass the problem on to mcelem_array_selec().
337 */
338 static Selectivity
calc_arraycontsel(VariableStatData * vardata,Datum constval,Oid elemtype,Oid operator)339 calc_arraycontsel(VariableStatData *vardata, Datum constval,
340 Oid elemtype, Oid operator)
341 {
342 Selectivity selec;
343 TypeCacheEntry *typentry;
344 FmgrInfo *cmpfunc;
345 ArrayType *array;
346
347 /* Get element type's default comparison function */
348 typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
349 if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
350 return DEFAULT_SEL(operator);
351 cmpfunc = &typentry->cmp_proc_finfo;
352
353 /*
354 * The caller made sure the const is an array with same element type, so
355 * get it now
356 */
357 array = DatumGetArrayTypeP(constval);
358
359 if (HeapTupleIsValid(vardata->statsTuple) &&
360 statistic_proc_security_check(vardata, cmpfunc->fn_oid))
361 {
362 Form_pg_statistic stats;
363 AttStatsSlot sslot;
364 AttStatsSlot hslot;
365
366 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
367
368 /* MCELEM will be an array of same type as column */
369 if (get_attstatsslot(&sslot, vardata->statsTuple,
370 STATISTIC_KIND_MCELEM, InvalidOid,
371 ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
372 {
373 /*
374 * For "array <@ const" case we also need histogram of distinct
375 * element counts.
376 */
377 if (operator != OID_ARRAY_CONTAINED_OP ||
378 !get_attstatsslot(&hslot, vardata->statsTuple,
379 STATISTIC_KIND_DECHIST, InvalidOid,
380 ATTSTATSSLOT_NUMBERS))
381 memset(&hslot, 0, sizeof(hslot));
382
383 /* Use the most-common-elements slot for the array Var. */
384 selec = mcelem_array_selec(array, typentry,
385 sslot.values, sslot.nvalues,
386 sslot.numbers, sslot.nnumbers,
387 hslot.numbers, hslot.nnumbers,
388 operator, cmpfunc);
389
390 free_attstatsslot(&hslot);
391 free_attstatsslot(&sslot);
392 }
393 else
394 {
395 /* No most-common-elements info, so do without */
396 selec = mcelem_array_selec(array, typentry,
397 NULL, 0, NULL, 0, NULL, 0,
398 operator, cmpfunc);
399 }
400
401 /*
402 * MCE stats count only non-null rows, so adjust for null rows.
403 */
404 selec *= (1.0 - stats->stanullfrac);
405 }
406 else
407 {
408 /* No stats at all, so do without */
409 selec = mcelem_array_selec(array, typentry,
410 NULL, 0, NULL, 0, NULL, 0,
411 operator, cmpfunc);
412 /* we assume no nulls here, so no stanullfrac correction */
413 }
414
415 /* If constant was toasted, release the copy we made */
416 if (PointerGetDatum(array) != constval)
417 pfree(array);
418
419 return selec;
420 }
421
422 /*
423 * Array selectivity estimation based on most common elements statistics
424 *
425 * This function just deconstructs and sorts the array constant's contents,
426 * and then passes the problem on to mcelem_array_contain_overlap_selec or
427 * mcelem_array_contained_selec depending on the operator.
428 */
429 static Selectivity
mcelem_array_selec(ArrayType * array,TypeCacheEntry * typentry,Datum * mcelem,int nmcelem,float4 * numbers,int nnumbers,float4 * hist,int nhist,Oid operator,FmgrInfo * cmpfunc)430 mcelem_array_selec(ArrayType *array, TypeCacheEntry *typentry,
431 Datum *mcelem, int nmcelem,
432 float4 *numbers, int nnumbers,
433 float4 *hist, int nhist,
434 Oid operator, FmgrInfo *cmpfunc)
435 {
436 Selectivity selec;
437 int num_elems;
438 Datum *elem_values;
439 bool *elem_nulls;
440 bool null_present;
441 int nonnull_nitems;
442 int i;
443
444 /*
445 * Prepare constant array data for sorting. Sorting lets us find unique
446 * elements and efficiently merge with the MCELEM array.
447 */
448 deconstruct_array(array,
449 typentry->type_id,
450 typentry->typlen,
451 typentry->typbyval,
452 typentry->typalign,
453 &elem_values, &elem_nulls, &num_elems);
454
455 /* Collapse out any null elements */
456 nonnull_nitems = 0;
457 null_present = false;
458 for (i = 0; i < num_elems; i++)
459 {
460 if (elem_nulls[i])
461 null_present = true;
462 else
463 elem_values[nonnull_nitems++] = elem_values[i];
464 }
465
466 /*
467 * Query "column @> '{anything, null}'" matches nothing. For the other
468 * two operators, presence of a null in the constant can be ignored.
469 */
470 if (null_present && operator == OID_ARRAY_CONTAINS_OP)
471 {
472 pfree(elem_values);
473 pfree(elem_nulls);
474 return (Selectivity) 0.0;
475 }
476
477 /* Sort extracted elements using their default comparison function. */
478 qsort_arg(elem_values, nonnull_nitems, sizeof(Datum),
479 element_compare, cmpfunc);
480
481 /* Separate cases according to operator */
482 if (operator == OID_ARRAY_CONTAINS_OP || operator == OID_ARRAY_OVERLAP_OP)
483 selec = mcelem_array_contain_overlap_selec(mcelem, nmcelem,
484 numbers, nnumbers,
485 elem_values, nonnull_nitems,
486 operator, cmpfunc);
487 else if (operator == OID_ARRAY_CONTAINED_OP)
488 selec = mcelem_array_contained_selec(mcelem, nmcelem,
489 numbers, nnumbers,
490 elem_values, nonnull_nitems,
491 hist, nhist,
492 operator, cmpfunc);
493 else
494 {
495 elog(ERROR, "arraycontsel called for unrecognized operator %u",
496 operator);
497 selec = 0.0; /* keep compiler quiet */
498 }
499
500 pfree(elem_values);
501 pfree(elem_nulls);
502 return selec;
503 }
504
505 /*
506 * Estimate selectivity of "column @> const" and "column && const" based on
507 * most common element statistics. This estimation assumes element
508 * occurrences are independent.
509 *
510 * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
511 * the array column's MCELEM statistics slot, or are NULL/0 if stats are
512 * not available. array_data (of length nitems) is the constant's elements.
513 *
514 * Both the mcelem and array_data arrays are assumed presorted according
515 * to the element type's cmpfunc. Null elements are not present.
516 *
517 * TODO: this estimate probably could be improved by using the distinct
518 * elements count histogram. For example, excepting the special case of
519 * "column @> '{}'", we can multiply the calculated selectivity by the
520 * fraction of nonempty arrays in the column.
521 */
522 static Selectivity
mcelem_array_contain_overlap_selec(Datum * mcelem,int nmcelem,float4 * numbers,int nnumbers,Datum * array_data,int nitems,Oid operator,FmgrInfo * cmpfunc)523 mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
524 float4 *numbers, int nnumbers,
525 Datum *array_data, int nitems,
526 Oid operator, FmgrInfo *cmpfunc)
527 {
528 Selectivity selec,
529 elem_selec;
530 int mcelem_index,
531 i;
532 bool use_bsearch;
533 float4 minfreq;
534
535 /*
536 * There should be three more Numbers than Values, because the last three
537 * cells should hold minimal and maximal frequency among the non-null
538 * elements, and then the frequency of null elements. Ignore the Numbers
539 * if not right.
540 */
541 if (nnumbers != nmcelem + 3)
542 {
543 numbers = NULL;
544 nnumbers = 0;
545 }
546
547 if (numbers)
548 {
549 /* Grab the lowest observed frequency */
550 minfreq = numbers[nmcelem];
551 }
552 else
553 {
554 /* Without statistics make some default assumptions */
555 minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL;
556 }
557
558 /* Decide whether it is faster to use binary search or not. */
559 if (nitems * floor_log2((uint32) nmcelem) < nmcelem + nitems)
560 use_bsearch = true;
561 else
562 use_bsearch = false;
563
564 if (operator == OID_ARRAY_CONTAINS_OP)
565 {
566 /*
567 * Initial selectivity for "column @> const" query is 1.0, and it will
568 * be decreased with each element of constant array.
569 */
570 selec = 1.0;
571 }
572 else
573 {
574 /*
575 * Initial selectivity for "column && const" query is 0.0, and it will
576 * be increased with each element of constant array.
577 */
578 selec = 0.0;
579 }
580
581 /* Scan mcelem and array in parallel. */
582 mcelem_index = 0;
583 for (i = 0; i < nitems; i++)
584 {
585 bool match = false;
586
587 /* Ignore any duplicates in the array data. */
588 if (i > 0 &&
589 element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0)
590 continue;
591
592 /* Find the smallest MCELEM >= this array item. */
593 if (use_bsearch)
594 {
595 match = find_next_mcelem(mcelem, nmcelem, array_data[i],
596 &mcelem_index, cmpfunc);
597 }
598 else
599 {
600 while (mcelem_index < nmcelem)
601 {
602 int cmp = element_compare(&mcelem[mcelem_index],
603 &array_data[i],
604 cmpfunc);
605
606 if (cmp < 0)
607 mcelem_index++;
608 else
609 {
610 if (cmp == 0)
611 match = true; /* mcelem is found */
612 break;
613 }
614 }
615 }
616
617 if (match && numbers)
618 {
619 /* MCELEM matches the array item; use its frequency. */
620 elem_selec = numbers[mcelem_index];
621 mcelem_index++;
622 }
623 else
624 {
625 /*
626 * The element is not in MCELEM. Punt, but assume that the
627 * selectivity cannot be more than minfreq / 2.
628 */
629 elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2);
630 }
631
632 /*
633 * Update overall selectivity using the current element's selectivity
634 * and an assumption of element occurrence independence.
635 */
636 if (operator == OID_ARRAY_CONTAINS_OP)
637 selec *= elem_selec;
638 else
639 selec = selec + elem_selec - selec * elem_selec;
640
641 /* Clamp intermediate results to stay sane despite roundoff error */
642 CLAMP_PROBABILITY(selec);
643 }
644
645 return selec;
646 }
647
648 /*
649 * Estimate selectivity of "column <@ const" based on most common element
650 * statistics.
651 *
652 * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
653 * the array column's MCELEM statistics slot, or are NULL/0 if stats are
654 * not available. array_data (of length nitems) is the constant's elements.
655 * hist (of length nhist) is from the array column's DECHIST statistics slot,
656 * or is NULL/0 if those stats are not available.
657 *
658 * Both the mcelem and array_data arrays are assumed presorted according
659 * to the element type's cmpfunc. Null elements are not present.
660 *
661 * Independent element occurrence would imply a particular distribution of
662 * distinct element counts among matching rows. Real data usually falsifies
663 * that assumption. For example, in a set of 11-element integer arrays having
664 * elements in the range [0..10], element occurrences are typically not
665 * independent. If they were, a sufficiently-large set would include all
666 * distinct element counts 0 through 11. We correct for this using the
667 * histogram of distinct element counts.
668 *
669 * In the "column @> const" and "column && const" cases, we usually have a
670 * "const" with low number of elements (otherwise we have selectivity close
671 * to 0 or 1 respectively). That's why the effect of dependence related
672 * to distinct element count distribution is negligible there. In the
673 * "column <@ const" case, number of elements is usually high (otherwise we
674 * have selectivity close to 0). That's why we should do a correction with
675 * the array distinct element count distribution here.
676 *
677 * Using the histogram of distinct element counts produces a different
678 * distribution law than independent occurrences of elements. This
679 * distribution law can be described as follows:
680 *
681 * P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 *
682 * (1 - f2)^(1 - o2) * ... * fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m]
683 *
684 * where:
685 * o1, o2, ..., on - occurrences of elements 1, 2, ..., n
686 * (1 - occurrence, 0 - no occurrence) in row
687 * f1, f2, ..., fn - frequencies of elements 1, 2, ..., n
688 * (scalar values in [0..1]) according to collected statistics
689 * m = o1 + o2 + ... + on = total number of distinct elements in row
690 * hist[m] - histogram data for occurrence of m elements.
691 * ind[m] - probability of m occurrences from n events assuming their
692 * probabilities to be equal to frequencies of array elements.
693 *
694 * ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) *
695 * ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + .. on = m
696 */
697 static Selectivity
mcelem_array_contained_selec(Datum * mcelem,int nmcelem,float4 * numbers,int nnumbers,Datum * array_data,int nitems,float4 * hist,int nhist,Oid operator,FmgrInfo * cmpfunc)698 mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
699 float4 *numbers, int nnumbers,
700 Datum *array_data, int nitems,
701 float4 *hist, int nhist,
702 Oid operator, FmgrInfo *cmpfunc)
703 {
704 int mcelem_index,
705 i,
706 unique_nitems = 0;
707 float selec,
708 minfreq,
709 nullelem_freq;
710 float *dist,
711 *mcelem_dist,
712 *hist_part;
713 float avg_count,
714 mult,
715 rest;
716 float *elem_selec;
717
718 /*
719 * There should be three more Numbers than Values in the MCELEM slot,
720 * because the last three cells should hold minimal and maximal frequency
721 * among the non-null elements, and then the frequency of null elements.
722 * Punt if not right, because we can't do much without the element freqs.
723 */
724 if (numbers == NULL || nnumbers != nmcelem + 3)
725 return DEFAULT_CONTAIN_SEL;
726
727 /* Can't do much without a count histogram, either */
728 if (hist == NULL || nhist < 3)
729 return DEFAULT_CONTAIN_SEL;
730
731 /*
732 * Grab some of the summary statistics that compute_array_stats() stores:
733 * lowest frequency, frequency of null elements, and average distinct
734 * element count.
735 */
736 minfreq = numbers[nmcelem];
737 nullelem_freq = numbers[nmcelem + 2];
738 avg_count = hist[nhist - 1];
739
740 /*
741 * "rest" will be the sum of the frequencies of all elements not
742 * represented in MCELEM. The average distinct element count is the sum
743 * of the frequencies of *all* elements. Begin with that; we will proceed
744 * to subtract the MCELEM frequencies.
745 */
746 rest = avg_count;
747
748 /*
749 * mult is a multiplier representing estimate of probability that each
750 * mcelem that is not present in constant doesn't occur.
751 */
752 mult = 1.0f;
753
754 /*
755 * elem_selec is array of estimated frequencies for elements in the
756 * constant.
757 */
758 elem_selec = (float *) palloc(sizeof(float) * nitems);
759
760 /* Scan mcelem and array in parallel. */
761 mcelem_index = 0;
762 for (i = 0; i < nitems; i++)
763 {
764 bool match = false;
765
766 /* Ignore any duplicates in the array data. */
767 if (i > 0 &&
768 element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0)
769 continue;
770
771 /*
772 * Iterate over MCELEM until we find an entry greater than or equal to
773 * this element of the constant. Update "rest" and "mult" for mcelem
774 * entries skipped over.
775 */
776 while (mcelem_index < nmcelem)
777 {
778 int cmp = element_compare(&mcelem[mcelem_index],
779 &array_data[i],
780 cmpfunc);
781
782 if (cmp < 0)
783 {
784 mult *= (1.0f - numbers[mcelem_index]);
785 rest -= numbers[mcelem_index];
786 mcelem_index++;
787 }
788 else
789 {
790 if (cmp == 0)
791 match = true; /* mcelem is found */
792 break;
793 }
794 }
795
796 if (match)
797 {
798 /* MCELEM matches the array item. */
799 elem_selec[unique_nitems] = numbers[mcelem_index];
800 /* "rest" is decremented for all mcelems, matched or not */
801 rest -= numbers[mcelem_index];
802 mcelem_index++;
803 }
804 else
805 {
806 /*
807 * The element is not in MCELEM. Punt, but assume that the
808 * selectivity cannot be more than minfreq / 2.
809 */
810 elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL,
811 minfreq / 2);
812 }
813
814 unique_nitems++;
815 }
816
817 /*
818 * If we handled all constant elements without exhausting the MCELEM
819 * array, finish walking it to complete calculation of "rest" and "mult".
820 */
821 while (mcelem_index < nmcelem)
822 {
823 mult *= (1.0f - numbers[mcelem_index]);
824 rest -= numbers[mcelem_index];
825 mcelem_index++;
826 }
827
828 /*
829 * The presence of many distinct rare elements materially decreases
830 * selectivity. Use the Poisson distribution to estimate the probability
831 * of a column value having zero occurrences of such elements. See above
832 * for the definition of "rest".
833 */
834 mult *= exp(-rest);
835
836 /*----------
837 * Using the distinct element count histogram requires
838 * O(unique_nitems * (nmcelem + unique_nitems))
839 * operations. Beyond a certain computational cost threshold, it's
840 * reasonable to sacrifice accuracy for decreased planning time. We limit
841 * the number of operations to EFFORT * nmcelem; since nmcelem is limited
842 * by the column's statistics target, the work done is user-controllable.
843 *
844 * If the number of operations would be too large, we can reduce it
845 * without losing all accuracy by reducing unique_nitems and considering
846 * only the most-common elements of the constant array. To make the
847 * results exactly match what we would have gotten with only those
848 * elements to start with, we'd have to remove any discarded elements'
849 * frequencies from "mult", but since this is only an approximation
850 * anyway, we don't bother with that. Therefore it's sufficient to qsort
851 * elem_selec[] and take the largest elements. (They will no longer match
852 * up with the elements of array_data[], but we don't care.)
853 *----------
854 */
855 #define EFFORT 100
856
857 if ((nmcelem + unique_nitems) > 0 &&
858 unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems))
859 {
860 /*
861 * Use the quadratic formula to solve for largest allowable N. We
862 * have A = 1, B = nmcelem, C = - EFFORT * nmcelem.
863 */
864 double b = (double) nmcelem;
865 int n;
866
867 n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2);
868
869 /* Sort, then take just the first n elements */
870 qsort(elem_selec, unique_nitems, sizeof(float),
871 float_compare_desc);
872 unique_nitems = n;
873 }
874
875 /*
876 * Calculate probabilities of each distinct element count for both mcelems
877 * and constant elements. At this point, assume independent element
878 * occurrence.
879 */
880 dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f);
881 mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest);
882
883 /* ignore hist[nhist-1], which is the average not a histogram member */
884 hist_part = calc_hist(hist, nhist - 1, unique_nitems);
885
886 selec = 0.0f;
887 for (i = 0; i <= unique_nitems; i++)
888 {
889 /*
890 * mult * dist[i] / mcelem_dist[i] gives us probability of qual
891 * matching from assumption of independent element occurrence with the
892 * condition that distinct element count = i.
893 */
894 if (mcelem_dist[i] > 0)
895 selec += hist_part[i] * mult * dist[i] / mcelem_dist[i];
896 }
897
898 pfree(dist);
899 pfree(mcelem_dist);
900 pfree(hist_part);
901 pfree(elem_selec);
902
903 /* Take into account occurrence of NULL element. */
904 selec *= (1.0f - nullelem_freq);
905
906 CLAMP_PROBABILITY(selec);
907
908 return selec;
909 }
910
911 /*
912 * Calculate the first n distinct element count probabilities from a
913 * histogram of distinct element counts.
914 *
915 * Returns a palloc'd array of n+1 entries, with array[k] being the
916 * probability of element count k, k in [0..n].
917 *
918 * We assume that a histogram box with bounds a and b gives 1 / ((b - a + 1) *
919 * (nhist - 1)) probability to each value in (a,b) and an additional half of
920 * that to a and b themselves.
921 */
922 static float *
calc_hist(const float4 * hist,int nhist,int n)923 calc_hist(const float4 *hist, int nhist, int n)
924 {
925 float *hist_part;
926 int k,
927 i = 0;
928 float prev_interval = 0,
929 next_interval;
930 float frac;
931
932 hist_part = (float *) palloc((n + 1) * sizeof(float));
933
934 /*
935 * frac is a probability contribution for each interval between histogram
936 * values. We have nhist - 1 intervals, so contribution of each one will
937 * be 1 / (nhist - 1).
938 */
939 frac = 1.0f / ((float) (nhist - 1));
940
941 for (k = 0; k <= n; k++)
942 {
943 int count = 0;
944
945 /*
946 * Count the histogram boundaries equal to k. (Although the histogram
947 * should theoretically contain only exact integers, entries are
948 * floats so there could be roundoff error in large values. Treat any
949 * fractional value as equal to the next larger k.)
950 */
951 while (i < nhist && hist[i] <= k)
952 {
953 count++;
954 i++;
955 }
956
957 if (count > 0)
958 {
959 /* k is an exact bound for at least one histogram box. */
960 float val;
961
962 /* Find length between current histogram value and the next one */
963 if (i < nhist)
964 next_interval = hist[i] - hist[i - 1];
965 else
966 next_interval = 0;
967
968 /*
969 * count - 1 histogram boxes contain k exclusively. They
970 * contribute a total of (count - 1) * frac probability. Also
971 * factor in the partial histogram boxes on either side.
972 */
973 val = (float) (count - 1);
974 if (next_interval > 0)
975 val += 0.5f / next_interval;
976 if (prev_interval > 0)
977 val += 0.5f / prev_interval;
978 hist_part[k] = frac * val;
979
980 prev_interval = next_interval;
981 }
982 else
983 {
984 /* k does not appear as an exact histogram bound. */
985 if (prev_interval > 0)
986 hist_part[k] = frac / prev_interval;
987 else
988 hist_part[k] = 0.0f;
989 }
990 }
991
992 return hist_part;
993 }
994
995 /*
996 * Consider n independent events with probabilities p[]. This function
997 * calculates probabilities of exact k of events occurrence for k in [0..m].
998 * Returns a palloc'd array of size m+1.
999 *
1000 * "rest" is the sum of the probabilities of all low-probability events not
1001 * included in p.
1002 *
1003 * Imagine matrix M of size (n + 1) x (m + 1). Element M[i,j] denotes the
1004 * probability that exactly j of first i events occur. Obviously M[0,0] = 1.
1005 * For any constant j, each increment of i increases the probability iff the
1006 * event occurs. So, by the law of total probability:
1007 * M[i,j] = M[i - 1, j] * (1 - p[i]) + M[i - 1, j - 1] * p[i]
1008 * for i > 0, j > 0.
1009 * M[i,0] = M[i - 1, 0] * (1 - p[i]) for i > 0.
1010 */
1011 static float *
calc_distr(const float * p,int n,int m,float rest)1012 calc_distr(const float *p, int n, int m, float rest)
1013 {
1014 float *row,
1015 *prev_row,
1016 *tmp;
1017 int i,
1018 j;
1019
1020 /*
1021 * Since we return only the last row of the matrix and need only the
1022 * current and previous row for calculations, allocate two rows.
1023 */
1024 row = (float *) palloc((m + 1) * sizeof(float));
1025 prev_row = (float *) palloc((m + 1) * sizeof(float));
1026
1027 /* M[0,0] = 1 */
1028 row[0] = 1.0f;
1029 for (i = 1; i <= n; i++)
1030 {
1031 float t = p[i - 1];
1032
1033 /* Swap rows */
1034 tmp = row;
1035 row = prev_row;
1036 prev_row = tmp;
1037
1038 /* Calculate next row */
1039 for (j = 0; j <= i && j <= m; j++)
1040 {
1041 float val = 0.0f;
1042
1043 if (j < i)
1044 val += prev_row[j] * (1.0f - t);
1045 if (j > 0)
1046 val += prev_row[j - 1] * t;
1047 row[j] = val;
1048 }
1049 }
1050
1051 /*
1052 * The presence of many distinct rare (not in "p") elements materially
1053 * decreases selectivity. Model their collective occurrence with the
1054 * Poisson distribution.
1055 */
1056 if (rest > DEFAULT_CONTAIN_SEL)
1057 {
1058 float t;
1059
1060 /* Swap rows */
1061 tmp = row;
1062 row = prev_row;
1063 prev_row = tmp;
1064
1065 for (i = 0; i <= m; i++)
1066 row[i] = 0.0f;
1067
1068 /* Value of Poisson distribution for 0 occurrences */
1069 t = exp(-rest);
1070
1071 /*
1072 * Calculate convolution of previously computed distribution and the
1073 * Poisson distribution.
1074 */
1075 for (i = 0; i <= m; i++)
1076 {
1077 for (j = 0; j <= m - i; j++)
1078 row[j + i] += prev_row[j] * t;
1079
1080 /* Get Poisson distribution value for (i + 1) occurrences */
1081 t *= rest / (float) (i + 1);
1082 }
1083 }
1084
1085 pfree(prev_row);
1086 return row;
1087 }
1088
1089 /* Fast function for floor value of 2 based logarithm calculation. */
1090 static int
floor_log2(uint32 n)1091 floor_log2(uint32 n)
1092 {
1093 int logval = 0;
1094
1095 if (n == 0)
1096 return -1;
1097 if (n >= (1 << 16))
1098 {
1099 n >>= 16;
1100 logval += 16;
1101 }
1102 if (n >= (1 << 8))
1103 {
1104 n >>= 8;
1105 logval += 8;
1106 }
1107 if (n >= (1 << 4))
1108 {
1109 n >>= 4;
1110 logval += 4;
1111 }
1112 if (n >= (1 << 2))
1113 {
1114 n >>= 2;
1115 logval += 2;
1116 }
1117 if (n >= (1 << 1))
1118 {
1119 logval += 1;
1120 }
1121 return logval;
1122 }
1123
1124 /*
1125 * find_next_mcelem binary-searches a most common elements array, starting
1126 * from *index, for the first member >= value. It saves the position of the
1127 * match into *index and returns true if it's an exact match. (Note: we
1128 * assume the mcelem elements are distinct so there can't be more than one
1129 * exact match.)
1130 */
1131 static bool
find_next_mcelem(Datum * mcelem,int nmcelem,Datum value,int * index,FmgrInfo * cmpfunc)1132 find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index,
1133 FmgrInfo *cmpfunc)
1134 {
1135 int l = *index,
1136 r = nmcelem - 1,
1137 i,
1138 res;
1139
1140 while (l <= r)
1141 {
1142 i = (l + r) / 2;
1143 res = element_compare(&mcelem[i], &value, cmpfunc);
1144 if (res == 0)
1145 {
1146 *index = i;
1147 return true;
1148 }
1149 else if (res < 0)
1150 l = i + 1;
1151 else
1152 r = i - 1;
1153 }
1154 *index = l;
1155 return false;
1156 }
1157
1158 /*
1159 * Comparison function for elements.
1160 *
1161 * We use the element type's default btree opclass, and the default collation
1162 * if the type is collation-sensitive.
1163 *
1164 * XXX consider using SortSupport infrastructure
1165 */
1166 static int
element_compare(const void * key1,const void * key2,void * arg)1167 element_compare(const void *key1, const void *key2, void *arg)
1168 {
1169 Datum d1 = *((const Datum *) key1);
1170 Datum d2 = *((const Datum *) key2);
1171 FmgrInfo *cmpfunc = (FmgrInfo *) arg;
1172 Datum c;
1173
1174 c = FunctionCall2Coll(cmpfunc, DEFAULT_COLLATION_OID, d1, d2);
1175 return DatumGetInt32(c);
1176 }
1177
1178 /*
1179 * Comparison function for sorting floats into descending order.
1180 */
1181 static int
float_compare_desc(const void * key1,const void * key2)1182 float_compare_desc(const void *key1, const void *key2)
1183 {
1184 float d1 = *((const float *) key1);
1185 float d2 = *((const float *) key2);
1186
1187 if (d1 > d2)
1188 return -1;
1189 else if (d1 < d2)
1190 return 1;
1191 else
1192 return 0;
1193 }
1194