1 /*-------------------------------------------------------------------------
2 *
3 * array_selfuncs.c
4 * Functions for selectivity estimation of array operators
5 *
6 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/utils/adt/array_selfuncs.c
12 *
13 *-------------------------------------------------------------------------
14 */
15 #include "postgres.h"
16
17 #include <math.h>
18
19 #include "access/htup_details.h"
20 #include "catalog/pg_collation.h"
21 #include "catalog/pg_operator.h"
22 #include "catalog/pg_statistic.h"
23 #include "optimizer/clauses.h"
24 #include "utils/array.h"
25 #include "utils/lsyscache.h"
26 #include "utils/selfuncs.h"
27 #include "utils/typcache.h"
28
29
30 /* Default selectivity constant for "@>" and "<@" operators */
31 #define DEFAULT_CONTAIN_SEL 0.005
32
33 /* Default selectivity constant for "&&" operator */
34 #define DEFAULT_OVERLAP_SEL 0.01
35
36 /* Default selectivity for given operator */
37 #define DEFAULT_SEL(operator) \
38 ((operator) == OID_ARRAY_OVERLAP_OP ? \
39 DEFAULT_OVERLAP_SEL : DEFAULT_CONTAIN_SEL)
40
41 static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval,
42 Oid elemtype, Oid operator);
43 static Selectivity mcelem_array_selec(ArrayType *array,
44 TypeCacheEntry *typentry,
45 Datum *mcelem, int nmcelem,
46 float4 *numbers, int nnumbers,
47 float4 *hist, int nhist,
48 Oid operator, FmgrInfo *cmpfunc);
49 static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
50 float4 *numbers, int nnumbers,
51 Datum *array_data, int nitems,
52 Oid operator, FmgrInfo *cmpfunc);
53 static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
54 float4 *numbers, int nnumbers,
55 Datum *array_data, int nitems,
56 float4 *hist, int nhist,
57 Oid operator, FmgrInfo *cmpfunc);
58 static float *calc_hist(const float4 *hist, int nhist, int n);
59 static float *calc_distr(const float *p, int n, int m, float rest);
60 static int floor_log2(uint32 n);
61 static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value,
62 int *index, FmgrInfo *cmpfunc);
63 static int element_compare(const void *key1, const void *key2, void *arg);
64 static int float_compare_desc(const void *key1, const void *key2);
65
66
67 /*
68 * scalararraysel_containment
69 * Estimate selectivity of ScalarArrayOpExpr via array containment.
70 *
71 * If we have const =/<> ANY/ALL (array_var) then we can estimate the
72 * selectivity as though this were an array containment operator,
73 * array_var op ARRAY[const].
74 *
75 * scalararraysel() has already verified that the ScalarArrayOpExpr's operator
76 * is the array element type's default equality or inequality operator, and
77 * has aggressively simplified both inputs to constants.
78 *
79 * Returns selectivity (0..1), or -1 if we fail to estimate selectivity.
80 */
81 Selectivity
scalararraysel_containment(PlannerInfo * root,Node * leftop,Node * rightop,Oid elemtype,bool isEquality,bool useOr,int varRelid)82 scalararraysel_containment(PlannerInfo *root,
83 Node *leftop, Node *rightop,
84 Oid elemtype, bool isEquality, bool useOr,
85 int varRelid)
86 {
87 Selectivity selec;
88 VariableStatData vardata;
89 Datum constval;
90 TypeCacheEntry *typentry;
91 FmgrInfo *cmpfunc;
92
93 /*
94 * rightop must be a variable, else punt.
95 */
96 examine_variable(root, rightop, varRelid, &vardata);
97 if (!vardata.rel)
98 {
99 ReleaseVariableStats(vardata);
100 return -1.0;
101 }
102
103 /*
104 * leftop must be a constant, else punt.
105 */
106 if (!IsA(leftop, Const))
107 {
108 ReleaseVariableStats(vardata);
109 return -1.0;
110 }
111 if (((Const *) leftop)->constisnull)
112 {
113 /* qual can't succeed if null on left */
114 ReleaseVariableStats(vardata);
115 return (Selectivity) 0.0;
116 }
117 constval = ((Const *) leftop)->constvalue;
118
119 /* Get element type's default comparison function */
120 typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
121 if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
122 {
123 ReleaseVariableStats(vardata);
124 return -1.0;
125 }
126 cmpfunc = &typentry->cmp_proc_finfo;
127
128 /*
129 * If the operator is <>, swap ANY/ALL, then invert the result later.
130 */
131 if (!isEquality)
132 useOr = !useOr;
133
134 /* Get array element stats for var, if available */
135 if (HeapTupleIsValid(vardata.statsTuple) &&
136 statistic_proc_security_check(&vardata, cmpfunc->fn_oid))
137 {
138 Form_pg_statistic stats;
139 Datum *values;
140 int nvalues;
141 float4 *numbers;
142 int nnumbers;
143 float4 *hist;
144 int nhist;
145
146 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
147
148 /* MCELEM will be an array of same type as element */
149 if (get_attstatsslot(vardata.statsTuple,
150 elemtype, vardata.atttypmod,
151 STATISTIC_KIND_MCELEM, InvalidOid,
152 NULL,
153 &values, &nvalues,
154 &numbers, &nnumbers))
155 {
156 /* For ALL case, also get histogram of distinct-element counts */
157 if (useOr ||
158 !get_attstatsslot(vardata.statsTuple,
159 elemtype, vardata.atttypmod,
160 STATISTIC_KIND_DECHIST, InvalidOid,
161 NULL,
162 NULL, NULL,
163 &hist, &nhist))
164 {
165 hist = NULL;
166 nhist = 0;
167 }
168
169 /*
170 * For = ANY, estimate as var @> ARRAY[const].
171 *
172 * For = ALL, estimate as var <@ ARRAY[const].
173 */
174 if (useOr)
175 selec = mcelem_array_contain_overlap_selec(values, nvalues,
176 numbers, nnumbers,
177 &constval, 1,
178 OID_ARRAY_CONTAINS_OP,
179 cmpfunc);
180 else
181 selec = mcelem_array_contained_selec(values, nvalues,
182 numbers, nnumbers,
183 &constval, 1,
184 hist, nhist,
185 OID_ARRAY_CONTAINED_OP,
186 cmpfunc);
187
188 if (hist)
189 free_attstatsslot(elemtype, NULL, 0, hist, nhist);
190 free_attstatsslot(elemtype, values, nvalues, numbers, nnumbers);
191 }
192 else
193 {
194 /* No most-common-elements info, so do without */
195 if (useOr)
196 selec = mcelem_array_contain_overlap_selec(NULL, 0,
197 NULL, 0,
198 &constval, 1,
199 OID_ARRAY_CONTAINS_OP,
200 cmpfunc);
201 else
202 selec = mcelem_array_contained_selec(NULL, 0,
203 NULL, 0,
204 &constval, 1,
205 NULL, 0,
206 OID_ARRAY_CONTAINED_OP,
207 cmpfunc);
208 }
209
210 /*
211 * MCE stats count only non-null rows, so adjust for null rows.
212 */
213 selec *= (1.0 - stats->stanullfrac);
214 }
215 else
216 {
217 /* No stats at all, so do without */
218 if (useOr)
219 selec = mcelem_array_contain_overlap_selec(NULL, 0,
220 NULL, 0,
221 &constval, 1,
222 OID_ARRAY_CONTAINS_OP,
223 cmpfunc);
224 else
225 selec = mcelem_array_contained_selec(NULL, 0,
226 NULL, 0,
227 &constval, 1,
228 NULL, 0,
229 OID_ARRAY_CONTAINED_OP,
230 cmpfunc);
231 /* we assume no nulls here, so no stanullfrac correction */
232 }
233
234 ReleaseVariableStats(vardata);
235
236 /*
237 * If the operator is <>, invert the results.
238 */
239 if (!isEquality)
240 selec = 1.0 - selec;
241
242 CLAMP_PROBABILITY(selec);
243
244 return selec;
245 }
246
247 /*
248 * arraycontsel -- restriction selectivity for array @>, &&, <@ operators
249 */
250 Datum
arraycontsel(PG_FUNCTION_ARGS)251 arraycontsel(PG_FUNCTION_ARGS)
252 {
253 PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
254 Oid operator = PG_GETARG_OID(1);
255 List *args = (List *) PG_GETARG_POINTER(2);
256 int varRelid = PG_GETARG_INT32(3);
257 VariableStatData vardata;
258 Node *other;
259 bool varonleft;
260 Selectivity selec;
261 Oid element_typeid;
262
263 /*
264 * If expression is not (variable op something) or (something op
265 * variable), then punt and return a default estimate.
266 */
267 if (!get_restriction_variable(root, args, varRelid,
268 &vardata, &other, &varonleft))
269 PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
270
271 /*
272 * Can't do anything useful if the something is not a constant, either.
273 */
274 if (!IsA(other, Const))
275 {
276 ReleaseVariableStats(vardata);
277 PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
278 }
279
280 /*
281 * The "&&", "@>" and "<@" operators are strict, so we can cope with a
282 * NULL constant right away.
283 */
284 if (((Const *) other)->constisnull)
285 {
286 ReleaseVariableStats(vardata);
287 PG_RETURN_FLOAT8(0.0);
288 }
289
290 /*
291 * If var is on the right, commute the operator, so that we can assume the
292 * var is on the left in what follows.
293 */
294 if (!varonleft)
295 {
296 if (operator == OID_ARRAY_CONTAINS_OP)
297 operator = OID_ARRAY_CONTAINED_OP;
298 else if (operator == OID_ARRAY_CONTAINED_OP)
299 operator = OID_ARRAY_CONTAINS_OP;
300 }
301
302 /*
303 * OK, there's a Var and a Const we're dealing with here. We need the
304 * Const to be an array with same element type as column, else we can't do
305 * anything useful. (Such cases will likely fail at runtime, but here
306 * we'd rather just return a default estimate.)
307 */
308 element_typeid = get_base_element_type(((Const *) other)->consttype);
309 if (element_typeid != InvalidOid &&
310 element_typeid == get_base_element_type(vardata.vartype))
311 {
312 selec = calc_arraycontsel(&vardata, ((Const *) other)->constvalue,
313 element_typeid, operator);
314 }
315 else
316 {
317 selec = DEFAULT_SEL(operator);
318 }
319
320 ReleaseVariableStats(vardata);
321
322 CLAMP_PROBABILITY(selec);
323
324 PG_RETURN_FLOAT8((float8) selec);
325 }
326
327 /*
328 * arraycontjoinsel -- join selectivity for array @>, &&, <@ operators
329 */
330 Datum
arraycontjoinsel(PG_FUNCTION_ARGS)331 arraycontjoinsel(PG_FUNCTION_ARGS)
332 {
333 /* For the moment this is just a stub */
334 Oid operator = PG_GETARG_OID(1);
335
336 PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
337 }
338
339 /*
340 * Calculate selectivity for "arraycolumn @> const", "arraycolumn && const"
341 * or "arraycolumn <@ const" based on the statistics
342 *
343 * This function is mainly responsible for extracting the pg_statistic data
344 * to be used; we then pass the problem on to mcelem_array_selec().
345 */
346 static Selectivity
calc_arraycontsel(VariableStatData * vardata,Datum constval,Oid elemtype,Oid operator)347 calc_arraycontsel(VariableStatData *vardata, Datum constval,
348 Oid elemtype, Oid operator)
349 {
350 Selectivity selec;
351 TypeCacheEntry *typentry;
352 FmgrInfo *cmpfunc;
353 ArrayType *array;
354
355 /* Get element type's default comparison function */
356 typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
357 if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
358 return DEFAULT_SEL(operator);
359 cmpfunc = &typentry->cmp_proc_finfo;
360
361 /*
362 * The caller made sure the const is an array with same element type, so
363 * get it now
364 */
365 array = DatumGetArrayTypeP(constval);
366
367 if (HeapTupleIsValid(vardata->statsTuple) &&
368 statistic_proc_security_check(vardata, cmpfunc->fn_oid))
369 {
370 Form_pg_statistic stats;
371 Datum *values;
372 int nvalues;
373 float4 *numbers;
374 int nnumbers;
375 float4 *hist;
376 int nhist;
377
378 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
379
380 /* MCELEM will be an array of same type as column */
381 if (get_attstatsslot(vardata->statsTuple,
382 elemtype, vardata->atttypmod,
383 STATISTIC_KIND_MCELEM, InvalidOid,
384 NULL,
385 &values, &nvalues,
386 &numbers, &nnumbers))
387 {
388 /*
389 * For "array <@ const" case we also need histogram of distinct
390 * element counts.
391 */
392 if (operator != OID_ARRAY_CONTAINED_OP ||
393 !get_attstatsslot(vardata->statsTuple,
394 elemtype, vardata->atttypmod,
395 STATISTIC_KIND_DECHIST, InvalidOid,
396 NULL,
397 NULL, NULL,
398 &hist, &nhist))
399 {
400 hist = NULL;
401 nhist = 0;
402 }
403
404 /* Use the most-common-elements slot for the array Var. */
405 selec = mcelem_array_selec(array, typentry,
406 values, nvalues,
407 numbers, nnumbers,
408 hist, nhist,
409 operator, cmpfunc);
410
411 if (hist)
412 free_attstatsslot(elemtype, NULL, 0, hist, nhist);
413 free_attstatsslot(elemtype, values, nvalues, numbers, nnumbers);
414 }
415 else
416 {
417 /* No most-common-elements info, so do without */
418 selec = mcelem_array_selec(array, typentry,
419 NULL, 0, NULL, 0, NULL, 0,
420 operator, cmpfunc);
421 }
422
423 /*
424 * MCE stats count only non-null rows, so adjust for null rows.
425 */
426 selec *= (1.0 - stats->stanullfrac);
427 }
428 else
429 {
430 /* No stats at all, so do without */
431 selec = mcelem_array_selec(array, typentry,
432 NULL, 0, NULL, 0, NULL, 0,
433 operator, cmpfunc);
434 /* we assume no nulls here, so no stanullfrac correction */
435 }
436
437 /* If constant was toasted, release the copy we made */
438 if (PointerGetDatum(array) != constval)
439 pfree(array);
440
441 return selec;
442 }
443
444 /*
445 * Array selectivity estimation based on most common elements statistics
446 *
447 * This function just deconstructs and sorts the array constant's contents,
448 * and then passes the problem on to mcelem_array_contain_overlap_selec or
449 * mcelem_array_contained_selec depending on the operator.
450 */
451 static Selectivity
mcelem_array_selec(ArrayType * array,TypeCacheEntry * typentry,Datum * mcelem,int nmcelem,float4 * numbers,int nnumbers,float4 * hist,int nhist,Oid operator,FmgrInfo * cmpfunc)452 mcelem_array_selec(ArrayType *array, TypeCacheEntry *typentry,
453 Datum *mcelem, int nmcelem,
454 float4 *numbers, int nnumbers,
455 float4 *hist, int nhist,
456 Oid operator, FmgrInfo *cmpfunc)
457 {
458 Selectivity selec;
459 int num_elems;
460 Datum *elem_values;
461 bool *elem_nulls;
462 bool null_present;
463 int nonnull_nitems;
464 int i;
465
466 /*
467 * Prepare constant array data for sorting. Sorting lets us find unique
468 * elements and efficiently merge with the MCELEM array.
469 */
470 deconstruct_array(array,
471 typentry->type_id,
472 typentry->typlen,
473 typentry->typbyval,
474 typentry->typalign,
475 &elem_values, &elem_nulls, &num_elems);
476
477 /* Collapse out any null elements */
478 nonnull_nitems = 0;
479 null_present = false;
480 for (i = 0; i < num_elems; i++)
481 {
482 if (elem_nulls[i])
483 null_present = true;
484 else
485 elem_values[nonnull_nitems++] = elem_values[i];
486 }
487
488 /*
489 * Query "column @> '{anything, null}'" matches nothing. For the other
490 * two operators, presence of a null in the constant can be ignored.
491 */
492 if (null_present && operator == OID_ARRAY_CONTAINS_OP)
493 {
494 pfree(elem_values);
495 pfree(elem_nulls);
496 return (Selectivity) 0.0;
497 }
498
499 /* Sort extracted elements using their default comparison function. */
500 qsort_arg(elem_values, nonnull_nitems, sizeof(Datum),
501 element_compare, cmpfunc);
502
503 /* Separate cases according to operator */
504 if (operator == OID_ARRAY_CONTAINS_OP || operator == OID_ARRAY_OVERLAP_OP)
505 selec = mcelem_array_contain_overlap_selec(mcelem, nmcelem,
506 numbers, nnumbers,
507 elem_values, nonnull_nitems,
508 operator, cmpfunc);
509 else if (operator == OID_ARRAY_CONTAINED_OP)
510 selec = mcelem_array_contained_selec(mcelem, nmcelem,
511 numbers, nnumbers,
512 elem_values, nonnull_nitems,
513 hist, nhist,
514 operator, cmpfunc);
515 else
516 {
517 elog(ERROR, "arraycontsel called for unrecognized operator %u",
518 operator);
519 selec = 0.0; /* keep compiler quiet */
520 }
521
522 pfree(elem_values);
523 pfree(elem_nulls);
524 return selec;
525 }
526
527 /*
528 * Estimate selectivity of "column @> const" and "column && const" based on
529 * most common element statistics. This estimation assumes element
530 * occurrences are independent.
531 *
532 * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
533 * the array column's MCELEM statistics slot, or are NULL/0 if stats are
534 * not available. array_data (of length nitems) is the constant's elements.
535 *
536 * Both the mcelem and array_data arrays are assumed presorted according
537 * to the element type's cmpfunc. Null elements are not present.
538 *
539 * TODO: this estimate probably could be improved by using the distinct
540 * elements count histogram. For example, excepting the special case of
541 * "column @> '{}'", we can multiply the calculated selectivity by the
542 * fraction of nonempty arrays in the column.
543 */
544 static Selectivity
mcelem_array_contain_overlap_selec(Datum * mcelem,int nmcelem,float4 * numbers,int nnumbers,Datum * array_data,int nitems,Oid operator,FmgrInfo * cmpfunc)545 mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
546 float4 *numbers, int nnumbers,
547 Datum *array_data, int nitems,
548 Oid operator, FmgrInfo *cmpfunc)
549 {
550 Selectivity selec,
551 elem_selec;
552 int mcelem_index,
553 i;
554 bool use_bsearch;
555 float4 minfreq;
556
557 /*
558 * There should be three more Numbers than Values, because the last three
559 * cells should hold minimal and maximal frequency among the non-null
560 * elements, and then the frequency of null elements. Ignore the Numbers
561 * if not right.
562 */
563 if (nnumbers != nmcelem + 3)
564 {
565 numbers = NULL;
566 nnumbers = 0;
567 }
568
569 if (numbers)
570 {
571 /* Grab the lowest observed frequency */
572 minfreq = numbers[nmcelem];
573 }
574 else
575 {
576 /* Without statistics make some default assumptions */
577 minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL;
578 }
579
580 /* Decide whether it is faster to use binary search or not. */
581 if (nitems * floor_log2((uint32) nmcelem) < nmcelem + nitems)
582 use_bsearch = true;
583 else
584 use_bsearch = false;
585
586 if (operator == OID_ARRAY_CONTAINS_OP)
587 {
588 /*
589 * Initial selectivity for "column @> const" query is 1.0, and it will
590 * be decreased with each element of constant array.
591 */
592 selec = 1.0;
593 }
594 else
595 {
596 /*
597 * Initial selectivity for "column && const" query is 0.0, and it will
598 * be increased with each element of constant array.
599 */
600 selec = 0.0;
601 }
602
603 /* Scan mcelem and array in parallel. */
604 mcelem_index = 0;
605 for (i = 0; i < nitems; i++)
606 {
607 bool match = false;
608
609 /* Ignore any duplicates in the array data. */
610 if (i > 0 &&
611 element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0)
612 continue;
613
614 /* Find the smallest MCELEM >= this array item. */
615 if (use_bsearch)
616 {
617 match = find_next_mcelem(mcelem, nmcelem, array_data[i],
618 &mcelem_index, cmpfunc);
619 }
620 else
621 {
622 while (mcelem_index < nmcelem)
623 {
624 int cmp = element_compare(&mcelem[mcelem_index],
625 &array_data[i],
626 cmpfunc);
627
628 if (cmp < 0)
629 mcelem_index++;
630 else
631 {
632 if (cmp == 0)
633 match = true; /* mcelem is found */
634 break;
635 }
636 }
637 }
638
639 if (match && numbers)
640 {
641 /* MCELEM matches the array item; use its frequency. */
642 elem_selec = numbers[mcelem_index];
643 mcelem_index++;
644 }
645 else
646 {
647 /*
648 * The element is not in MCELEM. Punt, but assume that the
649 * selectivity cannot be more than minfreq / 2.
650 */
651 elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2);
652 }
653
654 /*
655 * Update overall selectivity using the current element's selectivity
656 * and an assumption of element occurrence independence.
657 */
658 if (operator == OID_ARRAY_CONTAINS_OP)
659 selec *= elem_selec;
660 else
661 selec = selec + elem_selec - selec * elem_selec;
662
663 /* Clamp intermediate results to stay sane despite roundoff error */
664 CLAMP_PROBABILITY(selec);
665 }
666
667 return selec;
668 }
669
670 /*
671 * Estimate selectivity of "column <@ const" based on most common element
672 * statistics.
673 *
674 * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
675 * the array column's MCELEM statistics slot, or are NULL/0 if stats are
676 * not available. array_data (of length nitems) is the constant's elements.
677 * hist (of length nhist) is from the array column's DECHIST statistics slot,
678 * or is NULL/0 if those stats are not available.
679 *
680 * Both the mcelem and array_data arrays are assumed presorted according
681 * to the element type's cmpfunc. Null elements are not present.
682 *
683 * Independent element occurrence would imply a particular distribution of
684 * distinct element counts among matching rows. Real data usually falsifies
685 * that assumption. For example, in a set of 11-element integer arrays having
686 * elements in the range [0..10], element occurrences are typically not
687 * independent. If they were, a sufficiently-large set would include all
688 * distinct element counts 0 through 11. We correct for this using the
689 * histogram of distinct element counts.
690 *
691 * In the "column @> const" and "column && const" cases, we usually have a
692 * "const" with low number of elements (otherwise we have selectivity close
693 * to 0 or 1 respectively). That's why the effect of dependence related
694 * to distinct element count distribution is negligible there. In the
695 * "column <@ const" case, number of elements is usually high (otherwise we
696 * have selectivity close to 0). That's why we should do a correction with
697 * the array distinct element count distribution here.
698 *
699 * Using the histogram of distinct element counts produces a different
700 * distribution law than independent occurrences of elements. This
701 * distribution law can be described as follows:
702 *
703 * P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 *
704 * (1 - f2)^(1 - o2) * ... * fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m]
705 *
706 * where:
707 * o1, o2, ..., on - occurrences of elements 1, 2, ..., n
708 * (1 - occurrence, 0 - no occurrence) in row
709 * f1, f2, ..., fn - frequencies of elements 1, 2, ..., n
710 * (scalar values in [0..1]) according to collected statistics
711 * m = o1 + o2 + ... + on = total number of distinct elements in row
712 * hist[m] - histogram data for occurrence of m elements.
713 * ind[m] - probability of m occurrences from n events assuming their
714 * probabilities to be equal to frequencies of array elements.
715 *
716 * ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) *
717 * ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + .. on = m
718 */
719 static Selectivity
mcelem_array_contained_selec(Datum * mcelem,int nmcelem,float4 * numbers,int nnumbers,Datum * array_data,int nitems,float4 * hist,int nhist,Oid operator,FmgrInfo * cmpfunc)720 mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
721 float4 *numbers, int nnumbers,
722 Datum *array_data, int nitems,
723 float4 *hist, int nhist,
724 Oid operator, FmgrInfo *cmpfunc)
725 {
726 int mcelem_index,
727 i,
728 unique_nitems = 0;
729 float selec,
730 minfreq,
731 nullelem_freq;
732 float *dist,
733 *mcelem_dist,
734 *hist_part;
735 float avg_count,
736 mult,
737 rest;
738 float *elem_selec;
739
740 /*
741 * There should be three more Numbers than Values in the MCELEM slot,
742 * because the last three cells should hold minimal and maximal frequency
743 * among the non-null elements, and then the frequency of null elements.
744 * Punt if not right, because we can't do much without the element freqs.
745 */
746 if (numbers == NULL || nnumbers != nmcelem + 3)
747 return DEFAULT_CONTAIN_SEL;
748
749 /* Can't do much without a count histogram, either */
750 if (hist == NULL || nhist < 3)
751 return DEFAULT_CONTAIN_SEL;
752
753 /*
754 * Grab some of the summary statistics that compute_array_stats() stores:
755 * lowest frequency, frequency of null elements, and average distinct
756 * element count.
757 */
758 minfreq = numbers[nmcelem];
759 nullelem_freq = numbers[nmcelem + 2];
760 avg_count = hist[nhist - 1];
761
762 /*
763 * "rest" will be the sum of the frequencies of all elements not
764 * represented in MCELEM. The average distinct element count is the sum
765 * of the frequencies of *all* elements. Begin with that; we will proceed
766 * to subtract the MCELEM frequencies.
767 */
768 rest = avg_count;
769
770 /*
771 * mult is a multiplier representing estimate of probability that each
772 * mcelem that is not present in constant doesn't occur.
773 */
774 mult = 1.0f;
775
776 /*
777 * elem_selec is array of estimated frequencies for elements in the
778 * constant.
779 */
780 elem_selec = (float *) palloc(sizeof(float) * nitems);
781
782 /* Scan mcelem and array in parallel. */
783 mcelem_index = 0;
784 for (i = 0; i < nitems; i++)
785 {
786 bool match = false;
787
788 /* Ignore any duplicates in the array data. */
789 if (i > 0 &&
790 element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0)
791 continue;
792
793 /*
794 * Iterate over MCELEM until we find an entry greater than or equal to
795 * this element of the constant. Update "rest" and "mult" for mcelem
796 * entries skipped over.
797 */
798 while (mcelem_index < nmcelem)
799 {
800 int cmp = element_compare(&mcelem[mcelem_index],
801 &array_data[i],
802 cmpfunc);
803
804 if (cmp < 0)
805 {
806 mult *= (1.0f - numbers[mcelem_index]);
807 rest -= numbers[mcelem_index];
808 mcelem_index++;
809 }
810 else
811 {
812 if (cmp == 0)
813 match = true; /* mcelem is found */
814 break;
815 }
816 }
817
818 if (match)
819 {
820 /* MCELEM matches the array item. */
821 elem_selec[unique_nitems] = numbers[mcelem_index];
822 /* "rest" is decremented for all mcelems, matched or not */
823 rest -= numbers[mcelem_index];
824 mcelem_index++;
825 }
826 else
827 {
828 /*
829 * The element is not in MCELEM. Punt, but assume that the
830 * selectivity cannot be more than minfreq / 2.
831 */
832 elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL,
833 minfreq / 2);
834 }
835
836 unique_nitems++;
837 }
838
839 /*
840 * If we handled all constant elements without exhausting the MCELEM
841 * array, finish walking it to complete calculation of "rest" and "mult".
842 */
843 while (mcelem_index < nmcelem)
844 {
845 mult *= (1.0f - numbers[mcelem_index]);
846 rest -= numbers[mcelem_index];
847 mcelem_index++;
848 }
849
850 /*
851 * The presence of many distinct rare elements materially decreases
852 * selectivity. Use the Poisson distribution to estimate the probability
853 * of a column value having zero occurrences of such elements. See above
854 * for the definition of "rest".
855 */
856 mult *= exp(-rest);
857
858 /*----------
859 * Using the distinct element count histogram requires
860 * O(unique_nitems * (nmcelem + unique_nitems))
861 * operations. Beyond a certain computational cost threshold, it's
862 * reasonable to sacrifice accuracy for decreased planning time. We limit
863 * the number of operations to EFFORT * nmcelem; since nmcelem is limited
864 * by the column's statistics target, the work done is user-controllable.
865 *
866 * If the number of operations would be too large, we can reduce it
867 * without losing all accuracy by reducing unique_nitems and considering
868 * only the most-common elements of the constant array. To make the
869 * results exactly match what we would have gotten with only those
870 * elements to start with, we'd have to remove any discarded elements'
871 * frequencies from "mult", but since this is only an approximation
872 * anyway, we don't bother with that. Therefore it's sufficient to qsort
873 * elem_selec[] and take the largest elements. (They will no longer match
874 * up with the elements of array_data[], but we don't care.)
875 *----------
876 */
877 #define EFFORT 100
878
879 if ((nmcelem + unique_nitems) > 0 &&
880 unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems))
881 {
882 /*
883 * Use the quadratic formula to solve for largest allowable N. We
884 * have A = 1, B = nmcelem, C = - EFFORT * nmcelem.
885 */
886 double b = (double) nmcelem;
887 int n;
888
889 n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2);
890
891 /* Sort, then take just the first n elements */
892 qsort(elem_selec, unique_nitems, sizeof(float),
893 float_compare_desc);
894 unique_nitems = n;
895 }
896
897 /*
898 * Calculate probabilities of each distinct element count for both mcelems
899 * and constant elements. At this point, assume independent element
900 * occurrence.
901 */
902 dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f);
903 mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest);
904
905 /* ignore hist[nhist-1], which is the average not a histogram member */
906 hist_part = calc_hist(hist, nhist - 1, unique_nitems);
907
908 selec = 0.0f;
909 for (i = 0; i <= unique_nitems; i++)
910 {
911 /*
912 * mult * dist[i] / mcelem_dist[i] gives us probability of qual
913 * matching from assumption of independent element occurrence with the
914 * condition that distinct element count = i.
915 */
916 if (mcelem_dist[i] > 0)
917 selec += hist_part[i] * mult * dist[i] / mcelem_dist[i];
918 }
919
920 pfree(dist);
921 pfree(mcelem_dist);
922 pfree(hist_part);
923 pfree(elem_selec);
924
925 /* Take into account occurrence of NULL element. */
926 selec *= (1.0f - nullelem_freq);
927
928 CLAMP_PROBABILITY(selec);
929
930 return selec;
931 }
932
933 /*
934 * Calculate the first n distinct element count probabilities from a
935 * histogram of distinct element counts.
936 *
937 * Returns a palloc'd array of n+1 entries, with array[k] being the
938 * probability of element count k, k in [0..n].
939 *
940 * We assume that a histogram box with bounds a and b gives 1 / ((b - a + 1) *
941 * (nhist - 1)) probability to each value in (a,b) and an additional half of
942 * that to a and b themselves.
943 */
944 static float *
calc_hist(const float4 * hist,int nhist,int n)945 calc_hist(const float4 *hist, int nhist, int n)
946 {
947 float *hist_part;
948 int k,
949 i = 0;
950 float prev_interval = 0,
951 next_interval;
952 float frac;
953
954 hist_part = (float *) palloc((n + 1) * sizeof(float));
955
956 /*
957 * frac is a probability contribution for each interval between histogram
958 * values. We have nhist - 1 intervals, so contribution of each one will
959 * be 1 / (nhist - 1).
960 */
961 frac = 1.0f / ((float) (nhist - 1));
962
963 for (k = 0; k <= n; k++)
964 {
965 int count = 0;
966
967 /*
968 * Count the histogram boundaries equal to k. (Although the histogram
969 * should theoretically contain only exact integers, entries are
970 * floats so there could be roundoff error in large values. Treat any
971 * fractional value as equal to the next larger k.)
972 */
973 while (i < nhist && hist[i] <= k)
974 {
975 count++;
976 i++;
977 }
978
979 if (count > 0)
980 {
981 /* k is an exact bound for at least one histogram box. */
982 float val;
983
984 /* Find length between current histogram value and the next one */
985 if (i < nhist)
986 next_interval = hist[i] - hist[i - 1];
987 else
988 next_interval = 0;
989
990 /*
991 * count - 1 histogram boxes contain k exclusively. They
992 * contribute a total of (count - 1) * frac probability. Also
993 * factor in the partial histogram boxes on either side.
994 */
995 val = (float) (count - 1);
996 if (next_interval > 0)
997 val += 0.5f / next_interval;
998 if (prev_interval > 0)
999 val += 0.5f / prev_interval;
1000 hist_part[k] = frac * val;
1001
1002 prev_interval = next_interval;
1003 }
1004 else
1005 {
1006 /* k does not appear as an exact histogram bound. */
1007 if (prev_interval > 0)
1008 hist_part[k] = frac / prev_interval;
1009 else
1010 hist_part[k] = 0.0f;
1011 }
1012 }
1013
1014 return hist_part;
1015 }
1016
1017 /*
1018 * Consider n independent events with probabilities p[]. This function
1019 * calculates probabilities of exact k of events occurrence for k in [0..m].
1020 * Returns a palloc'd array of size m+1.
1021 *
1022 * "rest" is the sum of the probabilities of all low-probability events not
1023 * included in p.
1024 *
1025 * Imagine matrix M of size (n + 1) x (m + 1). Element M[i,j] denotes the
1026 * probability that exactly j of first i events occur. Obviously M[0,0] = 1.
1027 * For any constant j, each increment of i increases the probability iff the
1028 * event occurs. So, by the law of total probability:
1029 * M[i,j] = M[i - 1, j] * (1 - p[i]) + M[i - 1, j - 1] * p[i]
1030 * for i > 0, j > 0.
1031 * M[i,0] = M[i - 1, 0] * (1 - p[i]) for i > 0.
1032 */
1033 static float *
calc_distr(const float * p,int n,int m,float rest)1034 calc_distr(const float *p, int n, int m, float rest)
1035 {
1036 float *row,
1037 *prev_row,
1038 *tmp;
1039 int i,
1040 j;
1041
1042 /*
1043 * Since we return only the last row of the matrix and need only the
1044 * current and previous row for calculations, allocate two rows.
1045 */
1046 row = (float *) palloc((m + 1) * sizeof(float));
1047 prev_row = (float *) palloc((m + 1) * sizeof(float));
1048
1049 /* M[0,0] = 1 */
1050 row[0] = 1.0f;
1051 for (i = 1; i <= n; i++)
1052 {
1053 float t = p[i - 1];
1054
1055 /* Swap rows */
1056 tmp = row;
1057 row = prev_row;
1058 prev_row = tmp;
1059
1060 /* Calculate next row */
1061 for (j = 0; j <= i && j <= m; j++)
1062 {
1063 float val = 0.0f;
1064
1065 if (j < i)
1066 val += prev_row[j] * (1.0f - t);
1067 if (j > 0)
1068 val += prev_row[j - 1] * t;
1069 row[j] = val;
1070 }
1071 }
1072
1073 /*
1074 * The presence of many distinct rare (not in "p") elements materially
1075 * decreases selectivity. Model their collective occurrence with the
1076 * Poisson distribution.
1077 */
1078 if (rest > DEFAULT_CONTAIN_SEL)
1079 {
1080 float t;
1081
1082 /* Swap rows */
1083 tmp = row;
1084 row = prev_row;
1085 prev_row = tmp;
1086
1087 for (i = 0; i <= m; i++)
1088 row[i] = 0.0f;
1089
1090 /* Value of Poisson distribution for 0 occurrences */
1091 t = exp(-rest);
1092
1093 /*
1094 * Calculate convolution of previously computed distribution and the
1095 * Poisson distribution.
1096 */
1097 for (i = 0; i <= m; i++)
1098 {
1099 for (j = 0; j <= m - i; j++)
1100 row[j + i] += prev_row[j] * t;
1101
1102 /* Get Poisson distribution value for (i + 1) occurrences */
1103 t *= rest / (float) (i + 1);
1104 }
1105 }
1106
1107 pfree(prev_row);
1108 return row;
1109 }
1110
1111 /* Fast function for floor value of 2 based logarithm calculation. */
1112 static int
floor_log2(uint32 n)1113 floor_log2(uint32 n)
1114 {
1115 int logval = 0;
1116
1117 if (n == 0)
1118 return -1;
1119 if (n >= (1 << 16))
1120 {
1121 n >>= 16;
1122 logval += 16;
1123 }
1124 if (n >= (1 << 8))
1125 {
1126 n >>= 8;
1127 logval += 8;
1128 }
1129 if (n >= (1 << 4))
1130 {
1131 n >>= 4;
1132 logval += 4;
1133 }
1134 if (n >= (1 << 2))
1135 {
1136 n >>= 2;
1137 logval += 2;
1138 }
1139 if (n >= (1 << 1))
1140 {
1141 logval += 1;
1142 }
1143 return logval;
1144 }
1145
1146 /*
1147 * find_next_mcelem binary-searches a most common elements array, starting
1148 * from *index, for the first member >= value. It saves the position of the
1149 * match into *index and returns true if it's an exact match. (Note: we
1150 * assume the mcelem elements are distinct so there can't be more than one
1151 * exact match.)
1152 */
1153 static bool
find_next_mcelem(Datum * mcelem,int nmcelem,Datum value,int * index,FmgrInfo * cmpfunc)1154 find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index,
1155 FmgrInfo *cmpfunc)
1156 {
1157 int l = *index,
1158 r = nmcelem - 1,
1159 i,
1160 res;
1161
1162 while (l <= r)
1163 {
1164 i = (l + r) / 2;
1165 res = element_compare(&mcelem[i], &value, cmpfunc);
1166 if (res == 0)
1167 {
1168 *index = i;
1169 return true;
1170 }
1171 else if (res < 0)
1172 l = i + 1;
1173 else
1174 r = i - 1;
1175 }
1176 *index = l;
1177 return false;
1178 }
1179
1180 /*
1181 * Comparison function for elements.
1182 *
1183 * We use the element type's default btree opclass, and the default collation
1184 * if the type is collation-sensitive.
1185 *
1186 * XXX consider using SortSupport infrastructure
1187 */
1188 static int
element_compare(const void * key1,const void * key2,void * arg)1189 element_compare(const void *key1, const void *key2, void *arg)
1190 {
1191 Datum d1 = *((const Datum *) key1);
1192 Datum d2 = *((const Datum *) key2);
1193 FmgrInfo *cmpfunc = (FmgrInfo *) arg;
1194 Datum c;
1195
1196 c = FunctionCall2Coll(cmpfunc, DEFAULT_COLLATION_OID, d1, d2);
1197 return DatumGetInt32(c);
1198 }
1199
1200 /*
1201 * Comparison function for sorting floats into descending order.
1202 */
1203 static int
float_compare_desc(const void * key1,const void * key2)1204 float_compare_desc(const void *key1, const void *key2)
1205 {
1206 float d1 = *((const float *) key1);
1207 float d2 = *((const float *) key2);
1208
1209 if (d1 > d2)
1210 return -1;
1211 else if (d1 < d2)
1212 return 1;
1213 else
1214 return 0;
1215 }
1216