1 /*-------------------------------------------------------------------------
2  *
3  * array_selfuncs.c
4  *	  Functions for selectivity estimation of array operators
5  *
6  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/utils/adt/array_selfuncs.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <math.h>
18 
19 #include "access/htup_details.h"
20 #include "catalog/pg_collation.h"
21 #include "catalog/pg_operator.h"
22 #include "catalog/pg_statistic.h"
23 #include "optimizer/clauses.h"
24 #include "utils/array.h"
25 #include "utils/lsyscache.h"
26 #include "utils/selfuncs.h"
27 #include "utils/typcache.h"
28 
29 
30 /* Default selectivity constant for "@>" and "<@" operators */
31 #define DEFAULT_CONTAIN_SEL 0.005
32 
33 /* Default selectivity constant for "&&" operator */
34 #define DEFAULT_OVERLAP_SEL 0.01
35 
36 /* Default selectivity for given operator */
37 #define DEFAULT_SEL(operator) \
38 	((operator) == OID_ARRAY_OVERLAP_OP ? \
39 		DEFAULT_OVERLAP_SEL : DEFAULT_CONTAIN_SEL)
40 
41 static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval,
42 				  Oid elemtype, Oid operator);
43 static Selectivity mcelem_array_selec(ArrayType *array,
44 				   TypeCacheEntry *typentry,
45 				   Datum *mcelem, int nmcelem,
46 				   float4 *numbers, int nnumbers,
47 				   float4 *hist, int nhist,
48 				   Oid operator, FmgrInfo *cmpfunc);
49 static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
50 								   float4 *numbers, int nnumbers,
51 								   Datum *array_data, int nitems,
52 								   Oid operator, FmgrInfo *cmpfunc);
53 static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
54 							 float4 *numbers, int nnumbers,
55 							 Datum *array_data, int nitems,
56 							 float4 *hist, int nhist,
57 							 Oid operator, FmgrInfo *cmpfunc);
58 static float *calc_hist(const float4 *hist, int nhist, int n);
59 static float *calc_distr(const float *p, int n, int m, float rest);
60 static int	floor_log2(uint32 n);
61 static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value,
62 				 int *index, FmgrInfo *cmpfunc);
63 static int	element_compare(const void *key1, const void *key2, void *arg);
64 static int	float_compare_desc(const void *key1, const void *key2);
65 
66 
67 /*
68  * scalararraysel_containment
69  *		Estimate selectivity of ScalarArrayOpExpr via array containment.
70  *
71  * If we have const =/<> ANY/ALL (array_var) then we can estimate the
72  * selectivity as though this were an array containment operator,
73  * array_var op ARRAY[const].
74  *
75  * scalararraysel() has already verified that the ScalarArrayOpExpr's operator
76  * is the array element type's default equality or inequality operator, and
77  * has aggressively simplified both inputs to constants.
78  *
79  * Returns selectivity (0..1), or -1 if we fail to estimate selectivity.
80  */
81 Selectivity
scalararraysel_containment(PlannerInfo * root,Node * leftop,Node * rightop,Oid elemtype,bool isEquality,bool useOr,int varRelid)82 scalararraysel_containment(PlannerInfo *root,
83 						   Node *leftop, Node *rightop,
84 						   Oid elemtype, bool isEquality, bool useOr,
85 						   int varRelid)
86 {
87 	Selectivity selec;
88 	VariableStatData vardata;
89 	Datum		constval;
90 	TypeCacheEntry *typentry;
91 	FmgrInfo   *cmpfunc;
92 
93 	/*
94 	 * rightop must be a variable, else punt.
95 	 */
96 	examine_variable(root, rightop, varRelid, &vardata);
97 	if (!vardata.rel)
98 	{
99 		ReleaseVariableStats(vardata);
100 		return -1.0;
101 	}
102 
103 	/*
104 	 * leftop must be a constant, else punt.
105 	 */
106 	if (!IsA(leftop, Const))
107 	{
108 		ReleaseVariableStats(vardata);
109 		return -1.0;
110 	}
111 	if (((Const *) leftop)->constisnull)
112 	{
113 		/* qual can't succeed if null on left */
114 		ReleaseVariableStats(vardata);
115 		return (Selectivity) 0.0;
116 	}
117 	constval = ((Const *) leftop)->constvalue;
118 
119 	/* Get element type's default comparison function */
120 	typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
121 	if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
122 	{
123 		ReleaseVariableStats(vardata);
124 		return -1.0;
125 	}
126 	cmpfunc = &typentry->cmp_proc_finfo;
127 
128 	/*
129 	 * If the operator is <>, swap ANY/ALL, then invert the result later.
130 	 */
131 	if (!isEquality)
132 		useOr = !useOr;
133 
134 	/* Get array element stats for var, if available */
135 	if (HeapTupleIsValid(vardata.statsTuple) &&
136 		statistic_proc_security_check(&vardata, cmpfunc->fn_oid))
137 	{
138 		Form_pg_statistic stats;
139 		Datum	   *values;
140 		int			nvalues;
141 		float4	   *numbers;
142 		int			nnumbers;
143 		float4	   *hist;
144 		int			nhist;
145 
146 		stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
147 
148 		/* MCELEM will be an array of same type as element */
149 		if (get_attstatsslot(vardata.statsTuple,
150 							 elemtype, vardata.atttypmod,
151 							 STATISTIC_KIND_MCELEM, InvalidOid,
152 							 NULL,
153 							 &values, &nvalues,
154 							 &numbers, &nnumbers))
155 		{
156 			/* For ALL case, also get histogram of distinct-element counts */
157 			if (useOr ||
158 				!get_attstatsslot(vardata.statsTuple,
159 								  elemtype, vardata.atttypmod,
160 								  STATISTIC_KIND_DECHIST, InvalidOid,
161 								  NULL,
162 								  NULL, NULL,
163 								  &hist, &nhist))
164 			{
165 				hist = NULL;
166 				nhist = 0;
167 			}
168 
169 			/*
170 			 * For = ANY, estimate as var @> ARRAY[const].
171 			 *
172 			 * For = ALL, estimate as var <@ ARRAY[const].
173 			 */
174 			if (useOr)
175 				selec = mcelem_array_contain_overlap_selec(values, nvalues,
176 														   numbers, nnumbers,
177 														   &constval, 1,
178 													   OID_ARRAY_CONTAINS_OP,
179 														   cmpfunc);
180 			else
181 				selec = mcelem_array_contained_selec(values, nvalues,
182 													 numbers, nnumbers,
183 													 &constval, 1,
184 													 hist, nhist,
185 													 OID_ARRAY_CONTAINED_OP,
186 													 cmpfunc);
187 
188 			if (hist)
189 				free_attstatsslot(elemtype, NULL, 0, hist, nhist);
190 			free_attstatsslot(elemtype, values, nvalues, numbers, nnumbers);
191 		}
192 		else
193 		{
194 			/* No most-common-elements info, so do without */
195 			if (useOr)
196 				selec = mcelem_array_contain_overlap_selec(NULL, 0,
197 														   NULL, 0,
198 														   &constval, 1,
199 													   OID_ARRAY_CONTAINS_OP,
200 														   cmpfunc);
201 			else
202 				selec = mcelem_array_contained_selec(NULL, 0,
203 													 NULL, 0,
204 													 &constval, 1,
205 													 NULL, 0,
206 													 OID_ARRAY_CONTAINED_OP,
207 													 cmpfunc);
208 		}
209 
210 		/*
211 		 * MCE stats count only non-null rows, so adjust for null rows.
212 		 */
213 		selec *= (1.0 - stats->stanullfrac);
214 	}
215 	else
216 	{
217 		/* No stats at all, so do without */
218 		if (useOr)
219 			selec = mcelem_array_contain_overlap_selec(NULL, 0,
220 													   NULL, 0,
221 													   &constval, 1,
222 													   OID_ARRAY_CONTAINS_OP,
223 													   cmpfunc);
224 		else
225 			selec = mcelem_array_contained_selec(NULL, 0,
226 												 NULL, 0,
227 												 &constval, 1,
228 												 NULL, 0,
229 												 OID_ARRAY_CONTAINED_OP,
230 												 cmpfunc);
231 		/* we assume no nulls here, so no stanullfrac correction */
232 	}
233 
234 	ReleaseVariableStats(vardata);
235 
236 	/*
237 	 * If the operator is <>, invert the results.
238 	 */
239 	if (!isEquality)
240 		selec = 1.0 - selec;
241 
242 	CLAMP_PROBABILITY(selec);
243 
244 	return selec;
245 }
246 
247 /*
248  * arraycontsel -- restriction selectivity for array @>, &&, <@ operators
249  */
250 Datum
arraycontsel(PG_FUNCTION_ARGS)251 arraycontsel(PG_FUNCTION_ARGS)
252 {
253 	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
254 	Oid			operator = PG_GETARG_OID(1);
255 	List	   *args = (List *) PG_GETARG_POINTER(2);
256 	int			varRelid = PG_GETARG_INT32(3);
257 	VariableStatData vardata;
258 	Node	   *other;
259 	bool		varonleft;
260 	Selectivity selec;
261 	Oid			element_typeid;
262 
263 	/*
264 	 * If expression is not (variable op something) or (something op
265 	 * variable), then punt and return a default estimate.
266 	 */
267 	if (!get_restriction_variable(root, args, varRelid,
268 								  &vardata, &other, &varonleft))
269 		PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
270 
271 	/*
272 	 * Can't do anything useful if the something is not a constant, either.
273 	 */
274 	if (!IsA(other, Const))
275 	{
276 		ReleaseVariableStats(vardata);
277 		PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
278 	}
279 
280 	/*
281 	 * The "&&", "@>" and "<@" operators are strict, so we can cope with a
282 	 * NULL constant right away.
283 	 */
284 	if (((Const *) other)->constisnull)
285 	{
286 		ReleaseVariableStats(vardata);
287 		PG_RETURN_FLOAT8(0.0);
288 	}
289 
290 	/*
291 	 * If var is on the right, commute the operator, so that we can assume the
292 	 * var is on the left in what follows.
293 	 */
294 	if (!varonleft)
295 	{
296 		if (operator == OID_ARRAY_CONTAINS_OP)
297 			operator = OID_ARRAY_CONTAINED_OP;
298 		else if (operator == OID_ARRAY_CONTAINED_OP)
299 			operator = OID_ARRAY_CONTAINS_OP;
300 	}
301 
302 	/*
303 	 * OK, there's a Var and a Const we're dealing with here.  We need the
304 	 * Const to be an array with same element type as column, else we can't do
305 	 * anything useful.  (Such cases will likely fail at runtime, but here
306 	 * we'd rather just return a default estimate.)
307 	 */
308 	element_typeid = get_base_element_type(((Const *) other)->consttype);
309 	if (element_typeid != InvalidOid &&
310 		element_typeid == get_base_element_type(vardata.vartype))
311 	{
312 		selec = calc_arraycontsel(&vardata, ((Const *) other)->constvalue,
313 								  element_typeid, operator);
314 	}
315 	else
316 	{
317 		selec = DEFAULT_SEL(operator);
318 	}
319 
320 	ReleaseVariableStats(vardata);
321 
322 	CLAMP_PROBABILITY(selec);
323 
324 	PG_RETURN_FLOAT8((float8) selec);
325 }
326 
327 /*
328  * arraycontjoinsel -- join selectivity for array @>, &&, <@ operators
329  */
330 Datum
arraycontjoinsel(PG_FUNCTION_ARGS)331 arraycontjoinsel(PG_FUNCTION_ARGS)
332 {
333 	/* For the moment this is just a stub */
334 	Oid			operator = PG_GETARG_OID(1);
335 
336 	PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
337 }
338 
339 /*
340  * Calculate selectivity for "arraycolumn @> const", "arraycolumn && const"
341  * or "arraycolumn <@ const" based on the statistics
342  *
343  * This function is mainly responsible for extracting the pg_statistic data
344  * to be used; we then pass the problem on to mcelem_array_selec().
345  */
346 static Selectivity
calc_arraycontsel(VariableStatData * vardata,Datum constval,Oid elemtype,Oid operator)347 calc_arraycontsel(VariableStatData *vardata, Datum constval,
348 				  Oid elemtype, Oid operator)
349 {
350 	Selectivity selec;
351 	TypeCacheEntry *typentry;
352 	FmgrInfo   *cmpfunc;
353 	ArrayType  *array;
354 
355 	/* Get element type's default comparison function */
356 	typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
357 	if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
358 		return DEFAULT_SEL(operator);
359 	cmpfunc = &typentry->cmp_proc_finfo;
360 
361 	/*
362 	 * The caller made sure the const is an array with same element type, so
363 	 * get it now
364 	 */
365 	array = DatumGetArrayTypeP(constval);
366 
367 	if (HeapTupleIsValid(vardata->statsTuple) &&
368 		statistic_proc_security_check(vardata, cmpfunc->fn_oid))
369 	{
370 		Form_pg_statistic stats;
371 		Datum	   *values;
372 		int			nvalues;
373 		float4	   *numbers;
374 		int			nnumbers;
375 		float4	   *hist;
376 		int			nhist;
377 
378 		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
379 
380 		/* MCELEM will be an array of same type as column */
381 		if (get_attstatsslot(vardata->statsTuple,
382 							 elemtype, vardata->atttypmod,
383 							 STATISTIC_KIND_MCELEM, InvalidOid,
384 							 NULL,
385 							 &values, &nvalues,
386 							 &numbers, &nnumbers))
387 		{
388 			/*
389 			 * For "array <@ const" case we also need histogram of distinct
390 			 * element counts.
391 			 */
392 			if (operator != OID_ARRAY_CONTAINED_OP ||
393 				!get_attstatsslot(vardata->statsTuple,
394 								  elemtype, vardata->atttypmod,
395 								  STATISTIC_KIND_DECHIST, InvalidOid,
396 								  NULL,
397 								  NULL, NULL,
398 								  &hist, &nhist))
399 			{
400 				hist = NULL;
401 				nhist = 0;
402 			}
403 
404 			/* Use the most-common-elements slot for the array Var. */
405 			selec = mcelem_array_selec(array, typentry,
406 									   values, nvalues,
407 									   numbers, nnumbers,
408 									   hist, nhist,
409 									   operator, cmpfunc);
410 
411 			if (hist)
412 				free_attstatsslot(elemtype, NULL, 0, hist, nhist);
413 			free_attstatsslot(elemtype, values, nvalues, numbers, nnumbers);
414 		}
415 		else
416 		{
417 			/* No most-common-elements info, so do without */
418 			selec = mcelem_array_selec(array, typentry,
419 									   NULL, 0, NULL, 0, NULL, 0,
420 									   operator, cmpfunc);
421 		}
422 
423 		/*
424 		 * MCE stats count only non-null rows, so adjust for null rows.
425 		 */
426 		selec *= (1.0 - stats->stanullfrac);
427 	}
428 	else
429 	{
430 		/* No stats at all, so do without */
431 		selec = mcelem_array_selec(array, typentry,
432 								   NULL, 0, NULL, 0, NULL, 0,
433 								   operator, cmpfunc);
434 		/* we assume no nulls here, so no stanullfrac correction */
435 	}
436 
437 	/* If constant was toasted, release the copy we made */
438 	if (PointerGetDatum(array) != constval)
439 		pfree(array);
440 
441 	return selec;
442 }
443 
444 /*
445  * Array selectivity estimation based on most common elements statistics
446  *
447  * This function just deconstructs and sorts the array constant's contents,
448  * and then passes the problem on to mcelem_array_contain_overlap_selec or
449  * mcelem_array_contained_selec depending on the operator.
450  */
451 static Selectivity
mcelem_array_selec(ArrayType * array,TypeCacheEntry * typentry,Datum * mcelem,int nmcelem,float4 * numbers,int nnumbers,float4 * hist,int nhist,Oid operator,FmgrInfo * cmpfunc)452 mcelem_array_selec(ArrayType *array, TypeCacheEntry *typentry,
453 				   Datum *mcelem, int nmcelem,
454 				   float4 *numbers, int nnumbers,
455 				   float4 *hist, int nhist,
456 				   Oid operator, FmgrInfo *cmpfunc)
457 {
458 	Selectivity selec;
459 	int			num_elems;
460 	Datum	   *elem_values;
461 	bool	   *elem_nulls;
462 	bool		null_present;
463 	int			nonnull_nitems;
464 	int			i;
465 
466 	/*
467 	 * Prepare constant array data for sorting.  Sorting lets us find unique
468 	 * elements and efficiently merge with the MCELEM array.
469 	 */
470 	deconstruct_array(array,
471 					  typentry->type_id,
472 					  typentry->typlen,
473 					  typentry->typbyval,
474 					  typentry->typalign,
475 					  &elem_values, &elem_nulls, &num_elems);
476 
477 	/* Collapse out any null elements */
478 	nonnull_nitems = 0;
479 	null_present = false;
480 	for (i = 0; i < num_elems; i++)
481 	{
482 		if (elem_nulls[i])
483 			null_present = true;
484 		else
485 			elem_values[nonnull_nitems++] = elem_values[i];
486 	}
487 
488 	/*
489 	 * Query "column @> '{anything, null}'" matches nothing.  For the other
490 	 * two operators, presence of a null in the constant can be ignored.
491 	 */
492 	if (null_present && operator == OID_ARRAY_CONTAINS_OP)
493 	{
494 		pfree(elem_values);
495 		pfree(elem_nulls);
496 		return (Selectivity) 0.0;
497 	}
498 
499 	/* Sort extracted elements using their default comparison function. */
500 	qsort_arg(elem_values, nonnull_nitems, sizeof(Datum),
501 			  element_compare, cmpfunc);
502 
503 	/* Separate cases according to operator */
504 	if (operator == OID_ARRAY_CONTAINS_OP || operator == OID_ARRAY_OVERLAP_OP)
505 		selec = mcelem_array_contain_overlap_selec(mcelem, nmcelem,
506 												   numbers, nnumbers,
507 												 elem_values, nonnull_nitems,
508 												   operator, cmpfunc);
509 	else if (operator == OID_ARRAY_CONTAINED_OP)
510 		selec = mcelem_array_contained_selec(mcelem, nmcelem,
511 											 numbers, nnumbers,
512 											 elem_values, nonnull_nitems,
513 											 hist, nhist,
514 											 operator, cmpfunc);
515 	else
516 	{
517 		elog(ERROR, "arraycontsel called for unrecognized operator %u",
518 			 operator);
519 		selec = 0.0;			/* keep compiler quiet */
520 	}
521 
522 	pfree(elem_values);
523 	pfree(elem_nulls);
524 	return selec;
525 }
526 
527 /*
528  * Estimate selectivity of "column @> const" and "column && const" based on
529  * most common element statistics.  This estimation assumes element
530  * occurrences are independent.
531  *
532  * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
533  * the array column's MCELEM statistics slot, or are NULL/0 if stats are
534  * not available.  array_data (of length nitems) is the constant's elements.
535  *
536  * Both the mcelem and array_data arrays are assumed presorted according
537  * to the element type's cmpfunc.  Null elements are not present.
538  *
539  * TODO: this estimate probably could be improved by using the distinct
540  * elements count histogram.  For example, excepting the special case of
541  * "column @> '{}'", we can multiply the calculated selectivity by the
542  * fraction of nonempty arrays in the column.
543  */
544 static Selectivity
mcelem_array_contain_overlap_selec(Datum * mcelem,int nmcelem,float4 * numbers,int nnumbers,Datum * array_data,int nitems,Oid operator,FmgrInfo * cmpfunc)545 mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
546 								   float4 *numbers, int nnumbers,
547 								   Datum *array_data, int nitems,
548 								   Oid operator, FmgrInfo *cmpfunc)
549 {
550 	Selectivity selec,
551 				elem_selec;
552 	int			mcelem_index,
553 				i;
554 	bool		use_bsearch;
555 	float4		minfreq;
556 
557 	/*
558 	 * There should be three more Numbers than Values, because the last three
559 	 * cells should hold minimal and maximal frequency among the non-null
560 	 * elements, and then the frequency of null elements.  Ignore the Numbers
561 	 * if not right.
562 	 */
563 	if (nnumbers != nmcelem + 3)
564 	{
565 		numbers = NULL;
566 		nnumbers = 0;
567 	}
568 
569 	if (numbers)
570 	{
571 		/* Grab the lowest observed frequency */
572 		minfreq = numbers[nmcelem];
573 	}
574 	else
575 	{
576 		/* Without statistics make some default assumptions */
577 		minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL;
578 	}
579 
580 	/* Decide whether it is faster to use binary search or not. */
581 	if (nitems * floor_log2((uint32) nmcelem) < nmcelem + nitems)
582 		use_bsearch = true;
583 	else
584 		use_bsearch = false;
585 
586 	if (operator == OID_ARRAY_CONTAINS_OP)
587 	{
588 		/*
589 		 * Initial selectivity for "column @> const" query is 1.0, and it will
590 		 * be decreased with each element of constant array.
591 		 */
592 		selec = 1.0;
593 	}
594 	else
595 	{
596 		/*
597 		 * Initial selectivity for "column && const" query is 0.0, and it will
598 		 * be increased with each element of constant array.
599 		 */
600 		selec = 0.0;
601 	}
602 
603 	/* Scan mcelem and array in parallel. */
604 	mcelem_index = 0;
605 	for (i = 0; i < nitems; i++)
606 	{
607 		bool		match = false;
608 
609 		/* Ignore any duplicates in the array data. */
610 		if (i > 0 &&
611 			element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0)
612 			continue;
613 
614 		/* Find the smallest MCELEM >= this array item. */
615 		if (use_bsearch)
616 		{
617 			match = find_next_mcelem(mcelem, nmcelem, array_data[i],
618 									 &mcelem_index, cmpfunc);
619 		}
620 		else
621 		{
622 			while (mcelem_index < nmcelem)
623 			{
624 				int			cmp = element_compare(&mcelem[mcelem_index],
625 												  &array_data[i],
626 												  cmpfunc);
627 
628 				if (cmp < 0)
629 					mcelem_index++;
630 				else
631 				{
632 					if (cmp == 0)
633 						match = true;	/* mcelem is found */
634 					break;
635 				}
636 			}
637 		}
638 
639 		if (match && numbers)
640 		{
641 			/* MCELEM matches the array item; use its frequency. */
642 			elem_selec = numbers[mcelem_index];
643 			mcelem_index++;
644 		}
645 		else
646 		{
647 			/*
648 			 * The element is not in MCELEM.  Punt, but assume that the
649 			 * selectivity cannot be more than minfreq / 2.
650 			 */
651 			elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2);
652 		}
653 
654 		/*
655 		 * Update overall selectivity using the current element's selectivity
656 		 * and an assumption of element occurrence independence.
657 		 */
658 		if (operator == OID_ARRAY_CONTAINS_OP)
659 			selec *= elem_selec;
660 		else
661 			selec = selec + elem_selec - selec * elem_selec;
662 
663 		/* Clamp intermediate results to stay sane despite roundoff error */
664 		CLAMP_PROBABILITY(selec);
665 	}
666 
667 	return selec;
668 }
669 
670 /*
671  * Estimate selectivity of "column <@ const" based on most common element
672  * statistics.
673  *
674  * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
675  * the array column's MCELEM statistics slot, or are NULL/0 if stats are
676  * not available.  array_data (of length nitems) is the constant's elements.
677  * hist (of length nhist) is from the array column's DECHIST statistics slot,
678  * or is NULL/0 if those stats are not available.
679  *
680  * Both the mcelem and array_data arrays are assumed presorted according
681  * to the element type's cmpfunc.  Null elements are not present.
682  *
683  * Independent element occurrence would imply a particular distribution of
684  * distinct element counts among matching rows.  Real data usually falsifies
685  * that assumption.  For example, in a set of 11-element integer arrays having
686  * elements in the range [0..10], element occurrences are typically not
687  * independent.  If they were, a sufficiently-large set would include all
688  * distinct element counts 0 through 11.  We correct for this using the
689  * histogram of distinct element counts.
690  *
691  * In the "column @> const" and "column && const" cases, we usually have a
692  * "const" with low number of elements (otherwise we have selectivity close
693  * to 0 or 1 respectively).  That's why the effect of dependence related
694  * to distinct element count distribution is negligible there.  In the
695  * "column <@ const" case, number of elements is usually high (otherwise we
696  * have selectivity close to 0).  That's why we should do a correction with
697  * the array distinct element count distribution here.
698  *
699  * Using the histogram of distinct element counts produces a different
700  * distribution law than independent occurrences of elements.  This
701  * distribution law can be described as follows:
702  *
703  * P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 *
704  *	  (1 - f2)^(1 - o2) * ... * fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m]
705  *
706  * where:
707  * o1, o2, ..., on - occurrences of elements 1, 2, ..., n
708  *		(1 - occurrence, 0 - no occurrence) in row
709  * f1, f2, ..., fn - frequencies of elements 1, 2, ..., n
710  *		(scalar values in [0..1]) according to collected statistics
711  * m = o1 + o2 + ... + on = total number of distinct elements in row
712  * hist[m] - histogram data for occurrence of m elements.
713  * ind[m] - probability of m occurrences from n events assuming their
714  *	  probabilities to be equal to frequencies of array elements.
715  *
716  * ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) *
717  * ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + .. on = m
718  */
719 static Selectivity
mcelem_array_contained_selec(Datum * mcelem,int nmcelem,float4 * numbers,int nnumbers,Datum * array_data,int nitems,float4 * hist,int nhist,Oid operator,FmgrInfo * cmpfunc)720 mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
721 							 float4 *numbers, int nnumbers,
722 							 Datum *array_data, int nitems,
723 							 float4 *hist, int nhist,
724 							 Oid operator, FmgrInfo *cmpfunc)
725 {
726 	int			mcelem_index,
727 				i,
728 				unique_nitems = 0;
729 	float		selec,
730 				minfreq,
731 				nullelem_freq;
732 	float	   *dist,
733 			   *mcelem_dist,
734 			   *hist_part;
735 	float		avg_count,
736 				mult,
737 				rest;
738 	float	   *elem_selec;
739 
740 	/*
741 	 * There should be three more Numbers than Values in the MCELEM slot,
742 	 * because the last three cells should hold minimal and maximal frequency
743 	 * among the non-null elements, and then the frequency of null elements.
744 	 * Punt if not right, because we can't do much without the element freqs.
745 	 */
746 	if (numbers == NULL || nnumbers != nmcelem + 3)
747 		return DEFAULT_CONTAIN_SEL;
748 
749 	/* Can't do much without a count histogram, either */
750 	if (hist == NULL || nhist < 3)
751 		return DEFAULT_CONTAIN_SEL;
752 
753 	/*
754 	 * Grab some of the summary statistics that compute_array_stats() stores:
755 	 * lowest frequency, frequency of null elements, and average distinct
756 	 * element count.
757 	 */
758 	minfreq = numbers[nmcelem];
759 	nullelem_freq = numbers[nmcelem + 2];
760 	avg_count = hist[nhist - 1];
761 
762 	/*
763 	 * "rest" will be the sum of the frequencies of all elements not
764 	 * represented in MCELEM.  The average distinct element count is the sum
765 	 * of the frequencies of *all* elements.  Begin with that; we will proceed
766 	 * to subtract the MCELEM frequencies.
767 	 */
768 	rest = avg_count;
769 
770 	/*
771 	 * mult is a multiplier representing estimate of probability that each
772 	 * mcelem that is not present in constant doesn't occur.
773 	 */
774 	mult = 1.0f;
775 
776 	/*
777 	 * elem_selec is array of estimated frequencies for elements in the
778 	 * constant.
779 	 */
780 	elem_selec = (float *) palloc(sizeof(float) * nitems);
781 
782 	/* Scan mcelem and array in parallel. */
783 	mcelem_index = 0;
784 	for (i = 0; i < nitems; i++)
785 	{
786 		bool		match = false;
787 
788 		/* Ignore any duplicates in the array data. */
789 		if (i > 0 &&
790 			element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0)
791 			continue;
792 
793 		/*
794 		 * Iterate over MCELEM until we find an entry greater than or equal to
795 		 * this element of the constant.  Update "rest" and "mult" for mcelem
796 		 * entries skipped over.
797 		 */
798 		while (mcelem_index < nmcelem)
799 		{
800 			int			cmp = element_compare(&mcelem[mcelem_index],
801 											  &array_data[i],
802 											  cmpfunc);
803 
804 			if (cmp < 0)
805 			{
806 				mult *= (1.0f - numbers[mcelem_index]);
807 				rest -= numbers[mcelem_index];
808 				mcelem_index++;
809 			}
810 			else
811 			{
812 				if (cmp == 0)
813 					match = true;		/* mcelem is found */
814 				break;
815 			}
816 		}
817 
818 		if (match)
819 		{
820 			/* MCELEM matches the array item. */
821 			elem_selec[unique_nitems] = numbers[mcelem_index];
822 			/* "rest" is decremented for all mcelems, matched or not */
823 			rest -= numbers[mcelem_index];
824 			mcelem_index++;
825 		}
826 		else
827 		{
828 			/*
829 			 * The element is not in MCELEM.  Punt, but assume that the
830 			 * selectivity cannot be more than minfreq / 2.
831 			 */
832 			elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL,
833 											minfreq / 2);
834 		}
835 
836 		unique_nitems++;
837 	}
838 
839 	/*
840 	 * If we handled all constant elements without exhausting the MCELEM
841 	 * array, finish walking it to complete calculation of "rest" and "mult".
842 	 */
843 	while (mcelem_index < nmcelem)
844 	{
845 		mult *= (1.0f - numbers[mcelem_index]);
846 		rest -= numbers[mcelem_index];
847 		mcelem_index++;
848 	}
849 
850 	/*
851 	 * The presence of many distinct rare elements materially decreases
852 	 * selectivity.  Use the Poisson distribution to estimate the probability
853 	 * of a column value having zero occurrences of such elements.  See above
854 	 * for the definition of "rest".
855 	 */
856 	mult *= exp(-rest);
857 
858 	/*----------
859 	 * Using the distinct element count histogram requires
860 	 *		O(unique_nitems * (nmcelem + unique_nitems))
861 	 * operations.  Beyond a certain computational cost threshold, it's
862 	 * reasonable to sacrifice accuracy for decreased planning time.  We limit
863 	 * the number of operations to EFFORT * nmcelem; since nmcelem is limited
864 	 * by the column's statistics target, the work done is user-controllable.
865 	 *
866 	 * If the number of operations would be too large, we can reduce it
867 	 * without losing all accuracy by reducing unique_nitems and considering
868 	 * only the most-common elements of the constant array.  To make the
869 	 * results exactly match what we would have gotten with only those
870 	 * elements to start with, we'd have to remove any discarded elements'
871 	 * frequencies from "mult", but since this is only an approximation
872 	 * anyway, we don't bother with that.  Therefore it's sufficient to qsort
873 	 * elem_selec[] and take the largest elements.  (They will no longer match
874 	 * up with the elements of array_data[], but we don't care.)
875 	 *----------
876 	 */
877 #define EFFORT 100
878 
879 	if ((nmcelem + unique_nitems) > 0 &&
880 		unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems))
881 	{
882 		/*
883 		 * Use the quadratic formula to solve for largest allowable N.  We
884 		 * have A = 1, B = nmcelem, C = - EFFORT * nmcelem.
885 		 */
886 		double		b = (double) nmcelem;
887 		int			n;
888 
889 		n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2);
890 
891 		/* Sort, then take just the first n elements */
892 		qsort(elem_selec, unique_nitems, sizeof(float),
893 			  float_compare_desc);
894 		unique_nitems = n;
895 	}
896 
897 	/*
898 	 * Calculate probabilities of each distinct element count for both mcelems
899 	 * and constant elements.  At this point, assume independent element
900 	 * occurrence.
901 	 */
902 	dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f);
903 	mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest);
904 
905 	/* ignore hist[nhist-1], which is the average not a histogram member */
906 	hist_part = calc_hist(hist, nhist - 1, unique_nitems);
907 
908 	selec = 0.0f;
909 	for (i = 0; i <= unique_nitems; i++)
910 	{
911 		/*
912 		 * mult * dist[i] / mcelem_dist[i] gives us probability of qual
913 		 * matching from assumption of independent element occurrence with the
914 		 * condition that distinct element count = i.
915 		 */
916 		if (mcelem_dist[i] > 0)
917 			selec += hist_part[i] * mult * dist[i] / mcelem_dist[i];
918 	}
919 
920 	pfree(dist);
921 	pfree(mcelem_dist);
922 	pfree(hist_part);
923 	pfree(elem_selec);
924 
925 	/* Take into account occurrence of NULL element. */
926 	selec *= (1.0f - nullelem_freq);
927 
928 	CLAMP_PROBABILITY(selec);
929 
930 	return selec;
931 }
932 
933 /*
934  * Calculate the first n distinct element count probabilities from a
935  * histogram of distinct element counts.
936  *
937  * Returns a palloc'd array of n+1 entries, with array[k] being the
938  * probability of element count k, k in [0..n].
939  *
940  * We assume that a histogram box with bounds a and b gives 1 / ((b - a + 1) *
941  * (nhist - 1)) probability to each value in (a,b) and an additional half of
942  * that to a and b themselves.
943  */
944 static float *
calc_hist(const float4 * hist,int nhist,int n)945 calc_hist(const float4 *hist, int nhist, int n)
946 {
947 	float	   *hist_part;
948 	int			k,
949 				i = 0;
950 	float		prev_interval = 0,
951 				next_interval;
952 	float		frac;
953 
954 	hist_part = (float *) palloc((n + 1) * sizeof(float));
955 
956 	/*
957 	 * frac is a probability contribution for each interval between histogram
958 	 * values.  We have nhist - 1 intervals, so contribution of each one will
959 	 * be 1 / (nhist - 1).
960 	 */
961 	frac = 1.0f / ((float) (nhist - 1));
962 
963 	for (k = 0; k <= n; k++)
964 	{
965 		int			count = 0;
966 
967 		/*
968 		 * Count the histogram boundaries equal to k.  (Although the histogram
969 		 * should theoretically contain only exact integers, entries are
970 		 * floats so there could be roundoff error in large values.  Treat any
971 		 * fractional value as equal to the next larger k.)
972 		 */
973 		while (i < nhist && hist[i] <= k)
974 		{
975 			count++;
976 			i++;
977 		}
978 
979 		if (count > 0)
980 		{
981 			/* k is an exact bound for at least one histogram box. */
982 			float		val;
983 
984 			/* Find length between current histogram value and the next one */
985 			if (i < nhist)
986 				next_interval = hist[i] - hist[i - 1];
987 			else
988 				next_interval = 0;
989 
990 			/*
991 			 * count - 1 histogram boxes contain k exclusively.  They
992 			 * contribute a total of (count - 1) * frac probability.  Also
993 			 * factor in the partial histogram boxes on either side.
994 			 */
995 			val = (float) (count - 1);
996 			if (next_interval > 0)
997 				val += 0.5f / next_interval;
998 			if (prev_interval > 0)
999 				val += 0.5f / prev_interval;
1000 			hist_part[k] = frac * val;
1001 
1002 			prev_interval = next_interval;
1003 		}
1004 		else
1005 		{
1006 			/* k does not appear as an exact histogram bound. */
1007 			if (prev_interval > 0)
1008 				hist_part[k] = frac / prev_interval;
1009 			else
1010 				hist_part[k] = 0.0f;
1011 		}
1012 	}
1013 
1014 	return hist_part;
1015 }
1016 
1017 /*
1018  * Consider n independent events with probabilities p[].  This function
1019  * calculates probabilities of exact k of events occurrence for k in [0..m].
1020  * Returns a palloc'd array of size m+1.
1021  *
1022  * "rest" is the sum of the probabilities of all low-probability events not
1023  * included in p.
1024  *
1025  * Imagine matrix M of size (n + 1) x (m + 1).  Element M[i,j] denotes the
1026  * probability that exactly j of first i events occur.  Obviously M[0,0] = 1.
1027  * For any constant j, each increment of i increases the probability iff the
1028  * event occurs.  So, by the law of total probability:
1029  *	M[i,j] = M[i - 1, j] * (1 - p[i]) + M[i - 1, j - 1] * p[i]
1030  *		for i > 0, j > 0.
1031  *	M[i,0] = M[i - 1, 0] * (1 - p[i]) for i > 0.
1032  */
1033 static float *
calc_distr(const float * p,int n,int m,float rest)1034 calc_distr(const float *p, int n, int m, float rest)
1035 {
1036 	float	   *row,
1037 			   *prev_row,
1038 			   *tmp;
1039 	int			i,
1040 				j;
1041 
1042 	/*
1043 	 * Since we return only the last row of the matrix and need only the
1044 	 * current and previous row for calculations, allocate two rows.
1045 	 */
1046 	row = (float *) palloc((m + 1) * sizeof(float));
1047 	prev_row = (float *) palloc((m + 1) * sizeof(float));
1048 
1049 	/* M[0,0] = 1 */
1050 	row[0] = 1.0f;
1051 	for (i = 1; i <= n; i++)
1052 	{
1053 		float		t = p[i - 1];
1054 
1055 		/* Swap rows */
1056 		tmp = row;
1057 		row = prev_row;
1058 		prev_row = tmp;
1059 
1060 		/* Calculate next row */
1061 		for (j = 0; j <= i && j <= m; j++)
1062 		{
1063 			float		val = 0.0f;
1064 
1065 			if (j < i)
1066 				val += prev_row[j] * (1.0f - t);
1067 			if (j > 0)
1068 				val += prev_row[j - 1] * t;
1069 			row[j] = val;
1070 		}
1071 	}
1072 
1073 	/*
1074 	 * The presence of many distinct rare (not in "p") elements materially
1075 	 * decreases selectivity.  Model their collective occurrence with the
1076 	 * Poisson distribution.
1077 	 */
1078 	if (rest > DEFAULT_CONTAIN_SEL)
1079 	{
1080 		float		t;
1081 
1082 		/* Swap rows */
1083 		tmp = row;
1084 		row = prev_row;
1085 		prev_row = tmp;
1086 
1087 		for (i = 0; i <= m; i++)
1088 			row[i] = 0.0f;
1089 
1090 		/* Value of Poisson distribution for 0 occurrences */
1091 		t = exp(-rest);
1092 
1093 		/*
1094 		 * Calculate convolution of previously computed distribution and the
1095 		 * Poisson distribution.
1096 		 */
1097 		for (i = 0; i <= m; i++)
1098 		{
1099 			for (j = 0; j <= m - i; j++)
1100 				row[j + i] += prev_row[j] * t;
1101 
1102 			/* Get Poisson distribution value for (i + 1) occurrences */
1103 			t *= rest / (float) (i + 1);
1104 		}
1105 	}
1106 
1107 	pfree(prev_row);
1108 	return row;
1109 }
1110 
1111 /* Fast function for floor value of 2 based logarithm calculation. */
1112 static int
floor_log2(uint32 n)1113 floor_log2(uint32 n)
1114 {
1115 	int			logval = 0;
1116 
1117 	if (n == 0)
1118 		return -1;
1119 	if (n >= (1 << 16))
1120 	{
1121 		n >>= 16;
1122 		logval += 16;
1123 	}
1124 	if (n >= (1 << 8))
1125 	{
1126 		n >>= 8;
1127 		logval += 8;
1128 	}
1129 	if (n >= (1 << 4))
1130 	{
1131 		n >>= 4;
1132 		logval += 4;
1133 	}
1134 	if (n >= (1 << 2))
1135 	{
1136 		n >>= 2;
1137 		logval += 2;
1138 	}
1139 	if (n >= (1 << 1))
1140 	{
1141 		logval += 1;
1142 	}
1143 	return logval;
1144 }
1145 
1146 /*
1147  * find_next_mcelem binary-searches a most common elements array, starting
1148  * from *index, for the first member >= value.  It saves the position of the
1149  * match into *index and returns true if it's an exact match.  (Note: we
1150  * assume the mcelem elements are distinct so there can't be more than one
1151  * exact match.)
1152  */
1153 static bool
find_next_mcelem(Datum * mcelem,int nmcelem,Datum value,int * index,FmgrInfo * cmpfunc)1154 find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index,
1155 				 FmgrInfo *cmpfunc)
1156 {
1157 	int			l = *index,
1158 				r = nmcelem - 1,
1159 				i,
1160 				res;
1161 
1162 	while (l <= r)
1163 	{
1164 		i = (l + r) / 2;
1165 		res = element_compare(&mcelem[i], &value, cmpfunc);
1166 		if (res == 0)
1167 		{
1168 			*index = i;
1169 			return true;
1170 		}
1171 		else if (res < 0)
1172 			l = i + 1;
1173 		else
1174 			r = i - 1;
1175 	}
1176 	*index = l;
1177 	return false;
1178 }
1179 
1180 /*
1181  * Comparison function for elements.
1182  *
1183  * We use the element type's default btree opclass, and the default collation
1184  * if the type is collation-sensitive.
1185  *
1186  * XXX consider using SortSupport infrastructure
1187  */
1188 static int
element_compare(const void * key1,const void * key2,void * arg)1189 element_compare(const void *key1, const void *key2, void *arg)
1190 {
1191 	Datum		d1 = *((const Datum *) key1);
1192 	Datum		d2 = *((const Datum *) key2);
1193 	FmgrInfo   *cmpfunc = (FmgrInfo *) arg;
1194 	Datum		c;
1195 
1196 	c = FunctionCall2Coll(cmpfunc, DEFAULT_COLLATION_OID, d1, d2);
1197 	return DatumGetInt32(c);
1198 }
1199 
1200 /*
1201  * Comparison function for sorting floats into descending order.
1202  */
1203 static int
float_compare_desc(const void * key1,const void * key2)1204 float_compare_desc(const void *key1, const void *key2)
1205 {
1206 	float		d1 = *((const float *) key1);
1207 	float		d2 = *((const float *) key2);
1208 
1209 	if (d1 > d2)
1210 		return -1;
1211 	else if (d1 < d2)
1212 		return 1;
1213 	else
1214 		return 0;
1215 }
1216