1 /*-------------------------------------------------------------------------
2  *
3  * dependencies.c
4  *	  POSTGRES functional dependencies
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  *	  src/backend/statistics/dependencies.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include "access/htup_details.h"
17 #include "access/sysattr.h"
18 #include "catalog/pg_operator.h"
19 #include "catalog/pg_statistic_ext.h"
20 #include "catalog/pg_statistic_ext_data.h"
21 #include "lib/stringinfo.h"
22 #include "nodes/nodeFuncs.h"
23 #include "nodes/nodes.h"
24 #include "nodes/pathnodes.h"
25 #include "optimizer/clauses.h"
26 #include "optimizer/optimizer.h"
27 #include "statistics/extended_stats_internal.h"
28 #include "statistics/statistics.h"
29 #include "utils/bytea.h"
30 #include "utils/fmgroids.h"
31 #include "utils/fmgrprotos.h"
32 #include "utils/lsyscache.h"
33 #include "utils/memutils.h"
34 #include "utils/selfuncs.h"
35 #include "utils/syscache.h"
36 #include "utils/typcache.h"
37 
38 /* size of the struct header fields (magic, type, ndeps) */
39 #define SizeOfHeader		(3 * sizeof(uint32))
40 
41 /* size of a serialized dependency (degree, natts, atts) */
42 #define SizeOfItem(natts) \
43 	(sizeof(double) + sizeof(AttrNumber) * (1 + (natts)))
44 
45 /* minimal size of a dependency (with two attributes) */
46 #define MinSizeOfItem	SizeOfItem(2)
47 
48 /* minimal size of dependencies, when all deps are minimal */
49 #define MinSizeOfItems(ndeps) \
50 	(SizeOfHeader + (ndeps) * MinSizeOfItem)
51 
52 /*
53  * Internal state for DependencyGenerator of dependencies. Dependencies are similar to
54  * k-permutations of n elements, except that the order does not matter for the
55  * first (k-1) elements. That is, (a,b=>c) and (b,a=>c) are equivalent.
56  */
57 typedef struct DependencyGeneratorData
58 {
59 	int			k;				/* size of the dependency */
60 	int			n;				/* number of possible attributes */
61 	int			current;		/* next dependency to return (index) */
62 	AttrNumber	ndependencies;	/* number of dependencies generated */
63 	AttrNumber *dependencies;	/* array of pre-generated dependencies	*/
64 } DependencyGeneratorData;
65 
66 typedef DependencyGeneratorData *DependencyGenerator;
67 
68 static void generate_dependencies_recurse(DependencyGenerator state,
69 										  int index, AttrNumber start, AttrNumber *current);
70 static void generate_dependencies(DependencyGenerator state);
71 static DependencyGenerator DependencyGenerator_init(int n, int k);
72 static void DependencyGenerator_free(DependencyGenerator state);
73 static AttrNumber *DependencyGenerator_next(DependencyGenerator state);
74 static double dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency);
75 static bool dependency_is_fully_matched(MVDependency *dependency,
76 										Bitmapset *attnums);
77 static bool dependency_is_compatible_clause(Node *clause, Index relid,
78 											AttrNumber *attnum);
79 static bool dependency_is_compatible_expression(Node *clause, Index relid,
80 												List *statlist, Node **expr);
81 static MVDependency *find_strongest_dependency(MVDependencies **dependencies,
82 											   int ndependencies, Bitmapset *attnums);
83 static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
84 												 int varRelid, JoinType jointype,
85 												 SpecialJoinInfo *sjinfo,
86 												 MVDependency **dependencies,
87 												 int ndependencies,
88 												 AttrNumber *list_attnums,
89 												 Bitmapset **estimatedclauses);
90 
91 static void
generate_dependencies_recurse(DependencyGenerator state,int index,AttrNumber start,AttrNumber * current)92 generate_dependencies_recurse(DependencyGenerator state, int index,
93 							  AttrNumber start, AttrNumber *current)
94 {
95 	/*
96 	 * The generator handles the first (k-1) elements differently from the
97 	 * last element.
98 	 */
99 	if (index < (state->k - 1))
100 	{
101 		AttrNumber	i;
102 
103 		/*
104 		 * The first (k-1) values have to be in ascending order, which we
105 		 * generate recursively.
106 		 */
107 
108 		for (i = start; i < state->n; i++)
109 		{
110 			current[index] = i;
111 			generate_dependencies_recurse(state, (index + 1), (i + 1), current);
112 		}
113 	}
114 	else
115 	{
116 		int			i;
117 
118 		/*
119 		 * the last element is the implied value, which does not respect the
120 		 * ascending order. We just need to check that the value is not in the
121 		 * first (k-1) elements.
122 		 */
123 
124 		for (i = 0; i < state->n; i++)
125 		{
126 			int			j;
127 			bool		match = false;
128 
129 			current[index] = i;
130 
131 			for (j = 0; j < index; j++)
132 			{
133 				if (current[j] == i)
134 				{
135 					match = true;
136 					break;
137 				}
138 			}
139 
140 			/*
141 			 * If the value is not found in the first part of the dependency,
142 			 * we're done.
143 			 */
144 			if (!match)
145 			{
146 				state->dependencies = (AttrNumber *) repalloc(state->dependencies,
147 															  state->k * (state->ndependencies + 1) * sizeof(AttrNumber));
148 				memcpy(&state->dependencies[(state->k * state->ndependencies)],
149 					   current, state->k * sizeof(AttrNumber));
150 				state->ndependencies++;
151 			}
152 		}
153 	}
154 }
155 
156 /* generate all dependencies (k-permutations of n elements) */
157 static void
generate_dependencies(DependencyGenerator state)158 generate_dependencies(DependencyGenerator state)
159 {
160 	AttrNumber *current = (AttrNumber *) palloc0(sizeof(AttrNumber) * state->k);
161 
162 	generate_dependencies_recurse(state, 0, 0, current);
163 
164 	pfree(current);
165 }
166 
167 /*
168  * initialize the DependencyGenerator of variations, and prebuild the variations
169  *
170  * This pre-builds all the variations. We could also generate them in
171  * DependencyGenerator_next(), but this seems simpler.
172  */
173 static DependencyGenerator
DependencyGenerator_init(int n,int k)174 DependencyGenerator_init(int n, int k)
175 {
176 	DependencyGenerator state;
177 
178 	Assert((n >= k) && (k > 0));
179 
180 	/* allocate the DependencyGenerator state */
181 	state = (DependencyGenerator) palloc0(sizeof(DependencyGeneratorData));
182 	state->dependencies = (AttrNumber *) palloc(k * sizeof(AttrNumber));
183 
184 	state->ndependencies = 0;
185 	state->current = 0;
186 	state->k = k;
187 	state->n = n;
188 
189 	/* now actually pre-generate all the variations */
190 	generate_dependencies(state);
191 
192 	return state;
193 }
194 
195 /* free the DependencyGenerator state */
196 static void
DependencyGenerator_free(DependencyGenerator state)197 DependencyGenerator_free(DependencyGenerator state)
198 {
199 	pfree(state->dependencies);
200 	pfree(state);
201 
202 }
203 
204 /* generate next combination */
205 static AttrNumber *
DependencyGenerator_next(DependencyGenerator state)206 DependencyGenerator_next(DependencyGenerator state)
207 {
208 	if (state->current == state->ndependencies)
209 		return NULL;
210 
211 	return &state->dependencies[state->k * state->current++];
212 }
213 
214 
215 /*
216  * validates functional dependency on the data
217  *
218  * An actual work horse of detecting functional dependencies. Given a variation
219  * of k attributes, it checks that the first (k-1) are sufficient to determine
220  * the last one.
221  */
222 static double
dependency_degree(StatsBuildData * data,int k,AttrNumber * dependency)223 dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency)
224 {
225 	int			i,
226 				nitems;
227 	MultiSortSupport mss;
228 	SortItem   *items;
229 	AttrNumber *attnums_dep;
230 
231 	/* counters valid within a group */
232 	int			group_size = 0;
233 	int			n_violations = 0;
234 
235 	/* total number of rows supporting (consistent with) the dependency */
236 	int			n_supporting_rows = 0;
237 
238 	/* Make sure we have at least two input attributes. */
239 	Assert(k >= 2);
240 
241 	/* sort info for all attributes columns */
242 	mss = multi_sort_init(k);
243 
244 	/*
245 	 * Translate the array of indexes to regular attnums for the dependency
246 	 * (we will need this to identify the columns in StatsBuildData).
247 	 */
248 	attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber));
249 	for (i = 0; i < k; i++)
250 		attnums_dep[i] = data->attnums[dependency[i]];
251 
252 	/*
253 	 * Verify the dependency (a,b,...)->z, using a rather simple algorithm:
254 	 *
255 	 * (a) sort the data lexicographically
256 	 *
257 	 * (b) split the data into groups by first (k-1) columns
258 	 *
259 	 * (c) for each group count different values in the last column
260 	 *
261 	 * We use the column data types' default sort operators and collations;
262 	 * perhaps at some point it'd be worth using column-specific collations?
263 	 */
264 
265 	/* prepare the sort function for the dimensions */
266 	for (i = 0; i < k; i++)
267 	{
268 		VacAttrStats *colstat = data->stats[dependency[i]];
269 		TypeCacheEntry *type;
270 
271 		type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
272 		if (type->lt_opr == InvalidOid) /* shouldn't happen */
273 			elog(ERROR, "cache lookup failed for ordering operator for type %u",
274 				 colstat->attrtypid);
275 
276 		/* prepare the sort function for this dimension */
277 		multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
278 	}
279 
280 	/*
281 	 * build an array of SortItem(s) sorted using the multi-sort support
282 	 *
283 	 * XXX This relies on all stats entries pointing to the same tuple
284 	 * descriptor.  For now that assumption holds, but it might change in the
285 	 * future for example if we support statistics on multiple tables.
286 	 */
287 	items = build_sorted_items(data, &nitems, mss, k, attnums_dep);
288 
289 	/*
290 	 * Walk through the sorted array, split it into rows according to the
291 	 * first (k-1) columns. If there's a single value in the last column, we
292 	 * count the group as 'supporting' the functional dependency. Otherwise we
293 	 * count it as contradicting.
294 	 */
295 
296 	/* start with the first row forming a group */
297 	group_size = 1;
298 
299 	/* loop 1 beyond the end of the array so that we count the final group */
300 	for (i = 1; i <= nitems; i++)
301 	{
302 		/*
303 		 * Check if the group ended, which may be either because we processed
304 		 * all the items (i==nitems), or because the i-th item is not equal to
305 		 * the preceding one.
306 		 */
307 		if (i == nitems ||
308 			multi_sort_compare_dims(0, k - 2, &items[i - 1], &items[i], mss) != 0)
309 		{
310 			/*
311 			 * If no violations were found in the group then track the rows of
312 			 * the group as supporting the functional dependency.
313 			 */
314 			if (n_violations == 0)
315 				n_supporting_rows += group_size;
316 
317 			/* Reset counters for the new group */
318 			n_violations = 0;
319 			group_size = 1;
320 			continue;
321 		}
322 		/* first columns match, but the last one does not (so contradicting) */
323 		else if (multi_sort_compare_dim(k - 1, &items[i - 1], &items[i], mss) != 0)
324 			n_violations++;
325 
326 		group_size++;
327 	}
328 
329 	/* Compute the 'degree of validity' as (supporting/total). */
330 	return (n_supporting_rows * 1.0 / data->numrows);
331 }
332 
333 /*
334  * detects functional dependencies between groups of columns
335  *
336  * Generates all possible subsets of columns (variations) and computes
337  * the degree of validity for each one. For example when creating statistics
338  * on three columns (a,b,c) there are 9 possible dependencies
339  *
340  *	   two columns			  three columns
341  *	   -----------			  -------------
342  *	   (a) -> b				  (a,b) -> c
343  *	   (a) -> c				  (a,c) -> b
344  *	   (b) -> a				  (b,c) -> a
345  *	   (b) -> c
346  *	   (c) -> a
347  *	   (c) -> b
348  */
349 MVDependencies *
statext_dependencies_build(StatsBuildData * data)350 statext_dependencies_build(StatsBuildData *data)
351 {
352 	int			i,
353 				k;
354 
355 	/* result */
356 	MVDependencies *dependencies = NULL;
357 	MemoryContext	cxt;
358 
359 	Assert(data->nattnums >= 2);
360 
361 	/* tracks memory allocated by dependency_degree calls */
362 	cxt = AllocSetContextCreate(CurrentMemoryContext,
363 								"dependency_degree cxt",
364 								ALLOCSET_DEFAULT_SIZES);
365 
366 	/*
367 	 * We'll try build functional dependencies starting from the smallest ones
368 	 * covering just 2 columns, to the largest ones, covering all columns
369 	 * included in the statistics object.  We start from the smallest ones
370 	 * because we want to be able to skip already implied ones.
371 	 */
372 	for (k = 2; k <= data->nattnums; k++)
373 	{
374 		AttrNumber *dependency; /* array with k elements */
375 
376 		/* prepare a DependencyGenerator of variation */
377 		DependencyGenerator DependencyGenerator = DependencyGenerator_init(data->nattnums, k);
378 
379 		/* generate all possible variations of k values (out of n) */
380 		while ((dependency = DependencyGenerator_next(DependencyGenerator)))
381 		{
382 			double		degree;
383 			MVDependency *d;
384 			MemoryContext oldcxt;
385 
386 			/* release memory used by dependency degree calculation */
387 			oldcxt = MemoryContextSwitchTo(cxt);
388 
389 			/* compute how valid the dependency seems */
390 			degree = dependency_degree(data, k, dependency);
391 
392 			MemoryContextSwitchTo(oldcxt);
393 			MemoryContextReset(cxt);
394 
395 			/*
396 			 * if the dependency seems entirely invalid, don't store it
397 			 */
398 			if (degree == 0.0)
399 				continue;
400 
401 			d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
402 										 + k * sizeof(AttrNumber));
403 
404 			/* copy the dependency (and keep the indexes into stxkeys) */
405 			d->degree = degree;
406 			d->nattributes = k;
407 			for (i = 0; i < k; i++)
408 				d->attributes[i] = data->attnums[dependency[i]];
409 
410 			/* initialize the list of dependencies */
411 			if (dependencies == NULL)
412 			{
413 				dependencies
414 					= (MVDependencies *) palloc0(sizeof(MVDependencies));
415 
416 				dependencies->magic = STATS_DEPS_MAGIC;
417 				dependencies->type = STATS_DEPS_TYPE_BASIC;
418 				dependencies->ndeps = 0;
419 			}
420 
421 			dependencies->ndeps++;
422 			dependencies = (MVDependencies *) repalloc(dependencies,
423 													   offsetof(MVDependencies, deps)
424 													   + dependencies->ndeps * sizeof(MVDependency *));
425 
426 			dependencies->deps[dependencies->ndeps - 1] = d;
427 		}
428 
429 		/*
430 		 * we're done with variations of k elements, so free the
431 		 * DependencyGenerator
432 		 */
433 		DependencyGenerator_free(DependencyGenerator);
434 	}
435 
436 	MemoryContextDelete(cxt);
437 
438 	return dependencies;
439 }
440 
441 
442 /*
443  * Serialize list of dependencies into a bytea value.
444  */
445 bytea *
statext_dependencies_serialize(MVDependencies * dependencies)446 statext_dependencies_serialize(MVDependencies *dependencies)
447 {
448 	int			i;
449 	bytea	   *output;
450 	char	   *tmp;
451 	Size		len;
452 
453 	/* we need to store ndeps, with a number of attributes for each one */
454 	len = VARHDRSZ + SizeOfHeader;
455 
456 	/* and also include space for the actual attribute numbers and degrees */
457 	for (i = 0; i < dependencies->ndeps; i++)
458 		len += SizeOfItem(dependencies->deps[i]->nattributes);
459 
460 	output = (bytea *) palloc0(len);
461 	SET_VARSIZE(output, len);
462 
463 	tmp = VARDATA(output);
464 
465 	/* Store the base struct values (magic, type, ndeps) */
466 	memcpy(tmp, &dependencies->magic, sizeof(uint32));
467 	tmp += sizeof(uint32);
468 	memcpy(tmp, &dependencies->type, sizeof(uint32));
469 	tmp += sizeof(uint32);
470 	memcpy(tmp, &dependencies->ndeps, sizeof(uint32));
471 	tmp += sizeof(uint32);
472 
473 	/* store number of attributes and attribute numbers for each dependency */
474 	for (i = 0; i < dependencies->ndeps; i++)
475 	{
476 		MVDependency *d = dependencies->deps[i];
477 
478 		memcpy(tmp, &d->degree, sizeof(double));
479 		tmp += sizeof(double);
480 
481 		memcpy(tmp, &d->nattributes, sizeof(AttrNumber));
482 		tmp += sizeof(AttrNumber);
483 
484 		memcpy(tmp, d->attributes, sizeof(AttrNumber) * d->nattributes);
485 		tmp += sizeof(AttrNumber) * d->nattributes;
486 
487 		/* protect against overflow */
488 		Assert(tmp <= ((char *) output + len));
489 	}
490 
491 	/* make sure we've produced exactly the right amount of data */
492 	Assert(tmp == ((char *) output + len));
493 
494 	return output;
495 }
496 
497 /*
498  * Reads serialized dependencies into MVDependencies structure.
499  */
500 MVDependencies *
statext_dependencies_deserialize(bytea * data)501 statext_dependencies_deserialize(bytea *data)
502 {
503 	int			i;
504 	Size		min_expected_size;
505 	MVDependencies *dependencies;
506 	char	   *tmp;
507 
508 	if (data == NULL)
509 		return NULL;
510 
511 	if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader)
512 		elog(ERROR, "invalid MVDependencies size %zd (expected at least %zd)",
513 			 VARSIZE_ANY_EXHDR(data), SizeOfHeader);
514 
515 	/* read the MVDependencies header */
516 	dependencies = (MVDependencies *) palloc0(sizeof(MVDependencies));
517 
518 	/* initialize pointer to the data part (skip the varlena header) */
519 	tmp = VARDATA_ANY(data);
520 
521 	/* read the header fields and perform basic sanity checks */
522 	memcpy(&dependencies->magic, tmp, sizeof(uint32));
523 	tmp += sizeof(uint32);
524 	memcpy(&dependencies->type, tmp, sizeof(uint32));
525 	tmp += sizeof(uint32);
526 	memcpy(&dependencies->ndeps, tmp, sizeof(uint32));
527 	tmp += sizeof(uint32);
528 
529 	if (dependencies->magic != STATS_DEPS_MAGIC)
530 		elog(ERROR, "invalid dependency magic %d (expected %d)",
531 			 dependencies->magic, STATS_DEPS_MAGIC);
532 
533 	if (dependencies->type != STATS_DEPS_TYPE_BASIC)
534 		elog(ERROR, "invalid dependency type %d (expected %d)",
535 			 dependencies->type, STATS_DEPS_TYPE_BASIC);
536 
537 	if (dependencies->ndeps == 0)
538 		elog(ERROR, "invalid zero-length item array in MVDependencies");
539 
540 	/* what minimum bytea size do we expect for those parameters */
541 	min_expected_size = SizeOfItem(dependencies->ndeps);
542 
543 	if (VARSIZE_ANY_EXHDR(data) < min_expected_size)
544 		elog(ERROR, "invalid dependencies size %zd (expected at least %zd)",
545 			 VARSIZE_ANY_EXHDR(data), min_expected_size);
546 
547 	/* allocate space for the MCV items */
548 	dependencies = repalloc(dependencies, offsetof(MVDependencies, deps)
549 							+ (dependencies->ndeps * sizeof(MVDependency *)));
550 
551 	for (i = 0; i < dependencies->ndeps; i++)
552 	{
553 		double		degree;
554 		AttrNumber	k;
555 		MVDependency *d;
556 
557 		/* degree of validity */
558 		memcpy(&degree, tmp, sizeof(double));
559 		tmp += sizeof(double);
560 
561 		/* number of attributes */
562 		memcpy(&k, tmp, sizeof(AttrNumber));
563 		tmp += sizeof(AttrNumber);
564 
565 		/* is the number of attributes valid? */
566 		Assert((k >= 2) && (k <= STATS_MAX_DIMENSIONS));
567 
568 		/* now that we know the number of attributes, allocate the dependency */
569 		d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
570 									 + (k * sizeof(AttrNumber)));
571 
572 		d->degree = degree;
573 		d->nattributes = k;
574 
575 		/* copy attribute numbers */
576 		memcpy(d->attributes, tmp, sizeof(AttrNumber) * d->nattributes);
577 		tmp += sizeof(AttrNumber) * d->nattributes;
578 
579 		dependencies->deps[i] = d;
580 
581 		/* still within the bytea */
582 		Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
583 	}
584 
585 	/* we should have consumed the whole bytea exactly */
586 	Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
587 
588 	return dependencies;
589 }
590 
591 /*
592  * dependency_is_fully_matched
593  *		checks that a functional dependency is fully matched given clauses on
594  *		attributes (assuming the clauses are suitable equality clauses)
595  */
596 static bool
dependency_is_fully_matched(MVDependency * dependency,Bitmapset * attnums)597 dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums)
598 {
599 	int			j;
600 
601 	/*
602 	 * Check that the dependency actually is fully covered by clauses. We have
603 	 * to translate all attribute numbers, as those are referenced
604 	 */
605 	for (j = 0; j < dependency->nattributes; j++)
606 	{
607 		int			attnum = dependency->attributes[j];
608 
609 		if (!bms_is_member(attnum, attnums))
610 			return false;
611 	}
612 
613 	return true;
614 }
615 
616 /*
617  * statext_dependencies_load
618  *		Load the functional dependencies for the indicated pg_statistic_ext tuple
619  */
620 MVDependencies *
statext_dependencies_load(Oid mvoid)621 statext_dependencies_load(Oid mvoid)
622 {
623 	MVDependencies *result;
624 	bool		isnull;
625 	Datum		deps;
626 	HeapTuple	htup;
627 
628 	htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid));
629 	if (!HeapTupleIsValid(htup))
630 		elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
631 
632 	deps = SysCacheGetAttr(STATEXTDATASTXOID, htup,
633 						   Anum_pg_statistic_ext_data_stxddependencies, &isnull);
634 	if (isnull)
635 		elog(ERROR,
636 			 "requested statistics kind \"%c\" is not yet built for statistics object %u",
637 			 STATS_EXT_DEPENDENCIES, mvoid);
638 
639 	result = statext_dependencies_deserialize(DatumGetByteaPP(deps));
640 
641 	ReleaseSysCache(htup);
642 
643 	return result;
644 }
645 
646 /*
647  * pg_dependencies_in		- input routine for type pg_dependencies.
648  *
649  * pg_dependencies is real enough to be a table column, but it has no operations
650  * of its own, and disallows input too
651  */
652 Datum
pg_dependencies_in(PG_FUNCTION_ARGS)653 pg_dependencies_in(PG_FUNCTION_ARGS)
654 {
655 	/*
656 	 * pg_node_list stores the data in binary form and parsing text input is
657 	 * not needed, so disallow this.
658 	 */
659 	ereport(ERROR,
660 			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
661 			 errmsg("cannot accept a value of type %s", "pg_dependencies")));
662 
663 	PG_RETURN_VOID();			/* keep compiler quiet */
664 }
665 
666 /*
667  * pg_dependencies		- output routine for type pg_dependencies.
668  */
669 Datum
pg_dependencies_out(PG_FUNCTION_ARGS)670 pg_dependencies_out(PG_FUNCTION_ARGS)
671 {
672 	bytea	   *data = PG_GETARG_BYTEA_PP(0);
673 	MVDependencies *dependencies = statext_dependencies_deserialize(data);
674 	int			i,
675 				j;
676 	StringInfoData str;
677 
678 	initStringInfo(&str);
679 	appendStringInfoChar(&str, '{');
680 
681 	for (i = 0; i < dependencies->ndeps; i++)
682 	{
683 		MVDependency *dependency = dependencies->deps[i];
684 
685 		if (i > 0)
686 			appendStringInfoString(&str, ", ");
687 
688 		appendStringInfoChar(&str, '"');
689 		for (j = 0; j < dependency->nattributes; j++)
690 		{
691 			if (j == dependency->nattributes - 1)
692 				appendStringInfoString(&str, " => ");
693 			else if (j > 0)
694 				appendStringInfoString(&str, ", ");
695 
696 			appendStringInfo(&str, "%d", dependency->attributes[j]);
697 		}
698 		appendStringInfo(&str, "\": %f", dependency->degree);
699 	}
700 
701 	appendStringInfoChar(&str, '}');
702 
703 	PG_RETURN_CSTRING(str.data);
704 }
705 
706 /*
707  * pg_dependencies_recv		- binary input routine for type pg_dependencies.
708  */
709 Datum
pg_dependencies_recv(PG_FUNCTION_ARGS)710 pg_dependencies_recv(PG_FUNCTION_ARGS)
711 {
712 	ereport(ERROR,
713 			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
714 			 errmsg("cannot accept a value of type %s", "pg_dependencies")));
715 
716 	PG_RETURN_VOID();			/* keep compiler quiet */
717 }
718 
719 /*
720  * pg_dependencies_send		- binary output routine for type pg_dependencies.
721  *
722  * Functional dependencies are serialized in a bytea value (although the type
723  * is named differently), so let's just send that.
724  */
725 Datum
pg_dependencies_send(PG_FUNCTION_ARGS)726 pg_dependencies_send(PG_FUNCTION_ARGS)
727 {
728 	return byteasend(fcinfo);
729 }
730 
731 /*
732  * dependency_is_compatible_clause
733  *		Determines if the clause is compatible with functional dependencies
734  *
735  * Only clauses that have the form of equality to a pseudoconstant, or can be
736  * interpreted that way, are currently accepted.  Furthermore the variable
737  * part of the clause must be a simple Var belonging to the specified
738  * relation, whose attribute number we return in *attnum on success.
739  */
740 static bool
dependency_is_compatible_clause(Node * clause,Index relid,AttrNumber * attnum)741 dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
742 {
743 	Var		   *var;
744 	Node	   *clause_expr;
745 
746 	if (IsA(clause, RestrictInfo))
747 	{
748 		RestrictInfo *rinfo = (RestrictInfo *) clause;
749 
750 		/* Pseudoconstants are not interesting (they couldn't contain a Var) */
751 		if (rinfo->pseudoconstant)
752 			return false;
753 
754 		/* Clauses referencing multiple, or no, varnos are incompatible */
755 		if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
756 			return false;
757 
758 		clause = (Node *) rinfo->clause;
759 	}
760 
761 	if (is_opclause(clause))
762 	{
763 		/* If it's an opclause, check for Var = Const or Const = Var. */
764 		OpExpr	   *expr = (OpExpr *) clause;
765 
766 		/* Only expressions with two arguments are candidates. */
767 		if (list_length(expr->args) != 2)
768 			return false;
769 
770 		/* Make sure non-selected argument is a pseudoconstant. */
771 		if (is_pseudo_constant_clause(lsecond(expr->args)))
772 			clause_expr = linitial(expr->args);
773 		else if (is_pseudo_constant_clause(linitial(expr->args)))
774 			clause_expr = lsecond(expr->args);
775 		else
776 			return false;
777 
778 		/*
779 		 * If it's not an "=" operator, just ignore the clause, as it's not
780 		 * compatible with functional dependencies.
781 		 *
782 		 * This uses the function for estimating selectivity, not the operator
783 		 * directly (a bit awkward, but well ...).
784 		 *
785 		 * XXX this is pretty dubious; probably it'd be better to check btree
786 		 * or hash opclass membership, so as not to be fooled by custom
787 		 * selectivity functions, and to be more consistent with decisions
788 		 * elsewhere in the planner.
789 		 */
790 		if (get_oprrest(expr->opno) != F_EQSEL)
791 			return false;
792 
793 		/* OK to proceed with checking "var" */
794 	}
795 	else if (IsA(clause, ScalarArrayOpExpr))
796 	{
797 		/* If it's an scalar array operator, check for Var IN Const. */
798 		ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
799 
800 		/*
801 		 * Reject ALL() variant, we only care about ANY/IN.
802 		 *
803 		 * XXX Maybe we should check if all the values are the same, and allow
804 		 * ALL in that case? Doesn't seem very practical, though.
805 		 */
806 		if (!expr->useOr)
807 			return false;
808 
809 		/* Only expressions with two arguments are candidates. */
810 		if (list_length(expr->args) != 2)
811 			return false;
812 
813 		/*
814 		 * We know it's always (Var IN Const), so we assume the var is the
815 		 * first argument, and pseudoconstant is the second one.
816 		 */
817 		if (!is_pseudo_constant_clause(lsecond(expr->args)))
818 			return false;
819 
820 		clause_expr = linitial(expr->args);
821 
822 		/*
823 		 * If it's not an "=" operator, just ignore the clause, as it's not
824 		 * compatible with functional dependencies. The operator is identified
825 		 * simply by looking at which function it uses to estimate
826 		 * selectivity. That's a bit strange, but it's what other similar
827 		 * places do.
828 		 */
829 		if (get_oprrest(expr->opno) != F_EQSEL)
830 			return false;
831 
832 		/* OK to proceed with checking "var" */
833 	}
834 	else if (is_orclause(clause))
835 	{
836 		BoolExpr   *bool_expr = (BoolExpr *) clause;
837 		ListCell   *lc;
838 
839 		/* start with no attribute number */
840 		*attnum = InvalidAttrNumber;
841 
842 		foreach(lc, bool_expr->args)
843 		{
844 			AttrNumber	clause_attnum;
845 
846 			/*
847 			 * Had we found incompatible clause in the arguments, treat the
848 			 * whole clause as incompatible.
849 			 */
850 			if (!dependency_is_compatible_clause((Node *) lfirst(lc),
851 												 relid, &clause_attnum))
852 				return false;
853 
854 			if (*attnum == InvalidAttrNumber)
855 				*attnum = clause_attnum;
856 
857 			/* ensure all the variables are the same (same attnum) */
858 			if (*attnum != clause_attnum)
859 				return false;
860 		}
861 
862 		/* the Var is already checked by the recursive call */
863 		return true;
864 	}
865 	else if (is_notclause(clause))
866 	{
867 		/*
868 		 * "NOT x" can be interpreted as "x = false", so get the argument and
869 		 * proceed with seeing if it's a suitable Var.
870 		 */
871 		clause_expr = (Node *) get_notclausearg(clause);
872 	}
873 	else
874 	{
875 		/*
876 		 * A boolean expression "x" can be interpreted as "x = true", so
877 		 * proceed with seeing if it's a suitable Var.
878 		 */
879 		clause_expr = (Node *) clause;
880 	}
881 
882 	/*
883 	 * We may ignore any RelabelType node above the operand.  (There won't be
884 	 * more than one, since eval_const_expressions has been applied already.)
885 	 */
886 	if (IsA(clause_expr, RelabelType))
887 		clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
888 
889 	/* We only support plain Vars for now */
890 	if (!IsA(clause_expr, Var))
891 		return false;
892 
893 	/* OK, we know we have a Var */
894 	var = (Var *) clause_expr;
895 
896 	/* Ensure Var is from the correct relation */
897 	if (var->varno != relid)
898 		return false;
899 
900 	/* We also better ensure the Var is from the current level */
901 	if (var->varlevelsup != 0)
902 		return false;
903 
904 	/* Also ignore system attributes (we don't allow stats on those) */
905 	if (!AttrNumberIsForUserDefinedAttr(var->varattno))
906 		return false;
907 
908 	*attnum = var->varattno;
909 	return true;
910 }
911 
912 /*
913  * find_strongest_dependency
914  *		find the strongest dependency on the attributes
915  *
916  * When applying functional dependencies, we start with the strongest
917  * dependencies. That is, we select the dependency that:
918  *
919  * (a) has all attributes covered by equality clauses
920  *
921  * (b) has the most attributes
922  *
923  * (c) has the highest degree of validity
924  *
925  * This guarantees that we eliminate the most redundant conditions first
926  * (see the comment in dependencies_clauselist_selectivity).
927  */
928 static MVDependency *
find_strongest_dependency(MVDependencies ** dependencies,int ndependencies,Bitmapset * attnums)929 find_strongest_dependency(MVDependencies **dependencies, int ndependencies,
930 						  Bitmapset *attnums)
931 {
932 	int			i,
933 				j;
934 	MVDependency *strongest = NULL;
935 
936 	/* number of attnums in clauses */
937 	int			nattnums = bms_num_members(attnums);
938 
939 	/*
940 	 * Iterate over the MVDependency items and find the strongest one from the
941 	 * fully-matched dependencies. We do the cheap checks first, before
942 	 * matching it against the attnums.
943 	 */
944 	for (i = 0; i < ndependencies; i++)
945 	{
946 		for (j = 0; j < dependencies[i]->ndeps; j++)
947 		{
948 			MVDependency *dependency = dependencies[i]->deps[j];
949 
950 			/*
951 			 * Skip dependencies referencing more attributes than available
952 			 * clauses, as those can't be fully matched.
953 			 */
954 			if (dependency->nattributes > nattnums)
955 				continue;
956 
957 			if (strongest)
958 			{
959 				/* skip dependencies on fewer attributes than the strongest. */
960 				if (dependency->nattributes < strongest->nattributes)
961 					continue;
962 
963 				/* also skip weaker dependencies when attribute count matches */
964 				if (strongest->nattributes == dependency->nattributes &&
965 					strongest->degree > dependency->degree)
966 					continue;
967 			}
968 
969 			/*
970 			 * this dependency is stronger, but we must still check that it's
971 			 * fully matched to these attnums. We perform this check last as
972 			 * it's slightly more expensive than the previous checks.
973 			 */
974 			if (dependency_is_fully_matched(dependency, attnums))
975 				strongest = dependency; /* save new best match */
976 		}
977 	}
978 
979 	return strongest;
980 }
981 
982 /*
983  * clauselist_apply_dependencies
984  *		Apply the specified functional dependencies to a list of clauses and
985  *		return the estimated selectivity of the clauses that are compatible
986  *		with any of the given dependencies.
987  *
988  * This will estimate all not-already-estimated clauses that are compatible
989  * with functional dependencies, and which have an attribute mentioned by any
990  * of the given dependencies (either as an implying or implied attribute).
991  *
992  * Given (lists of) clauses on attributes (a,b) and a functional dependency
993  * (a=>b), the per-column selectivities P(a) and P(b) are notionally combined
994  * using the formula
995  *
996  *		P(a,b) = f * P(a) + (1-f) * P(a) * P(b)
997  *
998  * where 'f' is the degree of dependency.  This reflects the fact that we
999  * expect a fraction f of all rows to be consistent with the dependency
1000  * (a=>b), and so have a selectivity of P(a), while the remaining rows are
1001  * treated as independent.
1002  *
1003  * In practice, we use a slightly modified version of this formula, which uses
1004  * a selectivity of Min(P(a), P(b)) for the dependent rows, since the result
1005  * should obviously not exceed either column's individual selectivity.  I.e.,
1006  * we actually combine selectivities using the formula
1007  *
1008  *		P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b)
1009  *
1010  * This can make quite a difference if the specific values matching the
1011  * clauses are not consistent with the functional dependency.
1012  */
1013 static Selectivity
clauselist_apply_dependencies(PlannerInfo * root,List * clauses,int varRelid,JoinType jointype,SpecialJoinInfo * sjinfo,MVDependency ** dependencies,int ndependencies,AttrNumber * list_attnums,Bitmapset ** estimatedclauses)1014 clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
1015 							  int varRelid, JoinType jointype,
1016 							  SpecialJoinInfo *sjinfo,
1017 							  MVDependency **dependencies, int ndependencies,
1018 							  AttrNumber *list_attnums,
1019 							  Bitmapset **estimatedclauses)
1020 {
1021 	Bitmapset  *attnums;
1022 	int			i;
1023 	int			j;
1024 	int			nattrs;
1025 	Selectivity *attr_sel;
1026 	int			attidx;
1027 	int			listidx;
1028 	ListCell   *l;
1029 	Selectivity s1;
1030 
1031 	/*
1032 	 * Extract the attnums of all implying and implied attributes from all the
1033 	 * given dependencies.  Each of these attributes is expected to have at
1034 	 * least 1 not-already-estimated compatible clause that we will estimate
1035 	 * here.
1036 	 */
1037 	attnums = NULL;
1038 	for (i = 0; i < ndependencies; i++)
1039 	{
1040 		for (j = 0; j < dependencies[i]->nattributes; j++)
1041 		{
1042 			AttrNumber	attnum = dependencies[i]->attributes[j];
1043 
1044 			attnums = bms_add_member(attnums, attnum);
1045 		}
1046 	}
1047 
1048 	/*
1049 	 * Compute per-column selectivity estimates for each of these attributes,
1050 	 * and mark all the corresponding clauses as estimated.
1051 	 */
1052 	nattrs = bms_num_members(attnums);
1053 	attr_sel = (Selectivity *) palloc(sizeof(Selectivity) * nattrs);
1054 
1055 	attidx = 0;
1056 	i = -1;
1057 	while ((i = bms_next_member(attnums, i)) >= 0)
1058 	{
1059 		List	   *attr_clauses = NIL;
1060 		Selectivity simple_sel;
1061 
1062 		listidx = -1;
1063 		foreach(l, clauses)
1064 		{
1065 			Node	   *clause = (Node *) lfirst(l);
1066 
1067 			listidx++;
1068 			if (list_attnums[listidx] == i)
1069 			{
1070 				attr_clauses = lappend(attr_clauses, clause);
1071 				*estimatedclauses = bms_add_member(*estimatedclauses, listidx);
1072 			}
1073 		}
1074 
1075 		simple_sel = clauselist_selectivity_ext(root, attr_clauses, varRelid,
1076 												jointype, sjinfo, false);
1077 		attr_sel[attidx++] = simple_sel;
1078 	}
1079 
1080 	/*
1081 	 * Now combine these selectivities using the dependency information.  For
1082 	 * chains of dependencies such as a -> b -> c, the b -> c dependency will
1083 	 * come before the a -> b dependency in the array, so we traverse the
1084 	 * array backwards to ensure such chains are computed in the right order.
1085 	 *
1086 	 * As explained above, pairs of selectivities are combined using the
1087 	 * formula
1088 	 *
1089 	 * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b)
1090 	 *
1091 	 * to ensure that the combined selectivity is never greater than either
1092 	 * individual selectivity.
1093 	 *
1094 	 * Where multiple dependencies apply (e.g., a -> b -> c), we use
1095 	 * conditional probabilities to compute the overall result as follows:
1096 	 *
1097 	 * P(a,b,c) = P(c|a,b) * P(a,b) = P(c|a,b) * P(b|a) * P(a)
1098 	 *
1099 	 * so we replace the selectivities of all implied attributes with
1100 	 * conditional probabilities, that are conditional on all their implying
1101 	 * attributes.  The selectivities of all other non-implied attributes are
1102 	 * left as they are.
1103 	 */
1104 	for (i = ndependencies - 1; i >= 0; i--)
1105 	{
1106 		MVDependency *dependency = dependencies[i];
1107 		AttrNumber	attnum;
1108 		Selectivity s2;
1109 		double		f;
1110 
1111 		/* Selectivity of all the implying attributes */
1112 		s1 = 1.0;
1113 		for (j = 0; j < dependency->nattributes - 1; j++)
1114 		{
1115 			attnum = dependency->attributes[j];
1116 			attidx = bms_member_index(attnums, attnum);
1117 			s1 *= attr_sel[attidx];
1118 		}
1119 
1120 		/* Original selectivity of the implied attribute */
1121 		attnum = dependency->attributes[j];
1122 		attidx = bms_member_index(attnums, attnum);
1123 		s2 = attr_sel[attidx];
1124 
1125 		/*
1126 		 * Replace s2 with the conditional probability s2 given s1, computed
1127 		 * using the formula P(b|a) = P(a,b) / P(a), which simplifies to
1128 		 *
1129 		 * P(b|a) = f * Min(P(a), P(b)) / P(a) + (1-f) * P(b)
1130 		 *
1131 		 * where P(a) = s1, the selectivity of the implying attributes, and
1132 		 * P(b) = s2, the selectivity of the implied attribute.
1133 		 */
1134 		f = dependency->degree;
1135 
1136 		if (s1 <= s2)
1137 			attr_sel[attidx] = f + (1 - f) * s2;
1138 		else
1139 			attr_sel[attidx] = f * s2 / s1 + (1 - f) * s2;
1140 	}
1141 
1142 	/*
1143 	 * The overall selectivity of all the clauses on all these attributes is
1144 	 * then the product of all the original (non-implied) probabilities and
1145 	 * the new conditional (implied) probabilities.
1146 	 */
1147 	s1 = 1.0;
1148 	for (i = 0; i < nattrs; i++)
1149 		s1 *= attr_sel[i];
1150 
1151 	CLAMP_PROBABILITY(s1);
1152 
1153 	pfree(attr_sel);
1154 	bms_free(attnums);
1155 
1156 	return s1;
1157 }
1158 
1159 /*
1160  * dependency_is_compatible_expression
1161  *		Determines if the expression is compatible with functional dependencies
1162  *
1163  * Similar to dependency_is_compatible_clause, but doesn't enforce that the
1164  * expression is a simple Var. OTOH we check that there's at least one
1165  * statistics object matching the expression.
1166  */
1167 static bool
dependency_is_compatible_expression(Node * clause,Index relid,List * statlist,Node ** expr)1168 dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, Node **expr)
1169 {
1170 	List	   *vars;
1171 	ListCell   *lc,
1172 			   *lc2;
1173 	Node	   *clause_expr;
1174 
1175 	if (IsA(clause, RestrictInfo))
1176 	{
1177 		RestrictInfo *rinfo = (RestrictInfo *) clause;
1178 
1179 		/* Pseudoconstants are not interesting (they couldn't contain a Var) */
1180 		if (rinfo->pseudoconstant)
1181 			return false;
1182 
1183 		/* Clauses referencing multiple, or no, varnos are incompatible */
1184 		if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
1185 			return false;
1186 
1187 		clause = (Node *) rinfo->clause;
1188 	}
1189 
1190 	if (is_opclause(clause))
1191 	{
1192 		/* If it's an opclause, check for Var = Const or Const = Var. */
1193 		OpExpr	   *expr = (OpExpr *) clause;
1194 
1195 		/* Only expressions with two arguments are candidates. */
1196 		if (list_length(expr->args) != 2)
1197 			return false;
1198 
1199 		/* Make sure non-selected argument is a pseudoconstant. */
1200 		if (is_pseudo_constant_clause(lsecond(expr->args)))
1201 			clause_expr = linitial(expr->args);
1202 		else if (is_pseudo_constant_clause(linitial(expr->args)))
1203 			clause_expr = lsecond(expr->args);
1204 		else
1205 			return false;
1206 
1207 		/*
1208 		 * If it's not an "=" operator, just ignore the clause, as it's not
1209 		 * compatible with functional dependencies.
1210 		 *
1211 		 * This uses the function for estimating selectivity, not the operator
1212 		 * directly (a bit awkward, but well ...).
1213 		 *
1214 		 * XXX this is pretty dubious; probably it'd be better to check btree
1215 		 * or hash opclass membership, so as not to be fooled by custom
1216 		 * selectivity functions, and to be more consistent with decisions
1217 		 * elsewhere in the planner.
1218 		 */
1219 		if (get_oprrest(expr->opno) != F_EQSEL)
1220 			return false;
1221 
1222 		/* OK to proceed with checking "var" */
1223 	}
1224 	else if (IsA(clause, ScalarArrayOpExpr))
1225 	{
1226 		/* If it's an scalar array operator, check for Var IN Const. */
1227 		ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
1228 
1229 		/*
1230 		 * Reject ALL() variant, we only care about ANY/IN.
1231 		 *
1232 		 * FIXME Maybe we should check if all the values are the same, and
1233 		 * allow ALL in that case? Doesn't seem very practical, though.
1234 		 */
1235 		if (!expr->useOr)
1236 			return false;
1237 
1238 		/* Only expressions with two arguments are candidates. */
1239 		if (list_length(expr->args) != 2)
1240 			return false;
1241 
1242 		/*
1243 		 * We know it's always (Var IN Const), so we assume the var is the
1244 		 * first argument, and pseudoconstant is the second one.
1245 		 */
1246 		if (!is_pseudo_constant_clause(lsecond(expr->args)))
1247 			return false;
1248 
1249 		clause_expr = linitial(expr->args);
1250 
1251 		/*
1252 		 * If it's not an "=" operator, just ignore the clause, as it's not
1253 		 * compatible with functional dependencies. The operator is identified
1254 		 * simply by looking at which function it uses to estimate
1255 		 * selectivity. That's a bit strange, but it's what other similar
1256 		 * places do.
1257 		 */
1258 		if (get_oprrest(expr->opno) != F_EQSEL)
1259 			return false;
1260 
1261 		/* OK to proceed with checking "var" */
1262 	}
1263 	else if (is_orclause(clause))
1264 	{
1265 		BoolExpr   *bool_expr = (BoolExpr *) clause;
1266 		ListCell   *lc;
1267 
1268 		/* start with no expression (we'll use the first match) */
1269 		*expr = NULL;
1270 
1271 		foreach(lc, bool_expr->args)
1272 		{
1273 			Node	   *or_expr = NULL;
1274 
1275 			/*
1276 			 * Had we found incompatible expression in the arguments, treat
1277 			 * the whole expression as incompatible.
1278 			 */
1279 			if (!dependency_is_compatible_expression((Node *) lfirst(lc), relid,
1280 													 statlist, &or_expr))
1281 				return false;
1282 
1283 			if (*expr == NULL)
1284 				*expr = or_expr;
1285 
1286 			/* ensure all the expressions are the same */
1287 			if (!equal(or_expr, *expr))
1288 				return false;
1289 		}
1290 
1291 		/* the expression is already checked by the recursive call */
1292 		return true;
1293 	}
1294 	else if (is_notclause(clause))
1295 	{
1296 		/*
1297 		 * "NOT x" can be interpreted as "x = false", so get the argument and
1298 		 * proceed with seeing if it's a suitable Var.
1299 		 */
1300 		clause_expr = (Node *) get_notclausearg(clause);
1301 	}
1302 	else
1303 	{
1304 		/*
1305 		 * A boolean expression "x" can be interpreted as "x = true", so
1306 		 * proceed with seeing if it's a suitable Var.
1307 		 */
1308 		clause_expr = (Node *) clause;
1309 	}
1310 
1311 	/*
1312 	 * We may ignore any RelabelType node above the operand.  (There won't be
1313 	 * more than one, since eval_const_expressions has been applied already.)
1314 	 */
1315 	if (IsA(clause_expr, RelabelType))
1316 		clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
1317 
1318 	vars = pull_var_clause(clause_expr, 0);
1319 
1320 	foreach(lc, vars)
1321 	{
1322 		Var		   *var = (Var *) lfirst(lc);
1323 
1324 		/* Ensure Var is from the correct relation */
1325 		if (var->varno != relid)
1326 			return false;
1327 
1328 		/* We also better ensure the Var is from the current level */
1329 		if (var->varlevelsup != 0)
1330 			return false;
1331 
1332 		/* Also ignore system attributes (we don't allow stats on those) */
1333 		if (!AttrNumberIsForUserDefinedAttr(var->varattno))
1334 			return false;
1335 	}
1336 
1337 	/*
1338 	 * Check if we actually have a matching statistics for the expression.
1339 	 *
1340 	 * XXX Maybe this is an overkill. We'll eliminate the expressions later.
1341 	 */
1342 	foreach(lc, statlist)
1343 	{
1344 		StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
1345 
1346 		/* ignore stats without dependencies */
1347 		if (info->kind != STATS_EXT_DEPENDENCIES)
1348 			continue;
1349 
1350 		foreach(lc2, info->exprs)
1351 		{
1352 			Node	   *stat_expr = (Node *) lfirst(lc2);
1353 
1354 			if (equal(clause_expr, stat_expr))
1355 			{
1356 				*expr = stat_expr;
1357 				return true;
1358 			}
1359 		}
1360 	}
1361 
1362 	return false;
1363 }
1364 
1365 /*
1366  * dependencies_clauselist_selectivity
1367  *		Return the estimated selectivity of (a subset of) the given clauses
1368  *		using functional dependency statistics, or 1.0 if no useful functional
1369  *		dependency statistic exists.
1370  *
1371  * 'estimatedclauses' is an input/output argument that gets a bit set
1372  * corresponding to the (zero-based) list index of each clause that is included
1373  * in the estimated selectivity.
1374  *
1375  * Given equality clauses on attributes (a,b) we find the strongest dependency
1376  * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected
1377  * dependency, we then combine the per-clause selectivities using the formula
1378  *
1379  *	   P(a,b) = f * P(a) + (1-f) * P(a) * P(b)
1380  *
1381  * where 'f' is the degree of the dependency.  (Actually we use a slightly
1382  * modified version of this formula -- see clauselist_apply_dependencies()).
1383  *
1384  * With clauses on more than two attributes, the dependencies are applied
1385  * recursively, starting with the widest/strongest dependencies. For example
1386  * P(a,b,c) is first split like this:
1387  *
1388  *	   P(a,b,c) = f * P(a,b) + (1-f) * P(a,b) * P(c)
1389  *
1390  * assuming (a,b=>c) is the strongest dependency.
1391  */
1392 Selectivity
dependencies_clauselist_selectivity(PlannerInfo * root,List * clauses,int varRelid,JoinType jointype,SpecialJoinInfo * sjinfo,RelOptInfo * rel,Bitmapset ** estimatedclauses)1393 dependencies_clauselist_selectivity(PlannerInfo *root,
1394 									List *clauses,
1395 									int varRelid,
1396 									JoinType jointype,
1397 									SpecialJoinInfo *sjinfo,
1398 									RelOptInfo *rel,
1399 									Bitmapset **estimatedclauses)
1400 {
1401 	Selectivity s1 = 1.0;
1402 	ListCell   *l;
1403 	Bitmapset  *clauses_attnums = NULL;
1404 	AttrNumber *list_attnums;
1405 	int			listidx;
1406 	MVDependencies **func_dependencies;
1407 	int			nfunc_dependencies;
1408 	int			total_ndeps;
1409 	MVDependency **dependencies;
1410 	int			ndependencies;
1411 	int			i;
1412 	AttrNumber	attnum_offset;
1413 
1414 	/* unique expressions */
1415 	Node	  **unique_exprs;
1416 	int			unique_exprs_cnt;
1417 
1418 	/* check if there's any stats that might be useful for us. */
1419 	if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
1420 		return 1.0;
1421 
1422 	list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) *
1423 										 list_length(clauses));
1424 
1425 	/*
1426 	 * We allocate space as if every clause was a unique expression, although
1427 	 * that's probably overkill. Some will be simple column references that
1428 	 * we'll translate to attnums, and there might be duplicates. But it's
1429 	 * easier and cheaper to just do one allocation than repalloc later.
1430 	 */
1431 	unique_exprs = (Node **) palloc(sizeof(Node *) * list_length(clauses));
1432 	unique_exprs_cnt = 0;
1433 
1434 	/*
1435 	 * Pre-process the clauses list to extract the attnums seen in each item.
1436 	 * We need to determine if there's any clauses which will be useful for
1437 	 * dependency selectivity estimations. Along the way we'll record all of
1438 	 * the attnums for each clause in a list which we'll reference later so we
1439 	 * don't need to repeat the same work again. We'll also keep track of all
1440 	 * attnums seen.
1441 	 *
1442 	 * We also skip clauses that we already estimated using different types of
1443 	 * statistics (we treat them as incompatible).
1444 	 *
1445 	 * To handle expressions, we assign them negative attnums, as if it was a
1446 	 * system attribute (this is fine, as we only allow extended stats on user
1447 	 * attributes). And then we offset everything by the number of
1448 	 * expressions, so that we can store the values in a bitmapset.
1449 	 */
1450 	listidx = 0;
1451 	foreach(l, clauses)
1452 	{
1453 		Node	   *clause = (Node *) lfirst(l);
1454 		AttrNumber	attnum;
1455 		Node	   *expr = NULL;
1456 
1457 		/* ignore clause by default */
1458 		list_attnums[listidx] = InvalidAttrNumber;
1459 
1460 		if (!bms_is_member(listidx, *estimatedclauses))
1461 		{
1462 			/*
1463 			 * If it's a simple column reference, just extract the attnum. If
1464 			 * it's an expression, assign a negative attnum as if it was a
1465 			 * system attribute.
1466 			 */
1467 			if (dependency_is_compatible_clause(clause, rel->relid, &attnum))
1468 			{
1469 				list_attnums[listidx] = attnum;
1470 			}
1471 			else if (dependency_is_compatible_expression(clause, rel->relid,
1472 														 rel->statlist,
1473 														 &expr))
1474 			{
1475 				/* special attnum assigned to this expression */
1476 				attnum = InvalidAttrNumber;
1477 
1478 				Assert(expr != NULL);
1479 
1480 				/* If the expression is duplicate, use the same attnum. */
1481 				for (i = 0; i < unique_exprs_cnt; i++)
1482 				{
1483 					if (equal(unique_exprs[i], expr))
1484 					{
1485 						/* negative attribute number to expression */
1486 						attnum = -(i + 1);
1487 						break;
1488 					}
1489 				}
1490 
1491 				/* not found in the list, so add it */
1492 				if (attnum == InvalidAttrNumber)
1493 				{
1494 					unique_exprs[unique_exprs_cnt++] = expr;
1495 
1496 					/* after incrementing the value, to get -1, -2, ... */
1497 					attnum = (-unique_exprs_cnt);
1498 				}
1499 
1500 				/* remember which attnum was assigned to this clause */
1501 				list_attnums[listidx] = attnum;
1502 			}
1503 		}
1504 
1505 		listidx++;
1506 	}
1507 
1508 	Assert(listidx == list_length(clauses));
1509 
1510 	/*
1511 	 * How much we need to offset the attnums? If there are no expressions,
1512 	 * then no offset is needed. Otherwise we need to offset enough for the
1513 	 * lowest value (-unique_exprs_cnt) to become 1.
1514 	 */
1515 	if (unique_exprs_cnt > 0)
1516 		attnum_offset = (unique_exprs_cnt + 1);
1517 	else
1518 		attnum_offset = 0;
1519 
1520 	/*
1521 	 * Now that we know how many expressions there are, we can offset the
1522 	 * values just enough to build the bitmapset.
1523 	 */
1524 	for (i = 0; i < list_length(clauses); i++)
1525 	{
1526 		AttrNumber	attnum;
1527 
1528 		/* ignore incompatible or already estimated clauses */
1529 		if (list_attnums[i] == InvalidAttrNumber)
1530 			continue;
1531 
1532 		/* make sure the attnum is in the expected range */
1533 		Assert(list_attnums[i] >= (-unique_exprs_cnt));
1534 		Assert(list_attnums[i] <= MaxHeapAttributeNumber);
1535 
1536 		/* make sure the attnum is positive (valid AttrNumber) */
1537 		attnum = list_attnums[i] + attnum_offset;
1538 
1539 		/*
1540 		 * Either it's a regular attribute, or it's an expression, in which
1541 		 * case we must not have seen it before (expressions are unique).
1542 		 *
1543 		 * XXX Check whether it's a regular attribute has to be done using the
1544 		 * original attnum, while the second check has to use the value with
1545 		 * an offset.
1546 		 */
1547 		Assert(AttrNumberIsForUserDefinedAttr(list_attnums[i]) ||
1548 			   !bms_is_member(attnum, clauses_attnums));
1549 
1550 		/*
1551 		 * Remember the offset attnum, both for attributes and expressions.
1552 		 * We'll pass list_attnums to clauselist_apply_dependencies, which
1553 		 * uses it to identify clauses in a bitmap. We could also pass the
1554 		 * offset, but this is more convenient.
1555 		 */
1556 		list_attnums[i] = attnum;
1557 
1558 		clauses_attnums = bms_add_member(clauses_attnums, attnum);
1559 	}
1560 
1561 	/*
1562 	 * If there's not at least two distinct attnums and expressions, then
1563 	 * reject the whole list of clauses. We must return 1.0 so the calling
1564 	 * function's selectivity is unaffected.
1565 	 */
1566 	if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
1567 	{
1568 		bms_free(clauses_attnums);
1569 		pfree(list_attnums);
1570 		return 1.0;
1571 	}
1572 
1573 	/*
1574 	 * Load all functional dependencies matching at least two parameters. We
1575 	 * can simply consider all dependencies at once, without having to search
1576 	 * for the best statistics object.
1577 	 *
1578 	 * To not waste cycles and memory, we deserialize dependencies only for
1579 	 * statistics that match at least two attributes. The array is allocated
1580 	 * with the assumption that all objects match - we could grow the array to
1581 	 * make it just the right size, but it's likely wasteful anyway thanks to
1582 	 * moving the freed chunks to freelists etc.
1583 	 */
1584 	func_dependencies = (MVDependencies **) palloc(sizeof(MVDependencies *) *
1585 												   list_length(rel->statlist));
1586 	nfunc_dependencies = 0;
1587 	total_ndeps = 0;
1588 
1589 	foreach(l, rel->statlist)
1590 	{
1591 		StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
1592 		int			nmatched;
1593 		int			nexprs;
1594 		int			k;
1595 		MVDependencies *deps;
1596 
1597 		/* skip statistics that are not of the correct type */
1598 		if (stat->kind != STATS_EXT_DEPENDENCIES)
1599 			continue;
1600 
1601 		/*
1602 		 * Count matching attributes - we have to undo the attnum offsets. The
1603 		 * input attribute numbers are not offset (expressions are not
1604 		 * included in stat->keys, so it's not necessary). But we need to
1605 		 * offset it before checking against clauses_attnums.
1606 		 */
1607 		nmatched = 0;
1608 		k = -1;
1609 		while ((k = bms_next_member(stat->keys, k)) >= 0)
1610 		{
1611 			AttrNumber	attnum = (AttrNumber) k;
1612 
1613 			/* skip expressions */
1614 			if (!AttrNumberIsForUserDefinedAttr(attnum))
1615 				continue;
1616 
1617 			/* apply the same offset as above */
1618 			attnum += attnum_offset;
1619 
1620 			if (bms_is_member(attnum, clauses_attnums))
1621 				nmatched++;
1622 		}
1623 
1624 		/* count matching expressions */
1625 		nexprs = 0;
1626 		for (i = 0; i < unique_exprs_cnt; i++)
1627 		{
1628 			ListCell   *lc;
1629 
1630 			foreach(lc, stat->exprs)
1631 			{
1632 				Node	   *stat_expr = (Node *) lfirst(lc);
1633 
1634 				/* try to match it */
1635 				if (equal(stat_expr, unique_exprs[i]))
1636 					nexprs++;
1637 			}
1638 		}
1639 
1640 		/*
1641 		 * Skip objects matching fewer than two attributes/expressions from
1642 		 * clauses.
1643 		 */
1644 		if (nmatched + nexprs < 2)
1645 			continue;
1646 
1647 		deps = statext_dependencies_load(stat->statOid);
1648 
1649 		/*
1650 		 * The expressions may be represented by different attnums in the
1651 		 * stats, we need to remap them to be consistent with the clauses.
1652 		 * That will make the later steps (e.g. picking the strongest item and
1653 		 * so on) much simpler and cheaper, because it won't need to care
1654 		 * about the offset at all.
1655 		 *
1656 		 * When we're at it, we can ignore dependencies that are not fully
1657 		 * matched by clauses (i.e. referencing attributes or expressions that
1658 		 * are not in the clauses).
1659 		 *
1660 		 * We have to do this for all statistics, as long as there are any
1661 		 * expressions - we need to shift the attnums in all dependencies.
1662 		 *
1663 		 * XXX Maybe we should do this always, because it also eliminates some
1664 		 * of the dependencies early. It might be cheaper than having to walk
1665 		 * the longer list in find_strongest_dependency later, especially as
1666 		 * we need to do that repeatedly?
1667 		 *
1668 		 * XXX We have to do this even when there are no expressions in
1669 		 * clauses, otherwise find_strongest_dependency may fail for stats
1670 		 * with expressions (due to lookup of negative value in bitmap). So we
1671 		 * need to at least filter out those dependencies. Maybe we could do
1672 		 * it in a cheaper way (if there are no expr clauses, we can just
1673 		 * discard all negative attnums without any lookups).
1674 		 */
1675 		if (unique_exprs_cnt > 0 || stat->exprs != NIL)
1676 		{
1677 			int			ndeps = 0;
1678 
1679 			for (i = 0; i < deps->ndeps; i++)
1680 			{
1681 				bool		skip = false;
1682 				MVDependency *dep = deps->deps[i];
1683 				int			j;
1684 
1685 				for (j = 0; j < dep->nattributes; j++)
1686 				{
1687 					int			idx;
1688 					Node	   *expr;
1689 					int			k;
1690 					AttrNumber	unique_attnum = InvalidAttrNumber;
1691 					AttrNumber	attnum;
1692 
1693 					/* undo the per-statistics offset */
1694 					attnum = dep->attributes[j];
1695 
1696 					/*
1697 					 * For regular attributes we can simply check if it
1698 					 * matches any clause. If there's no matching clause, we
1699 					 * can just ignore it. We need to offset the attnum
1700 					 * though.
1701 					 */
1702 					if (AttrNumberIsForUserDefinedAttr(attnum))
1703 					{
1704 						dep->attributes[j] = attnum + attnum_offset;
1705 
1706 						if (!bms_is_member(dep->attributes[j], clauses_attnums))
1707 						{
1708 							skip = true;
1709 							break;
1710 						}
1711 
1712 						continue;
1713 					}
1714 
1715 					/*
1716 					 * the attnum should be a valid system attnum (-1, -2,
1717 					 * ...)
1718 					 */
1719 					Assert(AttributeNumberIsValid(attnum));
1720 
1721 					/*
1722 					 * For expressions, we need to do two translations. First
1723 					 * we have to translate the negative attnum to index in
1724 					 * the list of expressions (in the statistics object).
1725 					 * Then we need to see if there's a matching clause. The
1726 					 * index of the unique expression determines the attnum
1727 					 * (and we offset it).
1728 					 */
1729 					idx = -(1 + attnum);
1730 
1731 					/* Is the expression index is valid? */
1732 					Assert((idx >= 0) && (idx < list_length(stat->exprs)));
1733 
1734 					expr = (Node *) list_nth(stat->exprs, idx);
1735 
1736 					/* try to find the expression in the unique list */
1737 					for (k = 0; k < unique_exprs_cnt; k++)
1738 					{
1739 						/*
1740 						 * found a matching unique expression, use the attnum
1741 						 * (derived from index of the unique expression)
1742 						 */
1743 						if (equal(unique_exprs[k], expr))
1744 						{
1745 							unique_attnum = -(k + 1) + attnum_offset;
1746 							break;
1747 						}
1748 					}
1749 
1750 					/*
1751 					 * Found no matching expression, so we can simply skip
1752 					 * this dependency, because there's no chance it will be
1753 					 * fully covered.
1754 					 */
1755 					if (unique_attnum == InvalidAttrNumber)
1756 					{
1757 						skip = true;
1758 						break;
1759 					}
1760 
1761 					/* otherwise remap it to the new attnum */
1762 					dep->attributes[j] = unique_attnum;
1763 				}
1764 
1765 				/* if found a matching dependency, keep it */
1766 				if (!skip)
1767 				{
1768 					/* maybe we've skipped something earlier, so move it */
1769 					if (ndeps != i)
1770 						deps->deps[ndeps] = deps->deps[i];
1771 
1772 					ndeps++;
1773 				}
1774 			}
1775 
1776 			deps->ndeps = ndeps;
1777 		}
1778 
1779 		/*
1780 		 * It's possible we've removed all dependencies, in which case we
1781 		 * don't bother adding it to the list.
1782 		 */
1783 		if (deps->ndeps > 0)
1784 		{
1785 			func_dependencies[nfunc_dependencies] = deps;
1786 			total_ndeps += deps->ndeps;
1787 			nfunc_dependencies++;
1788 		}
1789 	}
1790 
1791 	/* if no matching stats could be found then we've nothing to do */
1792 	if (nfunc_dependencies == 0)
1793 	{
1794 		pfree(func_dependencies);
1795 		bms_free(clauses_attnums);
1796 		pfree(list_attnums);
1797 		pfree(unique_exprs);
1798 		return 1.0;
1799 	}
1800 
1801 	/*
1802 	 * Work out which dependencies we can apply, starting with the
1803 	 * widest/strongest ones, and proceeding to smaller/weaker ones.
1804 	 */
1805 	dependencies = (MVDependency **) palloc(sizeof(MVDependency *) *
1806 											total_ndeps);
1807 	ndependencies = 0;
1808 
1809 	while (true)
1810 	{
1811 		MVDependency *dependency;
1812 		AttrNumber	attnum;
1813 
1814 		/* the widest/strongest dependency, fully matched by clauses */
1815 		dependency = find_strongest_dependency(func_dependencies,
1816 											   nfunc_dependencies,
1817 											   clauses_attnums);
1818 		if (!dependency)
1819 			break;
1820 
1821 		dependencies[ndependencies++] = dependency;
1822 
1823 		/* Ignore dependencies using this implied attribute in later loops */
1824 		attnum = dependency->attributes[dependency->nattributes - 1];
1825 		clauses_attnums = bms_del_member(clauses_attnums, attnum);
1826 	}
1827 
1828 	/*
1829 	 * If we found applicable dependencies, use them to estimate all
1830 	 * compatible clauses on attributes that they refer to.
1831 	 */
1832 	if (ndependencies != 0)
1833 		s1 = clauselist_apply_dependencies(root, clauses, varRelid, jointype,
1834 										   sjinfo, dependencies, ndependencies,
1835 										   list_attnums, estimatedclauses);
1836 
1837 	/* free deserialized functional dependencies (and then the array) */
1838 	for (i = 0; i < nfunc_dependencies; i++)
1839 		pfree(func_dependencies[i]);
1840 
1841 	pfree(dependencies);
1842 	pfree(func_dependencies);
1843 	bms_free(clauses_attnums);
1844 	pfree(list_attnums);
1845 	pfree(unique_exprs);
1846 
1847 	return s1;
1848 }
1849