1 /*-------------------------------------------------------------------------
2  *
3  * dependencies.c
4  *	  POSTGRES functional dependencies
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  *	  src/backend/statistics/dependencies.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include "access/htup_details.h"
17 #include "access/sysattr.h"
18 #include "catalog/pg_operator.h"
19 #include "catalog/pg_statistic_ext.h"
20 #include "catalog/pg_statistic_ext_data.h"
21 #include "lib/stringinfo.h"
22 #include "nodes/nodeFuncs.h"
23 #include "optimizer/clauses.h"
24 #include "optimizer/optimizer.h"
25 #include "nodes/nodes.h"
26 #include "nodes/pathnodes.h"
27 #include "statistics/extended_stats_internal.h"
28 #include "statistics/statistics.h"
29 #include "utils/bytea.h"
30 #include "utils/fmgroids.h"
31 #include "utils/fmgrprotos.h"
32 #include "utils/lsyscache.h"
33 #include "utils/memutils.h"
34 #include "utils/selfuncs.h"
35 #include "utils/syscache.h"
36 #include "utils/typcache.h"
37 
38 /* size of the struct header fields (magic, type, ndeps) */
39 #define SizeOfHeader		(3 * sizeof(uint32))
40 
41 /* size of a serialized dependency (degree, natts, atts) */
42 #define SizeOfItem(natts) \
43 	(sizeof(double) + sizeof(AttrNumber) * (1 + (natts)))
44 
45 /* minimal size of a dependency (with two attributes) */
46 #define MinSizeOfItem	SizeOfItem(2)
47 
48 /* minimal size of dependencies, when all deps are minimal */
49 #define MinSizeOfItems(ndeps) \
50 	(SizeOfHeader + (ndeps) * MinSizeOfItem)
51 
52 /*
53  * Internal state for DependencyGenerator of dependencies. Dependencies are similar to
54  * k-permutations of n elements, except that the order does not matter for the
55  * first (k-1) elements. That is, (a,b=>c) and (b,a=>c) are equivalent.
56  */
57 typedef struct DependencyGeneratorData
58 {
59 	int			k;				/* size of the dependency */
60 	int			n;				/* number of possible attributes */
61 	int			current;		/* next dependency to return (index) */
62 	AttrNumber	ndependencies;	/* number of dependencies generated */
63 	AttrNumber *dependencies;	/* array of pre-generated dependencies	*/
64 } DependencyGeneratorData;
65 
66 typedef DependencyGeneratorData *DependencyGenerator;
67 
68 static void generate_dependencies_recurse(DependencyGenerator state,
69 										  int index, AttrNumber start, AttrNumber *current);
70 static void generate_dependencies(DependencyGenerator state);
71 static DependencyGenerator DependencyGenerator_init(int n, int k);
72 static void DependencyGenerator_free(DependencyGenerator state);
73 static AttrNumber *DependencyGenerator_next(DependencyGenerator state);
74 static double dependency_degree(int numrows, HeapTuple *rows, int k,
75 								AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs);
76 static bool dependency_is_fully_matched(MVDependency *dependency,
77 										Bitmapset *attnums);
78 static bool dependency_implies_attribute(MVDependency *dependency,
79 										 AttrNumber attnum);
80 static bool dependency_is_compatible_clause(Node *clause, Index relid,
81 											AttrNumber *attnum);
82 static MVDependency *find_strongest_dependency(StatisticExtInfo *stats,
83 											   MVDependencies *dependencies,
84 											   Bitmapset *attnums);
85 
86 static void
generate_dependencies_recurse(DependencyGenerator state,int index,AttrNumber start,AttrNumber * current)87 generate_dependencies_recurse(DependencyGenerator state, int index,
88 							  AttrNumber start, AttrNumber *current)
89 {
90 	/*
91 	 * The generator handles the first (k-1) elements differently from the
92 	 * last element.
93 	 */
94 	if (index < (state->k - 1))
95 	{
96 		AttrNumber	i;
97 
98 		/*
99 		 * The first (k-1) values have to be in ascending order, which we
100 		 * generate recursively.
101 		 */
102 
103 		for (i = start; i < state->n; i++)
104 		{
105 			current[index] = i;
106 			generate_dependencies_recurse(state, (index + 1), (i + 1), current);
107 		}
108 	}
109 	else
110 	{
111 		int			i;
112 
113 		/*
114 		 * the last element is the implied value, which does not respect the
115 		 * ascending order. We just need to check that the value is not in the
116 		 * first (k-1) elements.
117 		 */
118 
119 		for (i = 0; i < state->n; i++)
120 		{
121 			int			j;
122 			bool		match = false;
123 
124 			current[index] = i;
125 
126 			for (j = 0; j < index; j++)
127 			{
128 				if (current[j] == i)
129 				{
130 					match = true;
131 					break;
132 				}
133 			}
134 
135 			/*
136 			 * If the value is not found in the first part of the dependency,
137 			 * we're done.
138 			 */
139 			if (!match)
140 			{
141 				state->dependencies = (AttrNumber *) repalloc(state->dependencies,
142 															  state->k * (state->ndependencies + 1) * sizeof(AttrNumber));
143 				memcpy(&state->dependencies[(state->k * state->ndependencies)],
144 					   current, state->k * sizeof(AttrNumber));
145 				state->ndependencies++;
146 			}
147 		}
148 	}
149 }
150 
151 /* generate all dependencies (k-permutations of n elements) */
152 static void
generate_dependencies(DependencyGenerator state)153 generate_dependencies(DependencyGenerator state)
154 {
155 	AttrNumber *current = (AttrNumber *) palloc0(sizeof(AttrNumber) * state->k);
156 
157 	generate_dependencies_recurse(state, 0, 0, current);
158 
159 	pfree(current);
160 }
161 
162 /*
163  * initialize the DependencyGenerator of variations, and prebuild the variations
164  *
165  * This pre-builds all the variations. We could also generate them in
166  * DependencyGenerator_next(), but this seems simpler.
167  */
168 static DependencyGenerator
DependencyGenerator_init(int n,int k)169 DependencyGenerator_init(int n, int k)
170 {
171 	DependencyGenerator state;
172 
173 	Assert((n >= k) && (k > 0));
174 
175 	/* allocate the DependencyGenerator state */
176 	state = (DependencyGenerator) palloc0(sizeof(DependencyGeneratorData));
177 	state->dependencies = (AttrNumber *) palloc(k * sizeof(AttrNumber));
178 
179 	state->ndependencies = 0;
180 	state->current = 0;
181 	state->k = k;
182 	state->n = n;
183 
184 	/* now actually pre-generate all the variations */
185 	generate_dependencies(state);
186 
187 	return state;
188 }
189 
190 /* free the DependencyGenerator state */
191 static void
DependencyGenerator_free(DependencyGenerator state)192 DependencyGenerator_free(DependencyGenerator state)
193 {
194 	pfree(state->dependencies);
195 	pfree(state);
196 
197 }
198 
199 /* generate next combination */
200 static AttrNumber *
DependencyGenerator_next(DependencyGenerator state)201 DependencyGenerator_next(DependencyGenerator state)
202 {
203 	if (state->current == state->ndependencies)
204 		return NULL;
205 
206 	return &state->dependencies[state->k * state->current++];
207 }
208 
209 
210 /*
211  * validates functional dependency on the data
212  *
213  * An actual work horse of detecting functional dependencies. Given a variation
214  * of k attributes, it checks that the first (k-1) are sufficient to determine
215  * the last one.
216  */
217 static double
dependency_degree(int numrows,HeapTuple * rows,int k,AttrNumber * dependency,VacAttrStats ** stats,Bitmapset * attrs)218 dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
219 				  VacAttrStats **stats, Bitmapset *attrs)
220 {
221 	int			i,
222 				nitems;
223 	MultiSortSupport mss;
224 	SortItem   *items;
225 	AttrNumber *attnums;
226 	AttrNumber *attnums_dep;
227 	int			numattrs;
228 
229 	/* counters valid within a group */
230 	int			group_size = 0;
231 	int			n_violations = 0;
232 
233 	/* total number of rows supporting (consistent with) the dependency */
234 	int			n_supporting_rows = 0;
235 
236 	/* Make sure we have at least two input attributes. */
237 	Assert(k >= 2);
238 
239 	/* sort info for all attributes columns */
240 	mss = multi_sort_init(k);
241 
242 	/*
243 	 * Transform the attrs from bitmap to an array to make accessing the i-th
244 	 * member easier, and then construct a filtered version with only attnums
245 	 * referenced by the dependency we validate.
246 	 */
247 	attnums = build_attnums_array(attrs, &numattrs);
248 
249 	attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber));
250 	for (i = 0; i < k; i++)
251 		attnums_dep[i] = attnums[dependency[i]];
252 
253 	/*
254 	 * Verify the dependency (a,b,...)->z, using a rather simple algorithm:
255 	 *
256 	 * (a) sort the data lexicographically
257 	 *
258 	 * (b) split the data into groups by first (k-1) columns
259 	 *
260 	 * (c) for each group count different values in the last column
261 	 *
262 	 * We use the column data types' default sort operators and collations;
263 	 * perhaps at some point it'd be worth using column-specific collations?
264 	 */
265 
266 	/* prepare the sort function for the dimensions */
267 	for (i = 0; i < k; i++)
268 	{
269 		VacAttrStats *colstat = stats[dependency[i]];
270 		TypeCacheEntry *type;
271 
272 		type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
273 		if (type->lt_opr == InvalidOid) /* shouldn't happen */
274 			elog(ERROR, "cache lookup failed for ordering operator for type %u",
275 				 colstat->attrtypid);
276 
277 		/* prepare the sort function for this dimension */
278 		multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
279 	}
280 
281 	/*
282 	 * build an array of SortItem(s) sorted using the multi-sort support
283 	 *
284 	 * XXX This relies on all stats entries pointing to the same tuple
285 	 * descriptor.  For now that assumption holds, but it might change in the
286 	 * future for example if we support statistics on multiple tables.
287 	 */
288 	items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
289 							   mss, k, attnums_dep);
290 
291 	/*
292 	 * Walk through the sorted array, split it into rows according to the
293 	 * first (k-1) columns. If there's a single value in the last column, we
294 	 * count the group as 'supporting' the functional dependency. Otherwise we
295 	 * count it as contradicting.
296 	 */
297 
298 	/* start with the first row forming a group */
299 	group_size = 1;
300 
301 	/* loop 1 beyond the end of the array so that we count the final group */
302 	for (i = 1; i <= nitems; i++)
303 	{
304 		/*
305 		 * Check if the group ended, which may be either because we processed
306 		 * all the items (i==nitems), or because the i-th item is not equal to
307 		 * the preceding one.
308 		 */
309 		if (i == nitems ||
310 			multi_sort_compare_dims(0, k - 2, &items[i - 1], &items[i], mss) != 0)
311 		{
312 			/*
313 			 * If no violations were found in the group then track the rows of
314 			 * the group as supporting the functional dependency.
315 			 */
316 			if (n_violations == 0)
317 				n_supporting_rows += group_size;
318 
319 			/* Reset counters for the new group */
320 			n_violations = 0;
321 			group_size = 1;
322 			continue;
323 		}
324 		/* first columns match, but the last one does not (so contradicting) */
325 		else if (multi_sort_compare_dim(k - 1, &items[i - 1], &items[i], mss) != 0)
326 			n_violations++;
327 
328 		group_size++;
329 	}
330 
331 	/* Compute the 'degree of validity' as (supporting/total). */
332 	return (n_supporting_rows * 1.0 / numrows);
333 }
334 
335 /*
336  * detects functional dependencies between groups of columns
337  *
338  * Generates all possible subsets of columns (variations) and computes
339  * the degree of validity for each one. For example when creating statistics
340  * on three columns (a,b,c) there are 9 possible dependencies
341  *
342  *	   two columns			  three columns
343  *	   -----------			  -------------
344  *	   (a) -> b				  (a,b) -> c
345  *	   (a) -> c				  (a,c) -> b
346  *	   (b) -> a				  (b,c) -> a
347  *	   (b) -> c
348  *	   (c) -> a
349  *	   (c) -> b
350  */
351 MVDependencies *
statext_dependencies_build(int numrows,HeapTuple * rows,Bitmapset * attrs,VacAttrStats ** stats)352 statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
353 						   VacAttrStats **stats)
354 {
355 	int			i,
356 				k;
357 	int			numattrs;
358 	AttrNumber *attnums;
359 
360 	/* result */
361 	MVDependencies *dependencies = NULL;
362 	MemoryContext	cxt;
363 
364 	/*
365 	 * Transform the bms into an array, to make accessing i-th member easier.
366 	 */
367 	attnums = build_attnums_array(attrs, &numattrs);
368 
369 	Assert(numattrs >= 2);
370 
371 	/* tracks memory allocated by dependency_degree calls */
372 	cxt = AllocSetContextCreate(CurrentMemoryContext,
373 								"dependency_degree cxt",
374 								ALLOCSET_DEFAULT_SIZES);
375 
376 	/*
377 	 * We'll try build functional dependencies starting from the smallest ones
378 	 * covering just 2 columns, to the largest ones, covering all columns
379 	 * included in the statistics object.  We start from the smallest ones
380 	 * because we want to be able to skip already implied ones.
381 	 */
382 	for (k = 2; k <= numattrs; k++)
383 	{
384 		AttrNumber *dependency; /* array with k elements */
385 
386 		/* prepare a DependencyGenerator of variation */
387 		DependencyGenerator DependencyGenerator = DependencyGenerator_init(numattrs, k);
388 
389 		/* generate all possible variations of k values (out of n) */
390 		while ((dependency = DependencyGenerator_next(DependencyGenerator)))
391 		{
392 			double		degree;
393 			MVDependency *d;
394 			MemoryContext oldcxt;
395 
396 			/* release memory used by dependency degree calculation */
397 			oldcxt = MemoryContextSwitchTo(cxt);
398 
399 			/* compute how valid the dependency seems */
400 			degree = dependency_degree(numrows, rows, k, dependency, stats, attrs);
401 
402 			MemoryContextSwitchTo(oldcxt);
403 			MemoryContextReset(cxt);
404 
405 			/*
406 			 * if the dependency seems entirely invalid, don't store it
407 			 */
408 			if (degree == 0.0)
409 				continue;
410 
411 			d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
412 										 + k * sizeof(AttrNumber));
413 
414 			/* copy the dependency (and keep the indexes into stxkeys) */
415 			d->degree = degree;
416 			d->nattributes = k;
417 			for (i = 0; i < k; i++)
418 				d->attributes[i] = attnums[dependency[i]];
419 
420 			/* initialize the list of dependencies */
421 			if (dependencies == NULL)
422 			{
423 				dependencies
424 					= (MVDependencies *) palloc0(sizeof(MVDependencies));
425 
426 				dependencies->magic = STATS_DEPS_MAGIC;
427 				dependencies->type = STATS_DEPS_TYPE_BASIC;
428 				dependencies->ndeps = 0;
429 			}
430 
431 			dependencies->ndeps++;
432 			dependencies = (MVDependencies *) repalloc(dependencies,
433 													   offsetof(MVDependencies, deps)
434 													   + dependencies->ndeps * sizeof(MVDependency *));
435 
436 			dependencies->deps[dependencies->ndeps - 1] = d;
437 		}
438 
439 		/*
440 		 * we're done with variations of k elements, so free the
441 		 * DependencyGenerator
442 		 */
443 		DependencyGenerator_free(DependencyGenerator);
444 	}
445 
446 	MemoryContextDelete(cxt);
447 
448 	return dependencies;
449 }
450 
451 
452 /*
453  * Serialize list of dependencies into a bytea value.
454  */
455 bytea *
statext_dependencies_serialize(MVDependencies * dependencies)456 statext_dependencies_serialize(MVDependencies *dependencies)
457 {
458 	int			i;
459 	bytea	   *output;
460 	char	   *tmp;
461 	Size		len;
462 
463 	/* we need to store ndeps, with a number of attributes for each one */
464 	len = VARHDRSZ + SizeOfHeader;
465 
466 	/* and also include space for the actual attribute numbers and degrees */
467 	for (i = 0; i < dependencies->ndeps; i++)
468 		len += SizeOfItem(dependencies->deps[i]->nattributes);
469 
470 	output = (bytea *) palloc0(len);
471 	SET_VARSIZE(output, len);
472 
473 	tmp = VARDATA(output);
474 
475 	/* Store the base struct values (magic, type, ndeps) */
476 	memcpy(tmp, &dependencies->magic, sizeof(uint32));
477 	tmp += sizeof(uint32);
478 	memcpy(tmp, &dependencies->type, sizeof(uint32));
479 	tmp += sizeof(uint32);
480 	memcpy(tmp, &dependencies->ndeps, sizeof(uint32));
481 	tmp += sizeof(uint32);
482 
483 	/* store number of attributes and attribute numbers for each dependency */
484 	for (i = 0; i < dependencies->ndeps; i++)
485 	{
486 		MVDependency *d = dependencies->deps[i];
487 
488 		memcpy(tmp, &d->degree, sizeof(double));
489 		tmp += sizeof(double);
490 
491 		memcpy(tmp, &d->nattributes, sizeof(AttrNumber));
492 		tmp += sizeof(AttrNumber);
493 
494 		memcpy(tmp, d->attributes, sizeof(AttrNumber) * d->nattributes);
495 		tmp += sizeof(AttrNumber) * d->nattributes;
496 
497 		/* protect against overflow */
498 		Assert(tmp <= ((char *) output + len));
499 	}
500 
501 	/* make sure we've produced exactly the right amount of data */
502 	Assert(tmp == ((char *) output + len));
503 
504 	return output;
505 }
506 
507 /*
508  * Reads serialized dependencies into MVDependencies structure.
509  */
510 MVDependencies *
statext_dependencies_deserialize(bytea * data)511 statext_dependencies_deserialize(bytea *data)
512 {
513 	int			i;
514 	Size		min_expected_size;
515 	MVDependencies *dependencies;
516 	char	   *tmp;
517 
518 	if (data == NULL)
519 		return NULL;
520 
521 	if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader)
522 		elog(ERROR, "invalid MVDependencies size %zd (expected at least %zd)",
523 			 VARSIZE_ANY_EXHDR(data), SizeOfHeader);
524 
525 	/* read the MVDependencies header */
526 	dependencies = (MVDependencies *) palloc0(sizeof(MVDependencies));
527 
528 	/* initialize pointer to the data part (skip the varlena header) */
529 	tmp = VARDATA_ANY(data);
530 
531 	/* read the header fields and perform basic sanity checks */
532 	memcpy(&dependencies->magic, tmp, sizeof(uint32));
533 	tmp += sizeof(uint32);
534 	memcpy(&dependencies->type, tmp, sizeof(uint32));
535 	tmp += sizeof(uint32);
536 	memcpy(&dependencies->ndeps, tmp, sizeof(uint32));
537 	tmp += sizeof(uint32);
538 
539 	if (dependencies->magic != STATS_DEPS_MAGIC)
540 		elog(ERROR, "invalid dependency magic %d (expected %d)",
541 			 dependencies->magic, STATS_DEPS_MAGIC);
542 
543 	if (dependencies->type != STATS_DEPS_TYPE_BASIC)
544 		elog(ERROR, "invalid dependency type %d (expected %d)",
545 			 dependencies->type, STATS_DEPS_TYPE_BASIC);
546 
547 	if (dependencies->ndeps == 0)
548 		elog(ERROR, "invalid zero-length item array in MVDependencies");
549 
550 	/* what minimum bytea size do we expect for those parameters */
551 	min_expected_size = SizeOfItem(dependencies->ndeps);
552 
553 	if (VARSIZE_ANY_EXHDR(data) < min_expected_size)
554 		elog(ERROR, "invalid dependencies size %zd (expected at least %zd)",
555 			 VARSIZE_ANY_EXHDR(data), min_expected_size);
556 
557 	/* allocate space for the MCV items */
558 	dependencies = repalloc(dependencies, offsetof(MVDependencies, deps)
559 							+ (dependencies->ndeps * sizeof(MVDependency *)));
560 
561 	for (i = 0; i < dependencies->ndeps; i++)
562 	{
563 		double		degree;
564 		AttrNumber	k;
565 		MVDependency *d;
566 
567 		/* degree of validity */
568 		memcpy(&degree, tmp, sizeof(double));
569 		tmp += sizeof(double);
570 
571 		/* number of attributes */
572 		memcpy(&k, tmp, sizeof(AttrNumber));
573 		tmp += sizeof(AttrNumber);
574 
575 		/* is the number of attributes valid? */
576 		Assert((k >= 2) && (k <= STATS_MAX_DIMENSIONS));
577 
578 		/* now that we know the number of attributes, allocate the dependency */
579 		d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
580 									 + (k * sizeof(AttrNumber)));
581 
582 		d->degree = degree;
583 		d->nattributes = k;
584 
585 		/* copy attribute numbers */
586 		memcpy(d->attributes, tmp, sizeof(AttrNumber) * d->nattributes);
587 		tmp += sizeof(AttrNumber) * d->nattributes;
588 
589 		dependencies->deps[i] = d;
590 
591 		/* still within the bytea */
592 		Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
593 	}
594 
595 	/* we should have consumed the whole bytea exactly */
596 	Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
597 
598 	return dependencies;
599 }
600 
601 /*
602  * dependency_is_fully_matched
603  *		checks that a functional dependency is fully matched given clauses on
604  *		attributes (assuming the clauses are suitable equality clauses)
605  */
606 static bool
dependency_is_fully_matched(MVDependency * dependency,Bitmapset * attnums)607 dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums)
608 {
609 	int			j;
610 
611 	/*
612 	 * Check that the dependency actually is fully covered by clauses. We have
613 	 * to translate all attribute numbers, as those are referenced
614 	 */
615 	for (j = 0; j < dependency->nattributes; j++)
616 	{
617 		int			attnum = dependency->attributes[j];
618 
619 		if (!bms_is_member(attnum, attnums))
620 			return false;
621 	}
622 
623 	return true;
624 }
625 
626 /*
627  * dependency_implies_attribute
628  *		check that the attnum matches is implied by the functional dependency
629  */
630 static bool
dependency_implies_attribute(MVDependency * dependency,AttrNumber attnum)631 dependency_implies_attribute(MVDependency *dependency, AttrNumber attnum)
632 {
633 	if (attnum == dependency->attributes[dependency->nattributes - 1])
634 		return true;
635 
636 	return false;
637 }
638 
639 /*
640  * statext_dependencies_load
641  *		Load the functional dependencies for the indicated pg_statistic_ext tuple
642  */
643 MVDependencies *
statext_dependencies_load(Oid mvoid)644 statext_dependencies_load(Oid mvoid)
645 {
646 	MVDependencies *result;
647 	bool		isnull;
648 	Datum		deps;
649 	HeapTuple	htup;
650 
651 	htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid));
652 	if (!HeapTupleIsValid(htup))
653 		elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
654 
655 	deps = SysCacheGetAttr(STATEXTDATASTXOID, htup,
656 						   Anum_pg_statistic_ext_data_stxddependencies, &isnull);
657 	if (isnull)
658 		elog(ERROR,
659 			 "requested statistics kind \"%c\" is not yet built for statistics object %u",
660 			 STATS_EXT_DEPENDENCIES, mvoid);
661 
662 	result = statext_dependencies_deserialize(DatumGetByteaPP(deps));
663 
664 	ReleaseSysCache(htup);
665 
666 	return result;
667 }
668 
669 /*
670  * pg_dependencies_in		- input routine for type pg_dependencies.
671  *
672  * pg_dependencies is real enough to be a table column, but it has no operations
673  * of its own, and disallows input too
674  */
675 Datum
pg_dependencies_in(PG_FUNCTION_ARGS)676 pg_dependencies_in(PG_FUNCTION_ARGS)
677 {
678 	/*
679 	 * pg_node_list stores the data in binary form and parsing text input is
680 	 * not needed, so disallow this.
681 	 */
682 	ereport(ERROR,
683 			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
684 			 errmsg("cannot accept a value of type %s", "pg_dependencies")));
685 
686 	PG_RETURN_VOID();			/* keep compiler quiet */
687 }
688 
689 /*
690  * pg_dependencies		- output routine for type pg_dependencies.
691  */
692 Datum
pg_dependencies_out(PG_FUNCTION_ARGS)693 pg_dependencies_out(PG_FUNCTION_ARGS)
694 {
695 	bytea	   *data = PG_GETARG_BYTEA_PP(0);
696 	MVDependencies *dependencies = statext_dependencies_deserialize(data);
697 	int			i,
698 				j;
699 	StringInfoData str;
700 
701 	initStringInfo(&str);
702 	appendStringInfoChar(&str, '{');
703 
704 	for (i = 0; i < dependencies->ndeps; i++)
705 	{
706 		MVDependency *dependency = dependencies->deps[i];
707 
708 		if (i > 0)
709 			appendStringInfoString(&str, ", ");
710 
711 		appendStringInfoChar(&str, '"');
712 		for (j = 0; j < dependency->nattributes; j++)
713 		{
714 			if (j == dependency->nattributes - 1)
715 				appendStringInfoString(&str, " => ");
716 			else if (j > 0)
717 				appendStringInfoString(&str, ", ");
718 
719 			appendStringInfo(&str, "%d", dependency->attributes[j]);
720 		}
721 		appendStringInfo(&str, "\": %f", dependency->degree);
722 	}
723 
724 	appendStringInfoChar(&str, '}');
725 
726 	PG_RETURN_CSTRING(str.data);
727 }
728 
729 /*
730  * pg_dependencies_recv		- binary input routine for type pg_dependencies.
731  */
732 Datum
pg_dependencies_recv(PG_FUNCTION_ARGS)733 pg_dependencies_recv(PG_FUNCTION_ARGS)
734 {
735 	ereport(ERROR,
736 			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
737 			 errmsg("cannot accept a value of type %s", "pg_dependencies")));
738 
739 	PG_RETURN_VOID();			/* keep compiler quiet */
740 }
741 
742 /*
743  * pg_dependencies_send		- binary output routine for type pg_dependencies.
744  *
745  * Functional dependencies are serialized in a bytea value (although the type
746  * is named differently), so let's just send that.
747  */
748 Datum
pg_dependencies_send(PG_FUNCTION_ARGS)749 pg_dependencies_send(PG_FUNCTION_ARGS)
750 {
751 	return byteasend(fcinfo);
752 }
753 
754 /*
755  * dependency_is_compatible_clause
756  *		Determines if the clause is compatible with functional dependencies
757  *
758  * Only clauses that have the form of equality to a pseudoconstant, or can be
759  * interpreted that way, are currently accepted.  Furthermore the variable
760  * part of the clause must be a simple Var belonging to the specified
761  * relation, whose attribute number we return in *attnum on success.
762  */
763 static bool
dependency_is_compatible_clause(Node * clause,Index relid,AttrNumber * attnum)764 dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
765 {
766 	RestrictInfo *rinfo = (RestrictInfo *) clause;
767 	Var		   *var;
768 
769 	if (!IsA(rinfo, RestrictInfo))
770 		return false;
771 
772 	/* Pseudoconstants are not interesting (they couldn't contain a Var) */
773 	if (rinfo->pseudoconstant)
774 		return false;
775 
776 	/* Clauses referencing multiple, or no, varnos are incompatible */
777 	if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
778 		return false;
779 
780 	if (is_opclause(rinfo->clause))
781 	{
782 		/* If it's an opclause, check for Var = Const or Const = Var. */
783 		OpExpr	   *expr = (OpExpr *) rinfo->clause;
784 
785 		/* Only expressions with two arguments are candidates. */
786 		if (list_length(expr->args) != 2)
787 			return false;
788 
789 		/* Make sure non-selected argument is a pseudoconstant. */
790 		if (is_pseudo_constant_clause(lsecond(expr->args)))
791 			var = linitial(expr->args);
792 		else if (is_pseudo_constant_clause(linitial(expr->args)))
793 			var = lsecond(expr->args);
794 		else
795 			return false;
796 
797 		/*
798 		 * If it's not an "=" operator, just ignore the clause, as it's not
799 		 * compatible with functional dependencies.
800 		 *
801 		 * This uses the function for estimating selectivity, not the operator
802 		 * directly (a bit awkward, but well ...).
803 		 *
804 		 * XXX this is pretty dubious; probably it'd be better to check btree
805 		 * or hash opclass membership, so as not to be fooled by custom
806 		 * selectivity functions, and to be more consistent with decisions
807 		 * elsewhere in the planner.
808 		 */
809 		if (get_oprrest(expr->opno) != F_EQSEL)
810 			return false;
811 
812 		/* OK to proceed with checking "var" */
813 	}
814 	else if (is_notclause(rinfo->clause))
815 	{
816 		/*
817 		 * "NOT x" can be interpreted as "x = false", so get the argument and
818 		 * proceed with seeing if it's a suitable Var.
819 		 */
820 		var = (Var *) get_notclausearg(rinfo->clause);
821 	}
822 	else
823 	{
824 		/*
825 		 * A boolean expression "x" can be interpreted as "x = true", so
826 		 * proceed with seeing if it's a suitable Var.
827 		 */
828 		var = (Var *) rinfo->clause;
829 	}
830 
831 	/*
832 	 * We may ignore any RelabelType node above the operand.  (There won't be
833 	 * more than one, since eval_const_expressions has been applied already.)
834 	 */
835 	if (IsA(var, RelabelType))
836 		var = (Var *) ((RelabelType *) var)->arg;
837 
838 	/* We only support plain Vars for now */
839 	if (!IsA(var, Var))
840 		return false;
841 
842 	/* Ensure Var is from the correct relation */
843 	if (var->varno != relid)
844 		return false;
845 
846 	/* We also better ensure the Var is from the current level */
847 	if (var->varlevelsup != 0)
848 		return false;
849 
850 	/* Also ignore system attributes (we don't allow stats on those) */
851 	if (!AttrNumberIsForUserDefinedAttr(var->varattno))
852 		return false;
853 
854 	*attnum = var->varattno;
855 	return true;
856 }
857 
858 /*
859  * find_strongest_dependency
860  *		find the strongest dependency on the attributes
861  *
862  * When applying functional dependencies, we start with the strongest
863  * dependencies. That is, we select the dependency that:
864  *
865  * (a) has all attributes covered by equality clauses
866  *
867  * (b) has the most attributes
868  *
869  * (c) has the highest degree of validity
870  *
871  * This guarantees that we eliminate the most redundant conditions first
872  * (see the comment in dependencies_clauselist_selectivity).
873  */
874 static MVDependency *
find_strongest_dependency(StatisticExtInfo * stats,MVDependencies * dependencies,Bitmapset * attnums)875 find_strongest_dependency(StatisticExtInfo *stats, MVDependencies *dependencies,
876 						  Bitmapset *attnums)
877 {
878 	int			i;
879 	MVDependency *strongest = NULL;
880 
881 	/* number of attnums in clauses */
882 	int			nattnums = bms_num_members(attnums);
883 
884 	/*
885 	 * Iterate over the MVDependency items and find the strongest one from the
886 	 * fully-matched dependencies. We do the cheap checks first, before
887 	 * matching it against the attnums.
888 	 */
889 	for (i = 0; i < dependencies->ndeps; i++)
890 	{
891 		MVDependency *dependency = dependencies->deps[i];
892 
893 		/*
894 		 * Skip dependencies referencing more attributes than available
895 		 * clauses, as those can't be fully matched.
896 		 */
897 		if (dependency->nattributes > nattnums)
898 			continue;
899 
900 		if (strongest)
901 		{
902 			/* skip dependencies on fewer attributes than the strongest. */
903 			if (dependency->nattributes < strongest->nattributes)
904 				continue;
905 
906 			/* also skip weaker dependencies when attribute count matches */
907 			if (strongest->nattributes == dependency->nattributes &&
908 				strongest->degree > dependency->degree)
909 				continue;
910 		}
911 
912 		/*
913 		 * this dependency is stronger, but we must still check that it's
914 		 * fully matched to these attnums. We perform this check last as it's
915 		 * slightly more expensive than the previous checks.
916 		 */
917 		if (dependency_is_fully_matched(dependency, attnums))
918 			strongest = dependency; /* save new best match */
919 	}
920 
921 	return strongest;
922 }
923 
924 /*
925  * dependencies_clauselist_selectivity
926  *		Return the estimated selectivity of (a subset of) the given clauses
927  *		using functional dependency statistics, or 1.0 if no useful functional
928  *		dependency statistic exists.
929  *
930  * 'estimatedclauses' is an input/output argument that gets a bit set
931  * corresponding to the (zero-based) list index of each clause that is included
932  * in the estimated selectivity.
933  *
934  * Given equality clauses on attributes (a,b) we find the strongest dependency
935  * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected
936  * dependency, we then combine the per-clause selectivities using the formula
937  *
938  *	   P(a,b) = P(a) * [f + (1-f)*P(b)]
939  *
940  * where 'f' is the degree of the dependency.
941  *
942  * With clauses on more than two attributes, the dependencies are applied
943  * recursively, starting with the widest/strongest dependencies. For example
944  * P(a,b,c) is first split like this:
945  *
946  *	   P(a,b,c) = P(a,b) * [f + (1-f)*P(c)]
947  *
948  * assuming (a,b=>c) is the strongest dependency.
949  */
950 Selectivity
dependencies_clauselist_selectivity(PlannerInfo * root,List * clauses,int varRelid,JoinType jointype,SpecialJoinInfo * sjinfo,RelOptInfo * rel,Bitmapset ** estimatedclauses)951 dependencies_clauselist_selectivity(PlannerInfo *root,
952 									List *clauses,
953 									int varRelid,
954 									JoinType jointype,
955 									SpecialJoinInfo *sjinfo,
956 									RelOptInfo *rel,
957 									Bitmapset **estimatedclauses)
958 {
959 	Selectivity s1 = 1.0;
960 	ListCell   *l;
961 	Bitmapset  *clauses_attnums = NULL;
962 	StatisticExtInfo *stat;
963 	MVDependencies *dependencies;
964 	Bitmapset **list_attnums;
965 	int			listidx;
966 
967 	/* check if there's any stats that might be useful for us. */
968 	if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
969 		return 1.0;
970 
971 	list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) *
972 										 list_length(clauses));
973 
974 	/*
975 	 * Pre-process the clauses list to extract the attnums seen in each item.
976 	 * We need to determine if there's any clauses which will be useful for
977 	 * dependency selectivity estimations. Along the way we'll record all of
978 	 * the attnums for each clause in a list which we'll reference later so we
979 	 * don't need to repeat the same work again. We'll also keep track of all
980 	 * attnums seen.
981 	 *
982 	 * We also skip clauses that we already estimated using different types of
983 	 * statistics (we treat them as incompatible).
984 	 */
985 	listidx = 0;
986 	foreach(l, clauses)
987 	{
988 		Node	   *clause = (Node *) lfirst(l);
989 		AttrNumber	attnum;
990 
991 		if (!bms_is_member(listidx, *estimatedclauses) &&
992 			dependency_is_compatible_clause(clause, rel->relid, &attnum))
993 		{
994 			list_attnums[listidx] = bms_make_singleton(attnum);
995 			clauses_attnums = bms_add_member(clauses_attnums, attnum);
996 		}
997 		else
998 			list_attnums[listidx] = NULL;
999 
1000 		listidx++;
1001 	}
1002 
1003 	/*
1004 	 * If there's not at least two distinct attnums then reject the whole list
1005 	 * of clauses. We must return 1.0 so the calling function's selectivity is
1006 	 * unaffected.
1007 	 */
1008 	if (bms_num_members(clauses_attnums) < 2)
1009 	{
1010 		pfree(list_attnums);
1011 		return 1.0;
1012 	}
1013 
1014 	/* find the best suited statistics object for these attnums */
1015 	stat = choose_best_statistics(rel->statlist, STATS_EXT_DEPENDENCIES,
1016 								  list_attnums, list_length(clauses));
1017 
1018 	/* if no matching stats could be found then we've nothing to do */
1019 	if (!stat)
1020 	{
1021 		pfree(list_attnums);
1022 		return 1.0;
1023 	}
1024 
1025 	/* load the dependency items stored in the statistics object */
1026 	dependencies = statext_dependencies_load(stat->statOid);
1027 
1028 	/*
1029 	 * Apply the dependencies recursively, starting with the widest/strongest
1030 	 * ones, and proceeding to the smaller/weaker ones. At the end of each
1031 	 * round we factor in the selectivity of clauses on the implied attribute,
1032 	 * and remove the clauses from the list.
1033 	 */
1034 	while (true)
1035 	{
1036 		Selectivity s2 = 1.0;
1037 		MVDependency *dependency;
1038 
1039 		/* the widest/strongest dependency, fully matched by clauses */
1040 		dependency = find_strongest_dependency(stat, dependencies,
1041 											   clauses_attnums);
1042 
1043 		/* if no suitable dependency was found, we're done */
1044 		if (!dependency)
1045 			break;
1046 
1047 		/*
1048 		 * We found an applicable dependency, so find all the clauses on the
1049 		 * implied attribute - with dependency (a,b => c) we look for clauses
1050 		 * on 'c'.
1051 		 */
1052 		listidx = -1;
1053 		foreach(l, clauses)
1054 		{
1055 			Node	   *clause;
1056 			AttrNumber	attnum;
1057 
1058 			listidx++;
1059 
1060 			/*
1061 			 * Skip incompatible clauses, and ones we've already estimated on.
1062 			 */
1063 			if (!list_attnums[listidx])
1064 				continue;
1065 
1066 			/*
1067 			 * We expect the bitmaps ton contain a single attribute number.
1068 			 */
1069 			attnum = bms_singleton_member(list_attnums[listidx]);
1070 
1071 			/*
1072 			 * Technically we could find more than one clause for a given
1073 			 * attnum. Since these clauses must be equality clauses, we choose
1074 			 * to only take the selectivity estimate from the final clause in
1075 			 * the list for this attnum. If the attnum happens to be compared
1076 			 * to a different Const in another clause then no rows will match
1077 			 * anyway. If it happens to be compared to the same Const, then
1078 			 * ignoring the additional clause is just the thing to do.
1079 			 */
1080 			if (dependency_implies_attribute(dependency, attnum))
1081 			{
1082 				clause = (Node *) lfirst(l);
1083 
1084 				s2 = clause_selectivity(root, clause, varRelid, jointype,
1085 										sjinfo);
1086 
1087 				/* mark this one as done, so we don't touch it again. */
1088 				*estimatedclauses = bms_add_member(*estimatedclauses, listidx);
1089 
1090 				/*
1091 				 * Mark that we've got and used the dependency on this clause.
1092 				 * We'll want to ignore this when looking for the next
1093 				 * strongest dependency above.
1094 				 */
1095 				clauses_attnums = bms_del_member(clauses_attnums, attnum);
1096 			}
1097 		}
1098 
1099 		/*
1100 		 * Now factor in the selectivity for all the "implied" clauses into
1101 		 * the final one, using this formula:
1102 		 *
1103 		 * P(a,b) = P(a) * (f + (1-f) * P(b))
1104 		 *
1105 		 * where 'f' is the degree of validity of the dependency.
1106 		 */
1107 		s1 *= (dependency->degree + (1 - dependency->degree) * s2);
1108 	}
1109 
1110 	pfree(dependencies);
1111 	pfree(list_attnums);
1112 
1113 	return s1;
1114 }
1115