1 /*-------------------------------------------------------------------------
2 *
3 * dependencies.c
4 * POSTGRES functional dependencies
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/statistics/dependencies.c
11 *
12 *-------------------------------------------------------------------------
13 */
14 #include "postgres.h"
15
16 #include "access/htup_details.h"
17 #include "access/sysattr.h"
18 #include "catalog/pg_operator.h"
19 #include "catalog/pg_statistic_ext.h"
20 #include "catalog/pg_statistic_ext_data.h"
21 #include "lib/stringinfo.h"
22 #include "nodes/nodeFuncs.h"
23 #include "optimizer/clauses.h"
24 #include "optimizer/optimizer.h"
25 #include "nodes/nodes.h"
26 #include "nodes/pathnodes.h"
27 #include "statistics/extended_stats_internal.h"
28 #include "statistics/statistics.h"
29 #include "utils/bytea.h"
30 #include "utils/fmgroids.h"
31 #include "utils/fmgrprotos.h"
32 #include "utils/lsyscache.h"
33 #include "utils/memutils.h"
34 #include "utils/selfuncs.h"
35 #include "utils/syscache.h"
36 #include "utils/typcache.h"
37
38 /* size of the struct header fields (magic, type, ndeps) */
39 #define SizeOfHeader (3 * sizeof(uint32))
40
41 /* size of a serialized dependency (degree, natts, atts) */
42 #define SizeOfItem(natts) \
43 (sizeof(double) + sizeof(AttrNumber) * (1 + (natts)))
44
45 /* minimal size of a dependency (with two attributes) */
46 #define MinSizeOfItem SizeOfItem(2)
47
48 /* minimal size of dependencies, when all deps are minimal */
49 #define MinSizeOfItems(ndeps) \
50 (SizeOfHeader + (ndeps) * MinSizeOfItem)
51
52 /*
53 * Internal state for DependencyGenerator of dependencies. Dependencies are similar to
54 * k-permutations of n elements, except that the order does not matter for the
55 * first (k-1) elements. That is, (a,b=>c) and (b,a=>c) are equivalent.
56 */
57 typedef struct DependencyGeneratorData
58 {
59 int k; /* size of the dependency */
60 int n; /* number of possible attributes */
61 int current; /* next dependency to return (index) */
62 AttrNumber ndependencies; /* number of dependencies generated */
63 AttrNumber *dependencies; /* array of pre-generated dependencies */
64 } DependencyGeneratorData;
65
66 typedef DependencyGeneratorData *DependencyGenerator;
67
68 static void generate_dependencies_recurse(DependencyGenerator state,
69 int index, AttrNumber start, AttrNumber *current);
70 static void generate_dependencies(DependencyGenerator state);
71 static DependencyGenerator DependencyGenerator_init(int n, int k);
72 static void DependencyGenerator_free(DependencyGenerator state);
73 static AttrNumber *DependencyGenerator_next(DependencyGenerator state);
74 static double dependency_degree(int numrows, HeapTuple *rows, int k,
75 AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs);
76 static bool dependency_is_fully_matched(MVDependency *dependency,
77 Bitmapset *attnums);
78 static bool dependency_implies_attribute(MVDependency *dependency,
79 AttrNumber attnum);
80 static bool dependency_is_compatible_clause(Node *clause, Index relid,
81 AttrNumber *attnum);
82 static MVDependency *find_strongest_dependency(StatisticExtInfo *stats,
83 MVDependencies *dependencies,
84 Bitmapset *attnums);
85
86 static void
generate_dependencies_recurse(DependencyGenerator state,int index,AttrNumber start,AttrNumber * current)87 generate_dependencies_recurse(DependencyGenerator state, int index,
88 AttrNumber start, AttrNumber *current)
89 {
90 /*
91 * The generator handles the first (k-1) elements differently from the
92 * last element.
93 */
94 if (index < (state->k - 1))
95 {
96 AttrNumber i;
97
98 /*
99 * The first (k-1) values have to be in ascending order, which we
100 * generate recursively.
101 */
102
103 for (i = start; i < state->n; i++)
104 {
105 current[index] = i;
106 generate_dependencies_recurse(state, (index + 1), (i + 1), current);
107 }
108 }
109 else
110 {
111 int i;
112
113 /*
114 * the last element is the implied value, which does not respect the
115 * ascending order. We just need to check that the value is not in the
116 * first (k-1) elements.
117 */
118
119 for (i = 0; i < state->n; i++)
120 {
121 int j;
122 bool match = false;
123
124 current[index] = i;
125
126 for (j = 0; j < index; j++)
127 {
128 if (current[j] == i)
129 {
130 match = true;
131 break;
132 }
133 }
134
135 /*
136 * If the value is not found in the first part of the dependency,
137 * we're done.
138 */
139 if (!match)
140 {
141 state->dependencies = (AttrNumber *) repalloc(state->dependencies,
142 state->k * (state->ndependencies + 1) * sizeof(AttrNumber));
143 memcpy(&state->dependencies[(state->k * state->ndependencies)],
144 current, state->k * sizeof(AttrNumber));
145 state->ndependencies++;
146 }
147 }
148 }
149 }
150
151 /* generate all dependencies (k-permutations of n elements) */
152 static void
generate_dependencies(DependencyGenerator state)153 generate_dependencies(DependencyGenerator state)
154 {
155 AttrNumber *current = (AttrNumber *) palloc0(sizeof(AttrNumber) * state->k);
156
157 generate_dependencies_recurse(state, 0, 0, current);
158
159 pfree(current);
160 }
161
162 /*
163 * initialize the DependencyGenerator of variations, and prebuild the variations
164 *
165 * This pre-builds all the variations. We could also generate them in
166 * DependencyGenerator_next(), but this seems simpler.
167 */
168 static DependencyGenerator
DependencyGenerator_init(int n,int k)169 DependencyGenerator_init(int n, int k)
170 {
171 DependencyGenerator state;
172
173 Assert((n >= k) && (k > 0));
174
175 /* allocate the DependencyGenerator state */
176 state = (DependencyGenerator) palloc0(sizeof(DependencyGeneratorData));
177 state->dependencies = (AttrNumber *) palloc(k * sizeof(AttrNumber));
178
179 state->ndependencies = 0;
180 state->current = 0;
181 state->k = k;
182 state->n = n;
183
184 /* now actually pre-generate all the variations */
185 generate_dependencies(state);
186
187 return state;
188 }
189
190 /* free the DependencyGenerator state */
191 static void
DependencyGenerator_free(DependencyGenerator state)192 DependencyGenerator_free(DependencyGenerator state)
193 {
194 pfree(state->dependencies);
195 pfree(state);
196
197 }
198
199 /* generate next combination */
200 static AttrNumber *
DependencyGenerator_next(DependencyGenerator state)201 DependencyGenerator_next(DependencyGenerator state)
202 {
203 if (state->current == state->ndependencies)
204 return NULL;
205
206 return &state->dependencies[state->k * state->current++];
207 }
208
209
210 /*
211 * validates functional dependency on the data
212 *
213 * An actual work horse of detecting functional dependencies. Given a variation
214 * of k attributes, it checks that the first (k-1) are sufficient to determine
215 * the last one.
216 */
217 static double
dependency_degree(int numrows,HeapTuple * rows,int k,AttrNumber * dependency,VacAttrStats ** stats,Bitmapset * attrs)218 dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
219 VacAttrStats **stats, Bitmapset *attrs)
220 {
221 int i,
222 nitems;
223 MultiSortSupport mss;
224 SortItem *items;
225 AttrNumber *attnums;
226 AttrNumber *attnums_dep;
227 int numattrs;
228
229 /* counters valid within a group */
230 int group_size = 0;
231 int n_violations = 0;
232
233 /* total number of rows supporting (consistent with) the dependency */
234 int n_supporting_rows = 0;
235
236 /* Make sure we have at least two input attributes. */
237 Assert(k >= 2);
238
239 /* sort info for all attributes columns */
240 mss = multi_sort_init(k);
241
242 /*
243 * Transform the attrs from bitmap to an array to make accessing the i-th
244 * member easier, and then construct a filtered version with only attnums
245 * referenced by the dependency we validate.
246 */
247 attnums = build_attnums_array(attrs, &numattrs);
248
249 attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber));
250 for (i = 0; i < k; i++)
251 attnums_dep[i] = attnums[dependency[i]];
252
253 /*
254 * Verify the dependency (a,b,...)->z, using a rather simple algorithm:
255 *
256 * (a) sort the data lexicographically
257 *
258 * (b) split the data into groups by first (k-1) columns
259 *
260 * (c) for each group count different values in the last column
261 *
262 * We use the column data types' default sort operators and collations;
263 * perhaps at some point it'd be worth using column-specific collations?
264 */
265
266 /* prepare the sort function for the dimensions */
267 for (i = 0; i < k; i++)
268 {
269 VacAttrStats *colstat = stats[dependency[i]];
270 TypeCacheEntry *type;
271
272 type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
273 if (type->lt_opr == InvalidOid) /* shouldn't happen */
274 elog(ERROR, "cache lookup failed for ordering operator for type %u",
275 colstat->attrtypid);
276
277 /* prepare the sort function for this dimension */
278 multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
279 }
280
281 /*
282 * build an array of SortItem(s) sorted using the multi-sort support
283 *
284 * XXX This relies on all stats entries pointing to the same tuple
285 * descriptor. For now that assumption holds, but it might change in the
286 * future for example if we support statistics on multiple tables.
287 */
288 items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
289 mss, k, attnums_dep);
290
291 /*
292 * Walk through the sorted array, split it into rows according to the
293 * first (k-1) columns. If there's a single value in the last column, we
294 * count the group as 'supporting' the functional dependency. Otherwise we
295 * count it as contradicting.
296 */
297
298 /* start with the first row forming a group */
299 group_size = 1;
300
301 /* loop 1 beyond the end of the array so that we count the final group */
302 for (i = 1; i <= nitems; i++)
303 {
304 /*
305 * Check if the group ended, which may be either because we processed
306 * all the items (i==nitems), or because the i-th item is not equal to
307 * the preceding one.
308 */
309 if (i == nitems ||
310 multi_sort_compare_dims(0, k - 2, &items[i - 1], &items[i], mss) != 0)
311 {
312 /*
313 * If no violations were found in the group then track the rows of
314 * the group as supporting the functional dependency.
315 */
316 if (n_violations == 0)
317 n_supporting_rows += group_size;
318
319 /* Reset counters for the new group */
320 n_violations = 0;
321 group_size = 1;
322 continue;
323 }
324 /* first columns match, but the last one does not (so contradicting) */
325 else if (multi_sort_compare_dim(k - 1, &items[i - 1], &items[i], mss) != 0)
326 n_violations++;
327
328 group_size++;
329 }
330
331 /* Compute the 'degree of validity' as (supporting/total). */
332 return (n_supporting_rows * 1.0 / numrows);
333 }
334
335 /*
336 * detects functional dependencies between groups of columns
337 *
338 * Generates all possible subsets of columns (variations) and computes
339 * the degree of validity for each one. For example when creating statistics
340 * on three columns (a,b,c) there are 9 possible dependencies
341 *
342 * two columns three columns
343 * ----------- -------------
344 * (a) -> b (a,b) -> c
345 * (a) -> c (a,c) -> b
346 * (b) -> a (b,c) -> a
347 * (b) -> c
348 * (c) -> a
349 * (c) -> b
350 */
351 MVDependencies *
statext_dependencies_build(int numrows,HeapTuple * rows,Bitmapset * attrs,VacAttrStats ** stats)352 statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
353 VacAttrStats **stats)
354 {
355 int i,
356 k;
357 int numattrs;
358 AttrNumber *attnums;
359
360 /* result */
361 MVDependencies *dependencies = NULL;
362 MemoryContext cxt;
363
364 /*
365 * Transform the bms into an array, to make accessing i-th member easier.
366 */
367 attnums = build_attnums_array(attrs, &numattrs);
368
369 Assert(numattrs >= 2);
370
371 /* tracks memory allocated by dependency_degree calls */
372 cxt = AllocSetContextCreate(CurrentMemoryContext,
373 "dependency_degree cxt",
374 ALLOCSET_DEFAULT_SIZES);
375
376 /*
377 * We'll try build functional dependencies starting from the smallest ones
378 * covering just 2 columns, to the largest ones, covering all columns
379 * included in the statistics object. We start from the smallest ones
380 * because we want to be able to skip already implied ones.
381 */
382 for (k = 2; k <= numattrs; k++)
383 {
384 AttrNumber *dependency; /* array with k elements */
385
386 /* prepare a DependencyGenerator of variation */
387 DependencyGenerator DependencyGenerator = DependencyGenerator_init(numattrs, k);
388
389 /* generate all possible variations of k values (out of n) */
390 while ((dependency = DependencyGenerator_next(DependencyGenerator)))
391 {
392 double degree;
393 MVDependency *d;
394 MemoryContext oldcxt;
395
396 /* release memory used by dependency degree calculation */
397 oldcxt = MemoryContextSwitchTo(cxt);
398
399 /* compute how valid the dependency seems */
400 degree = dependency_degree(numrows, rows, k, dependency, stats, attrs);
401
402 MemoryContextSwitchTo(oldcxt);
403 MemoryContextReset(cxt);
404
405 /*
406 * if the dependency seems entirely invalid, don't store it
407 */
408 if (degree == 0.0)
409 continue;
410
411 d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
412 + k * sizeof(AttrNumber));
413
414 /* copy the dependency (and keep the indexes into stxkeys) */
415 d->degree = degree;
416 d->nattributes = k;
417 for (i = 0; i < k; i++)
418 d->attributes[i] = attnums[dependency[i]];
419
420 /* initialize the list of dependencies */
421 if (dependencies == NULL)
422 {
423 dependencies
424 = (MVDependencies *) palloc0(sizeof(MVDependencies));
425
426 dependencies->magic = STATS_DEPS_MAGIC;
427 dependencies->type = STATS_DEPS_TYPE_BASIC;
428 dependencies->ndeps = 0;
429 }
430
431 dependencies->ndeps++;
432 dependencies = (MVDependencies *) repalloc(dependencies,
433 offsetof(MVDependencies, deps)
434 + dependencies->ndeps * sizeof(MVDependency *));
435
436 dependencies->deps[dependencies->ndeps - 1] = d;
437 }
438
439 /*
440 * we're done with variations of k elements, so free the
441 * DependencyGenerator
442 */
443 DependencyGenerator_free(DependencyGenerator);
444 }
445
446 MemoryContextDelete(cxt);
447
448 return dependencies;
449 }
450
451
452 /*
453 * Serialize list of dependencies into a bytea value.
454 */
455 bytea *
statext_dependencies_serialize(MVDependencies * dependencies)456 statext_dependencies_serialize(MVDependencies *dependencies)
457 {
458 int i;
459 bytea *output;
460 char *tmp;
461 Size len;
462
463 /* we need to store ndeps, with a number of attributes for each one */
464 len = VARHDRSZ + SizeOfHeader;
465
466 /* and also include space for the actual attribute numbers and degrees */
467 for (i = 0; i < dependencies->ndeps; i++)
468 len += SizeOfItem(dependencies->deps[i]->nattributes);
469
470 output = (bytea *) palloc0(len);
471 SET_VARSIZE(output, len);
472
473 tmp = VARDATA(output);
474
475 /* Store the base struct values (magic, type, ndeps) */
476 memcpy(tmp, &dependencies->magic, sizeof(uint32));
477 tmp += sizeof(uint32);
478 memcpy(tmp, &dependencies->type, sizeof(uint32));
479 tmp += sizeof(uint32);
480 memcpy(tmp, &dependencies->ndeps, sizeof(uint32));
481 tmp += sizeof(uint32);
482
483 /* store number of attributes and attribute numbers for each dependency */
484 for (i = 0; i < dependencies->ndeps; i++)
485 {
486 MVDependency *d = dependencies->deps[i];
487
488 memcpy(tmp, &d->degree, sizeof(double));
489 tmp += sizeof(double);
490
491 memcpy(tmp, &d->nattributes, sizeof(AttrNumber));
492 tmp += sizeof(AttrNumber);
493
494 memcpy(tmp, d->attributes, sizeof(AttrNumber) * d->nattributes);
495 tmp += sizeof(AttrNumber) * d->nattributes;
496
497 /* protect against overflow */
498 Assert(tmp <= ((char *) output + len));
499 }
500
501 /* make sure we've produced exactly the right amount of data */
502 Assert(tmp == ((char *) output + len));
503
504 return output;
505 }
506
507 /*
508 * Reads serialized dependencies into MVDependencies structure.
509 */
510 MVDependencies *
statext_dependencies_deserialize(bytea * data)511 statext_dependencies_deserialize(bytea *data)
512 {
513 int i;
514 Size min_expected_size;
515 MVDependencies *dependencies;
516 char *tmp;
517
518 if (data == NULL)
519 return NULL;
520
521 if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader)
522 elog(ERROR, "invalid MVDependencies size %zd (expected at least %zd)",
523 VARSIZE_ANY_EXHDR(data), SizeOfHeader);
524
525 /* read the MVDependencies header */
526 dependencies = (MVDependencies *) palloc0(sizeof(MVDependencies));
527
528 /* initialize pointer to the data part (skip the varlena header) */
529 tmp = VARDATA_ANY(data);
530
531 /* read the header fields and perform basic sanity checks */
532 memcpy(&dependencies->magic, tmp, sizeof(uint32));
533 tmp += sizeof(uint32);
534 memcpy(&dependencies->type, tmp, sizeof(uint32));
535 tmp += sizeof(uint32);
536 memcpy(&dependencies->ndeps, tmp, sizeof(uint32));
537 tmp += sizeof(uint32);
538
539 if (dependencies->magic != STATS_DEPS_MAGIC)
540 elog(ERROR, "invalid dependency magic %d (expected %d)",
541 dependencies->magic, STATS_DEPS_MAGIC);
542
543 if (dependencies->type != STATS_DEPS_TYPE_BASIC)
544 elog(ERROR, "invalid dependency type %d (expected %d)",
545 dependencies->type, STATS_DEPS_TYPE_BASIC);
546
547 if (dependencies->ndeps == 0)
548 elog(ERROR, "invalid zero-length item array in MVDependencies");
549
550 /* what minimum bytea size do we expect for those parameters */
551 min_expected_size = SizeOfItem(dependencies->ndeps);
552
553 if (VARSIZE_ANY_EXHDR(data) < min_expected_size)
554 elog(ERROR, "invalid dependencies size %zd (expected at least %zd)",
555 VARSIZE_ANY_EXHDR(data), min_expected_size);
556
557 /* allocate space for the MCV items */
558 dependencies = repalloc(dependencies, offsetof(MVDependencies, deps)
559 + (dependencies->ndeps * sizeof(MVDependency *)));
560
561 for (i = 0; i < dependencies->ndeps; i++)
562 {
563 double degree;
564 AttrNumber k;
565 MVDependency *d;
566
567 /* degree of validity */
568 memcpy(°ree, tmp, sizeof(double));
569 tmp += sizeof(double);
570
571 /* number of attributes */
572 memcpy(&k, tmp, sizeof(AttrNumber));
573 tmp += sizeof(AttrNumber);
574
575 /* is the number of attributes valid? */
576 Assert((k >= 2) && (k <= STATS_MAX_DIMENSIONS));
577
578 /* now that we know the number of attributes, allocate the dependency */
579 d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
580 + (k * sizeof(AttrNumber)));
581
582 d->degree = degree;
583 d->nattributes = k;
584
585 /* copy attribute numbers */
586 memcpy(d->attributes, tmp, sizeof(AttrNumber) * d->nattributes);
587 tmp += sizeof(AttrNumber) * d->nattributes;
588
589 dependencies->deps[i] = d;
590
591 /* still within the bytea */
592 Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
593 }
594
595 /* we should have consumed the whole bytea exactly */
596 Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
597
598 return dependencies;
599 }
600
601 /*
602 * dependency_is_fully_matched
603 * checks that a functional dependency is fully matched given clauses on
604 * attributes (assuming the clauses are suitable equality clauses)
605 */
606 static bool
dependency_is_fully_matched(MVDependency * dependency,Bitmapset * attnums)607 dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums)
608 {
609 int j;
610
611 /*
612 * Check that the dependency actually is fully covered by clauses. We have
613 * to translate all attribute numbers, as those are referenced
614 */
615 for (j = 0; j < dependency->nattributes; j++)
616 {
617 int attnum = dependency->attributes[j];
618
619 if (!bms_is_member(attnum, attnums))
620 return false;
621 }
622
623 return true;
624 }
625
626 /*
627 * dependency_implies_attribute
628 * check that the attnum matches is implied by the functional dependency
629 */
630 static bool
dependency_implies_attribute(MVDependency * dependency,AttrNumber attnum)631 dependency_implies_attribute(MVDependency *dependency, AttrNumber attnum)
632 {
633 if (attnum == dependency->attributes[dependency->nattributes - 1])
634 return true;
635
636 return false;
637 }
638
639 /*
640 * statext_dependencies_load
641 * Load the functional dependencies for the indicated pg_statistic_ext tuple
642 */
643 MVDependencies *
statext_dependencies_load(Oid mvoid)644 statext_dependencies_load(Oid mvoid)
645 {
646 MVDependencies *result;
647 bool isnull;
648 Datum deps;
649 HeapTuple htup;
650
651 htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid));
652 if (!HeapTupleIsValid(htup))
653 elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
654
655 deps = SysCacheGetAttr(STATEXTDATASTXOID, htup,
656 Anum_pg_statistic_ext_data_stxddependencies, &isnull);
657 if (isnull)
658 elog(ERROR,
659 "requested statistics kind \"%c\" is not yet built for statistics object %u",
660 STATS_EXT_DEPENDENCIES, mvoid);
661
662 result = statext_dependencies_deserialize(DatumGetByteaPP(deps));
663
664 ReleaseSysCache(htup);
665
666 return result;
667 }
668
669 /*
670 * pg_dependencies_in - input routine for type pg_dependencies.
671 *
672 * pg_dependencies is real enough to be a table column, but it has no operations
673 * of its own, and disallows input too
674 */
675 Datum
pg_dependencies_in(PG_FUNCTION_ARGS)676 pg_dependencies_in(PG_FUNCTION_ARGS)
677 {
678 /*
679 * pg_node_list stores the data in binary form and parsing text input is
680 * not needed, so disallow this.
681 */
682 ereport(ERROR,
683 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
684 errmsg("cannot accept a value of type %s", "pg_dependencies")));
685
686 PG_RETURN_VOID(); /* keep compiler quiet */
687 }
688
689 /*
690 * pg_dependencies - output routine for type pg_dependencies.
691 */
692 Datum
pg_dependencies_out(PG_FUNCTION_ARGS)693 pg_dependencies_out(PG_FUNCTION_ARGS)
694 {
695 bytea *data = PG_GETARG_BYTEA_PP(0);
696 MVDependencies *dependencies = statext_dependencies_deserialize(data);
697 int i,
698 j;
699 StringInfoData str;
700
701 initStringInfo(&str);
702 appendStringInfoChar(&str, '{');
703
704 for (i = 0; i < dependencies->ndeps; i++)
705 {
706 MVDependency *dependency = dependencies->deps[i];
707
708 if (i > 0)
709 appendStringInfoString(&str, ", ");
710
711 appendStringInfoChar(&str, '"');
712 for (j = 0; j < dependency->nattributes; j++)
713 {
714 if (j == dependency->nattributes - 1)
715 appendStringInfoString(&str, " => ");
716 else if (j > 0)
717 appendStringInfoString(&str, ", ");
718
719 appendStringInfo(&str, "%d", dependency->attributes[j]);
720 }
721 appendStringInfo(&str, "\": %f", dependency->degree);
722 }
723
724 appendStringInfoChar(&str, '}');
725
726 PG_RETURN_CSTRING(str.data);
727 }
728
729 /*
730 * pg_dependencies_recv - binary input routine for type pg_dependencies.
731 */
732 Datum
pg_dependencies_recv(PG_FUNCTION_ARGS)733 pg_dependencies_recv(PG_FUNCTION_ARGS)
734 {
735 ereport(ERROR,
736 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
737 errmsg("cannot accept a value of type %s", "pg_dependencies")));
738
739 PG_RETURN_VOID(); /* keep compiler quiet */
740 }
741
742 /*
743 * pg_dependencies_send - binary output routine for type pg_dependencies.
744 *
745 * Functional dependencies are serialized in a bytea value (although the type
746 * is named differently), so let's just send that.
747 */
748 Datum
pg_dependencies_send(PG_FUNCTION_ARGS)749 pg_dependencies_send(PG_FUNCTION_ARGS)
750 {
751 return byteasend(fcinfo);
752 }
753
754 /*
755 * dependency_is_compatible_clause
756 * Determines if the clause is compatible with functional dependencies
757 *
758 * Only clauses that have the form of equality to a pseudoconstant, or can be
759 * interpreted that way, are currently accepted. Furthermore the variable
760 * part of the clause must be a simple Var belonging to the specified
761 * relation, whose attribute number we return in *attnum on success.
762 */
763 static bool
dependency_is_compatible_clause(Node * clause,Index relid,AttrNumber * attnum)764 dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
765 {
766 RestrictInfo *rinfo = (RestrictInfo *) clause;
767 Var *var;
768
769 if (!IsA(rinfo, RestrictInfo))
770 return false;
771
772 /* Pseudoconstants are not interesting (they couldn't contain a Var) */
773 if (rinfo->pseudoconstant)
774 return false;
775
776 /* Clauses referencing multiple, or no, varnos are incompatible */
777 if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
778 return false;
779
780 if (is_opclause(rinfo->clause))
781 {
782 /* If it's an opclause, check for Var = Const or Const = Var. */
783 OpExpr *expr = (OpExpr *) rinfo->clause;
784
785 /* Only expressions with two arguments are candidates. */
786 if (list_length(expr->args) != 2)
787 return false;
788
789 /* Make sure non-selected argument is a pseudoconstant. */
790 if (is_pseudo_constant_clause(lsecond(expr->args)))
791 var = linitial(expr->args);
792 else if (is_pseudo_constant_clause(linitial(expr->args)))
793 var = lsecond(expr->args);
794 else
795 return false;
796
797 /*
798 * If it's not an "=" operator, just ignore the clause, as it's not
799 * compatible with functional dependencies.
800 *
801 * This uses the function for estimating selectivity, not the operator
802 * directly (a bit awkward, but well ...).
803 *
804 * XXX this is pretty dubious; probably it'd be better to check btree
805 * or hash opclass membership, so as not to be fooled by custom
806 * selectivity functions, and to be more consistent with decisions
807 * elsewhere in the planner.
808 */
809 if (get_oprrest(expr->opno) != F_EQSEL)
810 return false;
811
812 /* OK to proceed with checking "var" */
813 }
814 else if (is_notclause(rinfo->clause))
815 {
816 /*
817 * "NOT x" can be interpreted as "x = false", so get the argument and
818 * proceed with seeing if it's a suitable Var.
819 */
820 var = (Var *) get_notclausearg(rinfo->clause);
821 }
822 else
823 {
824 /*
825 * A boolean expression "x" can be interpreted as "x = true", so
826 * proceed with seeing if it's a suitable Var.
827 */
828 var = (Var *) rinfo->clause;
829 }
830
831 /*
832 * We may ignore any RelabelType node above the operand. (There won't be
833 * more than one, since eval_const_expressions has been applied already.)
834 */
835 if (IsA(var, RelabelType))
836 var = (Var *) ((RelabelType *) var)->arg;
837
838 /* We only support plain Vars for now */
839 if (!IsA(var, Var))
840 return false;
841
842 /* Ensure Var is from the correct relation */
843 if (var->varno != relid)
844 return false;
845
846 /* We also better ensure the Var is from the current level */
847 if (var->varlevelsup != 0)
848 return false;
849
850 /* Also ignore system attributes (we don't allow stats on those) */
851 if (!AttrNumberIsForUserDefinedAttr(var->varattno))
852 return false;
853
854 *attnum = var->varattno;
855 return true;
856 }
857
858 /*
859 * find_strongest_dependency
860 * find the strongest dependency on the attributes
861 *
862 * When applying functional dependencies, we start with the strongest
863 * dependencies. That is, we select the dependency that:
864 *
865 * (a) has all attributes covered by equality clauses
866 *
867 * (b) has the most attributes
868 *
869 * (c) has the highest degree of validity
870 *
871 * This guarantees that we eliminate the most redundant conditions first
872 * (see the comment in dependencies_clauselist_selectivity).
873 */
874 static MVDependency *
find_strongest_dependency(StatisticExtInfo * stats,MVDependencies * dependencies,Bitmapset * attnums)875 find_strongest_dependency(StatisticExtInfo *stats, MVDependencies *dependencies,
876 Bitmapset *attnums)
877 {
878 int i;
879 MVDependency *strongest = NULL;
880
881 /* number of attnums in clauses */
882 int nattnums = bms_num_members(attnums);
883
884 /*
885 * Iterate over the MVDependency items and find the strongest one from the
886 * fully-matched dependencies. We do the cheap checks first, before
887 * matching it against the attnums.
888 */
889 for (i = 0; i < dependencies->ndeps; i++)
890 {
891 MVDependency *dependency = dependencies->deps[i];
892
893 /*
894 * Skip dependencies referencing more attributes than available
895 * clauses, as those can't be fully matched.
896 */
897 if (dependency->nattributes > nattnums)
898 continue;
899
900 if (strongest)
901 {
902 /* skip dependencies on fewer attributes than the strongest. */
903 if (dependency->nattributes < strongest->nattributes)
904 continue;
905
906 /* also skip weaker dependencies when attribute count matches */
907 if (strongest->nattributes == dependency->nattributes &&
908 strongest->degree > dependency->degree)
909 continue;
910 }
911
912 /*
913 * this dependency is stronger, but we must still check that it's
914 * fully matched to these attnums. We perform this check last as it's
915 * slightly more expensive than the previous checks.
916 */
917 if (dependency_is_fully_matched(dependency, attnums))
918 strongest = dependency; /* save new best match */
919 }
920
921 return strongest;
922 }
923
924 /*
925 * dependencies_clauselist_selectivity
926 * Return the estimated selectivity of (a subset of) the given clauses
927 * using functional dependency statistics, or 1.0 if no useful functional
928 * dependency statistic exists.
929 *
930 * 'estimatedclauses' is an input/output argument that gets a bit set
931 * corresponding to the (zero-based) list index of each clause that is included
932 * in the estimated selectivity.
933 *
934 * Given equality clauses on attributes (a,b) we find the strongest dependency
935 * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected
936 * dependency, we then combine the per-clause selectivities using the formula
937 *
938 * P(a,b) = P(a) * [f + (1-f)*P(b)]
939 *
940 * where 'f' is the degree of the dependency.
941 *
942 * With clauses on more than two attributes, the dependencies are applied
943 * recursively, starting with the widest/strongest dependencies. For example
944 * P(a,b,c) is first split like this:
945 *
946 * P(a,b,c) = P(a,b) * [f + (1-f)*P(c)]
947 *
948 * assuming (a,b=>c) is the strongest dependency.
949 */
950 Selectivity
dependencies_clauselist_selectivity(PlannerInfo * root,List * clauses,int varRelid,JoinType jointype,SpecialJoinInfo * sjinfo,RelOptInfo * rel,Bitmapset ** estimatedclauses)951 dependencies_clauselist_selectivity(PlannerInfo *root,
952 List *clauses,
953 int varRelid,
954 JoinType jointype,
955 SpecialJoinInfo *sjinfo,
956 RelOptInfo *rel,
957 Bitmapset **estimatedclauses)
958 {
959 Selectivity s1 = 1.0;
960 ListCell *l;
961 Bitmapset *clauses_attnums = NULL;
962 StatisticExtInfo *stat;
963 MVDependencies *dependencies;
964 Bitmapset **list_attnums;
965 int listidx;
966
967 /* check if there's any stats that might be useful for us. */
968 if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
969 return 1.0;
970
971 list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) *
972 list_length(clauses));
973
974 /*
975 * Pre-process the clauses list to extract the attnums seen in each item.
976 * We need to determine if there's any clauses which will be useful for
977 * dependency selectivity estimations. Along the way we'll record all of
978 * the attnums for each clause in a list which we'll reference later so we
979 * don't need to repeat the same work again. We'll also keep track of all
980 * attnums seen.
981 *
982 * We also skip clauses that we already estimated using different types of
983 * statistics (we treat them as incompatible).
984 */
985 listidx = 0;
986 foreach(l, clauses)
987 {
988 Node *clause = (Node *) lfirst(l);
989 AttrNumber attnum;
990
991 if (!bms_is_member(listidx, *estimatedclauses) &&
992 dependency_is_compatible_clause(clause, rel->relid, &attnum))
993 {
994 list_attnums[listidx] = bms_make_singleton(attnum);
995 clauses_attnums = bms_add_member(clauses_attnums, attnum);
996 }
997 else
998 list_attnums[listidx] = NULL;
999
1000 listidx++;
1001 }
1002
1003 /*
1004 * If there's not at least two distinct attnums then reject the whole list
1005 * of clauses. We must return 1.0 so the calling function's selectivity is
1006 * unaffected.
1007 */
1008 if (bms_num_members(clauses_attnums) < 2)
1009 {
1010 pfree(list_attnums);
1011 return 1.0;
1012 }
1013
1014 /* find the best suited statistics object for these attnums */
1015 stat = choose_best_statistics(rel->statlist, STATS_EXT_DEPENDENCIES,
1016 list_attnums, list_length(clauses));
1017
1018 /* if no matching stats could be found then we've nothing to do */
1019 if (!stat)
1020 {
1021 pfree(list_attnums);
1022 return 1.0;
1023 }
1024
1025 /* load the dependency items stored in the statistics object */
1026 dependencies = statext_dependencies_load(stat->statOid);
1027
1028 /*
1029 * Apply the dependencies recursively, starting with the widest/strongest
1030 * ones, and proceeding to the smaller/weaker ones. At the end of each
1031 * round we factor in the selectivity of clauses on the implied attribute,
1032 * and remove the clauses from the list.
1033 */
1034 while (true)
1035 {
1036 Selectivity s2 = 1.0;
1037 MVDependency *dependency;
1038
1039 /* the widest/strongest dependency, fully matched by clauses */
1040 dependency = find_strongest_dependency(stat, dependencies,
1041 clauses_attnums);
1042
1043 /* if no suitable dependency was found, we're done */
1044 if (!dependency)
1045 break;
1046
1047 /*
1048 * We found an applicable dependency, so find all the clauses on the
1049 * implied attribute - with dependency (a,b => c) we look for clauses
1050 * on 'c'.
1051 */
1052 listidx = -1;
1053 foreach(l, clauses)
1054 {
1055 Node *clause;
1056 AttrNumber attnum;
1057
1058 listidx++;
1059
1060 /*
1061 * Skip incompatible clauses, and ones we've already estimated on.
1062 */
1063 if (!list_attnums[listidx])
1064 continue;
1065
1066 /*
1067 * We expect the bitmaps ton contain a single attribute number.
1068 */
1069 attnum = bms_singleton_member(list_attnums[listidx]);
1070
1071 /*
1072 * Technically we could find more than one clause for a given
1073 * attnum. Since these clauses must be equality clauses, we choose
1074 * to only take the selectivity estimate from the final clause in
1075 * the list for this attnum. If the attnum happens to be compared
1076 * to a different Const in another clause then no rows will match
1077 * anyway. If it happens to be compared to the same Const, then
1078 * ignoring the additional clause is just the thing to do.
1079 */
1080 if (dependency_implies_attribute(dependency, attnum))
1081 {
1082 clause = (Node *) lfirst(l);
1083
1084 s2 = clause_selectivity(root, clause, varRelid, jointype,
1085 sjinfo);
1086
1087 /* mark this one as done, so we don't touch it again. */
1088 *estimatedclauses = bms_add_member(*estimatedclauses, listidx);
1089
1090 /*
1091 * Mark that we've got and used the dependency on this clause.
1092 * We'll want to ignore this when looking for the next
1093 * strongest dependency above.
1094 */
1095 clauses_attnums = bms_del_member(clauses_attnums, attnum);
1096 }
1097 }
1098
1099 /*
1100 * Now factor in the selectivity for all the "implied" clauses into
1101 * the final one, using this formula:
1102 *
1103 * P(a,b) = P(a) * (f + (1-f) * P(b))
1104 *
1105 * where 'f' is the degree of validity of the dependency.
1106 */
1107 s1 *= (dependency->degree + (1 - dependency->degree) * s2);
1108 }
1109
1110 pfree(dependencies);
1111 pfree(list_attnums);
1112
1113 return s1;
1114 }
1115