1 /*-------------------------------------------------------------------------
2 *
3 * dependencies.c
4 * POSTGRES functional dependencies
5 *
6 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/statistics/dependencies.c
11 *
12 *-------------------------------------------------------------------------
13 */
14 #include "postgres.h"
15
16 #include "access/htup_details.h"
17 #include "access/sysattr.h"
18 #include "catalog/pg_operator.h"
19 #include "catalog/pg_statistic_ext.h"
20 #include "catalog/pg_statistic_ext_data.h"
21 #include "lib/stringinfo.h"
22 #include "nodes/nodeFuncs.h"
23 #include "nodes/nodes.h"
24 #include "nodes/pathnodes.h"
25 #include "optimizer/clauses.h"
26 #include "optimizer/optimizer.h"
27 #include "statistics/extended_stats_internal.h"
28 #include "statistics/statistics.h"
29 #include "utils/bytea.h"
30 #include "utils/fmgroids.h"
31 #include "utils/fmgrprotos.h"
32 #include "utils/lsyscache.h"
33 #include "utils/memutils.h"
34 #include "utils/selfuncs.h"
35 #include "utils/syscache.h"
36 #include "utils/typcache.h"
37
38 /* size of the struct header fields (magic, type, ndeps) */
39 #define SizeOfHeader (3 * sizeof(uint32))
40
41 /* size of a serialized dependency (degree, natts, atts) */
42 #define SizeOfItem(natts) \
43 (sizeof(double) + sizeof(AttrNumber) * (1 + (natts)))
44
45 /* minimal size of a dependency (with two attributes) */
46 #define MinSizeOfItem SizeOfItem(2)
47
48 /* minimal size of dependencies, when all deps are minimal */
49 #define MinSizeOfItems(ndeps) \
50 (SizeOfHeader + (ndeps) * MinSizeOfItem)
51
52 /*
53 * Internal state for DependencyGenerator of dependencies. Dependencies are similar to
54 * k-permutations of n elements, except that the order does not matter for the
55 * first (k-1) elements. That is, (a,b=>c) and (b,a=>c) are equivalent.
56 */
57 typedef struct DependencyGeneratorData
58 {
59 int k; /* size of the dependency */
60 int n; /* number of possible attributes */
61 int current; /* next dependency to return (index) */
62 AttrNumber ndependencies; /* number of dependencies generated */
63 AttrNumber *dependencies; /* array of pre-generated dependencies */
64 } DependencyGeneratorData;
65
66 typedef DependencyGeneratorData *DependencyGenerator;
67
68 static void generate_dependencies_recurse(DependencyGenerator state,
69 int index, AttrNumber start, AttrNumber *current);
70 static void generate_dependencies(DependencyGenerator state);
71 static DependencyGenerator DependencyGenerator_init(int n, int k);
72 static void DependencyGenerator_free(DependencyGenerator state);
73 static AttrNumber *DependencyGenerator_next(DependencyGenerator state);
74 static double dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency);
75 static bool dependency_is_fully_matched(MVDependency *dependency,
76 Bitmapset *attnums);
77 static bool dependency_is_compatible_clause(Node *clause, Index relid,
78 AttrNumber *attnum);
79 static bool dependency_is_compatible_expression(Node *clause, Index relid,
80 List *statlist, Node **expr);
81 static MVDependency *find_strongest_dependency(MVDependencies **dependencies,
82 int ndependencies, Bitmapset *attnums);
83 static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
84 int varRelid, JoinType jointype,
85 SpecialJoinInfo *sjinfo,
86 MVDependency **dependencies,
87 int ndependencies,
88 AttrNumber *list_attnums,
89 Bitmapset **estimatedclauses);
90
91 static void
generate_dependencies_recurse(DependencyGenerator state,int index,AttrNumber start,AttrNumber * current)92 generate_dependencies_recurse(DependencyGenerator state, int index,
93 AttrNumber start, AttrNumber *current)
94 {
95 /*
96 * The generator handles the first (k-1) elements differently from the
97 * last element.
98 */
99 if (index < (state->k - 1))
100 {
101 AttrNumber i;
102
103 /*
104 * The first (k-1) values have to be in ascending order, which we
105 * generate recursively.
106 */
107
108 for (i = start; i < state->n; i++)
109 {
110 current[index] = i;
111 generate_dependencies_recurse(state, (index + 1), (i + 1), current);
112 }
113 }
114 else
115 {
116 int i;
117
118 /*
119 * the last element is the implied value, which does not respect the
120 * ascending order. We just need to check that the value is not in the
121 * first (k-1) elements.
122 */
123
124 for (i = 0; i < state->n; i++)
125 {
126 int j;
127 bool match = false;
128
129 current[index] = i;
130
131 for (j = 0; j < index; j++)
132 {
133 if (current[j] == i)
134 {
135 match = true;
136 break;
137 }
138 }
139
140 /*
141 * If the value is not found in the first part of the dependency,
142 * we're done.
143 */
144 if (!match)
145 {
146 state->dependencies = (AttrNumber *) repalloc(state->dependencies,
147 state->k * (state->ndependencies + 1) * sizeof(AttrNumber));
148 memcpy(&state->dependencies[(state->k * state->ndependencies)],
149 current, state->k * sizeof(AttrNumber));
150 state->ndependencies++;
151 }
152 }
153 }
154 }
155
156 /* generate all dependencies (k-permutations of n elements) */
157 static void
generate_dependencies(DependencyGenerator state)158 generate_dependencies(DependencyGenerator state)
159 {
160 AttrNumber *current = (AttrNumber *) palloc0(sizeof(AttrNumber) * state->k);
161
162 generate_dependencies_recurse(state, 0, 0, current);
163
164 pfree(current);
165 }
166
167 /*
168 * initialize the DependencyGenerator of variations, and prebuild the variations
169 *
170 * This pre-builds all the variations. We could also generate them in
171 * DependencyGenerator_next(), but this seems simpler.
172 */
173 static DependencyGenerator
DependencyGenerator_init(int n,int k)174 DependencyGenerator_init(int n, int k)
175 {
176 DependencyGenerator state;
177
178 Assert((n >= k) && (k > 0));
179
180 /* allocate the DependencyGenerator state */
181 state = (DependencyGenerator) palloc0(sizeof(DependencyGeneratorData));
182 state->dependencies = (AttrNumber *) palloc(k * sizeof(AttrNumber));
183
184 state->ndependencies = 0;
185 state->current = 0;
186 state->k = k;
187 state->n = n;
188
189 /* now actually pre-generate all the variations */
190 generate_dependencies(state);
191
192 return state;
193 }
194
195 /* free the DependencyGenerator state */
196 static void
DependencyGenerator_free(DependencyGenerator state)197 DependencyGenerator_free(DependencyGenerator state)
198 {
199 pfree(state->dependencies);
200 pfree(state);
201
202 }
203
204 /* generate next combination */
205 static AttrNumber *
DependencyGenerator_next(DependencyGenerator state)206 DependencyGenerator_next(DependencyGenerator state)
207 {
208 if (state->current == state->ndependencies)
209 return NULL;
210
211 return &state->dependencies[state->k * state->current++];
212 }
213
214
215 /*
216 * validates functional dependency on the data
217 *
218 * An actual work horse of detecting functional dependencies. Given a variation
219 * of k attributes, it checks that the first (k-1) are sufficient to determine
220 * the last one.
221 */
222 static double
dependency_degree(StatsBuildData * data,int k,AttrNumber * dependency)223 dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency)
224 {
225 int i,
226 nitems;
227 MultiSortSupport mss;
228 SortItem *items;
229 AttrNumber *attnums_dep;
230
231 /* counters valid within a group */
232 int group_size = 0;
233 int n_violations = 0;
234
235 /* total number of rows supporting (consistent with) the dependency */
236 int n_supporting_rows = 0;
237
238 /* Make sure we have at least two input attributes. */
239 Assert(k >= 2);
240
241 /* sort info for all attributes columns */
242 mss = multi_sort_init(k);
243
244 /*
245 * Translate the array of indexes to regular attnums for the dependency
246 * (we will need this to identify the columns in StatsBuildData).
247 */
248 attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber));
249 for (i = 0; i < k; i++)
250 attnums_dep[i] = data->attnums[dependency[i]];
251
252 /*
253 * Verify the dependency (a,b,...)->z, using a rather simple algorithm:
254 *
255 * (a) sort the data lexicographically
256 *
257 * (b) split the data into groups by first (k-1) columns
258 *
259 * (c) for each group count different values in the last column
260 *
261 * We use the column data types' default sort operators and collations;
262 * perhaps at some point it'd be worth using column-specific collations?
263 */
264
265 /* prepare the sort function for the dimensions */
266 for (i = 0; i < k; i++)
267 {
268 VacAttrStats *colstat = data->stats[dependency[i]];
269 TypeCacheEntry *type;
270
271 type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
272 if (type->lt_opr == InvalidOid) /* shouldn't happen */
273 elog(ERROR, "cache lookup failed for ordering operator for type %u",
274 colstat->attrtypid);
275
276 /* prepare the sort function for this dimension */
277 multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
278 }
279
280 /*
281 * build an array of SortItem(s) sorted using the multi-sort support
282 *
283 * XXX This relies on all stats entries pointing to the same tuple
284 * descriptor. For now that assumption holds, but it might change in the
285 * future for example if we support statistics on multiple tables.
286 */
287 items = build_sorted_items(data, &nitems, mss, k, attnums_dep);
288
289 /*
290 * Walk through the sorted array, split it into rows according to the
291 * first (k-1) columns. If there's a single value in the last column, we
292 * count the group as 'supporting' the functional dependency. Otherwise we
293 * count it as contradicting.
294 */
295
296 /* start with the first row forming a group */
297 group_size = 1;
298
299 /* loop 1 beyond the end of the array so that we count the final group */
300 for (i = 1; i <= nitems; i++)
301 {
302 /*
303 * Check if the group ended, which may be either because we processed
304 * all the items (i==nitems), or because the i-th item is not equal to
305 * the preceding one.
306 */
307 if (i == nitems ||
308 multi_sort_compare_dims(0, k - 2, &items[i - 1], &items[i], mss) != 0)
309 {
310 /*
311 * If no violations were found in the group then track the rows of
312 * the group as supporting the functional dependency.
313 */
314 if (n_violations == 0)
315 n_supporting_rows += group_size;
316
317 /* Reset counters for the new group */
318 n_violations = 0;
319 group_size = 1;
320 continue;
321 }
322 /* first columns match, but the last one does not (so contradicting) */
323 else if (multi_sort_compare_dim(k - 1, &items[i - 1], &items[i], mss) != 0)
324 n_violations++;
325
326 group_size++;
327 }
328
329 /* Compute the 'degree of validity' as (supporting/total). */
330 return (n_supporting_rows * 1.0 / data->numrows);
331 }
332
333 /*
334 * detects functional dependencies between groups of columns
335 *
336 * Generates all possible subsets of columns (variations) and computes
337 * the degree of validity for each one. For example when creating statistics
338 * on three columns (a,b,c) there are 9 possible dependencies
339 *
340 * two columns three columns
341 * ----------- -------------
342 * (a) -> b (a,b) -> c
343 * (a) -> c (a,c) -> b
344 * (b) -> a (b,c) -> a
345 * (b) -> c
346 * (c) -> a
347 * (c) -> b
348 */
349 MVDependencies *
statext_dependencies_build(StatsBuildData * data)350 statext_dependencies_build(StatsBuildData *data)
351 {
352 int i,
353 k;
354
355 /* result */
356 MVDependencies *dependencies = NULL;
357 MemoryContext cxt;
358
359 Assert(data->nattnums >= 2);
360
361 /* tracks memory allocated by dependency_degree calls */
362 cxt = AllocSetContextCreate(CurrentMemoryContext,
363 "dependency_degree cxt",
364 ALLOCSET_DEFAULT_SIZES);
365
366 /*
367 * We'll try build functional dependencies starting from the smallest ones
368 * covering just 2 columns, to the largest ones, covering all columns
369 * included in the statistics object. We start from the smallest ones
370 * because we want to be able to skip already implied ones.
371 */
372 for (k = 2; k <= data->nattnums; k++)
373 {
374 AttrNumber *dependency; /* array with k elements */
375
376 /* prepare a DependencyGenerator of variation */
377 DependencyGenerator DependencyGenerator = DependencyGenerator_init(data->nattnums, k);
378
379 /* generate all possible variations of k values (out of n) */
380 while ((dependency = DependencyGenerator_next(DependencyGenerator)))
381 {
382 double degree;
383 MVDependency *d;
384 MemoryContext oldcxt;
385
386 /* release memory used by dependency degree calculation */
387 oldcxt = MemoryContextSwitchTo(cxt);
388
389 /* compute how valid the dependency seems */
390 degree = dependency_degree(data, k, dependency);
391
392 MemoryContextSwitchTo(oldcxt);
393 MemoryContextReset(cxt);
394
395 /*
396 * if the dependency seems entirely invalid, don't store it
397 */
398 if (degree == 0.0)
399 continue;
400
401 d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
402 + k * sizeof(AttrNumber));
403
404 /* copy the dependency (and keep the indexes into stxkeys) */
405 d->degree = degree;
406 d->nattributes = k;
407 for (i = 0; i < k; i++)
408 d->attributes[i] = data->attnums[dependency[i]];
409
410 /* initialize the list of dependencies */
411 if (dependencies == NULL)
412 {
413 dependencies
414 = (MVDependencies *) palloc0(sizeof(MVDependencies));
415
416 dependencies->magic = STATS_DEPS_MAGIC;
417 dependencies->type = STATS_DEPS_TYPE_BASIC;
418 dependencies->ndeps = 0;
419 }
420
421 dependencies->ndeps++;
422 dependencies = (MVDependencies *) repalloc(dependencies,
423 offsetof(MVDependencies, deps)
424 + dependencies->ndeps * sizeof(MVDependency *));
425
426 dependencies->deps[dependencies->ndeps - 1] = d;
427 }
428
429 /*
430 * we're done with variations of k elements, so free the
431 * DependencyGenerator
432 */
433 DependencyGenerator_free(DependencyGenerator);
434 }
435
436 MemoryContextDelete(cxt);
437
438 return dependencies;
439 }
440
441
442 /*
443 * Serialize list of dependencies into a bytea value.
444 */
445 bytea *
statext_dependencies_serialize(MVDependencies * dependencies)446 statext_dependencies_serialize(MVDependencies *dependencies)
447 {
448 int i;
449 bytea *output;
450 char *tmp;
451 Size len;
452
453 /* we need to store ndeps, with a number of attributes for each one */
454 len = VARHDRSZ + SizeOfHeader;
455
456 /* and also include space for the actual attribute numbers and degrees */
457 for (i = 0; i < dependencies->ndeps; i++)
458 len += SizeOfItem(dependencies->deps[i]->nattributes);
459
460 output = (bytea *) palloc0(len);
461 SET_VARSIZE(output, len);
462
463 tmp = VARDATA(output);
464
465 /* Store the base struct values (magic, type, ndeps) */
466 memcpy(tmp, &dependencies->magic, sizeof(uint32));
467 tmp += sizeof(uint32);
468 memcpy(tmp, &dependencies->type, sizeof(uint32));
469 tmp += sizeof(uint32);
470 memcpy(tmp, &dependencies->ndeps, sizeof(uint32));
471 tmp += sizeof(uint32);
472
473 /* store number of attributes and attribute numbers for each dependency */
474 for (i = 0; i < dependencies->ndeps; i++)
475 {
476 MVDependency *d = dependencies->deps[i];
477
478 memcpy(tmp, &d->degree, sizeof(double));
479 tmp += sizeof(double);
480
481 memcpy(tmp, &d->nattributes, sizeof(AttrNumber));
482 tmp += sizeof(AttrNumber);
483
484 memcpy(tmp, d->attributes, sizeof(AttrNumber) * d->nattributes);
485 tmp += sizeof(AttrNumber) * d->nattributes;
486
487 /* protect against overflow */
488 Assert(tmp <= ((char *) output + len));
489 }
490
491 /* make sure we've produced exactly the right amount of data */
492 Assert(tmp == ((char *) output + len));
493
494 return output;
495 }
496
497 /*
498 * Reads serialized dependencies into MVDependencies structure.
499 */
500 MVDependencies *
statext_dependencies_deserialize(bytea * data)501 statext_dependencies_deserialize(bytea *data)
502 {
503 int i;
504 Size min_expected_size;
505 MVDependencies *dependencies;
506 char *tmp;
507
508 if (data == NULL)
509 return NULL;
510
511 if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader)
512 elog(ERROR, "invalid MVDependencies size %zd (expected at least %zd)",
513 VARSIZE_ANY_EXHDR(data), SizeOfHeader);
514
515 /* read the MVDependencies header */
516 dependencies = (MVDependencies *) palloc0(sizeof(MVDependencies));
517
518 /* initialize pointer to the data part (skip the varlena header) */
519 tmp = VARDATA_ANY(data);
520
521 /* read the header fields and perform basic sanity checks */
522 memcpy(&dependencies->magic, tmp, sizeof(uint32));
523 tmp += sizeof(uint32);
524 memcpy(&dependencies->type, tmp, sizeof(uint32));
525 tmp += sizeof(uint32);
526 memcpy(&dependencies->ndeps, tmp, sizeof(uint32));
527 tmp += sizeof(uint32);
528
529 if (dependencies->magic != STATS_DEPS_MAGIC)
530 elog(ERROR, "invalid dependency magic %d (expected %d)",
531 dependencies->magic, STATS_DEPS_MAGIC);
532
533 if (dependencies->type != STATS_DEPS_TYPE_BASIC)
534 elog(ERROR, "invalid dependency type %d (expected %d)",
535 dependencies->type, STATS_DEPS_TYPE_BASIC);
536
537 if (dependencies->ndeps == 0)
538 elog(ERROR, "invalid zero-length item array in MVDependencies");
539
540 /* what minimum bytea size do we expect for those parameters */
541 min_expected_size = SizeOfItem(dependencies->ndeps);
542
543 if (VARSIZE_ANY_EXHDR(data) < min_expected_size)
544 elog(ERROR, "invalid dependencies size %zd (expected at least %zd)",
545 VARSIZE_ANY_EXHDR(data), min_expected_size);
546
547 /* allocate space for the MCV items */
548 dependencies = repalloc(dependencies, offsetof(MVDependencies, deps)
549 + (dependencies->ndeps * sizeof(MVDependency *)));
550
551 for (i = 0; i < dependencies->ndeps; i++)
552 {
553 double degree;
554 AttrNumber k;
555 MVDependency *d;
556
557 /* degree of validity */
558 memcpy(°ree, tmp, sizeof(double));
559 tmp += sizeof(double);
560
561 /* number of attributes */
562 memcpy(&k, tmp, sizeof(AttrNumber));
563 tmp += sizeof(AttrNumber);
564
565 /* is the number of attributes valid? */
566 Assert((k >= 2) && (k <= STATS_MAX_DIMENSIONS));
567
568 /* now that we know the number of attributes, allocate the dependency */
569 d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
570 + (k * sizeof(AttrNumber)));
571
572 d->degree = degree;
573 d->nattributes = k;
574
575 /* copy attribute numbers */
576 memcpy(d->attributes, tmp, sizeof(AttrNumber) * d->nattributes);
577 tmp += sizeof(AttrNumber) * d->nattributes;
578
579 dependencies->deps[i] = d;
580
581 /* still within the bytea */
582 Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
583 }
584
585 /* we should have consumed the whole bytea exactly */
586 Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
587
588 return dependencies;
589 }
590
591 /*
592 * dependency_is_fully_matched
593 * checks that a functional dependency is fully matched given clauses on
594 * attributes (assuming the clauses are suitable equality clauses)
595 */
596 static bool
dependency_is_fully_matched(MVDependency * dependency,Bitmapset * attnums)597 dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums)
598 {
599 int j;
600
601 /*
602 * Check that the dependency actually is fully covered by clauses. We have
603 * to translate all attribute numbers, as those are referenced
604 */
605 for (j = 0; j < dependency->nattributes; j++)
606 {
607 int attnum = dependency->attributes[j];
608
609 if (!bms_is_member(attnum, attnums))
610 return false;
611 }
612
613 return true;
614 }
615
616 /*
617 * statext_dependencies_load
618 * Load the functional dependencies for the indicated pg_statistic_ext tuple
619 */
620 MVDependencies *
statext_dependencies_load(Oid mvoid)621 statext_dependencies_load(Oid mvoid)
622 {
623 MVDependencies *result;
624 bool isnull;
625 Datum deps;
626 HeapTuple htup;
627
628 htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid));
629 if (!HeapTupleIsValid(htup))
630 elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
631
632 deps = SysCacheGetAttr(STATEXTDATASTXOID, htup,
633 Anum_pg_statistic_ext_data_stxddependencies, &isnull);
634 if (isnull)
635 elog(ERROR,
636 "requested statistics kind \"%c\" is not yet built for statistics object %u",
637 STATS_EXT_DEPENDENCIES, mvoid);
638
639 result = statext_dependencies_deserialize(DatumGetByteaPP(deps));
640
641 ReleaseSysCache(htup);
642
643 return result;
644 }
645
646 /*
647 * pg_dependencies_in - input routine for type pg_dependencies.
648 *
649 * pg_dependencies is real enough to be a table column, but it has no operations
650 * of its own, and disallows input too
651 */
652 Datum
pg_dependencies_in(PG_FUNCTION_ARGS)653 pg_dependencies_in(PG_FUNCTION_ARGS)
654 {
655 /*
656 * pg_node_list stores the data in binary form and parsing text input is
657 * not needed, so disallow this.
658 */
659 ereport(ERROR,
660 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
661 errmsg("cannot accept a value of type %s", "pg_dependencies")));
662
663 PG_RETURN_VOID(); /* keep compiler quiet */
664 }
665
666 /*
667 * pg_dependencies - output routine for type pg_dependencies.
668 */
669 Datum
pg_dependencies_out(PG_FUNCTION_ARGS)670 pg_dependencies_out(PG_FUNCTION_ARGS)
671 {
672 bytea *data = PG_GETARG_BYTEA_PP(0);
673 MVDependencies *dependencies = statext_dependencies_deserialize(data);
674 int i,
675 j;
676 StringInfoData str;
677
678 initStringInfo(&str);
679 appendStringInfoChar(&str, '{');
680
681 for (i = 0; i < dependencies->ndeps; i++)
682 {
683 MVDependency *dependency = dependencies->deps[i];
684
685 if (i > 0)
686 appendStringInfoString(&str, ", ");
687
688 appendStringInfoChar(&str, '"');
689 for (j = 0; j < dependency->nattributes; j++)
690 {
691 if (j == dependency->nattributes - 1)
692 appendStringInfoString(&str, " => ");
693 else if (j > 0)
694 appendStringInfoString(&str, ", ");
695
696 appendStringInfo(&str, "%d", dependency->attributes[j]);
697 }
698 appendStringInfo(&str, "\": %f", dependency->degree);
699 }
700
701 appendStringInfoChar(&str, '}');
702
703 PG_RETURN_CSTRING(str.data);
704 }
705
706 /*
707 * pg_dependencies_recv - binary input routine for type pg_dependencies.
708 */
709 Datum
pg_dependencies_recv(PG_FUNCTION_ARGS)710 pg_dependencies_recv(PG_FUNCTION_ARGS)
711 {
712 ereport(ERROR,
713 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
714 errmsg("cannot accept a value of type %s", "pg_dependencies")));
715
716 PG_RETURN_VOID(); /* keep compiler quiet */
717 }
718
719 /*
720 * pg_dependencies_send - binary output routine for type pg_dependencies.
721 *
722 * Functional dependencies are serialized in a bytea value (although the type
723 * is named differently), so let's just send that.
724 */
725 Datum
pg_dependencies_send(PG_FUNCTION_ARGS)726 pg_dependencies_send(PG_FUNCTION_ARGS)
727 {
728 return byteasend(fcinfo);
729 }
730
731 /*
732 * dependency_is_compatible_clause
733 * Determines if the clause is compatible with functional dependencies
734 *
735 * Only clauses that have the form of equality to a pseudoconstant, or can be
736 * interpreted that way, are currently accepted. Furthermore the variable
737 * part of the clause must be a simple Var belonging to the specified
738 * relation, whose attribute number we return in *attnum on success.
739 */
740 static bool
dependency_is_compatible_clause(Node * clause,Index relid,AttrNumber * attnum)741 dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
742 {
743 Var *var;
744 Node *clause_expr;
745
746 if (IsA(clause, RestrictInfo))
747 {
748 RestrictInfo *rinfo = (RestrictInfo *) clause;
749
750 /* Pseudoconstants are not interesting (they couldn't contain a Var) */
751 if (rinfo->pseudoconstant)
752 return false;
753
754 /* Clauses referencing multiple, or no, varnos are incompatible */
755 if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
756 return false;
757
758 clause = (Node *) rinfo->clause;
759 }
760
761 if (is_opclause(clause))
762 {
763 /* If it's an opclause, check for Var = Const or Const = Var. */
764 OpExpr *expr = (OpExpr *) clause;
765
766 /* Only expressions with two arguments are candidates. */
767 if (list_length(expr->args) != 2)
768 return false;
769
770 /* Make sure non-selected argument is a pseudoconstant. */
771 if (is_pseudo_constant_clause(lsecond(expr->args)))
772 clause_expr = linitial(expr->args);
773 else if (is_pseudo_constant_clause(linitial(expr->args)))
774 clause_expr = lsecond(expr->args);
775 else
776 return false;
777
778 /*
779 * If it's not an "=" operator, just ignore the clause, as it's not
780 * compatible with functional dependencies.
781 *
782 * This uses the function for estimating selectivity, not the operator
783 * directly (a bit awkward, but well ...).
784 *
785 * XXX this is pretty dubious; probably it'd be better to check btree
786 * or hash opclass membership, so as not to be fooled by custom
787 * selectivity functions, and to be more consistent with decisions
788 * elsewhere in the planner.
789 */
790 if (get_oprrest(expr->opno) != F_EQSEL)
791 return false;
792
793 /* OK to proceed with checking "var" */
794 }
795 else if (IsA(clause, ScalarArrayOpExpr))
796 {
797 /* If it's an scalar array operator, check for Var IN Const. */
798 ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
799
800 /*
801 * Reject ALL() variant, we only care about ANY/IN.
802 *
803 * XXX Maybe we should check if all the values are the same, and allow
804 * ALL in that case? Doesn't seem very practical, though.
805 */
806 if (!expr->useOr)
807 return false;
808
809 /* Only expressions with two arguments are candidates. */
810 if (list_length(expr->args) != 2)
811 return false;
812
813 /*
814 * We know it's always (Var IN Const), so we assume the var is the
815 * first argument, and pseudoconstant is the second one.
816 */
817 if (!is_pseudo_constant_clause(lsecond(expr->args)))
818 return false;
819
820 clause_expr = linitial(expr->args);
821
822 /*
823 * If it's not an "=" operator, just ignore the clause, as it's not
824 * compatible with functional dependencies. The operator is identified
825 * simply by looking at which function it uses to estimate
826 * selectivity. That's a bit strange, but it's what other similar
827 * places do.
828 */
829 if (get_oprrest(expr->opno) != F_EQSEL)
830 return false;
831
832 /* OK to proceed with checking "var" */
833 }
834 else if (is_orclause(clause))
835 {
836 BoolExpr *bool_expr = (BoolExpr *) clause;
837 ListCell *lc;
838
839 /* start with no attribute number */
840 *attnum = InvalidAttrNumber;
841
842 foreach(lc, bool_expr->args)
843 {
844 AttrNumber clause_attnum;
845
846 /*
847 * Had we found incompatible clause in the arguments, treat the
848 * whole clause as incompatible.
849 */
850 if (!dependency_is_compatible_clause((Node *) lfirst(lc),
851 relid, &clause_attnum))
852 return false;
853
854 if (*attnum == InvalidAttrNumber)
855 *attnum = clause_attnum;
856
857 /* ensure all the variables are the same (same attnum) */
858 if (*attnum != clause_attnum)
859 return false;
860 }
861
862 /* the Var is already checked by the recursive call */
863 return true;
864 }
865 else if (is_notclause(clause))
866 {
867 /*
868 * "NOT x" can be interpreted as "x = false", so get the argument and
869 * proceed with seeing if it's a suitable Var.
870 */
871 clause_expr = (Node *) get_notclausearg(clause);
872 }
873 else
874 {
875 /*
876 * A boolean expression "x" can be interpreted as "x = true", so
877 * proceed with seeing if it's a suitable Var.
878 */
879 clause_expr = (Node *) clause;
880 }
881
882 /*
883 * We may ignore any RelabelType node above the operand. (There won't be
884 * more than one, since eval_const_expressions has been applied already.)
885 */
886 if (IsA(clause_expr, RelabelType))
887 clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
888
889 /* We only support plain Vars for now */
890 if (!IsA(clause_expr, Var))
891 return false;
892
893 /* OK, we know we have a Var */
894 var = (Var *) clause_expr;
895
896 /* Ensure Var is from the correct relation */
897 if (var->varno != relid)
898 return false;
899
900 /* We also better ensure the Var is from the current level */
901 if (var->varlevelsup != 0)
902 return false;
903
904 /* Also ignore system attributes (we don't allow stats on those) */
905 if (!AttrNumberIsForUserDefinedAttr(var->varattno))
906 return false;
907
908 *attnum = var->varattno;
909 return true;
910 }
911
912 /*
913 * find_strongest_dependency
914 * find the strongest dependency on the attributes
915 *
916 * When applying functional dependencies, we start with the strongest
917 * dependencies. That is, we select the dependency that:
918 *
919 * (a) has all attributes covered by equality clauses
920 *
921 * (b) has the most attributes
922 *
923 * (c) has the highest degree of validity
924 *
925 * This guarantees that we eliminate the most redundant conditions first
926 * (see the comment in dependencies_clauselist_selectivity).
927 */
928 static MVDependency *
find_strongest_dependency(MVDependencies ** dependencies,int ndependencies,Bitmapset * attnums)929 find_strongest_dependency(MVDependencies **dependencies, int ndependencies,
930 Bitmapset *attnums)
931 {
932 int i,
933 j;
934 MVDependency *strongest = NULL;
935
936 /* number of attnums in clauses */
937 int nattnums = bms_num_members(attnums);
938
939 /*
940 * Iterate over the MVDependency items and find the strongest one from the
941 * fully-matched dependencies. We do the cheap checks first, before
942 * matching it against the attnums.
943 */
944 for (i = 0; i < ndependencies; i++)
945 {
946 for (j = 0; j < dependencies[i]->ndeps; j++)
947 {
948 MVDependency *dependency = dependencies[i]->deps[j];
949
950 /*
951 * Skip dependencies referencing more attributes than available
952 * clauses, as those can't be fully matched.
953 */
954 if (dependency->nattributes > nattnums)
955 continue;
956
957 if (strongest)
958 {
959 /* skip dependencies on fewer attributes than the strongest. */
960 if (dependency->nattributes < strongest->nattributes)
961 continue;
962
963 /* also skip weaker dependencies when attribute count matches */
964 if (strongest->nattributes == dependency->nattributes &&
965 strongest->degree > dependency->degree)
966 continue;
967 }
968
969 /*
970 * this dependency is stronger, but we must still check that it's
971 * fully matched to these attnums. We perform this check last as
972 * it's slightly more expensive than the previous checks.
973 */
974 if (dependency_is_fully_matched(dependency, attnums))
975 strongest = dependency; /* save new best match */
976 }
977 }
978
979 return strongest;
980 }
981
982 /*
983 * clauselist_apply_dependencies
984 * Apply the specified functional dependencies to a list of clauses and
985 * return the estimated selectivity of the clauses that are compatible
986 * with any of the given dependencies.
987 *
988 * This will estimate all not-already-estimated clauses that are compatible
989 * with functional dependencies, and which have an attribute mentioned by any
990 * of the given dependencies (either as an implying or implied attribute).
991 *
992 * Given (lists of) clauses on attributes (a,b) and a functional dependency
993 * (a=>b), the per-column selectivities P(a) and P(b) are notionally combined
994 * using the formula
995 *
996 * P(a,b) = f * P(a) + (1-f) * P(a) * P(b)
997 *
998 * where 'f' is the degree of dependency. This reflects the fact that we
999 * expect a fraction f of all rows to be consistent with the dependency
1000 * (a=>b), and so have a selectivity of P(a), while the remaining rows are
1001 * treated as independent.
1002 *
1003 * In practice, we use a slightly modified version of this formula, which uses
1004 * a selectivity of Min(P(a), P(b)) for the dependent rows, since the result
1005 * should obviously not exceed either column's individual selectivity. I.e.,
1006 * we actually combine selectivities using the formula
1007 *
1008 * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b)
1009 *
1010 * This can make quite a difference if the specific values matching the
1011 * clauses are not consistent with the functional dependency.
1012 */
1013 static Selectivity
clauselist_apply_dependencies(PlannerInfo * root,List * clauses,int varRelid,JoinType jointype,SpecialJoinInfo * sjinfo,MVDependency ** dependencies,int ndependencies,AttrNumber * list_attnums,Bitmapset ** estimatedclauses)1014 clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
1015 int varRelid, JoinType jointype,
1016 SpecialJoinInfo *sjinfo,
1017 MVDependency **dependencies, int ndependencies,
1018 AttrNumber *list_attnums,
1019 Bitmapset **estimatedclauses)
1020 {
1021 Bitmapset *attnums;
1022 int i;
1023 int j;
1024 int nattrs;
1025 Selectivity *attr_sel;
1026 int attidx;
1027 int listidx;
1028 ListCell *l;
1029 Selectivity s1;
1030
1031 /*
1032 * Extract the attnums of all implying and implied attributes from all the
1033 * given dependencies. Each of these attributes is expected to have at
1034 * least 1 not-already-estimated compatible clause that we will estimate
1035 * here.
1036 */
1037 attnums = NULL;
1038 for (i = 0; i < ndependencies; i++)
1039 {
1040 for (j = 0; j < dependencies[i]->nattributes; j++)
1041 {
1042 AttrNumber attnum = dependencies[i]->attributes[j];
1043
1044 attnums = bms_add_member(attnums, attnum);
1045 }
1046 }
1047
1048 /*
1049 * Compute per-column selectivity estimates for each of these attributes,
1050 * and mark all the corresponding clauses as estimated.
1051 */
1052 nattrs = bms_num_members(attnums);
1053 attr_sel = (Selectivity *) palloc(sizeof(Selectivity) * nattrs);
1054
1055 attidx = 0;
1056 i = -1;
1057 while ((i = bms_next_member(attnums, i)) >= 0)
1058 {
1059 List *attr_clauses = NIL;
1060 Selectivity simple_sel;
1061
1062 listidx = -1;
1063 foreach(l, clauses)
1064 {
1065 Node *clause = (Node *) lfirst(l);
1066
1067 listidx++;
1068 if (list_attnums[listidx] == i)
1069 {
1070 attr_clauses = lappend(attr_clauses, clause);
1071 *estimatedclauses = bms_add_member(*estimatedclauses, listidx);
1072 }
1073 }
1074
1075 simple_sel = clauselist_selectivity_ext(root, attr_clauses, varRelid,
1076 jointype, sjinfo, false);
1077 attr_sel[attidx++] = simple_sel;
1078 }
1079
1080 /*
1081 * Now combine these selectivities using the dependency information. For
1082 * chains of dependencies such as a -> b -> c, the b -> c dependency will
1083 * come before the a -> b dependency in the array, so we traverse the
1084 * array backwards to ensure such chains are computed in the right order.
1085 *
1086 * As explained above, pairs of selectivities are combined using the
1087 * formula
1088 *
1089 * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b)
1090 *
1091 * to ensure that the combined selectivity is never greater than either
1092 * individual selectivity.
1093 *
1094 * Where multiple dependencies apply (e.g., a -> b -> c), we use
1095 * conditional probabilities to compute the overall result as follows:
1096 *
1097 * P(a,b,c) = P(c|a,b) * P(a,b) = P(c|a,b) * P(b|a) * P(a)
1098 *
1099 * so we replace the selectivities of all implied attributes with
1100 * conditional probabilities, that are conditional on all their implying
1101 * attributes. The selectivities of all other non-implied attributes are
1102 * left as they are.
1103 */
1104 for (i = ndependencies - 1; i >= 0; i--)
1105 {
1106 MVDependency *dependency = dependencies[i];
1107 AttrNumber attnum;
1108 Selectivity s2;
1109 double f;
1110
1111 /* Selectivity of all the implying attributes */
1112 s1 = 1.0;
1113 for (j = 0; j < dependency->nattributes - 1; j++)
1114 {
1115 attnum = dependency->attributes[j];
1116 attidx = bms_member_index(attnums, attnum);
1117 s1 *= attr_sel[attidx];
1118 }
1119
1120 /* Original selectivity of the implied attribute */
1121 attnum = dependency->attributes[j];
1122 attidx = bms_member_index(attnums, attnum);
1123 s2 = attr_sel[attidx];
1124
1125 /*
1126 * Replace s2 with the conditional probability s2 given s1, computed
1127 * using the formula P(b|a) = P(a,b) / P(a), which simplifies to
1128 *
1129 * P(b|a) = f * Min(P(a), P(b)) / P(a) + (1-f) * P(b)
1130 *
1131 * where P(a) = s1, the selectivity of the implying attributes, and
1132 * P(b) = s2, the selectivity of the implied attribute.
1133 */
1134 f = dependency->degree;
1135
1136 if (s1 <= s2)
1137 attr_sel[attidx] = f + (1 - f) * s2;
1138 else
1139 attr_sel[attidx] = f * s2 / s1 + (1 - f) * s2;
1140 }
1141
1142 /*
1143 * The overall selectivity of all the clauses on all these attributes is
1144 * then the product of all the original (non-implied) probabilities and
1145 * the new conditional (implied) probabilities.
1146 */
1147 s1 = 1.0;
1148 for (i = 0; i < nattrs; i++)
1149 s1 *= attr_sel[i];
1150
1151 CLAMP_PROBABILITY(s1);
1152
1153 pfree(attr_sel);
1154 bms_free(attnums);
1155
1156 return s1;
1157 }
1158
1159 /*
1160 * dependency_is_compatible_expression
1161 * Determines if the expression is compatible with functional dependencies
1162 *
1163 * Similar to dependency_is_compatible_clause, but doesn't enforce that the
1164 * expression is a simple Var. OTOH we check that there's at least one
1165 * statistics object matching the expression.
1166 */
1167 static bool
dependency_is_compatible_expression(Node * clause,Index relid,List * statlist,Node ** expr)1168 dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, Node **expr)
1169 {
1170 List *vars;
1171 ListCell *lc,
1172 *lc2;
1173 Node *clause_expr;
1174
1175 if (IsA(clause, RestrictInfo))
1176 {
1177 RestrictInfo *rinfo = (RestrictInfo *) clause;
1178
1179 /* Pseudoconstants are not interesting (they couldn't contain a Var) */
1180 if (rinfo->pseudoconstant)
1181 return false;
1182
1183 /* Clauses referencing multiple, or no, varnos are incompatible */
1184 if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
1185 return false;
1186
1187 clause = (Node *) rinfo->clause;
1188 }
1189
1190 if (is_opclause(clause))
1191 {
1192 /* If it's an opclause, check for Var = Const or Const = Var. */
1193 OpExpr *expr = (OpExpr *) clause;
1194
1195 /* Only expressions with two arguments are candidates. */
1196 if (list_length(expr->args) != 2)
1197 return false;
1198
1199 /* Make sure non-selected argument is a pseudoconstant. */
1200 if (is_pseudo_constant_clause(lsecond(expr->args)))
1201 clause_expr = linitial(expr->args);
1202 else if (is_pseudo_constant_clause(linitial(expr->args)))
1203 clause_expr = lsecond(expr->args);
1204 else
1205 return false;
1206
1207 /*
1208 * If it's not an "=" operator, just ignore the clause, as it's not
1209 * compatible with functional dependencies.
1210 *
1211 * This uses the function for estimating selectivity, not the operator
1212 * directly (a bit awkward, but well ...).
1213 *
1214 * XXX this is pretty dubious; probably it'd be better to check btree
1215 * or hash opclass membership, so as not to be fooled by custom
1216 * selectivity functions, and to be more consistent with decisions
1217 * elsewhere in the planner.
1218 */
1219 if (get_oprrest(expr->opno) != F_EQSEL)
1220 return false;
1221
1222 /* OK to proceed with checking "var" */
1223 }
1224 else if (IsA(clause, ScalarArrayOpExpr))
1225 {
1226 /* If it's an scalar array operator, check for Var IN Const. */
1227 ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
1228
1229 /*
1230 * Reject ALL() variant, we only care about ANY/IN.
1231 *
1232 * FIXME Maybe we should check if all the values are the same, and
1233 * allow ALL in that case? Doesn't seem very practical, though.
1234 */
1235 if (!expr->useOr)
1236 return false;
1237
1238 /* Only expressions with two arguments are candidates. */
1239 if (list_length(expr->args) != 2)
1240 return false;
1241
1242 /*
1243 * We know it's always (Var IN Const), so we assume the var is the
1244 * first argument, and pseudoconstant is the second one.
1245 */
1246 if (!is_pseudo_constant_clause(lsecond(expr->args)))
1247 return false;
1248
1249 clause_expr = linitial(expr->args);
1250
1251 /*
1252 * If it's not an "=" operator, just ignore the clause, as it's not
1253 * compatible with functional dependencies. The operator is identified
1254 * simply by looking at which function it uses to estimate
1255 * selectivity. That's a bit strange, but it's what other similar
1256 * places do.
1257 */
1258 if (get_oprrest(expr->opno) != F_EQSEL)
1259 return false;
1260
1261 /* OK to proceed with checking "var" */
1262 }
1263 else if (is_orclause(clause))
1264 {
1265 BoolExpr *bool_expr = (BoolExpr *) clause;
1266 ListCell *lc;
1267
1268 /* start with no expression (we'll use the first match) */
1269 *expr = NULL;
1270
1271 foreach(lc, bool_expr->args)
1272 {
1273 Node *or_expr = NULL;
1274
1275 /*
1276 * Had we found incompatible expression in the arguments, treat
1277 * the whole expression as incompatible.
1278 */
1279 if (!dependency_is_compatible_expression((Node *) lfirst(lc), relid,
1280 statlist, &or_expr))
1281 return false;
1282
1283 if (*expr == NULL)
1284 *expr = or_expr;
1285
1286 /* ensure all the expressions are the same */
1287 if (!equal(or_expr, *expr))
1288 return false;
1289 }
1290
1291 /* the expression is already checked by the recursive call */
1292 return true;
1293 }
1294 else if (is_notclause(clause))
1295 {
1296 /*
1297 * "NOT x" can be interpreted as "x = false", so get the argument and
1298 * proceed with seeing if it's a suitable Var.
1299 */
1300 clause_expr = (Node *) get_notclausearg(clause);
1301 }
1302 else
1303 {
1304 /*
1305 * A boolean expression "x" can be interpreted as "x = true", so
1306 * proceed with seeing if it's a suitable Var.
1307 */
1308 clause_expr = (Node *) clause;
1309 }
1310
1311 /*
1312 * We may ignore any RelabelType node above the operand. (There won't be
1313 * more than one, since eval_const_expressions has been applied already.)
1314 */
1315 if (IsA(clause_expr, RelabelType))
1316 clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
1317
1318 vars = pull_var_clause(clause_expr, 0);
1319
1320 foreach(lc, vars)
1321 {
1322 Var *var = (Var *) lfirst(lc);
1323
1324 /* Ensure Var is from the correct relation */
1325 if (var->varno != relid)
1326 return false;
1327
1328 /* We also better ensure the Var is from the current level */
1329 if (var->varlevelsup != 0)
1330 return false;
1331
1332 /* Also ignore system attributes (we don't allow stats on those) */
1333 if (!AttrNumberIsForUserDefinedAttr(var->varattno))
1334 return false;
1335 }
1336
1337 /*
1338 * Check if we actually have a matching statistics for the expression.
1339 *
1340 * XXX Maybe this is an overkill. We'll eliminate the expressions later.
1341 */
1342 foreach(lc, statlist)
1343 {
1344 StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
1345
1346 /* ignore stats without dependencies */
1347 if (info->kind != STATS_EXT_DEPENDENCIES)
1348 continue;
1349
1350 foreach(lc2, info->exprs)
1351 {
1352 Node *stat_expr = (Node *) lfirst(lc2);
1353
1354 if (equal(clause_expr, stat_expr))
1355 {
1356 *expr = stat_expr;
1357 return true;
1358 }
1359 }
1360 }
1361
1362 return false;
1363 }
1364
1365 /*
1366 * dependencies_clauselist_selectivity
1367 * Return the estimated selectivity of (a subset of) the given clauses
1368 * using functional dependency statistics, or 1.0 if no useful functional
1369 * dependency statistic exists.
1370 *
1371 * 'estimatedclauses' is an input/output argument that gets a bit set
1372 * corresponding to the (zero-based) list index of each clause that is included
1373 * in the estimated selectivity.
1374 *
1375 * Given equality clauses on attributes (a,b) we find the strongest dependency
1376 * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected
1377 * dependency, we then combine the per-clause selectivities using the formula
1378 *
1379 * P(a,b) = f * P(a) + (1-f) * P(a) * P(b)
1380 *
1381 * where 'f' is the degree of the dependency. (Actually we use a slightly
1382 * modified version of this formula -- see clauselist_apply_dependencies()).
1383 *
1384 * With clauses on more than two attributes, the dependencies are applied
1385 * recursively, starting with the widest/strongest dependencies. For example
1386 * P(a,b,c) is first split like this:
1387 *
1388 * P(a,b,c) = f * P(a,b) + (1-f) * P(a,b) * P(c)
1389 *
1390 * assuming (a,b=>c) is the strongest dependency.
1391 */
1392 Selectivity
dependencies_clauselist_selectivity(PlannerInfo * root,List * clauses,int varRelid,JoinType jointype,SpecialJoinInfo * sjinfo,RelOptInfo * rel,Bitmapset ** estimatedclauses)1393 dependencies_clauselist_selectivity(PlannerInfo *root,
1394 List *clauses,
1395 int varRelid,
1396 JoinType jointype,
1397 SpecialJoinInfo *sjinfo,
1398 RelOptInfo *rel,
1399 Bitmapset **estimatedclauses)
1400 {
1401 Selectivity s1 = 1.0;
1402 ListCell *l;
1403 Bitmapset *clauses_attnums = NULL;
1404 AttrNumber *list_attnums;
1405 int listidx;
1406 MVDependencies **func_dependencies;
1407 int nfunc_dependencies;
1408 int total_ndeps;
1409 MVDependency **dependencies;
1410 int ndependencies;
1411 int i;
1412 AttrNumber attnum_offset;
1413
1414 /* unique expressions */
1415 Node **unique_exprs;
1416 int unique_exprs_cnt;
1417
1418 /* check if there's any stats that might be useful for us. */
1419 if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
1420 return 1.0;
1421
1422 list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) *
1423 list_length(clauses));
1424
1425 /*
1426 * We allocate space as if every clause was a unique expression, although
1427 * that's probably overkill. Some will be simple column references that
1428 * we'll translate to attnums, and there might be duplicates. But it's
1429 * easier and cheaper to just do one allocation than repalloc later.
1430 */
1431 unique_exprs = (Node **) palloc(sizeof(Node *) * list_length(clauses));
1432 unique_exprs_cnt = 0;
1433
1434 /*
1435 * Pre-process the clauses list to extract the attnums seen in each item.
1436 * We need to determine if there's any clauses which will be useful for
1437 * dependency selectivity estimations. Along the way we'll record all of
1438 * the attnums for each clause in a list which we'll reference later so we
1439 * don't need to repeat the same work again. We'll also keep track of all
1440 * attnums seen.
1441 *
1442 * We also skip clauses that we already estimated using different types of
1443 * statistics (we treat them as incompatible).
1444 *
1445 * To handle expressions, we assign them negative attnums, as if it was a
1446 * system attribute (this is fine, as we only allow extended stats on user
1447 * attributes). And then we offset everything by the number of
1448 * expressions, so that we can store the values in a bitmapset.
1449 */
1450 listidx = 0;
1451 foreach(l, clauses)
1452 {
1453 Node *clause = (Node *) lfirst(l);
1454 AttrNumber attnum;
1455 Node *expr = NULL;
1456
1457 /* ignore clause by default */
1458 list_attnums[listidx] = InvalidAttrNumber;
1459
1460 if (!bms_is_member(listidx, *estimatedclauses))
1461 {
1462 /*
1463 * If it's a simple column reference, just extract the attnum. If
1464 * it's an expression, assign a negative attnum as if it was a
1465 * system attribute.
1466 */
1467 if (dependency_is_compatible_clause(clause, rel->relid, &attnum))
1468 {
1469 list_attnums[listidx] = attnum;
1470 }
1471 else if (dependency_is_compatible_expression(clause, rel->relid,
1472 rel->statlist,
1473 &expr))
1474 {
1475 /* special attnum assigned to this expression */
1476 attnum = InvalidAttrNumber;
1477
1478 Assert(expr != NULL);
1479
1480 /* If the expression is duplicate, use the same attnum. */
1481 for (i = 0; i < unique_exprs_cnt; i++)
1482 {
1483 if (equal(unique_exprs[i], expr))
1484 {
1485 /* negative attribute number to expression */
1486 attnum = -(i + 1);
1487 break;
1488 }
1489 }
1490
1491 /* not found in the list, so add it */
1492 if (attnum == InvalidAttrNumber)
1493 {
1494 unique_exprs[unique_exprs_cnt++] = expr;
1495
1496 /* after incrementing the value, to get -1, -2, ... */
1497 attnum = (-unique_exprs_cnt);
1498 }
1499
1500 /* remember which attnum was assigned to this clause */
1501 list_attnums[listidx] = attnum;
1502 }
1503 }
1504
1505 listidx++;
1506 }
1507
1508 Assert(listidx == list_length(clauses));
1509
1510 /*
1511 * How much we need to offset the attnums? If there are no expressions,
1512 * then no offset is needed. Otherwise we need to offset enough for the
1513 * lowest value (-unique_exprs_cnt) to become 1.
1514 */
1515 if (unique_exprs_cnt > 0)
1516 attnum_offset = (unique_exprs_cnt + 1);
1517 else
1518 attnum_offset = 0;
1519
1520 /*
1521 * Now that we know how many expressions there are, we can offset the
1522 * values just enough to build the bitmapset.
1523 */
1524 for (i = 0; i < list_length(clauses); i++)
1525 {
1526 AttrNumber attnum;
1527
1528 /* ignore incompatible or already estimated clauses */
1529 if (list_attnums[i] == InvalidAttrNumber)
1530 continue;
1531
1532 /* make sure the attnum is in the expected range */
1533 Assert(list_attnums[i] >= (-unique_exprs_cnt));
1534 Assert(list_attnums[i] <= MaxHeapAttributeNumber);
1535
1536 /* make sure the attnum is positive (valid AttrNumber) */
1537 attnum = list_attnums[i] + attnum_offset;
1538
1539 /*
1540 * Either it's a regular attribute, or it's an expression, in which
1541 * case we must not have seen it before (expressions are unique).
1542 *
1543 * XXX Check whether it's a regular attribute has to be done using the
1544 * original attnum, while the second check has to use the value with
1545 * an offset.
1546 */
1547 Assert(AttrNumberIsForUserDefinedAttr(list_attnums[i]) ||
1548 !bms_is_member(attnum, clauses_attnums));
1549
1550 /*
1551 * Remember the offset attnum, both for attributes and expressions.
1552 * We'll pass list_attnums to clauselist_apply_dependencies, which
1553 * uses it to identify clauses in a bitmap. We could also pass the
1554 * offset, but this is more convenient.
1555 */
1556 list_attnums[i] = attnum;
1557
1558 clauses_attnums = bms_add_member(clauses_attnums, attnum);
1559 }
1560
1561 /*
1562 * If there's not at least two distinct attnums and expressions, then
1563 * reject the whole list of clauses. We must return 1.0 so the calling
1564 * function's selectivity is unaffected.
1565 */
1566 if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
1567 {
1568 bms_free(clauses_attnums);
1569 pfree(list_attnums);
1570 return 1.0;
1571 }
1572
1573 /*
1574 * Load all functional dependencies matching at least two parameters. We
1575 * can simply consider all dependencies at once, without having to search
1576 * for the best statistics object.
1577 *
1578 * To not waste cycles and memory, we deserialize dependencies only for
1579 * statistics that match at least two attributes. The array is allocated
1580 * with the assumption that all objects match - we could grow the array to
1581 * make it just the right size, but it's likely wasteful anyway thanks to
1582 * moving the freed chunks to freelists etc.
1583 */
1584 func_dependencies = (MVDependencies **) palloc(sizeof(MVDependencies *) *
1585 list_length(rel->statlist));
1586 nfunc_dependencies = 0;
1587 total_ndeps = 0;
1588
1589 foreach(l, rel->statlist)
1590 {
1591 StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
1592 int nmatched;
1593 int nexprs;
1594 int k;
1595 MVDependencies *deps;
1596
1597 /* skip statistics that are not of the correct type */
1598 if (stat->kind != STATS_EXT_DEPENDENCIES)
1599 continue;
1600
1601 /*
1602 * Count matching attributes - we have to undo the attnum offsets. The
1603 * input attribute numbers are not offset (expressions are not
1604 * included in stat->keys, so it's not necessary). But we need to
1605 * offset it before checking against clauses_attnums.
1606 */
1607 nmatched = 0;
1608 k = -1;
1609 while ((k = bms_next_member(stat->keys, k)) >= 0)
1610 {
1611 AttrNumber attnum = (AttrNumber) k;
1612
1613 /* skip expressions */
1614 if (!AttrNumberIsForUserDefinedAttr(attnum))
1615 continue;
1616
1617 /* apply the same offset as above */
1618 attnum += attnum_offset;
1619
1620 if (bms_is_member(attnum, clauses_attnums))
1621 nmatched++;
1622 }
1623
1624 /* count matching expressions */
1625 nexprs = 0;
1626 for (i = 0; i < unique_exprs_cnt; i++)
1627 {
1628 ListCell *lc;
1629
1630 foreach(lc, stat->exprs)
1631 {
1632 Node *stat_expr = (Node *) lfirst(lc);
1633
1634 /* try to match it */
1635 if (equal(stat_expr, unique_exprs[i]))
1636 nexprs++;
1637 }
1638 }
1639
1640 /*
1641 * Skip objects matching fewer than two attributes/expressions from
1642 * clauses.
1643 */
1644 if (nmatched + nexprs < 2)
1645 continue;
1646
1647 deps = statext_dependencies_load(stat->statOid);
1648
1649 /*
1650 * The expressions may be represented by different attnums in the
1651 * stats, we need to remap them to be consistent with the clauses.
1652 * That will make the later steps (e.g. picking the strongest item and
1653 * so on) much simpler and cheaper, because it won't need to care
1654 * about the offset at all.
1655 *
1656 * When we're at it, we can ignore dependencies that are not fully
1657 * matched by clauses (i.e. referencing attributes or expressions that
1658 * are not in the clauses).
1659 *
1660 * We have to do this for all statistics, as long as there are any
1661 * expressions - we need to shift the attnums in all dependencies.
1662 *
1663 * XXX Maybe we should do this always, because it also eliminates some
1664 * of the dependencies early. It might be cheaper than having to walk
1665 * the longer list in find_strongest_dependency later, especially as
1666 * we need to do that repeatedly?
1667 *
1668 * XXX We have to do this even when there are no expressions in
1669 * clauses, otherwise find_strongest_dependency may fail for stats
1670 * with expressions (due to lookup of negative value in bitmap). So we
1671 * need to at least filter out those dependencies. Maybe we could do
1672 * it in a cheaper way (if there are no expr clauses, we can just
1673 * discard all negative attnums without any lookups).
1674 */
1675 if (unique_exprs_cnt > 0 || stat->exprs != NIL)
1676 {
1677 int ndeps = 0;
1678
1679 for (i = 0; i < deps->ndeps; i++)
1680 {
1681 bool skip = false;
1682 MVDependency *dep = deps->deps[i];
1683 int j;
1684
1685 for (j = 0; j < dep->nattributes; j++)
1686 {
1687 int idx;
1688 Node *expr;
1689 int k;
1690 AttrNumber unique_attnum = InvalidAttrNumber;
1691 AttrNumber attnum;
1692
1693 /* undo the per-statistics offset */
1694 attnum = dep->attributes[j];
1695
1696 /*
1697 * For regular attributes we can simply check if it
1698 * matches any clause. If there's no matching clause, we
1699 * can just ignore it. We need to offset the attnum
1700 * though.
1701 */
1702 if (AttrNumberIsForUserDefinedAttr(attnum))
1703 {
1704 dep->attributes[j] = attnum + attnum_offset;
1705
1706 if (!bms_is_member(dep->attributes[j], clauses_attnums))
1707 {
1708 skip = true;
1709 break;
1710 }
1711
1712 continue;
1713 }
1714
1715 /*
1716 * the attnum should be a valid system attnum (-1, -2,
1717 * ...)
1718 */
1719 Assert(AttributeNumberIsValid(attnum));
1720
1721 /*
1722 * For expressions, we need to do two translations. First
1723 * we have to translate the negative attnum to index in
1724 * the list of expressions (in the statistics object).
1725 * Then we need to see if there's a matching clause. The
1726 * index of the unique expression determines the attnum
1727 * (and we offset it).
1728 */
1729 idx = -(1 + attnum);
1730
1731 /* Is the expression index is valid? */
1732 Assert((idx >= 0) && (idx < list_length(stat->exprs)));
1733
1734 expr = (Node *) list_nth(stat->exprs, idx);
1735
1736 /* try to find the expression in the unique list */
1737 for (k = 0; k < unique_exprs_cnt; k++)
1738 {
1739 /*
1740 * found a matching unique expression, use the attnum
1741 * (derived from index of the unique expression)
1742 */
1743 if (equal(unique_exprs[k], expr))
1744 {
1745 unique_attnum = -(k + 1) + attnum_offset;
1746 break;
1747 }
1748 }
1749
1750 /*
1751 * Found no matching expression, so we can simply skip
1752 * this dependency, because there's no chance it will be
1753 * fully covered.
1754 */
1755 if (unique_attnum == InvalidAttrNumber)
1756 {
1757 skip = true;
1758 break;
1759 }
1760
1761 /* otherwise remap it to the new attnum */
1762 dep->attributes[j] = unique_attnum;
1763 }
1764
1765 /* if found a matching dependency, keep it */
1766 if (!skip)
1767 {
1768 /* maybe we've skipped something earlier, so move it */
1769 if (ndeps != i)
1770 deps->deps[ndeps] = deps->deps[i];
1771
1772 ndeps++;
1773 }
1774 }
1775
1776 deps->ndeps = ndeps;
1777 }
1778
1779 /*
1780 * It's possible we've removed all dependencies, in which case we
1781 * don't bother adding it to the list.
1782 */
1783 if (deps->ndeps > 0)
1784 {
1785 func_dependencies[nfunc_dependencies] = deps;
1786 total_ndeps += deps->ndeps;
1787 nfunc_dependencies++;
1788 }
1789 }
1790
1791 /* if no matching stats could be found then we've nothing to do */
1792 if (nfunc_dependencies == 0)
1793 {
1794 pfree(func_dependencies);
1795 bms_free(clauses_attnums);
1796 pfree(list_attnums);
1797 pfree(unique_exprs);
1798 return 1.0;
1799 }
1800
1801 /*
1802 * Work out which dependencies we can apply, starting with the
1803 * widest/strongest ones, and proceeding to smaller/weaker ones.
1804 */
1805 dependencies = (MVDependency **) palloc(sizeof(MVDependency *) *
1806 total_ndeps);
1807 ndependencies = 0;
1808
1809 while (true)
1810 {
1811 MVDependency *dependency;
1812 AttrNumber attnum;
1813
1814 /* the widest/strongest dependency, fully matched by clauses */
1815 dependency = find_strongest_dependency(func_dependencies,
1816 nfunc_dependencies,
1817 clauses_attnums);
1818 if (!dependency)
1819 break;
1820
1821 dependencies[ndependencies++] = dependency;
1822
1823 /* Ignore dependencies using this implied attribute in later loops */
1824 attnum = dependency->attributes[dependency->nattributes - 1];
1825 clauses_attnums = bms_del_member(clauses_attnums, attnum);
1826 }
1827
1828 /*
1829 * If we found applicable dependencies, use them to estimate all
1830 * compatible clauses on attributes that they refer to.
1831 */
1832 if (ndependencies != 0)
1833 s1 = clauselist_apply_dependencies(root, clauses, varRelid, jointype,
1834 sjinfo, dependencies, ndependencies,
1835 list_attnums, estimatedclauses);
1836
1837 /* free deserialized functional dependencies (and then the array) */
1838 for (i = 0; i < nfunc_dependencies; i++)
1839 pfree(func_dependencies[i]);
1840
1841 pfree(dependencies);
1842 pfree(func_dependencies);
1843 bms_free(clauses_attnums);
1844 pfree(list_attnums);
1845 pfree(unique_exprs);
1846
1847 return s1;
1848 }
1849