1 /*-------------------------------------------------------------------------
2 *
3 * mcv.c
4 * POSTGRES multivariate MCV lists
5 *
6 *
7 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * IDENTIFICATION
11 * src/backend/statistics/mcv.c
12 *
13 *-------------------------------------------------------------------------
14 */
15 #include "postgres.h"
16
17 #include <math.h>
18
19 #include "access/htup_details.h"
20 #include "catalog/pg_collation.h"
21 #include "catalog/pg_statistic_ext.h"
22 #include "catalog/pg_statistic_ext_data.h"
23 #include "fmgr.h"
24 #include "funcapi.h"
25 #include "nodes/nodeFuncs.h"
26 #include "optimizer/clauses.h"
27 #include "statistics/extended_stats_internal.h"
28 #include "statistics/statistics.h"
29 #include "utils/array.h"
30 #include "utils/builtins.h"
31 #include "utils/bytea.h"
32 #include "utils/fmgroids.h"
33 #include "utils/fmgrprotos.h"
34 #include "utils/lsyscache.h"
35 #include "utils/syscache.h"
36 #include "utils/typcache.h"
37
38 /*
39 * Computes size of a serialized MCV item, depending on the number of
40 * dimensions (columns) the statistic is defined on. The datum values are
41 * stored in a separate array (deduplicated, to minimize the size), and
42 * so the serialized items only store uint16 indexes into that array.
43 *
44 * Each serialized item stores (in this order):
45 *
46 * - indexes to values (ndim * sizeof(uint16))
47 * - null flags (ndim * sizeof(bool))
48 * - frequency (sizeof(double))
49 * - base_frequency (sizeof(double))
50 *
51 * There is no alignment padding within an MCV item.
52 * So in total each MCV item requires this many bytes:
53 *
54 * ndim * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double)
55 */
56 #define ITEM_SIZE(ndims) \
57 ((ndims) * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double))
58
59 /*
60 * Used to compute size of serialized MCV list representation.
61 */
62 #define MinSizeOfMCVList \
63 (VARHDRSZ + sizeof(uint32) * 3 + sizeof(AttrNumber))
64
65 /*
66 * Size of the serialized MCV list, excluding the space needed for
67 * deduplicated per-dimension values. The macro is meant to be used
68 * when it's not yet safe to access the serialized info about amount
69 * of data for each column.
70 */
71 #define SizeOfMCVList(ndims,nitems) \
72 ((MinSizeOfMCVList + sizeof(Oid) * (ndims)) + \
73 ((ndims) * sizeof(DimensionInfo)) + \
74 ((nitems) * ITEM_SIZE(ndims)))
75
76 static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs);
77
78 static SortItem *build_distinct_groups(int numrows, SortItem *items,
79 MultiSortSupport mss, int *ndistinct);
80
81 static SortItem **build_column_frequencies(SortItem *groups, int ngroups,
82 MultiSortSupport mss, int *ncounts);
83
84 static int count_distinct_groups(int numrows, SortItem *items,
85 MultiSortSupport mss);
86
87 /*
88 * Compute new value for bitmap item, considering whether it's used for
89 * clauses connected by AND/OR.
90 */
91 #define RESULT_MERGE(value, is_or, match) \
92 ((is_or) ? ((value) || (match)) : ((value) && (match)))
93
94 /*
95 * When processing a list of clauses, the bitmap item may get set to a value
96 * such that additional clauses can't change it. For example, when processing
97 * a list of clauses connected to AND, as soon as the item gets set to 'false'
98 * then it'll remain like that. Similarly clauses connected by OR and 'true'.
99 *
100 * Returns true when the value in the bitmap can't change no matter how the
101 * remaining clauses are evaluated.
102 */
103 #define RESULT_IS_FINAL(value, is_or) ((is_or) ? (value) : (!(value)))
104
105 /*
106 * get_mincount_for_mcv_list
107 * Determine the minimum number of times a value needs to appear in
108 * the sample for it to be included in the MCV list.
109 *
110 * We want to keep only values that appear sufficiently often in the
111 * sample that it is reasonable to extrapolate their sample frequencies to
112 * the entire table. We do this by placing an upper bound on the relative
113 * standard error of the sample frequency, so that any estimates the
114 * planner generates from the MCV statistics can be expected to be
115 * reasonably accurate.
116 *
117 * Since we are sampling without replacement, the sample frequency of a
118 * particular value is described by a hypergeometric distribution. A
119 * common rule of thumb when estimating errors in this situation is to
120 * require at least 10 instances of the value in the sample, in which case
121 * the distribution can be approximated by a normal distribution, and
122 * standard error analysis techniques can be applied. Given a sample size
123 * of n, a population size of N, and a sample frequency of p=cnt/n, the
124 * standard error of the proportion p is given by
125 * SE = sqrt(p*(1-p)/n) * sqrt((N-n)/(N-1))
126 * where the second term is the finite population correction. To get
127 * reasonably accurate planner estimates, we impose an upper bound on the
128 * relative standard error of 20% -- i.e., SE/p < 0.2. This 20% relative
129 * error bound is fairly arbitrary, but has been found empirically to work
130 * well. Rearranging this formula gives a lower bound on the number of
131 * instances of the value seen:
132 * cnt > n*(N-n) / (N-n+0.04*n*(N-1))
133 * This bound is at most 25, and approaches 0 as n approaches 0 or N. The
134 * case where n approaches 0 cannot happen in practice, since the sample
135 * size is at least 300. The case where n approaches N corresponds to
136 * sampling the whole the table, in which case it is reasonable to keep
137 * the whole MCV list (have no lower bound), so it makes sense to apply
138 * this formula for all inputs, even though the above derivation is
139 * technically only valid when the right hand side is at least around 10.
140 *
141 * An alternative way to look at this formula is as follows -- assume that
142 * the number of instances of the value seen scales up to the entire
143 * table, so that the population count is K=N*cnt/n. Then the distribution
144 * in the sample is a hypergeometric distribution parameterised by N, n
145 * and K, and the bound above is mathematically equivalent to demanding
146 * that the standard deviation of that distribution is less than 20% of
147 * its mean. Thus the relative errors in any planner estimates produced
148 * from the MCV statistics are likely to be not too large.
149 */
150 static double
get_mincount_for_mcv_list(int samplerows,double totalrows)151 get_mincount_for_mcv_list(int samplerows, double totalrows)
152 {
153 double n = samplerows;
154 double N = totalrows;
155 double numer,
156 denom;
157
158 numer = n * (N - n);
159 denom = N - n + 0.04 * n * (N - 1);
160
161 /* Guard against division by zero (possible if n = N = 1) */
162 if (denom == 0.0)
163 return 0.0;
164
165 return numer / denom;
166 }
167
168 /*
169 * Builds MCV list from the set of sampled rows.
170 *
171 * The algorithm is quite simple:
172 *
173 * (1) sort the data (default collation, '<' for the data type)
174 *
175 * (2) count distinct groups, decide how many to keep
176 *
177 * (3) build the MCV list using the threshold determined in (2)
178 *
179 * (4) remove rows represented by the MCV from the sample
180 *
181 */
182 MCVList *
statext_mcv_build(int numrows,HeapTuple * rows,Bitmapset * attrs,VacAttrStats ** stats,double totalrows,int stattarget)183 statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
184 VacAttrStats **stats, double totalrows, int stattarget)
185 {
186 int i,
187 numattrs,
188 ngroups,
189 nitems;
190 AttrNumber *attnums;
191 double mincount;
192 SortItem *items;
193 SortItem *groups;
194 MCVList *mcvlist = NULL;
195 MultiSortSupport mss;
196
197 attnums = build_attnums_array(attrs, &numattrs);
198
199 /* comparator for all the columns */
200 mss = build_mss(stats, numattrs);
201
202 /* sort the rows */
203 items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
204 mss, numattrs, attnums);
205
206 if (!items)
207 return NULL;
208
209 /* transform the sorted rows into groups (sorted by frequency) */
210 groups = build_distinct_groups(nitems, items, mss, &ngroups);
211
212 /*
213 * Maximum number of MCV items to store, based on the statistics target we
214 * computed for the statistics object (from target set for the object
215 * itself, attributes and the system default). In any case, we can't keep
216 * more groups than we have available.
217 */
218 nitems = stattarget;
219 if (nitems > ngroups)
220 nitems = ngroups;
221
222 /*
223 * Decide how many items to keep in the MCV list. We can't use the same
224 * algorithm as per-column MCV lists, because that only considers the
225 * actual group frequency - but we're primarily interested in how the
226 * actual frequency differs from the base frequency (product of simple
227 * per-column frequencies, as if the columns were independent).
228 *
229 * Using the same algorithm might exclude items that are close to the
230 * "average" frequency of the sample. But that does not say whether the
231 * observed frequency is close to the base frequency or not. We also need
232 * to consider unexpectedly uncommon items (again, compared to the base
233 * frequency), and the single-column algorithm does not have to.
234 *
235 * We simply decide how many items to keep by computing minimum count
236 * using get_mincount_for_mcv_list() and then keep all items that seem to
237 * be more common than that.
238 */
239 mincount = get_mincount_for_mcv_list(numrows, totalrows);
240
241 /*
242 * Walk the groups until we find the first group with a count below the
243 * mincount threshold (the index of that group is the number of groups we
244 * want to keep).
245 */
246 for (i = 0; i < nitems; i++)
247 {
248 if (groups[i].count < mincount)
249 {
250 nitems = i;
251 break;
252 }
253 }
254
255 /*
256 * At this point we know the number of items for the MCV list. There might
257 * be none (for uniform distribution with many groups), and in that case
258 * there will be no MCV list. Otherwise construct the MCV list.
259 */
260 if (nitems > 0)
261 {
262 int j;
263 SortItem key;
264 MultiSortSupport tmp;
265
266 /* frequencies for values in each attribute */
267 SortItem **freqs;
268 int *nfreqs;
269
270 /* used to search values */
271 tmp = (MultiSortSupport) palloc(offsetof(MultiSortSupportData, ssup)
272 + sizeof(SortSupportData));
273
274 /* compute frequencies for values in each column */
275 nfreqs = (int *) palloc0(sizeof(int) * numattrs);
276 freqs = build_column_frequencies(groups, ngroups, mss, nfreqs);
277
278 /*
279 * Allocate the MCV list structure, set the global parameters.
280 */
281 mcvlist = (MCVList *) palloc0(offsetof(MCVList, items) +
282 sizeof(MCVItem) * nitems);
283
284 mcvlist->magic = STATS_MCV_MAGIC;
285 mcvlist->type = STATS_MCV_TYPE_BASIC;
286 mcvlist->ndimensions = numattrs;
287 mcvlist->nitems = nitems;
288
289 /* store info about data type OIDs */
290 for (i = 0; i < numattrs; i++)
291 mcvlist->types[i] = stats[i]->attrtypid;
292
293 /* Copy the first chunk of groups into the result. */
294 for (i = 0; i < nitems; i++)
295 {
296 /* just pointer to the proper place in the list */
297 MCVItem *item = &mcvlist->items[i];
298
299 item->values = (Datum *) palloc(sizeof(Datum) * numattrs);
300 item->isnull = (bool *) palloc(sizeof(bool) * numattrs);
301
302 /* copy values for the group */
303 memcpy(item->values, groups[i].values, sizeof(Datum) * numattrs);
304 memcpy(item->isnull, groups[i].isnull, sizeof(bool) * numattrs);
305
306 /* groups should be sorted by frequency in descending order */
307 Assert((i == 0) || (groups[i - 1].count >= groups[i].count));
308
309 /* group frequency */
310 item->frequency = (double) groups[i].count / numrows;
311
312 /* base frequency, if the attributes were independent */
313 item->base_frequency = 1.0;
314 for (j = 0; j < numattrs; j++)
315 {
316 SortItem *freq;
317
318 /* single dimension */
319 tmp->ndims = 1;
320 tmp->ssup[0] = mss->ssup[j];
321
322 /* fill search key */
323 key.values = &groups[i].values[j];
324 key.isnull = &groups[i].isnull[j];
325
326 freq = (SortItem *) bsearch_arg(&key, freqs[j], nfreqs[j],
327 sizeof(SortItem),
328 multi_sort_compare, tmp);
329
330 item->base_frequency *= ((double) freq->count) / numrows;
331 }
332 }
333
334 pfree(nfreqs);
335 pfree(freqs);
336 }
337
338 pfree(items);
339 pfree(groups);
340
341 return mcvlist;
342 }
343
344 /*
345 * build_mss
346 * build MultiSortSupport for the attributes passed in attrs
347 */
348 static MultiSortSupport
build_mss(VacAttrStats ** stats,int numattrs)349 build_mss(VacAttrStats **stats, int numattrs)
350 {
351 int i;
352
353 /* Sort by multiple columns (using array of SortSupport) */
354 MultiSortSupport mss = multi_sort_init(numattrs);
355
356 /* prepare the sort functions for all the attributes */
357 for (i = 0; i < numattrs; i++)
358 {
359 VacAttrStats *colstat = stats[i];
360 TypeCacheEntry *type;
361
362 type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
363 if (type->lt_opr == InvalidOid) /* shouldn't happen */
364 elog(ERROR, "cache lookup failed for ordering operator for type %u",
365 colstat->attrtypid);
366
367 multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
368 }
369
370 return mss;
371 }
372
373 /*
374 * count_distinct_groups
375 * count distinct combinations of SortItems in the array
376 *
377 * The array is assumed to be sorted according to the MultiSortSupport.
378 */
379 static int
count_distinct_groups(int numrows,SortItem * items,MultiSortSupport mss)380 count_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss)
381 {
382 int i;
383 int ndistinct;
384
385 ndistinct = 1;
386 for (i = 1; i < numrows; i++)
387 {
388 /* make sure the array really is sorted */
389 Assert(multi_sort_compare(&items[i], &items[i - 1], mss) >= 0);
390
391 if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
392 ndistinct += 1;
393 }
394
395 return ndistinct;
396 }
397
398 /*
399 * compare_sort_item_count
400 * comparator for sorting items by count (frequencies) in descending order
401 */
402 static int
compare_sort_item_count(const void * a,const void * b)403 compare_sort_item_count(const void *a, const void *b)
404 {
405 SortItem *ia = (SortItem *) a;
406 SortItem *ib = (SortItem *) b;
407
408 if (ia->count == ib->count)
409 return 0;
410 else if (ia->count > ib->count)
411 return -1;
412
413 return 1;
414 }
415
416 /*
417 * build_distinct_groups
418 * build an array of SortItems for distinct groups and counts matching items
419 *
420 * The input array is assumed to be sorted
421 */
422 static SortItem *
build_distinct_groups(int numrows,SortItem * items,MultiSortSupport mss,int * ndistinct)423 build_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss,
424 int *ndistinct)
425 {
426 int i,
427 j;
428 int ngroups = count_distinct_groups(numrows, items, mss);
429
430 SortItem *groups = (SortItem *) palloc(ngroups * sizeof(SortItem));
431
432 j = 0;
433 groups[0] = items[0];
434 groups[0].count = 1;
435
436 for (i = 1; i < numrows; i++)
437 {
438 /* Assume sorted in ascending order. */
439 Assert(multi_sort_compare(&items[i], &items[i - 1], mss) >= 0);
440
441 /* New distinct group detected. */
442 if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
443 {
444 groups[++j] = items[i];
445 groups[j].count = 0;
446 }
447
448 groups[j].count++;
449 }
450
451 /* ensure we filled the expected number of distinct groups */
452 Assert(j + 1 == ngroups);
453
454 /* Sort the distinct groups by frequency (in descending order). */
455 pg_qsort((void *) groups, ngroups, sizeof(SortItem),
456 compare_sort_item_count);
457
458 *ndistinct = ngroups;
459 return groups;
460 }
461
462 /* compare sort items (single dimension) */
463 static int
sort_item_compare(const void * a,const void * b,void * arg)464 sort_item_compare(const void *a, const void *b, void *arg)
465 {
466 SortSupport ssup = (SortSupport) arg;
467 SortItem *ia = (SortItem *) a;
468 SortItem *ib = (SortItem *) b;
469
470 return ApplySortComparator(ia->values[0], ia->isnull[0],
471 ib->values[0], ib->isnull[0],
472 ssup);
473 }
474
475 /*
476 * build_column_frequencies
477 * compute frequencies of values in each column
478 *
479 * This returns an array of SortItems for each attribute the MCV is built
480 * on, with a frequency (number of occurrences) for each value. This is
481 * then used to compute "base" frequency of MCV items.
482 *
483 * All the memory is allocated in a single chunk, so that a single pfree
484 * is enough to release it. We do not allocate space for values/isnull
485 * arrays in the SortItems, because we can simply point into the input
486 * groups directly.
487 */
488 static SortItem **
build_column_frequencies(SortItem * groups,int ngroups,MultiSortSupport mss,int * ncounts)489 build_column_frequencies(SortItem *groups, int ngroups,
490 MultiSortSupport mss, int *ncounts)
491 {
492 int i,
493 dim;
494 SortItem **result;
495 char *ptr;
496
497 Assert(groups);
498 Assert(ncounts);
499
500 /* allocate arrays for all columns as a single chunk */
501 ptr = palloc(MAXALIGN(sizeof(SortItem *) * mss->ndims) +
502 mss->ndims * MAXALIGN(sizeof(SortItem) * ngroups));
503
504 /* initial array of pointers */
505 result = (SortItem **) ptr;
506 ptr += MAXALIGN(sizeof(SortItem *) * mss->ndims);
507
508 for (dim = 0; dim < mss->ndims; dim++)
509 {
510 SortSupport ssup = &mss->ssup[dim];
511
512 /* array of values for a single column */
513 result[dim] = (SortItem *) ptr;
514 ptr += MAXALIGN(sizeof(SortItem) * ngroups);
515
516 /* extract data for the dimension */
517 for (i = 0; i < ngroups; i++)
518 {
519 /* point into the input groups */
520 result[dim][i].values = &groups[i].values[dim];
521 result[dim][i].isnull = &groups[i].isnull[dim];
522 result[dim][i].count = groups[i].count;
523 }
524
525 /* sort the values, deduplicate */
526 qsort_arg((void *) result[dim], ngroups, sizeof(SortItem),
527 sort_item_compare, ssup);
528
529 /*
530 * Identify distinct values, compute frequency (there might be
531 * multiple MCV items containing this value, so we need to sum counts
532 * from all of them.
533 */
534 ncounts[dim] = 1;
535 for (i = 1; i < ngroups; i++)
536 {
537 if (sort_item_compare(&result[dim][i - 1], &result[dim][i], ssup) == 0)
538 {
539 result[dim][ncounts[dim] - 1].count += result[dim][i].count;
540 continue;
541 }
542
543 result[dim][ncounts[dim]] = result[dim][i];
544
545 ncounts[dim]++;
546 }
547 }
548
549 return result;
550 }
551
552 /*
553 * statext_mcv_load
554 * Load the MCV list for the indicated pg_statistic_ext tuple
555 */
556 MCVList *
statext_mcv_load(Oid mvoid)557 statext_mcv_load(Oid mvoid)
558 {
559 MCVList *result;
560 bool isnull;
561 Datum mcvlist;
562 HeapTuple htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid));
563
564 if (!HeapTupleIsValid(htup))
565 elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
566
567 mcvlist = SysCacheGetAttr(STATEXTDATASTXOID, htup,
568 Anum_pg_statistic_ext_data_stxdmcv, &isnull);
569
570 if (isnull)
571 elog(ERROR,
572 "requested statistics kind \"%c\" is not yet built for statistics object %u",
573 STATS_EXT_DEPENDENCIES, mvoid);
574
575 result = statext_mcv_deserialize(DatumGetByteaP(mcvlist));
576
577 ReleaseSysCache(htup);
578
579 return result;
580 }
581
582
583 /*
584 * statext_mcv_serialize
585 * Serialize MCV list into a pg_mcv_list value.
586 *
587 * The MCV items may include values of various data types, and it's reasonable
588 * to expect redundancy (values for a given attribute, repeated for multiple
589 * MCV list items). So we deduplicate the values into arrays, and then replace
590 * the values by indexes into those arrays.
591 *
592 * The overall structure of the serialized representation looks like this:
593 *
594 * +---------------+----------------+---------------------+-------+
595 * | header fields | dimension info | deduplicated values | items |
596 * +---------------+----------------+---------------------+-------+
597 *
598 * Where dimension info stores information about type of K-th attribute (e.g.
599 * typlen, typbyval and length of deduplicated values). Deduplicated values
600 * store deduplicated values for each attribute. And items store the actual
601 * MCV list items, with values replaced by indexes into the arrays.
602 *
603 * When serializing the items, we use uint16 indexes. The number of MCV items
604 * is limited by the statistics target (which is capped to 10k at the moment).
605 * We might increase this to 65k and still fit into uint16, so there's a bit of
606 * slack. Furthermore, this limit is on the number of distinct values per column,
607 * and we usually have few of those (and various combinations of them for the
608 * those MCV list). So uint16 seems fine for now.
609 *
610 * We don't really expect the serialization to save as much space as for
611 * histograms, as we are not doing any bucket splits (which is the source
612 * of high redundancy in histograms).
613 *
614 * TODO: Consider packing boolean flags (NULL) for each item into a single char
615 * (or a longer type) instead of using an array of bool items.
616 */
617 bytea *
statext_mcv_serialize(MCVList * mcvlist,VacAttrStats ** stats)618 statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats)
619 {
620 int i;
621 int dim;
622 int ndims = mcvlist->ndimensions;
623
624 SortSupport ssup;
625 DimensionInfo *info;
626
627 Size total_length;
628
629 /* serialized items (indexes into arrays, etc.) */
630 bytea *raw;
631 char *ptr;
632 char *endptr PG_USED_FOR_ASSERTS_ONLY;
633
634 /* values per dimension (and number of non-NULL values) */
635 Datum **values = (Datum **) palloc0(sizeof(Datum *) * ndims);
636 int *counts = (int *) palloc0(sizeof(int) * ndims);
637
638 /*
639 * We'll include some rudimentary information about the attribute types
640 * (length, by-val flag), so that we don't have to look them up while
641 * deserializating the MCV list (we already have the type OID in the
642 * header). This is safe, because when changing type of the attribute the
643 * statistics gets dropped automatically. We need to store the info about
644 * the arrays of deduplicated values anyway.
645 */
646 info = (DimensionInfo *) palloc0(sizeof(DimensionInfo) * ndims);
647
648 /* sort support data for all attributes included in the MCV list */
649 ssup = (SortSupport) palloc0(sizeof(SortSupportData) * ndims);
650
651 /* collect and deduplicate values for each dimension (attribute) */
652 for (dim = 0; dim < ndims; dim++)
653 {
654 int ndistinct;
655 TypeCacheEntry *typentry;
656
657 /*
658 * Lookup the LT operator (can't get it from stats extra_data, as we
659 * don't know how to interpret that - scalar vs. array etc.).
660 */
661 typentry = lookup_type_cache(stats[dim]->attrtypid, TYPECACHE_LT_OPR);
662
663 /* copy important info about the data type (length, by-value) */
664 info[dim].typlen = stats[dim]->attrtype->typlen;
665 info[dim].typbyval = stats[dim]->attrtype->typbyval;
666
667 /* allocate space for values in the attribute and collect them */
668 values[dim] = (Datum *) palloc0(sizeof(Datum) * mcvlist->nitems);
669
670 for (i = 0; i < mcvlist->nitems; i++)
671 {
672 /* skip NULL values - we don't need to deduplicate those */
673 if (mcvlist->items[i].isnull[dim])
674 continue;
675
676 /* append the value at the end */
677 values[dim][counts[dim]] = mcvlist->items[i].values[dim];
678 counts[dim] += 1;
679 }
680
681 /* if there are just NULL values in this dimension, we're done */
682 if (counts[dim] == 0)
683 continue;
684
685 /* sort and deduplicate the data */
686 ssup[dim].ssup_cxt = CurrentMemoryContext;
687 ssup[dim].ssup_collation = stats[dim]->attrcollid;
688 ssup[dim].ssup_nulls_first = false;
689
690 PrepareSortSupportFromOrderingOp(typentry->lt_opr, &ssup[dim]);
691
692 qsort_arg(values[dim], counts[dim], sizeof(Datum),
693 compare_scalars_simple, &ssup[dim]);
694
695 /*
696 * Walk through the array and eliminate duplicate values, but keep the
697 * ordering (so that we can do bsearch later). We know there's at
698 * least one item as (counts[dim] != 0), so we can skip the first
699 * element.
700 */
701 ndistinct = 1; /* number of distinct values */
702 for (i = 1; i < counts[dim]; i++)
703 {
704 /* expect sorted array */
705 Assert(compare_datums_simple(values[dim][i - 1], values[dim][i], &ssup[dim]) <= 0);
706
707 /* if the value is the same as the previous one, we can skip it */
708 if (!compare_datums_simple(values[dim][i - 1], values[dim][i], &ssup[dim]))
709 continue;
710
711 values[dim][ndistinct] = values[dim][i];
712 ndistinct += 1;
713 }
714
715 /* we must not exceed PG_UINT16_MAX, as we use uint16 indexes */
716 Assert(ndistinct <= PG_UINT16_MAX);
717
718 /*
719 * Store additional info about the attribute - number of deduplicated
720 * values, and also size of the serialized data. For fixed-length data
721 * types this is trivial to compute, for varwidth types we need to
722 * actually walk the array and sum the sizes.
723 */
724 info[dim].nvalues = ndistinct;
725
726 if (info[dim].typbyval) /* by-value data types */
727 {
728 info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
729
730 /*
731 * We copy the data into the MCV item during deserialization, so
732 * we don't need to allocate any extra space.
733 */
734 info[dim].nbytes_aligned = 0;
735 }
736 else if (info[dim].typlen > 0) /* fixed-length by-ref */
737 {
738 /*
739 * We don't care about alignment in the serialized data, so we
740 * pack the data as much as possible. But we also track how much
741 * data will be needed after deserialization, and in that case we
742 * need to account for alignment of each item.
743 *
744 * Note: As the items are fixed-length, we could easily compute
745 * this during deserialization, but we do it here anyway.
746 */
747 info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
748 info[dim].nbytes_aligned = info[dim].nvalues * MAXALIGN(info[dim].typlen);
749 }
750 else if (info[dim].typlen == -1) /* varlena */
751 {
752 info[dim].nbytes = 0;
753 info[dim].nbytes_aligned = 0;
754 for (i = 0; i < info[dim].nvalues; i++)
755 {
756 Size len;
757
758 /*
759 * For varlena values, we detoast the values and store the
760 * length and data separately. We don't bother with alignment
761 * here, which means that during deserialization we need to
762 * copy the fields and only access the copies.
763 */
764 values[dim][i] = PointerGetDatum(PG_DETOAST_DATUM(values[dim][i]));
765
766 /* serialized length (uint32 length + data) */
767 len = VARSIZE_ANY_EXHDR(values[dim][i]);
768 info[dim].nbytes += sizeof(uint32); /* length */
769 info[dim].nbytes += len; /* value (no header) */
770
771 /*
772 * During deserialization we'll build regular varlena values
773 * with full headers, and we need to align them properly.
774 */
775 info[dim].nbytes_aligned += MAXALIGN(VARHDRSZ + len);
776 }
777 }
778 else if (info[dim].typlen == -2) /* cstring */
779 {
780 info[dim].nbytes = 0;
781 info[dim].nbytes_aligned = 0;
782 for (i = 0; i < info[dim].nvalues; i++)
783 {
784 Size len;
785
786 /*
787 * For cstring, we do similar thing as for varlena - first we
788 * store the length as uint32 and then the data. We don't care
789 * about alignment, which means that during deserialization we
790 * need to copy the fields and only access the copies.
791 */
792
793 /* c-strings include terminator, so +1 byte */
794 len = strlen(DatumGetCString(values[dim][i])) + 1;
795 info[dim].nbytes += sizeof(uint32); /* length */
796 info[dim].nbytes += len; /* value */
797
798 /* space needed for properly aligned deserialized copies */
799 info[dim].nbytes_aligned += MAXALIGN(len);
800 }
801 }
802
803 /* we know (count>0) so there must be some data */
804 Assert(info[dim].nbytes > 0);
805 }
806
807 /*
808 * Now we can finally compute how much space we'll actually need for the
809 * whole serialized MCV list (varlena header, MCV header, dimension info
810 * for each attribute, deduplicated values and items).
811 */
812 total_length = (3 * sizeof(uint32)) /* magic + type + nitems */
813 + sizeof(AttrNumber) /* ndimensions */
814 + (ndims * sizeof(Oid)); /* attribute types */
815
816 /* dimension info */
817 total_length += ndims * sizeof(DimensionInfo);
818
819 /* add space for the arrays of deduplicated values */
820 for (i = 0; i < ndims; i++)
821 total_length += info[i].nbytes;
822
823 /*
824 * And finally account for the items (those are fixed-length, thanks to
825 * replacing values with uint16 indexes into the deduplicated arrays).
826 */
827 total_length += mcvlist->nitems * ITEM_SIZE(dim);
828
829 /*
830 * Allocate space for the whole serialized MCV list (we'll skip bytes, so
831 * we set them to zero to make the result more compressible).
832 */
833 raw = (bytea *) palloc0(VARHDRSZ + total_length);
834 SET_VARSIZE(raw, VARHDRSZ + total_length);
835
836 ptr = VARDATA(raw);
837 endptr = ptr + total_length;
838
839 /* copy the MCV list header fields, one by one */
840 memcpy(ptr, &mcvlist->magic, sizeof(uint32));
841 ptr += sizeof(uint32);
842
843 memcpy(ptr, &mcvlist->type, sizeof(uint32));
844 ptr += sizeof(uint32);
845
846 memcpy(ptr, &mcvlist->nitems, sizeof(uint32));
847 ptr += sizeof(uint32);
848
849 memcpy(ptr, &mcvlist->ndimensions, sizeof(AttrNumber));
850 ptr += sizeof(AttrNumber);
851
852 memcpy(ptr, mcvlist->types, sizeof(Oid) * ndims);
853 ptr += (sizeof(Oid) * ndims);
854
855 /* store information about the attributes (data amounts, ...) */
856 memcpy(ptr, info, sizeof(DimensionInfo) * ndims);
857 ptr += sizeof(DimensionInfo) * ndims;
858
859 /* Copy the deduplicated values for all attributes to the output. */
860 for (dim = 0; dim < ndims; dim++)
861 {
862 /* remember the starting point for Asserts later */
863 char *start PG_USED_FOR_ASSERTS_ONLY = ptr;
864
865 for (i = 0; i < info[dim].nvalues; i++)
866 {
867 Datum value = values[dim][i];
868
869 if (info[dim].typbyval) /* passed by value */
870 {
871 Datum tmp;
872
873 /*
874 * For values passed by value, we need to copy just the
875 * significant bytes - we can't use memcpy directly, as that
876 * assumes little endian behavior. store_att_byval does
877 * almost what we need, but it requires properly aligned
878 * buffer - the output buffer does not guarantee that. So we
879 * simply use a local Datum variable (which guarantees proper
880 * alignment), and then copy the value from it.
881 */
882 store_att_byval(&tmp, value, info[dim].typlen);
883
884 memcpy(ptr, &tmp, info[dim].typlen);
885 ptr += info[dim].typlen;
886 }
887 else if (info[dim].typlen > 0) /* passed by reference */
888 {
889 /* no special alignment needed, treated as char array */
890 memcpy(ptr, DatumGetPointer(value), info[dim].typlen);
891 ptr += info[dim].typlen;
892 }
893 else if (info[dim].typlen == -1) /* varlena */
894 {
895 uint32 len = VARSIZE_ANY_EXHDR(DatumGetPointer(value));
896
897 /* copy the length */
898 memcpy(ptr, &len, sizeof(uint32));
899 ptr += sizeof(uint32);
900
901 /* data from the varlena value (without the header) */
902 memcpy(ptr, VARDATA_ANY(DatumGetPointer(value)), len);
903 ptr += len;
904 }
905 else if (info[dim].typlen == -2) /* cstring */
906 {
907 uint32 len = (uint32) strlen(DatumGetCString(value)) + 1;
908
909 /* copy the length */
910 memcpy(ptr, &len, sizeof(uint32));
911 ptr += sizeof(uint32);
912
913 /* value */
914 memcpy(ptr, DatumGetCString(value), len);
915 ptr += len;
916 }
917
918 /* no underflows or overflows */
919 Assert((ptr > start) && ((ptr - start) <= info[dim].nbytes));
920 }
921
922 /* we should get exactly nbytes of data for this dimension */
923 Assert((ptr - start) == info[dim].nbytes);
924 }
925
926 /* Serialize the items, with uint16 indexes instead of the values. */
927 for (i = 0; i < mcvlist->nitems; i++)
928 {
929 MCVItem *mcvitem = &mcvlist->items[i];
930
931 /* don't write beyond the allocated space */
932 Assert(ptr <= (endptr - ITEM_SIZE(dim)));
933
934 /* copy NULL and frequency flags into the serialized MCV */
935 memcpy(ptr, mcvitem->isnull, sizeof(bool) * ndims);
936 ptr += sizeof(bool) * ndims;
937
938 memcpy(ptr, &mcvitem->frequency, sizeof(double));
939 ptr += sizeof(double);
940
941 memcpy(ptr, &mcvitem->base_frequency, sizeof(double));
942 ptr += sizeof(double);
943
944 /* store the indexes last */
945 for (dim = 0; dim < ndims; dim++)
946 {
947 uint16 index = 0;
948 Datum *value;
949
950 /* do the lookup only for non-NULL values */
951 if (!mcvitem->isnull[dim])
952 {
953 value = (Datum *) bsearch_arg(&mcvitem->values[dim], values[dim],
954 info[dim].nvalues, sizeof(Datum),
955 compare_scalars_simple, &ssup[dim]);
956
957 Assert(value != NULL); /* serialization or deduplication
958 * error */
959
960 /* compute index within the deduplicated array */
961 index = (uint16) (value - values[dim]);
962
963 /* check the index is within expected bounds */
964 Assert(index < info[dim].nvalues);
965 }
966
967 /* copy the index into the serialized MCV */
968 memcpy(ptr, &index, sizeof(uint16));
969 ptr += sizeof(uint16);
970 }
971
972 /* make sure we don't overflow the allocated value */
973 Assert(ptr <= endptr);
974 }
975
976 /* at this point we expect to match the total_length exactly */
977 Assert(ptr == endptr);
978
979 pfree(values);
980 pfree(counts);
981
982 return raw;
983 }
984
985 /*
986 * statext_mcv_deserialize
987 * Reads serialized MCV list into MCVList structure.
988 *
989 * All the memory needed by the MCV list is allocated as a single chunk, so
990 * it's possible to simply pfree() it at once.
991 */
992 MCVList *
statext_mcv_deserialize(bytea * data)993 statext_mcv_deserialize(bytea *data)
994 {
995 int dim,
996 i;
997 Size expected_size;
998 MCVList *mcvlist;
999 char *raw;
1000 char *ptr;
1001 char *endptr PG_USED_FOR_ASSERTS_ONLY;
1002
1003 int ndims,
1004 nitems;
1005 DimensionInfo *info = NULL;
1006
1007 /* local allocation buffer (used only for deserialization) */
1008 Datum **map = NULL;
1009
1010 /* MCV list */
1011 Size mcvlen;
1012
1013 /* buffer used for the result */
1014 Size datalen;
1015 char *dataptr;
1016 char *valuesptr;
1017 char *isnullptr;
1018
1019 if (data == NULL)
1020 return NULL;
1021
1022 /*
1023 * We can't possibly deserialize a MCV list if there's not even a complete
1024 * header. We need an explicit formula here, because we serialize the
1025 * header fields one by one, so we need to ignore struct alignment.
1026 */
1027 if (VARSIZE_ANY(data) < MinSizeOfMCVList)
1028 elog(ERROR, "invalid MCV size %zd (expected at least %zu)",
1029 VARSIZE_ANY(data), MinSizeOfMCVList);
1030
1031 /* read the MCV list header */
1032 mcvlist = (MCVList *) palloc0(offsetof(MCVList, items));
1033
1034 /* pointer to the data part (skip the varlena header) */
1035 raw = (char *) data;
1036 ptr = VARDATA_ANY(raw);
1037 endptr = (char *) raw + VARSIZE_ANY(data);
1038
1039 /* get the header and perform further sanity checks */
1040 memcpy(&mcvlist->magic, ptr, sizeof(uint32));
1041 ptr += sizeof(uint32);
1042
1043 memcpy(&mcvlist->type, ptr, sizeof(uint32));
1044 ptr += sizeof(uint32);
1045
1046 memcpy(&mcvlist->nitems, ptr, sizeof(uint32));
1047 ptr += sizeof(uint32);
1048
1049 memcpy(&mcvlist->ndimensions, ptr, sizeof(AttrNumber));
1050 ptr += sizeof(AttrNumber);
1051
1052 if (mcvlist->magic != STATS_MCV_MAGIC)
1053 elog(ERROR, "invalid MCV magic %u (expected %u)",
1054 mcvlist->magic, STATS_MCV_MAGIC);
1055
1056 if (mcvlist->type != STATS_MCV_TYPE_BASIC)
1057 elog(ERROR, "invalid MCV type %u (expected %u)",
1058 mcvlist->type, STATS_MCV_TYPE_BASIC);
1059
1060 if (mcvlist->ndimensions == 0)
1061 elog(ERROR, "invalid zero-length dimension array in MCVList");
1062 else if ((mcvlist->ndimensions > STATS_MAX_DIMENSIONS) ||
1063 (mcvlist->ndimensions < 0))
1064 elog(ERROR, "invalid length (%d) dimension array in MCVList",
1065 mcvlist->ndimensions);
1066
1067 if (mcvlist->nitems == 0)
1068 elog(ERROR, "invalid zero-length item array in MCVList");
1069 else if (mcvlist->nitems > STATS_MCVLIST_MAX_ITEMS)
1070 elog(ERROR, "invalid length (%u) item array in MCVList",
1071 mcvlist->nitems);
1072
1073 nitems = mcvlist->nitems;
1074 ndims = mcvlist->ndimensions;
1075
1076 /*
1077 * Check amount of data including DimensionInfo for all dimensions and
1078 * also the serialized items (including uint16 indexes). Also, walk
1079 * through the dimension information and add it to the sum.
1080 */
1081 expected_size = SizeOfMCVList(ndims, nitems);
1082
1083 /*
1084 * Check that we have at least the dimension and info records, along with
1085 * the items. We don't know the size of the serialized values yet. We need
1086 * to do this check first, before accessing the dimension info.
1087 */
1088 if (VARSIZE_ANY(data) < expected_size)
1089 elog(ERROR, "invalid MCV size %zd (expected %zu)",
1090 VARSIZE_ANY(data), expected_size);
1091
1092 /* Now copy the array of type Oids. */
1093 memcpy(mcvlist->types, ptr, sizeof(Oid) * ndims);
1094 ptr += (sizeof(Oid) * ndims);
1095
1096 /* Now it's safe to access the dimension info. */
1097 info = palloc(ndims * sizeof(DimensionInfo));
1098
1099 memcpy(info, ptr, ndims * sizeof(DimensionInfo));
1100 ptr += (ndims * sizeof(DimensionInfo));
1101
1102 /* account for the value arrays */
1103 for (dim = 0; dim < ndims; dim++)
1104 {
1105 /*
1106 * XXX I wonder if we can/should rely on asserts here. Maybe those
1107 * checks should be done every time?
1108 */
1109 Assert(info[dim].nvalues >= 0);
1110 Assert(info[dim].nbytes >= 0);
1111
1112 expected_size += info[dim].nbytes;
1113 }
1114
1115 /*
1116 * Now we know the total expected MCV size, including all the pieces
1117 * (header, dimension info. items and deduplicated data). So do the final
1118 * check on size.
1119 */
1120 if (VARSIZE_ANY(data) != expected_size)
1121 elog(ERROR, "invalid MCV size %zd (expected %zu)",
1122 VARSIZE_ANY(data), expected_size);
1123
1124 /*
1125 * We need an array of Datum values for each dimension, so that we can
1126 * easily translate the uint16 indexes later. We also need a top-level
1127 * array of pointers to those per-dimension arrays.
1128 *
1129 * While allocating the arrays for dimensions, compute how much space we
1130 * need for a copy of the by-ref data, as we can't simply point to the
1131 * original values (it might go away).
1132 */
1133 datalen = 0; /* space for by-ref data */
1134 map = (Datum **) palloc(ndims * sizeof(Datum *));
1135
1136 for (dim = 0; dim < ndims; dim++)
1137 {
1138 map[dim] = (Datum *) palloc(sizeof(Datum) * info[dim].nvalues);
1139
1140 /* space needed for a copy of data for by-ref types */
1141 datalen += info[dim].nbytes_aligned;
1142 }
1143
1144 /*
1145 * Now resize the MCV list so that the allocation includes all the data.
1146 *
1147 * Allocate space for a copy of the data, as we can't simply reference the
1148 * serialized data - it's not aligned properly, and it may disappear while
1149 * we're still using the MCV list, e.g. due to catcache release.
1150 *
1151 * We do care about alignment here, because we will allocate all the
1152 * pieces at once, but then use pointers to different parts.
1153 */
1154 mcvlen = MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) * nitems));
1155
1156 /* arrays of values and isnull flags for all MCV items */
1157 mcvlen += nitems * MAXALIGN(sizeof(Datum) * ndims);
1158 mcvlen += nitems * MAXALIGN(sizeof(bool) * ndims);
1159
1160 /* we don't quite need to align this, but it makes some asserts easier */
1161 mcvlen += MAXALIGN(datalen);
1162
1163 /* now resize the deserialized MCV list, and compute pointers to parts */
1164 mcvlist = repalloc(mcvlist, mcvlen);
1165
1166 /* pointer to the beginning of values/isnull arrays */
1167 valuesptr = (char *) mcvlist
1168 + MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) * nitems));
1169
1170 isnullptr = valuesptr + (nitems * MAXALIGN(sizeof(Datum) * ndims));
1171
1172 dataptr = isnullptr + (nitems * MAXALIGN(sizeof(bool) * ndims));
1173
1174 /*
1175 * Build mapping (index => value) for translating the serialized data into
1176 * the in-memory representation.
1177 */
1178 for (dim = 0; dim < ndims; dim++)
1179 {
1180 /* remember start position in the input array */
1181 char *start PG_USED_FOR_ASSERTS_ONLY = ptr;
1182
1183 if (info[dim].typbyval)
1184 {
1185 /* for by-val types we simply copy data into the mapping */
1186 for (i = 0; i < info[dim].nvalues; i++)
1187 {
1188 Datum v = 0;
1189
1190 memcpy(&v, ptr, info[dim].typlen);
1191 ptr += info[dim].typlen;
1192
1193 map[dim][i] = fetch_att(&v, true, info[dim].typlen);
1194
1195 /* no under/overflow of input array */
1196 Assert(ptr <= (start + info[dim].nbytes));
1197 }
1198 }
1199 else
1200 {
1201 /* for by-ref types we need to also make a copy of the data */
1202
1203 /* passed by reference, but fixed length (name, tid, ...) */
1204 if (info[dim].typlen > 0)
1205 {
1206 for (i = 0; i < info[dim].nvalues; i++)
1207 {
1208 memcpy(dataptr, ptr, info[dim].typlen);
1209 ptr += info[dim].typlen;
1210
1211 /* just point into the array */
1212 map[dim][i] = PointerGetDatum(dataptr);
1213 dataptr += MAXALIGN(info[dim].typlen);
1214 }
1215 }
1216 else if (info[dim].typlen == -1)
1217 {
1218 /* varlena */
1219 for (i = 0; i < info[dim].nvalues; i++)
1220 {
1221 uint32 len;
1222
1223 /* read the uint32 length */
1224 memcpy(&len, ptr, sizeof(uint32));
1225 ptr += sizeof(uint32);
1226
1227 /* the length is data-only */
1228 SET_VARSIZE(dataptr, len + VARHDRSZ);
1229 memcpy(VARDATA(dataptr), ptr, len);
1230 ptr += len;
1231
1232 /* just point into the array */
1233 map[dim][i] = PointerGetDatum(dataptr);
1234
1235 /* skip to place of the next deserialized value */
1236 dataptr += MAXALIGN(len + VARHDRSZ);
1237 }
1238 }
1239 else if (info[dim].typlen == -2)
1240 {
1241 /* cstring */
1242 for (i = 0; i < info[dim].nvalues; i++)
1243 {
1244 uint32 len;
1245
1246 memcpy(&len, ptr, sizeof(uint32));
1247 ptr += sizeof(uint32);
1248
1249 memcpy(dataptr, ptr, len);
1250 ptr += len;
1251
1252 /* just point into the array */
1253 map[dim][i] = PointerGetDatum(dataptr);
1254 dataptr += MAXALIGN(len);
1255 }
1256 }
1257
1258 /* no under/overflow of input array */
1259 Assert(ptr <= (start + info[dim].nbytes));
1260
1261 /* no overflow of the output mcv value */
1262 Assert(dataptr <= ((char *) mcvlist + mcvlen));
1263 }
1264
1265 /* check we consumed input data for this dimension exactly */
1266 Assert(ptr == (start + info[dim].nbytes));
1267 }
1268
1269 /* we should have also filled the MCV list exactly */
1270 Assert(dataptr == ((char *) mcvlist + mcvlen));
1271
1272 /* deserialize the MCV items and translate the indexes to Datums */
1273 for (i = 0; i < nitems; i++)
1274 {
1275 MCVItem *item = &mcvlist->items[i];
1276
1277 item->values = (Datum *) valuesptr;
1278 valuesptr += MAXALIGN(sizeof(Datum) * ndims);
1279
1280 item->isnull = (bool *) isnullptr;
1281 isnullptr += MAXALIGN(sizeof(bool) * ndims);
1282
1283 memcpy(item->isnull, ptr, sizeof(bool) * ndims);
1284 ptr += sizeof(bool) * ndims;
1285
1286 memcpy(&item->frequency, ptr, sizeof(double));
1287 ptr += sizeof(double);
1288
1289 memcpy(&item->base_frequency, ptr, sizeof(double));
1290 ptr += sizeof(double);
1291
1292 /* finally translate the indexes (for non-NULL only) */
1293 for (dim = 0; dim < ndims; dim++)
1294 {
1295 uint16 index;
1296
1297 memcpy(&index, ptr, sizeof(uint16));
1298 ptr += sizeof(uint16);
1299
1300 if (item->isnull[dim])
1301 continue;
1302
1303 item->values[dim] = map[dim][index];
1304 }
1305
1306 /* check we're not overflowing the input */
1307 Assert(ptr <= endptr);
1308 }
1309
1310 /* check that we processed all the data */
1311 Assert(ptr == endptr);
1312
1313 /* release the buffers used for mapping */
1314 for (dim = 0; dim < ndims; dim++)
1315 pfree(map[dim]);
1316
1317 pfree(map);
1318
1319 return mcvlist;
1320 }
1321
1322 /*
1323 * SRF with details about buckets of a histogram:
1324 *
1325 * - item ID (0...nitems)
1326 * - values (string array)
1327 * - nulls only (boolean array)
1328 * - frequency (double precision)
1329 * - base_frequency (double precision)
1330 *
1331 * The input is the OID of the statistics, and there are no rows returned if
1332 * the statistics contains no histogram.
1333 */
1334 Datum
pg_stats_ext_mcvlist_items(PG_FUNCTION_ARGS)1335 pg_stats_ext_mcvlist_items(PG_FUNCTION_ARGS)
1336 {
1337 FuncCallContext *funcctx;
1338
1339 /* stuff done only on the first call of the function */
1340 if (SRF_IS_FIRSTCALL())
1341 {
1342 MemoryContext oldcontext;
1343 MCVList *mcvlist;
1344 TupleDesc tupdesc;
1345
1346 /* create a function context for cross-call persistence */
1347 funcctx = SRF_FIRSTCALL_INIT();
1348
1349 /* switch to memory context appropriate for multiple function calls */
1350 oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
1351
1352 mcvlist = statext_mcv_deserialize(PG_GETARG_BYTEA_P(0));
1353
1354 funcctx->user_fctx = mcvlist;
1355
1356 /* total number of tuples to be returned */
1357 funcctx->max_calls = 0;
1358 if (funcctx->user_fctx != NULL)
1359 funcctx->max_calls = mcvlist->nitems;
1360
1361 /* Build a tuple descriptor for our result type */
1362 if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1363 ereport(ERROR,
1364 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1365 errmsg("function returning record called in context "
1366 "that cannot accept type record")));
1367 tupdesc = BlessTupleDesc(tupdesc);
1368
1369 /*
1370 * generate attribute metadata needed later to produce tuples from raw
1371 * C strings
1372 */
1373 funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
1374
1375 MemoryContextSwitchTo(oldcontext);
1376 }
1377
1378 /* stuff done on every call of the function */
1379 funcctx = SRF_PERCALL_SETUP();
1380
1381 if (funcctx->call_cntr < funcctx->max_calls) /* do when there is more
1382 * left to send */
1383 {
1384 Datum values[5];
1385 bool nulls[5];
1386 HeapTuple tuple;
1387 Datum result;
1388 ArrayBuildState *astate_values = NULL;
1389 ArrayBuildState *astate_nulls = NULL;
1390
1391 int i;
1392 MCVList *mcvlist;
1393 MCVItem *item;
1394
1395 mcvlist = (MCVList *) funcctx->user_fctx;
1396
1397 Assert(funcctx->call_cntr < mcvlist->nitems);
1398
1399 item = &mcvlist->items[funcctx->call_cntr];
1400
1401 for (i = 0; i < mcvlist->ndimensions; i++)
1402 {
1403
1404 astate_nulls = accumArrayResult(astate_nulls,
1405 BoolGetDatum(item->isnull[i]),
1406 false,
1407 BOOLOID,
1408 CurrentMemoryContext);
1409
1410 if (!item->isnull[i])
1411 {
1412 bool isvarlena;
1413 Oid outfunc;
1414 FmgrInfo fmgrinfo;
1415 Datum val;
1416 text *txt;
1417
1418 /* lookup output func for the type */
1419 getTypeOutputInfo(mcvlist->types[i], &outfunc, &isvarlena);
1420 fmgr_info(outfunc, &fmgrinfo);
1421
1422 val = FunctionCall1(&fmgrinfo, item->values[i]);
1423 txt = cstring_to_text(DatumGetPointer(val));
1424
1425 astate_values = accumArrayResult(astate_values,
1426 PointerGetDatum(txt),
1427 false,
1428 TEXTOID,
1429 CurrentMemoryContext);
1430 }
1431 else
1432 astate_values = accumArrayResult(astate_values,
1433 (Datum) 0,
1434 true,
1435 TEXTOID,
1436 CurrentMemoryContext);
1437 }
1438
1439 values[0] = Int32GetDatum(funcctx->call_cntr);
1440 values[1] = PointerGetDatum(makeArrayResult(astate_values, CurrentMemoryContext));
1441 values[2] = PointerGetDatum(makeArrayResult(astate_nulls, CurrentMemoryContext));
1442 values[3] = Float8GetDatum(item->frequency);
1443 values[4] = Float8GetDatum(item->base_frequency);
1444
1445 /* no NULLs in the tuple */
1446 memset(nulls, 0, sizeof(nulls));
1447
1448 /* build a tuple */
1449 tuple = heap_form_tuple(funcctx->attinmeta->tupdesc, values, nulls);
1450
1451 /* make the tuple into a datum */
1452 result = HeapTupleGetDatum(tuple);
1453
1454 SRF_RETURN_NEXT(funcctx, result);
1455 }
1456 else /* do when there is no more left */
1457 {
1458 SRF_RETURN_DONE(funcctx);
1459 }
1460 }
1461
1462 /*
1463 * pg_mcv_list_in - input routine for type pg_mcv_list.
1464 *
1465 * pg_mcv_list is real enough to be a table column, but it has no operations
1466 * of its own, and disallows input too
1467 */
1468 Datum
pg_mcv_list_in(PG_FUNCTION_ARGS)1469 pg_mcv_list_in(PG_FUNCTION_ARGS)
1470 {
1471 /*
1472 * pg_mcv_list stores the data in binary form and parsing text input is
1473 * not needed, so disallow this.
1474 */
1475 ereport(ERROR,
1476 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1477 errmsg("cannot accept a value of type %s", "pg_mcv_list")));
1478
1479 PG_RETURN_VOID(); /* keep compiler quiet */
1480 }
1481
1482
1483 /*
1484 * pg_mcv_list_out - output routine for type pg_mcv_list.
1485 *
1486 * MCV lists are serialized into a bytea value, so we simply call byteaout()
1487 * to serialize the value into text. But it'd be nice to serialize that into
1488 * a meaningful representation (e.g. for inspection by people).
1489 *
1490 * XXX This should probably return something meaningful, similar to what
1491 * pg_dependencies_out does. Not sure how to deal with the deduplicated
1492 * values, though - do we want to expand that or not?
1493 */
1494 Datum
pg_mcv_list_out(PG_FUNCTION_ARGS)1495 pg_mcv_list_out(PG_FUNCTION_ARGS)
1496 {
1497 return byteaout(fcinfo);
1498 }
1499
1500 /*
1501 * pg_mcv_list_recv - binary input routine for type pg_mcv_list.
1502 */
1503 Datum
pg_mcv_list_recv(PG_FUNCTION_ARGS)1504 pg_mcv_list_recv(PG_FUNCTION_ARGS)
1505 {
1506 ereport(ERROR,
1507 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1508 errmsg("cannot accept a value of type %s", "pg_mcv_list")));
1509
1510 PG_RETURN_VOID(); /* keep compiler quiet */
1511 }
1512
1513 /*
1514 * pg_mcv_list_send - binary output routine for type pg_mcv_list.
1515 *
1516 * MCV lists are serialized in a bytea value (although the type is named
1517 * differently), so let's just send that.
1518 */
1519 Datum
pg_mcv_list_send(PG_FUNCTION_ARGS)1520 pg_mcv_list_send(PG_FUNCTION_ARGS)
1521 {
1522 return byteasend(fcinfo);
1523 }
1524
1525 /*
1526 * mcv_get_match_bitmap
1527 * Evaluate clauses using the MCV list, and update the match bitmap.
1528 *
1529 * A match bitmap keeps match/mismatch status for each MCV item, and we
1530 * update it based on additional clauses. We also use it to skip items
1531 * that can't possibly match (e.g. item marked as "mismatch" can't change
1532 * to "match" when evaluating AND clause list).
1533 *
1534 * The function also returns a flag indicating whether there was an
1535 * equality condition for all attributes, the minimum frequency in the MCV
1536 * list, and a total MCV frequency (sum of frequencies for all items).
1537 *
1538 * XXX Currently the match bitmap uses a bool for each MCV item, which is
1539 * somewhat wasteful as we could do with just a single bit, thus reducing
1540 * the size to ~1/8. It would also allow us to combine bitmaps simply using
1541 * & and |, which should be faster than min/max. The bitmaps are fairly
1542 * small, though (thanks to the cap on the MCV list size).
1543 */
1544 static bool *
mcv_get_match_bitmap(PlannerInfo * root,List * clauses,Bitmapset * keys,MCVList * mcvlist,bool is_or)1545 mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
1546 Bitmapset *keys, MCVList *mcvlist, bool is_or)
1547 {
1548 int i;
1549 ListCell *l;
1550 bool *matches;
1551
1552 /* The bitmap may be partially built. */
1553 Assert(clauses != NIL);
1554 Assert(list_length(clauses) >= 1);
1555 Assert(mcvlist != NULL);
1556 Assert(mcvlist->nitems > 0);
1557 Assert(mcvlist->nitems <= STATS_MCVLIST_MAX_ITEMS);
1558
1559 matches = palloc(sizeof(bool) * mcvlist->nitems);
1560 memset(matches, (is_or) ? false : true,
1561 sizeof(bool) * mcvlist->nitems);
1562
1563 /*
1564 * Loop through the list of clauses, and for each of them evaluate all the
1565 * MCV items not yet eliminated by the preceding clauses.
1566 */
1567 foreach(l, clauses)
1568 {
1569 Node *clause = (Node *) lfirst(l);
1570
1571 /* if it's a RestrictInfo, then extract the clause */
1572 if (IsA(clause, RestrictInfo))
1573 clause = (Node *) ((RestrictInfo *) clause)->clause;
1574
1575 /*
1576 * Handle the various types of clauses - OpClause, NullTest and
1577 * AND/OR/NOT
1578 */
1579 if (is_opclause(clause))
1580 {
1581 OpExpr *expr = (OpExpr *) clause;
1582 FmgrInfo opproc;
1583
1584 /* valid only after examine_clause_args returns true */
1585 Var *var;
1586 Const *cst;
1587 bool varonleft;
1588
1589 fmgr_info(get_opcode(expr->opno), &opproc);
1590
1591 /* extract the var and const from the expression */
1592 if (examine_clause_args(expr->args, &var, &cst, &varonleft))
1593 {
1594 int idx;
1595
1596 /* match the attribute to a dimension of the statistic */
1597 idx = bms_member_index(keys, var->varattno);
1598
1599 /*
1600 * Walk through the MCV items and evaluate the current clause.
1601 * We can skip items that were already ruled out, and
1602 * terminate if there are no remaining MCV items that might
1603 * possibly match.
1604 */
1605 for (i = 0; i < mcvlist->nitems; i++)
1606 {
1607 bool match = true;
1608 MCVItem *item = &mcvlist->items[i];
1609
1610 /*
1611 * When the MCV item or the Const value is NULL we can
1612 * treat this as a mismatch. We must not call the operator
1613 * because of strictness.
1614 */
1615 if (item->isnull[idx] || cst->constisnull)
1616 {
1617 matches[i] = RESULT_MERGE(matches[i], is_or, false);
1618 continue;
1619 }
1620
1621 /*
1622 * Skip MCV items that can't change result in the bitmap.
1623 * Once the value gets false for AND-lists, or true for
1624 * OR-lists, we don't need to look at more clauses.
1625 */
1626 if (RESULT_IS_FINAL(matches[i], is_or))
1627 continue;
1628
1629 /*
1630 * First check whether the constant is below the lower
1631 * boundary (in that case we can skip the bucket, because
1632 * there's no overlap).
1633 *
1634 * We don't store collations used to build the statistics,
1635 * but we can use the collation for the attribute itself,
1636 * as stored in varcollid. We do reset the statistics
1637 * after a type change (including collation change), so
1638 * this is OK. We may need to relax this after allowing
1639 * extended statistics on expressions.
1640 */
1641 if (varonleft)
1642 match = DatumGetBool(FunctionCall2Coll(&opproc,
1643 var->varcollid,
1644 item->values[idx],
1645 cst->constvalue));
1646 else
1647 match = DatumGetBool(FunctionCall2Coll(&opproc,
1648 var->varcollid,
1649 cst->constvalue,
1650 item->values[idx]));
1651
1652 /* update the match bitmap with the result */
1653 matches[i] = RESULT_MERGE(matches[i], is_or, match);
1654 }
1655 }
1656 }
1657 else if (IsA(clause, ScalarArrayOpExpr))
1658 {
1659 ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
1660 FmgrInfo opproc;
1661
1662 /* valid only after examine_clause_args returns true */
1663 Var *var;
1664 Const *cst;
1665 bool varonleft;
1666
1667 fmgr_info(get_opcode(expr->opno), &opproc);
1668
1669 /* extract the var and const from the expression */
1670 if (examine_clause_args(expr->args, &var, &cst, &varonleft))
1671 {
1672 int idx;
1673
1674 ArrayType *arrayval;
1675 int16 elmlen;
1676 bool elmbyval;
1677 char elmalign;
1678 int num_elems;
1679 Datum *elem_values;
1680 bool *elem_nulls;
1681
1682 /* ScalarArrayOpExpr has the Var always on the left */
1683 Assert(varonleft);
1684
1685 if (!cst->constisnull)
1686 {
1687 arrayval = DatumGetArrayTypeP(cst->constvalue);
1688 get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
1689 &elmlen, &elmbyval, &elmalign);
1690 deconstruct_array(arrayval,
1691 ARR_ELEMTYPE(arrayval),
1692 elmlen, elmbyval, elmalign,
1693 &elem_values, &elem_nulls, &num_elems);
1694 }
1695
1696 /* match the attribute to a dimension of the statistic */
1697 idx = bms_member_index(keys, var->varattno);
1698
1699 /*
1700 * Walk through the MCV items and evaluate the current clause.
1701 * We can skip items that were already ruled out, and
1702 * terminate if there are no remaining MCV items that might
1703 * possibly match.
1704 */
1705 for (i = 0; i < mcvlist->nitems; i++)
1706 {
1707 int j;
1708 bool match = (expr->useOr ? false : true);
1709 MCVItem *item = &mcvlist->items[i];
1710
1711 /*
1712 * When the MCV item or the Const value is NULL we can
1713 * treat this as a mismatch. We must not call the operator
1714 * because of strictness.
1715 */
1716 if (item->isnull[idx] || cst->constisnull)
1717 {
1718 matches[i] = RESULT_MERGE(matches[i], is_or, false);
1719 continue;
1720 }
1721
1722 /*
1723 * Skip MCV items that can't change result in the bitmap.
1724 * Once the value gets false for AND-lists, or true for
1725 * OR-lists, we don't need to look at more clauses.
1726 */
1727 if (RESULT_IS_FINAL(matches[i], is_or))
1728 continue;
1729
1730 for (j = 0; j < num_elems; j++)
1731 {
1732 Datum elem_value = elem_values[j];
1733 bool elem_isnull = elem_nulls[j];
1734 bool elem_match;
1735
1736 /* NULL values always evaluate as not matching. */
1737 if (elem_isnull)
1738 {
1739 match = RESULT_MERGE(match, expr->useOr, false);
1740 continue;
1741 }
1742
1743 /*
1744 * Stop evaluating the array elements once we reach
1745 * match value that can't change - ALL() is the same
1746 * as AND-list, ANY() is the same as OR-list.
1747 */
1748 if (RESULT_IS_FINAL(match, expr->useOr))
1749 break;
1750
1751 elem_match = DatumGetBool(FunctionCall2Coll(&opproc,
1752 var->varcollid,
1753 item->values[idx],
1754 elem_value));
1755
1756 match = RESULT_MERGE(match, expr->useOr, elem_match);
1757 }
1758
1759 /* update the match bitmap with the result */
1760 matches[i] = RESULT_MERGE(matches[i], is_or, match);
1761 }
1762 }
1763 }
1764 else if (IsA(clause, NullTest))
1765 {
1766 NullTest *expr = (NullTest *) clause;
1767 Var *var = (Var *) (expr->arg);
1768
1769 /* match the attribute to a dimension of the statistic */
1770 int idx = bms_member_index(keys, var->varattno);
1771
1772 /*
1773 * Walk through the MCV items and evaluate the current clause. We
1774 * can skip items that were already ruled out, and terminate if
1775 * there are no remaining MCV items that might possibly match.
1776 */
1777 for (i = 0; i < mcvlist->nitems; i++)
1778 {
1779 bool match = false; /* assume mismatch */
1780 MCVItem *item = &mcvlist->items[i];
1781
1782 /* if the clause mismatches the MCV item, update the bitmap */
1783 switch (expr->nulltesttype)
1784 {
1785 case IS_NULL:
1786 match = (item->isnull[idx]) ? true : match;
1787 break;
1788
1789 case IS_NOT_NULL:
1790 match = (!item->isnull[idx]) ? true : match;
1791 break;
1792 }
1793
1794 /* now, update the match bitmap, depending on OR/AND type */
1795 matches[i] = RESULT_MERGE(matches[i], is_or, match);
1796 }
1797 }
1798 else if (is_orclause(clause) || is_andclause(clause))
1799 {
1800 /* AND/OR clause, with all subclauses being compatible */
1801
1802 int i;
1803 BoolExpr *bool_clause = ((BoolExpr *) clause);
1804 List *bool_clauses = bool_clause->args;
1805
1806 /* match/mismatch bitmap for each MCV item */
1807 bool *bool_matches = NULL;
1808
1809 Assert(bool_clauses != NIL);
1810 Assert(list_length(bool_clauses) >= 2);
1811
1812 /* build the match bitmap for the OR-clauses */
1813 bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys,
1814 mcvlist, is_orclause(clause));
1815
1816 /*
1817 * Merge the bitmap produced by mcv_get_match_bitmap into the
1818 * current one. We need to consider if we're evaluating AND or OR
1819 * condition when merging the results.
1820 */
1821 for (i = 0; i < mcvlist->nitems; i++)
1822 matches[i] = RESULT_MERGE(matches[i], is_or, bool_matches[i]);
1823
1824 pfree(bool_matches);
1825 }
1826 else if (is_notclause(clause))
1827 {
1828 /* NOT clause, with all subclauses compatible */
1829
1830 int i;
1831 BoolExpr *not_clause = ((BoolExpr *) clause);
1832 List *not_args = not_clause->args;
1833
1834 /* match/mismatch bitmap for each MCV item */
1835 bool *not_matches = NULL;
1836
1837 Assert(not_args != NIL);
1838 Assert(list_length(not_args) == 1);
1839
1840 /* build the match bitmap for the NOT-clause */
1841 not_matches = mcv_get_match_bitmap(root, not_args, keys,
1842 mcvlist, false);
1843
1844 /*
1845 * Merge the bitmap produced by mcv_get_match_bitmap into the
1846 * current one. We're handling a NOT clause, so invert the result
1847 * before merging it into the global bitmap.
1848 */
1849 for (i = 0; i < mcvlist->nitems; i++)
1850 matches[i] = RESULT_MERGE(matches[i], is_or, !not_matches[i]);
1851
1852 pfree(not_matches);
1853 }
1854 else if (IsA(clause, Var))
1855 {
1856 /* Var (has to be a boolean Var, possibly from below NOT) */
1857
1858 Var *var = (Var *) (clause);
1859
1860 /* match the attribute to a dimension of the statistic */
1861 int idx = bms_member_index(keys, var->varattno);
1862
1863 Assert(var->vartype == BOOLOID);
1864
1865 /*
1866 * Walk through the MCV items and evaluate the current clause. We
1867 * can skip items that were already ruled out, and terminate if
1868 * there are no remaining MCV items that might possibly match.
1869 */
1870 for (i = 0; i < mcvlist->nitems; i++)
1871 {
1872 MCVItem *item = &mcvlist->items[i];
1873 bool match = false;
1874
1875 /* if the item is NULL, it's a mismatch */
1876 if (!item->isnull[idx] && DatumGetBool(item->values[idx]))
1877 match = true;
1878
1879 /* update the result bitmap */
1880 matches[i] = RESULT_MERGE(matches[i], is_or, match);
1881 }
1882 }
1883 else
1884 elog(ERROR, "unknown clause type: %d", clause->type);
1885 }
1886
1887 return matches;
1888 }
1889
1890
1891 /*
1892 * mcv_clauselist_selectivity
1893 * Return the selectivity estimate computed using an MCV list.
1894 *
1895 * First builds a bitmap of MCV items matching the clauses, and then sums
1896 * the frequencies of matching items.
1897 *
1898 * It also produces two additional interesting selectivities - total
1899 * selectivity of all the MCV items (not just the matching ones), and the
1900 * base frequency computed on the assumption of independence.
1901 */
1902 Selectivity
mcv_clauselist_selectivity(PlannerInfo * root,StatisticExtInfo * stat,List * clauses,int varRelid,JoinType jointype,SpecialJoinInfo * sjinfo,RelOptInfo * rel,Selectivity * basesel,Selectivity * totalsel)1903 mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
1904 List *clauses, int varRelid,
1905 JoinType jointype, SpecialJoinInfo *sjinfo,
1906 RelOptInfo *rel,
1907 Selectivity *basesel, Selectivity *totalsel)
1908 {
1909 int i;
1910 MCVList *mcv;
1911 Selectivity s = 0.0;
1912
1913 /* match/mismatch bitmap for each MCV item */
1914 bool *matches = NULL;
1915
1916 /* load the MCV list stored in the statistics object */
1917 mcv = statext_mcv_load(stat->statOid);
1918
1919 /* build a match bitmap for the clauses */
1920 matches = mcv_get_match_bitmap(root, clauses, stat->keys, mcv, false);
1921
1922 /* sum frequencies for all the matching MCV items */
1923 *basesel = 0.0;
1924 *totalsel = 0.0;
1925 for (i = 0; i < mcv->nitems; i++)
1926 {
1927 *totalsel += mcv->items[i].frequency;
1928
1929 if (matches[i] != false)
1930 {
1931 /* XXX Shouldn't the basesel be outside the if condition? */
1932 *basesel += mcv->items[i].base_frequency;
1933 s += mcv->items[i].frequency;
1934 }
1935 }
1936
1937 return s;
1938 }
1939