1 /*-------------------------------------------------------------------------
2  *
3  * mcv.c
4  *	  POSTGRES multivariate MCV lists
5  *
6  *
7  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  *	  src/backend/statistics/mcv.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <math.h>
18 
19 #include "access/htup_details.h"
20 #include "catalog/pg_collation.h"
21 #include "catalog/pg_statistic_ext.h"
22 #include "catalog/pg_statistic_ext_data.h"
23 #include "fmgr.h"
24 #include "funcapi.h"
25 #include "nodes/nodeFuncs.h"
26 #include "optimizer/clauses.h"
27 #include "statistics/extended_stats_internal.h"
28 #include "statistics/statistics.h"
29 #include "utils/builtins.h"
30 #include "utils/bytea.h"
31 #include "utils/fmgroids.h"
32 #include "utils/fmgrprotos.h"
33 #include "utils/lsyscache.h"
34 #include "utils/syscache.h"
35 #include "utils/typcache.h"
36 
37 /*
38  * Computes size of a serialized MCV item, depending on the number of
39  * dimensions (columns) the statistic is defined on. The datum values are
40  * stored in a separate array (deduplicated, to minimize the size), and
41  * so the serialized items only store uint16 indexes into that array.
42  *
43  * Each serialized item stores (in this order):
44  *
45  * - indexes to values	  (ndim * sizeof(uint16))
46  * - null flags			  (ndim * sizeof(bool))
47  * - frequency			  (sizeof(double))
48  * - base_frequency		  (sizeof(double))
49  *
50  * There is no alignment padding within an MCV item.
51  * So in total each MCV item requires this many bytes:
52  *
53  *	 ndim * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double)
54  */
55 #define ITEM_SIZE(ndims)	\
56 	((ndims) * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double))
57 
58 /*
59  * Used to compute size of serialized MCV list representation.
60  */
61 #define MinSizeOfMCVList		\
62 	(VARHDRSZ + sizeof(uint32) * 3 + sizeof(AttrNumber))
63 
64 /*
65  * Size of the serialized MCV list, excluding the space needed for
66  * deduplicated per-dimension values. The macro is meant to be used
67  * when it's not yet safe to access the serialized info about amount
68  * of data for each column.
69  */
70 #define SizeOfMCVList(ndims,nitems)	\
71 	((MinSizeOfMCVList + sizeof(Oid) * (ndims)) + \
72 	 ((ndims) * sizeof(DimensionInfo)) + \
73 	 ((nitems) * ITEM_SIZE(ndims)))
74 
75 static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs);
76 
77 static SortItem *build_distinct_groups(int numrows, SortItem *items,
78 									   MultiSortSupport mss, int *ndistinct);
79 
80 static SortItem **build_column_frequencies(SortItem *groups, int ngroups,
81 										   MultiSortSupport mss, int *ncounts);
82 
83 static int	count_distinct_groups(int numrows, SortItem *items,
84 								  MultiSortSupport mss);
85 
86 /*
87  * Compute new value for bitmap item, considering whether it's used for
88  * clauses connected by AND/OR.
89  */
90 #define RESULT_MERGE(value, is_or, match) \
91 	((is_or) ? ((value) || (match)) : ((value) && (match)))
92 
93 /*
94  * When processing a list of clauses, the bitmap item may get set to a value
95  * such that additional clauses can't change it. For example, when processing
96  * a list of clauses connected to AND, as soon as the item gets set to 'false'
97  * then it'll remain like that. Similarly clauses connected by OR and 'true'.
98  *
99  * Returns true when the value in the bitmap can't change no matter how the
100  * remaining clauses are evaluated.
101  */
102 #define RESULT_IS_FINAL(value, is_or)	((is_or) ? (value) : (!(value)))
103 
104 /*
105  * get_mincount_for_mcv_list
106  * 		Determine the minimum number of times a value needs to appear in
107  * 		the sample for it to be included in the MCV list.
108  *
109  * We want to keep only values that appear sufficiently often in the
110  * sample that it is reasonable to extrapolate their sample frequencies to
111  * the entire table.  We do this by placing an upper bound on the relative
112  * standard error of the sample frequency, so that any estimates the
113  * planner generates from the MCV statistics can be expected to be
114  * reasonably accurate.
115  *
116  * Since we are sampling without replacement, the sample frequency of a
117  * particular value is described by a hypergeometric distribution.  A
118  * common rule of thumb when estimating errors in this situation is to
119  * require at least 10 instances of the value in the sample, in which case
120  * the distribution can be approximated by a normal distribution, and
121  * standard error analysis techniques can be applied.  Given a sample size
122  * of n, a population size of N, and a sample frequency of p=cnt/n, the
123  * standard error of the proportion p is given by
124  *		SE = sqrt(p*(1-p)/n) * sqrt((N-n)/(N-1))
125  * where the second term is the finite population correction.  To get
126  * reasonably accurate planner estimates, we impose an upper bound on the
127  * relative standard error of 20% -- i.e., SE/p < 0.2.  This 20% relative
128  * error bound is fairly arbitrary, but has been found empirically to work
129  * well.  Rearranging this formula gives a lower bound on the number of
130  * instances of the value seen:
131  *		cnt > n*(N-n) / (N-n+0.04*n*(N-1))
132  * This bound is at most 25, and approaches 0 as n approaches 0 or N. The
133  * case where n approaches 0 cannot happen in practice, since the sample
134  * size is at least 300.  The case where n approaches N corresponds to
135  * sampling the whole the table, in which case it is reasonable to keep
136  * the whole MCV list (have no lower bound), so it makes sense to apply
137  * this formula for all inputs, even though the above derivation is
138  * technically only valid when the right hand side is at least around 10.
139  *
140  * An alternative way to look at this formula is as follows -- assume that
141  * the number of instances of the value seen scales up to the entire
142  * table, so that the population count is K=N*cnt/n. Then the distribution
143  * in the sample is a hypergeometric distribution parameterised by N, n
144  * and K, and the bound above is mathematically equivalent to demanding
145  * that the standard deviation of that distribution is less than 20% of
146  * its mean.  Thus the relative errors in any planner estimates produced
147  * from the MCV statistics are likely to be not too large.
148  */
149 static double
get_mincount_for_mcv_list(int samplerows,double totalrows)150 get_mincount_for_mcv_list(int samplerows, double totalrows)
151 {
152 	double		n = samplerows;
153 	double		N = totalrows;
154 	double		numer,
155 				denom;
156 
157 	numer = n * (N - n);
158 	denom = N - n + 0.04 * n * (N - 1);
159 
160 	/* Guard against division by zero (possible if n = N = 1) */
161 	if (denom == 0.0)
162 		return 0.0;
163 
164 	return numer / denom;
165 }
166 
167 /*
168  * Builds MCV list from the set of sampled rows.
169  *
170  * The algorithm is quite simple:
171  *
172  *	   (1) sort the data (default collation, '<' for the data type)
173  *
174  *	   (2) count distinct groups, decide how many to keep
175  *
176  *	   (3) build the MCV list using the threshold determined in (2)
177  *
178  *	   (4) remove rows represented by the MCV from the sample
179  *
180  */
181 MCVList *
statext_mcv_build(int numrows,HeapTuple * rows,Bitmapset * attrs,VacAttrStats ** stats,double totalrows)182 statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
183 				  VacAttrStats **stats, double totalrows)
184 {
185 	int			i,
186 				numattrs,
187 				ngroups,
188 				nitems;
189 	AttrNumber *attnums;
190 	double		mincount;
191 	SortItem   *items;
192 	SortItem   *groups;
193 	MCVList    *mcvlist = NULL;
194 	MultiSortSupport mss;
195 
196 	attnums = build_attnums_array(attrs, &numattrs);
197 
198 	/* comparator for all the columns */
199 	mss = build_mss(stats, numattrs);
200 
201 	/* sort the rows */
202 	items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
203 							   mss, numattrs, attnums);
204 
205 	if (!items)
206 		return NULL;
207 
208 	/* transform the sorted rows into groups (sorted by frequency) */
209 	groups = build_distinct_groups(nitems, items, mss, &ngroups);
210 
211 	/*
212 	 * Maximum number of MCV items to store, based on the attribute with the
213 	 * largest stats target (and the number of groups we have available).
214 	 */
215 	nitems = stats[0]->attr->attstattarget;
216 	for (i = 1; i < numattrs; i++)
217 	{
218 		if (stats[i]->attr->attstattarget > nitems)
219 			nitems = stats[i]->attr->attstattarget;
220 	}
221 	if (nitems > ngroups)
222 		nitems = ngroups;
223 
224 	/*
225 	 * Decide how many items to keep in the MCV list. We can't use the same
226 	 * algorithm as per-column MCV lists, because that only considers the
227 	 * actual group frequency - but we're primarily interested in how the
228 	 * actual frequency differs from the base frequency (product of simple
229 	 * per-column frequencies, as if the columns were independent).
230 	 *
231 	 * Using the same algorithm might exclude items that are close to the
232 	 * "average" frequency of the sample. But that does not say whether the
233 	 * observed frequency is close to the base frequency or not. We also need
234 	 * to consider unexpectedly uncommon items (again, compared to the base
235 	 * frequency), and the single-column algorithm does not have to.
236 	 *
237 	 * We simply decide how many items to keep by computing minimum count
238 	 * using get_mincount_for_mcv_list() and then keep all items that seem to
239 	 * be more common than that.
240 	 */
241 	mincount = get_mincount_for_mcv_list(numrows, totalrows);
242 
243 	/*
244 	 * Walk the groups until we find the first group with a count below the
245 	 * mincount threshold (the index of that group is the number of groups we
246 	 * want to keep).
247 	 */
248 	for (i = 0; i < nitems; i++)
249 	{
250 		if (groups[i].count < mincount)
251 		{
252 			nitems = i;
253 			break;
254 		}
255 	}
256 
257 	/*
258 	 * At this point we know the number of items for the MCV list. There might
259 	 * be none (for uniform distribution with many groups), and in that case
260 	 * there will be no MCV list. Otherwise construct the MCV list.
261 	 */
262 	if (nitems > 0)
263 	{
264 		int			j;
265 		SortItem	key;
266 		MultiSortSupport	tmp;
267 
268 		/* frequencies for values in each attribute */
269 		SortItem  **freqs;
270 		int		   *nfreqs;
271 
272 		/* used to search values */
273 		tmp = (MultiSortSupport) palloc(offsetof(MultiSortSupportData, ssup)
274 										+ sizeof(SortSupportData));
275 
276 		/* compute frequencies for values in each column */
277 		nfreqs = (int *) palloc0(sizeof(int) * numattrs);
278 		freqs = build_column_frequencies(groups, ngroups, mss, nfreqs);
279 
280 		/*
281 		 * Allocate the MCV list structure, set the global parameters.
282 		 */
283 		mcvlist = (MCVList *) palloc0(offsetof(MCVList, items) +
284 									  sizeof(MCVItem) * nitems);
285 
286 		mcvlist->magic = STATS_MCV_MAGIC;
287 		mcvlist->type = STATS_MCV_TYPE_BASIC;
288 		mcvlist->ndimensions = numattrs;
289 		mcvlist->nitems = nitems;
290 
291 		/* store info about data type OIDs */
292 		for (i = 0; i < numattrs; i++)
293 			mcvlist->types[i] = stats[i]->attrtypid;
294 
295 		/* Copy the first chunk of groups into the result. */
296 		for (i = 0; i < nitems; i++)
297 		{
298 			/* just pointer to the proper place in the list */
299 			MCVItem    *item = &mcvlist->items[i];
300 
301 			item->values = (Datum *) palloc(sizeof(Datum) * numattrs);
302 			item->isnull = (bool *) palloc(sizeof(bool) * numattrs);
303 
304 			/* copy values for the group */
305 			memcpy(item->values, groups[i].values, sizeof(Datum) * numattrs);
306 			memcpy(item->isnull, groups[i].isnull, sizeof(bool) * numattrs);
307 
308 			/* groups should be sorted by frequency in descending order */
309 			Assert((i == 0) || (groups[i - 1].count >= groups[i].count));
310 
311 			/* group frequency */
312 			item->frequency = (double) groups[i].count / numrows;
313 
314 			/* base frequency, if the attributes were independent */
315 			item->base_frequency = 1.0;
316 			for (j = 0; j < numattrs; j++)
317 			{
318 				SortItem   *freq;
319 
320 				/* single dimension */
321 				tmp->ndims = 1;
322 				tmp->ssup[0] = mss->ssup[j];
323 
324 				/* fill search key */
325 				key.values = &groups[i].values[j];
326 				key.isnull = &groups[i].isnull[j];
327 
328 				freq = (SortItem *) bsearch_arg(&key, freqs[j], nfreqs[j],
329 												sizeof(SortItem),
330 												multi_sort_compare, tmp);
331 
332 				item->base_frequency *= ((double) freq->count) / numrows;
333 			}
334 		}
335 
336 		pfree(nfreqs);
337 		pfree(freqs);
338 	}
339 
340 	pfree(items);
341 	pfree(groups);
342 
343 	return mcvlist;
344 }
345 
346 /*
347  * build_mss
348  *	build MultiSortSupport for the attributes passed in attrs
349  */
350 static MultiSortSupport
build_mss(VacAttrStats ** stats,int numattrs)351 build_mss(VacAttrStats **stats, int numattrs)
352 {
353 	int			i;
354 
355 	/* Sort by multiple columns (using array of SortSupport) */
356 	MultiSortSupport mss = multi_sort_init(numattrs);
357 
358 	/* prepare the sort functions for all the attributes */
359 	for (i = 0; i < numattrs; i++)
360 	{
361 		VacAttrStats *colstat = stats[i];
362 		TypeCacheEntry *type;
363 
364 		type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
365 		if (type->lt_opr == InvalidOid) /* shouldn't happen */
366 			elog(ERROR, "cache lookup failed for ordering operator for type %u",
367 				 colstat->attrtypid);
368 
369 		multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
370 	}
371 
372 	return mss;
373 }
374 
375 /*
376  * count_distinct_groups
377  *	count distinct combinations of SortItems in the array
378  *
379  * The array is assumed to be sorted according to the MultiSortSupport.
380  */
381 static int
count_distinct_groups(int numrows,SortItem * items,MultiSortSupport mss)382 count_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss)
383 {
384 	int			i;
385 	int			ndistinct;
386 
387 	ndistinct = 1;
388 	for (i = 1; i < numrows; i++)
389 	{
390 		/* make sure the array really is sorted */
391 		Assert(multi_sort_compare(&items[i], &items[i - 1], mss) >= 0);
392 
393 		if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
394 			ndistinct += 1;
395 	}
396 
397 	return ndistinct;
398 }
399 
400 /*
401  * compare_sort_item_count
402  *	comparator for sorting items by count (frequencies) in descending order
403  */
404 static int
compare_sort_item_count(const void * a,const void * b)405 compare_sort_item_count(const void *a, const void *b)
406 {
407 	SortItem   *ia = (SortItem *) a;
408 	SortItem   *ib = (SortItem *) b;
409 
410 	if (ia->count == ib->count)
411 		return 0;
412 	else if (ia->count > ib->count)
413 		return -1;
414 
415 	return 1;
416 }
417 
418 /*
419  * build_distinct_groups
420  *	build an array of SortItems for distinct groups and counts matching items
421  *
422  * The input array is assumed to be sorted
423  */
424 static SortItem *
build_distinct_groups(int numrows,SortItem * items,MultiSortSupport mss,int * ndistinct)425 build_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss,
426 					  int *ndistinct)
427 {
428 	int			i,
429 				j;
430 	int			ngroups = count_distinct_groups(numrows, items, mss);
431 
432 	SortItem   *groups = (SortItem *) palloc(ngroups * sizeof(SortItem));
433 
434 	j = 0;
435 	groups[0] = items[0];
436 	groups[0].count = 1;
437 
438 	for (i = 1; i < numrows; i++)
439 	{
440 		/* Assume sorted in ascending order. */
441 		Assert(multi_sort_compare(&items[i], &items[i - 1], mss) >= 0);
442 
443 		/* New distinct group detected. */
444 		if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
445 		{
446 			groups[++j] = items[i];
447 			groups[j].count = 0;
448 		}
449 
450 		groups[j].count++;
451 	}
452 
453 	/* ensure we filled the expected number of distinct groups */
454 	Assert(j + 1 == ngroups);
455 
456 	/* Sort the distinct groups by frequency (in descending order). */
457 	pg_qsort((void *) groups, ngroups, sizeof(SortItem),
458 			 compare_sort_item_count);
459 
460 	*ndistinct = ngroups;
461 	return groups;
462 }
463 
464 /* compare sort items (single dimension) */
465 static int
sort_item_compare(const void * a,const void * b,void * arg)466 sort_item_compare(const void *a, const void *b, void *arg)
467 {
468 	SortSupport	ssup = (SortSupport) arg;
469 	SortItem   *ia = (SortItem *) a;
470 	SortItem   *ib = (SortItem *) b;
471 
472 	return ApplySortComparator(ia->values[0], ia->isnull[0],
473 							   ib->values[0], ib->isnull[0],
474 							   ssup);
475 }
476 
477 /*
478  * build_column_frequencies
479  *	compute frequencies of values in each column
480  *
481  * This returns an array of SortItems for each attibute the MCV is built
482  * on, with a frequency (number of occurrences) for each value. This is
483  * then used to compute "base" frequency of MCV items.
484  *
485  * All the memory is allocated in a single chunk, so that a single pfree
486  * is enough to release it. We do not allocate space for values/isnull
487  * arrays in the SortItems, because we can simply point into the input
488  * groups directly.
489  */
490 static SortItem **
build_column_frequencies(SortItem * groups,int ngroups,MultiSortSupport mss,int * ncounts)491 build_column_frequencies(SortItem *groups, int ngroups,
492 						 MultiSortSupport mss, int *ncounts)
493 {
494 	int			i,
495 				dim;
496 	SortItem  **result;
497 	char	   *ptr;
498 
499 	Assert(groups);
500 	Assert(ncounts);
501 
502 	/* allocate arrays for all columns as a single chunk */
503 	ptr = palloc(MAXALIGN(sizeof(SortItem *) * mss->ndims) +
504 		  mss->ndims * MAXALIGN(sizeof(SortItem) * ngroups));
505 
506 	/* initial array of pointers */
507 	result = (SortItem **) ptr;
508 	ptr += MAXALIGN(sizeof(SortItem *) * mss->ndims);
509 
510 	for (dim = 0; dim < mss->ndims; dim++)
511 	{
512 		SortSupport	ssup = &mss->ssup[dim];
513 
514 		/* array of values for a single column */
515 		result[dim] = (SortItem *) ptr;
516 		ptr += MAXALIGN(sizeof(SortItem) * ngroups);
517 
518 		/* extract data for the dimension */
519 		for (i = 0; i < ngroups; i++)
520 		{
521 			/* point into the input groups */
522 			result[dim][i].values = &groups[i].values[dim];
523 			result[dim][i].isnull = &groups[i].isnull[dim];
524 			result[dim][i].count = groups[i].count;
525 		}
526 
527 		/* sort the values, deduplicate */
528 		qsort_arg((void *) result[dim], ngroups, sizeof(SortItem),
529 				  sort_item_compare, ssup);
530 
531 		/*
532 		 * Identify distinct values, compute frequency (there might be
533 		 * multiple MCV items containing this value, so we need to sum
534 		 * counts from all of them.
535 		 */
536 		ncounts[dim] = 1;
537 		for (i = 1; i < ngroups; i++)
538 		{
539 			if (sort_item_compare(&result[dim][i-1], &result[dim][i], ssup) == 0)
540 			{
541 				result[dim][ncounts[dim]-1].count += result[dim][i].count;
542 				continue;
543 			}
544 
545 			result[dim][ncounts[dim]] = result[dim][i];
546 
547 			ncounts[dim]++;
548 		}
549 	}
550 
551 	return result;
552 }
553 
554 /*
555  * statext_mcv_load
556  *		Load the MCV list for the indicated pg_statistic_ext tuple
557  */
558 MCVList *
statext_mcv_load(Oid mvoid)559 statext_mcv_load(Oid mvoid)
560 {
561 	MCVList    *result;
562 	bool		isnull;
563 	Datum		mcvlist;
564 	HeapTuple	htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid));
565 
566 	if (!HeapTupleIsValid(htup))
567 		elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
568 
569 	mcvlist = SysCacheGetAttr(STATEXTDATASTXOID, htup,
570 							  Anum_pg_statistic_ext_data_stxdmcv, &isnull);
571 
572 	if (isnull)
573 		elog(ERROR,
574 			 "requested statistics kind \"%c\" is not yet built for statistics object %u",
575 			 STATS_EXT_DEPENDENCIES, mvoid);
576 
577 	result = statext_mcv_deserialize(DatumGetByteaP(mcvlist));
578 
579 	ReleaseSysCache(htup);
580 
581 	return result;
582 }
583 
584 
585 /*
586  * statext_mcv_serialize
587  *		Serialize MCV list into a pg_mcv_list value.
588  *
589  * The MCV items may include values of various data types, and it's reasonable
590  * to expect redundancy (values for a given attribute, repeated for multiple
591  * MCV list items). So we deduplicate the values into arrays, and then replace
592  * the values by indexes into those arrays.
593  *
594  * The overall structure of the serialized representation looks like this:
595  *
596  * +---------------+----------------+---------------------+-------+
597  * | header fields | dimension info | deduplicated values | items |
598  * +---------------+----------------+---------------------+-------+
599  *
600  * Where dimension info stores information about type of K-th attribute (e.g.
601  * typlen, typbyval and length of deduplicated values).  Deduplicated values
602  * store deduplicated values for each attribute.  And items store the actual
603  * MCV list items, with values replaced by indexes into the arrays.
604  *
605  * When serializing the items, we use uint16 indexes. The number of MCV items
606  * is limited by the statistics target (which is capped to 10k at the moment).
607  * We might increase this to 65k and still fit into uint16, so there's a bit of
608  * slack. Furthermore, this limit is on the number of distinct values per column,
609  * and we usually have few of those (and various combinations of them for the
610  * those MCV list). So uint16 seems fine for now.
611  *
612  * We don't really expect the serialization to save as much space as for
613  * histograms, as we are not doing any bucket splits (which is the source
614  * of high redundancy in histograms).
615  *
616  * TODO: Consider packing boolean flags (NULL) for each item into a single char
617  * (or a longer type) instead of using an array of bool items.
618  */
619 bytea *
statext_mcv_serialize(MCVList * mcvlist,VacAttrStats ** stats)620 statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats)
621 {
622 	int			i;
623 	int			dim;
624 	int			ndims = mcvlist->ndimensions;
625 
626 	SortSupport ssup;
627 	DimensionInfo *info;
628 
629 	Size		total_length;
630 
631 	/* serialized items (indexes into arrays, etc.) */
632 	bytea	   *raw;
633 	char	   *ptr;
634 	char	   *endptr PG_USED_FOR_ASSERTS_ONLY;
635 
636 	/* values per dimension (and number of non-NULL values) */
637 	Datum	  **values = (Datum **) palloc0(sizeof(Datum *) * ndims);
638 	int		   *counts = (int *) palloc0(sizeof(int) * ndims);
639 
640 	/*
641 	 * We'll include some rudimentary information about the attribute types
642 	 * (length, by-val flag), so that we don't have to look them up while
643 	 * deserializating the MCV list (we already have the type OID in the
644 	 * header).  This is safe, because when changing type of the attribute the
645 	 * statistics gets dropped automatically.  We need to store the info about
646 	 * the arrays of deduplicated values anyway.
647 	 */
648 	info = (DimensionInfo *) palloc0(sizeof(DimensionInfo) * ndims);
649 
650 	/* sort support data for all attributes included in the MCV list */
651 	ssup = (SortSupport) palloc0(sizeof(SortSupportData) * ndims);
652 
653 	/* collect and deduplicate values for each dimension (attribute) */
654 	for (dim = 0; dim < ndims; dim++)
655 	{
656 		int			ndistinct;
657 		TypeCacheEntry *typentry;
658 
659 		/*
660 		 * Lookup the LT operator (can't get it from stats extra_data, as we
661 		 * don't know how to interpret that - scalar vs. array etc.).
662 		 */
663 		typentry = lookup_type_cache(stats[dim]->attrtypid, TYPECACHE_LT_OPR);
664 
665 		/* copy important info about the data type (length, by-value) */
666 		info[dim].typlen = stats[dim]->attrtype->typlen;
667 		info[dim].typbyval = stats[dim]->attrtype->typbyval;
668 
669 		/* allocate space for values in the attribute and collect them */
670 		values[dim] = (Datum *) palloc0(sizeof(Datum) * mcvlist->nitems);
671 
672 		for (i = 0; i < mcvlist->nitems; i++)
673 		{
674 			/* skip NULL values - we don't need to deduplicate those */
675 			if (mcvlist->items[i].isnull[dim])
676 				continue;
677 
678 			/* append the value at the end */
679 			values[dim][counts[dim]] = mcvlist->items[i].values[dim];
680 			counts[dim] += 1;
681 		}
682 
683 		/* if there are just NULL values in this dimension, we're done */
684 		if (counts[dim] == 0)
685 			continue;
686 
687 		/* sort and deduplicate the data */
688 		ssup[dim].ssup_cxt = CurrentMemoryContext;
689 		ssup[dim].ssup_collation = stats[dim]->attrcollid;
690 		ssup[dim].ssup_nulls_first = false;
691 
692 		PrepareSortSupportFromOrderingOp(typentry->lt_opr, &ssup[dim]);
693 
694 		qsort_arg(values[dim], counts[dim], sizeof(Datum),
695 				  compare_scalars_simple, &ssup[dim]);
696 
697 		/*
698 		 * Walk through the array and eliminate duplicate values, but keep the
699 		 * ordering (so that we can do bsearch later). We know there's at
700 		 * least one item as (counts[dim] != 0), so we can skip the first
701 		 * element.
702 		 */
703 		ndistinct = 1;			/* number of distinct values */
704 		for (i = 1; i < counts[dim]; i++)
705 		{
706 			/* expect sorted array */
707 			Assert(compare_datums_simple(values[dim][i - 1], values[dim][i], &ssup[dim]) <= 0);
708 
709 			/* if the value is the same as the previous one, we can skip it */
710 			if (!compare_datums_simple(values[dim][i - 1], values[dim][i], &ssup[dim]))
711 				continue;
712 
713 			values[dim][ndistinct] = values[dim][i];
714 			ndistinct += 1;
715 		}
716 
717 		/* we must not exceed PG_UINT16_MAX, as we use uint16 indexes */
718 		Assert(ndistinct <= PG_UINT16_MAX);
719 
720 		/*
721 		 * Store additional info about the attribute - number of deduplicated
722 		 * values, and also size of the serialized data. For fixed-length data
723 		 * types this is trivial to compute, for varwidth types we need to
724 		 * actually walk the array and sum the sizes.
725 		 */
726 		info[dim].nvalues = ndistinct;
727 
728 		if (info[dim].typbyval)	/* by-value data types */
729 		{
730 			info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
731 
732 			/*
733 			 * We copy the data into the MCV item during deserialization, so
734 			 * we don't need to allocate any extra space.
735 			*/
736 			info[dim].nbytes_aligned = 0;
737 		}
738 		else if (info[dim].typlen > 0)		/* fixed-length by-ref */
739 		{
740 			/*
741 			 * We don't care about alignment in the serialized data, so we
742 			 * pack the data as much as possible. But we also track how much
743 			 * data will be needed after deserialization, and in that case
744 			 * we need to account for alignment of each item.
745 			 *
746 			 * Note: As the items are fixed-length, we could easily compute
747 			 * this during deserialization, but we do it here anyway.
748 			 */
749 			info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
750 			info[dim].nbytes_aligned = info[dim].nvalues * MAXALIGN(info[dim].typlen);
751 		}
752 		else if (info[dim].typlen == -1)	/* varlena */
753 		{
754 			info[dim].nbytes = 0;
755 			info[dim].nbytes_aligned = 0;
756 			for (i = 0; i < info[dim].nvalues; i++)
757 			{
758 				Size		len;
759 
760 				/*
761 				 * For varlena values, we detoast the values and store the
762 				 * length and data separately. We don't bother with alignment
763 				 * here, which means that during deserialization we need to
764 				 * copy the fields and only access the copies.
765 				 */
766 				values[dim][i] = PointerGetDatum(PG_DETOAST_DATUM(values[dim][i]));
767 
768 				/* serialized length (uint32 length + data) */
769 				len = VARSIZE_ANY_EXHDR(values[dim][i]);
770 				info[dim].nbytes += sizeof(uint32);	/* length */
771 				info[dim].nbytes += len;			/* value (no header) */
772 
773 				/*
774 				 * During deserialization we'll build regular varlena values
775 				 * with full headers, and we need to align them properly.
776 				 */
777 				info[dim].nbytes_aligned += MAXALIGN(VARHDRSZ + len);
778 			}
779 		}
780 		else if (info[dim].typlen == -2)	/* cstring */
781 		{
782 			info[dim].nbytes = 0;
783 			info[dim].nbytes_aligned = 0;
784 			for (i = 0; i < info[dim].nvalues; i++)
785 			{
786 				Size		len;
787 
788 				/*
789 				 * For cstring, we do similar thing as for varlena - first we
790 				 * store the length as uint32 and then the data. We don't care
791 				 * about alignment, which means that during deserialization we
792 				 * need to copy the fields and only access the copies.
793 				 */
794 
795 				/* c-strings include terminator, so +1 byte */
796 				len = strlen(DatumGetCString(values[dim][i])) + 1;
797 				info[dim].nbytes += sizeof(uint32);	/* length */
798 				info[dim].nbytes += len;			/* value */
799 
800 				/* space needed for properly aligned deserialized copies */
801 				info[dim].nbytes_aligned += MAXALIGN(len);
802 			}
803 		}
804 
805 		/* we know (count>0) so there must be some data */
806 		Assert(info[dim].nbytes > 0);
807 	}
808 
809 	/*
810 	 * Now we can finally compute how much space we'll actually need for the
811 	 * whole serialized MCV list (varlena header, MCV header, dimension info
812 	 * for each attribute, deduplicated values and items).
813 	 */
814 	total_length = (3 * sizeof(uint32))			/* magic + type + nitems */
815 					+ sizeof(AttrNumber)		/* ndimensions */
816 					+ (ndims * sizeof(Oid));	/* attribute types */
817 
818 	/* dimension info */
819 	total_length += ndims * sizeof(DimensionInfo);
820 
821 	/* add space for the arrays of deduplicated values */
822 	for (i = 0; i < ndims; i++)
823 		total_length += info[i].nbytes;
824 
825 	/*
826 	 * And finally account for the items (those are fixed-length, thanks to
827 	 * replacing values with uint16 indexes into the deduplicated arrays).
828 	 */
829 	total_length += mcvlist->nitems * ITEM_SIZE(dim);
830 
831 	/*
832 	 * Allocate space for the whole serialized MCV list (we'll skip bytes, so
833 	 * we set them to zero to make the result more compressible).
834 	 */
835 	raw = (bytea *) palloc0(VARHDRSZ + total_length);
836 	SET_VARSIZE(raw, VARHDRSZ + total_length);
837 
838 	ptr = VARDATA(raw);
839 	endptr = ptr + total_length;
840 
841 	/* copy the MCV list header fields, one by one */
842 	memcpy(ptr, &mcvlist->magic, sizeof(uint32));
843 	ptr += sizeof(uint32);
844 
845 	memcpy(ptr, &mcvlist->type, sizeof(uint32));
846 	ptr += sizeof(uint32);
847 
848 	memcpy(ptr, &mcvlist->nitems, sizeof(uint32));
849 	ptr += sizeof(uint32);
850 
851 	memcpy(ptr, &mcvlist->ndimensions, sizeof(AttrNumber));
852 	ptr += sizeof(AttrNumber);
853 
854 	memcpy(ptr, mcvlist->types, sizeof(Oid) * ndims);
855 	ptr += (sizeof(Oid) * ndims);
856 
857 	/* store information about the attributes (data amounts, ...) */
858 	memcpy(ptr, info, sizeof(DimensionInfo) * ndims);
859 	ptr += sizeof(DimensionInfo) * ndims;
860 
861 	/* Copy the deduplicated values for all attributes to the output. */
862 	for (dim = 0; dim < ndims; dim++)
863 	{
864 		/* remember the starting point for Asserts later */
865 		char	   *start PG_USED_FOR_ASSERTS_ONLY = ptr;
866 
867 		for (i = 0; i < info[dim].nvalues; i++)
868 		{
869 			Datum		value = values[dim][i];
870 
871 			if (info[dim].typbyval) /* passed by value */
872 			{
873 				Datum		tmp;
874 
875 				/*
876 				 * For values passed by value, we need to copy just the
877 				 * significant bytes - we can't use memcpy directly, as that
878 				 * assumes little endian behavior.  store_att_byval does
879 				 * almost what we need, but it requires properly aligned
880 				 * buffer - the output buffer does not guarantee that. So we
881 				 * simply use a local Datum variable (which guarantees proper
882 				 * alignment), and then copy the value from it.
883 				 */
884 				store_att_byval(&tmp, value, info[dim].typlen);
885 
886 				memcpy(ptr, &tmp, info[dim].typlen);
887 				ptr += info[dim].typlen;
888 			}
889 			else if (info[dim].typlen > 0)	/* passed by reference */
890 			{
891 				/* no special alignment needed, treated as char array */
892 				memcpy(ptr, DatumGetPointer(value), info[dim].typlen);
893 				ptr += info[dim].typlen;
894 			}
895 			else if (info[dim].typlen == -1)	/* varlena */
896 			{
897 				uint32		len = VARSIZE_ANY_EXHDR(DatumGetPointer(value));
898 
899 				/* copy the length */
900 				memcpy(ptr, &len, sizeof(uint32));
901 				ptr += sizeof(uint32);
902 
903 				/* data from the varlena value (without the header) */
904 				memcpy(ptr, VARDATA_ANY(DatumGetPointer(value)), len);
905 				ptr += len;
906 			}
907 			else if (info[dim].typlen == -2)	/* cstring */
908 			{
909 				uint32		len = (uint32) strlen(DatumGetCString(value)) + 1;
910 
911 				/* copy the length */
912 				memcpy(ptr, &len, sizeof(uint32));
913 				ptr += sizeof(uint32);
914 
915 				/* value */
916 				memcpy(ptr, DatumGetCString(value), len);
917 				ptr += len;
918 			}
919 
920 			/* no underflows or overflows */
921 			Assert((ptr > start) && ((ptr - start) <= info[dim].nbytes));
922 		}
923 
924 		/* we should get exactly nbytes of data for this dimension */
925 		Assert((ptr - start) == info[dim].nbytes);
926 	}
927 
928 	/* Serialize the items, with uint16 indexes instead of the values. */
929 	for (i = 0; i < mcvlist->nitems; i++)
930 	{
931 		MCVItem    *mcvitem = &mcvlist->items[i];
932 
933 		/* don't write beyond the allocated space */
934 		Assert(ptr <= (endptr - ITEM_SIZE(dim)));
935 
936 		/* copy NULL and frequency flags into the serialized MCV */
937 		memcpy(ptr, mcvitem->isnull, sizeof(bool) * ndims);
938 		ptr += sizeof(bool) * ndims;
939 
940 		memcpy(ptr, &mcvitem->frequency, sizeof(double));
941 		ptr += sizeof(double);
942 
943 		memcpy(ptr, &mcvitem->base_frequency, sizeof(double));
944 		ptr += sizeof(double);
945 
946 		/* store the indexes last */
947 		for (dim = 0; dim < ndims; dim++)
948 		{
949 			uint16		index = 0;
950 			Datum	   *value;
951 
952 			/* do the lookup only for non-NULL values */
953 			if (!mcvitem->isnull[dim])
954 			{
955 				value = (Datum *) bsearch_arg(&mcvitem->values[dim], values[dim],
956 											  info[dim].nvalues, sizeof(Datum),
957 											  compare_scalars_simple, &ssup[dim]);
958 
959 				Assert(value != NULL);	/* serialization or deduplication error */
960 
961 				/* compute index within the deduplicated array */
962 				index = (uint16) (value - values[dim]);
963 
964 				/* check the index is within expected bounds */
965 				Assert(index < info[dim].nvalues);
966 			}
967 
968 			/* copy the index into the serialized MCV */
969 			memcpy(ptr, &index, sizeof(uint16));
970 			ptr += sizeof(uint16);
971 		}
972 
973 		/* make sure we don't overflow the allocated value */
974 		Assert(ptr <= endptr);
975 	}
976 
977 	/* at this point we expect to match the total_length exactly */
978 	Assert(ptr == endptr);
979 
980 	pfree(values);
981 	pfree(counts);
982 
983 	return raw;
984 }
985 
986 /*
987  * statext_mcv_deserialize
988  *		Reads serialized MCV list into MCVList structure.
989  *
990  * All the memory needed by the MCV list is allocated as a single chunk, so
991  * it's possible to simply pfree() it at once.
992  */
993 MCVList *
statext_mcv_deserialize(bytea * data)994 statext_mcv_deserialize(bytea *data)
995 {
996 	int			dim,
997 				i;
998 	Size		expected_size;
999 	MCVList    *mcvlist;
1000 	char	   *raw;
1001 	char	   *ptr;
1002 	char	   *endptr PG_USED_FOR_ASSERTS_ONLY;
1003 
1004 	int			ndims,
1005 				nitems;
1006 	DimensionInfo *info = NULL;
1007 
1008 	/* local allocation buffer (used only for deserialization) */
1009 	Datum	  **map = NULL;
1010 
1011 	/* MCV list */
1012 	Size		mcvlen;
1013 
1014 	/* buffer used for the result */
1015 	Size		datalen;
1016 	char	   *dataptr;
1017 	char	   *valuesptr;
1018 	char	   *isnullptr;
1019 
1020 	if (data == NULL)
1021 		return NULL;
1022 
1023 	/*
1024 	 * We can't possibly deserialize a MCV list if there's not even a complete
1025 	 * header. We need an explicit formula here, because we serialize the
1026 	 * header fields one by one, so we need to ignore struct alignment.
1027 	 */
1028 	if (VARSIZE_ANY(data) < MinSizeOfMCVList)
1029 		elog(ERROR, "invalid MCV size %zd (expected at least %zu)",
1030 			 VARSIZE_ANY(data), MinSizeOfMCVList);
1031 
1032 	/* read the MCV list header */
1033 	mcvlist = (MCVList *) palloc0(offsetof(MCVList, items));
1034 
1035 	/* pointer to the data part (skip the varlena header) */
1036 	raw = (char *) data;
1037 	ptr = VARDATA_ANY(raw);
1038 	endptr = (char *) raw + VARSIZE_ANY(data);
1039 
1040 	/* get the header and perform further sanity checks */
1041 	memcpy(&mcvlist->magic, ptr, sizeof(uint32));
1042 	ptr += sizeof(uint32);
1043 
1044 	memcpy(&mcvlist->type, ptr, sizeof(uint32));
1045 	ptr += sizeof(uint32);
1046 
1047 	memcpy(&mcvlist->nitems, ptr, sizeof(uint32));
1048 	ptr += sizeof(uint32);
1049 
1050 	memcpy(&mcvlist->ndimensions, ptr, sizeof(AttrNumber));
1051 	ptr += sizeof(AttrNumber);
1052 
1053 	if (mcvlist->magic != STATS_MCV_MAGIC)
1054 		elog(ERROR, "invalid MCV magic %u (expected %u)",
1055 			 mcvlist->magic, STATS_MCV_MAGIC);
1056 
1057 	if (mcvlist->type != STATS_MCV_TYPE_BASIC)
1058 		elog(ERROR, "invalid MCV type %u (expected %u)",
1059 			 mcvlist->type, STATS_MCV_TYPE_BASIC);
1060 
1061 	if (mcvlist->ndimensions == 0)
1062 		elog(ERROR, "invalid zero-length dimension array in MCVList");
1063 	else if ((mcvlist->ndimensions > STATS_MAX_DIMENSIONS) ||
1064 			 (mcvlist->ndimensions < 0))
1065 		elog(ERROR, "invalid length (%d) dimension array in MCVList",
1066 			 mcvlist->ndimensions);
1067 
1068 	if (mcvlist->nitems == 0)
1069 		elog(ERROR, "invalid zero-length item array in MCVList");
1070 	else if (mcvlist->nitems > STATS_MCVLIST_MAX_ITEMS)
1071 		elog(ERROR, "invalid length (%u) item array in MCVList",
1072 			 mcvlist->nitems);
1073 
1074 	nitems = mcvlist->nitems;
1075 	ndims = mcvlist->ndimensions;
1076 
1077 	/*
1078 	 * Check amount of data including DimensionInfo for all dimensions and
1079 	 * also the serialized items (including uint16 indexes). Also, walk
1080 	 * through the dimension information and add it to the sum.
1081 	 */
1082 	expected_size = SizeOfMCVList(ndims, nitems);
1083 
1084 	/*
1085 	 * Check that we have at least the dimension and info records, along with
1086 	 * the items. We don't know the size of the serialized values yet. We need
1087 	 * to do this check first, before accessing the dimension info.
1088 	 */
1089 	if (VARSIZE_ANY(data) < expected_size)
1090 		elog(ERROR, "invalid MCV size %zd (expected %zu)",
1091 			 VARSIZE_ANY(data), expected_size);
1092 
1093 	/* Now copy the array of type Oids. */
1094 	memcpy(mcvlist->types, ptr, sizeof(Oid) * ndims);
1095 	ptr += (sizeof(Oid) * ndims);
1096 
1097 	/* Now it's safe to access the dimension info. */
1098 	info = palloc(ndims * sizeof(DimensionInfo));
1099 
1100 	memcpy(info, ptr, ndims * sizeof(DimensionInfo));
1101 	ptr += (ndims * sizeof(DimensionInfo));
1102 
1103 	/* account for the value arrays */
1104 	for (dim = 0; dim < ndims; dim++)
1105 	{
1106 		/*
1107 		 * XXX I wonder if we can/should rely on asserts here. Maybe those
1108 		 * checks should be done every time?
1109 		 */
1110 		Assert(info[dim].nvalues >= 0);
1111 		Assert(info[dim].nbytes >= 0);
1112 
1113 		expected_size += info[dim].nbytes;
1114 	}
1115 
1116 	/*
1117 	 * Now we know the total expected MCV size, including all the pieces
1118 	 * (header, dimension info. items and deduplicated data). So do the final
1119 	 * check on size.
1120 	 */
1121 	if (VARSIZE_ANY(data) != expected_size)
1122 		elog(ERROR, "invalid MCV size %zd (expected %zu)",
1123 			 VARSIZE_ANY(data), expected_size);
1124 
1125 	/*
1126 	 * We need an array of Datum values for each dimension, so that we can
1127 	 * easily translate the uint16 indexes later. We also need a top-level
1128 	 * array of pointers to those per-dimension arrays.
1129 	 *
1130 	 * While allocating the arrays for dimensions, compute how much space we
1131 	 * need for a copy of the by-ref data, as we can't simply point to the
1132 	 * original values (it might go away).
1133 	 */
1134 	datalen = 0;				/* space for by-ref data */
1135 	map = (Datum **) palloc(ndims * sizeof(Datum *));
1136 
1137 	for (dim = 0; dim < ndims; dim++)
1138 	{
1139 		map[dim] = (Datum *) palloc(sizeof(Datum) * info[dim].nvalues);
1140 
1141 		/* space needed for a copy of data for by-ref types */
1142 		datalen += info[dim].nbytes_aligned;
1143 	}
1144 
1145 	/*
1146 	 * Now resize the MCV list so that the allocation includes all the data.
1147 	 *
1148 	 * Allocate space for a copy of the data, as we can't simply reference the
1149 	 * serialized data - it's not aligned properly, and it may disappear while
1150 	 * we're still using the MCV list, e.g. due to catcache release.
1151 	 *
1152 	 * We do care about alignment here, because we will allocate all the pieces
1153 	 * at once, but then use pointers to different parts.
1154 	 */
1155 	mcvlen = MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) * nitems));
1156 
1157 	/* arrays of values and isnull flags for all MCV items */
1158 	mcvlen += nitems * MAXALIGN(sizeof(Datum) * ndims);
1159 	mcvlen += nitems * MAXALIGN(sizeof(bool) * ndims);
1160 
1161 	/* we don't quite need to align this, but it makes some asserts easier */
1162 	mcvlen += MAXALIGN(datalen);
1163 
1164 	/* now resize the deserialized MCV list, and compute pointers to parts */
1165 	mcvlist = repalloc(mcvlist, mcvlen);
1166 
1167 	/* pointer to the beginning of values/isnull arrays */
1168 	valuesptr = (char *) mcvlist
1169 		+ MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) * nitems));
1170 
1171 	isnullptr = valuesptr + (nitems * MAXALIGN(sizeof(Datum) * ndims));
1172 
1173 	dataptr = isnullptr + (nitems * MAXALIGN(sizeof(bool) * ndims));
1174 
1175 	/*
1176 	 * Build mapping (index => value) for translating the serialized data into
1177 	 * the in-memory representation.
1178 	 */
1179 	for (dim = 0; dim < ndims; dim++)
1180 	{
1181 		/* remember start position in the input array */
1182 		char	   *start PG_USED_FOR_ASSERTS_ONLY = ptr;
1183 
1184 		if (info[dim].typbyval)
1185 		{
1186 			/* for by-val types we simply copy data into the mapping */
1187 			for (i = 0; i < info[dim].nvalues; i++)
1188 			{
1189 				Datum		v = 0;
1190 
1191 				memcpy(&v, ptr, info[dim].typlen);
1192 				ptr += info[dim].typlen;
1193 
1194 				map[dim][i] = fetch_att(&v, true, info[dim].typlen);
1195 
1196 				/* no under/overflow of input array */
1197 				Assert(ptr <= (start + info[dim].nbytes));
1198 			}
1199 		}
1200 		else
1201 		{
1202 			/* for by-ref types we need to also make a copy of the data */
1203 
1204 			/* passed by reference, but fixed length (name, tid, ...) */
1205 			if (info[dim].typlen > 0)
1206 			{
1207 				for (i = 0; i < info[dim].nvalues; i++)
1208 				{
1209 					memcpy(dataptr, ptr, info[dim].typlen);
1210 					ptr += info[dim].typlen;
1211 
1212 					/* just point into the array */
1213 					map[dim][i] = PointerGetDatum(dataptr);
1214 					dataptr += MAXALIGN(info[dim].typlen);
1215 				}
1216 			}
1217 			else if (info[dim].typlen == -1)
1218 			{
1219 				/* varlena */
1220 				for (i = 0; i < info[dim].nvalues; i++)
1221 				{
1222 					uint32		len;
1223 
1224 					/* read the uint32 length */
1225 					memcpy(&len, ptr, sizeof(uint32));
1226 					ptr += sizeof(uint32);
1227 
1228 					/* the length is data-only */
1229 					SET_VARSIZE(dataptr, len + VARHDRSZ);
1230 					memcpy(VARDATA(dataptr), ptr, len);
1231 					ptr += len;
1232 
1233 					/* just point into the array */
1234 					map[dim][i] = PointerGetDatum(dataptr);
1235 
1236 					/* skip to place of the next deserialized value */
1237 					dataptr += MAXALIGN(len + VARHDRSZ);
1238 				}
1239 			}
1240 			else if (info[dim].typlen == -2)
1241 			{
1242 				/* cstring */
1243 				for (i = 0; i < info[dim].nvalues; i++)
1244 				{
1245 					uint32		len;
1246 
1247 					memcpy(&len, ptr, sizeof(uint32));
1248 					ptr += sizeof(uint32);
1249 
1250 					memcpy(dataptr, ptr, len);
1251 					ptr += len;
1252 
1253 					/* just point into the array */
1254 					map[dim][i] = PointerGetDatum(dataptr);
1255 					dataptr += MAXALIGN(len);
1256 				}
1257 			}
1258 
1259 			/* no under/overflow of input array */
1260 			Assert(ptr <= (start + info[dim].nbytes));
1261 
1262 			/* no overflow of the output mcv value */
1263 			Assert(dataptr <= ((char *) mcvlist + mcvlen));
1264 		}
1265 
1266 		/* check we consumed input data for this dimension exactly */
1267 		Assert(ptr == (start + info[dim].nbytes));
1268 	}
1269 
1270 	/* we should have also filled the MCV list exactly */
1271 	Assert(dataptr == ((char *) mcvlist + mcvlen));
1272 
1273 	/* deserialize the MCV items and translate the indexes to Datums */
1274 	for (i = 0; i < nitems; i++)
1275 	{
1276 		MCVItem    *item = &mcvlist->items[i];
1277 
1278 		item->values = (Datum *) valuesptr;
1279 		valuesptr += MAXALIGN(sizeof(Datum) * ndims);
1280 
1281 		item->isnull = (bool *) isnullptr;
1282 		isnullptr += MAXALIGN(sizeof(bool) * ndims);
1283 
1284 		memcpy(item->isnull, ptr, sizeof(bool) * ndims);
1285 		ptr += sizeof(bool) * ndims;
1286 
1287 		memcpy(&item->frequency, ptr, sizeof(double));
1288 		ptr += sizeof(double);
1289 
1290 		memcpy(&item->base_frequency, ptr, sizeof(double));
1291 		ptr += sizeof(double);
1292 
1293 		/* finally translate the indexes (for non-NULL only) */
1294 		for (dim = 0; dim < ndims; dim++)
1295 		{
1296 			uint16	index;
1297 
1298 			memcpy(&index, ptr, sizeof(uint16));
1299 			ptr += sizeof(uint16);
1300 
1301 			if (item->isnull[dim])
1302 				continue;
1303 
1304 			item->values[dim] = map[dim][index];
1305 		}
1306 
1307 		/* check we're not overflowing the input */
1308 		Assert(ptr <= endptr);
1309 	}
1310 
1311 	/* check that we processed all the data */
1312 	Assert(ptr == endptr);
1313 
1314 	/* release the buffers used for mapping */
1315 	for (dim = 0; dim < ndims; dim++)
1316 		pfree(map[dim]);
1317 
1318 	pfree(map);
1319 
1320 	return mcvlist;
1321 }
1322 
1323 /*
1324  * SRF with details about buckets of a histogram:
1325  *
1326  * - item ID (0...nitems)
1327  * - values (string array)
1328  * - nulls only (boolean array)
1329  * - frequency (double precision)
1330  * - base_frequency (double precision)
1331  *
1332  * The input is the OID of the statistics, and there are no rows returned if
1333  * the statistics contains no histogram.
1334  */
1335 Datum
pg_stats_ext_mcvlist_items(PG_FUNCTION_ARGS)1336 pg_stats_ext_mcvlist_items(PG_FUNCTION_ARGS)
1337 {
1338 	FuncCallContext *funcctx;
1339 
1340 	/* stuff done only on the first call of the function */
1341 	if (SRF_IS_FIRSTCALL())
1342 	{
1343 		MemoryContext oldcontext;
1344 		MCVList    *mcvlist;
1345 		TupleDesc	tupdesc;
1346 
1347 		/* create a function context for cross-call persistence */
1348 		funcctx = SRF_FIRSTCALL_INIT();
1349 
1350 		/* switch to memory context appropriate for multiple function calls */
1351 		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
1352 
1353 		mcvlist = statext_mcv_deserialize(PG_GETARG_BYTEA_P(0));
1354 
1355 		funcctx->user_fctx = mcvlist;
1356 
1357 		/* total number of tuples to be returned */
1358 		funcctx->max_calls = 0;
1359 		if (funcctx->user_fctx != NULL)
1360 			funcctx->max_calls = mcvlist->nitems;
1361 
1362 		/* Build a tuple descriptor for our result type */
1363 		if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1364 			ereport(ERROR,
1365 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1366 					 errmsg("function returning record called in context "
1367 							"that cannot accept type record")));
1368 		tupdesc = BlessTupleDesc(tupdesc);
1369 
1370 		/*
1371 		 * generate attribute metadata needed later to produce tuples from raw
1372 		 * C strings
1373 		 */
1374 		funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
1375 
1376 		MemoryContextSwitchTo(oldcontext);
1377 	}
1378 
1379 	/* stuff done on every call of the function */
1380 	funcctx = SRF_PERCALL_SETUP();
1381 
1382 	if (funcctx->call_cntr < funcctx->max_calls)	/* do when there is more left to send */
1383 	{
1384 		Datum		values[5];
1385 		bool		nulls[5];
1386 		HeapTuple	tuple;
1387 		Datum		result;
1388 		ArrayBuildState *astate_values = NULL;
1389 		ArrayBuildState *astate_nulls = NULL;
1390 
1391 		int			i;
1392 		MCVList    *mcvlist;
1393 		MCVItem    *item;
1394 
1395 		mcvlist = (MCVList *) funcctx->user_fctx;
1396 
1397 		Assert(funcctx->call_cntr < mcvlist->nitems);
1398 
1399 		item = &mcvlist->items[funcctx->call_cntr];
1400 
1401 		for (i = 0; i < mcvlist->ndimensions; i++)
1402 		{
1403 
1404 			astate_nulls = accumArrayResult(astate_nulls,
1405 								  BoolGetDatum(item->isnull[i]),
1406 								  false,
1407 								  BOOLOID,
1408 								  CurrentMemoryContext);
1409 
1410 			if (!item->isnull[i])
1411 			{
1412 				bool		isvarlena;
1413 				Oid			outfunc;
1414 				FmgrInfo	fmgrinfo;
1415 				Datum		val;
1416 				text	   *txt;
1417 
1418 				/* lookup output func for the type */
1419 				getTypeOutputInfo(mcvlist->types[i], &outfunc, &isvarlena);
1420 				fmgr_info(outfunc, &fmgrinfo);
1421 
1422 				val = FunctionCall1(&fmgrinfo, item->values[i]);
1423 				txt = cstring_to_text(DatumGetPointer(val));
1424 
1425 				astate_values = accumArrayResult(astate_values,
1426 								  PointerGetDatum(txt),
1427 								  false,
1428 								  TEXTOID,
1429 								  CurrentMemoryContext);
1430 			}
1431 			else
1432 				astate_values = accumArrayResult(astate_values,
1433 								  (Datum) 0,
1434 								  true,
1435 								  TEXTOID,
1436 								  CurrentMemoryContext);
1437 		}
1438 
1439 		values[0] = Int32GetDatum(funcctx->call_cntr);
1440 		values[1] = PointerGetDatum(makeArrayResult(astate_values, CurrentMemoryContext));
1441 		values[2] = PointerGetDatum(makeArrayResult(astate_nulls, CurrentMemoryContext));
1442 		values[3] = Float8GetDatum(item->frequency);
1443 		values[4] = Float8GetDatum(item->base_frequency);
1444 
1445 		/* no NULLs in the tuple */
1446 		memset(nulls, 0, sizeof(nulls));
1447 
1448 		/* build a tuple */
1449 		tuple = heap_form_tuple(funcctx->attinmeta->tupdesc, values, nulls);
1450 
1451 		/* make the tuple into a datum */
1452 		result = HeapTupleGetDatum(tuple);
1453 
1454 		SRF_RETURN_NEXT(funcctx, result);
1455 	}
1456 	else						/* do when there is no more left */
1457 	{
1458 		SRF_RETURN_DONE(funcctx);
1459 	}
1460 }
1461 
1462 /*
1463  * pg_mcv_list_in		- input routine for type pg_mcv_list.
1464  *
1465  * pg_mcv_list is real enough to be a table column, but it has no operations
1466  * of its own, and disallows input too
1467  */
1468 Datum
pg_mcv_list_in(PG_FUNCTION_ARGS)1469 pg_mcv_list_in(PG_FUNCTION_ARGS)
1470 {
1471 	/*
1472 	 * pg_mcv_list stores the data in binary form and parsing text input is
1473 	 * not needed, so disallow this.
1474 	 */
1475 	ereport(ERROR,
1476 			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1477 			 errmsg("cannot accept a value of type %s", "pg_mcv_list")));
1478 
1479 	PG_RETURN_VOID();			/* keep compiler quiet */
1480 }
1481 
1482 
1483 /*
1484  * pg_mcv_list_out		- output routine for type pg_mcv_list.
1485  *
1486  * MCV lists are serialized into a bytea value, so we simply call byteaout()
1487  * to serialize the value into text. But it'd be nice to serialize that into
1488  * a meaningful representation (e.g. for inspection by people).
1489  *
1490  * XXX This should probably return something meaningful, similar to what
1491  * pg_dependencies_out does. Not sure how to deal with the deduplicated
1492  * values, though - do we want to expand that or not?
1493  */
1494 Datum
pg_mcv_list_out(PG_FUNCTION_ARGS)1495 pg_mcv_list_out(PG_FUNCTION_ARGS)
1496 {
1497 	return byteaout(fcinfo);
1498 }
1499 
1500 /*
1501  * pg_mcv_list_recv		- binary input routine for type pg_mcv_list.
1502  */
1503 Datum
pg_mcv_list_recv(PG_FUNCTION_ARGS)1504 pg_mcv_list_recv(PG_FUNCTION_ARGS)
1505 {
1506 	ereport(ERROR,
1507 			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1508 			 errmsg("cannot accept a value of type %s", "pg_mcv_list")));
1509 
1510 	PG_RETURN_VOID();			/* keep compiler quiet */
1511 }
1512 
1513 /*
1514  * pg_mcv_list_send		- binary output routine for type pg_mcv_list.
1515  *
1516  * MCV lists are serialized in a bytea value (although the type is named
1517  * differently), so let's just send that.
1518  */
1519 Datum
pg_mcv_list_send(PG_FUNCTION_ARGS)1520 pg_mcv_list_send(PG_FUNCTION_ARGS)
1521 {
1522 	return byteasend(fcinfo);
1523 }
1524 
1525 /*
1526  * mcv_get_match_bitmap
1527  *	Evaluate clauses using the MCV list, and update the match bitmap.
1528  *
1529  * A match bitmap keeps match/mismatch status for each MCV item, and we
1530  * update it based on additional clauses. We also use it to skip items
1531  * that can't possibly match (e.g. item marked as "mismatch" can't change
1532  * to "match" when evaluating AND clause list).
1533  *
1534  * The function also returns a flag indicating whether there was an
1535  * equality condition for all attributes, the minimum frequency in the MCV
1536  * list, and a total MCV frequency (sum of frequencies for all items).
1537  *
1538  * XXX Currently the match bitmap uses a bool for each MCV item, which is
1539  * somewhat wasteful as we could do with just a single bit, thus reducing
1540  * the size to ~1/8. It would also allow us to combine bitmaps simply using
1541  * & and |, which should be faster than min/max. The bitmaps are fairly
1542  * small, though (thanks to the cap on the MCV list size).
1543  */
1544 static bool *
mcv_get_match_bitmap(PlannerInfo * root,List * clauses,Bitmapset * keys,MCVList * mcvlist,bool is_or)1545 mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
1546 					 Bitmapset *keys, MCVList *mcvlist, bool is_or)
1547 {
1548 	int			i;
1549 	ListCell   *l;
1550 	bool	   *matches;
1551 
1552 	/* The bitmap may be partially built. */
1553 	Assert(clauses != NIL);
1554 	Assert(list_length(clauses) >= 1);
1555 	Assert(mcvlist != NULL);
1556 	Assert(mcvlist->nitems > 0);
1557 	Assert(mcvlist->nitems <= STATS_MCVLIST_MAX_ITEMS);
1558 
1559 	matches = palloc(sizeof(bool) * mcvlist->nitems);
1560 	memset(matches, (is_or) ? false : true,
1561 		   sizeof(bool) * mcvlist->nitems);
1562 
1563 	/*
1564 	 * Loop through the list of clauses, and for each of them evaluate all the
1565 	 * MCV items not yet eliminated by the preceding clauses.
1566 	 */
1567 	foreach(l, clauses)
1568 	{
1569 		Node	   *clause = (Node *) lfirst(l);
1570 
1571 		/* if it's a RestrictInfo, then extract the clause */
1572 		if (IsA(clause, RestrictInfo))
1573 			clause = (Node *) ((RestrictInfo *) clause)->clause;
1574 
1575 		/*
1576 		 * Handle the various types of clauses - OpClause, NullTest and
1577 		 * AND/OR/NOT
1578 		 */
1579 		if (is_opclause(clause))
1580 		{
1581 			OpExpr	   *expr = (OpExpr *) clause;
1582 			FmgrInfo	opproc;
1583 
1584 			/* valid only after examine_opclause_expression returns true */
1585 			Var		   *var;
1586 			Const	   *cst;
1587 			bool		varonleft;
1588 
1589 			fmgr_info(get_opcode(expr->opno), &opproc);
1590 
1591 			/* extract the var and const from the expression */
1592 			if (examine_opclause_expression(expr, &var, &cst, &varonleft))
1593 			{
1594 				int			idx;
1595 
1596 				/* match the attribute to a dimension of the statistic */
1597 				idx = bms_member_index(keys, var->varattno);
1598 
1599 				/*
1600 				 * Walk through the MCV items and evaluate the current clause.
1601 				 * We can skip items that were already ruled out, and
1602 				 * terminate if there are no remaining MCV items that might
1603 				 * possibly match.
1604 				 */
1605 				for (i = 0; i < mcvlist->nitems; i++)
1606 				{
1607 					bool		match = true;
1608 					MCVItem    *item = &mcvlist->items[i];
1609 
1610 					/*
1611 					 * When the MCV item or the Const value is NULL we can treat
1612 					 * this as a mismatch. We must not call the operator because
1613 					 * of strictness.
1614 					 */
1615 					if (item->isnull[idx] || cst->constisnull)
1616 					{
1617 						matches[i] = RESULT_MERGE(matches[i], is_or, false);
1618 						continue;
1619 					}
1620 
1621 					/*
1622 					 * Skip MCV items that can't change result in the bitmap.
1623 					 * Once the value gets false for AND-lists, or true for
1624 					 * OR-lists, we don't need to look at more clauses.
1625 					 */
1626 					if (RESULT_IS_FINAL(matches[i], is_or))
1627 						continue;
1628 
1629 					/*
1630 					 * First check whether the constant is below the lower
1631 					 * boundary (in that case we can skip the bucket, because
1632 					 * there's no overlap).
1633 					 *
1634 					 * We don't store collations used to build the statistics,
1635 					 * but we can use the collation for the attribute itself,
1636 					 * as stored in varcollid. We do reset the statistics after
1637 					 * a type change (including collation change), so this is
1638 					 * OK. We may need to relax this after allowing extended
1639 					 * statistics on expressions.
1640 					 */
1641 					if (varonleft)
1642 						match = DatumGetBool(FunctionCall2Coll(&opproc,
1643 															   var->varcollid,
1644 															   item->values[idx],
1645 															   cst->constvalue));
1646 					else
1647 						match = DatumGetBool(FunctionCall2Coll(&opproc,
1648 															   var->varcollid,
1649 															   cst->constvalue,
1650 															   item->values[idx]));
1651 
1652 					/* update the match bitmap with the result */
1653 					matches[i] = RESULT_MERGE(matches[i], is_or, match);
1654 				}
1655 			}
1656 		}
1657 		else if (IsA(clause, NullTest))
1658 		{
1659 			NullTest   *expr = (NullTest *) clause;
1660 			Var		   *var = (Var *) (expr->arg);
1661 
1662 			/* match the attribute to a dimension of the statistic */
1663 			int			idx = bms_member_index(keys, var->varattno);
1664 
1665 			/*
1666 			 * Walk through the MCV items and evaluate the current clause. We
1667 			 * can skip items that were already ruled out, and terminate if
1668 			 * there are no remaining MCV items that might possibly match.
1669 			 */
1670 			for (i = 0; i < mcvlist->nitems; i++)
1671 			{
1672 				bool		match = false;	/* assume mismatch */
1673 				MCVItem    *item = &mcvlist->items[i];
1674 
1675 				/* if the clause mismatches the MCV item, update the bitmap */
1676 				switch (expr->nulltesttype)
1677 				{
1678 					case IS_NULL:
1679 						match = (item->isnull[idx]) ? true : match;
1680 						break;
1681 
1682 					case IS_NOT_NULL:
1683 						match = (!item->isnull[idx]) ? true : match;
1684 						break;
1685 				}
1686 
1687 				/* now, update the match bitmap, depending on OR/AND type */
1688 				matches[i] = RESULT_MERGE(matches[i], is_or, match);
1689 			}
1690 		}
1691 		else if (is_orclause(clause) || is_andclause(clause))
1692 		{
1693 			/* AND/OR clause, with all subclauses being compatible */
1694 
1695 			int			i;
1696 			BoolExpr   *bool_clause = ((BoolExpr *) clause);
1697 			List	   *bool_clauses = bool_clause->args;
1698 
1699 			/* match/mismatch bitmap for each MCV item */
1700 			bool	   *bool_matches = NULL;
1701 
1702 			Assert(bool_clauses != NIL);
1703 			Assert(list_length(bool_clauses) >= 2);
1704 
1705 			/* build the match bitmap for the OR-clauses */
1706 			bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys,
1707 												mcvlist, is_orclause(clause));
1708 
1709 			/*
1710 			 * Merge the bitmap produced by mcv_get_match_bitmap into the
1711 			 * current one. We need to consider if we're evaluating AND or OR
1712 			 * condition when merging the results.
1713 			 */
1714 			for (i = 0; i < mcvlist->nitems; i++)
1715 				matches[i] = RESULT_MERGE(matches[i], is_or, bool_matches[i]);
1716 
1717 			pfree(bool_matches);
1718 		}
1719 		else if (is_notclause(clause))
1720 		{
1721 			/* NOT clause, with all subclauses compatible */
1722 
1723 			int			i;
1724 			BoolExpr   *not_clause = ((BoolExpr *) clause);
1725 			List	   *not_args = not_clause->args;
1726 
1727 			/* match/mismatch bitmap for each MCV item */
1728 			bool	   *not_matches = NULL;
1729 
1730 			Assert(not_args != NIL);
1731 			Assert(list_length(not_args) == 1);
1732 
1733 			/* build the match bitmap for the NOT-clause */
1734 			not_matches = mcv_get_match_bitmap(root, not_args, keys,
1735 											   mcvlist, false);
1736 
1737 			/*
1738 			 * Merge the bitmap produced by mcv_get_match_bitmap into the
1739 			 * current one. We're handling a NOT clause, so invert the result
1740 			 * before merging it into the global bitmap.
1741 			 */
1742 			for (i = 0; i < mcvlist->nitems; i++)
1743 				matches[i] = RESULT_MERGE(matches[i], is_or, !not_matches[i]);
1744 
1745 			pfree(not_matches);
1746 		}
1747 		else if (IsA(clause, Var))
1748 		{
1749 			/* Var (has to be a boolean Var, possibly from below NOT) */
1750 
1751 			Var		   *var = (Var *) (clause);
1752 
1753 			/* match the attribute to a dimension of the statistic */
1754 			int			idx = bms_member_index(keys, var->varattno);
1755 
1756 			Assert(var->vartype == BOOLOID);
1757 
1758 			/*
1759 			 * Walk through the MCV items and evaluate the current clause. We
1760 			 * can skip items that were already ruled out, and terminate if
1761 			 * there are no remaining MCV items that might possibly match.
1762 			 */
1763 			for (i = 0; i < mcvlist->nitems; i++)
1764 			{
1765 				MCVItem    *item = &mcvlist->items[i];
1766 				bool		match = false;
1767 
1768 				/* if the item is NULL, it's a mismatch */
1769 				if (!item->isnull[idx] && DatumGetBool(item->values[idx]))
1770 					match = true;
1771 
1772 				/* update the result bitmap */
1773 				matches[i] = RESULT_MERGE(matches[i], is_or, match);
1774 			}
1775 		}
1776 		else
1777 			elog(ERROR, "unknown clause type: %d", clause->type);
1778 	}
1779 
1780 	return matches;
1781 }
1782 
1783 
1784 /*
1785  * mcv_clauselist_selectivity
1786  *		Return the selectivity estimate computed using an MCV list.
1787  *
1788  * First builds a bitmap of MCV items matching the clauses, and then sums
1789  * the frequencies of matching items.
1790  *
1791  * It also produces two additional interesting selectivities - total
1792  * selectivity of all the MCV items (not just the matching ones), and the
1793  * base frequency computed on the assumption of independence.
1794  */
1795 Selectivity
mcv_clauselist_selectivity(PlannerInfo * root,StatisticExtInfo * stat,List * clauses,int varRelid,JoinType jointype,SpecialJoinInfo * sjinfo,RelOptInfo * rel,Selectivity * basesel,Selectivity * totalsel)1796 mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
1797 						   List *clauses, int varRelid,
1798 						   JoinType jointype, SpecialJoinInfo *sjinfo,
1799 						   RelOptInfo *rel,
1800 						   Selectivity *basesel, Selectivity *totalsel)
1801 {
1802 	int			i;
1803 	MCVList    *mcv;
1804 	Selectivity s = 0.0;
1805 
1806 	/* match/mismatch bitmap for each MCV item */
1807 	bool	   *matches = NULL;
1808 
1809 	/* load the MCV list stored in the statistics object */
1810 	mcv = statext_mcv_load(stat->statOid);
1811 
1812 	/* build a match bitmap for the clauses */
1813 	matches = mcv_get_match_bitmap(root, clauses, stat->keys, mcv, false);
1814 
1815 	/* sum frequencies for all the matching MCV items */
1816 	*basesel = 0.0;
1817 	*totalsel = 0.0;
1818 	for (i = 0; i < mcv->nitems; i++)
1819 	{
1820 		*totalsel += mcv->items[i].frequency;
1821 
1822 		if (matches[i] != false)
1823 		{
1824 			/* XXX Shouldn't the basesel be outside the if condition? */
1825 			*basesel += mcv->items[i].base_frequency;
1826 			s += mcv->items[i].frequency;
1827 		}
1828 	}
1829 
1830 	return s;
1831 }
1832