1 /*-------------------------------------------------------------------------
2 *
3 * analyze.c
4 * the Postgres statistics generator
5 *
6 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/commands/analyze.c
12 *
13 *-------------------------------------------------------------------------
14 */
15 #include "postgres.h"
16
17 #include <math.h>
18
19 #include "access/detoast.h"
20 #include "access/genam.h"
21 #include "access/multixact.h"
22 #include "access/relation.h"
23 #include "access/sysattr.h"
24 #include "access/table.h"
25 #include "access/tableam.h"
26 #include "access/transam.h"
27 #include "access/tupconvert.h"
28 #include "access/visibilitymap.h"
29 #include "access/xact.h"
30 #include "catalog/catalog.h"
31 #include "catalog/index.h"
32 #include "catalog/indexing.h"
33 #include "catalog/pg_collation.h"
34 #include "catalog/pg_inherits.h"
35 #include "catalog/pg_namespace.h"
36 #include "catalog/pg_statistic_ext.h"
37 #include "commands/dbcommands.h"
38 #include "commands/progress.h"
39 #include "commands/tablecmds.h"
40 #include "commands/vacuum.h"
41 #include "executor/executor.h"
42 #include "foreign/fdwapi.h"
43 #include "miscadmin.h"
44 #include "nodes/nodeFuncs.h"
45 #include "parser/parse_oper.h"
46 #include "parser/parse_relation.h"
47 #include "pgstat.h"
48 #include "postmaster/autovacuum.h"
49 #include "statistics/extended_stats_internal.h"
50 #include "statistics/statistics.h"
51 #include "storage/bufmgr.h"
52 #include "storage/lmgr.h"
53 #include "storage/proc.h"
54 #include "storage/procarray.h"
55 #include "utils/acl.h"
56 #include "utils/attoptcache.h"
57 #include "utils/builtins.h"
58 #include "utils/datum.h"
59 #include "utils/fmgroids.h"
60 #include "utils/guc.h"
61 #include "utils/lsyscache.h"
62 #include "utils/memutils.h"
63 #include "utils/pg_rusage.h"
64 #include "utils/sampling.h"
65 #include "utils/sortsupport.h"
66 #include "utils/spccache.h"
67 #include "utils/syscache.h"
68 #include "utils/timestamp.h"
69
70
/* Per-index data for ANALYZE */
typedef struct AnlIndexData
{
	IndexInfo  *indexInfo;		/* BuildIndexInfo result */
	double		tupleFract;		/* fraction of rows for partial index */
	VacAttrStats **vacattrstats;	/* index attrs to analyze */
	int			attr_cnt;		/* number of entries in vacattrstats */
} AnlIndexData;
79
80
/* Default statistics target (GUC parameter) */
int			default_statistics_target = 100;

/* A few variables that don't seem worth passing around as parameters */
static MemoryContext anl_context = NULL;	/* per-ANALYZE working context */
static BufferAccessStrategy vac_strategy;	/* buffer ring supplied by caller */
87
88
89 static void do_analyze_rel(Relation onerel,
90 VacuumParams *params, List *va_cols,
91 AcquireSampleRowsFunc acquirefunc, BlockNumber relpages,
92 bool inh, bool in_outer_xact, int elevel);
93 static void compute_index_stats(Relation onerel, double totalrows,
94 AnlIndexData *indexdata, int nindexes,
95 HeapTuple *rows, int numrows,
96 MemoryContext col_context);
97 static VacAttrStats *examine_attribute(Relation onerel, int attnum,
98 Node *index_expr);
99 static int acquire_sample_rows(Relation onerel, int elevel,
100 HeapTuple *rows, int targrows,
101 double *totalrows, double *totaldeadrows);
102 static int compare_rows(const void *a, const void *b);
103 static int acquire_inherited_sample_rows(Relation onerel, int elevel,
104 HeapTuple *rows, int targrows,
105 double *totalrows, double *totaldeadrows);
106 static void update_attstats(Oid relid, bool inh,
107 int natts, VacAttrStats **vacattrstats);
108 static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
109 static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
110
111
112 /*
113 * analyze_rel() -- analyze one relation
114 *
115 * relid identifies the relation to analyze. If relation is supplied, use
116 * the name therein for reporting any failure to open/lock the rel; do not
117 * use it once we've successfully opened the rel, since it might be stale.
118 */
void
analyze_rel(Oid relid, RangeVar *relation,
			VacuumParams *params, List *va_cols, bool in_outer_xact,
			BufferAccessStrategy bstrategy)
{
	Relation	onerel;
	int			elevel;
	AcquireSampleRowsFunc acquirefunc = NULL;
	BlockNumber relpages = 0;

	/* Select logging level: chatty only under VERBOSE */
	if (params->options & VACOPT_VERBOSE)
		elevel = INFO;
	else
		elevel = DEBUG2;

	/* Set up static variables */
	vac_strategy = bstrategy;

	/*
	 * Check for user-requested abort.
	 */
	CHECK_FOR_INTERRUPTS();

	/*
	 * Open the relation, getting ShareUpdateExclusiveLock to ensure that two
	 * ANALYZEs don't run on it concurrently.  (This also locks out a
	 * concurrent VACUUM, which doesn't matter much at the moment but might
	 * matter if we ever try to accumulate stats on dead tuples.)  If the rel
	 * has been dropped since we last saw it, we don't need to process it.
	 *
	 * Make sure to generate only logs for ANALYZE in this case (hence the
	 * VACOPT_VACUUM bit is masked out of the options passed down).
	 */
	onerel = vacuum_open_relation(relid, relation, params->options & ~(VACOPT_VACUUM),
								  params->log_min_duration >= 0,
								  ShareUpdateExclusiveLock);

	/* leave if relation could not be opened or locked */
	if (!onerel)
		return;

	/*
	 * Check if relation needs to be skipped based on ownership.  This check
	 * happens also when building the relation list to analyze for a manual
	 * operation, and needs to be done additionally here as ANALYZE could
	 * happen across multiple transactions where relation ownership could have
	 * changed in-between.  Make sure to generate only logs for ANALYZE in
	 * this case.
	 */
	if (!vacuum_is_relation_owner(RelationGetRelid(onerel),
								  onerel->rd_rel,
								  params->options & VACOPT_ANALYZE))
	{
		relation_close(onerel, ShareUpdateExclusiveLock);
		return;
	}

	/*
	 * Silently ignore tables that are temp tables of other backends ---
	 * trying to analyze these is rather pointless, since their contents are
	 * probably not up-to-date on disk.  (We don't throw a warning here; it
	 * would just lead to chatter during a database-wide ANALYZE.)
	 */
	if (RELATION_IS_OTHER_TEMP(onerel))
	{
		relation_close(onerel, ShareUpdateExclusiveLock);
		return;
	}

	/*
	 * We can ANALYZE any table except pg_statistic.  See update_attstats
	 */
	if (RelationGetRelid(onerel) == StatisticRelationId)
	{
		relation_close(onerel, ShareUpdateExclusiveLock);
		return;
	}

	/*
	 * Check that it's of an analyzable relkind, and set up appropriately.
	 */
	if (onerel->rd_rel->relkind == RELKIND_RELATION ||
		onerel->rd_rel->relkind == RELKIND_MATVIEW)
	{
		/* Regular table, so we'll use the regular row acquisition function */
		acquirefunc = acquire_sample_rows;
		/* Also get regular table's size */
		relpages = RelationGetNumberOfBlocks(onerel);
	}
	else if (onerel->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
	{
		/*
		 * For a foreign table, call the FDW's hook function to see whether it
		 * supports analysis.  The FDW supplies both the acquirefunc and its
		 * own notion of relpages.
		 */
		FdwRoutine *fdwroutine;
		bool		ok = false;

		fdwroutine = GetFdwRoutineForRelation(onerel, false);

		if (fdwroutine->AnalyzeForeignTable != NULL)
			ok = fdwroutine->AnalyzeForeignTable(onerel,
												 &acquirefunc,
												 &relpages);

		if (!ok)
		{
			ereport(WARNING,
					(errmsg("skipping \"%s\" --- cannot analyze this foreign table",
							RelationGetRelationName(onerel))));
			relation_close(onerel, ShareUpdateExclusiveLock);
			return;
		}
	}
	else if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
	{
		/*
		 * For partitioned tables, we want to do the recursive ANALYZE below.
		 * acquirefunc stays NULL; the non-recursive pass is skipped entirely.
		 */
	}
	else
	{
		/* No need for a WARNING if we already complained during VACUUM */
		if (!(params->options & VACOPT_VACUUM))
			ereport(WARNING,
					(errmsg("skipping \"%s\" --- cannot analyze non-tables or special system tables",
							RelationGetRelationName(onerel))));
		relation_close(onerel, ShareUpdateExclusiveLock);
		return;
	}

	/*
	 * OK, let's do it.  First, initialize progress reporting.
	 */
	pgstat_progress_start_command(PROGRESS_COMMAND_ANALYZE,
								  RelationGetRelid(onerel));

	/*
	 * Do the normal non-recursive ANALYZE.  We can skip this for partitioned
	 * tables, which don't contain any rows.
	 */
	if (onerel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
		do_analyze_rel(onerel, params, va_cols, acquirefunc,
					   relpages, false, in_outer_xact, elevel);

	/*
	 * If there are child tables, do recursive ANALYZE.
	 */
	if (onerel->rd_rel->relhassubclass)
		do_analyze_rel(onerel, params, va_cols, acquirefunc, relpages,
					   true, in_outer_xact, elevel);

	/*
	 * Close source relation now, but keep lock so that no one deletes it
	 * before we commit.  (If someone did, they'd fail to clean up the entries
	 * we made in pg_statistic.  Also, releasing the lock before commit would
	 * expose us to concurrent-update failures in update_attstats.)
	 */
	relation_close(onerel, NoLock);

	pgstat_progress_end_command();
}
281
282 /*
283 * do_analyze_rel() -- analyze one relation, recursively or not
284 *
285 * Note that "acquirefunc" is only relevant for the non-inherited case.
286 * For the inherited case, acquire_inherited_sample_rows() determines the
287 * appropriate acquirefunc for each child table.
288 */
289 static void
do_analyze_rel(Relation onerel,VacuumParams * params,List * va_cols,AcquireSampleRowsFunc acquirefunc,BlockNumber relpages,bool inh,bool in_outer_xact,int elevel)290 do_analyze_rel(Relation onerel, VacuumParams *params,
291 List *va_cols, AcquireSampleRowsFunc acquirefunc,
292 BlockNumber relpages, bool inh, bool in_outer_xact,
293 int elevel)
294 {
295 int attr_cnt,
296 tcnt,
297 i,
298 ind;
299 Relation *Irel;
300 int nindexes;
301 bool hasindex;
302 VacAttrStats **vacattrstats;
303 AnlIndexData *indexdata;
304 int targrows,
305 numrows,
306 minrows;
307 double totalrows,
308 totaldeadrows;
309 HeapTuple *rows;
310 PGRUsage ru0;
311 TimestampTz starttime = 0;
312 MemoryContext caller_context;
313 Oid save_userid;
314 int save_sec_context;
315 int save_nestlevel;
316 int64 AnalyzePageHit = VacuumPageHit;
317 int64 AnalyzePageMiss = VacuumPageMiss;
318 int64 AnalyzePageDirty = VacuumPageDirty;
319 PgStat_Counter startreadtime = 0;
320 PgStat_Counter startwritetime = 0;
321
322 if (inh)
323 ereport(elevel,
324 (errmsg("analyzing \"%s.%s\" inheritance tree",
325 get_namespace_name(RelationGetNamespace(onerel)),
326 RelationGetRelationName(onerel))));
327 else
328 ereport(elevel,
329 (errmsg("analyzing \"%s.%s\"",
330 get_namespace_name(RelationGetNamespace(onerel)),
331 RelationGetRelationName(onerel))));
332
333 /*
334 * Set up a working context so that we can easily free whatever junk gets
335 * created.
336 */
337 anl_context = AllocSetContextCreate(CurrentMemoryContext,
338 "Analyze",
339 ALLOCSET_DEFAULT_SIZES);
340 caller_context = MemoryContextSwitchTo(anl_context);
341
342 /*
343 * Switch to the table owner's userid, so that any index functions are run
344 * as that user. Also lock down security-restricted operations and
345 * arrange to make GUC variable changes local to this command.
346 */
347 GetUserIdAndSecContext(&save_userid, &save_sec_context);
348 SetUserIdAndSecContext(onerel->rd_rel->relowner,
349 save_sec_context | SECURITY_RESTRICTED_OPERATION);
350 save_nestlevel = NewGUCNestLevel();
351
352 /* measure elapsed time iff autovacuum logging requires it */
353 if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
354 {
355 if (track_io_timing)
356 {
357 startreadtime = pgStatBlockReadTime;
358 startwritetime = pgStatBlockWriteTime;
359 }
360
361 pg_rusage_init(&ru0);
362 if (params->log_min_duration >= 0)
363 starttime = GetCurrentTimestamp();
364 }
365
366 /*
367 * Determine which columns to analyze
368 *
369 * Note that system attributes are never analyzed, so we just reject them
370 * at the lookup stage. We also reject duplicate column mentions. (We
371 * could alternatively ignore duplicates, but analyzing a column twice
372 * won't work; we'd end up making a conflicting update in pg_statistic.)
373 */
374 if (va_cols != NIL)
375 {
376 Bitmapset *unique_cols = NULL;
377 ListCell *le;
378
379 vacattrstats = (VacAttrStats **) palloc(list_length(va_cols) *
380 sizeof(VacAttrStats *));
381 tcnt = 0;
382 foreach(le, va_cols)
383 {
384 char *col = strVal(lfirst(le));
385
386 i = attnameAttNum(onerel, col, false);
387 if (i == InvalidAttrNumber)
388 ereport(ERROR,
389 (errcode(ERRCODE_UNDEFINED_COLUMN),
390 errmsg("column \"%s\" of relation \"%s\" does not exist",
391 col, RelationGetRelationName(onerel))));
392 if (bms_is_member(i, unique_cols))
393 ereport(ERROR,
394 (errcode(ERRCODE_DUPLICATE_COLUMN),
395 errmsg("column \"%s\" of relation \"%s\" appears more than once",
396 col, RelationGetRelationName(onerel))));
397 unique_cols = bms_add_member(unique_cols, i);
398
399 vacattrstats[tcnt] = examine_attribute(onerel, i, NULL);
400 if (vacattrstats[tcnt] != NULL)
401 tcnt++;
402 }
403 attr_cnt = tcnt;
404 }
405 else
406 {
407 attr_cnt = onerel->rd_att->natts;
408 vacattrstats = (VacAttrStats **)
409 palloc(attr_cnt * sizeof(VacAttrStats *));
410 tcnt = 0;
411 for (i = 1; i <= attr_cnt; i++)
412 {
413 vacattrstats[tcnt] = examine_attribute(onerel, i, NULL);
414 if (vacattrstats[tcnt] != NULL)
415 tcnt++;
416 }
417 attr_cnt = tcnt;
418 }
419
420 /*
421 * Open all indexes of the relation, and see if there are any analyzable
422 * columns in the indexes. We do not analyze index columns if there was
423 * an explicit column list in the ANALYZE command, however.
424 *
425 * If we are doing a recursive scan, we don't want to touch the parent's
426 * indexes at all. If we're processing a partitioned table, we need to
427 * know if there are any indexes, but we don't want to process them.
428 */
429 if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
430 {
431 List *idxs = RelationGetIndexList(onerel);
432
433 Irel = NULL;
434 nindexes = 0;
435 hasindex = idxs != NIL;
436 list_free(idxs);
437 }
438 else if (!inh)
439 {
440 vac_open_indexes(onerel, AccessShareLock, &nindexes, &Irel);
441 hasindex = nindexes > 0;
442 }
443 else
444 {
445 Irel = NULL;
446 nindexes = 0;
447 hasindex = false;
448 }
449 indexdata = NULL;
450 if (nindexes > 0)
451 {
452 indexdata = (AnlIndexData *) palloc0(nindexes * sizeof(AnlIndexData));
453 for (ind = 0; ind < nindexes; ind++)
454 {
455 AnlIndexData *thisdata = &indexdata[ind];
456 IndexInfo *indexInfo;
457
458 thisdata->indexInfo = indexInfo = BuildIndexInfo(Irel[ind]);
459 thisdata->tupleFract = 1.0; /* fix later if partial */
460 if (indexInfo->ii_Expressions != NIL && va_cols == NIL)
461 {
462 ListCell *indexpr_item = list_head(indexInfo->ii_Expressions);
463
464 thisdata->vacattrstats = (VacAttrStats **)
465 palloc(indexInfo->ii_NumIndexAttrs * sizeof(VacAttrStats *));
466 tcnt = 0;
467 for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
468 {
469 int keycol = indexInfo->ii_IndexAttrNumbers[i];
470
471 if (keycol == 0)
472 {
473 /* Found an index expression */
474 Node *indexkey;
475
476 if (indexpr_item == NULL) /* shouldn't happen */
477 elog(ERROR, "too few entries in indexprs list");
478 indexkey = (Node *) lfirst(indexpr_item);
479 indexpr_item = lnext(indexInfo->ii_Expressions,
480 indexpr_item);
481 thisdata->vacattrstats[tcnt] =
482 examine_attribute(Irel[ind], i + 1, indexkey);
483 if (thisdata->vacattrstats[tcnt] != NULL)
484 tcnt++;
485 }
486 }
487 thisdata->attr_cnt = tcnt;
488 }
489 }
490 }
491
492 /*
493 * Determine how many rows we need to sample, using the worst case from
494 * all analyzable columns. We use a lower bound of 100 rows to avoid
495 * possible overflow in Vitter's algorithm. (Note: that will also be the
496 * target in the corner case where there are no analyzable columns.)
497 */
498 targrows = 100;
499 for (i = 0; i < attr_cnt; i++)
500 {
501 if (targrows < vacattrstats[i]->minrows)
502 targrows = vacattrstats[i]->minrows;
503 }
504 for (ind = 0; ind < nindexes; ind++)
505 {
506 AnlIndexData *thisdata = &indexdata[ind];
507
508 for (i = 0; i < thisdata->attr_cnt; i++)
509 {
510 if (targrows < thisdata->vacattrstats[i]->minrows)
511 targrows = thisdata->vacattrstats[i]->minrows;
512 }
513 }
514
515 /*
516 * Look at extended statistics objects too, as those may define custom
517 * statistics target. So we may need to sample more rows and then build
518 * the statistics with enough detail.
519 */
520 minrows = ComputeExtStatisticsRows(onerel, attr_cnt, vacattrstats);
521
522 if (targrows < minrows)
523 targrows = minrows;
524
525 /*
526 * Acquire the sample rows
527 */
528 rows = (HeapTuple *) palloc(targrows * sizeof(HeapTuple));
529 pgstat_progress_update_param(PROGRESS_ANALYZE_PHASE,
530 inh ? PROGRESS_ANALYZE_PHASE_ACQUIRE_SAMPLE_ROWS_INH :
531 PROGRESS_ANALYZE_PHASE_ACQUIRE_SAMPLE_ROWS);
532 if (inh)
533 numrows = acquire_inherited_sample_rows(onerel, elevel,
534 rows, targrows,
535 &totalrows, &totaldeadrows);
536 else
537 numrows = (*acquirefunc) (onerel, elevel,
538 rows, targrows,
539 &totalrows, &totaldeadrows);
540
541 /*
542 * Compute the statistics. Temporary results during the calculations for
543 * each column are stored in a child context. The calc routines are
544 * responsible to make sure that whatever they store into the VacAttrStats
545 * structure is allocated in anl_context.
546 */
547 if (numrows > 0)
548 {
549 MemoryContext col_context,
550 old_context;
551
552 pgstat_progress_update_param(PROGRESS_ANALYZE_PHASE,
553 PROGRESS_ANALYZE_PHASE_COMPUTE_STATS);
554
555 col_context = AllocSetContextCreate(anl_context,
556 "Analyze Column",
557 ALLOCSET_DEFAULT_SIZES);
558 old_context = MemoryContextSwitchTo(col_context);
559
560 for (i = 0; i < attr_cnt; i++)
561 {
562 VacAttrStats *stats = vacattrstats[i];
563 AttributeOpts *aopt;
564
565 stats->rows = rows;
566 stats->tupDesc = onerel->rd_att;
567 stats->compute_stats(stats,
568 std_fetch_func,
569 numrows,
570 totalrows);
571
572 /*
573 * If the appropriate flavor of the n_distinct option is
574 * specified, override with the corresponding value.
575 */
576 aopt = get_attribute_options(onerel->rd_id, stats->attr->attnum);
577 if (aopt != NULL)
578 {
579 float8 n_distinct;
580
581 n_distinct = inh ? aopt->n_distinct_inherited : aopt->n_distinct;
582 if (n_distinct != 0.0)
583 stats->stadistinct = n_distinct;
584 }
585
586 MemoryContextResetAndDeleteChildren(col_context);
587 }
588
589 if (nindexes > 0)
590 compute_index_stats(onerel, totalrows,
591 indexdata, nindexes,
592 rows, numrows,
593 col_context);
594
595 MemoryContextSwitchTo(old_context);
596 MemoryContextDelete(col_context);
597
598 /*
599 * Emit the completed stats rows into pg_statistic, replacing any
600 * previous statistics for the target columns. (If there are stats in
601 * pg_statistic for columns we didn't process, we leave them alone.)
602 */
603 update_attstats(RelationGetRelid(onerel), inh,
604 attr_cnt, vacattrstats);
605
606 for (ind = 0; ind < nindexes; ind++)
607 {
608 AnlIndexData *thisdata = &indexdata[ind];
609
610 update_attstats(RelationGetRelid(Irel[ind]), false,
611 thisdata->attr_cnt, thisdata->vacattrstats);
612 }
613
614 /*
615 * Build extended statistics (if there are any).
616 *
617 * For now we only build extended statistics on individual relations,
618 * not for relations representing inheritance trees.
619 */
620 if (!inh)
621 BuildRelationExtStatistics(onerel, totalrows, numrows, rows,
622 attr_cnt, vacattrstats);
623 }
624
625 pgstat_progress_update_param(PROGRESS_ANALYZE_PHASE,
626 PROGRESS_ANALYZE_PHASE_FINALIZE_ANALYZE);
627
628 /*
629 * Update pages/tuples stats in pg_class ... but not if we're doing
630 * inherited stats.
631 *
632 * We assume that VACUUM hasn't set pg_class.reltuples already, even
633 * during a VACUUM ANALYZE. Although VACUUM often updates pg_class,
634 * exceptions exist. A "VACUUM (ANALYZE, INDEX_CLEANUP OFF)" command will
635 * never update pg_class entries for index relations. It's also possible
636 * that an individual index's pg_class entry won't be updated during
637 * VACUUM if the index AM returns NULL from its amvacuumcleanup() routine.
638 */
639 if (!inh)
640 {
641 BlockNumber relallvisible;
642
643 visibilitymap_count(onerel, &relallvisible, NULL);
644
645 /* Update pg_class for table relation */
646 vac_update_relstats(onerel,
647 relpages,
648 totalrows,
649 relallvisible,
650 hasindex,
651 InvalidTransactionId,
652 InvalidMultiXactId,
653 in_outer_xact);
654
655 /* Same for indexes */
656 for (ind = 0; ind < nindexes; ind++)
657 {
658 AnlIndexData *thisdata = &indexdata[ind];
659 double totalindexrows;
660
661 totalindexrows = ceil(thisdata->tupleFract * totalrows);
662 vac_update_relstats(Irel[ind],
663 RelationGetNumberOfBlocks(Irel[ind]),
664 totalindexrows,
665 0,
666 false,
667 InvalidTransactionId,
668 InvalidMultiXactId,
669 in_outer_xact);
670 }
671 }
672 else if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
673 {
674 /*
675 * Partitioned tables don't have storage, so we don't set any fields
676 * in their pg_class entries except for reltuples and relhasindex.
677 */
678 vac_update_relstats(onerel, -1, totalrows,
679 0, hasindex, InvalidTransactionId,
680 InvalidMultiXactId,
681 in_outer_xact);
682 }
683
684 /*
685 * Now report ANALYZE to the stats collector. For regular tables, we do
686 * it only if not doing inherited stats. For partitioned tables, we only
687 * do it for inherited stats. (We're never called for not-inherited stats
688 * on partitioned tables anyway.)
689 *
690 * Reset the changes_since_analyze counter only if we analyzed all
691 * columns; otherwise, there is still work for auto-analyze to do.
692 */
693 if (!inh)
694 pgstat_report_analyze(onerel, totalrows, totaldeadrows,
695 (va_cols == NIL));
696 else if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
697 pgstat_report_analyze(onerel, 0, 0, (va_cols == NIL));
698
699 /*
700 * If this isn't part of VACUUM ANALYZE, let index AMs do cleanup.
701 *
702 * Note that most index AMs perform a no-op as a matter of policy for
703 * amvacuumcleanup() when called in ANALYZE-only mode. The only exception
704 * among core index AMs is GIN/ginvacuumcleanup().
705 */
706 if (!(params->options & VACOPT_VACUUM))
707 {
708 for (ind = 0; ind < nindexes; ind++)
709 {
710 IndexBulkDeleteResult *stats;
711 IndexVacuumInfo ivinfo;
712
713 ivinfo.index = Irel[ind];
714 ivinfo.analyze_only = true;
715 ivinfo.estimated_count = true;
716 ivinfo.message_level = elevel;
717 ivinfo.num_heap_tuples = onerel->rd_rel->reltuples;
718 ivinfo.strategy = vac_strategy;
719
720 stats = index_vacuum_cleanup(&ivinfo, NULL);
721
722 if (stats)
723 pfree(stats);
724 }
725 }
726
727 /* Done with indexes */
728 vac_close_indexes(nindexes, Irel, NoLock);
729
730 /* Log the action if appropriate */
731 if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
732 {
733 TimestampTz endtime = GetCurrentTimestamp();
734
735 if (params->log_min_duration == 0 ||
736 TimestampDifferenceExceeds(starttime, endtime,
737 params->log_min_duration))
738 {
739 long delay_in_ms;
740 double read_rate = 0;
741 double write_rate = 0;
742 StringInfoData buf;
743
744 /*
745 * Calculate the difference in the Page Hit/Miss/Dirty that
746 * happened as part of the analyze by subtracting out the
747 * pre-analyze values which we saved above.
748 */
749 AnalyzePageHit = VacuumPageHit - AnalyzePageHit;
750 AnalyzePageMiss = VacuumPageMiss - AnalyzePageMiss;
751 AnalyzePageDirty = VacuumPageDirty - AnalyzePageDirty;
752
753 /*
754 * We do not expect an analyze to take > 25 days and it simplifies
755 * things a bit to use TimestampDifferenceMilliseconds.
756 */
757 delay_in_ms = TimestampDifferenceMilliseconds(starttime, endtime);
758
759 /*
760 * Note that we are reporting these read/write rates in the same
761 * manner as VACUUM does, which means that while the 'average read
762 * rate' here actually corresponds to page misses and resulting
763 * reads which are also picked up by track_io_timing, if enabled,
764 * the 'average write rate' is actually talking about the rate of
765 * pages being dirtied, not being written out, so it's typical to
766 * have a non-zero 'avg write rate' while I/O timings only reports
767 * reads.
768 *
769 * It's not clear that an ANALYZE will ever result in
770 * FlushBuffer() being called, but we track and support reporting
771 * on I/O write time in case that changes as it's practically free
772 * to do so anyway.
773 */
774
775 if (delay_in_ms > 0)
776 {
777 read_rate = (double) BLCKSZ * AnalyzePageMiss / (1024 * 1024) /
778 (delay_in_ms / 1000.0);
779 write_rate = (double) BLCKSZ * AnalyzePageDirty / (1024 * 1024) /
780 (delay_in_ms / 1000.0);
781 }
782
783 /*
784 * We split this up so we don't emit empty I/O timing values when
785 * track_io_timing isn't enabled.
786 */
787
788 initStringInfo(&buf);
789 appendStringInfo(&buf, _("automatic analyze of table \"%s.%s.%s\"\n"),
790 get_database_name(MyDatabaseId),
791 get_namespace_name(RelationGetNamespace(onerel)),
792 RelationGetRelationName(onerel));
793 if (track_io_timing)
794 {
795 double read_ms = (double) (pgStatBlockReadTime - startreadtime) / 1000;
796 double write_ms = (double) (pgStatBlockWriteTime - startwritetime) / 1000;
797
798 appendStringInfo(&buf, _("I/O timings: read: %.3f ms, write: %.3f ms\n"),
799 read_ms, write_ms);
800 }
801 appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
802 read_rate, write_rate);
803 appendStringInfo(&buf, _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"),
804 (long long) AnalyzePageHit,
805 (long long) AnalyzePageMiss,
806 (long long) AnalyzePageDirty);
807 appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0));
808
809 ereport(LOG,
810 (errmsg_internal("%s", buf.data)));
811
812 pfree(buf.data);
813 }
814 }
815
816 /* Roll back any GUC changes executed by index functions */
817 AtEOXact_GUC(false, save_nestlevel);
818
819 /* Restore userid and security context */
820 SetUserIdAndSecContext(save_userid, save_sec_context);
821
822 /* Restore current context and release memory */
823 MemoryContextSwitchTo(caller_context);
824 MemoryContextDelete(anl_context);
825 anl_context = NULL;
826 }
827
828 /*
829 * Compute statistics about indexes of a relation
830 */
static void
compute_index_stats(Relation onerel, double totalrows,
					AnlIndexData *indexdata, int nindexes,
					HeapTuple *rows, int numrows,
					MemoryContext col_context)
{
	MemoryContext ind_context,
				old_context;
	Datum		values[INDEX_MAX_KEYS];
	bool		isnull[INDEX_MAX_KEYS];
	int			ind,
				i;

	/* Per-index working context, reset at the bottom of each loop pass */
	ind_context = AllocSetContextCreate(anl_context,
										"Analyze Index",
										ALLOCSET_DEFAULT_SIZES);
	old_context = MemoryContextSwitchTo(ind_context);

	for (ind = 0; ind < nindexes; ind++)
	{
		AnlIndexData *thisdata = &indexdata[ind];
		IndexInfo  *indexInfo = thisdata->indexInfo;
		int			attr_cnt = thisdata->attr_cnt;
		TupleTableSlot *slot;
		EState	   *estate;
		ExprContext *econtext;
		ExprState  *predicate;
		Datum	   *exprvals;
		bool	   *exprnulls;
		int			numindexrows,
					tcnt,
					rowno;
		double		totalindexrows;

		/* Ignore index if no columns to analyze and not partial */
		if (attr_cnt == 0 && indexInfo->ii_Predicate == NIL)
			continue;

		/*
		 * Need an EState for evaluation of index expressions and
		 * partial-index predicates.  Create it in the per-index context to be
		 * sure it gets cleaned up at the bottom of the loop.
		 */
		estate = CreateExecutorState();
		econtext = GetPerTupleExprContext(estate);
		/* Need a slot to hold the current heap tuple, too */
		slot = MakeSingleTupleTableSlot(RelationGetDescr(onerel),
										&TTSOpsHeapTuple);

		/* Arrange for econtext's scan tuple to be the tuple under test */
		econtext->ecxt_scantuple = slot;

		/* Set up execution state for predicate. */
		predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);

		/*
		 * Compute and save index expression values.  exprvals/exprnulls form
		 * a row-major array: one group of attr_cnt entries per sample row
		 * that passes the predicate.
		 */
		exprvals = (Datum *) palloc(numrows * attr_cnt * sizeof(Datum));
		exprnulls = (bool *) palloc(numrows * attr_cnt * sizeof(bool));
		numindexrows = 0;
		tcnt = 0;
		for (rowno = 0; rowno < numrows; rowno++)
		{
			HeapTuple	heapTuple = rows[rowno];

			vacuum_delay_point();

			/*
			 * Reset the per-tuple context each time, to reclaim any cruft
			 * left behind by evaluating the predicate or index expressions.
			 */
			ResetExprContext(econtext);

			/* Set up for predicate or expression evaluation */
			ExecStoreHeapTuple(heapTuple, slot, false);

			/* If index is partial, check predicate */
			if (predicate != NULL)
			{
				if (!ExecQual(predicate, econtext))
					continue;
			}
			numindexrows++;

			if (attr_cnt > 0)
			{
				/*
				 * Evaluate the index row to compute expression values.  We
				 * could do this by hand, but FormIndexDatum is convenient.
				 */
				FormIndexDatum(indexInfo,
							   slot,
							   estate,
							   values,
							   isnull);

				/*
				 * Save just the columns we care about.  We copy the values
				 * into ind_context from the estate's per-tuple context.
				 */
				for (i = 0; i < attr_cnt; i++)
				{
					VacAttrStats *stats = thisdata->vacattrstats[i];
					int			attnum = stats->attr->attnum;

					if (isnull[attnum - 1])
					{
						exprvals[tcnt] = (Datum) 0;
						exprnulls[tcnt] = true;
					}
					else
					{
						exprvals[tcnt] = datumCopy(values[attnum - 1],
												   stats->attrtype->typbyval,
												   stats->attrtype->typlen);
						exprnulls[tcnt] = false;
					}
					tcnt++;
				}
			}
		}

		/*
		 * Having counted the number of rows that pass the predicate in the
		 * sample, we can estimate the total number of rows in the index.
		 */
		thisdata->tupleFract = (double) numindexrows / (double) numrows;
		totalindexrows = ceil(thisdata->tupleFract * totalrows);

		/*
		 * Now we can compute the statistics for the expression columns.
		 * Each stats entry views column i of the row-major array via
		 * exprvals + i with stride attr_cnt.
		 */
		if (numindexrows > 0)
		{
			MemoryContextSwitchTo(col_context);
			for (i = 0; i < attr_cnt; i++)
			{
				VacAttrStats *stats = thisdata->vacattrstats[i];

				stats->exprvals = exprvals + i;
				stats->exprnulls = exprnulls + i;
				stats->rowstride = attr_cnt;
				stats->compute_stats(stats,
									 ind_fetch_func,
									 numindexrows,
									 totalindexrows);

				MemoryContextResetAndDeleteChildren(col_context);
			}
		}

		/* And clean up */
		MemoryContextSwitchTo(ind_context);

		ExecDropSingleTupleTableSlot(slot);
		FreeExecutorState(estate);
		MemoryContextResetAndDeleteChildren(ind_context);
	}

	MemoryContextSwitchTo(old_context);
	MemoryContextDelete(ind_context);
}
992
993 /*
994 * examine_attribute -- pre-analysis of a single column
995 *
996 * Determine whether the column is analyzable; if so, create and initialize
997 * a VacAttrStats struct for it. If not, return NULL.
998 *
999 * If index_expr isn't NULL, then we're trying to analyze an expression index,
1000 * and index_expr is the expression tree representing the column's data.
1001 */
1002 static VacAttrStats *
examine_attribute(Relation onerel,int attnum,Node * index_expr)1003 examine_attribute(Relation onerel, int attnum, Node *index_expr)
1004 {
1005 Form_pg_attribute attr = TupleDescAttr(onerel->rd_att, attnum - 1);
1006 HeapTuple typtuple;
1007 VacAttrStats *stats;
1008 int i;
1009 bool ok;
1010
1011 /* Never analyze dropped columns */
1012 if (attr->attisdropped)
1013 return NULL;
1014
1015 /* Don't analyze column if user has specified not to */
1016 if (attr->attstattarget == 0)
1017 return NULL;
1018
1019 /*
1020 * Create the VacAttrStats struct. Note that we only have a copy of the
1021 * fixed fields of the pg_attribute tuple.
1022 */
1023 stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats));
1024 stats->attr = (Form_pg_attribute) palloc(ATTRIBUTE_FIXED_PART_SIZE);
1025 memcpy(stats->attr, attr, ATTRIBUTE_FIXED_PART_SIZE);
1026
1027 /*
1028 * When analyzing an expression index, believe the expression tree's type
1029 * not the column datatype --- the latter might be the opckeytype storage
1030 * type of the opclass, which is not interesting for our purposes. (Note:
1031 * if we did anything with non-expression index columns, we'd need to
1032 * figure out where to get the correct type info from, but for now that's
1033 * not a problem.) It's not clear whether anyone will care about the
1034 * typmod, but we store that too just in case.
1035 */
1036 if (index_expr)
1037 {
1038 stats->attrtypid = exprType(index_expr);
1039 stats->attrtypmod = exprTypmod(index_expr);
1040
1041 /*
1042 * If a collation has been specified for the index column, use that in
1043 * preference to anything else; but if not, fall back to whatever we
1044 * can get from the expression.
1045 */
1046 if (OidIsValid(onerel->rd_indcollation[attnum - 1]))
1047 stats->attrcollid = onerel->rd_indcollation[attnum - 1];
1048 else
1049 stats->attrcollid = exprCollation(index_expr);
1050 }
1051 else
1052 {
1053 stats->attrtypid = attr->atttypid;
1054 stats->attrtypmod = attr->atttypmod;
1055 stats->attrcollid = attr->attcollation;
1056 }
1057
1058 typtuple = SearchSysCacheCopy1(TYPEOID,
1059 ObjectIdGetDatum(stats->attrtypid));
1060 if (!HeapTupleIsValid(typtuple))
1061 elog(ERROR, "cache lookup failed for type %u", stats->attrtypid);
1062 stats->attrtype = (Form_pg_type) GETSTRUCT(typtuple);
1063 stats->anl_context = anl_context;
1064 stats->tupattnum = attnum;
1065
1066 /*
1067 * The fields describing the stats->stavalues[n] element types default to
1068 * the type of the data being analyzed, but the type-specific typanalyze
1069 * function can change them if it wants to store something else.
1070 */
1071 for (i = 0; i < STATISTIC_NUM_SLOTS; i++)
1072 {
1073 stats->statypid[i] = stats->attrtypid;
1074 stats->statyplen[i] = stats->attrtype->typlen;
1075 stats->statypbyval[i] = stats->attrtype->typbyval;
1076 stats->statypalign[i] = stats->attrtype->typalign;
1077 }
1078
1079 /*
1080 * Call the type-specific typanalyze function. If none is specified, use
1081 * std_typanalyze().
1082 */
1083 if (OidIsValid(stats->attrtype->typanalyze))
1084 ok = DatumGetBool(OidFunctionCall1(stats->attrtype->typanalyze,
1085 PointerGetDatum(stats)));
1086 else
1087 ok = std_typanalyze(stats);
1088
1089 if (!ok || stats->compute_stats == NULL || stats->minrows <= 0)
1090 {
1091 heap_freetuple(typtuple);
1092 pfree(stats->attr);
1093 pfree(stats);
1094 return NULL;
1095 }
1096
1097 return stats;
1098 }
1099
1100 /*
1101 * acquire_sample_rows -- acquire a random sample of rows from the table
1102 *
1103 * Selected rows are returned in the caller-allocated array rows[], which
1104 * must have at least targrows entries.
1105 * The actual number of rows selected is returned as the function result.
1106 * We also estimate the total numbers of live and dead rows in the table,
1107 * and return them into *totalrows and *totaldeadrows, respectively.
1108 *
1109 * The returned list of tuples is in order by physical position in the table.
1110 * (We will rely on this later to derive correlation estimates.)
1111 *
1112 * As of May 2004 we use a new two-stage method: Stage one selects up
1113 * to targrows random blocks (or all blocks, if there aren't so many).
1114 * Stage two scans these blocks and uses the Vitter algorithm to create
1115 * a random sample of targrows rows (or less, if there are less in the
1116 * sample of blocks). The two stages are executed simultaneously: each
1117 * block is processed as soon as stage one returns its number and while
1118 * the rows are read stage two controls which ones are to be inserted
1119 * into the sample.
1120 *
1121 * Although every row has an equal chance of ending up in the final
1122 * sample, this sampling method is not perfect: not every possible
1123 * sample has an equal chance of being selected. For large relations
1124 * the number of different blocks represented by the sample tends to be
1125 * too small. We can live with that for now. Improvements are welcome.
1126 *
1127 * An important property of this sampling method is that because we do
1128 * look at a statistically unbiased set of blocks, we should get
1129 * unbiased estimates of the average numbers of live and dead rows per
1130 * block. The previous sampling method put too much credence in the row
1131 * density near the start of the table.
1132 */
1133 static int
acquire_sample_rows(Relation onerel,int elevel,HeapTuple * rows,int targrows,double * totalrows,double * totaldeadrows)1134 acquire_sample_rows(Relation onerel, int elevel,
1135 HeapTuple *rows, int targrows,
1136 double *totalrows, double *totaldeadrows)
1137 {
1138 int numrows = 0; /* # rows now in reservoir */
1139 double samplerows = 0; /* total # rows collected */
1140 double liverows = 0; /* # live rows seen */
1141 double deadrows = 0; /* # dead rows seen */
1142 double rowstoskip = -1; /* -1 means not set yet */
1143 long randseed; /* Seed for block sampler(s) */
1144 BlockNumber totalblocks;
1145 TransactionId OldestXmin;
1146 BlockSamplerData bs;
1147 ReservoirStateData rstate;
1148 TupleTableSlot *slot;
1149 TableScanDesc scan;
1150 BlockNumber nblocks;
1151 BlockNumber blksdone = 0;
1152 #ifdef USE_PREFETCH
1153 int prefetch_maximum = 0; /* blocks to prefetch if enabled */
1154 BlockSamplerData prefetch_bs;
1155 #endif
1156
1157 Assert(targrows > 0);
1158
1159 totalblocks = RelationGetNumberOfBlocks(onerel);
1160
1161 /* Need a cutoff xmin for HeapTupleSatisfiesVacuum */
1162 OldestXmin = GetOldestNonRemovableTransactionId(onerel);
1163
1164 /* Prepare for sampling block numbers */
1165 randseed = random();
1166 nblocks = BlockSampler_Init(&bs, totalblocks, targrows, randseed);
1167
1168 #ifdef USE_PREFETCH
1169 prefetch_maximum = get_tablespace_maintenance_io_concurrency(onerel->rd_rel->reltablespace);
1170 /* Create another BlockSampler, using the same seed, for prefetching */
1171 if (prefetch_maximum)
1172 (void) BlockSampler_Init(&prefetch_bs, totalblocks, targrows, randseed);
1173 #endif
1174
1175 /* Report sampling block numbers */
1176 pgstat_progress_update_param(PROGRESS_ANALYZE_BLOCKS_TOTAL,
1177 nblocks);
1178
1179 /* Prepare for sampling rows */
1180 reservoir_init_selection_state(&rstate, targrows);
1181
1182 scan = table_beginscan_analyze(onerel);
1183 slot = table_slot_create(onerel, NULL);
1184
1185 #ifdef USE_PREFETCH
1186
1187 /*
1188 * If we are doing prefetching, then go ahead and tell the kernel about
1189 * the first set of pages we are going to want. This also moves our
1190 * iterator out ahead of the main one being used, where we will keep it so
1191 * that we're always pre-fetching out prefetch_maximum number of blocks
1192 * ahead.
1193 */
1194 if (prefetch_maximum)
1195 {
1196 for (int i = 0; i < prefetch_maximum; i++)
1197 {
1198 BlockNumber prefetch_block;
1199
1200 if (!BlockSampler_HasMore(&prefetch_bs))
1201 break;
1202
1203 prefetch_block = BlockSampler_Next(&prefetch_bs);
1204 PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, prefetch_block);
1205 }
1206 }
1207 #endif
1208
1209 /* Outer loop over blocks to sample */
1210 while (BlockSampler_HasMore(&bs))
1211 {
1212 bool block_accepted;
1213 BlockNumber targblock = BlockSampler_Next(&bs);
1214 #ifdef USE_PREFETCH
1215 BlockNumber prefetch_targblock = InvalidBlockNumber;
1216
1217 /*
1218 * Make sure that every time the main BlockSampler is moved forward
1219 * that our prefetch BlockSampler also gets moved forward, so that we
1220 * always stay out ahead.
1221 */
1222 if (prefetch_maximum && BlockSampler_HasMore(&prefetch_bs))
1223 prefetch_targblock = BlockSampler_Next(&prefetch_bs);
1224 #endif
1225
1226 vacuum_delay_point();
1227
1228 block_accepted = table_scan_analyze_next_block(scan, targblock, vac_strategy);
1229
1230 #ifdef USE_PREFETCH
1231
1232 /*
1233 * When pre-fetching, after we get a block, tell the kernel about the
1234 * next one we will want, if there's any left.
1235 *
1236 * We want to do this even if the table_scan_analyze_next_block() call
1237 * above decides against analyzing the block it picked.
1238 */
1239 if (prefetch_maximum && prefetch_targblock != InvalidBlockNumber)
1240 PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, prefetch_targblock);
1241 #endif
1242
1243 /*
1244 * Don't analyze if table_scan_analyze_next_block() indicated this
1245 * block is unsuitable for analyzing.
1246 */
1247 if (!block_accepted)
1248 continue;
1249
1250 while (table_scan_analyze_next_tuple(scan, OldestXmin, &liverows, &deadrows, slot))
1251 {
1252 /*
1253 * The first targrows sample rows are simply copied into the
1254 * reservoir. Then we start replacing tuples in the sample until
1255 * we reach the end of the relation. This algorithm is from Jeff
1256 * Vitter's paper (see full citation in utils/misc/sampling.c). It
1257 * works by repeatedly computing the number of tuples to skip
1258 * before selecting a tuple, which replaces a randomly chosen
1259 * element of the reservoir (current set of tuples). At all times
1260 * the reservoir is a true random sample of the tuples we've
1261 * passed over so far, so when we fall off the end of the relation
1262 * we're done.
1263 */
1264 if (numrows < targrows)
1265 rows[numrows++] = ExecCopySlotHeapTuple(slot);
1266 else
1267 {
1268 /*
1269 * t in Vitter's paper is the number of records already
1270 * processed. If we need to compute a new S value, we must
1271 * use the not-yet-incremented value of samplerows as t.
1272 */
1273 if (rowstoskip < 0)
1274 rowstoskip = reservoir_get_next_S(&rstate, samplerows, targrows);
1275
1276 if (rowstoskip <= 0)
1277 {
1278 /*
1279 * Found a suitable tuple, so save it, replacing one old
1280 * tuple at random
1281 */
1282 int k = (int) (targrows * sampler_random_fract(rstate.randstate));
1283
1284 Assert(k >= 0 && k < targrows);
1285 heap_freetuple(rows[k]);
1286 rows[k] = ExecCopySlotHeapTuple(slot);
1287 }
1288
1289 rowstoskip -= 1;
1290 }
1291
1292 samplerows += 1;
1293 }
1294
1295 pgstat_progress_update_param(PROGRESS_ANALYZE_BLOCKS_DONE,
1296 ++blksdone);
1297 }
1298
1299 ExecDropSingleTupleTableSlot(slot);
1300 table_endscan(scan);
1301
1302 /*
1303 * If we didn't find as many tuples as we wanted then we're done. No sort
1304 * is needed, since they're already in order.
1305 *
1306 * Otherwise we need to sort the collected tuples by position
1307 * (itempointer). It's not worth worrying about corner cases where the
1308 * tuples are already sorted.
1309 */
1310 if (numrows == targrows)
1311 qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows);
1312
1313 /*
1314 * Estimate total numbers of live and dead rows in relation, extrapolating
1315 * on the assumption that the average tuple density in pages we didn't
1316 * scan is the same as in the pages we did scan. Since what we scanned is
1317 * a random sample of the pages in the relation, this should be a good
1318 * assumption.
1319 */
1320 if (bs.m > 0)
1321 {
1322 *totalrows = floor((liverows / bs.m) * totalblocks + 0.5);
1323 *totaldeadrows = floor((deadrows / bs.m) * totalblocks + 0.5);
1324 }
1325 else
1326 {
1327 *totalrows = 0.0;
1328 *totaldeadrows = 0.0;
1329 }
1330
1331 /*
1332 * Emit some interesting relation info
1333 */
1334 ereport(elevel,
1335 (errmsg("\"%s\": scanned %d of %u pages, "
1336 "containing %.0f live rows and %.0f dead rows; "
1337 "%d rows in sample, %.0f estimated total rows",
1338 RelationGetRelationName(onerel),
1339 bs.m, totalblocks,
1340 liverows, deadrows,
1341 numrows, *totalrows)));
1342
1343 return numrows;
1344 }
1345
1346 /*
1347 * qsort comparator for sorting rows[] array
1348 */
1349 static int
compare_rows(const void * a,const void * b)1350 compare_rows(const void *a, const void *b)
1351 {
1352 HeapTuple ha = *(const HeapTuple *) a;
1353 HeapTuple hb = *(const HeapTuple *) b;
1354 BlockNumber ba = ItemPointerGetBlockNumber(&ha->t_self);
1355 OffsetNumber oa = ItemPointerGetOffsetNumber(&ha->t_self);
1356 BlockNumber bb = ItemPointerGetBlockNumber(&hb->t_self);
1357 OffsetNumber ob = ItemPointerGetOffsetNumber(&hb->t_self);
1358
1359 if (ba < bb)
1360 return -1;
1361 if (ba > bb)
1362 return 1;
1363 if (oa < ob)
1364 return -1;
1365 if (oa > ob)
1366 return 1;
1367 return 0;
1368 }
1369
1370
1371 /*
1372 * acquire_inherited_sample_rows -- acquire sample rows from inheritance tree
1373 *
1374 * This has the same API as acquire_sample_rows, except that rows are
1375 * collected from all inheritance children as well as the specified table.
1376 * We fail and return zero if there are no inheritance children, or if all
1377 * children are foreign tables that don't support ANALYZE.
1378 */
1379 static int
acquire_inherited_sample_rows(Relation onerel,int elevel,HeapTuple * rows,int targrows,double * totalrows,double * totaldeadrows)1380 acquire_inherited_sample_rows(Relation onerel, int elevel,
1381 HeapTuple *rows, int targrows,
1382 double *totalrows, double *totaldeadrows)
1383 {
1384 List *tableOIDs;
1385 Relation *rels;
1386 AcquireSampleRowsFunc *acquirefuncs;
1387 double *relblocks;
1388 double totalblocks;
1389 int numrows,
1390 nrels,
1391 i;
1392 ListCell *lc;
1393 bool has_child;
1394
1395 /*
1396 * Find all members of inheritance set. We only need AccessShareLock on
1397 * the children.
1398 */
1399 tableOIDs =
1400 find_all_inheritors(RelationGetRelid(onerel), AccessShareLock, NULL);
1401
1402 /*
1403 * Check that there's at least one descendant, else fail. This could
1404 * happen despite analyze_rel's relhassubclass check, if table once had a
1405 * child but no longer does. In that case, we can clear the
1406 * relhassubclass field so as not to make the same mistake again later.
1407 * (This is safe because we hold ShareUpdateExclusiveLock.)
1408 */
1409 if (list_length(tableOIDs) < 2)
1410 {
1411 /* CCI because we already updated the pg_class row in this command */
1412 CommandCounterIncrement();
1413 SetRelationHasSubclass(RelationGetRelid(onerel), false);
1414 ereport(elevel,
1415 (errmsg("skipping analyze of \"%s.%s\" inheritance tree --- this inheritance tree contains no child tables",
1416 get_namespace_name(RelationGetNamespace(onerel)),
1417 RelationGetRelationName(onerel))));
1418 return 0;
1419 }
1420
1421 /*
1422 * Identify acquirefuncs to use, and count blocks in all the relations.
1423 * The result could overflow BlockNumber, so we use double arithmetic.
1424 */
1425 rels = (Relation *) palloc(list_length(tableOIDs) * sizeof(Relation));
1426 acquirefuncs = (AcquireSampleRowsFunc *)
1427 palloc(list_length(tableOIDs) * sizeof(AcquireSampleRowsFunc));
1428 relblocks = (double *) palloc(list_length(tableOIDs) * sizeof(double));
1429 totalblocks = 0;
1430 nrels = 0;
1431 has_child = false;
1432 foreach(lc, tableOIDs)
1433 {
1434 Oid childOID = lfirst_oid(lc);
1435 Relation childrel;
1436 AcquireSampleRowsFunc acquirefunc = NULL;
1437 BlockNumber relpages = 0;
1438
1439 /* We already got the needed lock */
1440 childrel = table_open(childOID, NoLock);
1441
1442 /* Ignore if temp table of another backend */
1443 if (RELATION_IS_OTHER_TEMP(childrel))
1444 {
1445 /* ... but release the lock on it */
1446 Assert(childrel != onerel);
1447 table_close(childrel, AccessShareLock);
1448 continue;
1449 }
1450
1451 /* Check table type (MATVIEW can't happen, but might as well allow) */
1452 if (childrel->rd_rel->relkind == RELKIND_RELATION ||
1453 childrel->rd_rel->relkind == RELKIND_MATVIEW)
1454 {
1455 /* Regular table, so use the regular row acquisition function */
1456 acquirefunc = acquire_sample_rows;
1457 relpages = RelationGetNumberOfBlocks(childrel);
1458 }
1459 else if (childrel->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
1460 {
1461 /*
1462 * For a foreign table, call the FDW's hook function to see
1463 * whether it supports analysis.
1464 */
1465 FdwRoutine *fdwroutine;
1466 bool ok = false;
1467
1468 fdwroutine = GetFdwRoutineForRelation(childrel, false);
1469
1470 if (fdwroutine->AnalyzeForeignTable != NULL)
1471 ok = fdwroutine->AnalyzeForeignTable(childrel,
1472 &acquirefunc,
1473 &relpages);
1474
1475 if (!ok)
1476 {
1477 /* ignore, but release the lock on it */
1478 Assert(childrel != onerel);
1479 table_close(childrel, AccessShareLock);
1480 continue;
1481 }
1482 }
1483 else
1484 {
1485 /*
1486 * ignore, but release the lock on it. don't try to unlock the
1487 * passed-in relation
1488 */
1489 Assert(childrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
1490 if (childrel != onerel)
1491 table_close(childrel, AccessShareLock);
1492 else
1493 table_close(childrel, NoLock);
1494 continue;
1495 }
1496
1497 /* OK, we'll process this child */
1498 has_child = true;
1499 rels[nrels] = childrel;
1500 acquirefuncs[nrels] = acquirefunc;
1501 relblocks[nrels] = (double) relpages;
1502 totalblocks += (double) relpages;
1503 nrels++;
1504 }
1505
1506 /*
1507 * If we don't have at least one child table to consider, fail. If the
1508 * relation is a partitioned table, it's not counted as a child table.
1509 */
1510 if (!has_child)
1511 {
1512 ereport(elevel,
1513 (errmsg("skipping analyze of \"%s.%s\" inheritance tree --- this inheritance tree contains no analyzable child tables",
1514 get_namespace_name(RelationGetNamespace(onerel)),
1515 RelationGetRelationName(onerel))));
1516 return 0;
1517 }
1518
1519 /*
1520 * Now sample rows from each relation, proportionally to its fraction of
1521 * the total block count. (This might be less than desirable if the child
1522 * rels have radically different free-space percentages, but it's not
1523 * clear that it's worth working harder.)
1524 */
1525 pgstat_progress_update_param(PROGRESS_ANALYZE_CHILD_TABLES_TOTAL,
1526 nrels);
1527 numrows = 0;
1528 *totalrows = 0;
1529 *totaldeadrows = 0;
1530 for (i = 0; i < nrels; i++)
1531 {
1532 Relation childrel = rels[i];
1533 AcquireSampleRowsFunc acquirefunc = acquirefuncs[i];
1534 double childblocks = relblocks[i];
1535
1536 pgstat_progress_update_param(PROGRESS_ANALYZE_CURRENT_CHILD_TABLE_RELID,
1537 RelationGetRelid(childrel));
1538
1539 if (childblocks > 0)
1540 {
1541 int childtargrows;
1542
1543 childtargrows = (int) rint(targrows * childblocks / totalblocks);
1544 /* Make sure we don't overrun due to roundoff error */
1545 childtargrows = Min(childtargrows, targrows - numrows);
1546 if (childtargrows > 0)
1547 {
1548 int childrows;
1549 double trows,
1550 tdrows;
1551
1552 /* Fetch a random sample of the child's rows */
1553 childrows = (*acquirefunc) (childrel, elevel,
1554 rows + numrows, childtargrows,
1555 &trows, &tdrows);
1556
1557 /* We may need to convert from child's rowtype to parent's */
1558 if (childrows > 0 &&
1559 !equalTupleDescs(RelationGetDescr(childrel),
1560 RelationGetDescr(onerel)))
1561 {
1562 TupleConversionMap *map;
1563
1564 map = convert_tuples_by_name(RelationGetDescr(childrel),
1565 RelationGetDescr(onerel));
1566 if (map != NULL)
1567 {
1568 int j;
1569
1570 for (j = 0; j < childrows; j++)
1571 {
1572 HeapTuple newtup;
1573
1574 newtup = execute_attr_map_tuple(rows[numrows + j], map);
1575 heap_freetuple(rows[numrows + j]);
1576 rows[numrows + j] = newtup;
1577 }
1578 free_conversion_map(map);
1579 }
1580 }
1581
1582 /* And add to counts */
1583 numrows += childrows;
1584 *totalrows += trows;
1585 *totaldeadrows += tdrows;
1586 }
1587 }
1588
1589 /*
1590 * Note: we cannot release the child-table locks, since we may have
1591 * pointers to their TOAST tables in the sampled rows.
1592 */
1593 table_close(childrel, NoLock);
1594 pgstat_progress_update_param(PROGRESS_ANALYZE_CHILD_TABLES_DONE,
1595 i + 1);
1596 }
1597
1598 return numrows;
1599 }
1600
1601
1602 /*
1603 * update_attstats() -- update attribute statistics for one relation
1604 *
1605 * Statistics are stored in several places: the pg_class row for the
1606 * relation has stats about the whole relation, and there is a
1607 * pg_statistic row for each (non-system) attribute that has ever
1608 * been analyzed. The pg_class values are updated by VACUUM, not here.
1609 *
1610 * pg_statistic rows are just added or updated normally. This means
1611 * that pg_statistic will probably contain some deleted rows at the
1612 * completion of a vacuum cycle, unless it happens to get vacuumed last.
1613 *
1614 * To keep things simple, we punt for pg_statistic, and don't try
1615 * to compute or store rows for pg_statistic itself in pg_statistic.
1616 * This could possibly be made to work, but it's not worth the trouble.
1617 * Note analyze_rel() has seen to it that we won't come here when
1618 * vacuuming pg_statistic itself.
1619 *
1620 * Note: there would be a race condition here if two backends could
1621 * ANALYZE the same table concurrently. Presently, we lock that out
1622 * by taking a self-exclusive lock on the relation in analyze_rel().
1623 */
1624 static void
update_attstats(Oid relid,bool inh,int natts,VacAttrStats ** vacattrstats)1625 update_attstats(Oid relid, bool inh, int natts, VacAttrStats **vacattrstats)
1626 {
1627 Relation sd;
1628 int attno;
1629
1630 if (natts <= 0)
1631 return; /* nothing to do */
1632
1633 sd = table_open(StatisticRelationId, RowExclusiveLock);
1634
1635 for (attno = 0; attno < natts; attno++)
1636 {
1637 VacAttrStats *stats = vacattrstats[attno];
1638 HeapTuple stup,
1639 oldtup;
1640 int i,
1641 k,
1642 n;
1643 Datum values[Natts_pg_statistic];
1644 bool nulls[Natts_pg_statistic];
1645 bool replaces[Natts_pg_statistic];
1646
1647 /* Ignore attr if we weren't able to collect stats */
1648 if (!stats->stats_valid)
1649 continue;
1650
1651 /*
1652 * Construct a new pg_statistic tuple
1653 */
1654 for (i = 0; i < Natts_pg_statistic; ++i)
1655 {
1656 nulls[i] = false;
1657 replaces[i] = true;
1658 }
1659
1660 values[Anum_pg_statistic_starelid - 1] = ObjectIdGetDatum(relid);
1661 values[Anum_pg_statistic_staattnum - 1] = Int16GetDatum(stats->attr->attnum);
1662 values[Anum_pg_statistic_stainherit - 1] = BoolGetDatum(inh);
1663 values[Anum_pg_statistic_stanullfrac - 1] = Float4GetDatum(stats->stanullfrac);
1664 values[Anum_pg_statistic_stawidth - 1] = Int32GetDatum(stats->stawidth);
1665 values[Anum_pg_statistic_stadistinct - 1] = Float4GetDatum(stats->stadistinct);
1666 i = Anum_pg_statistic_stakind1 - 1;
1667 for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
1668 {
1669 values[i++] = Int16GetDatum(stats->stakind[k]); /* stakindN */
1670 }
1671 i = Anum_pg_statistic_staop1 - 1;
1672 for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
1673 {
1674 values[i++] = ObjectIdGetDatum(stats->staop[k]); /* staopN */
1675 }
1676 i = Anum_pg_statistic_stacoll1 - 1;
1677 for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
1678 {
1679 values[i++] = ObjectIdGetDatum(stats->stacoll[k]); /* stacollN */
1680 }
1681 i = Anum_pg_statistic_stanumbers1 - 1;
1682 for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
1683 {
1684 int nnum = stats->numnumbers[k];
1685
1686 if (nnum > 0)
1687 {
1688 Datum *numdatums = (Datum *) palloc(nnum * sizeof(Datum));
1689 ArrayType *arry;
1690
1691 for (n = 0; n < nnum; n++)
1692 numdatums[n] = Float4GetDatum(stats->stanumbers[k][n]);
1693 /* XXX knows more than it should about type float4: */
1694 arry = construct_array(numdatums, nnum,
1695 FLOAT4OID,
1696 sizeof(float4), true, TYPALIGN_INT);
1697 values[i++] = PointerGetDatum(arry); /* stanumbersN */
1698 }
1699 else
1700 {
1701 nulls[i] = true;
1702 values[i++] = (Datum) 0;
1703 }
1704 }
1705 i = Anum_pg_statistic_stavalues1 - 1;
1706 for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
1707 {
1708 if (stats->numvalues[k] > 0)
1709 {
1710 ArrayType *arry;
1711
1712 arry = construct_array(stats->stavalues[k],
1713 stats->numvalues[k],
1714 stats->statypid[k],
1715 stats->statyplen[k],
1716 stats->statypbyval[k],
1717 stats->statypalign[k]);
1718 values[i++] = PointerGetDatum(arry); /* stavaluesN */
1719 }
1720 else
1721 {
1722 nulls[i] = true;
1723 values[i++] = (Datum) 0;
1724 }
1725 }
1726
1727 /* Is there already a pg_statistic tuple for this attribute? */
1728 oldtup = SearchSysCache3(STATRELATTINH,
1729 ObjectIdGetDatum(relid),
1730 Int16GetDatum(stats->attr->attnum),
1731 BoolGetDatum(inh));
1732
1733 if (HeapTupleIsValid(oldtup))
1734 {
1735 /* Yes, replace it */
1736 stup = heap_modify_tuple(oldtup,
1737 RelationGetDescr(sd),
1738 values,
1739 nulls,
1740 replaces);
1741 ReleaseSysCache(oldtup);
1742 CatalogTupleUpdate(sd, &stup->t_self, stup);
1743 }
1744 else
1745 {
1746 /* No, insert new tuple */
1747 stup = heap_form_tuple(RelationGetDescr(sd), values, nulls);
1748 CatalogTupleInsert(sd, stup);
1749 }
1750
1751 heap_freetuple(stup);
1752 }
1753
1754 table_close(sd, RowExclusiveLock);
1755 }
1756
1757 /*
1758 * Standard fetch function for use by compute_stats subroutines.
1759 *
1760 * This exists to provide some insulation between compute_stats routines
1761 * and the actual storage of the sample data.
1762 */
1763 static Datum
std_fetch_func(VacAttrStatsP stats,int rownum,bool * isNull)1764 std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull)
1765 {
1766 int attnum = stats->tupattnum;
1767 HeapTuple tuple = stats->rows[rownum];
1768 TupleDesc tupDesc = stats->tupDesc;
1769
1770 return heap_getattr(tuple, attnum, tupDesc, isNull);
1771 }
1772
1773 /*
1774 * Fetch function for analyzing index expressions.
1775 *
1776 * We have not bothered to construct index tuples, instead the data is
1777 * just in Datum arrays.
1778 */
1779 static Datum
ind_fetch_func(VacAttrStatsP stats,int rownum,bool * isNull)1780 ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull)
1781 {
1782 int i;
1783
1784 /* exprvals and exprnulls are already offset for proper column */
1785 i = rownum * stats->rowstride;
1786 *isNull = stats->exprnulls[i];
1787 return stats->exprvals[i];
1788 }
1789
1790
1791 /*==========================================================================
1792 *
1793 * Code below this point represents the "standard" type-specific statistics
1794 * analysis algorithms. This code can be replaced on a per-data-type basis
1795 * by setting a nonzero value in pg_type.typanalyze.
1796 *
1797 *==========================================================================
1798 */
1799
1800
1801 /*
1802 * To avoid consuming too much memory during analysis and/or too much space
1803 * in the resulting pg_statistic rows, we ignore varlena datums that are wider
1804 * than WIDTH_THRESHOLD (after detoasting!). This is legitimate for MCV
1805 * and distinct-value calculations since a wide value is unlikely to be
1806 * duplicated at all, much less be a most-common value. For the same reason,
1807 * ignoring wide values will not affect our estimates of histogram bin
1808 * boundaries very much.
1809 */
1810 #define WIDTH_THRESHOLD 1024
1811
/*
 * In-place swap helpers.  These are macros, so each argument is evaluated
 * more than once --- avoid side effects in the arguments.
 */
#define swapInt(a,b)	do {int _tmp; _tmp=a; a=b; b=_tmp;} while(0)
#define swapDatum(a,b)	do {Datum _tmp; _tmp=a; a=b; b=_tmp;} while(0)

/*
 * Extra information used by the default analysis routines
 */
typedef struct
{
	int			count;			/* # of duplicates */
	int			first;			/* values[] index of first occurrence */
} ScalarMCVItem;

typedef struct
{
	SortSupport ssup;			/* sort support for the column's comparator */
	int		   *tupnoLink;		/* per-tuple state; presumably maintained by
								 * compare_scalars() via its arg --- confirm */
} CompareScalarsContext;
1829
1830
1831 static void compute_trivial_stats(VacAttrStatsP stats,
1832 AnalyzeAttrFetchFunc fetchfunc,
1833 int samplerows,
1834 double totalrows);
1835 static void compute_distinct_stats(VacAttrStatsP stats,
1836 AnalyzeAttrFetchFunc fetchfunc,
1837 int samplerows,
1838 double totalrows);
1839 static void compute_scalar_stats(VacAttrStatsP stats,
1840 AnalyzeAttrFetchFunc fetchfunc,
1841 int samplerows,
1842 double totalrows);
1843 static int compare_scalars(const void *a, const void *b, void *arg);
1844 static int compare_mcvs(const void *a, const void *b);
1845 static int analyze_mcv_list(int *mcv_counts,
1846 int num_mcv,
1847 double stadistinct,
1848 double stanullfrac,
1849 int samplerows,
1850 double totalrows);
1851
1852
1853 /*
1854 * std_typanalyze -- the default type-specific typanalyze function
1855 */
1856 bool
std_typanalyze(VacAttrStats * stats)1857 std_typanalyze(VacAttrStats *stats)
1858 {
1859 Form_pg_attribute attr = stats->attr;
1860 Oid ltopr;
1861 Oid eqopr;
1862 StdAnalyzeData *mystats;
1863
1864 /* If the attstattarget column is negative, use the default value */
1865 /* NB: it is okay to scribble on stats->attr since it's a copy */
1866 if (attr->attstattarget < 0)
1867 attr->attstattarget = default_statistics_target;
1868
1869 /* Look for default "<" and "=" operators for column's type */
1870 get_sort_group_operators(stats->attrtypid,
1871 false, false, false,
1872 <opr, &eqopr, NULL,
1873 NULL);
1874
1875 /* Save the operator info for compute_stats routines */
1876 mystats = (StdAnalyzeData *) palloc(sizeof(StdAnalyzeData));
1877 mystats->eqopr = eqopr;
1878 mystats->eqfunc = OidIsValid(eqopr) ? get_opcode(eqopr) : InvalidOid;
1879 mystats->ltopr = ltopr;
1880 stats->extra_data = mystats;
1881
1882 /*
1883 * Determine which standard statistics algorithm to use
1884 */
1885 if (OidIsValid(eqopr) && OidIsValid(ltopr))
1886 {
1887 /* Seems to be a scalar datatype */
1888 stats->compute_stats = compute_scalar_stats;
1889 /*--------------------
1890 * The following choice of minrows is based on the paper
1891 * "Random sampling for histogram construction: how much is enough?"
1892 * by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in
1893 * Proceedings of ACM SIGMOD International Conference on Management
1894 * of Data, 1998, Pages 436-447. Their Corollary 1 to Theorem 5
1895 * says that for table size n, histogram size k, maximum relative
1896 * error in bin size f, and error probability gamma, the minimum
1897 * random sample size is
1898 * r = 4 * k * ln(2*n/gamma) / f^2
1899 * Taking f = 0.5, gamma = 0.01, n = 10^6 rows, we obtain
1900 * r = 305.82 * k
1901 * Note that because of the log function, the dependence on n is
1902 * quite weak; even at n = 10^12, a 300*k sample gives <= 0.66
1903 * bin size error with probability 0.99. So there's no real need to
1904 * scale for n, which is a good thing because we don't necessarily
1905 * know it at this point.
1906 *--------------------
1907 */
1908 stats->minrows = 300 * attr->attstattarget;
1909 }
1910 else if (OidIsValid(eqopr))
1911 {
1912 /* We can still recognize distinct values */
1913 stats->compute_stats = compute_distinct_stats;
1914 /* Might as well use the same minrows as above */
1915 stats->minrows = 300 * attr->attstattarget;
1916 }
1917 else
1918 {
1919 /* Can't do much but the trivial stuff */
1920 stats->compute_stats = compute_trivial_stats;
1921 /* Might as well use the same minrows as above */
1922 stats->minrows = 300 * attr->attstattarget;
1923 }
1924
1925 return true;
1926 }
1927
1928
1929 /*
1930 * compute_trivial_stats() -- compute very basic column statistics
1931 *
1932 * We use this when we cannot find a hash "=" operator for the datatype.
1933 *
1934 * We determine the fraction of non-null rows and the average datum width.
1935 */
1936 static void
compute_trivial_stats(VacAttrStatsP stats,AnalyzeAttrFetchFunc fetchfunc,int samplerows,double totalrows)1937 compute_trivial_stats(VacAttrStatsP stats,
1938 AnalyzeAttrFetchFunc fetchfunc,
1939 int samplerows,
1940 double totalrows)
1941 {
1942 int i;
1943 int null_cnt = 0;
1944 int nonnull_cnt = 0;
1945 double total_width = 0;
1946 bool is_varlena = (!stats->attrtype->typbyval &&
1947 stats->attrtype->typlen == -1);
1948 bool is_varwidth = (!stats->attrtype->typbyval &&
1949 stats->attrtype->typlen < 0);
1950
1951 for (i = 0; i < samplerows; i++)
1952 {
1953 Datum value;
1954 bool isnull;
1955
1956 vacuum_delay_point();
1957
1958 value = fetchfunc(stats, i, &isnull);
1959
1960 /* Check for null/nonnull */
1961 if (isnull)
1962 {
1963 null_cnt++;
1964 continue;
1965 }
1966 nonnull_cnt++;
1967
1968 /*
1969 * If it's a variable-width field, add up widths for average width
1970 * calculation. Note that if the value is toasted, we use the toasted
1971 * width. We don't bother with this calculation if it's a fixed-width
1972 * type.
1973 */
1974 if (is_varlena)
1975 {
1976 total_width += VARSIZE_ANY(DatumGetPointer(value));
1977 }
1978 else if (is_varwidth)
1979 {
1980 /* must be cstring */
1981 total_width += strlen(DatumGetCString(value)) + 1;
1982 }
1983 }
1984
1985 /* We can only compute average width if we found some non-null values. */
1986 if (nonnull_cnt > 0)
1987 {
1988 stats->stats_valid = true;
1989 /* Do the simple null-frac and width stats */
1990 stats->stanullfrac = (double) null_cnt / (double) samplerows;
1991 if (is_varwidth)
1992 stats->stawidth = total_width / (double) nonnull_cnt;
1993 else
1994 stats->stawidth = stats->attrtype->typlen;
1995 stats->stadistinct = 0.0; /* "unknown" */
1996 }
1997 else if (null_cnt > 0)
1998 {
1999 /* We found only nulls; assume the column is entirely null */
2000 stats->stats_valid = true;
2001 stats->stanullfrac = 1.0;
2002 if (is_varwidth)
2003 stats->stawidth = 0; /* "unknown" */
2004 else
2005 stats->stawidth = stats->attrtype->typlen;
2006 stats->stadistinct = 0.0; /* "unknown" */
2007 }
2008 }
2009
2010
2011 /*
2012 * compute_distinct_stats() -- compute column statistics including ndistinct
2013 *
2014 * We use this when we can find only an "=" operator for the datatype.
2015 *
2016 * We determine the fraction of non-null rows, the average width, the
2017 * most common values, and the (estimated) number of distinct values.
2018 *
2019 * The most common values are determined by brute force: we keep a list
2020 * of previously seen values, ordered by number of times seen, as we scan
2021 * the samples. A newly seen value is inserted just after the last
2022 * multiply-seen value, causing the bottommost (oldest) singly-seen value
2023 * to drop off the list. The accuracy of this method, and also its cost,
2024 * depend mainly on the length of the list we are willing to keep.
2025 */
2026 static void
compute_distinct_stats(VacAttrStatsP stats,AnalyzeAttrFetchFunc fetchfunc,int samplerows,double totalrows)2027 compute_distinct_stats(VacAttrStatsP stats,
2028 AnalyzeAttrFetchFunc fetchfunc,
2029 int samplerows,
2030 double totalrows)
2031 {
2032 int i;
2033 int null_cnt = 0;
2034 int nonnull_cnt = 0;
2035 int toowide_cnt = 0;
2036 double total_width = 0;
2037 bool is_varlena = (!stats->attrtype->typbyval &&
2038 stats->attrtype->typlen == -1);
2039 bool is_varwidth = (!stats->attrtype->typbyval &&
2040 stats->attrtype->typlen < 0);
2041 FmgrInfo f_cmpeq;
2042 typedef struct
2043 {
2044 Datum value;
2045 int count;
2046 } TrackItem;
2047 TrackItem *track;
2048 int track_cnt,
2049 track_max;
2050 int num_mcv = stats->attr->attstattarget;
2051 StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data;
2052
2053 /*
2054 * We track up to 2*n values for an n-element MCV list; but at least 10
2055 */
2056 track_max = 2 * num_mcv;
2057 if (track_max < 10)
2058 track_max = 10;
2059 track = (TrackItem *) palloc(track_max * sizeof(TrackItem));
2060 track_cnt = 0;
2061
2062 fmgr_info(mystats->eqfunc, &f_cmpeq);
2063
2064 for (i = 0; i < samplerows; i++)
2065 {
2066 Datum value;
2067 bool isnull;
2068 bool match;
2069 int firstcount1,
2070 j;
2071
2072 vacuum_delay_point();
2073
2074 value = fetchfunc(stats, i, &isnull);
2075
2076 /* Check for null/nonnull */
2077 if (isnull)
2078 {
2079 null_cnt++;
2080 continue;
2081 }
2082 nonnull_cnt++;
2083
2084 /*
2085 * If it's a variable-width field, add up widths for average width
2086 * calculation. Note that if the value is toasted, we use the toasted
2087 * width. We don't bother with this calculation if it's a fixed-width
2088 * type.
2089 */
2090 if (is_varlena)
2091 {
2092 total_width += VARSIZE_ANY(DatumGetPointer(value));
2093
2094 /*
2095 * If the value is toasted, we want to detoast it just once to
2096 * avoid repeated detoastings and resultant excess memory usage
2097 * during the comparisons. Also, check to see if the value is
2098 * excessively wide, and if so don't detoast at all --- just
2099 * ignore the value.
2100 */
2101 if (toast_raw_datum_size(value) > WIDTH_THRESHOLD)
2102 {
2103 toowide_cnt++;
2104 continue;
2105 }
2106 value = PointerGetDatum(PG_DETOAST_DATUM(value));
2107 }
2108 else if (is_varwidth)
2109 {
2110 /* must be cstring */
2111 total_width += strlen(DatumGetCString(value)) + 1;
2112 }
2113
2114 /*
2115 * See if the value matches anything we're already tracking.
2116 */
2117 match = false;
2118 firstcount1 = track_cnt;
2119 for (j = 0; j < track_cnt; j++)
2120 {
2121 if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
2122 stats->attrcollid,
2123 value, track[j].value)))
2124 {
2125 match = true;
2126 break;
2127 }
2128 if (j < firstcount1 && track[j].count == 1)
2129 firstcount1 = j;
2130 }
2131
2132 if (match)
2133 {
2134 /* Found a match */
2135 track[j].count++;
2136 /* This value may now need to "bubble up" in the track list */
2137 while (j > 0 && track[j].count > track[j - 1].count)
2138 {
2139 swapDatum(track[j].value, track[j - 1].value);
2140 swapInt(track[j].count, track[j - 1].count);
2141 j--;
2142 }
2143 }
2144 else
2145 {
2146 /* No match. Insert at head of count-1 list */
2147 if (track_cnt < track_max)
2148 track_cnt++;
2149 for (j = track_cnt - 1; j > firstcount1; j--)
2150 {
2151 track[j].value = track[j - 1].value;
2152 track[j].count = track[j - 1].count;
2153 }
2154 if (firstcount1 < track_cnt)
2155 {
2156 track[firstcount1].value = value;
2157 track[firstcount1].count = 1;
2158 }
2159 }
2160 }
2161
2162 /* We can only compute real stats if we found some non-null values. */
2163 if (nonnull_cnt > 0)
2164 {
2165 int nmultiple,
2166 summultiple;
2167
2168 stats->stats_valid = true;
2169 /* Do the simple null-frac and width stats */
2170 stats->stanullfrac = (double) null_cnt / (double) samplerows;
2171 if (is_varwidth)
2172 stats->stawidth = total_width / (double) nonnull_cnt;
2173 else
2174 stats->stawidth = stats->attrtype->typlen;
2175
2176 /* Count the number of values we found multiple times */
2177 summultiple = 0;
2178 for (nmultiple = 0; nmultiple < track_cnt; nmultiple++)
2179 {
2180 if (track[nmultiple].count == 1)
2181 break;
2182 summultiple += track[nmultiple].count;
2183 }
2184
2185 if (nmultiple == 0)
2186 {
2187 /*
2188 * If we found no repeated non-null values, assume it's a unique
2189 * column; but be sure to discount for any nulls we found.
2190 */
2191 stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
2192 }
2193 else if (track_cnt < track_max && toowide_cnt == 0 &&
2194 nmultiple == track_cnt)
2195 {
2196 /*
2197 * Our track list includes every value in the sample, and every
2198 * value appeared more than once. Assume the column has just
2199 * these values. (This case is meant to address columns with
2200 * small, fixed sets of possible values, such as boolean or enum
2201 * columns. If there are any values that appear just once in the
2202 * sample, including too-wide values, we should assume that that's
2203 * not what we're dealing with.)
2204 */
2205 stats->stadistinct = track_cnt;
2206 }
2207 else
2208 {
2209 /*----------
2210 * Estimate the number of distinct values using the estimator
2211 * proposed by Haas and Stokes in IBM Research Report RJ 10025:
2212 * n*d / (n - f1 + f1*n/N)
2213 * where f1 is the number of distinct values that occurred
2214 * exactly once in our sample of n rows (from a total of N),
2215 * and d is the total number of distinct values in the sample.
2216 * This is their Duj1 estimator; the other estimators they
2217 * recommend are considerably more complex, and are numerically
2218 * very unstable when n is much smaller than N.
2219 *
2220 * In this calculation, we consider only non-nulls. We used to
2221 * include rows with null values in the n and N counts, but that
2222 * leads to inaccurate answers in columns with many nulls, and
2223 * it's intuitively bogus anyway considering the desired result is
2224 * the number of distinct non-null values.
2225 *
2226 * We assume (not very reliably!) that all the multiply-occurring
2227 * values are reflected in the final track[] list, and the other
2228 * nonnull values all appeared but once. (XXX this usually
2229 * results in a drastic overestimate of ndistinct. Can we do
2230 * any better?)
2231 *----------
2232 */
2233 int f1 = nonnull_cnt - summultiple;
2234 int d = f1 + nmultiple;
2235 double n = samplerows - null_cnt;
2236 double N = totalrows * (1.0 - stats->stanullfrac);
2237 double stadistinct;
2238
2239 /* N == 0 shouldn't happen, but just in case ... */
2240 if (N > 0)
2241 stadistinct = (n * d) / ((n - f1) + f1 * n / N);
2242 else
2243 stadistinct = 0;
2244
2245 /* Clamp to sane range in case of roundoff error */
2246 if (stadistinct < d)
2247 stadistinct = d;
2248 if (stadistinct > N)
2249 stadistinct = N;
2250 /* And round to integer */
2251 stats->stadistinct = floor(stadistinct + 0.5);
2252 }
2253
2254 /*
2255 * If we estimated the number of distinct values at more than 10% of
2256 * the total row count (a very arbitrary limit), then assume that
2257 * stadistinct should scale with the row count rather than be a fixed
2258 * value.
2259 */
2260 if (stats->stadistinct > 0.1 * totalrows)
2261 stats->stadistinct = -(stats->stadistinct / totalrows);
2262
2263 /*
2264 * Decide how many values are worth storing as most-common values. If
2265 * we are able to generate a complete MCV list (all the values in the
2266 * sample will fit, and we think these are all the ones in the table),
2267 * then do so. Otherwise, store only those values that are
2268 * significantly more common than the values not in the list.
2269 *
2270 * Note: the first of these cases is meant to address columns with
2271 * small, fixed sets of possible values, such as boolean or enum
2272 * columns. If we can *completely* represent the column population by
2273 * an MCV list that will fit into the stats target, then we should do
2274 * so and thus provide the planner with complete information. But if
2275 * the MCV list is not complete, it's generally worth being more
2276 * selective, and not just filling it all the way up to the stats
2277 * target.
2278 */
2279 if (track_cnt < track_max && toowide_cnt == 0 &&
2280 stats->stadistinct > 0 &&
2281 track_cnt <= num_mcv)
2282 {
2283 /* Track list includes all values seen, and all will fit */
2284 num_mcv = track_cnt;
2285 }
2286 else
2287 {
2288 int *mcv_counts;
2289
2290 /* Incomplete list; decide how many values are worth keeping */
2291 if (num_mcv > track_cnt)
2292 num_mcv = track_cnt;
2293
2294 if (num_mcv > 0)
2295 {
2296 mcv_counts = (int *) palloc(num_mcv * sizeof(int));
2297 for (i = 0; i < num_mcv; i++)
2298 mcv_counts[i] = track[i].count;
2299
2300 num_mcv = analyze_mcv_list(mcv_counts, num_mcv,
2301 stats->stadistinct,
2302 stats->stanullfrac,
2303 samplerows, totalrows);
2304 }
2305 }
2306
2307 /* Generate MCV slot entry */
2308 if (num_mcv > 0)
2309 {
2310 MemoryContext old_context;
2311 Datum *mcv_values;
2312 float4 *mcv_freqs;
2313
2314 /* Must copy the target values into anl_context */
2315 old_context = MemoryContextSwitchTo(stats->anl_context);
2316 mcv_values = (Datum *) palloc(num_mcv * sizeof(Datum));
2317 mcv_freqs = (float4 *) palloc(num_mcv * sizeof(float4));
2318 for (i = 0; i < num_mcv; i++)
2319 {
2320 mcv_values[i] = datumCopy(track[i].value,
2321 stats->attrtype->typbyval,
2322 stats->attrtype->typlen);
2323 mcv_freqs[i] = (double) track[i].count / (double) samplerows;
2324 }
2325 MemoryContextSwitchTo(old_context);
2326
2327 stats->stakind[0] = STATISTIC_KIND_MCV;
2328 stats->staop[0] = mystats->eqopr;
2329 stats->stacoll[0] = stats->attrcollid;
2330 stats->stanumbers[0] = mcv_freqs;
2331 stats->numnumbers[0] = num_mcv;
2332 stats->stavalues[0] = mcv_values;
2333 stats->numvalues[0] = num_mcv;
2334
2335 /*
2336 * Accept the defaults for stats->statypid and others. They have
2337 * been set before we were called (see vacuum.h)
2338 */
2339 }
2340 }
2341 else if (null_cnt > 0)
2342 {
2343 /* We found only nulls; assume the column is entirely null */
2344 stats->stats_valid = true;
2345 stats->stanullfrac = 1.0;
2346 if (is_varwidth)
2347 stats->stawidth = 0; /* "unknown" */
2348 else
2349 stats->stawidth = stats->attrtype->typlen;
2350 stats->stadistinct = 0.0; /* "unknown" */
2351 }
2352
2353 /* We don't need to bother cleaning up any of our temporary palloc's */
2354 }
2355
2356
2357 /*
2358 * compute_scalar_stats() -- compute column statistics
2359 *
2360 * We use this when we can find "=" and "<" operators for the datatype.
2361 *
2362 * We determine the fraction of non-null rows, the average width, the
2363 * most common values, the (estimated) number of distinct values, the
2364 * distribution histogram, and the correlation of physical to logical order.
2365 *
2366 * The desired stats can be determined fairly easily after sorting the
2367 * data values into order.
2368 */
2369 static void
compute_scalar_stats(VacAttrStatsP stats,AnalyzeAttrFetchFunc fetchfunc,int samplerows,double totalrows)2370 compute_scalar_stats(VacAttrStatsP stats,
2371 AnalyzeAttrFetchFunc fetchfunc,
2372 int samplerows,
2373 double totalrows)
2374 {
2375 int i;
2376 int null_cnt = 0;
2377 int nonnull_cnt = 0;
2378 int toowide_cnt = 0;
2379 double total_width = 0;
2380 bool is_varlena = (!stats->attrtype->typbyval &&
2381 stats->attrtype->typlen == -1);
2382 bool is_varwidth = (!stats->attrtype->typbyval &&
2383 stats->attrtype->typlen < 0);
2384 double corr_xysum;
2385 SortSupportData ssup;
2386 ScalarItem *values;
2387 int values_cnt = 0;
2388 int *tupnoLink;
2389 ScalarMCVItem *track;
2390 int track_cnt = 0;
2391 int num_mcv = stats->attr->attstattarget;
2392 int num_bins = stats->attr->attstattarget;
2393 StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data;
2394
2395 values = (ScalarItem *) palloc(samplerows * sizeof(ScalarItem));
2396 tupnoLink = (int *) palloc(samplerows * sizeof(int));
2397 track = (ScalarMCVItem *) palloc(num_mcv * sizeof(ScalarMCVItem));
2398
2399 memset(&ssup, 0, sizeof(ssup));
2400 ssup.ssup_cxt = CurrentMemoryContext;
2401 ssup.ssup_collation = stats->attrcollid;
2402 ssup.ssup_nulls_first = false;
2403
2404 /*
2405 * For now, don't perform abbreviated key conversion, because full values
2406 * are required for MCV slot generation. Supporting that optimization
2407 * would necessitate teaching compare_scalars() to call a tie-breaker.
2408 */
2409 ssup.abbreviate = false;
2410
2411 PrepareSortSupportFromOrderingOp(mystats->ltopr, &ssup);
2412
2413 /* Initial scan to find sortable values */
2414 for (i = 0; i < samplerows; i++)
2415 {
2416 Datum value;
2417 bool isnull;
2418
2419 vacuum_delay_point();
2420
2421 value = fetchfunc(stats, i, &isnull);
2422
2423 /* Check for null/nonnull */
2424 if (isnull)
2425 {
2426 null_cnt++;
2427 continue;
2428 }
2429 nonnull_cnt++;
2430
2431 /*
2432 * If it's a variable-width field, add up widths for average width
2433 * calculation. Note that if the value is toasted, we use the toasted
2434 * width. We don't bother with this calculation if it's a fixed-width
2435 * type.
2436 */
2437 if (is_varlena)
2438 {
2439 total_width += VARSIZE_ANY(DatumGetPointer(value));
2440
2441 /*
2442 * If the value is toasted, we want to detoast it just once to
2443 * avoid repeated detoastings and resultant excess memory usage
2444 * during the comparisons. Also, check to see if the value is
2445 * excessively wide, and if so don't detoast at all --- just
2446 * ignore the value.
2447 */
2448 if (toast_raw_datum_size(value) > WIDTH_THRESHOLD)
2449 {
2450 toowide_cnt++;
2451 continue;
2452 }
2453 value = PointerGetDatum(PG_DETOAST_DATUM(value));
2454 }
2455 else if (is_varwidth)
2456 {
2457 /* must be cstring */
2458 total_width += strlen(DatumGetCString(value)) + 1;
2459 }
2460
2461 /* Add it to the list to be sorted */
2462 values[values_cnt].value = value;
2463 values[values_cnt].tupno = values_cnt;
2464 tupnoLink[values_cnt] = values_cnt;
2465 values_cnt++;
2466 }
2467
2468 /* We can only compute real stats if we found some sortable values. */
2469 if (values_cnt > 0)
2470 {
2471 int ndistinct, /* # distinct values in sample */
2472 nmultiple, /* # that appear multiple times */
2473 num_hist,
2474 dups_cnt;
2475 int slot_idx = 0;
2476 CompareScalarsContext cxt;
2477
2478 /* Sort the collected values */
2479 cxt.ssup = &ssup;
2480 cxt.tupnoLink = tupnoLink;
2481 qsort_arg((void *) values, values_cnt, sizeof(ScalarItem),
2482 compare_scalars, (void *) &cxt);
2483
2484 /*
2485 * Now scan the values in order, find the most common ones, and also
2486 * accumulate ordering-correlation statistics.
2487 *
2488 * To determine which are most common, we first have to count the
2489 * number of duplicates of each value. The duplicates are adjacent in
2490 * the sorted list, so a brute-force approach is to compare successive
2491 * datum values until we find two that are not equal. However, that
2492 * requires N-1 invocations of the datum comparison routine, which are
2493 * completely redundant with work that was done during the sort. (The
2494 * sort algorithm must at some point have compared each pair of items
2495 * that are adjacent in the sorted order; otherwise it could not know
2496 * that it's ordered the pair correctly.) We exploit this by having
2497 * compare_scalars remember the highest tupno index that each
2498 * ScalarItem has been found equal to. At the end of the sort, a
2499 * ScalarItem's tupnoLink will still point to itself if and only if it
2500 * is the last item of its group of duplicates (since the group will
2501 * be ordered by tupno).
2502 */
2503 corr_xysum = 0;
2504 ndistinct = 0;
2505 nmultiple = 0;
2506 dups_cnt = 0;
2507 for (i = 0; i < values_cnt; i++)
2508 {
2509 int tupno = values[i].tupno;
2510
2511 corr_xysum += ((double) i) * ((double) tupno);
2512 dups_cnt++;
2513 if (tupnoLink[tupno] == tupno)
2514 {
2515 /* Reached end of duplicates of this value */
2516 ndistinct++;
2517 if (dups_cnt > 1)
2518 {
2519 nmultiple++;
2520 if (track_cnt < num_mcv ||
2521 dups_cnt > track[track_cnt - 1].count)
2522 {
2523 /*
2524 * Found a new item for the mcv list; find its
2525 * position, bubbling down old items if needed. Loop
2526 * invariant is that j points at an empty/ replaceable
2527 * slot.
2528 */
2529 int j;
2530
2531 if (track_cnt < num_mcv)
2532 track_cnt++;
2533 for (j = track_cnt - 1; j > 0; j--)
2534 {
2535 if (dups_cnt <= track[j - 1].count)
2536 break;
2537 track[j].count = track[j - 1].count;
2538 track[j].first = track[j - 1].first;
2539 }
2540 track[j].count = dups_cnt;
2541 track[j].first = i + 1 - dups_cnt;
2542 }
2543 }
2544 dups_cnt = 0;
2545 }
2546 }
2547
2548 stats->stats_valid = true;
2549 /* Do the simple null-frac and width stats */
2550 stats->stanullfrac = (double) null_cnt / (double) samplerows;
2551 if (is_varwidth)
2552 stats->stawidth = total_width / (double) nonnull_cnt;
2553 else
2554 stats->stawidth = stats->attrtype->typlen;
2555
2556 if (nmultiple == 0)
2557 {
2558 /*
2559 * If we found no repeated non-null values, assume it's a unique
2560 * column; but be sure to discount for any nulls we found.
2561 */
2562 stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
2563 }
2564 else if (toowide_cnt == 0 && nmultiple == ndistinct)
2565 {
2566 /*
2567 * Every value in the sample appeared more than once. Assume the
2568 * column has just these values. (This case is meant to address
2569 * columns with small, fixed sets of possible values, such as
2570 * boolean or enum columns. If there are any values that appear
2571 * just once in the sample, including too-wide values, we should
2572 * assume that that's not what we're dealing with.)
2573 */
2574 stats->stadistinct = ndistinct;
2575 }
2576 else
2577 {
2578 /*----------
2579 * Estimate the number of distinct values using the estimator
2580 * proposed by Haas and Stokes in IBM Research Report RJ 10025:
2581 * n*d / (n - f1 + f1*n/N)
2582 * where f1 is the number of distinct values that occurred
2583 * exactly once in our sample of n rows (from a total of N),
2584 * and d is the total number of distinct values in the sample.
2585 * This is their Duj1 estimator; the other estimators they
2586 * recommend are considerably more complex, and are numerically
2587 * very unstable when n is much smaller than N.
2588 *
2589 * In this calculation, we consider only non-nulls. We used to
2590 * include rows with null values in the n and N counts, but that
2591 * leads to inaccurate answers in columns with many nulls, and
2592 * it's intuitively bogus anyway considering the desired result is
2593 * the number of distinct non-null values.
2594 *
2595 * Overwidth values are assumed to have been distinct.
2596 *----------
2597 */
2598 int f1 = ndistinct - nmultiple + toowide_cnt;
2599 int d = f1 + nmultiple;
2600 double n = samplerows - null_cnt;
2601 double N = totalrows * (1.0 - stats->stanullfrac);
2602 double stadistinct;
2603
2604 /* N == 0 shouldn't happen, but just in case ... */
2605 if (N > 0)
2606 stadistinct = (n * d) / ((n - f1) + f1 * n / N);
2607 else
2608 stadistinct = 0;
2609
2610 /* Clamp to sane range in case of roundoff error */
2611 if (stadistinct < d)
2612 stadistinct = d;
2613 if (stadistinct > N)
2614 stadistinct = N;
2615 /* And round to integer */
2616 stats->stadistinct = floor(stadistinct + 0.5);
2617 }
2618
2619 /*
2620 * If we estimated the number of distinct values at more than 10% of
2621 * the total row count (a very arbitrary limit), then assume that
2622 * stadistinct should scale with the row count rather than be a fixed
2623 * value.
2624 */
2625 if (stats->stadistinct > 0.1 * totalrows)
2626 stats->stadistinct = -(stats->stadistinct / totalrows);
2627
2628 /*
2629 * Decide how many values are worth storing as most-common values. If
2630 * we are able to generate a complete MCV list (all the values in the
2631 * sample will fit, and we think these are all the ones in the table),
2632 * then do so. Otherwise, store only those values that are
2633 * significantly more common than the values not in the list.
2634 *
2635 * Note: the first of these cases is meant to address columns with
2636 * small, fixed sets of possible values, such as boolean or enum
2637 * columns. If we can *completely* represent the column population by
2638 * an MCV list that will fit into the stats target, then we should do
2639 * so and thus provide the planner with complete information. But if
2640 * the MCV list is not complete, it's generally worth being more
2641 * selective, and not just filling it all the way up to the stats
2642 * target.
2643 */
2644 if (track_cnt == ndistinct && toowide_cnt == 0 &&
2645 stats->stadistinct > 0 &&
2646 track_cnt <= num_mcv)
2647 {
2648 /* Track list includes all values seen, and all will fit */
2649 num_mcv = track_cnt;
2650 }
2651 else
2652 {
2653 int *mcv_counts;
2654
2655 /* Incomplete list; decide how many values are worth keeping */
2656 if (num_mcv > track_cnt)
2657 num_mcv = track_cnt;
2658
2659 if (num_mcv > 0)
2660 {
2661 mcv_counts = (int *) palloc(num_mcv * sizeof(int));
2662 for (i = 0; i < num_mcv; i++)
2663 mcv_counts[i] = track[i].count;
2664
2665 num_mcv = analyze_mcv_list(mcv_counts, num_mcv,
2666 stats->stadistinct,
2667 stats->stanullfrac,
2668 samplerows, totalrows);
2669 }
2670 }
2671
2672 /* Generate MCV slot entry */
2673 if (num_mcv > 0)
2674 {
2675 MemoryContext old_context;
2676 Datum *mcv_values;
2677 float4 *mcv_freqs;
2678
2679 /* Must copy the target values into anl_context */
2680 old_context = MemoryContextSwitchTo(stats->anl_context);
2681 mcv_values = (Datum *) palloc(num_mcv * sizeof(Datum));
2682 mcv_freqs = (float4 *) palloc(num_mcv * sizeof(float4));
2683 for (i = 0; i < num_mcv; i++)
2684 {
2685 mcv_values[i] = datumCopy(values[track[i].first].value,
2686 stats->attrtype->typbyval,
2687 stats->attrtype->typlen);
2688 mcv_freqs[i] = (double) track[i].count / (double) samplerows;
2689 }
2690 MemoryContextSwitchTo(old_context);
2691
2692 stats->stakind[slot_idx] = STATISTIC_KIND_MCV;
2693 stats->staop[slot_idx] = mystats->eqopr;
2694 stats->stacoll[slot_idx] = stats->attrcollid;
2695 stats->stanumbers[slot_idx] = mcv_freqs;
2696 stats->numnumbers[slot_idx] = num_mcv;
2697 stats->stavalues[slot_idx] = mcv_values;
2698 stats->numvalues[slot_idx] = num_mcv;
2699
2700 /*
2701 * Accept the defaults for stats->statypid and others. They have
2702 * been set before we were called (see vacuum.h)
2703 */
2704 slot_idx++;
2705 }
2706
2707 /*
2708 * Generate a histogram slot entry if there are at least two distinct
2709 * values not accounted for in the MCV list. (This ensures the
2710 * histogram won't collapse to empty or a singleton.)
2711 */
2712 num_hist = ndistinct - num_mcv;
2713 if (num_hist > num_bins)
2714 num_hist = num_bins + 1;
2715 if (num_hist >= 2)
2716 {
2717 MemoryContext old_context;
2718 Datum *hist_values;
2719 int nvals;
2720 int pos,
2721 posfrac,
2722 delta,
2723 deltafrac;
2724
2725 /* Sort the MCV items into position order to speed next loop */
2726 qsort((void *) track, num_mcv,
2727 sizeof(ScalarMCVItem), compare_mcvs);
2728
2729 /*
2730 * Collapse out the MCV items from the values[] array.
2731 *
2732 * Note we destroy the values[] array here... but we don't need it
2733 * for anything more. We do, however, still need values_cnt.
2734 * nvals will be the number of remaining entries in values[].
2735 */
2736 if (num_mcv > 0)
2737 {
2738 int src,
2739 dest;
2740 int j;
2741
2742 src = dest = 0;
2743 j = 0; /* index of next interesting MCV item */
2744 while (src < values_cnt)
2745 {
2746 int ncopy;
2747
2748 if (j < num_mcv)
2749 {
2750 int first = track[j].first;
2751
2752 if (src >= first)
2753 {
2754 /* advance past this MCV item */
2755 src = first + track[j].count;
2756 j++;
2757 continue;
2758 }
2759 ncopy = first - src;
2760 }
2761 else
2762 ncopy = values_cnt - src;
2763 memmove(&values[dest], &values[src],
2764 ncopy * sizeof(ScalarItem));
2765 src += ncopy;
2766 dest += ncopy;
2767 }
2768 nvals = dest;
2769 }
2770 else
2771 nvals = values_cnt;
2772 Assert(nvals >= num_hist);
2773
2774 /* Must copy the target values into anl_context */
2775 old_context = MemoryContextSwitchTo(stats->anl_context);
2776 hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
2777
2778 /*
2779 * The object of this loop is to copy the first and last values[]
2780 * entries along with evenly-spaced values in between. So the
2781 * i'th value is values[(i * (nvals - 1)) / (num_hist - 1)]. But
2782 * computing that subscript directly risks integer overflow when
2783 * the stats target is more than a couple thousand. Instead we
2784 * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
2785 * the integral and fractional parts of the sum separately.
2786 */
2787 delta = (nvals - 1) / (num_hist - 1);
2788 deltafrac = (nvals - 1) % (num_hist - 1);
2789 pos = posfrac = 0;
2790
2791 for (i = 0; i < num_hist; i++)
2792 {
2793 hist_values[i] = datumCopy(values[pos].value,
2794 stats->attrtype->typbyval,
2795 stats->attrtype->typlen);
2796 pos += delta;
2797 posfrac += deltafrac;
2798 if (posfrac >= (num_hist - 1))
2799 {
2800 /* fractional part exceeds 1, carry to integer part */
2801 pos++;
2802 posfrac -= (num_hist - 1);
2803 }
2804 }
2805
2806 MemoryContextSwitchTo(old_context);
2807
2808 stats->stakind[slot_idx] = STATISTIC_KIND_HISTOGRAM;
2809 stats->staop[slot_idx] = mystats->ltopr;
2810 stats->stacoll[slot_idx] = stats->attrcollid;
2811 stats->stavalues[slot_idx] = hist_values;
2812 stats->numvalues[slot_idx] = num_hist;
2813
2814 /*
2815 * Accept the defaults for stats->statypid and others. They have
2816 * been set before we were called (see vacuum.h)
2817 */
2818 slot_idx++;
2819 }
2820
2821 /* Generate a correlation entry if there are multiple values */
2822 if (values_cnt > 1)
2823 {
2824 MemoryContext old_context;
2825 float4 *corrs;
2826 double corr_xsum,
2827 corr_x2sum;
2828
2829 /* Must copy the target values into anl_context */
2830 old_context = MemoryContextSwitchTo(stats->anl_context);
2831 corrs = (float4 *) palloc(sizeof(float4));
2832 MemoryContextSwitchTo(old_context);
2833
2834 /*----------
2835 * Since we know the x and y value sets are both
2836 * 0, 1, ..., values_cnt-1
2837 * we have sum(x) = sum(y) =
2838 * (values_cnt-1)*values_cnt / 2
2839 * and sum(x^2) = sum(y^2) =
2840 * (values_cnt-1)*values_cnt*(2*values_cnt-1) / 6.
2841 *----------
2842 */
2843 corr_xsum = ((double) (values_cnt - 1)) *
2844 ((double) values_cnt) / 2.0;
2845 corr_x2sum = ((double) (values_cnt - 1)) *
2846 ((double) values_cnt) * (double) (2 * values_cnt - 1) / 6.0;
2847
2848 /* And the correlation coefficient reduces to */
2849 corrs[0] = (values_cnt * corr_xysum - corr_xsum * corr_xsum) /
2850 (values_cnt * corr_x2sum - corr_xsum * corr_xsum);
2851
2852 stats->stakind[slot_idx] = STATISTIC_KIND_CORRELATION;
2853 stats->staop[slot_idx] = mystats->ltopr;
2854 stats->stacoll[slot_idx] = stats->attrcollid;
2855 stats->stanumbers[slot_idx] = corrs;
2856 stats->numnumbers[slot_idx] = 1;
2857 slot_idx++;
2858 }
2859 }
2860 else if (nonnull_cnt > 0)
2861 {
2862 /* We found some non-null values, but they were all too wide */
2863 Assert(nonnull_cnt == toowide_cnt);
2864 stats->stats_valid = true;
2865 /* Do the simple null-frac and width stats */
2866 stats->stanullfrac = (double) null_cnt / (double) samplerows;
2867 if (is_varwidth)
2868 stats->stawidth = total_width / (double) nonnull_cnt;
2869 else
2870 stats->stawidth = stats->attrtype->typlen;
2871 /* Assume all too-wide values are distinct, so it's a unique column */
2872 stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
2873 }
2874 else if (null_cnt > 0)
2875 {
2876 /* We found only nulls; assume the column is entirely null */
2877 stats->stats_valid = true;
2878 stats->stanullfrac = 1.0;
2879 if (is_varwidth)
2880 stats->stawidth = 0; /* "unknown" */
2881 else
2882 stats->stawidth = stats->attrtype->typlen;
2883 stats->stadistinct = 0.0; /* "unknown" */
2884 }
2885
2886 /* We don't need to bother cleaning up any of our temporary palloc's */
2887 }
2888
2889 /*
2890 * qsort_arg comparator for sorting ScalarItems
2891 *
2892 * Aside from sorting the items, we update the tupnoLink[] array
2893 * whenever two ScalarItems are found to contain equal datums. The array
2894 * is indexed by tupno; for each ScalarItem, it contains the highest
2895 * tupno that that item's datum has been found to be equal to. This allows
2896 * us to avoid additional comparisons in compute_scalar_stats().
2897 */
2898 static int
compare_scalars(const void * a,const void * b,void * arg)2899 compare_scalars(const void *a, const void *b, void *arg)
2900 {
2901 Datum da = ((const ScalarItem *) a)->value;
2902 int ta = ((const ScalarItem *) a)->tupno;
2903 Datum db = ((const ScalarItem *) b)->value;
2904 int tb = ((const ScalarItem *) b)->tupno;
2905 CompareScalarsContext *cxt = (CompareScalarsContext *) arg;
2906 int compare;
2907
2908 compare = ApplySortComparator(da, false, db, false, cxt->ssup);
2909 if (compare != 0)
2910 return compare;
2911
2912 /*
2913 * The two datums are equal, so update cxt->tupnoLink[].
2914 */
2915 if (cxt->tupnoLink[ta] < tb)
2916 cxt->tupnoLink[ta] = tb;
2917 if (cxt->tupnoLink[tb] < ta)
2918 cxt->tupnoLink[tb] = ta;
2919
2920 /*
2921 * For equal datums, sort by tupno
2922 */
2923 return ta - tb;
2924 }
2925
2926 /*
2927 * qsort comparator for sorting ScalarMCVItems by position
2928 */
2929 static int
compare_mcvs(const void * a,const void * b)2930 compare_mcvs(const void *a, const void *b)
2931 {
2932 int da = ((const ScalarMCVItem *) a)->first;
2933 int db = ((const ScalarMCVItem *) b)->first;
2934
2935 return da - db;
2936 }
2937
2938 /*
2939 * Analyze the list of common values in the sample and decide how many are
2940 * worth storing in the table's MCV list.
2941 *
2942 * mcv_counts is assumed to be a list of the counts of the most common values
2943 * seen in the sample, starting with the most common. The return value is the
2944 * number that are significantly more common than the values not in the list,
2945 * and which are therefore deemed worth storing in the table's MCV list.
2946 */
2947 static int
analyze_mcv_list(int * mcv_counts,int num_mcv,double stadistinct,double stanullfrac,int samplerows,double totalrows)2948 analyze_mcv_list(int *mcv_counts,
2949 int num_mcv,
2950 double stadistinct,
2951 double stanullfrac,
2952 int samplerows,
2953 double totalrows)
2954 {
2955 double ndistinct_table;
2956 double sumcount;
2957 int i;
2958
2959 /*
2960 * If the entire table was sampled, keep the whole list. This also
2961 * protects us against division by zero in the code below.
2962 */
2963 if (samplerows == totalrows || totalrows <= 1.0)
2964 return num_mcv;
2965
2966 /* Re-extract the estimated number of distinct nonnull values in table */
2967 ndistinct_table = stadistinct;
2968 if (ndistinct_table < 0)
2969 ndistinct_table = -ndistinct_table * totalrows;
2970
2971 /*
2972 * Exclude the least common values from the MCV list, if they are not
2973 * significantly more common than the estimated selectivity they would
2974 * have if they weren't in the list. All non-MCV values are assumed to be
2975 * equally common, after taking into account the frequencies of all the
2976 * values in the MCV list and the number of nulls (c.f. eqsel()).
2977 *
2978 * Here sumcount tracks the total count of all but the last (least common)
2979 * value in the MCV list, allowing us to determine the effect of excluding
2980 * that value from the list.
2981 *
2982 * Note that we deliberately do this by removing values from the full
2983 * list, rather than starting with an empty list and adding values,
2984 * because the latter approach can fail to add any values if all the most
2985 * common values have around the same frequency and make up the majority
2986 * of the table, so that the overall average frequency of all values is
2987 * roughly the same as that of the common values. This would lead to any
2988 * uncommon values being significantly overestimated.
2989 */
2990 sumcount = 0.0;
2991 for (i = 0; i < num_mcv - 1; i++)
2992 sumcount += mcv_counts[i];
2993
2994 while (num_mcv > 0)
2995 {
2996 double selec,
2997 otherdistinct,
2998 N,
2999 n,
3000 K,
3001 variance,
3002 stddev;
3003
3004 /*
3005 * Estimated selectivity the least common value would have if it
3006 * wasn't in the MCV list (c.f. eqsel()).
3007 */
3008 selec = 1.0 - sumcount / samplerows - stanullfrac;
3009 if (selec < 0.0)
3010 selec = 0.0;
3011 if (selec > 1.0)
3012 selec = 1.0;
3013 otherdistinct = ndistinct_table - (num_mcv - 1);
3014 if (otherdistinct > 1)
3015 selec /= otherdistinct;
3016
3017 /*
3018 * If the value is kept in the MCV list, its population frequency is
3019 * assumed to equal its sample frequency. We use the lower end of a
3020 * textbook continuity-corrected Wald-type confidence interval to
3021 * determine if that is significantly more common than the non-MCV
3022 * frequency --- specifically we assume the population frequency is
3023 * highly likely to be within around 2 standard errors of the sample
3024 * frequency, which equates to an interval of 2 standard deviations
3025 * either side of the sample count, plus an additional 0.5 for the
3026 * continuity correction. Since we are sampling without replacement,
3027 * this is a hypergeometric distribution.
3028 *
3029 * XXX: Empirically, this approach seems to work quite well, but it
3030 * may be worth considering more advanced techniques for estimating
3031 * the confidence interval of the hypergeometric distribution.
3032 */
3033 N = totalrows;
3034 n = samplerows;
3035 K = N * mcv_counts[num_mcv - 1] / n;
3036 variance = n * K * (N - K) * (N - n) / (N * N * (N - 1));
3037 stddev = sqrt(variance);
3038
3039 if (mcv_counts[num_mcv - 1] > selec * samplerows + 2 * stddev + 0.5)
3040 {
3041 /*
3042 * The value is significantly more common than the non-MCV
3043 * selectivity would suggest. Keep it, and all the other more
3044 * common values in the list.
3045 */
3046 break;
3047 }
3048 else
3049 {
3050 /* Discard this value and consider the next least common value */
3051 num_mcv--;
3052 if (num_mcv == 0)
3053 break;
3054 sumcount -= mcv_counts[num_mcv - 1];
3055 }
3056 }
3057 return num_mcv;
3058 }
3059