1 /*-------------------------------------------------------------------------
2 *
3 * costsize.c
4 * Routines to compute (and set) relation sizes and path costs
5 *
6 * Path costs are measured in arbitrary units established by these basic
7 * parameters:
8 *
9 * seq_page_cost Cost of a sequential page fetch
10 * random_page_cost Cost of a non-sequential page fetch
11 * cpu_tuple_cost Cost of typical CPU time to process a tuple
12 * cpu_index_tuple_cost Cost of typical CPU time to process an index tuple
13 * cpu_operator_cost Cost of CPU time to execute an operator or function
14 * parallel_tuple_cost Cost of CPU time to pass a tuple from worker to master backend
15 * parallel_setup_cost Cost of setting up shared memory for parallelism
16 *
17 * We expect that the kernel will typically do some amount of read-ahead
18 * optimization; this in conjunction with seek costs means that seq_page_cost
19 * is normally considerably less than random_page_cost. (However, if the
20 * database is fully cached in RAM, it is reasonable to set them equal.)
21 *
22 * We also use a rough estimate "effective_cache_size" of the number of
23 * disk pages in Postgres + OS-level disk cache. (We can't simply use
24 * NBuffers for this purpose because that would ignore the effects of
25 * the kernel's disk cache.)
26 *
27 * Obviously, taking constants for these values is an oversimplification,
28 * but it's tough enough to get any useful estimates even at this level of
29 * detail. Note that all of these parameters are user-settable, in case
30 * the default values are drastically off for a particular platform.
31 *
32 * seq_page_cost and random_page_cost can also be overridden for an individual
33 * tablespace, in case some data is on a fast disk and other data is on a slow
34 * disk. Per-tablespace overrides never apply to temporary work files such as
35 * an external sort or a materialize node that overflows work_mem.
36 *
37 * We compute two separate costs for each path:
38 * total_cost: total estimated cost to fetch all tuples
39 * startup_cost: cost that is expended before first tuple is fetched
40 * In some scenarios, such as when there is a LIMIT or we are implementing
41 * an EXISTS(...) sub-select, it is not necessary to fetch all tuples of the
42 * path's result. A caller can estimate the cost of fetching a partial
43 * result by interpolating between startup_cost and total_cost. In detail:
44 * actual_cost = startup_cost +
45 * (total_cost - startup_cost) * tuples_to_fetch / path->rows;
46 * Note that a base relation's rows count (and, by extension, plan_rows for
47 * plan nodes below the LIMIT node) is set without regard to any LIMIT, so
48 * that this equation works properly. (Note: while path->rows is never zero
49 * for ordinary relations, it is zero for paths for provably-empty relations,
50 * so beware of division-by-zero.) The LIMIT is applied as a top-level
51 * plan node.
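 *
 * For illustration (hypothetical numbers): with startup_cost = 10,
 * total_cost = 110, path->rows = 1000, and tuples_to_fetch = 100, the
 * interpolated estimate is 10 + (110 - 10) * 100 / 1000 = 20.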
52 *
53 * For largely historical reasons, most of the routines in this module use
54 * the passed result Path only to store their results (rows, startup_cost and
55 * total_cost) into. All the input data they need is passed as separate
56 * parameters, even though much of it could be extracted from the Path.
57 * An exception is made for the cost_XXXjoin() routines, which expect all
58 * the other fields of the passed XXXPath to be filled in, and similarly
59 * cost_index() assumes the passed IndexPath is valid except for its output
60 * values.
61 *
62 *
63 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
64 * Portions Copyright (c) 1994, Regents of the University of California
65 *
66 * IDENTIFICATION
67 * src/backend/optimizer/path/costsize.c
68 *
69 *-------------------------------------------------------------------------
70 */
71
72 #include "postgres.h"
73
74 #include <math.h>
75
76 #include "access/amapi.h"
77 #include "access/htup_details.h"
78 #include "access/tsmapi.h"
79 #include "executor/executor.h"
80 #include "executor/nodeAgg.h"
81 #include "executor/nodeHash.h"
82 #include "miscadmin.h"
83 #include "nodes/makefuncs.h"
84 #include "nodes/nodeFuncs.h"
85 #include "optimizer/clauses.h"
86 #include "optimizer/cost.h"
87 #include "optimizer/optimizer.h"
88 #include "optimizer/pathnode.h"
89 #include "optimizer/paths.h"
90 #include "optimizer/placeholder.h"
91 #include "optimizer/plancat.h"
92 #include "optimizer/planmain.h"
93 #include "optimizer/restrictinfo.h"
94 #include "parser/parsetree.h"
95 #include "utils/lsyscache.h"
96 #include "utils/selfuncs.h"
97 #include "utils/spccache.h"
98 #include "utils/tuplesort.h"
99
100
101 /* source-code-compatibility hacks for pull_varnos() API change */
102 #define pull_varnos(a,b) pull_varnos_new(a,b)
103
104 #define LOG2(x) (log(x) / 0.693147180559945)
105
106 /*
107 * Append and MergeAppend nodes are less expensive than some other operations
108 * which use cpu_tuple_cost; instead of adding a separate GUC, estimate the
109 * per-tuple cost as cpu_tuple_cost multiplied by this value.
110 */
111 #define APPEND_CPU_COST_MULTIPLIER 0.5
112
113
114 double seq_page_cost = DEFAULT_SEQ_PAGE_COST;
115 double random_page_cost = DEFAULT_RANDOM_PAGE_COST;
116 double cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST;
117 double cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST;
118 double cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST;
119 double parallel_tuple_cost = DEFAULT_PARALLEL_TUPLE_COST;
120 double parallel_setup_cost = DEFAULT_PARALLEL_SETUP_COST;
121
122 int effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE;
123
124 Cost disable_cost = 1.0e10;
125
126 int max_parallel_workers_per_gather = 2;
127
128 bool enable_seqscan = true;
129 bool enable_indexscan = true;
130 bool enable_indexonlyscan = true;
131 bool enable_bitmapscan = true;
132 bool enable_tidscan = true;
133 bool enable_sort = true;
134 bool enable_incremental_sort = true;
135 bool enable_hashagg = true;
136 bool enable_nestloop = true;
137 bool enable_material = true;
138 bool enable_mergejoin = true;
139 bool enable_hashjoin = true;
140 bool enable_gathermerge = true;
141 bool enable_partitionwise_join = false;
142 bool enable_partitionwise_aggregate = false;
143 bool enable_parallel_append = true;
144 bool enable_parallel_hash = true;
145 bool enable_partition_pruning = true;
146
147 typedef struct
148 {
149 PlannerInfo *root;
150 QualCost total;
151 } cost_qual_eval_context;
152
153 static List *extract_nonindex_conditions(List *qual_clauses, List *indexclauses);
154 static MergeScanSelCache *cached_scansel(PlannerInfo *root,
155 RestrictInfo *rinfo,
156 PathKey *pathkey);
157 static void cost_rescan(PlannerInfo *root, Path *path,
158 Cost *rescan_startup_cost, Cost *rescan_total_cost);
159 static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context);
160 static void get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
161 ParamPathInfo *param_info,
162 QualCost *qpqual_cost);
163 static bool has_indexed_join_quals(NestPath *joinpath);
164 static double approx_tuple_count(PlannerInfo *root, JoinPath *path,
165 List *quals);
166 static double calc_joinrel_size_estimate(PlannerInfo *root,
167 RelOptInfo *joinrel,
168 RelOptInfo *outer_rel,
169 RelOptInfo *inner_rel,
170 double outer_rows,
171 double inner_rows,
172 SpecialJoinInfo *sjinfo,
173 List *restrictlist);
174 static Selectivity get_foreign_key_join_selectivity(PlannerInfo *root,
175 Relids outer_relids,
176 Relids inner_relids,
177 SpecialJoinInfo *sjinfo,
178 List **restrictlist);
179 static Cost append_nonpartial_cost(List *subpaths, int numpaths,
180 int parallel_workers);
181 static void set_rel_width(PlannerInfo *root, RelOptInfo *rel);
182 static double relation_byte_size(double tuples, int width);
183 static double page_size(double tuples, int width);
184 static double get_parallel_divisor(Path *path);
185
186
187 /*
188 * clamp_row_est
189 * Force a row-count estimate to a sane value.
190 */
191 double
192 clamp_row_est(double nrows)
193 {
194 /*
195 * Force estimate to be at least one row, to make explain output look
196 * better and to avoid possible divide-by-zero when interpolating costs.
197 * Make it an integer, too.
198 */
199 if (nrows <= 1.0)
200 nrows = 1.0;
201 else
202 nrows = rint(nrows);
203
204 return nrows;
205 }
206
207
208 /*
209 * cost_seqscan
210 * Determines and returns the cost of scanning a relation sequentially.
211 *
212 * 'baserel' is the relation to be scanned
213 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
214 */
215 void
216 cost_seqscan(Path *path, PlannerInfo *root,
217 RelOptInfo *baserel, ParamPathInfo *param_info)
218 {
219 Cost startup_cost = 0;
220 Cost cpu_run_cost;
221 Cost disk_run_cost;
222 double spc_seq_page_cost;
223 QualCost qpqual_cost;
224 Cost cpu_per_tuple;
225
226 /* Should only be applied to base relations */
227 Assert(baserel->relid > 0);
228 Assert(baserel->rtekind == RTE_RELATION);
229
230 /* Mark the path with the correct row estimate */
231 if (param_info)
232 path->rows = param_info->ppi_rows;
233 else
234 path->rows = baserel->rows;
235
236 if (!enable_seqscan)
237 startup_cost += disable_cost;
238
239 /* fetch estimated page cost for tablespace containing table */
240 get_tablespace_page_costs(baserel->reltablespace,
241 NULL,
242 &spc_seq_page_cost);
243
244 /*
245 * disk costs
246 */
247 disk_run_cost = spc_seq_page_cost * baserel->pages;
248
249 /* CPU costs */
250 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
251
252 startup_cost += qpqual_cost.startup;
253 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
254 cpu_run_cost = cpu_per_tuple * baserel->tuples;
255 /* tlist eval costs are paid per output row, not per tuple scanned */
256 startup_cost += path->pathtarget->cost.startup;
257 cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows;
258
259 /* Adjust costing for parallelism, if used. */
260 if (path->parallel_workers > 0)
261 {
262 double parallel_divisor = get_parallel_divisor(path);
263
264 /* The CPU cost is divided among all the workers. */
265 cpu_run_cost /= parallel_divisor;
266
267 /*
268 * It may be possible to amortize some of the I/O cost, but probably
269 * not very much, because most operating systems already do aggressive
270 * prefetching. For now, we assume that the disk run cost can't be
271 * amortized at all.
272 */
273
274 /*
275 * In the case of a parallel plan, the row count needs to represent
276 * the number of tuples processed per worker.
277 */
278 path->rows = clamp_row_est(path->rows / parallel_divisor);
279 }
280
281 path->startup_cost = startup_cost;
282 path->total_cost = startup_cost + cpu_run_cost + disk_run_cost;
283 }
284
285 /*
286 * cost_samplescan
287 * Determines and returns the cost of scanning a relation using sampling.
288 *
289 * 'baserel' is the relation to be scanned
290 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
291 */
292 void
293 cost_samplescan(Path *path, PlannerInfo *root,
294 RelOptInfo *baserel, ParamPathInfo *param_info)
295 {
296 Cost startup_cost = 0;
297 Cost run_cost = 0;
298 RangeTblEntry *rte;
299 TableSampleClause *tsc;
300 TsmRoutine *tsm;
301 double spc_seq_page_cost,
302 spc_random_page_cost,
303 spc_page_cost;
304 QualCost qpqual_cost;
305 Cost cpu_per_tuple;
306
307 /* Should only be applied to base relations with tablesample clauses */
308 Assert(baserel->relid > 0);
309 rte = planner_rt_fetch(baserel->relid, root);
310 Assert(rte->rtekind == RTE_RELATION);
311 tsc = rte->tablesample;
312 Assert(tsc != NULL);
313 tsm = GetTsmRoutine(tsc->tsmhandler);
314
315 /* Mark the path with the correct row estimate */
316 if (param_info)
317 path->rows = param_info->ppi_rows;
318 else
319 path->rows = baserel->rows;
320
321 /* fetch estimated page cost for tablespace containing table */
322 get_tablespace_page_costs(baserel->reltablespace,
323 &spc_random_page_cost,
324 &spc_seq_page_cost);
325
326 /* if NextSampleBlock is used, assume random access, else sequential */
327 spc_page_cost = (tsm->NextSampleBlock != NULL) ?
328 spc_random_page_cost : spc_seq_page_cost;
329
330 /*
331 * disk costs (recall that baserel->pages has already been set to the
332 * number of pages the sampling method will visit)
333 */
334 run_cost += spc_page_cost * baserel->pages;
335
336 /*
337 * CPU costs (recall that baserel->tuples has already been set to the
338 * number of tuples the sampling method will select). Note that we ignore
339 * execution cost of the TABLESAMPLE parameter expressions; they will be
340 * evaluated only once per scan, and in most usages they'll likely be
341 * simple constants anyway. We also don't charge anything for the
342 * calculations the sampling method might do internally.
343 */
344 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
345
346 startup_cost += qpqual_cost.startup;
347 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
348 run_cost += cpu_per_tuple * baserel->tuples;
349 /* tlist eval costs are paid per output row, not per tuple scanned */
350 startup_cost += path->pathtarget->cost.startup;
351 run_cost += path->pathtarget->cost.per_tuple * path->rows;
352
353 path->startup_cost = startup_cost;
354 path->total_cost = startup_cost + run_cost;
355 }
356
357 /*
358 * cost_gather
359 * Determines and returns the cost of gather path.
360 *
361 * 'rel' is the relation to be operated upon
362 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
363 * 'rows' may be used to point to a row estimate; if non-NULL, it overrides
364 * both 'rel' and 'param_info'. This is useful when the path doesn't exactly
365 * correspond to any particular RelOptInfo.
366 */
367 void
368 cost_gather(GatherPath *path, PlannerInfo *root,
369 RelOptInfo *rel, ParamPathInfo *param_info,
370 double *rows)
371 {
372 Cost startup_cost = 0;
373 Cost run_cost = 0;
374
375 /* Mark the path with the correct row estimate */
376 if (rows)
377 path->path.rows = *rows;
378 else if (param_info)
379 path->path.rows = param_info->ppi_rows;
380 else
381 path->path.rows = rel->rows;
382
383 startup_cost = path->subpath->startup_cost;
384
385 run_cost = path->subpath->total_cost - path->subpath->startup_cost;
386
387 /* Parallel setup and communication cost. */
388 startup_cost += parallel_setup_cost;
389 run_cost += parallel_tuple_cost * path->path.rows;
390
391 path->path.startup_cost = startup_cost;
392 path->path.total_cost = (startup_cost + run_cost);
393 }
394
395 /*
396 * cost_gather_merge
397 * Determines and returns the cost of gather merge path.
398 *
399 * GatherMerge merges several pre-sorted input streams, using a heap that at
400 * any given instant holds the next tuple from each stream. If there are N
401 * streams, we need about N*log2(N) tuple comparisons to construct the heap at
402 * startup, and then for each output tuple, about log2(N) comparisons to
403 * replace the top heap entry with the next tuple from the same stream.
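 *
 * For illustration (hypothetical numbers): with 3 workers plus the leader,
 * N = 4 and log2(N) = 2, so heap construction costs about 4 * 2 = 8
 * comparisons and each output tuple costs about 2 more.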
404 */
405 void
406 cost_gather_merge(GatherMergePath *path, PlannerInfo *root,
407 RelOptInfo *rel, ParamPathInfo *param_info,
408 Cost input_startup_cost, Cost input_total_cost,
409 double *rows)
410 {
411 Cost startup_cost = 0;
412 Cost run_cost = 0;
413 Cost comparison_cost;
414 double N;
415 double logN;
416
417 /* Mark the path with the correct row estimate */
418 if (rows)
419 path->path.rows = *rows;
420 else if (param_info)
421 path->path.rows = param_info->ppi_rows;
422 else
423 path->path.rows = rel->rows;
424
425 if (!enable_gathermerge)
426 startup_cost += disable_cost;
427
428 /*
429 * Add one to the number of workers to account for the leader. This might
430 * be overgenerous since the leader will do less work than other workers
431 * in typical cases, but we'll go with it for now.
432 */
433 Assert(path->num_workers > 0);
434 N = (double) path->num_workers + 1;
435 logN = LOG2(N);
436
437 /* Assumed cost per tuple comparison */
438 comparison_cost = 2.0 * cpu_operator_cost;
439
440 /* Heap creation cost */
441 startup_cost += comparison_cost * N * logN;
442
443 /* Per-tuple heap maintenance cost */
444 run_cost += path->path.rows * comparison_cost * logN;
445
446 /* small cost for heap management, like cost_merge_append */
447 run_cost += cpu_operator_cost * path->path.rows;
448
449 /*
450 * Parallel setup and communication cost. Since Gather Merge, unlike
451 * Gather, requires us to block until a tuple is available from every
452 * worker, we bump the IPC cost up a little bit as compared with Gather.
453 * For lack of a better idea, charge an extra 5%.
454 */
455 startup_cost += parallel_setup_cost;
456 run_cost += parallel_tuple_cost * path->path.rows * 1.05;
457
458 path->path.startup_cost = startup_cost + input_startup_cost;
459 path->path.total_cost = (startup_cost + run_cost + input_total_cost);
460 }
461
462 /*
463 * cost_index
464 * Determines and returns the cost of scanning a relation using an index.
465 *
466 * 'path' describes the indexscan under consideration, and is complete
467 * except for the fields to be set by this routine
468 * 'loop_count' is the number of repetitions of the indexscan to factor into
469 * estimates of caching behavior
470 *
471 * In addition to rows, startup_cost and total_cost, cost_index() sets the
472 * path's indextotalcost and indexselectivity fields. These values will be
473 * needed if the IndexPath is used in a BitmapIndexScan.
474 *
475 * NOTE: path->indexquals must contain only clauses usable as index
476 * restrictions. Any additional quals evaluated as qpquals may reduce the
477 * number of returned tuples, but they won't reduce the number of tuples
478 * we have to fetch from the table, so they don't reduce the scan cost.
479 */
480 void
481 cost_index(IndexPath *path, PlannerInfo *root, double loop_count,
482 bool partial_path)
483 {
484 IndexOptInfo *index = path->indexinfo;
485 RelOptInfo *baserel = index->rel;
486 bool indexonly = (path->path.pathtype == T_IndexOnlyScan);
487 amcostestimate_function amcostestimate;
488 List *qpquals;
489 Cost startup_cost = 0;
490 Cost run_cost = 0;
491 Cost cpu_run_cost = 0;
492 Cost indexStartupCost;
493 Cost indexTotalCost;
494 Selectivity indexSelectivity;
495 double indexCorrelation,
496 csquared;
497 double spc_seq_page_cost,
498 spc_random_page_cost;
499 Cost min_IO_cost,
500 max_IO_cost;
501 QualCost qpqual_cost;
502 Cost cpu_per_tuple;
503 double tuples_fetched;
504 double pages_fetched;
505 double rand_heap_pages;
506 double index_pages;
507
508 /* Should only be applied to base relations */
509 Assert(IsA(baserel, RelOptInfo) &&
510 IsA(index, IndexOptInfo));
511 Assert(baserel->relid > 0);
512 Assert(baserel->rtekind == RTE_RELATION);
513
514 /*
515 * Mark the path with the correct row estimate, and identify which quals
516 * will need to be enforced as qpquals. We need not check any quals that
517 * are implied by the index's predicate, so we can use indrestrictinfo not
518 * baserestrictinfo as the list of relevant restriction clauses for the
519 * rel.
520 */
521 if (path->path.param_info)
522 {
523 path->path.rows = path->path.param_info->ppi_rows;
524 /* qpquals come from the rel's restriction clauses and ppi_clauses */
525 qpquals = list_concat(extract_nonindex_conditions(path->indexinfo->indrestrictinfo,
526 path->indexclauses),
527 extract_nonindex_conditions(path->path.param_info->ppi_clauses,
528 path->indexclauses));
529 }
530 else
531 {
532 path->path.rows = baserel->rows;
533 /* qpquals come from just the rel's restriction clauses */
534 qpquals = extract_nonindex_conditions(path->indexinfo->indrestrictinfo,
535 path->indexclauses);
536 }
537
538 if (!enable_indexscan)
539 startup_cost += disable_cost;
540 /* we don't need to check enable_indexonlyscan; indxpath.c does that */
541
542 /*
543 * Call index-access-method-specific code to estimate the processing cost
544 * for scanning the index, as well as the selectivity of the index (ie,
545 * the fraction of main-table tuples we will have to retrieve) and its
546 * correlation to the main-table tuple order. We need a cast here because
547 * pathnodes.h uses a weak function type to avoid including amapi.h.
548 */
549 amcostestimate = (amcostestimate_function) index->amcostestimate;
550 amcostestimate(root, path, loop_count,
551 &indexStartupCost, &indexTotalCost,
552 &indexSelectivity, &indexCorrelation,
553 &index_pages);
554
555 /*
556 * Save amcostestimate's results for possible use in bitmap scan planning.
557 * We don't bother to save indexStartupCost or indexCorrelation, because a
558 * bitmap scan doesn't care about either.
559 */
560 path->indextotalcost = indexTotalCost;
561 path->indexselectivity = indexSelectivity;
562
563 /* all costs for touching index itself included here */
564 startup_cost += indexStartupCost;
565 run_cost += indexTotalCost - indexStartupCost;
566
567 /* estimate number of main-table tuples fetched */
568 tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
569
570 /* fetch estimated page costs for tablespace containing table */
571 get_tablespace_page_costs(baserel->reltablespace,
572 &spc_random_page_cost,
573 &spc_seq_page_cost);
574
575 /*----------
576 * Estimate number of main-table pages fetched, and compute I/O cost.
577 *
578 * When the index ordering is uncorrelated with the table ordering,
579 * we use an approximation proposed by Mackert and Lohman (see
580 * index_pages_fetched() for details) to compute the number of pages
581 * fetched, and then charge spc_random_page_cost per page fetched.
582 *
583 * When the index ordering is exactly correlated with the table ordering
584 * (just after a CLUSTER, for example), the number of pages fetched should
585 * be exactly selectivity * table_size. What's more, all but the first
586 * will be sequential fetches, not the random fetches that occur in the
587 * uncorrelated case. So if the number of pages is more than 1, we
588 * ought to charge
589 * spc_random_page_cost + (pages_fetched - 1) * spc_seq_page_cost
590 * For partially-correlated indexes, we ought to charge somewhere between
591 * these two estimates. We currently interpolate linearly between the
592 * estimates based on the correlation squared (XXX is that appropriate?).
593 *
594 * If it's an index-only scan, then we will not need to fetch any heap
595 * pages for which the visibility map shows all tuples are visible.
596 * Hence, reduce the estimated number of heap fetches accordingly.
597 * We use the measured fraction of the entire heap that is all-visible,
598 * which might not be particularly relevant to the subset of the heap
599 * that this query will fetch; but it's not clear how to do better.
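 *
 * For illustration (hypothetical numbers): with max_IO_cost = 100,
 * min_IO_cost = 20, and indexCorrelation = 0.5 (csquared = 0.25), the
 * interpolated I/O charge computed below is 100 + 0.25 * (20 - 100) = 80.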
600 *----------
601 */
602 if (loop_count > 1)
603 {
604 /*
605 * For repeated indexscans, the appropriate estimate for the
606 * uncorrelated case is to scale up the number of tuples fetched in
607 * the Mackert and Lohman formula by the number of scans, so that we
608 * estimate the number of pages fetched by all the scans; then
609 * pro-rate the costs for one scan. In this case we assume all the
610 * fetches are random accesses.
611 */
612 pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
613 baserel->pages,
614 (double) index->pages,
615 root);
616
617 if (indexonly)
618 pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
619
620 rand_heap_pages = pages_fetched;
621
622 max_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;
623
624 /*
625 * In the perfectly correlated case, the number of pages touched by
626 * each scan is selectivity * table_size, and we can use the Mackert
627 * and Lohman formula at the page level to estimate how much work is
628 * saved by caching across scans. We still assume all the fetches are
629 * random, though, which is an overestimate that's hard to correct for
630 * without double-counting the cache effects. (But in most cases
631 * where such a plan is actually interesting, only one page would get
632 * fetched per scan anyway, so it shouldn't matter much.)
633 */
634 pages_fetched = ceil(indexSelectivity * (double) baserel->pages);
635
636 pages_fetched = index_pages_fetched(pages_fetched * loop_count,
637 baserel->pages,
638 (double) index->pages,
639 root);
640
641 if (indexonly)
642 pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
643
644 min_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;
645 }
646 else
647 {
648 /*
649 * Normal case: apply the Mackert and Lohman formula, and then
650 * interpolate between that and the correlation-derived result.
651 */
652 pages_fetched = index_pages_fetched(tuples_fetched,
653 baserel->pages,
654 (double) index->pages,
655 root);
656
657 if (indexonly)
658 pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
659
660 rand_heap_pages = pages_fetched;
661
662 /* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */
663 max_IO_cost = pages_fetched * spc_random_page_cost;
664
665 /* min_IO_cost is for the perfectly correlated case (csquared=1) */
666 pages_fetched = ceil(indexSelectivity * (double) baserel->pages);
667
668 if (indexonly)
669 pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
670
671 if (pages_fetched > 0)
672 {
673 min_IO_cost = spc_random_page_cost;
674 if (pages_fetched > 1)
675 min_IO_cost += (pages_fetched - 1) * spc_seq_page_cost;
676 }
677 else
678 min_IO_cost = 0;
679 }
680
681 if (partial_path)
682 {
683 /*
684 * For index only scans compute workers based on number of index pages
685 * fetched; the number of heap pages we fetch might be so small as to
686 * effectively rule out parallelism, which we don't want to do.
687 */
688 if (indexonly)
689 rand_heap_pages = -1;
690
691 /*
692 * Estimate the number of parallel workers required to scan the index.
693 * Use the number of heap pages computed above, bearing in mind that heap
694 * fetches won't be sequential: for parallel scans the pages are accessed
695 * in random order.
696 */
697 path->path.parallel_workers = compute_parallel_worker(baserel,
698 rand_heap_pages,
699 index_pages,
700 max_parallel_workers_per_gather);
701
702 /*
703 * Fall out if workers can't be assigned for parallel scan, because in
704 * such a case this path will be rejected. So there is no benefit in
705 * doing extra computation.
706 */
707 if (path->path.parallel_workers <= 0)
708 return;
709
710 path->path.parallel_aware = true;
711 }
712
713 /*
714 * Now interpolate based on estimated index order correlation to get total
715 * disk I/O cost for main table accesses.
716 */
717 csquared = indexCorrelation * indexCorrelation;
718
719 run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost);
720
721 /*
722 * Estimate CPU costs per tuple.
723 *
724 * What we want here is cpu_tuple_cost plus the evaluation costs of any
725 * qual clauses that we have to evaluate as qpquals.
726 */
727 cost_qual_eval(&qpqual_cost, qpquals, root);
728
729 startup_cost += qpqual_cost.startup;
730 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
731
732 cpu_run_cost += cpu_per_tuple * tuples_fetched;
733
734 /* tlist eval costs are paid per output row, not per tuple scanned */
735 startup_cost += path->path.pathtarget->cost.startup;
736 cpu_run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
737
738 /* Adjust costing for parallelism, if used. */
739 if (path->path.parallel_workers > 0)
740 {
741 double parallel_divisor = get_parallel_divisor(&path->path);
742
743 path->path.rows = clamp_row_est(path->path.rows / parallel_divisor);
744
745 /* The CPU cost is divided among all the workers. */
746 cpu_run_cost /= parallel_divisor;
747 }
748
749 run_cost += cpu_run_cost;
750
751 path->path.startup_cost = startup_cost;
752 path->path.total_cost = startup_cost + run_cost;
753 }
754
755 /*
756 * extract_nonindex_conditions
757 *
758 * Given a list of quals to be enforced in an indexscan, extract the ones that
759 * will have to be applied as qpquals (ie, the index machinery won't handle
760 * them). Here we detect only whether a qual clause is directly redundant
761 * with some indexclause. If the index path is chosen for use, createplan.c
762 * will try a bit harder to get rid of redundant qual conditions; specifically
763 * it will see if quals can be proven to be implied by the indexquals. But
764 * it does not seem worth the cycles to try to factor that in at this stage,
765 * since we're only trying to estimate qual eval costs. Otherwise this must
766 * match the logic in create_indexscan_plan().
767 *
768 * qual_clauses, and the result, are lists of RestrictInfos.
769 * indexclauses is a list of IndexClauses.
770 */
771 static List *
772 extract_nonindex_conditions(List *qual_clauses, List *indexclauses)
773 {
774 List *result = NIL;
775 ListCell *lc;
776
777 foreach(lc, qual_clauses)
778 {
779 RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc);
780
781 if (rinfo->pseudoconstant)
782 continue; /* we may drop pseudoconstants here */
783 if (is_redundant_with_indexclauses(rinfo, indexclauses))
784 continue; /* dup or derived from same EquivalenceClass */
785 /* ... skip the predicate proof attempt createplan.c will try ... */
786 result = lappend(result, rinfo);
787 }
788 return result;
789 }
790
791 /*
792 * index_pages_fetched
793 * Estimate the number of pages actually fetched after accounting for
794 * cache effects.
795 *
796 * We use an approximation proposed by Mackert and Lohman, "Index Scans
797 * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions
798 * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424.
799 * The Mackert and Lohman approximation is that the number of pages
800 * fetched is
801 * PF =
802 * min(2TNs/(2T+Ns), T) when T <= b
803 * 2TNs/(2T+Ns) when T > b and Ns <= 2Tb/(2T-b)
804 * b + (Ns - 2Tb/(2T-b))*(T-b)/T when T > b and Ns > 2Tb/(2T-b)
805 * where
806 * T = # pages in table
807 * N = # tuples in table
808 * s = selectivity = fraction of table to be scanned
809 * b = # buffer pages available (we include kernel space here)
810 *
811 * We assume that effective_cache_size is the total number of buffer pages
812 * available for the whole query, and pro-rate that space across all the
813 * tables in the query and the index currently under consideration. (This
814 * ignores space needed for other indexes used by the query, but since we
815 * don't know which indexes will get used, we can't estimate that very well;
816 * and in any case counting all the tables may well be an overestimate, since
817 * depending on the join plan not all the tables may be scanned concurrently.)
818 *
819 * The product Ns is the number of tuples fetched; we pass in that
820 * product rather than calculating it here. "pages" is the number of pages
821 * in the object under consideration (either an index or a table).
822 * "index_pages" is the amount to add to the total table space, which was
823 * computed for us by make_one_rel.
824 *
825 * Caller is expected to have ensured that tuples_fetched is greater than zero
826 * and rounded to integer (see clamp_row_est). The result will likewise be
827 * greater than zero and integral.
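 *
 * Worked example (hypothetical numbers): with T = 1000 pages, b = 2000
 * cached pages (so T <= b), and Ns = tuples_fetched = 500, the formula
 * gives PF = 2*1000*500 / (2*1000 + 500) = 400 pages fetched, rather than
 * the 500 that would be charged if every tuple were assumed to hit a
 * distinct, uncached page.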
828 */
829 double
830 index_pages_fetched(double tuples_fetched, BlockNumber pages,
831 double index_pages, PlannerInfo *root)
832 {
833 double pages_fetched;
834 double total_pages;
835 double T,
836 b;
837
838 /* T is # pages in table, but don't allow it to be zero */
839 T = (pages > 1) ? (double) pages : 1.0;
840
841 /* Compute number of pages assumed to be competing for cache space */
842 total_pages = root->total_table_pages + index_pages;
843 total_pages = Max(total_pages, 1.0);
844 Assert(T <= total_pages);
845
846 /* b is pro-rated share of effective_cache_size */
847 b = (double) effective_cache_size * T / total_pages;
848
849 /* force it positive and integral */
850 if (b <= 1.0)
851 b = 1.0;
852 else
853 b = ceil(b);
854
855 /* This part is the Mackert and Lohman formula */
856 if (T <= b)
857 {
858 pages_fetched =
859 (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
860 if (pages_fetched >= T)
861 pages_fetched = T;
862 else
863 pages_fetched = ceil(pages_fetched);
864 }
865 else
866 {
867 double lim;
868
869 lim = (2.0 * T * b) / (2.0 * T - b);
870 if (tuples_fetched <= lim)
871 {
872 pages_fetched =
873 (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
874 }
875 else
876 {
877 pages_fetched =
878 b + (tuples_fetched - lim) * (T - b) / T;
879 }
880 pages_fetched = ceil(pages_fetched);
881 }
882 return pages_fetched;
883 }
884
885 /*
886 * get_indexpath_pages
887 * Determine the total size of the indexes used in a bitmap index path.
888 *
889 * Note: if the same index is used more than once in a bitmap tree, we will
890 * count it multiple times, which perhaps is the wrong thing ... but it's
891 * not completely clear, and detecting duplicates is difficult, so ignore it
892 * for now.
893 */
894 static double
895 get_indexpath_pages(Path *bitmapqual)
896 {
897 double result = 0;
898 ListCell *l;
899
900 if (IsA(bitmapqual, BitmapAndPath))
901 {
902 BitmapAndPath *apath = (BitmapAndPath *) bitmapqual;
903
904 foreach(l, apath->bitmapquals)
905 {
906 result += get_indexpath_pages((Path *) lfirst(l));
907 }
908 }
909 else if (IsA(bitmapqual, BitmapOrPath))
910 {
911 BitmapOrPath *opath = (BitmapOrPath *) bitmapqual;
912
913 foreach(l, opath->bitmapquals)
914 {
915 result += get_indexpath_pages((Path *) lfirst(l));
916 }
917 }
918 else if (IsA(bitmapqual, IndexPath))
919 {
920 IndexPath *ipath = (IndexPath *) bitmapqual;
921
922 result = (double) ipath->indexinfo->pages;
923 }
924 else
925 elog(ERROR, "unrecognized node type: %d", nodeTag(bitmapqual));
926
927 return result;
928 }
929
930 /*
931 * cost_bitmap_heap_scan
932 * Determines and returns the cost of scanning a relation using a bitmap
933 * index-then-heap plan.
934 *
935 * 'baserel' is the relation to be scanned
936 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
937 * 'bitmapqual' is a tree of IndexPaths, BitmapAndPaths, and BitmapOrPaths
938 * 'loop_count' is the number of repetitions of the indexscan to factor into
939 * estimates of caching behavior
940 *
941 * Note: the component IndexPaths in bitmapqual should have been costed
942 * using the same loop_count.
943 */
944 void
945 cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
946 ParamPathInfo *param_info,
947 Path *bitmapqual, double loop_count)
948 {
949 Cost startup_cost = 0;
950 Cost run_cost = 0;
951 Cost indexTotalCost;
952 QualCost qpqual_cost;
953 Cost cpu_per_tuple;
954 Cost cost_per_page;
955 Cost cpu_run_cost;
956 double tuples_fetched;
957 double pages_fetched;
958 double spc_seq_page_cost,
959 spc_random_page_cost;
960 double T;
961
962 /* Should only be applied to base relations */
963 Assert(IsA(baserel, RelOptInfo));
964 Assert(baserel->relid > 0);
965 Assert(baserel->rtekind == RTE_RELATION);
966
967 /* Mark the path with the correct row estimate */
968 if (param_info)
969 path->rows = param_info->ppi_rows;
970 else
971 path->rows = baserel->rows;
972
973 if (!enable_bitmapscan)
974 startup_cost += disable_cost;
975
976 pages_fetched = compute_bitmap_pages(root, baserel, bitmapqual,
977 loop_count, &indexTotalCost,
978 &tuples_fetched);
979
980 startup_cost += indexTotalCost;
981 T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
982
983 /* Fetch estimated page costs for tablespace containing table. */
984 get_tablespace_page_costs(baserel->reltablespace,
985 &spc_random_page_cost,
986 &spc_seq_page_cost);
987
988 /*
989 * For small numbers of pages we should charge spc_random_page_cost
990 * apiece, while if nearly all the table's pages are being read, it's more
991 * appropriate to charge spc_seq_page_cost apiece. The effect is
992 * nonlinear, too. For lack of a better idea, interpolate like this to
993 * determine the cost per page.
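 *
 * For illustration (hypothetical numbers): with spc_random_page_cost = 4.0,
 * spc_seq_page_cost = 1.0, and pages_fetched / T = 0.25, the charge is
 * 4.0 - (4.0 - 1.0) * sqrt(0.25) = 2.5 per page.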
994 */
995 if (pages_fetched >= 2.0)
996 cost_per_page = spc_random_page_cost -
997 (spc_random_page_cost - spc_seq_page_cost)
998 * sqrt(pages_fetched / T);
999 else
1000 cost_per_page = spc_random_page_cost;
1001
1002 run_cost += pages_fetched * cost_per_page;
1003
1004 /*
1005 * Estimate CPU costs per tuple.
1006 *
1007 * Often the indexquals don't need to be rechecked at each tuple ... but
1008 * not always, especially not if there are enough tuples involved that the
1009 * bitmaps become lossy. For the moment, just assume they will be
1010 * rechecked always. This means we charge the full freight for all the
1011 * scan clauses.
1012 */
1013 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1014
1015 startup_cost += qpqual_cost.startup;
1016 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1017 cpu_run_cost = cpu_per_tuple * tuples_fetched;
1018
1019 /* Adjust costing for parallelism, if used. */
1020 if (path->parallel_workers > 0)
1021 {
1022 double parallel_divisor = get_parallel_divisor(path);
1023
1024 /* The CPU cost is divided among all the workers. */
1025 cpu_run_cost /= parallel_divisor;
1026
1027 path->rows = clamp_row_est(path->rows / parallel_divisor);
1028 }
1029
1030
1031 run_cost += cpu_run_cost;
1032
1033 /* tlist eval costs are paid per output row, not per tuple scanned */
1034 startup_cost += path->pathtarget->cost.startup;
1035 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1036
1037 path->startup_cost = startup_cost;
1038 path->total_cost = startup_cost + run_cost;
1039 }
1040
1041 /*
1042 * cost_bitmap_tree_node
1043 * Extract cost and selectivity from a bitmap tree node (index/and/or)
1044 */
1045 void
1046 cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec)
1047 {
1048 if (IsA(path, IndexPath))
1049 {
1050 *cost = ((IndexPath *) path)->indextotalcost;
1051 *selec = ((IndexPath *) path)->indexselectivity;
1052
1053 /*
1054 * Charge a small amount per retrieved tuple to reflect the costs of
1055 * manipulating the bitmap. This is mostly to make sure that a bitmap
1056 * scan doesn't look to be the same cost as an indexscan to retrieve a
1057 * single tuple.
1058 */
1059 *cost += 0.1 * cpu_operator_cost * path->rows;
1060 }
1061 else if (IsA(path, BitmapAndPath))
1062 {
1063 *cost = path->total_cost;
1064 *selec = ((BitmapAndPath *) path)->bitmapselectivity;
1065 }
1066 else if (IsA(path, BitmapOrPath))
1067 {
1068 *cost = path->total_cost;
1069 *selec = ((BitmapOrPath *) path)->bitmapselectivity;
1070 }
1071 else
1072 {
1073 elog(ERROR, "unrecognized node type: %d", nodeTag(path));
1074 *cost = *selec = 0; /* keep compiler quiet */
1075 }
1076 }
1077
1078 /*
1079 * cost_bitmap_and_node
1080 * Estimate the cost of a BitmapAnd node
1081 *
1082 * Note that this considers only the costs of index scanning and bitmap
1083 * creation, not the eventual heap access. In that sense the object isn't
1084 * truly a Path, but it has enough path-like properties (costs in particular)
1085 * to warrant treating it as one. We don't bother to set the path rows field,
1086 * however.
1087 */
1088 void
1089 cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root)
1090 {
1091 Cost totalCost;
1092 Selectivity selec;
1093 ListCell *l;
1094
1095 /*
1096 * We estimate AND selectivity on the assumption that the inputs are
1097 * independent. This is probably often wrong, but we don't have the info
1098 * to do better.
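 * (For illustration: two inputs with selectivities 0.1 and 0.2 combine to
 * 0.1 * 0.2 = 0.02 under this independence assumption.)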
1099 *
1100 * The runtime cost of the BitmapAnd itself is estimated at 100x
1101 * cpu_operator_cost for each tbm_intersect needed. Probably too small,
1102 * definitely too simplistic?
1103 */
1104 totalCost = 0.0;
1105 selec = 1.0;
1106 foreach(l, path->bitmapquals)
1107 {
1108 Path *subpath = (Path *) lfirst(l);
1109 Cost subCost;
1110 Selectivity subselec;
1111
1112 cost_bitmap_tree_node(subpath, &subCost, &subselec);
1113
1114 selec *= subselec;
1115
1116 totalCost += subCost;
1117 if (l != list_head(path->bitmapquals))
1118 totalCost += 100.0 * cpu_operator_cost;
1119 }
1120 path->bitmapselectivity = selec;
1121 path->path.rows = 0; /* per above, not used */
1122 path->path.startup_cost = totalCost;
1123 path->path.total_cost = totalCost;
1124 }
1125
1126 /*
1127 * cost_bitmap_or_node
1128 * Estimate the cost of a BitmapOr node
1129 *
1130 * See comments for cost_bitmap_and_node.
1131 */
1132 void
1133 cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root)
1134 {
1135 Cost totalCost;
1136 Selectivity selec;
1137 ListCell *l;
1138
1139 /*
1140 * We estimate OR selectivity on the assumption that the inputs are
1141 * non-overlapping, since that's often the case in "x IN (list)" type
1142 * situations. Of course, we clamp to 1.0 at the end.
1143 *
1144 * The runtime cost of the BitmapOr itself is estimated at 100x
1145 * cpu_operator_cost for each tbm_union needed. Probably too small,
1146 * definitely too simplistic? We are aware that the tbm_unions are
1147 * optimized out when the inputs are BitmapIndexScans.
1148 */
1149 totalCost = 0.0;
1150 selec = 0.0;
1151 foreach(l, path->bitmapquals)
1152 {
1153 Path *subpath = (Path *) lfirst(l);
1154 Cost subCost;
1155 Selectivity subselec;
1156
1157 cost_bitmap_tree_node(subpath, &subCost, &subselec);
1158
1159 selec += subselec;
1160
1161 totalCost += subCost;
1162 if (l != list_head(path->bitmapquals) &&
1163 !IsA(subpath, IndexPath))
1164 totalCost += 100.0 * cpu_operator_cost;
1165 }
1166 path->bitmapselectivity = Min(selec, 1.0);
1167 path->path.rows = 0; /* per above, not used */
1168 path->path.startup_cost = totalCost;
1169 path->path.total_cost = totalCost;
1170 }
1171
1172 /*
1173 * cost_tidscan
1174 * Determines and returns the cost of scanning a relation using TIDs.
1175 *
1176 * 'baserel' is the relation to be scanned
1177 * 'tidquals' is the list of TID-checkable quals
1178 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1179 */
1180 void
1181 cost_tidscan(Path *path, PlannerInfo *root,
1182 RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info)
1183 {
1184 Cost startup_cost = 0;
1185 Cost run_cost = 0;
1186 bool isCurrentOf = false;
1187 QualCost qpqual_cost;
1188 Cost cpu_per_tuple;
1189 QualCost tid_qual_cost;
1190 int ntuples;
1191 ListCell *l;
1192 double spc_random_page_cost;
1193
1194 /* Should only be applied to base relations */
1195 Assert(baserel->relid > 0);
1196 Assert(baserel->rtekind == RTE_RELATION);
1197
1198 /* Mark the path with the correct row estimate */
1199 if (param_info)
1200 path->rows = param_info->ppi_rows;
1201 else
1202 path->rows = baserel->rows;
1203
1204 /* Count how many tuples we expect to retrieve */
1205 ntuples = 0;
1206 foreach(l, tidquals)
1207 {
1208 RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
1209 Expr *qual = rinfo->clause;
1210
1211 if (IsA(qual, ScalarArrayOpExpr))
1212 {
1213 /* Each element of the array yields 1 tuple */
1214 ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) qual;
1215 Node *arraynode = (Node *) lsecond(saop->args);
1216
1217 ntuples += estimate_array_length(arraynode);
1218 }
1219 else if (IsA(qual, CurrentOfExpr))
1220 {
1221 /* CURRENT OF yields 1 tuple */
1222 isCurrentOf = true;
1223 ntuples++;
1224 }
1225 else
1226 {
1227 /* It's just CTID = something, count 1 tuple */
1228 ntuples++;
1229 }
1230 }
1231
1232 /*
1233 * We must force TID scan for WHERE CURRENT OF, because only nodeTidscan.c
1234 * understands how to do it correctly. Therefore, honor enable_tidscan
1235 * only when CURRENT OF isn't present. Also note that cost_qual_eval
1236 * counts a CurrentOfExpr as having startup cost disable_cost, which we
1237 * subtract off here; that's to prevent other plan types such as seqscan
1238 * from winning.
1239 */
1240 if (isCurrentOf)
1241 {
1242 Assert(baserel->baserestrictcost.startup >= disable_cost);
1243 startup_cost -= disable_cost;
1244 }
1245 else if (!enable_tidscan)
1246 startup_cost += disable_cost;
1247
1248 /*
1249 * The TID qual expressions will be computed once, any other baserestrict
1250 * quals once per retrieved tuple.
1251 */
1252 cost_qual_eval(&tid_qual_cost, tidquals, root);
1253
1254 /* fetch estimated page cost for tablespace containing table */
1255 get_tablespace_page_costs(baserel->reltablespace,
1256 &spc_random_page_cost,
1257 NULL);
1258
1259 /* disk costs --- assume each tuple on a different page */
1260 run_cost += spc_random_page_cost * ntuples;
1261
1262 /* Add scanning CPU costs */
1263 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1264
1265 /* XXX currently we assume TID quals are a subset of qpquals */
1266 startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple;
1267 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple -
1268 tid_qual_cost.per_tuple;
1269 run_cost += cpu_per_tuple * ntuples;
1270
1271 /* tlist eval costs are paid per output row, not per tuple scanned */
1272 startup_cost += path->pathtarget->cost.startup;
1273 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1274
1275 path->startup_cost = startup_cost;
1276 path->total_cost = startup_cost + run_cost;
1277 }
1278
1279 /*
1280 * cost_subqueryscan
1281 * Determines and returns the cost of scanning a subquery RTE.
1282 *
1283 * 'baserel' is the relation to be scanned
1284 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1285 */
1286 void
1287 cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root,
1288 RelOptInfo *baserel, ParamPathInfo *param_info)
1289 {
1290 Cost startup_cost;
1291 Cost run_cost;
1292 QualCost qpqual_cost;
1293 Cost cpu_per_tuple;
1294
1295 /* Should only be applied to base relations that are subqueries */
1296 Assert(baserel->relid > 0);
1297 Assert(baserel->rtekind == RTE_SUBQUERY);
1298
1299 /* Mark the path with the correct row estimate */
1300 if (param_info)
1301 path->path.rows = param_info->ppi_rows;
1302 else
1303 path->path.rows = baserel->rows;
1304
1305 /*
1306 * Cost of path is cost of evaluating the subplan, plus cost of evaluating
1307 * any restriction clauses and tlist that will be attached to the
1308 * SubqueryScan node, plus cpu_tuple_cost to account for selection and
1309 * projection overhead.
1310 */
1311 path->path.startup_cost = path->subpath->startup_cost;
1312 path->path.total_cost = path->subpath->total_cost;
1313
1314 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1315
1316 startup_cost = qpqual_cost.startup;
1317 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1318 run_cost = cpu_per_tuple * baserel->tuples;
1319
1320 /* tlist eval costs are paid per output row, not per tuple scanned */
1321 startup_cost += path->path.pathtarget->cost.startup;
1322 run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
1323
1324 path->path.startup_cost += startup_cost;
1325 path->path.total_cost += startup_cost + run_cost;
1326 }
1327
1328 /*
1329 * cost_functionscan
1330 * Determines and returns the cost of scanning a function RTE.
1331 *
1332 * 'baserel' is the relation to be scanned
1333 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1334 */
1335 void
1336 cost_functionscan(Path *path, PlannerInfo *root,
1337 RelOptInfo *baserel, ParamPathInfo *param_info)
1338 {
1339 Cost startup_cost = 0;
1340 Cost run_cost = 0;
1341 QualCost qpqual_cost;
1342 Cost cpu_per_tuple;
1343 RangeTblEntry *rte;
1344 QualCost exprcost;
1345
1346 /* Should only be applied to base relations that are functions */
1347 Assert(baserel->relid > 0);
1348 rte = planner_rt_fetch(baserel->relid, root);
1349 Assert(rte->rtekind == RTE_FUNCTION);
1350
1351 /* Mark the path with the correct row estimate */
1352 if (param_info)
1353 path->rows = param_info->ppi_rows;
1354 else
1355 path->rows = baserel->rows;
1356
1357 /*
1358 * Estimate costs of executing the function expression(s).
1359 *
1360 * Currently, nodeFunctionscan.c always executes the functions to
1361 * completion before returning any rows, and caches the results in a
1362 * tuplestore. So the function eval cost is all startup cost, and per-row
1363 * costs are minimal.
1364 *
1365 * XXX in principle we ought to charge tuplestore spill costs if the
1366 * number of rows is large. However, given how phony our rowcount
1367 * estimates for functions tend to be, there's not a lot of point in that
1368 * refinement right now.
1369 */
1370 cost_qual_eval_node(&exprcost, (Node *) rte->functions, root);
1371
1372 startup_cost += exprcost.startup + exprcost.per_tuple;
1373
1374 /* Add scanning CPU costs */
1375 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1376
1377 startup_cost += qpqual_cost.startup;
1378 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1379 run_cost += cpu_per_tuple * baserel->tuples;
1380
1381 /* tlist eval costs are paid per output row, not per tuple scanned */
1382 startup_cost += path->pathtarget->cost.startup;
1383 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1384
1385 path->startup_cost = startup_cost;
1386 path->total_cost = startup_cost + run_cost;
1387 }
1388
1389 /*
1390 * cost_tablefuncscan
1391 * Determines and returns the cost of scanning a table function.
1392 *
1393 * 'baserel' is the relation to be scanned
1394 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1395 */
1396 void
1397 cost_tablefuncscan(Path *path, PlannerInfo *root,
1398 RelOptInfo *baserel, ParamPathInfo *param_info)
1399 {
1400 Cost startup_cost = 0;
1401 Cost run_cost = 0;
1402 QualCost qpqual_cost;
1403 Cost cpu_per_tuple;
1404 RangeTblEntry *rte;
1405 QualCost exprcost;
1406
1407 /* Should only be applied to base relations that are functions */
1408 Assert(baserel->relid > 0);
1409 rte = planner_rt_fetch(baserel->relid, root);
1410 Assert(rte->rtekind == RTE_TABLEFUNC);
1411
1412 /* Mark the path with the correct row estimate */
1413 if (param_info)
1414 path->rows = param_info->ppi_rows;
1415 else
1416 path->rows = baserel->rows;
1417
1418 /*
1419 * Estimate costs of executing the table func expression(s).
1420 *
1421 * XXX in principle we ought to charge tuplestore spill costs if the
1422 * number of rows is large. However, given how phony our rowcount
1423 * estimates for tablefuncs tend to be, there's not a lot of point in that
1424 * refinement right now.
1425 */
1426 cost_qual_eval_node(&exprcost, (Node *) rte->tablefunc, root);
1427
1428 startup_cost += exprcost.startup + exprcost.per_tuple;
1429
1430 /* Add scanning CPU costs */
1431 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1432
1433 startup_cost += qpqual_cost.startup;
1434 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1435 run_cost += cpu_per_tuple * baserel->tuples;
1436
1437 /* tlist eval costs are paid per output row, not per tuple scanned */
1438 startup_cost += path->pathtarget->cost.startup;
1439 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1440
1441 path->startup_cost = startup_cost;
1442 path->total_cost = startup_cost + run_cost;
1443 }
1444
1445 /*
1446 * cost_valuesscan
1447 * Determines and returns the cost of scanning a VALUES RTE.
1448 *
1449 * 'baserel' is the relation to be scanned
1450 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1451 */
1452 void
1453 cost_valuesscan(Path *path, PlannerInfo *root,
1454 RelOptInfo *baserel, ParamPathInfo *param_info)
1455 {
1456 Cost startup_cost = 0;
1457 Cost run_cost = 0;
1458 QualCost qpqual_cost;
1459 Cost cpu_per_tuple;
1460
1461 /* Should only be applied to base relations that are values lists */
1462 Assert(baserel->relid > 0);
1463 Assert(baserel->rtekind == RTE_VALUES);
1464
1465 /* Mark the path with the correct row estimate */
1466 if (param_info)
1467 path->rows = param_info->ppi_rows;
1468 else
1469 path->rows = baserel->rows;
1470
1471 /*
1472 * For now, estimate list evaluation cost at one operator eval per list
1473 * (probably pretty bogus, but is it worth being smarter?)
1474 */
1475 cpu_per_tuple = cpu_operator_cost;
1476
1477 /* Add scanning CPU costs */
1478 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1479
1480 startup_cost += qpqual_cost.startup;
1481 cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1482 run_cost += cpu_per_tuple * baserel->tuples;
1483
1484 /* tlist eval costs are paid per output row, not per tuple scanned */
1485 startup_cost += path->pathtarget->cost.startup;
1486 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1487
1488 path->startup_cost = startup_cost;
1489 path->total_cost = startup_cost + run_cost;
1490 }
1491
1492 /*
1493 * cost_ctescan
1494 * Determines and returns the cost of scanning a CTE RTE.
1495 *
1496 * Note: this is used for both self-reference and regular CTEs; the
1497 * possible cost differences are below the threshold of what we could
1498 * estimate accurately anyway. Note that the costs of evaluating the
1499 * referenced CTE query are added into the final plan as initplan costs,
1500 * and should NOT be counted here.
1501 */
1502 void
1503 cost_ctescan(Path *path, PlannerInfo *root,
1504 RelOptInfo *baserel, ParamPathInfo *param_info)
1505 {
1506 Cost startup_cost = 0;
1507 Cost run_cost = 0;
1508 QualCost qpqual_cost;
1509 Cost cpu_per_tuple;
1510
1511 /* Should only be applied to base relations that are CTEs */
1512 Assert(baserel->relid > 0);
1513 Assert(baserel->rtekind == RTE_CTE);
1514
1515 /* Mark the path with the correct row estimate */
1516 if (param_info)
1517 path->rows = param_info->ppi_rows;
1518 else
1519 path->rows = baserel->rows;
1520
1521 /* Charge one CPU tuple cost per row for tuplestore manipulation */
1522 cpu_per_tuple = cpu_tuple_cost;
1523
1524 /* Add scanning CPU costs */
1525 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1526
1527 startup_cost += qpqual_cost.startup;
1528 cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1529 run_cost += cpu_per_tuple * baserel->tuples;
1530
1531 /* tlist eval costs are paid per output row, not per tuple scanned */
1532 startup_cost += path->pathtarget->cost.startup;
1533 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1534
1535 path->startup_cost = startup_cost;
1536 path->total_cost = startup_cost + run_cost;
1537 }
1538
1539 /*
1540 * cost_namedtuplestorescan
1541 * Determines and returns the cost of scanning a named tuplestore.
1542 */
1543 void
1544 cost_namedtuplestorescan(Path *path, PlannerInfo *root,
1545 RelOptInfo *baserel, ParamPathInfo *param_info)
1546 {
1547 Cost startup_cost = 0;
1548 Cost run_cost = 0;
1549 QualCost qpqual_cost;
1550 Cost cpu_per_tuple;
1551
1552 /* Should only be applied to base relations that are Tuplestores */
1553 Assert(baserel->relid > 0);
1554 Assert(baserel->rtekind == RTE_NAMEDTUPLESTORE);
1555
1556 /* Mark the path with the correct row estimate */
1557 if (param_info)
1558 path->rows = param_info->ppi_rows;
1559 else
1560 path->rows = baserel->rows;
1561
1562 /* Charge one CPU tuple cost per row for tuplestore manipulation */
1563 cpu_per_tuple = cpu_tuple_cost;
1564
1565 /* Add scanning CPU costs */
1566 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1567
1568 startup_cost += qpqual_cost.startup;
1569 cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1570 run_cost += cpu_per_tuple * baserel->tuples;
1571
1572 path->startup_cost = startup_cost;
1573 path->total_cost = startup_cost + run_cost;
1574 }
1575
1576 /*
1577 * cost_resultscan
1578 * Determines and returns the cost of scanning an RTE_RESULT relation.
1579 */
1580 void
1581 cost_resultscan(Path *path, PlannerInfo *root,
1582 RelOptInfo *baserel, ParamPathInfo *param_info)
1583 {
1584 Cost startup_cost = 0;
1585 Cost run_cost = 0;
1586 QualCost qpqual_cost;
1587 Cost cpu_per_tuple;
1588
1589 /* Should only be applied to RTE_RESULT base relations */
1590 Assert(baserel->relid > 0);
1591 Assert(baserel->rtekind == RTE_RESULT);
1592
1593 /* Mark the path with the correct row estimate */
1594 if (param_info)
1595 path->rows = param_info->ppi_rows;
1596 else
1597 path->rows = baserel->rows;
1598
1599 /* We charge qual cost plus cpu_tuple_cost */
1600 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1601
1602 startup_cost += qpqual_cost.startup;
1603 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1604 run_cost += cpu_per_tuple * baserel->tuples;
1605
1606 path->startup_cost = startup_cost;
1607 path->total_cost = startup_cost + run_cost;
1608 }
1609
1610 /*
1611 * cost_recursive_union
1612 * Determines and returns the cost of performing a recursive union,
1613 * and also the estimated output size.
1614 *
1615 * We are given Paths for the nonrecursive and recursive terms.
1616 */
1617 void
1618 cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
1619 {
1620 Cost startup_cost;
1621 Cost total_cost;
1622 double total_rows;
1623
1624 /* We probably have decent estimates for the non-recursive term */
1625 startup_cost = nrterm->startup_cost;
1626 total_cost = nrterm->total_cost;
1627 total_rows = nrterm->rows;
1628
1629 /*
1630 * We arbitrarily assume that about 10 recursive iterations will be
1631 * needed, and that we've managed to get a good fix on the cost and output
1632 * size of each one of them. These are mighty shaky assumptions but it's
1633 * hard to see how to do better.
1634 */
1635 total_cost += 10 * rterm->total_cost;
1636 total_rows += 10 * rterm->rows;
1637
1638 /*
1639 * Also charge cpu_tuple_cost per row to account for the costs of
1640 * manipulating the tuplestores. (We don't worry about possible
1641 * spill-to-disk costs.)
1642 */
1643 total_cost += cpu_tuple_cost * total_rows;
1644
1645 runion->startup_cost = startup_cost;
1646 runion->total_cost = total_cost;
1647 runion->rows = total_rows;
1648 runion->pathtarget->width = Max(nrterm->pathtarget->width,
1649 rterm->pathtarget->width);
1650 }
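
/*
 * A hand-checkable restatement of the heuristic above.  This sketch is fenced
 * behind COSTSIZE_EXAMPLES, a made-up macro that no build defines; the
 * example_* name and the 0.01 literal (the stock cpu_tuple_cost default) are
 * illustrative assumptions, not part of the planner.
 */
#ifdef COSTSIZE_EXAMPLES
static void
example_recursive_union_cost(double nrterm_total, double nrterm_rows,
							 double rterm_total, double rterm_rows,
							 double *total_cost, double *total_rows)
{
	double		cpu_tuple = 0.01;	/* assumed cpu_tuple_cost default */

	/* the nonrecursive term runs once, the recursive term ~10 times */
	*total_rows = nrterm_rows + 10 * rterm_rows;
	*total_cost = nrterm_total + 10 * rterm_total;

	/* tuplestore handling: one cpu_tuple_cost per row produced */
	*total_cost += cpu_tuple * (*total_rows);
}
#endif							/* COSTSIZE_EXAMPLES */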
1651
1652 /*
1653 * cost_tuplesort
1654 * Determines and returns the cost of sorting a relation using tuplesort,
1655 * not including the cost of reading the input data.
1656 *
1657 * If the total volume of data to sort is less than sort_mem, we will do
1658 * an in-memory sort, which requires no I/O and about t*log2(t) tuple
1659 * comparisons for t tuples.
1660 *
1661 * If the total volume exceeds sort_mem, we switch to a tape-style merge
1662 * algorithm. There will still be about t*log2(t) tuple comparisons in
1663 * total, but we will also need to write and read each tuple once per
1664 * merge pass. We expect about ceil(logM(r)) merge passes where r is the
1665 * number of initial runs formed and M is the merge order used by tuplesort.c.
1666 * Since the average initial run should be about sort_mem, we have
1667 * disk traffic = 2 * relsize * ceil(logM(p / sort_mem))
1668 * cpu = comparison_cost * t * log2(t)
1669 *
1670 * If the sort is bounded (i.e., only the first k result tuples are needed)
1671 * and k tuples can fit into sort_mem, we use a heap method that keeps only
1672 * k tuples in the heap; this will require about t*log2(k) tuple comparisons.
1673 *
1674 * The disk traffic is assumed to be 3/4ths sequential and 1/4th random
1675 * accesses (XXX can't we refine that guess?)
1676 *
1677 * By default, we charge two operator evals per tuple comparison, which should
1678 * be in the right ballpark in most cases. The caller can tweak this by
1679 * specifying nonzero comparison_cost; typically that's used for any extra
1680 * work that has to be done to prepare the inputs to the comparison operators.
1681 *
1682 * 'tuples' is the number of tuples in the relation
1683 * 'width' is the average tuple width in bytes
1684 * 'comparison_cost' is the extra cost per comparison, if any
1685 * 'sort_mem' is the number of kilobytes of work memory allowed for the sort
1686 * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
1687 */
1688 static void
1689 cost_tuplesort(Cost *startup_cost, Cost *run_cost,
1690 double tuples, int width,
1691 Cost comparison_cost, int sort_mem,
1692 double limit_tuples)
1693 {
1694 double input_bytes = relation_byte_size(tuples, width);
1695 double output_bytes;
1696 double output_tuples;
1697 long sort_mem_bytes = sort_mem * 1024L;
1698
1699 /*
1700 * We want to be sure the cost of a sort is never estimated as zero, even
1701 * if passed-in tuple count is zero. Besides, mustn't do log(0)...
1702 */
1703 if (tuples < 2.0)
1704 tuples = 2.0;
1705
1706 /* Include the default cost-per-comparison */
1707 comparison_cost += 2.0 * cpu_operator_cost;
1708
1709 /* Do we have a useful LIMIT? */
1710 if (limit_tuples > 0 && limit_tuples < tuples)
1711 {
1712 output_tuples = limit_tuples;
1713 output_bytes = relation_byte_size(output_tuples, width);
1714 }
1715 else
1716 {
1717 output_tuples = tuples;
1718 output_bytes = input_bytes;
1719 }
1720
1721 if (output_bytes > sort_mem_bytes)
1722 {
1723 /*
1724 * We'll have to use a disk-based sort of all the tuples
1725 */
1726 double npages = ceil(input_bytes / BLCKSZ);
1727 double nruns = input_bytes / sort_mem_bytes;
1728 double mergeorder = tuplesort_merge_order(sort_mem_bytes);
1729 double log_runs;
1730 double npageaccesses;
1731
1732 /*
1733 * CPU costs
1734 *
1735 * Assume about N log2 N comparisons
1736 */
1737 *startup_cost = comparison_cost * tuples * LOG2(tuples);
1738
1739 /* Disk costs */
1740
1741 /* Compute logM(r) as log(r) / log(M) */
1742 if (nruns > mergeorder)
1743 log_runs = ceil(log(nruns) / log(mergeorder));
1744 else
1745 log_runs = 1.0;
1746 npageaccesses = 2.0 * npages * log_runs;
1747 /* Assume 3/4ths of accesses are sequential, 1/4th are not */
1748 *startup_cost += npageaccesses *
1749 (seq_page_cost * 0.75 + random_page_cost * 0.25);
1750 }
1751 else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
1752 {
1753 /*
1754 * We'll use a bounded heap-sort keeping just K tuples in memory, for
1755 * a total number of tuple comparisons of N log2 K; but the constant
1756 * factor is a bit higher than for quicksort. Tweak it so that the
1757 * cost curve is continuous at the crossover point.
1758 */
1759 *startup_cost = comparison_cost * tuples * LOG2(2.0 * output_tuples);
1760 }
1761 else
1762 {
1763 /* We'll use plain quicksort on all the input tuples */
1764 *startup_cost = comparison_cost * tuples * LOG2(tuples);
1765 }
1766
1767 /*
1768 * Also charge a small amount (arbitrarily set equal to operator cost) per
1769 * extracted tuple. We don't charge cpu_tuple_cost because a Sort node
1770 * doesn't do qual-checking or projection, so it has less overhead than
1771 * most plan nodes. Note it's correct to use tuples not output_tuples
1772 * here --- the upper LIMIT will pro-rate the run cost so we'd be double
1773 * counting the LIMIT otherwise.
1774 */
1775 *run_cost = cpu_operator_cost * tuples;
1776 }
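
/*
 * The external-sort branch above can be replayed in isolation.  Under the
 * same never-defined COSTSIZE_EXAMPLES guard, this sketch recomputes only the
 * disk-traffic charge, assuming an 8kB block size and the stock defaults
 * seq_page_cost = 1.0 and random_page_cost = 4.0; it relies only on <math.h>,
 * which this file already includes.  The helper name is hypothetical.
 */
#ifdef COSTSIZE_EXAMPLES
static double
example_tuplesort_disk_traffic_cost(double input_bytes, double sort_mem_bytes,
									double merge_order)
{
	double		npages = ceil(input_bytes / 8192.0);	/* assumed 8kB BLCKSZ */
	double		nruns = input_bytes / sort_mem_bytes;
	double		log_runs;
	double		npageaccesses;

	/* number of merge passes: ceil(logM(r)), but at least one */
	if (nruns > merge_order)
		log_runs = ceil(log(nruns) / log(merge_order));
	else
		log_runs = 1.0;

	/* each pass writes and reads every page once */
	npageaccesses = 2.0 * npages * log_runs;

	/* charge 3/4 of the accesses as sequential, 1/4 as random */
	return npageaccesses * (1.0 * 0.75 + 4.0 * 0.25);
}
#endif							/* COSTSIZE_EXAMPLES */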
1777
1778 /*
1779 * cost_incremental_sort
1780 * Determines and returns the cost of sorting a relation incrementally, when
1781 * the input path is presorted by a prefix of the pathkeys.
1782 *
1783 * 'presorted_keys' is the number of leading pathkeys by which the input path
1784 * is sorted.
1785 *
1786 * We estimate the number of groups into which the relation is divided by the
1787 * leading pathkeys, and then calculate the cost of sorting a single group
1788 * with tuplesort using cost_tuplesort().
1789 */
1790 void
1791 cost_incremental_sort(Path *path,
1792 PlannerInfo *root, List *pathkeys, int presorted_keys,
1793 Cost input_startup_cost, Cost input_total_cost,
1794 double input_tuples, int width, Cost comparison_cost, int sort_mem,
1795 double limit_tuples)
1796 {
1797 Cost startup_cost = 0,
1798 run_cost = 0,
1799 input_run_cost = input_total_cost - input_startup_cost;
1800 double group_tuples,
1801 input_groups;
1802 Cost group_startup_cost,
1803 group_run_cost,
1804 group_input_run_cost;
1805 List *presortedExprs = NIL;
1806 ListCell *l;
1807 int i = 0;
1808 bool unknown_varno = false;
1809
1810 Assert(presorted_keys != 0);
1811
1812 /*
1813 * We want to be sure the cost of a sort is never estimated as zero, even
1814 * if passed-in tuple count is zero. Besides, mustn't do log(0)...
1815 */
1816 if (input_tuples < 2.0)
1817 input_tuples = 2.0;
1818
1819 /* Default estimate of number of groups, capped to one group per row. */
1820 input_groups = Min(input_tuples, DEFAULT_NUM_DISTINCT);
1821
1822 /*
1823 * Extract presorted keys as list of expressions.
1824 *
1825 * We need to be careful about Vars containing "varno 0" which might have
1826 * been introduced by generate_append_tlist, which would confuse
1827 * estimate_num_groups (in fact it'd fail for such expressions). See
1828 * recurse_set_operations which has to deal with the same issue.
1829 *
1830 * Unlike recurse_set_operations we can't access the original target list
1831 * here, and even if we could it's not very clear how useful that would be
1832 * for a set operation combining multiple tables. So we simply detect if
1833 * there are any expressions with "varno 0" and use the default
1834 * DEFAULT_NUM_DISTINCT in that case.
1835 *
1836 * We might also use either 1.0 (a single group) or input_tuples (each row
1837 * being a separate group), pretty much the worst and best case for
1838 * incremental sort. But those are extreme cases and using something in
1839 * between seems reasonable. Furthermore, generate_append_tlist is used
1840 * for set operations, which are likely to produce mostly unique output
1841 * anyway - from that standpoint the DEFAULT_NUM_DISTINCT is defensive
1842 * while maintaining lower startup cost.
1843 */
1844 foreach(l, pathkeys)
1845 {
1846 PathKey *key = (PathKey *) lfirst(l);
1847 EquivalenceMember *member = (EquivalenceMember *)
1848 linitial(key->pk_eclass->ec_members);
1849
1850 /*
1851 * Check if the expression contains Var with "varno 0" so that we
1852 * don't call estimate_num_groups in that case.
1853 */
1854 if (bms_is_member(0, pull_varnos(root, (Node *) member->em_expr)))
1855 {
1856 unknown_varno = true;
1857 break;
1858 }
1859
1860 /* expression not containing any Vars with "varno 0" */
1861 presortedExprs = lappend(presortedExprs, member->em_expr);
1862
1863 i++;
1864 if (i >= presorted_keys)
1865 break;
1866 }
1867
1868 /* Estimate number of groups with equal presorted keys. */
1869 if (!unknown_varno)
1870 input_groups = estimate_num_groups(root, presortedExprs, input_tuples, NULL);
1871
1872 group_tuples = input_tuples / input_groups;
1873 group_input_run_cost = input_run_cost / input_groups;
1874
1875 /*
1876 * Estimate average cost of sorting of one group where presorted keys are
1877 * equal. Incremental sort is sensitive to distribution of tuples to the
1878 * groups, where we're relying on quite rough assumptions. Thus, we're
1879 * pessimistic about incremental sort performance and increase its average
1880 * group size by half.
1881 */
1882 cost_tuplesort(&group_startup_cost, &group_run_cost,
1883 1.5 * group_tuples, width, comparison_cost, sort_mem,
1884 limit_tuples);
1885
1886 /*
1887 * Startup cost of incremental sort is the startup cost of its first group
1888 * plus the cost of its input.
1889 */
1890 startup_cost += group_startup_cost
1891 + input_startup_cost + group_input_run_cost;
1892
1893 /*
1894 * After we started producing tuples from the first group, the cost of
1895 * producing all the tuples is given by the cost to finish processing this
1896 * group, plus the total cost to process the remaining groups, plus the
1897 * remaining cost of input.
1898 */
1899 run_cost += group_run_cost
1900 + (group_run_cost + group_startup_cost) * (input_groups - 1)
1901 + group_input_run_cost * (input_groups - 1);
1902
1903 /*
1904 * Incremental sort adds some overhead by itself. Firstly, it has to
1905 * detect the sort groups. This is roughly equal to one extra copy and
1906 * comparison per tuple. Secondly, it has to reset the tuplesort context
1907 * for every group.
1908 */
1909 run_cost += (cpu_tuple_cost + comparison_cost) * input_tuples;
1910 run_cost += 2.0 * cpu_tuple_cost * input_groups;
1911
1912 path->rows = input_tuples;
1913 path->startup_cost = startup_cost;
1914 path->total_cost = startup_cost + run_cost;
1915 }
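
/*
 * How the per-group figures above assemble into startup and run cost, shown
 * with plain doubles for the quantities a caller would supply.  Fenced behind
 * the never-defined COSTSIZE_EXAMPLES guard; the function name and the 0.01
 * cpu_tuple_cost literal are illustrative assumptions.
 */
#ifdef COSTSIZE_EXAMPLES
static void
example_incremental_sort_cost(double group_startup, double group_run,
							  double group_input_run, double input_startup,
							  double input_groups, double input_tuples,
							  double comparison_cost,
							  double *startup_cost, double *run_cost)
{
	double		cpu_tuple = 0.01;	/* assumed cpu_tuple_cost default */

	/* pay for the input and the first group before emitting anything */
	*startup_cost = input_startup + group_input_run + group_startup;

	/* finish the first group, then process the remaining groups and input */
	*run_cost = group_run
		+ (group_run + group_startup) * (input_groups - 1)
		+ group_input_run * (input_groups - 1);

	/* overhead: group detection, plus per-group tuplesort resets */
	*run_cost += (cpu_tuple + comparison_cost) * input_tuples;
	*run_cost += 2.0 * cpu_tuple * input_groups;
}
#endif							/* COSTSIZE_EXAMPLES */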
1916
1917 /*
1918 * cost_sort
1919 * Determines and returns the cost of sorting a relation, including
1920 * the cost of reading the input data.
1921 *
1922 * NOTE: some callers currently pass NIL for pathkeys because they
1923 * can't conveniently supply the sort keys. Since this routine doesn't
1924 * currently do anything with pathkeys anyway, that doesn't matter...
1925 * but if it ever does, it should react gracefully to lack of key data.
1926 * (Actually, the thing we'd most likely be interested in is just the number
1927 * of sort keys, which all callers *could* supply.)
1928 */
1929 void
1930 cost_sort(Path *path, PlannerInfo *root,
1931 List *pathkeys, Cost input_cost, double tuples, int width,
1932 Cost comparison_cost, int sort_mem,
1933 double limit_tuples)
1934
1935 {
1936 Cost startup_cost;
1937 Cost run_cost;
1938
1939 cost_tuplesort(&startup_cost, &run_cost,
1940 tuples, width,
1941 comparison_cost, sort_mem,
1942 limit_tuples);
1943
1944 if (!enable_sort)
1945 startup_cost += disable_cost;
1946
1947 startup_cost += input_cost;
1948
1949 path->rows = tuples;
1950 path->startup_cost = startup_cost;
1951 path->total_cost = startup_cost + run_cost;
1952 }
1953
1954 /*
1955 * append_nonpartial_cost
1956 * Estimate the cost of the non-partial paths in a Parallel Append.
1957 * The non-partial paths are assumed to be the first "numpaths" paths
1958 * from the subpaths list, and to be in order of decreasing cost.
1959 */
1960 static Cost
1961 append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers)
1962 {
1963 Cost *costarr;
1964 int arrlen;
1965 ListCell *l;
1966 ListCell *cell;
1967 int i;
1968 int path_index;
1969 int min_index;
1970 int max_index;
1971
1972 if (numpaths == 0)
1973 return 0;
1974
1975 /*
1976 * Array length is number of workers or number of relevant paths,
1977 * whichever is less.
1978 */
1979 arrlen = Min(parallel_workers, numpaths);
1980 costarr = (Cost *) palloc(sizeof(Cost) * arrlen);
1981
1982 /* The first few paths will each be claimed by a different worker. */
1983 path_index = 0;
1984 foreach(cell, subpaths)
1985 {
1986 Path *subpath = (Path *) lfirst(cell);
1987
1988 if (path_index == arrlen)
1989 break;
1990 costarr[path_index++] = subpath->total_cost;
1991 }
1992
1993 /*
1994 * Since subpaths are sorted by decreasing cost, the last one will have
1995 * the minimum cost.
1996 */
1997 min_index = arrlen - 1;
1998
1999 /*
2000 * For each of the remaining subpaths, add its cost to the array element
2001 * with minimum cost.
2002 */
2003 for_each_cell(l, subpaths, cell)
2004 {
2005 Path *subpath = (Path *) lfirst(l);
2006 int i;
2007
2008 /* Consider only the non-partial paths */
2009 if (path_index++ == numpaths)
2010 break;
2011
2012 costarr[min_index] += subpath->total_cost;
2013
2014 /* Update the new min cost array index */
2015 for (min_index = i = 0; i < arrlen; i++)
2016 {
2017 if (costarr[i] < costarr[min_index])
2018 min_index = i;
2019 }
2020 }
2021
2022 /* Return the highest cost from the array */
2023 for (max_index = i = 0; i < arrlen; i++)
2024 {
2025 if (costarr[i] > costarr[max_index])
2026 max_index = i;
2027 }
2028
2029 return costarr[max_index];
2030 }
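
/*
 * The same greedy scheme, restated over a plain C array of per-path total
 * costs sorted by decreasing cost: the first arrlen paths each seed one
 * bucket (worker), every later path is added to the currently cheapest
 * bucket, and the most expensive bucket bounds the non-partial cost of the
 * Parallel Append.  Fenced behind the never-defined COSTSIZE_EXAMPLES guard;
 * the fixed 16-bucket cap and the example_* name are assumptions of the
 * sketch only.
 */
#ifdef COSTSIZE_EXAMPLES
static double
example_nonpartial_cost(const double *path_costs, int numpaths, int workers)
{
	double		buckets[16];	/* assume workers <= 16 for this sketch */
	int			arrlen = (workers < numpaths) ? workers : numpaths;
	int			i,
				j,
				min_i,
				max_i;

	if (numpaths == 0 || arrlen <= 0)
		return 0.0;

	/* one bucket per worker, seeded with the most expensive paths */
	for (i = 0; i < arrlen; i++)
		buckets[i] = path_costs[i];

	/* each remaining path goes to whichever bucket is currently cheapest */
	for (i = arrlen; i < numpaths; i++)
	{
		for (min_i = 0, j = 1; j < arrlen; j++)
			if (buckets[j] < buckets[min_i])
				min_i = j;
		buckets[min_i] += path_costs[i];
	}

	/* the costliest bucket is the answer */
	for (max_i = 0, j = 1; j < arrlen; j++)
		if (buckets[j] > buckets[max_i])
			max_i = j;
	return buckets[max_i];
}
#endif							/* COSTSIZE_EXAMPLES */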
2031
2032 /*
2033 * cost_append
2034 * Determines and returns the cost of an Append node.
2035 */
2036 void
2037 cost_append(AppendPath *apath)
2038 {
2039 ListCell *l;
2040
2041 apath->path.startup_cost = 0;
2042 apath->path.total_cost = 0;
2043 apath->path.rows = 0;
2044
2045 if (apath->subpaths == NIL)
2046 return;
2047
2048 if (!apath->path.parallel_aware)
2049 {
2050 List *pathkeys = apath->path.pathkeys;
2051
2052 if (pathkeys == NIL)
2053 {
2054 Path *subpath = (Path *) linitial(apath->subpaths);
2055
2056 /*
2057 * For an unordered, non-parallel-aware Append we take the startup
2058 * cost as the startup cost of the first subpath.
2059 */
2060 apath->path.startup_cost = subpath->startup_cost;
2061
2062 /* Compute rows and costs as sums of subplan rows and costs. */
2063 foreach(l, apath->subpaths)
2064 {
2065 Path *subpath = (Path *) lfirst(l);
2066
2067 apath->path.rows += subpath->rows;
2068 apath->path.total_cost += subpath->total_cost;
2069 }
2070 }
2071 else
2072 {
2073 /*
2074 * For an ordered, non-parallel-aware Append we take the startup
2075 * cost as the sum of the subpath startup costs. This ensures
2076 * that we don't underestimate the startup cost when a query's
2077 * LIMIT is such that several of the children have to be run to
2078 * satisfy it. This might be overkill --- another plausible hack
2079 * would be to take the Append's startup cost as the maximum of
2080 * the child startup costs. But we don't want to risk believing
2081 * that an ORDER BY LIMIT query can be satisfied at small cost
2082 * when the first child has small startup cost but later ones
2083 * don't. (If we had the ability to deal with nonlinear cost
2084 * interpolation for partial retrievals, we would not need to be
2085 * so conservative about this.)
2086 *
2087 * This case is also different from the above in that we have to
2088 * account for possibly injecting sorts into subpaths that aren't
2089 * natively ordered.
2090 */
2091 foreach(l, apath->subpaths)
2092 {
2093 Path *subpath = (Path *) lfirst(l);
2094 Path sort_path; /* dummy for result of cost_sort */
2095
2096 if (!pathkeys_contained_in(pathkeys, subpath->pathkeys))
2097 {
2098 /*
2099 * We'll need to insert a Sort node, so include costs for
2100 * that. We can use the parent's LIMIT if any, since we
2101 * certainly won't pull more than that many tuples from
2102 * any child.
2103 */
2104 cost_sort(&sort_path,
2105 NULL, /* doesn't currently need root */
2106 pathkeys,
2107 subpath->total_cost,
2108 subpath->rows,
2109 subpath->pathtarget->width,
2110 0.0,
2111 work_mem,
2112 apath->limit_tuples);
2113 subpath = &sort_path;
2114 }
2115
2116 apath->path.rows += subpath->rows;
2117 apath->path.startup_cost += subpath->startup_cost;
2118 apath->path.total_cost += subpath->total_cost;
2119 }
2120 }
2121 }
2122 else /* parallel-aware */
2123 {
2124 int i = 0;
2125 double parallel_divisor = get_parallel_divisor(&apath->path);
2126
2127 /* Parallel-aware Append never produces ordered output. */
2128 Assert(apath->path.pathkeys == NIL);
2129
2130 /* Calculate startup cost. */
2131 foreach(l, apath->subpaths)
2132 {
2133 Path *subpath = (Path *) lfirst(l);
2134
2135 /*
2136 * Append will start returning tuples when the child node having
2137 * lowest startup cost is done setting up. We consider only the
2138 * first few subplans that immediately get a worker assigned.
2139 */
2140 if (i == 0)
2141 apath->path.startup_cost = subpath->startup_cost;
2142 else if (i < apath->path.parallel_workers)
2143 apath->path.startup_cost = Min(apath->path.startup_cost,
2144 subpath->startup_cost);
2145
2146 /*
2147 * Apply parallel divisor to subpaths. Scale the number of rows
2148 * for each partial subpath based on the ratio of the parallel
2149 * divisor originally used for the subpath to the one we adopted.
2150 * Also add the cost of partial paths to the total cost, but
2151 * ignore non-partial paths for now.
2152 */
2153 if (i < apath->first_partial_path)
2154 apath->path.rows += subpath->rows / parallel_divisor;
2155 else
2156 {
2157 double subpath_parallel_divisor;
2158
2159 subpath_parallel_divisor = get_parallel_divisor(subpath);
2160 apath->path.rows += subpath->rows * (subpath_parallel_divisor /
2161 parallel_divisor);
2162 apath->path.total_cost += subpath->total_cost;
2163 }
2164
2165 apath->path.rows = clamp_row_est(apath->path.rows);
2166
2167 i++;
2168 }
2169
2170 /* Add cost for non-partial subpaths. */
2171 apath->path.total_cost +=
2172 append_nonpartial_cost(apath->subpaths,
2173 apath->first_partial_path,
2174 apath->path.parallel_workers);
2175 }
2176
2177 /*
2178 * Although Append does not do any selection or projection, it's not free;
2179 * add a small per-tuple overhead.
2180 */
2181 apath->path.total_cost +=
2182 cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * apath->path.rows;
2183 }
2184
2185 /*
2186 * cost_merge_append
2187 * Determines and returns the cost of a MergeAppend node.
2188 *
2189 * MergeAppend merges several pre-sorted input streams, using a heap that
2190 * at any given instant holds the next tuple from each stream. If there
2191 * are N streams, we need about N*log2(N) tuple comparisons to construct
2192 * the heap at startup, and then for each output tuple, about log2(N)
2193 * comparisons to replace the top entry.
2194 *
2195 * (The effective value of N will drop once some of the input streams are
2196 * exhausted, but it seems unlikely to be worth trying to account for that.)
2197 *
2198 * The heap is never spilled to disk, since we assume N is not very large.
2199 * So this is much simpler than cost_sort.
2200 *
2201 * As in cost_sort, we charge two operator evals per tuple comparison.
2202 *
2203 * 'pathkeys' is a list of sort keys
2204 * 'n_streams' is the number of input streams
2205 * 'input_startup_cost' is the sum of the input streams' startup costs
2206 * 'input_total_cost' is the sum of the input streams' total costs
2207 * 'tuples' is the number of tuples in all the streams
2208 */
2209 void
2210 cost_merge_append(Path *path, PlannerInfo *root,
2211 List *pathkeys, int n_streams,
2212 Cost input_startup_cost, Cost input_total_cost,
2213 double tuples)
2214 {
2215 Cost startup_cost = 0;
2216 Cost run_cost = 0;
2217 Cost comparison_cost;
2218 double N;
2219 double logN;
2220
2221 /*
2222 * Avoid log(0)...
2223 */
2224 N = (n_streams < 2) ? 2.0 : (double) n_streams;
2225 logN = LOG2(N);
2226
2227 /* Assumed cost per tuple comparison */
2228 comparison_cost = 2.0 * cpu_operator_cost;
2229
2230 /* Heap creation cost */
2231 startup_cost += comparison_cost * N * logN;
2232
2233 /* Per-tuple heap maintenance cost */
2234 run_cost += tuples * comparison_cost * logN;
2235
2236 /*
2237 * Although MergeAppend does not do any selection or projection, it's not
2238 * free; add a small per-tuple overhead.
2239 */
2240 run_cost += cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * tuples;
2241
2242 path->startup_cost = startup_cost + input_startup_cost;
2243 path->total_cost = startup_cost + run_cost + input_total_cost;
2244 }
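
/*
 * The heap arithmetic above, restated under the never-defined
 * COSTSIZE_EXAMPLES guard with the stock defaults cpu_operator_cost = 0.0025
 * and cpu_tuple_cost = 0.01, and assuming APPEND_CPU_COST_MULTIPLIER is 0.5.
 * The helper name is hypothetical and <math.h> is already included here.
 */
#ifdef COSTSIZE_EXAMPLES
static void
example_merge_append_cost(int n_streams, double tuples,
						  double *startup_cost, double *run_cost)
{
	double		cpu_op = 0.0025;	/* assumed cpu_operator_cost default */
	double		cpu_tuple = 0.01;	/* assumed cpu_tuple_cost default */
	double		comparison_cost = 2.0 * cpu_op;
	double		N = (n_streams < 2) ? 2.0 : (double) n_streams;
	double		logN = log(N) / log(2.0);

	*startup_cost = comparison_cost * N * logN; /* build the heap */
	*run_cost = tuples * comparison_cost * logN;	/* replace top per tuple */
	*run_cost += cpu_tuple * 0.5 * tuples;	/* small per-tuple overhead */
}
#endif							/* COSTSIZE_EXAMPLES */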
2245
2246 /*
2247 * cost_material
2248 * Determines and returns the cost of materializing a relation, including
2249 * the cost of reading the input data.
2250 *
2251 * If the total volume of data to materialize exceeds work_mem, we will need
2252 * to write it to disk, so the cost is much higher in that case.
2253 *
2254 * Note that here we are estimating the costs for the first scan of the
2255 * relation, so the materialization is all overhead --- any savings will
2256 * occur only on rescan, which is estimated in cost_rescan.
2257 */
2258 void
2259 cost_material(Path *path,
2260 Cost input_startup_cost, Cost input_total_cost,
2261 double tuples, int width)
2262 {
2263 Cost startup_cost = input_startup_cost;
2264 Cost run_cost = input_total_cost - input_startup_cost;
2265 double nbytes = relation_byte_size(tuples, width);
2266 long work_mem_bytes = work_mem * 1024L;
2267
2268 path->rows = tuples;
2269
2270 /*
2271 * Whether spilling or not, charge 2x cpu_operator_cost per tuple to
2272 * reflect bookkeeping overhead. (This rate must be more than what
2273 * cost_rescan charges for materialize, ie, cpu_operator_cost per tuple;
2274 * if it is exactly the same then there will be a cost tie between
2275 * nestloop with A outer, materialized B inner and nestloop with B outer,
2276 * materialized A inner. The extra cost ensures we'll prefer
2277 * materializing the smaller rel.) Note that this is normally a good deal
2278 * less than cpu_tuple_cost; which is OK because a Material plan node
2279 * doesn't do qual-checking or projection, so it's got less overhead than
2280 * most plan nodes.
2281 */
2282 run_cost += 2 * cpu_operator_cost * tuples;
2283
2284 /*
2285 * If we will spill to disk, charge at the rate of seq_page_cost per page.
2286 * This cost is assumed to be evenly spread through the plan run phase,
2287 * which isn't exactly accurate but our cost model doesn't allow for
2288 * nonuniform costs within the run phase.
2289 */
2290 if (nbytes > work_mem_bytes)
2291 {
2292 double npages = ceil(nbytes / BLCKSZ);
2293
2294 run_cost += seq_page_cost * npages;
2295 }
2296
2297 path->startup_cost = startup_cost;
2298 path->total_cost = startup_cost + run_cost;
2299 }
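
/*
 * A compressed view of the run-cost charges above, under the never-defined
 * COSTSIZE_EXAMPLES guard.  The per-tuple byte estimate "width + 24" is a
 * crude stand-in for relation_byte_size(), and the 0.0025 and 1.0 literals
 * assume the stock cpu_operator_cost and seq_page_cost defaults with an 8kB
 * block size; none of this is the planner's actual code path.
 */
#ifdef COSTSIZE_EXAMPLES
static double
example_material_run_cost(double input_run_cost, double tuples, double width,
						  double work_mem_bytes)
{
	double		cpu_op = 0.0025;	/* assumed cpu_operator_cost default */
	double		nbytes = tuples * (width + 24); /* rough stand-in for
												 * relation_byte_size() */
	double		run_cost = input_run_cost;

	/* tuplestore bookkeeping: 2x cpu_operator_cost per tuple */
	run_cost += 2 * cpu_op * tuples;

	/* spill to disk if the data does not fit in work_mem */
	if (nbytes > work_mem_bytes)
		run_cost += 1.0 * ceil(nbytes / 8192.0);	/* seq_page_cost * pages */

	return run_cost;
}
#endif							/* COSTSIZE_EXAMPLES */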
2300
2301 /*
2302 * cost_agg
2303 * Determines and returns the cost of performing an Agg plan node,
2304 * including the cost of its input.
2305 *
2306 * aggcosts can be NULL when there are no actual aggregate functions (i.e.,
2307 * we are using a hashed Agg node just to do grouping).
2308 *
2309 * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs
2310 * are for appropriately-sorted input.
2311 */
2312 void
2313 cost_agg(Path *path, PlannerInfo *root,
2314 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
2315 int numGroupCols, double numGroups,
2316 List *quals,
2317 Cost input_startup_cost, Cost input_total_cost,
2318 double input_tuples, double input_width)
2319 {
2320 double output_tuples;
2321 Cost startup_cost;
2322 Cost total_cost;
2323 AggClauseCosts dummy_aggcosts;
2324
2325 /* Use all-zero per-aggregate costs if NULL is passed */
2326 if (aggcosts == NULL)
2327 {
2328 Assert(aggstrategy == AGG_HASHED);
2329 MemSet(&dummy_aggcosts, 0, sizeof(AggClauseCosts));
2330 aggcosts = &dummy_aggcosts;
2331 }
2332
2333 /*
2334 * The transCost.per_tuple component of aggcosts should be charged once
2335 * per input tuple, corresponding to the costs of evaluating the aggregate
2336 * transfns and their input expressions. The finalCost.per_tuple component
2337 * is charged once per output tuple, corresponding to the costs of
2338 * evaluating the finalfns. Startup costs are of course charged but once.
2339 *
2340 * If we are grouping, we charge an additional cpu_operator_cost per
2341 * grouping column per input tuple for grouping comparisons.
2342 *
2343 * We will produce a single output tuple if not grouping, and a tuple per
2344 * group otherwise. We charge cpu_tuple_cost for each output tuple.
2345 *
2346 * Note: in this cost model, AGG_SORTED and AGG_HASHED have exactly the
2347 * same total CPU cost, but AGG_SORTED has lower startup cost. If the
2348 * input path is already sorted appropriately, AGG_SORTED should be
2349 * preferred (since it has no risk of memory overflow). This will happen
2350 * as long as the computed total costs are indeed exactly equal --- but if
2351 * there's roundoff error we might do the wrong thing. So be sure that
2352 * the computations below form the same intermediate values in the same
2353 * order.
2354 */
2355 if (aggstrategy == AGG_PLAIN)
2356 {
2357 startup_cost = input_total_cost;
2358 startup_cost += aggcosts->transCost.startup;
2359 startup_cost += aggcosts->transCost.per_tuple * input_tuples;
2360 startup_cost += aggcosts->finalCost.startup;
2361 startup_cost += aggcosts->finalCost.per_tuple;
2362 /* we aren't grouping */
2363 total_cost = startup_cost + cpu_tuple_cost;
2364 output_tuples = 1;
2365 }
2366 else if (aggstrategy == AGG_SORTED || aggstrategy == AGG_MIXED)
2367 {
2368 /* Here we are able to deliver output on-the-fly */
2369 startup_cost = input_startup_cost;
2370 total_cost = input_total_cost;
2371 if (aggstrategy == AGG_MIXED && !enable_hashagg)
2372 {
2373 startup_cost += disable_cost;
2374 total_cost += disable_cost;
2375 }
2376 /* calcs phrased this way to match HASHED case, see note above */
2377 total_cost += aggcosts->transCost.startup;
2378 total_cost += aggcosts->transCost.per_tuple * input_tuples;
2379 total_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
2380 total_cost += aggcosts->finalCost.startup;
2381 total_cost += aggcosts->finalCost.per_tuple * numGroups;
2382 total_cost += cpu_tuple_cost * numGroups;
2383 output_tuples = numGroups;
2384 }
2385 else
2386 {
2387 /* must be AGG_HASHED */
2388 startup_cost = input_total_cost;
2389 if (!enable_hashagg)
2390 startup_cost += disable_cost;
2391 startup_cost += aggcosts->transCost.startup;
2392 startup_cost += aggcosts->transCost.per_tuple * input_tuples;
2393 /* cost of computing hash value */
2394 startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
2395 startup_cost += aggcosts->finalCost.startup;
2396
2397 total_cost = startup_cost;
2398 total_cost += aggcosts->finalCost.per_tuple * numGroups;
2399 /* cost of retrieving from hash table */
2400 total_cost += cpu_tuple_cost * numGroups;
2401 output_tuples = numGroups;
2402 }
2403
2404 /*
2405 * Add the disk costs of hash aggregation that spills to disk.
2406 *
2407 * Groups that go into the hash table stay in memory until finalized, so
2408 * spilling and reprocessing tuples doesn't incur additional invocations
2409 * of transCost or finalCost. Furthermore, the computed hash value is
2410 * stored with the spilled tuples, so we don't incur extra invocations of
2411 * the hash function.
2412 *
2413 * Hash Agg begins returning tuples after the first batch is complete.
2414 * Accrue writes (spilled tuples) to startup_cost and to total_cost;
2415 * accrue reads only to total_cost.
2416 */
2417 if (aggstrategy == AGG_HASHED || aggstrategy == AGG_MIXED)
2418 {
2419 double pages;
2420 double pages_written = 0.0;
2421 double pages_read = 0.0;
2422 double spill_cost;
2423 double hashentrysize;
2424 double nbatches;
2425 Size mem_limit;
2426 uint64 ngroups_limit;
2427 int num_partitions;
2428 int depth;
2429
2430 /*
2431 * Estimate number of batches based on the computed limits. If less
2432 * than or equal to one, all groups are expected to fit in memory;
2433 * otherwise we expect to spill.
2434 */
2435 hashentrysize = hash_agg_entry_size(aggcosts->numAggs, input_width,
2436 aggcosts->transitionSpace);
2437 hash_agg_set_limits(hashentrysize, numGroups, 0, &mem_limit,
2438 &ngroups_limit, &num_partitions);
2439
2440 nbatches = Max((numGroups * hashentrysize) / mem_limit,
2441 numGroups / ngroups_limit);
2442
2443 nbatches = Max(ceil(nbatches), 1.0);
2444 num_partitions = Max(num_partitions, 2);
2445
2446 /*
2447 * The number of partitions can change at different levels of
2448 * recursion; but for the purposes of this calculation assume it stays
2449 * constant.
2450 */
2451 depth = ceil(log(nbatches) / log(num_partitions));
2452
2453 /*
2454 * Estimate number of pages read and written. For each level of
2455 * recursion, a tuple must be written and then later read.
2456 */
2457 pages = relation_byte_size(input_tuples, input_width) / BLCKSZ;
2458 pages_written = pages_read = pages * depth;
2459
2460 /*
2461 * HashAgg has somewhat worse IO behavior than Sort on typical
2462 * hardware/OS combinations. Account for this with a generic penalty.
2463 */
2464 pages_read *= 2.0;
2465 pages_written *= 2.0;
2466
2467 startup_cost += pages_written * random_page_cost;
2468 total_cost += pages_written * random_page_cost;
2469 total_cost += pages_read * seq_page_cost;
2470
2471 /* account for CPU cost of spilling a tuple and reading it back */
2472 spill_cost = depth * input_tuples * 2.0 * cpu_tuple_cost;
2473 startup_cost += spill_cost;
2474 total_cost += spill_cost;
2475 }
2476
2477 /*
2478 * If there are quals (HAVING quals), account for their cost and
2479 * selectivity.
2480 */
2481 if (quals)
2482 {
2483 QualCost qual_cost;
2484
2485 cost_qual_eval(&qual_cost, quals, root);
2486 startup_cost += qual_cost.startup;
2487 total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;
2488
2489 output_tuples = clamp_row_est(output_tuples *
2490 clauselist_selectivity(root,
2491 quals,
2492 0,
2493 JOIN_INNER,
2494 NULL));
2495 }
2496
2497 path->rows = output_tuples;
2498 path->startup_cost = startup_cost;
2499 path->total_cost = total_cost;
2500 }
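
/*
 * The spill charges in the AGG_HASHED/AGG_MIXED branch above, isolated under
 * the never-defined COSTSIZE_EXAMPLES guard.  The caller is assumed to supply
 * nbatches and num_partitions as computed from hash_agg_set_limits(); the 1.0,
 * 4.0 and 0.01 literals are the stock seq_page_cost, random_page_cost and
 * cpu_tuple_cost defaults, and an 8kB block size is assumed.
 */
#ifdef COSTSIZE_EXAMPLES
static void
example_hashagg_spill_cost(double input_bytes, double input_tuples,
						   double nbatches, double num_partitions,
						   double *extra_startup, double *extra_total)
{
	double		seq_page = 1.0; /* assumed seq_page_cost default */
	double		rand_page = 4.0;	/* assumed random_page_cost default */
	double		cpu_tuple = 0.01;	/* assumed cpu_tuple_cost default */
	double		depth = ceil(log(nbatches) / log(num_partitions));
	double		pages = input_bytes / 8192.0;	/* assumed 8kB BLCKSZ */
	double		pages_written = pages * depth * 2.0;	/* 2x I/O penalty */
	double		pages_read = pages * depth * 2.0;
	double		spill_cpu = depth * input_tuples * 2.0 * cpu_tuple;

	/* writes count toward startup and total; reads only toward total */
	*extra_startup = pages_written * rand_page + spill_cpu;
	*extra_total = pages_written * rand_page + pages_read * seq_page + spill_cpu;
}
#endif							/* COSTSIZE_EXAMPLES */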
2501
2502 /*
2503 * cost_windowagg
2504 * Determines and returns the cost of performing a WindowAgg plan node,
2505 * including the cost of its input.
2506 *
2507 * Input is assumed already properly sorted.
2508 */
2509 void
2510 cost_windowagg(Path *path, PlannerInfo *root,
2511 List *windowFuncs, int numPartCols, int numOrderCols,
2512 Cost input_startup_cost, Cost input_total_cost,
2513 double input_tuples)
2514 {
2515 Cost startup_cost;
2516 Cost total_cost;
2517 ListCell *lc;
2518
2519 startup_cost = input_startup_cost;
2520 total_cost = input_total_cost;
2521
2522 /*
2523 * Window functions are assumed to cost their stated execution cost, plus
2524 * the cost of evaluating their input expressions, per tuple. Since they
2525 * may in fact evaluate their inputs at multiple rows during each cycle,
2526 * this could be a drastic underestimate; but without a way to know how
2527 * many rows the window function will fetch, it's hard to do better. In
2528 * any case, it's a good estimate for all the built-in window functions,
2529 * so we'll just do this for now.
2530 */
2531 foreach(lc, windowFuncs)
2532 {
2533 WindowFunc *wfunc = lfirst_node(WindowFunc, lc);
2534 Cost wfunccost;
2535 QualCost argcosts;
2536
2537 argcosts.startup = argcosts.per_tuple = 0;
2538 add_function_cost(root, wfunc->winfnoid, (Node *) wfunc,
2539 &argcosts);
2540 startup_cost += argcosts.startup;
2541 wfunccost = argcosts.per_tuple;
2542
2543 /* also add the input expressions' cost to per-input-row costs */
2544 cost_qual_eval_node(&argcosts, (Node *) wfunc->args, root);
2545 startup_cost += argcosts.startup;
2546 wfunccost += argcosts.per_tuple;
2547
2548 /*
2549 * Add the filter's cost to per-input-row costs. XXX We should reduce
2550 * input expression costs according to filter selectivity.
2551 */
2552 cost_qual_eval_node(&argcosts, (Node *) wfunc->aggfilter, root);
2553 startup_cost += argcosts.startup;
2554 wfunccost += argcosts.per_tuple;
2555
2556 total_cost += wfunccost * input_tuples;
2557 }
2558
2559 /*
2560 * We also charge cpu_operator_cost per grouping column per tuple for
2561 * grouping comparisons, plus cpu_tuple_cost per tuple for general
2562 * overhead.
2563 *
2564 * XXX this neglects costs of spooling the data to disk when it overflows
2565 * work_mem. Sooner or later that should get accounted for.
2566 */
2567 total_cost += cpu_operator_cost * (numPartCols + numOrderCols) * input_tuples;
2568 total_cost += cpu_tuple_cost * input_tuples;
2569
2570 path->rows = input_tuples;
2571 path->startup_cost = startup_cost;
2572 path->total_cost = total_cost;
2573 }
2574
2575 /*
2576 * cost_group
2577 * Determines and returns the cost of performing a Group plan node,
2578 * including the cost of its input.
2579 *
2580 * Note: caller must ensure that input costs are for appropriately-sorted
2581 * input.
2582 */
2583 void
2584 cost_group(Path *path, PlannerInfo *root,
2585 int numGroupCols, double numGroups,
2586 List *quals,
2587 Cost input_startup_cost, Cost input_total_cost,
2588 double input_tuples)
2589 {
2590 double output_tuples;
2591 Cost startup_cost;
2592 Cost total_cost;
2593
2594 output_tuples = numGroups;
2595 startup_cost = input_startup_cost;
2596 total_cost = input_total_cost;
2597
2598 /*
2599 * Charge one cpu_operator_cost per comparison per input tuple. We assume
2600 * all columns get compared for most of the tuples.
2601 */
2602 total_cost += cpu_operator_cost * input_tuples * numGroupCols;
2603
2604 /*
2605 * If there are quals (HAVING quals), account for their cost and
2606 * selectivity.
2607 */
2608 if (quals)
2609 {
2610 QualCost qual_cost;
2611
2612 cost_qual_eval(&qual_cost, quals, root);
2613 startup_cost += qual_cost.startup;
2614 total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;
2615
2616 output_tuples = clamp_row_est(output_tuples *
2617 clauselist_selectivity(root,
2618 quals,
2619 0,
2620 JOIN_INNER,
2621 NULL));
2622 }
2623
2624 path->rows = output_tuples;
2625 path->startup_cost = startup_cost;
2626 path->total_cost = total_cost;
2627 }
2628
2629 /*
2630 * initial_cost_nestloop
2631 * Preliminary estimate of the cost of a nestloop join path.
2632 *
2633 * This must quickly produce lower-bound estimates of the path's startup and
2634 * total costs. If we are unable to eliminate the proposed path from
2635 * consideration using the lower bounds, final_cost_nestloop will be called
2636 * to obtain the final estimates.
2637 *
2638 * The exact division of labor between this function and final_cost_nestloop
2639 * is private to them, and represents a tradeoff between speed of the initial
2640 * estimate and getting a tight lower bound. We choose to not examine the
2641 * join quals here, since that's by far the most expensive part of the
2642 * calculations. The end result is that CPU-cost considerations must be
2643 * left for the second phase; and for SEMI/ANTI joins, we must also postpone
2644 * incorporation of the inner path's run cost.
2645 *
2646 * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
2647 * other data to be used by final_cost_nestloop
2648 * 'jointype' is the type of join to be performed
2649 * 'outer_path' is the outer input to the join
2650 * 'inner_path' is the inner input to the join
2651 * 'extra' contains miscellaneous information about the join
2652 */
2653 void
2654 initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace,
2655 JoinType jointype,
2656 Path *outer_path, Path *inner_path,
2657 JoinPathExtraData *extra)
2658 {
2659 Cost startup_cost = 0;
2660 Cost run_cost = 0;
2661 double outer_path_rows = outer_path->rows;
2662 Cost inner_rescan_start_cost;
2663 Cost inner_rescan_total_cost;
2664 Cost inner_run_cost;
2665 Cost inner_rescan_run_cost;
2666
2667 /* estimate costs to rescan the inner relation */
2668 cost_rescan(root, inner_path,
2669 &inner_rescan_start_cost,
2670 &inner_rescan_total_cost);
2671
2672 /* cost of source data */
2673
2674 /*
2675 * NOTE: clearly, we must pay both outer and inner paths' startup_cost
2676 * before we can start returning tuples, so the join's startup cost is
2677 * their sum. We'll also pay the inner path's rescan startup cost
2678 * multiple times.
2679 */
2680 startup_cost += outer_path->startup_cost + inner_path->startup_cost;
2681 run_cost += outer_path->total_cost - outer_path->startup_cost;
2682 if (outer_path_rows > 1)
2683 run_cost += (outer_path_rows - 1) * inner_rescan_start_cost;
2684
2685 inner_run_cost = inner_path->total_cost - inner_path->startup_cost;
2686 inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost;
2687
2688 if (jointype == JOIN_SEMI || jointype == JOIN_ANTI ||
2689 extra->inner_unique)
2690 {
2691 /*
2692 * With a SEMI or ANTI join, or if the innerrel is known unique, the
2693 * executor will stop after the first match.
2694 *
2695 * Getting decent estimates requires inspection of the join quals,
2696 * which we choose to postpone to final_cost_nestloop.
2697 */
2698
2699 /* Save private data for final_cost_nestloop */
2700 workspace->inner_run_cost = inner_run_cost;
2701 workspace->inner_rescan_run_cost = inner_rescan_run_cost;
2702 }
2703 else
2704 {
2705 /* Normal case; we'll scan whole input rel for each outer row */
2706 run_cost += inner_run_cost;
2707 if (outer_path_rows > 1)
2708 run_cost += (outer_path_rows - 1) * inner_rescan_run_cost;
2709 }
2710
2711 /* CPU costs left for later */
2712
2713 /* Public result fields */
2714 workspace->startup_cost = startup_cost;
2715 workspace->total_cost = startup_cost + run_cost;
2716 /* Save private data for final_cost_nestloop */
2717 workspace->run_cost = run_cost;
2718 }
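
/*
 * The "normal case" source-data charges above, written out with plain doubles
 * under the never-defined COSTSIZE_EXAMPLES guard.  The hypothetical helper
 * takes the outer/inner startup and total costs plus the inner rescan costs
 * that cost_rescan() would report.
 */
#ifdef COSTSIZE_EXAMPLES
static void
example_nestloop_source_cost(double outer_startup, double outer_total,
							 double outer_rows,
							 double inner_startup, double inner_total,
							 double inner_rescan_start,
							 double inner_rescan_total,
							 double *startup_cost, double *run_cost)
{
	/* both inputs must be started before the first join row can come out */
	*startup_cost = outer_startup + inner_startup;
	*run_cost = outer_total - outer_startup;

	/* restart the inner side once per additional outer row */
	if (outer_rows > 1)
		*run_cost += (outer_rows - 1) * inner_rescan_start;

	/* non-SEMI/ANTI case: scan the whole inner side for every outer row */
	*run_cost += inner_total - inner_startup;
	if (outer_rows > 1)
		*run_cost += (outer_rows - 1) *
			(inner_rescan_total - inner_rescan_start);
}
#endif							/* COSTSIZE_EXAMPLES */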
2719
2720 /*
2721 * final_cost_nestloop
2722 * Final estimate of the cost and result size of a nestloop join path.
2723 *
2724 * 'path' is already filled in except for the rows and cost fields
2725 * 'workspace' is the result from initial_cost_nestloop
2726 * 'extra' contains miscellaneous information about the join
2727 */
2728 void
2729 final_cost_nestloop(PlannerInfo *root, NestPath *path,
2730 JoinCostWorkspace *workspace,
2731 JoinPathExtraData *extra)
2732 {
2733 Path *outer_path = path->outerjoinpath;
2734 Path *inner_path = path->innerjoinpath;
2735 double outer_path_rows = outer_path->rows;
2736 double inner_path_rows = inner_path->rows;
2737 Cost startup_cost = workspace->startup_cost;
2738 Cost run_cost = workspace->run_cost;
2739 Cost cpu_per_tuple;
2740 QualCost restrict_qual_cost;
2741 double ntuples;
2742
2743 /* Protect some assumptions below that rowcounts aren't zero or NaN */
2744 if (outer_path_rows <= 0 || isnan(outer_path_rows))
2745 outer_path_rows = 1;
2746 if (inner_path_rows <= 0 || isnan(inner_path_rows))
2747 inner_path_rows = 1;
2748
2749 /* Mark the path with the correct row estimate */
2750 if (path->path.param_info)
2751 path->path.rows = path->path.param_info->ppi_rows;
2752 else
2753 path->path.rows = path->path.parent->rows;
2754
2755 /* For partial paths, scale row estimate. */
2756 if (path->path.parallel_workers > 0)
2757 {
2758 double parallel_divisor = get_parallel_divisor(&path->path);
2759
2760 path->path.rows =
2761 clamp_row_est(path->path.rows / parallel_divisor);
2762 }
2763
2764 /*
2765 * We could include disable_cost in the preliminary estimate, but that
2766 * would amount to optimizing for the case where the join method is
2767 * disabled, which doesn't seem like the way to bet.
2768 */
2769 if (!enable_nestloop)
2770 startup_cost += disable_cost;
2771
2772 /* cost of inner-relation source data (we already dealt with outer rel) */
2773
2774 if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI ||
2775 extra->inner_unique)
2776 {
2777 /*
2778 * With a SEMI or ANTI join, or if the innerrel is known unique, the
2779 * executor will stop after the first match.
2780 */
2781 Cost inner_run_cost = workspace->inner_run_cost;
2782 Cost inner_rescan_run_cost = workspace->inner_rescan_run_cost;
2783 double outer_matched_rows;
2784 double outer_unmatched_rows;
2785 Selectivity inner_scan_frac;
2786
2787 /*
2788 * For an outer-rel row that has at least one match, we can expect the
2789 * inner scan to stop after a fraction 1/(match_count+1) of the inner
2790 * rows, if the matches are evenly distributed. Since they probably
2791 * aren't quite evenly distributed, we apply a fuzz factor of 2.0 to
2792 * that fraction. (If we used a larger fuzz factor, we'd have to
2793 * clamp inner_scan_frac to at most 1.0; but since match_count is at
2794 * least 1, no such clamp is needed now.)
2795 */
2796 outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
2797 outer_unmatched_rows = outer_path_rows - outer_matched_rows;
2798 inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);
2799
2800 /*
2801 * Compute number of tuples processed (not number emitted!). First,
2802 * account for successfully-matched outer rows.
2803 */
2804 ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac;
2805
2806 /*
2807 * Now we need to estimate the actual costs of scanning the inner
2808 * relation, which may be quite a bit less than N times inner_run_cost
2809 * due to early scan stops. We consider two cases. If the inner path
2810 * is an indexscan using all the joinquals as indexquals, then an
2811 * unmatched outer row results in an indexscan returning no rows,
2812 * which is probably quite cheap. Otherwise, the executor will have
2813 * to scan the whole inner rel for an unmatched row; not so cheap.
2814 */
2815 if (has_indexed_join_quals(path))
2816 {
2817 /*
2818 * Successfully-matched outer rows will only require scanning
2819 * inner_scan_frac of the inner relation. In this case, we don't
2820 * need to charge the full inner_run_cost even when that's more
2821 * than inner_rescan_run_cost, because we can assume that none of
2822 * the inner scans ever scan the whole inner relation. So it's
2823 * okay to assume that all the inner scan executions can be
2824 * fractions of the full cost, even if materialization is reducing
2825 * the rescan cost. At this writing, it's impossible to get here
2826 * for a materialized inner scan, so inner_run_cost and
2827 * inner_rescan_run_cost will be the same anyway; but just in
2828 * case, use inner_run_cost for the first matched tuple and
2829 * inner_rescan_run_cost for additional ones.
2830 */
2831 run_cost += inner_run_cost * inner_scan_frac;
2832 if (outer_matched_rows > 1)
2833 run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac;
2834
2835 /*
2836 * Add the cost of inner-scan executions for unmatched outer rows.
2837 * We estimate this as the same cost as returning the first tuple
2838 * of a nonempty scan. We consider that these are all rescans,
2839 * since we used inner_run_cost once already.
2840 */
2841 run_cost += outer_unmatched_rows *
2842 inner_rescan_run_cost / inner_path_rows;
2843
2844 /*
2845 * We won't be evaluating any quals at all for unmatched rows, so
2846 * don't add them to ntuples.
2847 */
2848 }
2849 else
2850 {
2851 /*
2852 * Here, a complicating factor is that rescans may be cheaper than
2853 * first scans. If we never scan all the way to the end of the
2854 * inner rel, it might be (depending on the plan type) that we'd
2855 * never pay the whole inner first-scan run cost. However it is
2856 * difficult to estimate whether that will happen (and it could
2857 * not happen if there are any unmatched outer rows!), so be
2858 * conservative and always charge the whole first-scan cost once.
2859 * We consider this charge to correspond to the first unmatched
2860 * outer row, unless there isn't one in our estimate, in which
2861 * case blame it on the first matched row.
2862 */
2863
2864 /* First, count all unmatched join tuples as being processed */
2865 ntuples += outer_unmatched_rows * inner_path_rows;
2866
2867 /* Now add the forced full scan, and decrement appropriate count */
2868 run_cost += inner_run_cost;
2869 if (outer_unmatched_rows >= 1)
2870 outer_unmatched_rows -= 1;
2871 else
2872 outer_matched_rows -= 1;
2873
2874 /* Add inner run cost for additional outer tuples having matches */
2875 if (outer_matched_rows > 0)
2876 run_cost += outer_matched_rows * inner_rescan_run_cost * inner_scan_frac;
2877
2878 /* Add inner run cost for additional unmatched outer tuples */
2879 if (outer_unmatched_rows > 0)
2880 run_cost += outer_unmatched_rows * inner_rescan_run_cost;
2881 }
2882 }
2883 else
2884 {
2885 /* Normal-case source costs were included in preliminary estimate */
2886
2887 /* Compute number of tuples processed (not number emitted!) */
2888 ntuples = outer_path_rows * inner_path_rows;
2889 }
2890
2891 /* CPU costs */
2892 cost_qual_eval(&restrict_qual_cost, path->joinrestrictinfo, root);
2893 startup_cost += restrict_qual_cost.startup;
2894 cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple;
2895 run_cost += cpu_per_tuple * ntuples;
2896
2897 /* tlist eval costs are paid per output row, not per tuple scanned */
2898 startup_cost += path->path.pathtarget->cost.startup;
2899 run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
2900
2901 path->path.startup_cost = startup_cost;
2902 path->path.total_cost = startup_cost + run_cost;
2903 }
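
/*
 * The early-stop fraction used in the SEMI/ANTI/unique-inner branch above,
 * pulled out under the never-defined COSTSIZE_EXAMPLES guard.  The defensive
 * clamp is an assumption of the sketch; in the caller match_count is already
 * at least 1, which is what keeps the result at or below 1.0.
 */
#ifdef COSTSIZE_EXAMPLES
static double
example_semijoin_inner_scan_frac(double match_count)
{
	/*
	 * With matches spread evenly, a matched outer row stops the inner scan
	 * after about 1/(match_count+1) of it; the 2.0 fuzz factor allows for
	 * uneven spread.
	 */
	if (match_count < 1.0)
		match_count = 1.0;
	return 2.0 / (match_count + 1.0);
}
#endif							/* COSTSIZE_EXAMPLES */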
2904
2905 /*
2906 * initial_cost_mergejoin
2907 * Preliminary estimate of the cost of a mergejoin path.
2908 *
2909 * This must quickly produce lower-bound estimates of the path's startup and
2910 * total costs. If we are unable to eliminate the proposed path from
2911 * consideration using the lower bounds, final_cost_mergejoin will be called
2912 * to obtain the final estimates.
2913 *
2914 * The exact division of labor between this function and final_cost_mergejoin
2915 * is private to them, and represents a tradeoff between speed of the initial
2916 * estimate and getting a tight lower bound. We choose to not examine the
2917 * join quals here, except for obtaining the scan selectivity estimate which
2918 * is really essential (but fortunately, use of caching keeps the cost of
2919 * getting that down to something reasonable).
2920 * We also assume that cost_sort is cheap enough to use here.
2921 *
2922 * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
2923 * other data to be used by final_cost_mergejoin
2924 * 'jointype' is the type of join to be performed
2925 * 'mergeclauses' is the list of joinclauses to be used as merge clauses
2926 * 'outer_path' is the outer input to the join
2927 * 'inner_path' is the inner input to the join
2928 * 'outersortkeys' is the list of sort keys for the outer path
2929 * 'innersortkeys' is the list of sort keys for the inner path
2930 * 'extra' contains miscellaneous information about the join
2931 *
2932 * Note: outersortkeys and innersortkeys should be NIL if no explicit
2933 * sort is needed because the respective source path is already ordered.
2934 */
2935 void
2936 initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace,
2937 JoinType jointype,
2938 List *mergeclauses,
2939 Path *outer_path, Path *inner_path,
2940 List *outersortkeys, List *innersortkeys,
2941 JoinPathExtraData *extra)
2942 {
2943 Cost startup_cost = 0;
2944 Cost run_cost = 0;
2945 double outer_path_rows = outer_path->rows;
2946 double inner_path_rows = inner_path->rows;
2947 Cost inner_run_cost;
2948 double outer_rows,
2949 inner_rows,
2950 outer_skip_rows,
2951 inner_skip_rows;
2952 Selectivity outerstartsel,
2953 outerendsel,
2954 innerstartsel,
2955 innerendsel;
2956 Path sort_path; /* dummy for result of cost_sort */
2957
2958 /* Protect some assumptions below that rowcounts aren't zero or NaN */
2959 if (outer_path_rows <= 0 || isnan(outer_path_rows))
2960 outer_path_rows = 1;
2961 if (inner_path_rows <= 0 || isnan(inner_path_rows))
2962 inner_path_rows = 1;
2963
2964 /*
2965 * A merge join will stop as soon as it exhausts either input stream
2966 * (unless it's an outer join, in which case the outer side has to be
2967 * scanned all the way anyway). Estimate fraction of the left and right
2968 * inputs that will actually need to be scanned. Likewise, we can
2969 * estimate the number of rows that will be skipped before the first join
2970 * pair is found, which should be factored into startup cost. We use only
2971 * the first (most significant) merge clause for this purpose. Since
2972 * mergejoinscansel() is a fairly expensive computation, we cache the
2973 * results in the merge clause RestrictInfo.
2974 */
2975 if (mergeclauses && jointype != JOIN_FULL)
2976 {
2977 RestrictInfo *firstclause = (RestrictInfo *) linitial(mergeclauses);
2978 List *opathkeys;
2979 List *ipathkeys;
2980 PathKey *opathkey;
2981 PathKey *ipathkey;
2982 MergeScanSelCache *cache;
2983
2984 /* Get the input pathkeys to determine the sort-order details */
2985 opathkeys = outersortkeys ? outersortkeys : outer_path->pathkeys;
2986 ipathkeys = innersortkeys ? innersortkeys : inner_path->pathkeys;
2987 Assert(opathkeys);
2988 Assert(ipathkeys);
2989 opathkey = (PathKey *) linitial(opathkeys);
2990 ipathkey = (PathKey *) linitial(ipathkeys);
2991 /* debugging check */
2992 if (opathkey->pk_opfamily != ipathkey->pk_opfamily ||
2993 opathkey->pk_eclass->ec_collation != ipathkey->pk_eclass->ec_collation ||
2994 opathkey->pk_strategy != ipathkey->pk_strategy ||
2995 opathkey->pk_nulls_first != ipathkey->pk_nulls_first)
2996 elog(ERROR, "left and right pathkeys do not match in mergejoin");
2997
2998 /* Get the selectivity with caching */
2999 cache = cached_scansel(root, firstclause, opathkey);
3000
3001 if (bms_is_subset(firstclause->left_relids,
3002 outer_path->parent->relids))
3003 {
3004 /* left side of clause is outer */
3005 outerstartsel = cache->leftstartsel;
3006 outerendsel = cache->leftendsel;
3007 innerstartsel = cache->rightstartsel;
3008 innerendsel = cache->rightendsel;
3009 }
3010 else
3011 {
3012 /* left side of clause is inner */
3013 outerstartsel = cache->rightstartsel;
3014 outerendsel = cache->rightendsel;
3015 innerstartsel = cache->leftstartsel;
3016 innerendsel = cache->leftendsel;
3017 }
3018 if (jointype == JOIN_LEFT ||
3019 jointype == JOIN_ANTI)
3020 {
3021 outerstartsel = 0.0;
3022 outerendsel = 1.0;
3023 }
3024 else if (jointype == JOIN_RIGHT)
3025 {
3026 innerstartsel = 0.0;
3027 innerendsel = 1.0;
3028 }
3029 }
3030 else
3031 {
3032 /* cope with clauseless or full mergejoin */
3033 outerstartsel = innerstartsel = 0.0;
3034 outerendsel = innerendsel = 1.0;
3035 }
3036
3037 /*
3038 * Convert selectivities to row counts. We force outer_rows and
3039 * inner_rows to be at least 1, but the skip_rows estimates can be zero.
3040 */
3041 outer_skip_rows = rint(outer_path_rows * outerstartsel);
3042 inner_skip_rows = rint(inner_path_rows * innerstartsel);
3043 outer_rows = clamp_row_est(outer_path_rows * outerendsel);
3044 inner_rows = clamp_row_est(inner_path_rows * innerendsel);
3045
3046 /* skip rows can become NaN when path rows has become infinite */
3047 Assert(outer_skip_rows <= outer_rows || isnan(outer_skip_rows));
3048 Assert(inner_skip_rows <= inner_rows || isnan(inner_skip_rows));
3049
3050 /*
3051 * Readjust scan selectivities to account for above rounding. This is
3052 * normally an insignificant effect, but when there are only a few rows in
3053 * the inputs, failing to do this makes for a large percentage error.
3054 */
3055 outerstartsel = outer_skip_rows / outer_path_rows;
3056 innerstartsel = inner_skip_rows / inner_path_rows;
3057 outerendsel = outer_rows / outer_path_rows;
3058 innerendsel = inner_rows / inner_path_rows;
3059
3060 /* start sel can become NaN when path rows has become infinite */
3061 Assert(outerstartsel <= outerendsel || isnan(outerstartsel));
3062 Assert(innerstartsel <= innerendsel || isnan(innerstartsel));
3063
3064 /* cost of source data */
3065
3066 if (outersortkeys) /* do we need to sort outer? */
3067 {
3068 cost_sort(&sort_path,
3069 root,
3070 outersortkeys,
3071 outer_path->total_cost,
3072 outer_path_rows,
3073 outer_path->pathtarget->width,
3074 0.0,
3075 work_mem,
3076 -1.0);
3077 startup_cost += sort_path.startup_cost;
3078 startup_cost += (sort_path.total_cost - sort_path.startup_cost)
3079 * outerstartsel;
3080 run_cost += (sort_path.total_cost - sort_path.startup_cost)
3081 * (outerendsel - outerstartsel);
3082 }
3083 else
3084 {
3085 startup_cost += outer_path->startup_cost;
3086 startup_cost += (outer_path->total_cost - outer_path->startup_cost)
3087 * outerstartsel;
3088 run_cost += (outer_path->total_cost - outer_path->startup_cost)
3089 * (outerendsel - outerstartsel);
3090 }
3091
3092 if (innersortkeys) /* do we need to sort inner? */
3093 {
3094 cost_sort(&sort_path,
3095 root,
3096 innersortkeys,
3097 inner_path->total_cost,
3098 inner_path_rows,
3099 inner_path->pathtarget->width,
3100 0.0,
3101 work_mem,
3102 -1.0);
3103 startup_cost += sort_path.startup_cost;
3104 startup_cost += (sort_path.total_cost - sort_path.startup_cost)
3105 * innerstartsel;
3106 inner_run_cost = (sort_path.total_cost - sort_path.startup_cost)
3107 * (innerendsel - innerstartsel);
3108 }
3109 else
3110 {
3111 startup_cost += inner_path->startup_cost;
3112 startup_cost += (inner_path->total_cost - inner_path->startup_cost)
3113 * innerstartsel;
3114 inner_run_cost = (inner_path->total_cost - inner_path->startup_cost)
3115 * (innerendsel - innerstartsel);
3116 }
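/*
 * Added illustration (hypothetical numbers, no explicit sort): if the outer
 * path has startup_cost = 10 and total_cost = 110, with outerstartsel = 0.1
 * and outerendsel = 0.9, then startup_cost above gains 10 + 100 * 0.1 = 20
 * and run_cost gains 100 * (0.9 - 0.1) = 80; the final 10% of the outer
 * input, which the merge will never need to read, is not charged at all.
 */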
3117
3118 /*
3119 * We can't yet determine whether rescanning occurs, or whether
3120 * materialization of the inner input should be done. The minimum
3121 * possible inner input cost, regardless of rescan and materialization
3122 * considerations, is inner_run_cost. We include that in
3123 * workspace->total_cost, but not yet in run_cost.
3124 */
3125
3126 /* CPU costs left for later */
3127
3128 /* Public result fields */
3129 workspace->startup_cost = startup_cost;
3130 workspace->total_cost = startup_cost + run_cost + inner_run_cost;
3131 /* Save private data for final_cost_mergejoin */
3132 workspace->run_cost = run_cost;
3133 workspace->inner_run_cost = inner_run_cost;
3134 workspace->outer_rows = outer_rows;
3135 workspace->inner_rows = inner_rows;
3136 workspace->outer_skip_rows = outer_skip_rows;
3137 workspace->inner_skip_rows = inner_skip_rows;
3138 }
3139
3140 /*
3141 * final_cost_mergejoin
3142 * Final estimate of the cost and result size of a mergejoin path.
3143 *
3144 * Unlike other costsize functions, this routine makes two actual decisions:
3145 * whether the executor will need to do mark/restore, and whether we should
3146 * materialize the inner path. It would be logically cleaner to build
3147 * separate paths testing these alternatives, but that would require repeating
3148 * most of the cost calculations, which are not all that cheap. Since the
3149 * choice will not affect output pathkeys or startup cost, only total cost,
3150 * there is no possibility of wanting to keep more than one path. So it seems
3151 * best to make the decisions here and record them in the path's
3152 * skip_mark_restore and materialize_inner fields.
3153 *
3154 * Mark/restore overhead is usually required, but can be skipped if we know
3155 * that the executor need find only one match per outer tuple, and that the
3156 * mergeclauses are sufficient to identify a match.
3157 *
3158 * We materialize the inner path if we need mark/restore and either the inner
3159 * path can't support mark/restore, or it's cheaper to use an interposed
3160 * Material node to handle mark/restore.
3161 *
3162 * 'path' is already filled in except for the rows and cost fields and
3163 * skip_mark_restore and materialize_inner
3164 * 'workspace' is the result from initial_cost_mergejoin
3165 * 'extra' contains miscellaneous information about the join
3166 */
3167 void
3168 final_cost_mergejoin(PlannerInfo *root, MergePath *path,
3169 JoinCostWorkspace *workspace,
3170 JoinPathExtraData *extra)
3171 {
3172 Path *outer_path = path->jpath.outerjoinpath;
3173 Path *inner_path = path->jpath.innerjoinpath;
3174 double inner_path_rows = inner_path->rows;
3175 List *mergeclauses = path->path_mergeclauses;
3176 List *innersortkeys = path->innersortkeys;
3177 Cost startup_cost = workspace->startup_cost;
3178 Cost run_cost = workspace->run_cost;
3179 Cost inner_run_cost = workspace->inner_run_cost;
3180 double outer_rows = workspace->outer_rows;
3181 double inner_rows = workspace->inner_rows;
3182 double outer_skip_rows = workspace->outer_skip_rows;
3183 double inner_skip_rows = workspace->inner_skip_rows;
3184 Cost cpu_per_tuple,
3185 bare_inner_cost,
3186 mat_inner_cost;
3187 QualCost merge_qual_cost;
3188 QualCost qp_qual_cost;
3189 double mergejointuples,
3190 rescannedtuples;
3191 double rescanratio;
3192
3193 /* Protect some assumptions below that rowcounts aren't zero or NaN */
3194 if (inner_path_rows <= 0 || isnan(inner_path_rows))
3195 inner_path_rows = 1;
3196
3197 /* Mark the path with the correct row estimate */
3198 if (path->jpath.path.param_info)
3199 path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
3200 else
3201 path->jpath.path.rows = path->jpath.path.parent->rows;
3202
3203 /* For partial paths, scale row estimate. */
3204 if (path->jpath.path.parallel_workers > 0)
3205 {
3206 double parallel_divisor = get_parallel_divisor(&path->jpath.path);
3207
3208 path->jpath.path.rows =
3209 clamp_row_est(path->jpath.path.rows / parallel_divisor);
3210 }
3211
3212 /*
3213 * We could include disable_cost in the preliminary estimate, but that
3214 * would amount to optimizing for the case where the join method is
3215 * disabled, which doesn't seem like the way to bet.
3216 */
3217 if (!enable_mergejoin)
3218 startup_cost += disable_cost;
3219
3220 /*
3221 * Compute cost of the mergequals and qpquals (other restriction clauses)
3222 * separately.
3223 */
3224 cost_qual_eval(&merge_qual_cost, mergeclauses, root);
3225 cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
3226 qp_qual_cost.startup -= merge_qual_cost.startup;
3227 qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple;
3228
3229 /*
3230 * With a SEMI or ANTI join, or if the innerrel is known unique, the
3231 * executor will stop scanning for matches after the first match. When
3232 * all the joinclauses are merge clauses, this means we don't ever need to
3233 * back up the merge, and so we can skip mark/restore overhead.
3234 */
3235 if ((path->jpath.jointype == JOIN_SEMI ||
3236 path->jpath.jointype == JOIN_ANTI ||
3237 extra->inner_unique) &&
3238 (list_length(path->jpath.joinrestrictinfo) ==
3239 list_length(path->path_mergeclauses)))
3240 path->skip_mark_restore = true;
3241 else
3242 path->skip_mark_restore = false;
3243
3244 /*
3245 * Get approx # tuples passing the mergequals. We use approx_tuple_count
3246 * here because we need an estimate done with JOIN_INNER semantics.
3247 */
3248 mergejointuples = approx_tuple_count(root, &path->jpath, mergeclauses);
3249
3250 /*
3251 * When there are equal merge keys in the outer relation, the mergejoin
3252 * must rescan any matching tuples in the inner relation. This means
3253 * re-fetching inner tuples; we have to estimate how often that happens.
3254 *
3255 * For regular inner and outer joins, the number of re-fetches can be
3256 * estimated approximately as size of merge join output minus size of
3257 * inner relation. Assume that the distinct key values are 1, 2, ..., and
3258 * denote the number of values of each key in the outer relation as m1,
3259 * m2, ...; in the inner relation, n1, n2, ... Then we have
3260 *
3261 * size of join = m1 * n1 + m2 * n2 + ...
3262 *
3263 * number of rescanned tuples = (m1 - 1) * n1 + (m2 - 1) * n2 + ... = m1 *
3264 * n1 + m2 * n2 + ... - (n1 + n2 + ...) = size of join - size of inner
3265 * relation
3266 *
3267 * This equation works correctly for outer tuples having no inner match
3268 * (nk = 0), but not for inner tuples having no outer match (mk = 0); we
3269 * are effectively subtracting those from the number of rescanned tuples,
3270 * when we should not. Can we do better without expensive selectivity
3271 * computations?
3272 *
3273 * The whole issue is moot if we are working from a unique-ified outer
3274 * input, or if we know we don't need to mark/restore at all.
3275 */
3276 if (IsA(outer_path, UniquePath) || path->skip_mark_restore)
3277 rescannedtuples = 0;
3278 else
3279 {
3280 rescannedtuples = mergejointuples - inner_path_rows;
3281 /* Must clamp because of possible underestimate */
3282 if (rescannedtuples < 0)
3283 rescannedtuples = 0;
3284 }
3285
3286 /*
3287 * We'll inflate various costs this much to account for rescanning. Note
3288 * that this is to be multiplied by something involving inner_rows, or
3289 * another number related to the portion of the inner rel we'll scan.
3290 */
3291 rescanratio = 1.0 + (rescannedtuples / inner_rows);
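/*
 * Added illustration (hypothetical numbers): for two key values with outer
 * counts m = (2, 1) and inner counts n = (3, 4), size of join = 2*3 + 1*4 = 10
 * and size of inner relation = 7, so rescannedtuples = 10 - 7 = 3, which
 * matches (2-1)*3 + (1-1)*4.  With inner_rows = 7 this gives rescanratio =
 * 1 + 3/7, i.e. about 1.43.
 */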
3292
3293 /*
3294 * Decide whether we want to materialize the inner input to shield it from
3295 * mark/restore overhead and from performing re-fetches. Our cost model for regular
3296 * re-fetches is that a re-fetch costs the same as an original fetch,
3297 * which is probably an overestimate; but on the other hand we ignore the
3298 * bookkeeping costs of mark/restore. Not clear if it's worth developing
3299 * a more refined model. So we just need to inflate the inner run cost by
3300 * rescanratio.
3301 */
3302 bare_inner_cost = inner_run_cost * rescanratio;
3303
3304 /*
3305 * When we interpose a Material node the re-fetch cost is assumed to be
3306 * just cpu_operator_cost per tuple, independently of the underlying
3307 * plan's cost; and we charge an extra cpu_operator_cost per original
3308 * fetch as well. Note that we're assuming the materialize node will
3309 * never spill to disk, since it only has to remember tuples back to the
3310 * last mark. (If there are a huge number of duplicates, our other cost
3311 * factors will make the path so expensive that it probably won't get
3312 * chosen anyway.) So we don't use cost_rescan here.
3313 *
3314 * Note: keep this estimate in sync with create_mergejoin_plan's labeling
3315 * of the generated Material node.
3316 */
3317 mat_inner_cost = inner_run_cost +
3318 cpu_operator_cost * inner_rows * rescanratio;
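/*
 * Added illustration (hypothetical numbers, default cpu_operator_cost =
 * 0.0025): with inner_run_cost = 100, rescanratio = 1.5 and inner_rows = 1000,
 * bare_inner_cost = 150 while mat_inner_cost = 100 + 0.0025 * 1000 * 1.5 =
 * 103.75, so materializing looks cheaper here (assuming enable_material is on).
 */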
3319
3320 /*
3321 * If we don't need mark/restore at all, we don't need materialization.
3322 */
3323 if (path->skip_mark_restore)
3324 path->materialize_inner = false;
3325
3326 /*
3327 * Prefer materializing if it looks cheaper, unless the user has asked to
3328 * suppress materialization.
3329 */
3330 else if (enable_material && mat_inner_cost < bare_inner_cost)
3331 path->materialize_inner = true;
3332
3333 /*
3334 * Even if materializing doesn't look cheaper, we *must* do it if the
3335 * inner path is to be used directly (without sorting) and it doesn't
3336 * support mark/restore.
3337 *
3338 * Since the inner side must be ordered, and only Sorts and IndexScans can
3339 * create order to begin with, and they both support mark/restore, you
3340 * might think there's no problem --- but you'd be wrong. Nestloop and
3341 * merge joins can *preserve* the order of their inputs, so they can be
3342 * selected as the input of a mergejoin, and they don't support
3343 * mark/restore at present.
3344 *
3345 * We don't test the value of enable_material here, because
3346 * materialization is required for correctness in this case, and turning
3347 * it off does not entitle us to deliver an invalid plan.
3348 */
3349 else if (innersortkeys == NIL &&
3350 !ExecSupportsMarkRestore(inner_path))
3351 path->materialize_inner = true;
3352
3353 /*
3354 * Also, force materializing if the inner path is to be sorted and the
3355 * sort is expected to spill to disk. This is because the final merge
3356 * pass can be done on-the-fly if it doesn't have to support mark/restore.
3357 * We don't try to adjust the cost estimates for this consideration,
3358 * though.
3359 *
3360 * Since materialization is a performance optimization in this case,
3361 * rather than necessary for correctness, we skip it if enable_material is
3362 * off.
3363 */
3364 else if (enable_material && innersortkeys != NIL &&
3365 relation_byte_size(inner_path_rows,
3366 inner_path->pathtarget->width) >
3367 (work_mem * 1024L))
3368 path->materialize_inner = true;
3369 else
3370 path->materialize_inner = false;
3371
3372 /* Charge the right incremental cost for the chosen case */
3373 if (path->materialize_inner)
3374 run_cost += mat_inner_cost;
3375 else
3376 run_cost += bare_inner_cost;
3377
3378 /* CPU costs */
3379
3380 /*
3381 * The number of tuple comparisons needed is approximately number of outer
3382 * rows plus number of inner rows plus number of rescanned tuples (can we
3383 * refine this?). At each one, we need to evaluate the mergejoin quals.
3384 */
3385 startup_cost += merge_qual_cost.startup;
3386 startup_cost += merge_qual_cost.per_tuple *
3387 (outer_skip_rows + inner_skip_rows * rescanratio);
3388 run_cost += merge_qual_cost.per_tuple *
3389 ((outer_rows - outer_skip_rows) +
3390 (inner_rows - inner_skip_rows) * rescanratio);
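/*
 * Added illustration (hypothetical numbers): with no skipped rows,
 * outer_rows = 1000, inner_rows = 500 and rescanratio = 1.2, run_cost is
 * charged merge_qual_cost.per_tuple * (1000 + 500 * 1.2) = per_tuple * 1600.
 */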
3391
3392 /*
3393 * For each tuple that gets through the mergejoin proper, we charge
3394 * cpu_tuple_cost plus the cost of evaluating additional restriction
3395 * clauses that are to be applied at the join. (This is pessimistic since
3396 * not all of the quals may get evaluated at each tuple.)
3397 *
3398 * Note: we could adjust for SEMI/ANTI joins skipping some qual
3399 * evaluations here, but it's probably not worth the trouble.
3400 */
3401 startup_cost += qp_qual_cost.startup;
3402 cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
3403 run_cost += cpu_per_tuple * mergejointuples;
3404
3405 /* tlist eval costs are paid per output row, not per tuple scanned */
3406 startup_cost += path->jpath.path.pathtarget->cost.startup;
3407 run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
3408
3409 path->jpath.path.startup_cost = startup_cost;
3410 path->jpath.path.total_cost = startup_cost + run_cost;
3411 }
3412
3413 /*
3414 * run mergejoinscansel() with caching
3415 */
3416 static MergeScanSelCache *
3417 cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey)
3418 {
3419 MergeScanSelCache *cache;
3420 ListCell *lc;
3421 Selectivity leftstartsel,
3422 leftendsel,
3423 rightstartsel,
3424 rightendsel;
3425 MemoryContext oldcontext;
3426
3427 /* Do we have this result already? */
3428 foreach(lc, rinfo->scansel_cache)
3429 {
3430 cache = (MergeScanSelCache *) lfirst(lc);
3431 if (cache->opfamily == pathkey->pk_opfamily &&
3432 cache->collation == pathkey->pk_eclass->ec_collation &&
3433 cache->strategy == pathkey->pk_strategy &&
3434 cache->nulls_first == pathkey->pk_nulls_first)
3435 return cache;
3436 }
3437
3438 /* Nope, do the computation */
3439 mergejoinscansel(root,
3440 (Node *) rinfo->clause,
3441 pathkey->pk_opfamily,
3442 pathkey->pk_strategy,
3443 pathkey->pk_nulls_first,
3444 &leftstartsel,
3445 &leftendsel,
3446 &rightstartsel,
3447 &rightendsel);
3448
3449 /* Cache the result in suitably long-lived workspace */
3450 oldcontext = MemoryContextSwitchTo(root->planner_cxt);
3451
3452 cache = (MergeScanSelCache *) palloc(sizeof(MergeScanSelCache));
3453 cache->opfamily = pathkey->pk_opfamily;
3454 cache->collation = pathkey->pk_eclass->ec_collation;
3455 cache->strategy = pathkey->pk_strategy;
3456 cache->nulls_first = pathkey->pk_nulls_first;
3457 cache->leftstartsel = leftstartsel;
3458 cache->leftendsel = leftendsel;
3459 cache->rightstartsel = rightstartsel;
3460 cache->rightendsel = rightendsel;
3461
3462 rinfo->scansel_cache = lappend(rinfo->scansel_cache, cache);
3463
3464 MemoryContextSwitchTo(oldcontext);
3465
3466 return cache;
3467 }
3468
3469 /*
3470 * initial_cost_hashjoin
3471 * Preliminary estimate of the cost of a hashjoin path.
3472 *
3473 * This must quickly produce lower-bound estimates of the path's startup and
3474 * total costs. If we are unable to eliminate the proposed path from
3475 * consideration using the lower bounds, final_cost_hashjoin will be called
3476 * to obtain the final estimates.
3477 *
3478 * The exact division of labor between this function and final_cost_hashjoin
3479 * is private to them, and represents a tradeoff between speed of the initial
3480 * estimate and getting a tight lower bound. We choose to not examine the
3481 * join quals here (other than by counting the number of hash clauses),
3482 * so we can't do much with CPU costs. We do assume that
3483 * ExecChooseHashTableSize is cheap enough to use here.
3484 *
3485 * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
3486 * other data to be used by final_cost_hashjoin
3487 * 'jointype' is the type of join to be performed
3488 * 'hashclauses' is the list of joinclauses to be used as hash clauses
3489 * 'outer_path' is the outer input to the join
3490 * 'inner_path' is the inner input to the join
3491 * 'extra' contains miscellaneous information about the join
3492 * 'parallel_hash' indicates that inner_path is partial and that a shared
3493 * hash table will be built in parallel
3494 */
3495 void
3496 initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
3497 JoinType jointype,
3498 List *hashclauses,
3499 Path *outer_path, Path *inner_path,
3500 JoinPathExtraData *extra,
3501 bool parallel_hash)
3502 {
3503 Cost startup_cost = 0;
3504 Cost run_cost = 0;
3505 double outer_path_rows = outer_path->rows;
3506 double inner_path_rows = inner_path->rows;
3507 double inner_path_rows_total = inner_path_rows;
3508 int num_hashclauses = list_length(hashclauses);
3509 int numbuckets;
3510 int numbatches;
3511 int num_skew_mcvs;
3512 size_t space_allowed; /* unused */
3513
3514 /* cost of source data */
3515 startup_cost += outer_path->startup_cost;
3516 run_cost += outer_path->total_cost - outer_path->startup_cost;
3517 startup_cost += inner_path->total_cost;
3518
3519 /*
3520 * Cost of computing hash function: must do it once per input tuple. We
3521 * charge one cpu_operator_cost for each column's hash function. Also,
3522 * tack on one cpu_tuple_cost per inner row, to model the costs of
3523 * inserting the row into the hashtable.
3524 *
3525 * XXX when a hashclause is more complex than a single operator, we really
3526 * should charge the extra eval costs of the left or right side, as
3527 * appropriate, here. This seems more work than it's worth at the moment.
3528 */
3529 startup_cost += (cpu_operator_cost * num_hashclauses + cpu_tuple_cost)
3530 * inner_path_rows;
3531 run_cost += cpu_operator_cost * num_hashclauses * outer_path_rows;
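/*
 * Added illustration (hypothetical numbers, default cpu_operator_cost =
 * 0.0025 and cpu_tuple_cost = 0.01): with 2 hash clauses, 10000 inner rows
 * and 20000 outer rows, startup_cost gains (0.0025*2 + 0.01) * 10000 = 150
 * for building the hash table and run_cost gains 0.0025*2 * 20000 = 100 for
 * hashing the probe side.
 */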
3532
3533 /*
3534 * If this is a parallel hash build, then the value we have for
3535 * inner_rows_total currently refers only to the rows returned by each
3536 * participant. For shared hash table size estimation, we need the total
3537 * number, so we need to undo the division.
3538 */
3539 if (parallel_hash)
3540 inner_path_rows_total *= get_parallel_divisor(inner_path);
3541
3542 /*
3543 * Get hash table size that executor would use for inner relation.
3544 *
3545 * XXX for the moment, always assume that skew optimization will be
3546 * performed. As long as SKEW_HASH_MEM_PERCENT is small, it's not worth
3547 * trying to determine that for sure.
3548 *
3549 * XXX at some point it might be interesting to try to account for skew
3550 * optimization in the cost estimate, but for now, we don't.
3551 */
3552 ExecChooseHashTableSize(inner_path_rows_total,
3553 inner_path->pathtarget->width,
3554 true, /* useskew */
3555 parallel_hash, /* try_combined_hash_mem */
3556 outer_path->parallel_workers,
3557 &space_allowed,
3558 &numbuckets,
3559 &numbatches,
3560 &num_skew_mcvs);
3561
3562 /*
3563 * If inner relation is too big then we will need to "batch" the join,
3564 * which implies writing and reading most of the tuples to disk an extra
3565 * time. Charge seq_page_cost per page, since the I/O should be nice and
3566 * sequential. Writing the inner rel counts as startup cost, all the rest
3567 * as run cost.
3568 */
3569 if (numbatches > 1)
3570 {
3571 double outerpages = page_size(outer_path_rows,
3572 outer_path->pathtarget->width);
3573 double innerpages = page_size(inner_path_rows,
3574 inner_path->pathtarget->width);
3575
3576 startup_cost += seq_page_cost * innerpages;
3577 run_cost += seq_page_cost * (innerpages + 2 * outerpages);
3578 }
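/*
 * Added illustration (hypothetical numbers, default seq_page_cost = 1.0):
 * with innerpages = 1000 and outerpages = 2000, batching adds 1000 to
 * startup_cost (writing the inner rel) and 1000 + 2*2000 = 5000 to run_cost
 * (re-reading the inner rel plus writing and re-reading the outer rel).
 */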
3579
3580 /* CPU costs left for later */
3581
3582 /* Public result fields */
3583 workspace->startup_cost = startup_cost;
3584 workspace->total_cost = startup_cost + run_cost;
3585 /* Save private data for final_cost_hashjoin */
3586 workspace->run_cost = run_cost;
3587 workspace->numbuckets = numbuckets;
3588 workspace->numbatches = numbatches;
3589 workspace->inner_rows_total = inner_path_rows_total;
3590 }
3591
3592 /*
3593 * final_cost_hashjoin
3594 * Final estimate of the cost and result size of a hashjoin path.
3595 *
3596 * Note: the numbatches estimate is also saved into 'path' for use later
3597 *
3598 * 'path' is already filled in except for the rows and cost fields and
3599 * num_batches
3600 * 'workspace' is the result from initial_cost_hashjoin
3601 * 'extra' contains miscellaneous information about the join
3602 */
3603 void
3604 final_cost_hashjoin(PlannerInfo *root, HashPath *path,
3605 JoinCostWorkspace *workspace,
3606 JoinPathExtraData *extra)
3607 {
3608 Path *outer_path = path->jpath.outerjoinpath;
3609 Path *inner_path = path->jpath.innerjoinpath;
3610 double outer_path_rows = outer_path->rows;
3611 double inner_path_rows = inner_path->rows;
3612 double inner_path_rows_total = workspace->inner_rows_total;
3613 List *hashclauses = path->path_hashclauses;
3614 Cost startup_cost = workspace->startup_cost;
3615 Cost run_cost = workspace->run_cost;
3616 int numbuckets = workspace->numbuckets;
3617 int numbatches = workspace->numbatches;
3618 Cost cpu_per_tuple;
3619 QualCost hash_qual_cost;
3620 QualCost qp_qual_cost;
3621 double hashjointuples;
3622 double virtualbuckets;
3623 Selectivity innerbucketsize;
3624 Selectivity innermcvfreq;
3625 ListCell *hcl;
3626
3627 /* Mark the path with the correct row estimate */
3628 if (path->jpath.path.param_info)
3629 path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
3630 else
3631 path->jpath.path.rows = path->jpath.path.parent->rows;
3632
3633 /* For partial paths, scale row estimate. */
3634 if (path->jpath.path.parallel_workers > 0)
3635 {
3636 double parallel_divisor = get_parallel_divisor(&path->jpath.path);
3637
3638 path->jpath.path.rows =
3639 clamp_row_est(path->jpath.path.rows / parallel_divisor);
3640 }
3641
3642 /*
3643 * We could include disable_cost in the preliminary estimate, but that
3644 * would amount to optimizing for the case where the join method is
3645 * disabled, which doesn't seem like the way to bet.
3646 */
3647 if (!enable_hashjoin)
3648 startup_cost += disable_cost;
3649
3650 /* mark the path with estimated # of batches */
3651 path->num_batches = numbatches;
3652
3653 /* store the total number of tuples (sum of partial row estimates) */
3654 path->inner_rows_total = inner_path_rows_total;
3655
3656 /* and compute the number of "virtual" buckets in the whole join */
3657 virtualbuckets = (double) numbuckets * (double) numbatches;
3658
3659 /*
3660 * Determine bucketsize fraction and MCV frequency for the inner relation.
3661 * We use the smallest bucketsize or MCV frequency estimated for any
3662 * individual hashclause; this is undoubtedly conservative.
3663 *
3664 * BUT: if inner relation has been unique-ified, we can assume it's good
3665 * for hashing. This is important both because it's the right answer, and
3666 * because we avoid contaminating the cache with a value that's wrong for
3667 * non-unique-ified paths.
3668 */
3669 if (IsA(inner_path, UniquePath))
3670 {
3671 innerbucketsize = 1.0 / virtualbuckets;
3672 innermcvfreq = 0.0;
3673 }
3674 else
3675 {
3676 innerbucketsize = 1.0;
3677 innermcvfreq = 1.0;
3678 foreach(hcl, hashclauses)
3679 {
3680 RestrictInfo *restrictinfo = lfirst_node(RestrictInfo, hcl);
3681 Selectivity thisbucketsize;
3682 Selectivity thismcvfreq;
3683
3684 /*
3685 * First we have to figure out which side of the hashjoin clause
3686 * is the inner side.
3687 *
3688 * Since we tend to visit the same clauses over and over when
3689 * planning a large query, we cache the bucket stats estimates in
3690 * the RestrictInfo node to avoid repeated lookups of statistics.
3691 */
3692 if (bms_is_subset(restrictinfo->right_relids,
3693 inner_path->parent->relids))
3694 {
3695 /* righthand side is inner */
3696 thisbucketsize = restrictinfo->right_bucketsize;
3697 if (thisbucketsize < 0)
3698 {
3699 /* not cached yet */
3700 estimate_hash_bucket_stats(root,
3701 get_rightop(restrictinfo->clause),
3702 virtualbuckets,
3703 &restrictinfo->right_mcvfreq,
3704 &restrictinfo->right_bucketsize);
3705 thisbucketsize = restrictinfo->right_bucketsize;
3706 }
3707 thismcvfreq = restrictinfo->right_mcvfreq;
3708 }
3709 else
3710 {
3711 Assert(bms_is_subset(restrictinfo->left_relids,
3712 inner_path->parent->relids));
3713 /* lefthand side is inner */
3714 thisbucketsize = restrictinfo->left_bucketsize;
3715 if (thisbucketsize < 0)
3716 {
3717 /* not cached yet */
3718 estimate_hash_bucket_stats(root,
3719 get_leftop(restrictinfo->clause),
3720 virtualbuckets,
3721 &restrictinfo->left_mcvfreq,
3722 &restrictinfo->left_bucketsize);
3723 thisbucketsize = restrictinfo->left_bucketsize;
3724 }
3725 thismcvfreq = restrictinfo->left_mcvfreq;
3726 }
3727
3728 if (innerbucketsize > thisbucketsize)
3729 innerbucketsize = thisbucketsize;
3730 if (innermcvfreq > thismcvfreq)
3731 innermcvfreq = thismcvfreq;
3732 }
3733 }
3734
3735 /*
3736 * If the bucket holding the inner MCV would exceed hash_mem, we don't
3737 * want to hash unless there is really no other alternative, so apply
3738 * disable_cost. (The executor normally copes with excessive memory usage
3739 * by splitting batches, but obviously it cannot separate equal values
3740 * that way, so it will be unable to drive the batch size below hash_mem
3741 * when this is true.)
3742 */
3743 if (relation_byte_size(clamp_row_est(inner_path_rows * innermcvfreq),
3744 inner_path->pathtarget->width) > get_hash_memory_limit())
3745 startup_cost += disable_cost;
3746
3747 /*
3748 * Compute cost of the hashquals and qpquals (other restriction clauses)
3749 * separately.
3750 */
3751 cost_qual_eval(&hash_qual_cost, hashclauses, root);
3752 cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
3753 qp_qual_cost.startup -= hash_qual_cost.startup;
3754 qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple;
3755
3756 /* CPU costs */
3757
3758 if (path->jpath.jointype == JOIN_SEMI ||
3759 path->jpath.jointype == JOIN_ANTI ||
3760 extra->inner_unique)
3761 {
3762 double outer_matched_rows;
3763 Selectivity inner_scan_frac;
3764
3765 /*
3766 * With a SEMI or ANTI join, or if the innerrel is known unique, the
3767 * executor will stop after the first match.
3768 *
3769 * For an outer-rel row that has at least one match, we can expect the
3770 * bucket scan to stop after a fraction 1/(match_count+1) of the
3771 * bucket's rows, if the matches are evenly distributed. Since they
3772 * probably aren't quite evenly distributed, we apply a fuzz factor of
3773 * 2.0 to that fraction. (If we used a larger fuzz factor, we'd have
3774 * to clamp inner_scan_frac to at most 1.0; but since match_count is
3775 * at least 1, no such clamp is needed now.)
3776 */
3777 outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
3778 inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);
3779
3780 startup_cost += hash_qual_cost.startup;
3781 run_cost += hash_qual_cost.per_tuple * outer_matched_rows *
3782 clamp_row_est(inner_path_rows * innerbucketsize * inner_scan_frac) * 0.5;
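/*
 * Added illustration (hypothetical numbers): with match_count = 3,
 * inner_scan_frac = 2.0 / 4.0 = 0.5; if inner_path_rows = 10000 and
 * innerbucketsize = 0.01, each matched outer row is charged for about
 * 10000 * 0.01 * 0.5 = 50 bucket entries, further halved by the trailing
 * 0.5 factor (presumably the same hash-value-match discount described for
 * the general case below).
 */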
3783
3784 /*
3785 * For unmatched outer-rel rows, the picture is quite a lot different.
3786 * In the first place, there is no reason to assume that these rows
3787 * preferentially hit heavily-populated buckets; instead assume they
3788 * are uncorrelated with the inner distribution and so they see an
3789 * average bucket size of inner_path_rows / virtualbuckets. In the
3790 * second place, it seems likely that they will have few if any exact
3791 * hash-code matches and so very few of the tuples in the bucket will
3792 * actually require eval of the hash quals. We don't have any good
3793 * way to estimate how many will, but for the moment assume that the
3794 * effective cost per bucket entry is one-tenth what it is for
3795 * matchable tuples.
3796 */
3797 run_cost += hash_qual_cost.per_tuple *
3798 (outer_path_rows - outer_matched_rows) *
3799 clamp_row_est(inner_path_rows / virtualbuckets) * 0.05;
3800
3801 /* Get # of tuples that will pass the basic join */
3802 if (path->jpath.jointype == JOIN_ANTI)
3803 hashjointuples = outer_path_rows - outer_matched_rows;
3804 else
3805 hashjointuples = outer_matched_rows;
3806 }
3807 else
3808 {
3809 /*
3810 * The number of tuple comparisons needed is the number of outer
3811 * tuples times the typical number of tuples in a hash bucket, which
3812 * is the inner relation size times its bucketsize fraction. At each
3813 * one, we need to evaluate the hashjoin quals. But actually,
3814 * charging the full qual eval cost at each tuple is pessimistic,
3815 * since we don't evaluate the quals unless the hash values match
3816 * exactly. For lack of a better idea, halve the cost estimate to
3817 * allow for that.
3818 */
3819 startup_cost += hash_qual_cost.startup;
3820 run_cost += hash_qual_cost.per_tuple * outer_path_rows *
3821 clamp_row_est(inner_path_rows * innerbucketsize) * 0.5;
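/*
 * Added illustration (hypothetical numbers): with outer_path_rows = 10000,
 * inner_path_rows = 5000 and innerbucketsize = 0.002, a typical bucket holds
 * about 10 tuples, so roughly 10000 * 10 = 100000 comparisons are expected;
 * the halving above charges qual evaluation for only 50000 of them.
 */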
3822
3823 /*
3824 * Get approx # tuples passing the hashquals. We use
3825 * approx_tuple_count here because we need an estimate done with
3826 * JOIN_INNER semantics.
3827 */
3828 hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses);
3829 }
3830
3831 /*
3832 * For each tuple that gets through the hashjoin proper, we charge
3833 * cpu_tuple_cost plus the cost of evaluating additional restriction
3834 * clauses that are to be applied at the join. (This is pessimistic since
3835 * not all of the quals may get evaluated at each tuple.)
3836 */
3837 startup_cost += qp_qual_cost.startup;
3838 cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
3839 run_cost += cpu_per_tuple * hashjointuples;
3840
3841 /* tlist eval costs are paid per output row, not per tuple scanned */
3842 startup_cost += path->jpath.path.pathtarget->cost.startup;
3843 run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
3844
3845 path->jpath.path.startup_cost = startup_cost;
3846 path->jpath.path.total_cost = startup_cost + run_cost;
3847 }
3848
3849
3850 /*
3851 * cost_subplan
3852 * Figure the costs for a SubPlan (or initplan).
3853 *
3854 * Note: we could dig the subplan's Plan out of the root list, but in practice
3855 * all callers have it handy already, so we make them pass it.
3856 */
3857 void
3858 cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
3859 {
3860 QualCost sp_cost;
3861
3862 /* Figure any cost for evaluating the testexpr */
3863 cost_qual_eval(&sp_cost,
3864 make_ands_implicit((Expr *) subplan->testexpr),
3865 root);
3866
3867 if (subplan->useHashTable)
3868 {
3869 /*
3870 * If we are using a hash table for the subquery outputs, then the
3871 * cost of evaluating the query is a one-time cost. We charge one
3872 * cpu_operator_cost per tuple for the work of loading the hashtable,
3873 * too.
3874 */
3875 sp_cost.startup += plan->total_cost +
3876 cpu_operator_cost * plan->plan_rows;
3877
3878 /*
3879 * The per-tuple costs include the cost of evaluating the lefthand
3880 * expressions, plus the cost of probing the hashtable. We already
3881 * accounted for the lefthand expressions as part of the testexpr, and
3882 * will also have counted one cpu_operator_cost for each comparison
3883 * operator. That is probably too low for the probing cost, but it's
3884 * hard to make a better estimate, so live with it for now.
3885 */
3886 }
3887 else
3888 {
3889 /*
3890 * Otherwise we will be rescanning the subplan output on each
3891 * evaluation. We need to estimate how much of the output we will
3892 * actually need to scan. NOTE: this logic should agree with the
3893 * tuple_fraction estimates used by make_subplan() in
3894 * plan/subselect.c.
3895 */
3896 Cost plan_run_cost = plan->total_cost - plan->startup_cost;
3897
3898 if (subplan->subLinkType == EXISTS_SUBLINK)
3899 {
3900 /* we only need to fetch 1 tuple; clamp to avoid zero divide */
3901 sp_cost.per_tuple += plan_run_cost / clamp_row_est(plan->plan_rows);
3902 }
3903 else if (subplan->subLinkType == ALL_SUBLINK ||
3904 subplan->subLinkType == ANY_SUBLINK)
3905 {
3906 /* assume we need 50% of the tuples */
3907 sp_cost.per_tuple += 0.50 * plan_run_cost;
3908 /* also charge a cpu_operator_cost per row examined */
3909 sp_cost.per_tuple += 0.50 * plan->plan_rows * cpu_operator_cost;
3910 }
3911 else
3912 {
3913 /* assume we need all tuples */
3914 sp_cost.per_tuple += plan_run_cost;
3915 }
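/*
 * Added illustration (hypothetical numbers, default cpu_operator_cost =
 * 0.0025): with plan_run_cost = 200 and plan->plan_rows = 1000, an EXISTS
 * sublink adds 200/1000 = 0.2 per call, an ANY/ALL sublink adds
 * 0.5*200 + 0.5*1000*0.0025 = 101.25 per call, and other sublink types are
 * charged the full 200.
 */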
3916
3917 /*
3918 * Also account for subplan's startup cost. If the subplan is
3919 * uncorrelated or undirect correlated, AND its topmost node is one
3920 * that materializes its output, assume that we'll only need to pay
3921 * its startup cost once; otherwise assume we pay the startup cost
3922 * every time.
3923 */
3924 if (subplan->parParam == NIL &&
3925 ExecMaterializesOutput(nodeTag(plan)))
3926 sp_cost.startup += plan->startup_cost;
3927 else
3928 sp_cost.per_tuple += plan->startup_cost;
3929 }
3930
3931 subplan->startup_cost = sp_cost.startup;
3932 subplan->per_call_cost = sp_cost.per_tuple;
3933 }
3934
3935
3936 /*
3937 * cost_rescan
3938 * Given a finished Path, estimate the costs of rescanning it after
3939 * having done so the first time. For some Path types a rescan is
3940 * cheaper than an original scan (if no parameters change), and this
3941 * function embodies knowledge about that. The default is to return
3942 * the same costs stored in the Path. (Note that the cost estimates
3943 * actually stored in Paths are always for first scans.)
3944 *
3945 * This function is not currently intended to model effects such as rescans
3946 * being cheaper due to disk block caching; what we are concerned with is
3947 * plan types wherein the executor caches results explicitly, or doesn't
3948 * redo startup calculations, etc.
3949 */
3950 static void
3951 cost_rescan(PlannerInfo *root, Path *path,
3952 Cost *rescan_startup_cost, /* output parameters */
3953 Cost *rescan_total_cost)
3954 {
3955 switch (path->pathtype)
3956 {
3957 case T_FunctionScan:
3958
3959 /*
3960 * Currently, nodeFunctionscan.c always executes the function to
3961 * completion before returning any rows, and caches the results in
3962 * a tuplestore. So the function eval cost is all startup cost
3963 * and isn't paid over again on rescans. However, all run costs
3964 * will be paid over again.
3965 */
3966 *rescan_startup_cost = 0;
3967 *rescan_total_cost = path->total_cost - path->startup_cost;
3968 break;
3969 case T_HashJoin:
3970
3971 /*
3972 * If it's a single-batch join, we don't need to rebuild the hash
3973 * table during a rescan.
3974 */
3975 if (((HashPath *) path)->num_batches == 1)
3976 {
3977 /* Startup cost is exactly the cost of hash table building */
3978 *rescan_startup_cost = 0;
3979 *rescan_total_cost = path->total_cost - path->startup_cost;
3980 }
3981 else
3982 {
3983 /* Otherwise, no special treatment */
3984 *rescan_startup_cost = path->startup_cost;
3985 *rescan_total_cost = path->total_cost;
3986 }
3987 break;
3988 case T_CteScan:
3989 case T_WorkTableScan:
3990 {
3991 /*
3992 * These plan types materialize their final result in a
3993 * tuplestore or tuplesort object. So the rescan cost is only
3994 * cpu_tuple_cost per tuple, unless the result is large enough
3995 * to spill to disk.
3996 */
3997 Cost run_cost = cpu_tuple_cost * path->rows;
3998 double nbytes = relation_byte_size(path->rows,
3999 path->pathtarget->width);
4000 long work_mem_bytes = work_mem * 1024L;
4001
4002 if (nbytes > work_mem_bytes)
4003 {
4004 /* It will spill, so account for re-read cost */
4005 double npages = ceil(nbytes / BLCKSZ);
4006
4007 run_cost += seq_page_cost * npages;
4008 }
4009 *rescan_startup_cost = 0;
4010 *rescan_total_cost = run_cost;
4011 }
4012 break;
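/*
 * Added illustration (hypothetical numbers, default settings): rescanning a
 * CTE result of 100000 rows costs about cpu_tuple_cost * 100000 = 1000; if
 * the stored result (relation_byte_size of those rows) exceeds work_mem
 * (4MB by default), seq_page_cost is also charged for each BLCKSZ-sized
 * page that must be re-read.
 */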
4013 case T_Material:
4014 case T_Sort:
4015 {
4016 /*
4017 * These plan types not only materialize their results, but do
4018 * not implement qual filtering or projection. So they are
4019 * even cheaper to rescan than the ones above. We charge only
4020 * cpu_operator_cost per tuple. (Note: keep that in sync with
4021 * the run_cost charge in cost_sort, and also see comments in
4022 * cost_material before you change it.)
4023 */
4024 Cost run_cost = cpu_operator_cost * path->rows;
4025 double nbytes = relation_byte_size(path->rows,
4026 path->pathtarget->width);
4027 long work_mem_bytes = work_mem * 1024L;
4028
4029 if (nbytes > work_mem_bytes)
4030 {
4031 /* It will spill, so account for re-read cost */
4032 double npages = ceil(nbytes / BLCKSZ);
4033
4034 run_cost += seq_page_cost * npages;
4035 }
4036 *rescan_startup_cost = 0;
4037 *rescan_total_cost = run_cost;
4038 }
4039 break;
4040 default:
4041 *rescan_startup_cost = path->startup_cost;
4042 *rescan_total_cost = path->total_cost;
4043 break;
4044 }
4045 }
4046
4047
4048 /*
4049 * cost_qual_eval
4050 * Estimate the CPU costs of evaluating a WHERE clause.
4051 * The input can be either an implicitly-ANDed list of boolean
4052 * expressions, or a list of RestrictInfo nodes. (The latter is
4053 * preferred since it allows caching of the results.)
4054 * The result includes both a one-time (startup) component,
4055 * and a per-evaluation component.
4056 */
4057 void
4058 cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root)
4059 {
4060 cost_qual_eval_context context;
4061 ListCell *l;
4062
4063 context.root = root;
4064 context.total.startup = 0;
4065 context.total.per_tuple = 0;
4066
4067 /* We don't charge any cost for the implicit ANDing at top level ... */
4068
4069 foreach(l, quals)
4070 {
4071 Node *qual = (Node *) lfirst(l);
4072
4073 cost_qual_eval_walker(qual, &context);
4074 }
4075
4076 *cost = context.total;
4077 }
4078
4079 /*
4080 * cost_qual_eval_node
4081 * As above, for a single RestrictInfo or expression.
4082 */
4083 void
4084 cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root)
4085 {
4086 cost_qual_eval_context context;
4087
4088 context.root = root;
4089 context.total.startup = 0;
4090 context.total.per_tuple = 0;
4091
4092 cost_qual_eval_walker(qual, &context);
4093
4094 *cost = context.total;
4095 }
4096
4097 static bool
4098 cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
4099 {
4100 if (node == NULL)
4101 return false;
4102
4103 /*
4104 * RestrictInfo nodes contain an eval_cost field reserved for this
4105 * routine's use, so that it's not necessary to evaluate the qual clause's
4106 * cost more than once. If the clause's cost hasn't been computed yet,
4107 * the field's startup value will contain -1.
4108 */
4109 if (IsA(node, RestrictInfo))
4110 {
4111 RestrictInfo *rinfo = (RestrictInfo *) node;
4112
4113 if (rinfo->eval_cost.startup < 0)
4114 {
4115 cost_qual_eval_context locContext;
4116
4117 locContext.root = context->root;
4118 locContext.total.startup = 0;
4119 locContext.total.per_tuple = 0;
4120
4121 /*
4122 * For an OR clause, recurse into the marked-up tree so that we
4123 * set the eval_cost for contained RestrictInfos too.
4124 */
4125 if (rinfo->orclause)
4126 cost_qual_eval_walker((Node *) rinfo->orclause, &locContext);
4127 else
4128 cost_qual_eval_walker((Node *) rinfo->clause, &locContext);
4129
4130 /*
4131 * If the RestrictInfo is marked pseudoconstant, it will be tested
4132 * only once, so treat its cost as all startup cost.
4133 */
4134 if (rinfo->pseudoconstant)
4135 {
4136 /* count one execution during startup */
4137 locContext.total.startup += locContext.total.per_tuple;
4138 locContext.total.per_tuple = 0;
4139 }
4140 rinfo->eval_cost = locContext.total;
4141 }
4142 context->total.startup += rinfo->eval_cost.startup;
4143 context->total.per_tuple += rinfo->eval_cost.per_tuple;
4144 /* do NOT recurse into children */
4145 return false;
4146 }
4147
4148 /*
4149 * For each operator or function node in the given tree, we charge the
4150 * estimated execution cost given by pg_proc.procost (remember to multiply
4151 * this by cpu_operator_cost).
4152 *
4153 * Vars and Consts are charged zero, and so are boolean operators (AND,
4154 * OR, NOT). Simplistic, but a lot better than no model at all.
4155 *
4156 * Should we try to account for the possibility of short-circuit
4157 * evaluation of AND/OR? Probably *not*, because that would make the
4158 * results depend on the clause ordering, and we are not in any position
4159 * to expect that the current ordering of the clauses is the one that's
4160 * going to end up being used. The above per-RestrictInfo caching would
4161 * not mix well with trying to re-order clauses anyway.
4162 *
4163 * Another issue that is entirely ignored here is that if a set-returning
4164 * function is below top level in the tree, the functions/operators above
4165 * it will need to be evaluated multiple times. In practical use, such
4166 * cases arise so seldom as to not be worth the added complexity needed;
4167 * moreover, since our rowcount estimates for functions tend to be pretty
4168 * phony, the results would also be pretty phony.
4169 */
4170 if (IsA(node, FuncExpr))
4171 {
4172 add_function_cost(context->root, ((FuncExpr *) node)->funcid, node,
4173 &context->total);
4174 }
4175 else if (IsA(node, OpExpr) ||
4176 IsA(node, DistinctExpr) ||
4177 IsA(node, NullIfExpr))
4178 {
4179 /* rely on struct equivalence to treat these all alike */
4180 set_opfuncid((OpExpr *) node);
4181 add_function_cost(context->root, ((OpExpr *) node)->opfuncid, node,
4182 &context->total);
4183 }
4184 else if (IsA(node, ScalarArrayOpExpr))
4185 {
4186 /*
4187 * Estimate that the operator will be applied to about half of the
4188 * array elements before the answer is determined.
4189 */
4190 ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) node;
4191 Node *arraynode = (Node *) lsecond(saop->args);
4192 QualCost sacosts;
4193
4194 set_sa_opfuncid(saop);
4195 sacosts.startup = sacosts.per_tuple = 0;
4196 add_function_cost(context->root, saop->opfuncid, NULL,
4197 &sacosts);
4198 context->total.startup += sacosts.startup;
4199 context->total.per_tuple += sacosts.per_tuple *
4200 estimate_array_length(arraynode) * 0.5;
4201 }
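/*
 * Added illustration: for a clause like "x = ANY(array)" where the array is
 * estimated to hold 10 elements, the comparison operator's per-tuple cost is
 * charged 10 * 0.5 = 5 times per input tuple.
 */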
4202 else if (IsA(node, Aggref) ||
4203 IsA(node, WindowFunc))
4204 {
4205 /*
4206 * Aggref and WindowFunc nodes are (and should be) treated like Vars,
4207 * ie, zero execution cost in the current model, because they behave
4208 * essentially like Vars at execution. We disregard the costs of
4209 * their input expressions for the same reason. The actual execution
4210 * costs of the aggregate/window functions and their arguments have to
4211 * be factored into plan-node-specific costing of the Agg or WindowAgg
4212 * plan node.
4213 */
4214 return false; /* don't recurse into children */
4215 }
4216 else if (IsA(node, CoerceViaIO))
4217 {
4218 CoerceViaIO *iocoerce = (CoerceViaIO *) node;
4219 Oid iofunc;
4220 Oid typioparam;
4221 bool typisvarlena;
4222
4223 /* check the result type's input function */
4224 getTypeInputInfo(iocoerce->resulttype,
4225 &iofunc, &typioparam);
4226 add_function_cost(context->root, iofunc, NULL,
4227 &context->total);
4228 /* check the input type's output function */
4229 getTypeOutputInfo(exprType((Node *) iocoerce->arg),
4230 &iofunc, &typisvarlena);
4231 add_function_cost(context->root, iofunc, NULL,
4232 &context->total);
4233 }
4234 else if (IsA(node, ArrayCoerceExpr))
4235 {
4236 ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node;
4237 QualCost perelemcost;
4238
4239 cost_qual_eval_node(&perelemcost, (Node *) acoerce->elemexpr,
4240 context->root);
4241 context->total.startup += perelemcost.startup;
4242 if (perelemcost.per_tuple > 0)
4243 context->total.per_tuple += perelemcost.per_tuple *
4244 estimate_array_length((Node *) acoerce->arg);
4245 }
4246 else if (IsA(node, RowCompareExpr))
4247 {
4248 /* Conservatively assume we will check all the columns */
4249 RowCompareExpr *rcexpr = (RowCompareExpr *) node;
4250 ListCell *lc;
4251
4252 foreach(lc, rcexpr->opnos)
4253 {
4254 Oid opid = lfirst_oid(lc);
4255
4256 add_function_cost(context->root, get_opcode(opid), NULL,
4257 &context->total);
4258 }
4259 }
4260 else if (IsA(node, MinMaxExpr) ||
4261 IsA(node, SQLValueFunction) ||
4262 IsA(node, XmlExpr) ||
4263 IsA(node, CoerceToDomain) ||
4264 IsA(node, NextValueExpr))
4265 {
4266 /* Treat all these as having cost 1 */
4267 context->total.per_tuple += cpu_operator_cost;
4268 }
4269 else if (IsA(node, CurrentOfExpr))
4270 {
4271 /* Report high cost to prevent selection of anything but TID scan */
4272 context->total.startup += disable_cost;
4273 }
4274 else if (IsA(node, SubLink))
4275 {
4276 /* This routine should not be applied to un-planned expressions */
4277 elog(ERROR, "cannot handle unplanned sub-select");
4278 }
4279 else if (IsA(node, SubPlan))
4280 {
4281 /*
4282 * A subplan node in an expression typically indicates that the
4283 * subplan will be executed on each evaluation, so charge accordingly.
4284 * (Sub-selects that can be executed as InitPlans have already been
4285 * removed from the expression.)
4286 */
4287 SubPlan *subplan = (SubPlan *) node;
4288
4289 context->total.startup += subplan->startup_cost;
4290 context->total.per_tuple += subplan->per_call_cost;
4291
4292 /*
4293 * We don't want to recurse into the testexpr, because it was already
4294 * counted in the SubPlan node's costs. So we're done.
4295 */
4296 return false;
4297 }
4298 else if (IsA(node, AlternativeSubPlan))
4299 {
4300 /*
4301 * Arbitrarily use the first alternative plan for costing. (We should
4302 * certainly only include one alternative, and we don't yet have
4303 * enough information to know which one the executor is most likely to
4304 * use.)
4305 */
4306 AlternativeSubPlan *asplan = (AlternativeSubPlan *) node;
4307
4308 return cost_qual_eval_walker((Node *) linitial(asplan->subplans),
4309 context);
4310 }
4311 else if (IsA(node, PlaceHolderVar))
4312 {
4313 /*
4314 * A PlaceHolderVar should be given cost zero when considering general
4315 * expression evaluation costs. The expense of doing the contained
4316 * expression is charged as part of the tlist eval costs of the scan
4317 * or join where the PHV is first computed (see set_rel_width and
4318 * add_placeholders_to_joinrel). If we charged it again here, we'd be
4319 * double-counting the cost for each level of plan that the PHV
4320 * bubbles up through. Hence, return without recursing into the
4321 * phexpr.
4322 */
4323 return false;
4324 }
4325
4326 /* recurse into children */
4327 return expression_tree_walker(node, cost_qual_eval_walker,
4328 (void *) context);
4329 }
4330
4331 /*
4332 * get_restriction_qual_cost
4333 * Compute evaluation costs of a baserel's restriction quals, plus any
4334 * movable join quals that have been pushed down to the scan.
4335 * Results are returned into *qpqual_cost.
4336 *
4337 * This is a convenience subroutine that works for seqscans and other cases
4338 * where all the given quals will be evaluated the hard way. It's not useful
4339 * for cost_index(), for example, where the index machinery takes care of
4340 * some of the quals. We assume baserestrictcost was previously set by
4341 * set_baserel_size_estimates().
4342 */
4343 static void
4344 get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
4345 ParamPathInfo *param_info,
4346 QualCost *qpqual_cost)
4347 {
4348 if (param_info)
4349 {
4350 /* Include costs of pushed-down clauses */
4351 cost_qual_eval(qpqual_cost, param_info->ppi_clauses, root);
4352
4353 qpqual_cost->startup += baserel->baserestrictcost.startup;
4354 qpqual_cost->per_tuple += baserel->baserestrictcost.per_tuple;
4355 }
4356 else
4357 *qpqual_cost = baserel->baserestrictcost;
4358 }
4359
4360
4361 /*
4362 * compute_semi_anti_join_factors
4363 * Estimate how much of the inner input a SEMI, ANTI, or inner_unique join
4364 * can be expected to scan.
4365 *
4366 * In a hash or nestloop SEMI/ANTI join, the executor will stop scanning
4367 * inner rows as soon as it finds a match to the current outer row.
4368 * The same happens if we have detected the inner rel is unique.
4369 * We should therefore adjust some of the cost components for this effect.
4370 * This function computes some estimates needed for these adjustments.
4371 * These estimates will be the same regardless of the particular paths used
4372 * for the outer and inner relation, so we compute these once and then pass
4373 * them to all the join cost estimation functions.
4374 *
4375 * Input parameters:
4376 * joinrel: join relation under consideration
4377 * outerrel: outer relation under consideration
4378 * innerrel: inner relation under consideration
4379 * jointype: if not JOIN_SEMI or JOIN_ANTI, we assume it's inner_unique
4380 * sjinfo: SpecialJoinInfo relevant to this join
4381 * restrictlist: join quals
4382 * Output parameters:
4383 * *semifactors is filled in (see pathnodes.h for field definitions)
4384 */
4385 void
4386 compute_semi_anti_join_factors(PlannerInfo *root,
4387 RelOptInfo *joinrel,
4388 RelOptInfo *outerrel,
4389 RelOptInfo *innerrel,
4390 JoinType jointype,
4391 SpecialJoinInfo *sjinfo,
4392 List *restrictlist,
4393 SemiAntiJoinFactors *semifactors)
4394 {
4395 Selectivity jselec;
4396 Selectivity nselec;
4397 Selectivity avgmatch;
4398 SpecialJoinInfo norm_sjinfo;
4399 List *joinquals;
4400 ListCell *l;
4401
4402 /*
4403 * In an ANTI join, we must ignore clauses that are "pushed down", since
4404 * those won't affect the match logic. In a SEMI join, we do not
4405 * distinguish joinquals from "pushed down" quals, so just use the whole
4406 * restrictinfo list. For other outer join types, we should consider only
4407 * non-pushed-down quals, so that this devolves to an IS_OUTER_JOIN check.
4408 */
4409 if (IS_OUTER_JOIN(jointype))
4410 {
4411 joinquals = NIL;
4412 foreach(l, restrictlist)
4413 {
4414 RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
4415
4416 if (!RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
4417 joinquals = lappend(joinquals, rinfo);
4418 }
4419 }
4420 else
4421 joinquals = restrictlist;
4422
4423 /*
4424 * Get the JOIN_SEMI or JOIN_ANTI selectivity of the join clauses.
4425 */
4426 jselec = clauselist_selectivity(root,
4427 joinquals,
4428 0,
4429 (jointype == JOIN_ANTI) ? JOIN_ANTI : JOIN_SEMI,
4430 sjinfo);
4431
4432 /*
4433 * Also get the normal inner-join selectivity of the join clauses.
4434 */
4435 norm_sjinfo.type = T_SpecialJoinInfo;
4436 norm_sjinfo.min_lefthand = outerrel->relids;
4437 norm_sjinfo.min_righthand = innerrel->relids;
4438 norm_sjinfo.syn_lefthand = outerrel->relids;
4439 norm_sjinfo.syn_righthand = innerrel->relids;
4440 norm_sjinfo.jointype = JOIN_INNER;
4441 /* we don't bother trying to make the remaining fields valid */
4442 norm_sjinfo.lhs_strict = false;
4443 norm_sjinfo.delay_upper_joins = false;
4444 norm_sjinfo.semi_can_btree = false;
4445 norm_sjinfo.semi_can_hash = false;
4446 norm_sjinfo.semi_operators = NIL;
4447 norm_sjinfo.semi_rhs_exprs = NIL;
4448
4449 nselec = clauselist_selectivity(root,
4450 joinquals,
4451 0,
4452 JOIN_INNER,
4453 &norm_sjinfo);
4454
4455 /* Avoid leaking a lot of ListCells */
4456 if (IS_OUTER_JOIN(jointype))
4457 list_free(joinquals);
4458
4459 /*
4460 * jselec can be interpreted as the fraction of outer-rel rows that have
4461 * any matches (this is true for both SEMI and ANTI cases). And nselec is
4462 * the fraction of the Cartesian product that matches. So, the average
4463 * number of matches for each outer-rel row that has at least one match is
4464 * nselec * inner_rows / jselec.
4465 *
4466 * Note: it is correct to use the inner rel's "rows" count here, even
4467 * though we might later be considering a parameterized inner path with
4468 * fewer rows. This is because we have included all the join clauses in
4469 * the selectivity estimate.
4470 */
4471 if (jselec > 0) /* protect against zero divide */
4472 {
4473 avgmatch = nselec * innerrel->rows / jselec;
4474 /* Clamp to sane range */
4475 avgmatch = Max(1.0, avgmatch);
4476 }
4477 else
4478 avgmatch = 1.0;
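/*
 * Added illustration (hypothetical numbers): with jselec = 0.2, nselec =
 * 0.001 and innerrel->rows = 500, avgmatch = 0.001 * 500 / 0.2 = 2.5, i.e.
 * each outer row that has any match is expected to have about 2.5 matches.
 */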
4479
4480 semifactors->outer_match_frac = jselec;
4481 semifactors->match_count = avgmatch;
4482 }
4483
4484 /*
4485 * has_indexed_join_quals
4486 * Check whether all the joinquals of a nestloop join are used as
4487 * inner index quals.
4488 *
4489 * If the inner path of a SEMI/ANTI join is an indexscan (including bitmap
4490 * indexscan) that uses all the joinquals as indexquals, we can assume that an
4491 * unmatched outer tuple is cheap to process, whereas otherwise it's probably
4492 * expensive.
4493 */
4494 static bool
4495 has_indexed_join_quals(NestPath *joinpath)
4496 {
4497 Relids joinrelids = joinpath->path.parent->relids;
4498 Path *innerpath = joinpath->innerjoinpath;
4499 List *indexclauses;
4500 bool found_one;
4501 ListCell *lc;
4502
4503 /* If join still has quals to evaluate, it's not fast */
4504 if (joinpath->joinrestrictinfo != NIL)
4505 return false;
4506 /* Nor if the inner path isn't parameterized at all */
4507 if (innerpath->param_info == NULL)
4508 return false;
4509
4510 /* Find the indexclauses list for the inner scan */
4511 switch (innerpath->pathtype)
4512 {
4513 case T_IndexScan:
4514 case T_IndexOnlyScan:
4515 indexclauses = ((IndexPath *) innerpath)->indexclauses;
4516 break;
4517 case T_BitmapHeapScan:
4518 {
4519 /* Accept only a simple bitmap scan, not AND/OR cases */
4520 Path *bmqual = ((BitmapHeapPath *) innerpath)->bitmapqual;
4521
4522 if (IsA(bmqual, IndexPath))
4523 indexclauses = ((IndexPath *) bmqual)->indexclauses;
4524 else
4525 return false;
4526 break;
4527 }
4528 default:
4529
4530 /*
4531 * If it's not a simple indexscan, it probably doesn't run quickly
4532 * for zero rows out, even if it's a parameterized path using all
4533 * the joinquals.
4534 */
4535 return false;
4536 }
4537
4538 /*
4539 * Examine the inner path's param clauses. Any that are from the outer
4540 * path must be found in the indexclauses list, either exactly or in an
4541 * equivalent form generated by equivclass.c. Also, we must find at least
4542 * one such clause, else it's a clauseless join which isn't fast.
4543 */
4544 found_one = false;
4545 foreach(lc, innerpath->param_info->ppi_clauses)
4546 {
4547 RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc);
4548
4549 if (join_clause_is_movable_into(rinfo,
4550 innerpath->parent->relids,
4551 joinrelids))
4552 {
4553 if (!is_redundant_with_indexclauses(rinfo, indexclauses))
4554 return false;
4555 found_one = true;
4556 }
4557 }
4558 return found_one;
4559 }
4560
4561
4562 /*
4563 * approx_tuple_count
4564 * Quick-and-dirty estimation of the number of join rows passing
4565 * a set of qual conditions.
4566 *
4567 * The quals can be either an implicitly-ANDed list of boolean expressions,
4568 * or a list of RestrictInfo nodes (typically the latter).
4569 *
4570 * We intentionally compute the selectivity under JOIN_INNER rules, even
4571 * if it's some type of outer join. This is appropriate because we are
4572 * trying to figure out how many tuples pass the initial merge or hash
4573 * join step.
4574 *
4575 * This is quick-and-dirty because we bypass clauselist_selectivity, and
4576 * simply multiply the independent clause selectivities together. Now
4577 * clauselist_selectivity often can't do any better than that anyhow, but
4578 * for some situations (such as range constraints) it is smarter. However,
4579 * we can't effectively cache the results of clauselist_selectivity, whereas
4580 * the individual clause selectivities can be and are cached.
4581 *
4582 * Since we are only using the results to estimate how many potential
4583 * output tuples are generated and passed through qpqual checking, it
4584 * seems OK to live with the approximation.
4585 */
4586 static double
4587 approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
4588 {
4589 double tuples;
4590 double outer_tuples = path->outerjoinpath->rows;
4591 double inner_tuples = path->innerjoinpath->rows;
4592 SpecialJoinInfo sjinfo;
4593 Selectivity selec = 1.0;
4594 ListCell *l;
4595
4596 /*
4597 * Make up a SpecialJoinInfo for JOIN_INNER semantics.
4598 */
4599 sjinfo.type = T_SpecialJoinInfo;
4600 sjinfo.min_lefthand = path->outerjoinpath->parent->relids;
4601 sjinfo.min_righthand = path->innerjoinpath->parent->relids;
4602 sjinfo.syn_lefthand = path->outerjoinpath->parent->relids;
4603 sjinfo.syn_righthand = path->innerjoinpath->parent->relids;
4604 sjinfo.jointype = JOIN_INNER;
4605 /* we don't bother trying to make the remaining fields valid */
4606 sjinfo.lhs_strict = false;
4607 sjinfo.delay_upper_joins = false;
4608 sjinfo.semi_can_btree = false;
4609 sjinfo.semi_can_hash = false;
4610 sjinfo.semi_operators = NIL;
4611 sjinfo.semi_rhs_exprs = NIL;
4612
4613 /* Get the approximate selectivity */
4614 foreach(l, quals)
4615 {
4616 Node *qual = (Node *) lfirst(l);
4617
4618 /* Note that clause_selectivity will be able to cache its result */
4619 selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo);
4620 }
4621
4622 /* Apply it to the input relation sizes */
4623 tuples = selec * outer_tuples * inner_tuples;
4624
4625 return clamp_row_est(tuples);
4626 }
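
/*
 * Worked example (hypothetical numbers, not in the original source): with
 * outer_tuples = 1000, inner_tuples = 2000 and two quals whose cached
 * selectivities are 0.1 and 0.05, selec = 0.005 and the estimate is
 * clamp_row_est(0.005 * 1000 * 2000) = 10000 rows.
 */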
4627
4628
4629 /*
4630 * set_baserel_size_estimates
4631 * Set the size estimates for the given base relation.
4632 *
4633 * The rel's targetlist and restrictinfo list must have been constructed
4634 * already, and rel->tuples must be set.
4635 *
4636 * We set the following fields of the rel node:
4637 * rows: the estimated number of output tuples (after applying
4638 * restriction clauses).
4639 * width: the estimated average output tuple width in bytes.
4640 * baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
4641 */
4642 void
4643 set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel)
4644 {
4645 double nrows;
4646
4647 /* Should only be applied to base relations */
4648 Assert(rel->relid > 0);
4649
4650 nrows = rel->tuples *
4651 clauselist_selectivity(root,
4652 rel->baserestrictinfo,
4653 0,
4654 JOIN_INNER,
4655 NULL);
4656
4657 rel->rows = clamp_row_est(nrows);
4658
4659 cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
4660
4661 set_rel_width(root, rel);
4662 }
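
/*
 * Worked example (hypothetical numbers): a table with rel->tuples =
 * 1,000,000 whose baserestrictinfo clauses have a combined selectivity of
 * 0.005 gets rel->rows = clamp_row_est(1,000,000 * 0.005) = 5000.
 */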
4663
4664 /*
4665 * get_parameterized_baserel_size
4666 * Make a size estimate for a parameterized scan of a base relation.
4667 *
4668 * 'param_clauses' lists the additional join clauses to be used.
4669 *
4670 * set_baserel_size_estimates must have been applied already.
4671 */
4672 double
4673 get_parameterized_baserel_size(PlannerInfo *root, RelOptInfo *rel,
4674 List *param_clauses)
4675 {
4676 List *allclauses;
4677 double nrows;
4678
4679 /*
4680 * Estimate the number of rows returned by the parameterized scan, knowing
4681 * that it will apply all the extra join clauses as well as the rel's own
4682 * restriction clauses. Note that we force the clauses to be treated as
4683 * non-join clauses during selectivity estimation.
4684 */
4685 allclauses = list_concat_copy(param_clauses, rel->baserestrictinfo);
4686 nrows = rel->tuples *
4687 clauselist_selectivity(root,
4688 allclauses,
4689 rel->relid, /* do not use 0! */
4690 JOIN_INNER,
4691 NULL);
4692 nrows = clamp_row_est(nrows);
4693 /* For safety, make sure result is not more than the base estimate */
4694 if (nrows > rel->rows)
4695 nrows = rel->rows;
4696 return nrows;
4697 }
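
/*
 * Worked example (hypothetical numbers): continuing the case above with
 * rel->tuples = 1,000,000 and rel->rows = 5000, suppose the extra join
 * clauses bring the combined selectivity of allclauses down to 0.0001;
 * then nrows = clamp_row_est(1,000,000 * 0.0001) = 100, which is already
 * below rel->rows, so 100 is returned.
 */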
4698
4699 /*
4700 * set_joinrel_size_estimates
4701 * Set the size estimates for the given join relation.
4702 *
4703 * The rel's targetlist must have been constructed already, and a
4704 * restriction clause list that matches the given component rels must
4705 * be provided.
4706 *
4707 * Since there is more than one way to make a joinrel for more than two
4708 * base relations, the results we get here could depend on which component
4709 * rel pair is provided. In theory we should get the same answers no matter
4710 * which pair is provided; in practice, since the selectivity estimation
4711 * routines don't handle all cases equally well, we might not. But there's
4712 * not much to be done about it. (Would it make sense to repeat the
4713 * calculations for each pair of input rels that's encountered, and somehow
4714 * average the results? Probably way more trouble than it's worth, and
4715 * anyway we must keep the rowcount estimate the same for all paths for the
4716 * joinrel.)
4717 *
4718 * We set only the rows field here. The reltarget field was already set by
4719 * build_joinrel_tlist, and baserestrictcost is not used for join rels.
4720 */
4721 void
4722 set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
4723 RelOptInfo *outer_rel,
4724 RelOptInfo *inner_rel,
4725 SpecialJoinInfo *sjinfo,
4726 List *restrictlist)
4727 {
4728 rel->rows = calc_joinrel_size_estimate(root,
4729 rel,
4730 outer_rel,
4731 inner_rel,
4732 outer_rel->rows,
4733 inner_rel->rows,
4734 sjinfo,
4735 restrictlist);
4736 }
4737
4738 /*
4739 * get_parameterized_joinrel_size
4740 * Make a size estimate for a parameterized scan of a join relation.
4741 *
4742 * 'rel' is the joinrel under consideration.
4743 * 'outer_path', 'inner_path' are (probably also parameterized) Paths that
4744 * produce the relations being joined.
4745 * 'sjinfo' is any SpecialJoinInfo relevant to this join.
4746 * 'restrict_clauses' lists the join clauses that need to be applied at the
4747 * join node (including any movable clauses that were moved down to this join,
4748 * and not including any movable clauses that were pushed down into the
4749 * child paths).
4750 *
4751 * set_joinrel_size_estimates must have been applied already.
4752 */
4753 double
4754 get_parameterized_joinrel_size(PlannerInfo *root, RelOptInfo *rel,
4755 Path *outer_path,
4756 Path *inner_path,
4757 SpecialJoinInfo *sjinfo,
4758 List *restrict_clauses)
4759 {
4760 double nrows;
4761
4762 /*
4763 * Estimate the number of rows returned by the parameterized join as the
4764 * sizes of the input paths times the selectivity of the clauses that have
4765 * ended up at this join node.
4766 *
4767 * As with set_joinrel_size_estimates, the rowcount estimate could depend
4768 * on the pair of input paths provided, though ideally we'd get the same
4769 * estimate for any pair with the same parameterization.
4770 */
4771 nrows = calc_joinrel_size_estimate(root,
4772 rel,
4773 outer_path->parent,
4774 inner_path->parent,
4775 outer_path->rows,
4776 inner_path->rows,
4777 sjinfo,
4778 restrict_clauses);
4779 /* For safety, make sure result is not more than the base estimate */
4780 if (nrows > rel->rows)
4781 nrows = rel->rows;
4782 return nrows;
4783 }
4784
4785 /*
4786 * calc_joinrel_size_estimate
4787 * Workhorse for set_joinrel_size_estimates and
4788 * get_parameterized_joinrel_size.
4789 *
4790 * outer_rel/inner_rel are the relations being joined, but they should be
4791 * assumed to have sizes outer_rows/inner_rows; those numbers might be less
4792 * than what rel->rows says, when we are considering parameterized paths.
4793 */
4794 static double
4795 calc_joinrel_size_estimate(PlannerInfo *root,
4796 RelOptInfo *joinrel,
4797 RelOptInfo *outer_rel,
4798 RelOptInfo *inner_rel,
4799 double outer_rows,
4800 double inner_rows,
4801 SpecialJoinInfo *sjinfo,
4802 List *restrictlist_in)
4803 {
4804 /* This apparently-useless variable dodges a compiler bug in VS2013: */
4805 List *restrictlist = restrictlist_in;
4806 JoinType jointype = sjinfo->jointype;
4807 Selectivity fkselec;
4808 Selectivity jselec;
4809 Selectivity pselec;
4810 double nrows;
4811
4812 /*
4813 * Compute joinclause selectivity. Note that we are only considering
4814 * clauses that become restriction clauses at this join level; we are not
4815 * double-counting them because they were not considered in estimating the
4816 * sizes of the component rels.
4817 *
4818 * First, see whether any of the joinclauses can be matched to known FK
4819 * constraints. If so, drop those clauses from the restrictlist, and
4820 * instead estimate their selectivity using FK semantics. (We do this
4821 * without regard to whether said clauses are local or "pushed down".
4822 * Probably, an FK-matching clause could never be seen as pushed down at
4823 * an outer join, since it would be strict and hence would be grounds for
4824 * join strength reduction.) fkselec gets the net selectivity for
4825 * FK-matching clauses, or 1.0 if there are none.
4826 */
4827 fkselec = get_foreign_key_join_selectivity(root,
4828 outer_rel->relids,
4829 inner_rel->relids,
4830 sjinfo,
4831 &restrictlist);
4832
4833 /*
4834 * For an outer join, we have to distinguish the selectivity of the join's
4835 * own clauses (JOIN/ON conditions) from any clauses that were "pushed
4836 * down". For inner joins we just count them all as joinclauses.
4837 */
4838 if (IS_OUTER_JOIN(jointype))
4839 {
4840 List *joinquals = NIL;
4841 List *pushedquals = NIL;
4842 ListCell *l;
4843
4844 /* Grovel through the clauses to separate into two lists */
4845 foreach(l, restrictlist)
4846 {
4847 RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
4848
4849 if (RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
4850 pushedquals = lappend(pushedquals, rinfo);
4851 else
4852 joinquals = lappend(joinquals, rinfo);
4853 }
4854
4855 /* Get the separate selectivities */
4856 jselec = clauselist_selectivity(root,
4857 joinquals,
4858 0,
4859 jointype,
4860 sjinfo);
4861 pselec = clauselist_selectivity(root,
4862 pushedquals,
4863 0,
4864 jointype,
4865 sjinfo);
4866
4867 /* Avoid leaking a lot of ListCells */
4868 list_free(joinquals);
4869 list_free(pushedquals);
4870 }
4871 else
4872 {
4873 jselec = clauselist_selectivity(root,
4874 restrictlist,
4875 0,
4876 jointype,
4877 sjinfo);
4878 pselec = 0.0; /* not used, keep compiler quiet */
4879 }
4880
4881 /*
4882 * Basically, we multiply size of Cartesian product by selectivity.
4883 *
4884 * If we are doing an outer join, take that into account: the joinqual
4885 * selectivity has to be clamped using the knowledge that the output must
4886 * be at least as large as the non-nullable input. However, any
4887 * pushed-down quals are applied after the outer join, so their
4888 * selectivity applies fully.
4889 *
4890 * For JOIN_SEMI and JOIN_ANTI, the selectivity is defined as the fraction
4891 * of LHS rows that have matches, and we apply that straightforwardly.
4892 */
4893 switch (jointype)
4894 {
4895 case JOIN_INNER:
4896 nrows = outer_rows * inner_rows * fkselec * jselec;
4897 /* pselec not used */
4898 break;
4899 case JOIN_LEFT:
4900 nrows = outer_rows * inner_rows * fkselec * jselec;
4901 if (nrows < outer_rows)
4902 nrows = outer_rows;
4903 nrows *= pselec;
4904 break;
4905 case JOIN_FULL:
4906 nrows = outer_rows * inner_rows * fkselec * jselec;
4907 if (nrows < outer_rows)
4908 nrows = outer_rows;
4909 if (nrows < inner_rows)
4910 nrows = inner_rows;
4911 nrows *= pselec;
4912 break;
4913 case JOIN_SEMI:
4914 nrows = outer_rows * fkselec * jselec;
4915 /* pselec not used */
4916 break;
4917 case JOIN_ANTI:
4918 nrows = outer_rows * (1.0 - fkselec * jselec);
4919 nrows *= pselec;
4920 break;
4921 default:
4922 /* other values not expected here */
4923 elog(ERROR, "unrecognized join type: %d", (int) jointype);
4924 nrows = 0; /* keep compiler quiet */
4925 break;
4926 }
4927
4928 return clamp_row_est(nrows);
4929 }
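
/*
 * Worked example (hypothetical numbers): joining outer_rows = 1000 to
 * inner_rows = 500 with fkselec = 1.0, jselec = 0.0004 and, for the outer
 * join cases, pselec = 0.5:
 *   JOIN_INNER: 1000 * 500 * 0.0004 = 200 rows
 *   JOIN_LEFT:  200 is clamped up to outer_rows = 1000, then * 0.5 = 500
 *   JOIN_SEMI:  1000 * 0.0004 = 0.4, clamped to 1 by clamp_row_est
 *   JOIN_ANTI:  1000 * (1 - 0.0004) = 999.6, then * 0.5, about 500
 */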
4930
4931 /*
4932 * get_foreign_key_join_selectivity
4933 * Estimate join selectivity for foreign-key-related clauses.
4934 *
4935 * Remove any clauses that can be matched to FK constraints from *restrictlist,
4936 * and return a substitute estimate of their selectivity. 1.0 is returned
4937 * when there are no such clauses.
4938 *
4939 * The reason for treating such clauses specially is that we can get better
4940 * estimates this way than by relying on clauselist_selectivity(), especially
4941 * for multi-column FKs where that function's assumption that the clauses are
4942 * independent falls down badly. But even with single-column FKs, we may be
4943 * able to get a better answer when the pg_statistic stats are missing or out
4944 * of date.
4945 */
4946 static Selectivity
4947 get_foreign_key_join_selectivity(PlannerInfo *root,
4948 Relids outer_relids,
4949 Relids inner_relids,
4950 SpecialJoinInfo *sjinfo,
4951 List **restrictlist)
4952 {
4953 Selectivity fkselec = 1.0;
4954 JoinType jointype = sjinfo->jointype;
4955 List *worklist = *restrictlist;
4956 ListCell *lc;
4957
4958 /* Consider each FK constraint that is known to match the query */
4959 foreach(lc, root->fkey_list)
4960 {
4961 ForeignKeyOptInfo *fkinfo = (ForeignKeyOptInfo *) lfirst(lc);
4962 bool ref_is_outer;
4963 List *removedlist;
4964 ListCell *cell;
4965
4966 /*
4967 * This FK is not relevant unless it connects a baserel on one side of
4968 * this join to a baserel on the other side.
4969 */
4970 if (bms_is_member(fkinfo->con_relid, outer_relids) &&
4971 bms_is_member(fkinfo->ref_relid, inner_relids))
4972 ref_is_outer = false;
4973 else if (bms_is_member(fkinfo->ref_relid, outer_relids) &&
4974 bms_is_member(fkinfo->con_relid, inner_relids))
4975 ref_is_outer = true;
4976 else
4977 continue;
4978
4979 /*
4980 * If we're dealing with a semi/anti join, and the FK's referenced
4981 * relation is on the outside, then knowledge of the FK doesn't help
4982 * us figure out what we need to know (which is the fraction of outer
4983 * rows that have matches). On the other hand, if the referenced rel
4984 * is on the inside, then all outer rows must have matches in the
4985 * referenced table (ignoring nulls). But any restriction or join
4986 * clauses that filter that table will reduce the fraction of matches.
4987 * We can account for restriction clauses, but it's too hard to guess
4988 * how many table rows would get through a join that's inside the RHS.
4989 * Hence, if either case applies, punt and ignore the FK.
4990 */
4991 if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) &&
4992 (ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON))
4993 continue;
4994
4995 /*
4996 * Modify the restrictlist by removing clauses that match the FK (and
4997 * putting them into removedlist instead). It seems unsafe to modify
4998 * the originally-passed List structure, so we make a shallow copy the
4999 * first time through.
5000 */
5001 if (worklist == *restrictlist)
5002 worklist = list_copy(worklist);
5003
5004 removedlist = NIL;
5005 foreach(cell, worklist)
5006 {
5007 RestrictInfo *rinfo = (RestrictInfo *) lfirst(cell);
5008 bool remove_it = false;
5009 int i;
5010
5011 /* Drop this clause if it matches any column of the FK */
5012 for (i = 0; i < fkinfo->nkeys; i++)
5013 {
5014 if (rinfo->parent_ec)
5015 {
5016 /*
5017 * EC-derived clauses can only match by EC. It is okay to
5018 * consider any clause derived from the same EC as
5019 * matching the FK: even if equivclass.c chose to generate
5020 * a clause equating some other pair of Vars, it could
5021 * have generated one equating the FK's Vars. So for
5022 * purposes of estimation, we can act as though it did so.
5023 *
5024 * Note: checking parent_ec is a bit of a cheat because
5025 * there are EC-derived clauses that don't have parent_ec
5026 * set; but such clauses must compare expressions that
5027 * aren't just Vars, so they cannot match the FK anyway.
5028 */
5029 if (fkinfo->eclass[i] == rinfo->parent_ec)
5030 {
5031 remove_it = true;
5032 break;
5033 }
5034 }
5035 else
5036 {
5037 /*
5038 * Otherwise, see if rinfo was previously matched to FK as
5039 * a "loose" clause.
5040 */
5041 if (list_member_ptr(fkinfo->rinfos[i], rinfo))
5042 {
5043 remove_it = true;
5044 break;
5045 }
5046 }
5047 }
5048 if (remove_it)
5049 {
5050 worklist = foreach_delete_current(worklist, cell);
5051 removedlist = lappend(removedlist, rinfo);
5052 }
5053 }
5054
5055 /*
5056 * If we failed to remove all the matching clauses we expected to
5057 * find, chicken out and ignore this FK; applying its selectivity
5058 * might result in double-counting. Put any clauses we did manage to
5059 * remove back into the worklist.
5060 *
5061 * Since the matching clauses are known not outerjoin-delayed, they
5062 * should certainly have appeared in the initial joinclause list. If
5063 * we didn't find them, they must have been matched to, and removed
5064 * by, some other FK in a previous iteration of this loop. (A likely
5065 * case is that two FKs are matched to the same EC; there will be only
5066 * one EC-derived clause in the initial list, so the first FK will
5067 * consume it.) Applying both FKs' selectivity independently risks
5068 * underestimating the join size; in particular, this would undo one
5069 * of the main things that ECs were invented for, namely to avoid
5070 * double-counting the selectivity of redundant equality conditions.
5071 * Later we might think of a reasonable way to combine the estimates,
5072 * but for now, just punt, since this is a fairly uncommon situation.
5073 */
5074 if (list_length(removedlist) !=
5075 (fkinfo->nmatched_ec + fkinfo->nmatched_ri))
5076 {
5077 worklist = list_concat(worklist, removedlist);
5078 continue;
5079 }
5080
5081 /*
5082 * Finally we get to the payoff: estimate selectivity using the
5083 * knowledge that each referencing row will match exactly one row in
5084 * the referenced table.
5085 *
5086 * XXX that's not true in the presence of nulls in the referencing
5087 * column(s), so in principle we should derate the estimate for those.
5088 * However (1) if there are any strict restriction clauses for the
5089 * referencing column(s) elsewhere in the query, derating here would
5090 * be double-counting the null fraction, and (2) it's not very clear
5091 * how to combine null fractions for multiple referencing columns. So
5092 * we do nothing for now about correcting for nulls.
5093 *
5094 * XXX another point here is that if either side of an FK constraint
5095 * is an inheritance parent, we estimate as though the constraint
5096 * covers all its children as well. This is not an unreasonable
5097 * assumption for a referencing table, ie the user probably applied
5098 * identical constraints to all child tables (though perhaps we ought
5099 * to check that). But it's not possible to have done that for a
5100 * referenced table. Fortunately, precisely because that doesn't
5101 * work, it is uncommon in practice to have an FK referencing a parent
5102 * table. So, at least for now, disregard inheritance here.
5103 */
5104 if (jointype == JOIN_SEMI || jointype == JOIN_ANTI)
5105 {
5106 /*
5107 * For JOIN_SEMI and JOIN_ANTI, we only get here when the FK's
5108 * referenced table is exactly the inside of the join. The join
5109 * selectivity is defined as the fraction of LHS rows that have
5110 * matches. The FK implies that every LHS row has a match *in the
5111 * referenced table*; but any restriction clauses on it will
5112 * reduce the number of matches. Hence we take the join
5113 * selectivity as equal to the selectivity of the table's
5114 * restriction clauses, which is rows / tuples; but we must guard
5115 * against tuples == 0.
5116 */
5117 RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
5118 double ref_tuples = Max(ref_rel->tuples, 1.0);
5119
5120 fkselec *= ref_rel->rows / ref_tuples;
5121 }
5122 else
5123 {
5124 /*
5125 * Otherwise, selectivity is exactly 1/referenced-table-size; but
5126 * guard against tuples == 0. Note we should use the raw table
5127 * tuple count, not any estimate of its filtered or joined size.
5128 */
5129 RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
5130 double ref_tuples = Max(ref_rel->tuples, 1.0);
5131
5132 fkselec *= 1.0 / ref_tuples;
5133 }
5134 }
5135
5136 *restrictlist = worklist;
5137 return fkselec;
5138 }
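
/*
 * Worked example (hypothetical numbers): an inner join whose only join
 * clause matches an FK referencing a table of 10,000 tuples yields
 * fkselec = 1/10,000 = 0.0001, and the matched clause is removed from
 * *restrictlist.  For a semijoin with the referenced table on the inside,
 * if restriction clauses leave ref_rel->rows = 2500 of those 10,000
 * tuples, fkselec = 2500/10,000 = 0.25 instead.
 */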
5139
5140 /*
5141 * set_subquery_size_estimates
5142 * Set the size estimates for a base relation that is a subquery.
5143 *
5144 * The rel's targetlist and restrictinfo list must have been constructed
5145 * already, and the Paths for the subquery must have been completed.
5146 * We look at the subquery's PlannerInfo to extract data.
5147 *
5148 * We set the same fields as set_baserel_size_estimates.
5149 */
5150 void
5151 set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5152 {
5153 PlannerInfo *subroot = rel->subroot;
5154 RelOptInfo *sub_final_rel;
5155 ListCell *lc;
5156
5157 /* Should only be applied to base relations that are subqueries */
5158 Assert(rel->relid > 0);
5159 Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_SUBQUERY);
5160
5161 /*
5162 * Copy raw number of output rows from subquery. All of its paths should
5163 * have the same output rowcount, so just look at cheapest-total.
5164 */
5165 sub_final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL);
5166 rel->tuples = sub_final_rel->cheapest_total_path->rows;
5167
5168 /*
5169 * Compute per-output-column width estimates by examining the subquery's
5170 * targetlist. For any output that is a plain Var, get the width estimate
5171 * that was made while planning the subquery. Otherwise, we leave it to
5172 * set_rel_width to fill in a datatype-based default estimate.
5173 */
5174 foreach(lc, subroot->parse->targetList)
5175 {
5176 TargetEntry *te = lfirst_node(TargetEntry, lc);
5177 Node *texpr = (Node *) te->expr;
5178 int32 item_width = 0;
5179
5180 /* junk columns aren't visible to upper query */
5181 if (te->resjunk)
5182 continue;
5183
5184 /*
5185 * The subquery could be an expansion of a view that's had columns
5186 * added to it since the current query was parsed, so that there are
5187 * non-junk tlist columns in it that don't correspond to any column
5188 * visible at our query level. Ignore such columns.
5189 */
5190 if (te->resno < rel->min_attr || te->resno > rel->max_attr)
5191 continue;
5192
5193 /*
5194 * XXX This currently doesn't work for subqueries containing set
5195 * operations, because the Vars in their tlists are bogus references
5196 * to the first leaf subquery, which wouldn't give the right answer
5197 * even if we could still get to its PlannerInfo.
5198 *
5199 * Also, the subquery could be an appendrel for which all branches are
5200 * known empty due to constraint exclusion, in which case
5201 * set_append_rel_pathlist will have left the attr_widths set to zero.
5202 *
5203 * In either case, we just leave the width estimate zero until
5204 * set_rel_width fixes it.
5205 */
5206 if (IsA(texpr, Var) &&
5207 subroot->parse->setOperations == NULL)
5208 {
5209 Var *var = (Var *) texpr;
5210 RelOptInfo *subrel = find_base_rel(subroot, var->varno);
5211
5212 item_width = subrel->attr_widths[var->varattno - subrel->min_attr];
5213 }
5214 rel->attr_widths[te->resno - rel->min_attr] = item_width;
5215 }
5216
5217 /* Now estimate number of output rows, etc */
5218 set_baserel_size_estimates(root, rel);
5219 }
5220
5221 /*
5222 * set_function_size_estimates
5223 * Set the size estimates for a base relation that is a function call.
5224 *
5225 * The rel's targetlist and restrictinfo list must have been constructed
5226 * already.
5227 *
5228 * We set the same fields as set_baserel_size_estimates.
5229 */
5230 void
5231 set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5232 {
5233 RangeTblEntry *rte;
5234 ListCell *lc;
5235
5236 /* Should only be applied to base relations that are functions */
5237 Assert(rel->relid > 0);
5238 rte = planner_rt_fetch(rel->relid, root);
5239 Assert(rte->rtekind == RTE_FUNCTION);
5240
5241 /*
5242 * Estimate number of rows the functions will return. The rowcount of the
5243 * node is that of the largest function result.
5244 */
5245 rel->tuples = 0;
5246 foreach(lc, rte->functions)
5247 {
5248 RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc);
5249 double ntup = expression_returns_set_rows(root, rtfunc->funcexpr);
5250
5251 if (ntup > rel->tuples)
5252 rel->tuples = ntup;
5253 }
5254
5255 /* Now estimate number of output rows, etc */
5256 set_baserel_size_estimates(root, rel);
5257 }
5258
5259 /*
5260 * set_tablefunc_size_estimates
5261 * Set the size estimates for a base relation that is a table function call.
5262 *
5263 * The rel's targetlist and restrictinfo list must have been constructed
5264 * already.
5265 *
5266 * We set the same fields as set_baserel_size_estimates.
5267 */
5268 void
5269 set_tablefunc_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5270 {
5271 /* Should only be applied to base relations that are functions */
5272 Assert(rel->relid > 0);
5273 Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_TABLEFUNC);
5274
5275 rel->tuples = 100;
5276
5277 /* Now estimate number of output rows, etc */
5278 set_baserel_size_estimates(root, rel);
5279 }
5280
5281 /*
5282 * set_values_size_estimates
5283 * Set the size estimates for a base relation that is a values list.
5284 *
5285 * The rel's targetlist and restrictinfo list must have been constructed
5286 * already.
5287 *
5288 * We set the same fields as set_baserel_size_estimates.
5289 */
5290 void
5291 set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5292 {
5293 RangeTblEntry *rte;
5294
5295 /* Should only be applied to base relations that are values lists */
5296 Assert(rel->relid > 0);
5297 rte = planner_rt_fetch(rel->relid, root);
5298 Assert(rte->rtekind == RTE_VALUES);
5299
5300 /*
5301 * Estimate number of rows the values list will return. We know this
5302 * precisely based on the list length (well, barring set-returning
5303 * functions in list items, but that's a refinement not catered for
5304 * anywhere else either).
5305 */
5306 rel->tuples = list_length(rte->values_lists);
5307
5308 /* Now estimate number of output rows, etc */
5309 set_baserel_size_estimates(root, rel);
5310 }
5311
5312 /*
5313 * set_cte_size_estimates
5314 * Set the size estimates for a base relation that is a CTE reference.
5315 *
5316 * The rel's targetlist and restrictinfo list must have been constructed
5317 * already, and we need an estimate of the number of rows returned by the CTE
5318 * (if a regular CTE) or the non-recursive term (if a self-reference).
5319 *
5320 * We set the same fields as set_baserel_size_estimates.
5321 */
5322 void
5323 set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, double cte_rows)
5324 {
5325 RangeTblEntry *rte;
5326
5327 /* Should only be applied to base relations that are CTE references */
5328 Assert(rel->relid > 0);
5329 rte = planner_rt_fetch(rel->relid, root);
5330 Assert(rte->rtekind == RTE_CTE);
5331
5332 if (rte->self_reference)
5333 {
5334 /*
5335 * In a self-reference, arbitrarily assume the average worktable size
5336 * is about 10 times the nonrecursive term's size.
5337 */
5338 rel->tuples = 10 * cte_rows;
5339 }
5340 else
5341 {
5342 /* Otherwise just believe the CTE's rowcount estimate */
5343 rel->tuples = cte_rows;
5344 }
5345
5346 /* Now estimate number of output rows, etc */
5347 set_baserel_size_estimates(root, rel);
5348 }
5349
5350 /*
5351 * set_namedtuplestore_size_estimates
5352 * Set the size estimates for a base relation that is a tuplestore reference.
5353 *
5354 * The rel's targetlist and restrictinfo list must have been constructed
5355 * already.
5356 *
5357 * We set the same fields as set_baserel_size_estimates.
5358 */
5359 void
5360 set_namedtuplestore_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5361 {
5362 RangeTblEntry *rte;
5363
5364 /* Should only be applied to base relations that are tuplestore references */
5365 Assert(rel->relid > 0);
5366 rte = planner_rt_fetch(rel->relid, root);
5367 Assert(rte->rtekind == RTE_NAMEDTUPLESTORE);
5368
5369 /*
5370 * Use the estimate provided by the code which is generating the named
5371 * tuplestore. In some cases, the actual number might be available; in
5372 * others the same plan will be re-used, so a "typical" value might be
5373 * estimated and used.
5374 */
5375 rel->tuples = rte->enrtuples;
5376 if (rel->tuples < 0)
5377 rel->tuples = 1000;
5378
5379 /* Now estimate number of output rows, etc */
5380 set_baserel_size_estimates(root, rel);
5381 }
5382
5383 /*
5384 * set_result_size_estimates
5385 * Set the size estimates for an RTE_RESULT base relation
5386 *
5387 * The rel's targetlist and restrictinfo list must have been constructed
5388 * already.
5389 *
5390 * We set the same fields as set_baserel_size_estimates.
5391 */
5392 void
5393 set_result_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5394 {
5395 /* Should only be applied to RTE_RESULT base relations */
5396 Assert(rel->relid > 0);
5397 Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_RESULT);
5398
5399 /* RTE_RESULT always generates a single row, natively */
5400 rel->tuples = 1;
5401
5402 /* Now estimate number of output rows, etc */
5403 set_baserel_size_estimates(root, rel);
5404 }
5405
5406 /*
5407 * set_foreign_size_estimates
5408 * Set the size estimates for a base relation that is a foreign table.
5409 *
5410 * There is not a whole lot that we can do here; the foreign-data wrapper
5411 * is responsible for producing useful estimates. We can do a decent job
5412 * of estimating baserestrictcost, so we set that, and we also set up width
5413 * using what will be purely datatype-driven estimates from the targetlist.
5414 * There is no way to do anything sane with the rows value, so we just put
5415 * a default estimate and hope that the wrapper can improve on it. The
5416 * wrapper's GetForeignRelSize function will be called momentarily.
5417 *
5418 * The rel's targetlist and restrictinfo list must have been constructed
5419 * already.
5420 */
5421 void
5422 set_foreign_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5423 {
5424 /* Should only be applied to base relations */
5425 Assert(rel->relid > 0);
5426
5427 rel->rows = 1000; /* entirely bogus default estimate */
5428
5429 cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
5430
5431 set_rel_width(root, rel);
5432 }
5433
5434
5435 /*
5436 * set_rel_width
5437 * Set the estimated output width of a base relation.
5438 *
5439 * The estimated output width is the sum of the per-attribute width estimates
5440 * for the actually-referenced columns, plus any PHVs or other expressions
5441 * that have to be calculated at this relation. This is the amount of data
5442 * we'd need to pass upwards in case of a sort, hash, etc.
5443 *
5444 * This function also sets reltarget->cost, so it's a bit misnamed now.
5445 *
5446 * NB: this works best on plain relations because it prefers to look at
5447 * real Vars. For subqueries, set_subquery_size_estimates will already have
5448 * copied up whatever per-column estimates were made within the subquery,
5449 * and for other types of rels there isn't much we can do anyway. We fall
5450 * back on (fairly stupid) datatype-based width estimates if we can't get
5451 * any better number.
5452 *
5453 * The per-attribute width estimates are cached for possible re-use while
5454 * building join relations or post-scan/join pathtargets.
5455 */
5456 static void
5457 set_rel_width(PlannerInfo *root, RelOptInfo *rel)
5458 {
5459 Oid reloid = planner_rt_fetch(rel->relid, root)->relid;
5460 int32 tuple_width = 0;
5461 bool have_wholerow_var = false;
5462 ListCell *lc;
5463
5464 /* Vars are assumed to have cost zero, but other exprs do not */
5465 rel->reltarget->cost.startup = 0;
5466 rel->reltarget->cost.per_tuple = 0;
5467
5468 foreach(lc, rel->reltarget->exprs)
5469 {
5470 Node *node = (Node *) lfirst(lc);
5471
5472 /*
5473 * Ordinarily, a Var in a rel's targetlist must belong to that rel;
5474 * but there are corner cases involving LATERAL references where that
5475 * isn't so. If the Var has the wrong varno, fall through to the
5476 * generic case (it doesn't seem worth the trouble to be any smarter).
5477 */
5478 if (IsA(node, Var) &&
5479 ((Var *) node)->varno == rel->relid)
5480 {
5481 Var *var = (Var *) node;
5482 int ndx;
5483 int32 item_width;
5484
5485 Assert(var->varattno >= rel->min_attr);
5486 Assert(var->varattno <= rel->max_attr);
5487
5488 ndx = var->varattno - rel->min_attr;
5489
5490 /*
5491 * If it's a whole-row Var, we'll deal with it below after we have
5492 * already cached as many attr widths as possible.
5493 */
5494 if (var->varattno == 0)
5495 {
5496 have_wholerow_var = true;
5497 continue;
5498 }
5499
5500 /*
5501 * The width may have been cached already (especially if it's a
5502 * subquery), so don't duplicate effort.
5503 */
5504 if (rel->attr_widths[ndx] > 0)
5505 {
5506 tuple_width += rel->attr_widths[ndx];
5507 continue;
5508 }
5509
5510 /* Try to get column width from statistics */
5511 if (reloid != InvalidOid && var->varattno > 0)
5512 {
5513 item_width = get_attavgwidth(reloid, var->varattno);
5514 if (item_width > 0)
5515 {
5516 rel->attr_widths[ndx] = item_width;
5517 tuple_width += item_width;
5518 continue;
5519 }
5520 }
5521
5522 /*
5523 * Not a plain relation, or can't find statistics for it. Estimate
5524 * using just the type info.
5525 */
5526 item_width = get_typavgwidth(var->vartype, var->vartypmod);
5527 Assert(item_width > 0);
5528 rel->attr_widths[ndx] = item_width;
5529 tuple_width += item_width;
5530 }
5531 else if (IsA(node, PlaceHolderVar))
5532 {
5533 /*
5534 * We will need to evaluate the PHV's contained expression while
5535 * scanning this rel, so be sure to include it in reltarget->cost.
5536 */
5537 PlaceHolderVar *phv = (PlaceHolderVar *) node;
5538 PlaceHolderInfo *phinfo = find_placeholder_info(root, phv, false);
5539 QualCost cost;
5540
5541 tuple_width += phinfo->ph_width;
5542 cost_qual_eval_node(&cost, (Node *) phv->phexpr, root);
5543 rel->reltarget->cost.startup += cost.startup;
5544 rel->reltarget->cost.per_tuple += cost.per_tuple;
5545 }
5546 else
5547 {
5548 /*
5549 * We could be looking at an expression pulled up from a subquery,
5550 * or a ROW() representing a whole-row child Var, etc. Do what we
5551 * can using the expression type information.
5552 */
5553 int32 item_width;
5554 QualCost cost;
5555
5556 item_width = get_typavgwidth(exprType(node), exprTypmod(node));
5557 Assert(item_width > 0);
5558 tuple_width += item_width;
5559 /* Not entirely clear if we need to account for cost, but do so */
5560 cost_qual_eval_node(&cost, node, root);
5561 rel->reltarget->cost.startup += cost.startup;
5562 rel->reltarget->cost.per_tuple += cost.per_tuple;
5563 }
5564 }
5565
5566 /*
5567 * If we have a whole-row reference, estimate its width as the sum of
5568 * per-column widths plus heap tuple header overhead.
5569 */
5570 if (have_wholerow_var)
5571 {
5572 int32 wholerow_width = MAXALIGN(SizeofHeapTupleHeader);
5573
5574 if (reloid != InvalidOid)
5575 {
5576 /* Real relation, so estimate true tuple width */
5577 wholerow_width += get_relation_data_width(reloid,
5578 rel->attr_widths - rel->min_attr);
5579 }
5580 else
5581 {
5582 /* Do what we can with info for a phony rel */
5583 AttrNumber i;
5584
5585 for (i = 1; i <= rel->max_attr; i++)
5586 wholerow_width += rel->attr_widths[i - rel->min_attr];
5587 }
5588
5589 rel->attr_widths[0 - rel->min_attr] = wholerow_width;
5590
5591 /*
5592 * Include the whole-row Var as part of the output tuple. Yes, that
5593 * really is what happens at runtime.
5594 */
5595 tuple_width += wholerow_width;
5596 }
5597
5598 Assert(tuple_width >= 0);
5599 rel->reltarget->width = tuple_width;
5600 }
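
/*
 * Worked example (hypothetical numbers): a scan referencing an int4 column
 * (width 4) and a text column whose get_attavgwidth estimate is 30 gets
 * reltarget->width = 34.  If a whole-row Var is also referenced, the tuple
 * header overhead (MAXALIGN(SizeofHeapTupleHeader), 24 bytes on a typical
 * 8-byte-MAXALIGN build) plus the per-column widths are added on top.
 */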
5601
5602 /*
5603 * set_pathtarget_cost_width
5604 * Set the estimated eval cost and output width of a PathTarget tlist.
5605 *
5606 * As a notational convenience, returns the same PathTarget pointer passed in.
5607 *
5608 * Most, though not quite all, uses of this function occur after we've run
5609 * set_rel_width() for base relations; so we can usually obtain cached width
5610 * estimates for Vars. If we can't, fall back on datatype-based width
5611 * estimates. Present early-planning uses of PathTargets don't need accurate
5612 * widths badly enough to justify going to the catalogs for better data.
5613 */
5614 PathTarget *
5615 set_pathtarget_cost_width(PlannerInfo *root, PathTarget *target)
5616 {
5617 int32 tuple_width = 0;
5618 ListCell *lc;
5619
5620 /* Vars are assumed to have cost zero, but other exprs do not */
5621 target->cost.startup = 0;
5622 target->cost.per_tuple = 0;
5623
5624 foreach(lc, target->exprs)
5625 {
5626 Node *node = (Node *) lfirst(lc);
5627
5628 if (IsA(node, Var))
5629 {
5630 Var *var = (Var *) node;
5631 int32 item_width;
5632
5633 /* We should not see any upper-level Vars here */
5634 Assert(var->varlevelsup == 0);
5635
5636 /* Try to get data from RelOptInfo cache */
5637 if (var->varno < root->simple_rel_array_size)
5638 {
5639 RelOptInfo *rel = root->simple_rel_array[var->varno];
5640
5641 if (rel != NULL &&
5642 var->varattno >= rel->min_attr &&
5643 var->varattno <= rel->max_attr)
5644 {
5645 int ndx = var->varattno - rel->min_attr;
5646
5647 if (rel->attr_widths[ndx] > 0)
5648 {
5649 tuple_width += rel->attr_widths[ndx];
5650 continue;
5651 }
5652 }
5653 }
5654
5655 /*
5656 * No cached data available, so estimate using just the type info.
5657 */
5658 item_width = get_typavgwidth(var->vartype, var->vartypmod);
5659 Assert(item_width > 0);
5660 tuple_width += item_width;
5661 }
5662 else
5663 {
5664 /*
5665 * Handle general expressions using type info.
5666 */
5667 int32 item_width;
5668 QualCost cost;
5669
5670 item_width = get_typavgwidth(exprType(node), exprTypmod(node));
5671 Assert(item_width > 0);
5672 tuple_width += item_width;
5673
5674 /* Account for cost, too */
5675 cost_qual_eval_node(&cost, node, root);
5676 target->cost.startup += cost.startup;
5677 target->cost.per_tuple += cost.per_tuple;
5678 }
5679 }
5680
5681 Assert(tuple_width >= 0);
5682 target->width = tuple_width;
5683
5684 return target;
5685 }
5686
5687 /*
5688 * relation_byte_size
5689 * Estimate the storage space in bytes for a given number of tuples
5690 * of a given width (size in bytes).
5691 */
5692 static double
5693 relation_byte_size(double tuples, int width)
5694 {
5695 return tuples * (MAXALIGN(width) + MAXALIGN(SizeofHeapTupleHeader));
5696 }
5697
5698 /*
5699 * page_size
5700 * Returns an estimate of the number of pages covered by a given
5701 * number of tuples of a given width (size in bytes).
5702 */
5703 static double
5704 page_size(double tuples, int width)
5705 {
5706 return ceil(relation_byte_size(tuples, width) / BLCKSZ);
5707 }
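
/*
 * Worked example (hypothetical numbers, assuming 8-byte MAXALIGN, a 23-byte
 * heap tuple header and the default 8192-byte BLCKSZ): 1000 tuples of
 * width 100 give relation_byte_size = 1000 * (104 + 24) = 128,000 bytes,
 * so page_size = ceil(128000 / 8192) = 16 pages.
 */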
5708
5709 /*
5710 * Estimate the fraction of the work that each worker will do given the
5711 * number of workers budgeted for the path.
5712 */
5713 static double
5714 get_parallel_divisor(Path *path)
5715 {
5716 double parallel_divisor = path->parallel_workers;
5717
5718 /*
5719 * Early experience with parallel query suggests that when there is only
5720 * one worker, the leader often makes a very substantial contribution to
5721 * executing the parallel portion of the plan, but as more workers are
5722 * added, it does less and less, because it's busy reading tuples from the
5723 * workers and doing whatever non-parallel post-processing is needed. By
5724 * the time we reach 4 workers, the leader no longer makes a meaningful
5725 * contribution. Thus, for now, estimate that the leader spends 30% of
5726 * its time servicing each worker, and the remainder executing the
5727 * parallel plan.
5728 */
5729 if (parallel_leader_participation)
5730 {
5731 double leader_contribution;
5732
5733 leader_contribution = 1.0 - (0.3 * path->parallel_workers);
5734 if (leader_contribution > 0)
5735 parallel_divisor += leader_contribution;
5736 }
5737
5738 return parallel_divisor;
5739 }
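
/*
 * Worked example: with parallel_leader_participation enabled and
 * path->parallel_workers = 2, leader_contribution = 1.0 - 0.3 * 2 = 0.4,
 * so the divisor is 2.4; with 4 or more workers the leader term is <= 0
 * and the divisor is just the worker count.
 */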
5740
5741 /*
5742 * compute_bitmap_pages
5743 *
5744 * compute number of pages fetched from heap in bitmap heap scan.
5745 */
5746 double
5747 compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual,
5748 int loop_count, Cost *cost, double *tuple)
5749 {
5750 Cost indexTotalCost;
5751 Selectivity indexSelectivity;
5752 double T;
5753 double pages_fetched;
5754 double tuples_fetched;
5755 double heap_pages;
5756 long maxentries;
5757
5758 /*
5759 * Fetch total cost of obtaining the bitmap, as well as its total
5760 * selectivity.
5761 */
5762 cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity);
5763
5764 /*
5765 * Estimate number of main-table pages fetched.
5766 */
5767 tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
5768
5769 T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
5770
5771 /*
5772 * For a single scan, the number of heap pages that need to be fetched is
5773 * the same as the Mackert and Lohman formula for the case T <= b (ie, no
5774 * re-reads needed).
5775 */
5776 pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
5777
5778 /*
5779 * Calculate the number of pages fetched from the heap. Then based on
5780 * current work_mem estimate get the estimated maxentries in the bitmap.
5781 * (Note that we always do this calculation based on the number of pages
5782 * that would be fetched in a single iteration, even if loop_count > 1.
5783 * That's correct, because only that number of entries will be stored in
5784 * the bitmap at one time.)
5785 */
5786 heap_pages = Min(pages_fetched, baserel->pages);
5787 maxentries = tbm_calculate_entries(work_mem * 1024L);
5788
5789 if (loop_count > 1)
5790 {
5791 /*
5792 * For repeated bitmap scans, scale up the number of tuples fetched in
5793 * the Mackert and Lohman formula by the number of scans, so that we
5794 * estimate the number of pages fetched by all the scans. Then
5795 * pro-rate for one scan.
5796 */
5797 pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
5798 baserel->pages,
5799 get_indexpath_pages(bitmapqual),
5800 root);
5801 pages_fetched /= loop_count;
5802 }
5803
5804 if (pages_fetched >= T)
5805 pages_fetched = T;
5806 else
5807 pages_fetched = ceil(pages_fetched);
5808
5809 if (maxentries < heap_pages)
5810 {
5811 double exact_pages;
5812 double lossy_pages;
5813
5814 /*
5815 * Crude approximation of the number of lossy pages. Because of the
5816 * way tbm_lossify() is coded, the number of lossy pages increases
5817 * very sharply as soon as we run short of memory; this formula has
5818 * that property and seems to perform adequately in testing, but it's
5819 * possible we could do better somehow.
5820 */
5821 lossy_pages = Max(0, heap_pages - maxentries / 2);
5822 exact_pages = heap_pages - lossy_pages;
5823
5824 /*
5825 * If there are lossy pages then recompute the number of tuples
5826 * processed by the bitmap heap node. We assume here that the chance
5827 * of a given tuple coming from an exact page is the same as the
5828 * chance that a given page is exact. This might not be true, but
5829 * it's not clear how we can do any better.
5830 */
5831 if (lossy_pages > 0)
5832 tuples_fetched =
5833 clamp_row_est(indexSelectivity *
5834 (exact_pages / heap_pages) * baserel->tuples +
5835 (lossy_pages / heap_pages) * baserel->tuples);
5836 }
5837
5838 if (cost)
5839 *cost = indexTotalCost;
5840 if (tuple)
5841 *tuple = tuples_fetched;
5842
5843 return pages_fetched;
5844 }
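
/*
 * Worked example (hypothetical numbers): with T = baserel->pages = 1000
 * and tuples_fetched = 500, the single-scan Mackert-Lohman estimate is
 * (2 * 1000 * 500) / (2 * 1000 + 500) = 400 pages, so heap_pages = 400.
 * If work_mem only permits maxentries = 300 bitmap entries, then
 * lossy_pages = Max(0, 400 - 300/2) = 250 and exact_pages = 150, and
 * tuples_fetched is recomputed to account for rechecking lossy pages.
 */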
5845